diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 47c3b89..7d0e761 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1,49 +1,88 @@
 name: Tests
-
 on: [push, pull_request]
 
 jobs:
-
   check:
     name: Check
     runs-on: ubuntu-latest
     steps:
      - name: Checkout sources
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
-
-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
+      - name: Install protoc
+        uses: taiki-e/install-action@v2
         with:
-          profile: minimal
-          toolchain: stable
-          override: true
+          tool: protoc
+
+      - name: Install toolchain
+        id: rust-toolchain
+        uses: dtolnay/rust-toolchain@stable
 
       - name: Install cuda
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.2.2'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
-      - name: Cache
-        uses: actions/cache@v2
+      - name: Cache cargo bin
+        uses: actions/cache@v4
         with:
-          path: |
-            ~/.cargo/bin/
-            ~/.cargo/registry/index/
-            ~/.cargo/registry/cache/
-            ~/.cargo/git/db/
-            target/
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          path: ~/.cargo/bin/
+          key: ${{ runner.os }}-cargo-bin-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-bin-
 
-      - name: Run cargo check
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry index
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry/index/
+          key: ${{ runner.os }}-cargo-registry-index-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-index-
+
+      - name: Cache cargo registry cache
+        uses: actions/cache@v4
         with:
-          command: check
-          args: --tests
+          path: ~/.cargo/registry/cache/
+          key: ${{ runner.os }}-cargo-registry-cache-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-cache-
+
+      - name: Cache cargo git db
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/git/db/
+          key: ${{ runner.os }}-cargo-git-db-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-db-
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-
+
+      - name: Cache cargo git
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/git
+          key: ${{ runner.os }}-cargo-git-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-
+
+      - name: Cache build
+        uses: actions/cache@v4
+        with:
+          path: target
+          key: ${{ runner.os }}-cargo-target-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-target-
+
+      - name: Run cargo check
+        run: cargo check --tests
 
   test:
     name: Test Suite
@@ -54,86 +93,101 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
         os: [ ubuntu-latest, windows-latest ]
     steps:
       - name: Checkout sources
-        uses: actions/checkout@v2
-
-      - name: Fix LibOpenCL on Linux
-        if: runner.os == 'Linux'
-        run: |
-          sudo apt update
-          sudo apt install ocl-icd-opencl-dev -y
+        uses: actions/checkout@v4
 
       - name: Fix CRLF on Windows
         if: runner.os == 'Windows'
         run: git config --global core.autocrlf false
 
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
-
-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
+      - name: Install protoc
+        uses: taiki-e/install-action@v2
         with:
-          profile: minimal
-          toolchain: stable
-          override: true
+          tool: protoc
+
+      - name: Install toolchain
+        id: rust-toolchain
+        uses: dtolnay/rust-toolchain@stable
 
       - name: Install CUDA Linux
         if: runner.os == 'Linux'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.2.2'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
       - name: Install CUDA Windows
         if: runner.os == 'Windows'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.5.1'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
-      - name: Cache
-        uses: actions/cache@v2
+      - name: Cache cargo bin
+        uses: actions/cache@v4
         with:
-          path: |
-            ~/.cargo/bin/
-            ~/.cargo/registry/index/
-            ~/.cargo/registry/cache/
-            ~/.cargo/git/db/
-            target/
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          path: ~/.cargo/bin/
+          key: ${{ runner.os }}-cargo-bin-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-bin-
 
-      - name: Run cargo test regular features
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry index
+        uses: actions/cache@v4
         with:
-          command: test
-          args: -p karlsen-miner
+          path: ~/.cargo/registry/index/
+          key: ${{ runner.os }}-cargo-registry-index-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-index-
 
-      - name: Run cargo test no asm
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry cache
+        uses: actions/cache@v4
         with:
-          command: test
-          args: -p karlsen-miner --features=no-asm
+          path: ~/.cargo/registry/cache/
+          key: ${{ runner.os }}-cargo-registry-cache-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-cache-
 
-
-      - name: Run cargo test no parking_lot
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo git db
+        uses: actions/cache@v4
         with:
-          command: test
-          args: -p karlsen-miner --no-default-features
+          path: ~/.cargo/git/db/
+          key: ${{ runner.os }}-cargo-git-db-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-db-
 
-      - name: Run cargo test shuttle
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-
+
+      - name: Cache cargo git
+        uses: actions/cache@v4
         with:
-          command: test
-          args: -p karlsen-miner --no-default-features --features=shuttle
+          path: ~/.cargo/git
+          key: ${{ runner.os }}-cargo-git-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-
 
-      - name: Run cargo test for kaspaopencl
-        uses: actions-rs/cargo@v1
+      - name: Cache build
+        uses: actions/cache@v4
         with:
-          command: test
-          args: -p kaspaopencl
+          path: target
+          key: ${{ runner.os }}-cargo-target-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-target-
+
+      - name: Run cargo test regular features
+        run: cargo test -p karlsen-miner
+
+      - name: Run cargo test no asm
+        run: cargo test -p karlsen-miner --features=no-asm
+
+      - name: Run cargo test shuttle
+        run: cargo test -p karlsen-miner --no-default-features --features=shuttle
 
   test-release:
     name: Test Suite Release
@@ -144,130 +198,185 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
         os: [ ubuntu-latest, windows-latest ]
     steps:
       - name: Checkout sources
-        uses: actions/checkout@v2
-
-      - name: Fix LibOpenCL on Linux
-        if: runner.os == 'Linux'
-        run: |
-          sudo apt update
-          sudo apt install ocl-icd-opencl-dev -y
+        uses: actions/checkout@v4
 
       - name: Fix CRLF on Windows
         if: runner.os == 'Windows'
         run: git config --global core.autocrlf false
 
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
-
-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
+      - name: Install protoc
+        uses: taiki-e/install-action@v2
         with:
-          profile: minimal
-          toolchain: stable
-          override: true
+          tool: protoc
+
+      - name: Install toolchain
+        id: rust-toolchain
+        uses: dtolnay/rust-toolchain@stable
 
       - name: Install CUDA Linux
         if: runner.os == 'Linux'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.2.2'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
       - name: Install CUDA Windows
         if: runner.os == 'Windows'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.5.1'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
-      - name: Cache
-        uses: actions/cache@v2
+      - name: Cache cargo bin
+        uses: actions/cache@v4
         with:
-          path: |
-            ~/.cargo/bin/
-            ~/.cargo/registry/index/
-            ~/.cargo/registry/cache/
-            ~/.cargo/git/db/
-            target/
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          path: ~/.cargo/bin/
+          key: ${{ runner.os }}-cargo-bin-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-bin-
 
-      - name: Run cargo test release regular features
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry index
+        uses: actions/cache@v4
         with:
-          command: test
-          args: --release -p karlsen-miner
+          path: ~/.cargo/registry/index/
+          key: ${{ runner.os }}-cargo-registry-index-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-index-
 
-      - name: Run cargo test release no asm
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry cache
+        uses: actions/cache@v4
         with:
-          command: test
-          args: --features=no-asm --release -p karlsen-miner
+          path: ~/.cargo/registry/cache/
+          key: ${{ runner.os }}-cargo-registry-cache-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-cache-
 
-      - name: Run cargo test release no parking_lot
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo git db
+        uses: actions/cache@v4
         with:
-          command: test
-          args: --no-default-features --release -p karlsen-miner
+          path: ~/.cargo/git/db/
+          key: ${{ runner.os }}-cargo-git-db-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-db-
 
-      - name: Run cargo test release shuttle
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-
+
+      - name: Cache cargo git
+        uses: actions/cache@v4
         with:
-          command: test
-          args: --no-default-features --features=shuttle --release -p karlsen-miner
+          path: ~/.cargo/git
+          key: ${{ runner.os }}-cargo-git-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-
 
-      - name: Run cargo test for kaspaopencl
-        uses: actions-rs/cargo@v1
+      - name: Cache build
+        uses: actions/cache@v4
         with:
-          command: test
-          args: --release -p kaspaopencl
+          path: target
+          key: ${{ runner.os }}-cargo-target-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-target-
+
+      - name: Run cargo test release regular features
+        run: cargo test -p karlsen-miner --release
+
+      - name: Run cargo test release no asm
+        run: cargo test -p karlsen-miner --features=no-asm --release
+
+      - name: Run cargo test release shuttle
+        run: cargo test -p karlsen-miner --no-default-features --features=shuttle --release
 
   lints:
     name: Lints
     runs-on: ubuntu-latest
     steps:
       - name: Checkout sources
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
+      - name: Install protoc
+        uses: taiki-e/install-action@v2
+        with:
+          tool: protoc
 
-      - name: Install stable toolchain
-        uses: actions-rs/toolchain@v1
+      - name: Install toolchain
+        id: rust-toolchain
+        uses: dtolnay/rust-toolchain@stable
         with:
-          profile: minimal
-          toolchain: stable
-          override: true
           components: rustfmt, clippy
 
       - name: Install cuda
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.2.2'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
-      - name: Cache
-        uses: actions/cache@v2
+      - name: Cache cargo bin
+        uses: actions/cache@v4
         with:
-          path: |
-            ~/.cargo/bin/
-            ~/.cargo/registry/index/
-            ~/.cargo/registry/cache/
-            ~/.cargo/git/db/
-            target/
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          path: ~/.cargo/bin/
+          key: ${{ runner.os }}-cargo-bin-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-bin-
+
+      - name: Cache cargo registry index
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry/index/
+          key: ${{ runner.os }}-cargo-registry-index-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-index-
 
-      - name: Run cargo fmt
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo registry cache
+        uses: actions/cache@v4
         with:
-          command: fmt
-          args: --all -- --check
+          path: ~/.cargo/registry/cache/
+          key: ${{ runner.os }}-cargo-registry-cache-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-cache-
 
-      - name: Run cargo clippy
-        uses: actions-rs/cargo@v1
+      - name: Cache cargo git db
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/git/db/
+          key: ${{ runner.os }}-cargo-git-db-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-db-
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
         with:
-          command: clippy
-          args: --tests -- -D warnings
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-registry-
+
+      - name: Cache cargo git
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/git
+          key: ${{ runner.os }}-cargo-git-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-git-
+
+      - name: Cache build
+        uses: actions/cache@v4
+        with:
+          path: target
+          key: ${{ runner.os }}-cargo-target-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-target-
+
+      - name: Run cargo fmt
+        run: cargo fmt --all -- --check
+
+      - name: Run cargo clippy
+        run: cargo clippy --tests -- -D warnings
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index 6e3c678..2c69287 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -9,26 +9,22 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # Build gnu-linux on ubuntu-18.04 and musl on ubuntu latest
-        os: [ ubuntu-18.04, windows-latest, macos-latest ]
-        features: ["default", "kaspacuda/overclock "]
+        # Build gnu-linux on ubuntu-20.04 and musl on ubuntu latest
+        os: [ ubuntu-20.04, windows-latest, macos-latest ]
+        features: [ "karlsencuda/overclock" ]
     name: Building, ${{ matrix.os }} ${{ matrix.features }}
     steps:
       - name: Fix CRLF on Windows
         if: runner.os == 'Windows'
         run: git config --global core.autocrlf false
 
-      - name: Fix LibOpenCL on Linux
-        if: runner.os == 'Linux'
-        run: |
-          sudo apt update
-          sudo apt install ocl-icd-opencl-dev -y
-
       - name: Check out code into the Go module directory
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
+      - name: Install protoc
+        uses: taiki-e/install-action@v2
+        with:
+          tool: protoc
 
       - name: Setup Rust
         uses: actions-rs/toolchain@v1
@@ -39,22 +35,22 @@ jobs:
 
       - name: Install CUDA Linux
         if: runner.os == 'Linux'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.2.2'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
       - name: Install CUDA Windows
         if: runner.os == 'Windows'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.5.1'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
       - name: Build on Linux GNU
-        if: matrix.os == 'ubuntu-18.04'
+        if: matrix.os == 'ubuntu-20.04'
         # We're using musl to make the binaries statically linked and portable
         run: |
           cargo build --target=x86_64-unknown-linux-gnu --release --all --features ${{ matrix.features }}
@@ -63,7 +59,7 @@ jobs:
           strip ./target/x86_64-unknown-linux-gnu/release/karlsen-miner
           mkdir ${asset_name}
           mv ./target/x86_64-unknown-linux-gnu/release/karlsen-miner ${asset_name}/${asset_name}
-          mv ./target/x86_64-unknown-linux-gnu/release/libkaspa*.so ${asset_name}/
+          mv ./target/x86_64-unknown-linux-gnu/release/libkarlsen*.so ${asset_name}/
           tar czvf ${asset_name}.tgz ${asset_name}
           echo "archive=${asset_name}.tgz" >> $GITHUB_ENV
           echo "asset_name=${asset_name}.tgz" >> $GITHUB_ENV
@@ -78,7 +74,7 @@ jobs:
           asset_name="karlsen-miner-${{ github.event.release.tag_name }}-${feature_name/\//-}-win64-amd64"
           mkdir ${asset_name}
           mv ./target/x86_64-pc-windows-msvc/release/karlsen-miner.exe ${asset_name}/${asset_name}.exe
-          mv ./target/x86_64-pc-windows-msvc/release/kaspa*.dll ${asset_name}/
+          mv ./target/x86_64-pc-windows-msvc/release/karlsen*.dll ${asset_name}/
           bash ./integrations/windows/create_bat.sh ${asset_name}
           7z a -tzip -r ${asset_name}.zip ${asset_name}
           echo "archive=${asset_name}.zip" >> $GITHUB_ENV
@@ -87,6 +83,7 @@ jobs:
       - name: Build on MacOS
         if: matrix.os == 'macos-latest'
         run: |
+          rustup target add x86_64-apple-darwin
           cargo build --target=x86_64-apple-darwin --release -p karlsen-miner
           asset_name="karlsen-miner-${{ github.event.release.tag_name }}-cpu-only-osx-amd64"
           mkdir ${asset_name}
@@ -107,24 +104,25 @@ jobs:
           asset_content_type: application/zip
 
   intergrations:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
-        # Build gnu-linux on ubuntu-18.04 and musl on ubuntu latest
+        # Build gnu-linux on ubuntu-20.04 and musl on ubuntu latest
         itegration: [ hiveos ]
     name: Integrating, ${{ matrix.itegration }}
     steps:
-      - name: Fix LibOpenCL on Linux
+      - name: Upgrade all installed packages
         run: |
-          sudo apt update
-          sudo apt install ocl-icd-opencl-dev -y
+          sudo apt update && sudo apt upgrade -y
 
       - name: Check out code into the module directory
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
+      - name: Install protoc
+        uses: taiki-e/install-action@v2
+        with:
+          tool: protoc
 
       - name: Setup Rust
         uses: actions-rs/toolchain@v1
@@ -135,9 +133,9 @@ jobs:
 
       - name: Install CUDA Linux
         if: runner.os == 'Linux'
-        uses: Jimver/cuda-toolkit@v0.2.8
+        uses: Jimver/cuda-toolkit@v0.2.16
         with:
-          cuda: '11.2.2'
+          cuda: '12.5.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart"]'
 
@@ -149,7 +147,7 @@ jobs:
           strip ./target/x86_64-unknown-linux-gnu/release/karlsen-miner
           mkdir karlsen-miner
           mv ./target/x86_64-unknown-linux-gnu/release/karlsen-miner karlsen-miner/${binary_name}
-          mv ./target/x86_64-unknown-linux-gnu/release/libkaspa*.so karlsen-miner/
+          mv ./target/x86_64-unknown-linux-gnu/release/libkarlsen*.so karlsen-miner/
           bash integrations/${{ matrix.itegration }}/build.sh "${{ github.event.release.tag_name }}" "${binary_name}" karlsen-miner
           echo "archive=${asset_name}.tgz" >> $GITHUB_ENV
           echo "asset_name=${asset_name}.tgz" >> $GITHUB_ENV
diff --git a/.gitignore b/.gitignore
index ea8c4bf..ad2f87d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+dataset.bin
diff --git a/Cargo.lock b/Cargo.lock
deleted file mode 100644
index 58c4bcf..0000000
--- a/Cargo.lock
+++ /dev/null
@@ -1,2204 +0,0 @@
-# This file is automatically @generated by Cargo.
-# It is not intended for manual editing.
-version = 3
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - -[[package]] -name = "axum" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a" -dependencies = [ - "async-trait", - "axum-core", - "bitflags", - "bytes", - "futures-util", - "http", - "http-body", - "hyper", - "itoa 1.0.1", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "serde", - "sync_wrapper", - "tokio", - "tower", - "tower-http", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", -] - -[[package]] -name = "backtrace" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitvec" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "470fbd40e959c961f16841fbf96edbbdcff766ead89a1ae2b53d22852be20998" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] - -[[package]] -name = "blake2b_simd" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72936ee4afc7f8f736d1c38383b56480b5497b4617b4a77bdbf1d2ababc76127" -dependencies = [ - "arrayref", - "arrayvec", - "constant_time_eq 0.1.5", -] - -[[package]] -name = "blake3" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0231f06152bf547e9c2b5194f247cd97aacf6dcd8b15d8e5ec0663f64580da87" -dependencies = [ - "arrayref", - "arrayvec", - "cc", - "cfg-if", - "constant_time_eq 0.3.0", -] - -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "block-padding", - "generic-array", -] - -[[package]] -name = "block-padding" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d696c370c750c948ada61c69a0ee2cbbb9c50b1019ddb86d9317157a99c2cae" - -[[package]] -name = "bytemuck" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" - -[[package]] -name = "bytes" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" - -[[package]] -name = "cc" -version = "1.0.72" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" - 
-[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "cl-sys" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8573fa3ff8acd6c49e8e113296c54277e82376b96c6ca6307848632cce38e44" -dependencies = [ - "libc", -] - -[[package]] -name = "cl3" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ffa85802035f535ab9671b5691b06c0bc0262317909de32a52a6b3f427e5e" -dependencies = [ - "cl-sys", - "libc", -] - -[[package]] -name = "clap" -version = "3.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f34b09b9ee8c7c7b400fe2f8df39cafc9538b03d6ba7f4ae13e4cb90bfbb7d" -dependencies = [ - "atty", - "bitflags", - "clap_derive", - "indexmap", - "lazy_static", - "os_str_bytes", - "strsim 0.10.0", - "termcolor", - "textwrap", -] - -[[package]] -name = "clap_derive" -version = "3.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a0645a430ec9136d2d701e54a95d557de12649a9dd7109ced3187e648ac824" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - -[[package]] -name = "constant_time_eq" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" - -[[package]] -name = "cust" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6cc71911e179f12483b9734120b45bd00bf64fab085cc4818428523eedd469" -dependencies = [ - "bitflags", - "bytemuck", - "cust_core", - "cust_derive", - "cust_raw", - "find_cuda_helper", -] - -[[package]] -name = "cust_core" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "039f79662cb8f890cbf335e818cd522d6e3a53fe63f61d1aaaf859cd3d975f06" -dependencies = [ - "cust_derive", - "glam", - "mint", - "vek", -] - -[[package]] -name = "cust_derive" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a3bc95fe629aed92b2423de6ccff9e40174b21d19cb6ee6281a4d04ac72f66" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "cust_raw" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbf40d6ade12cb9828bbc844b9875c7b93d25e67a3c9bf61c7aa3ae09e402bf8" -dependencies = [ - "find_cuda_helper", -] - -[[package]] -name = "darling" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim 0.9.3", - "syn 1.0.99", -] - -[[package]] -name = "darling_macro" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72" -dependencies = [ - "darling_core", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array", -] - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "env_logger" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" -dependencies = [ - "atty", - "humantime", - "log", - "regex", - "termcolor", -] - -[[package]] -name = "find_cuda_helper" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad" -dependencies = [ - "glob", -] - -[[package]] -name = "fixedbitset" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "398ea4fabe40b9b0d885340a2a991a44c8a645624075ad966d21f88688e2b69e" - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "funty" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1847abb9cb65d566acd5942e94aea9c8f547ad02c98e1649326fc0e8910b8b1e" - -[[package]] -name = "futures" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" - -[[package]] -name = "futures-executor" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" - -[[package]] -name = "futures-macro" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "futures-sink" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" - -[[package]] -name = "futures-task" -version = "0.3.21" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" - -[[package]] -name = "futures-util" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "generator" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1d9279ca822891c1a4dae06d185612cf8fc6acfe5dff37781b41297811b12ee" -dependencies = [ - "cc", - "libc", - "log", - "rustversion", - "winapi 0.3.9", -] - -[[package]] -name = "generic-array" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.10.2+wasi-snapshot-preview1", -] - -[[package]] -name = "gimli" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" - -[[package]] -name = "glam" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3412e74893a912839a67e975aca0c88561e20e5461d2d358a5fa6d3b229fae59" -dependencies = [ - "num-traits", -] - -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - -[[package]] -name = "h2" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f072413d126e57991455e0a922b31e4c8ba7c2ffbebf6b78b4f8521397d65cd" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util 0.6.9", - "tracing", -] - -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" - -[[package]] -name = "heck" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "http" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "1323096b05d41827dadeaee54c9981958c0f94e670bc94ed80037d1a7b8b186b" -dependencies = [ - "bytes", - "fnv", - "itoa 0.4.8", -] - -[[package]] -name = "http-body" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] - -[[package]] -name = "http-range-header" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" - -[[package]] -name = "httparse" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" - -[[package]] -name = "httpdate" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" - -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - -[[package]] -name = "hyper" -version = "0.14.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7ec3e62bdc98a2f0393a5048e4c30ef659440ea6e0e572965103e72bd836f55" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa 0.4.8", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - "pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - -[[package]] -name = "include_dir" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "482a2e29200b7eed25d7fdbd14423326760b7f6658d21a4cf12d55a50713c69f" -dependencies = [ - "include_dir_macros", -] - -[[package]] -name = "include_dir_macros" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e074c19deab2501407c91ba1860fa3d6820bfde307db6d8cb851b55a10be89b" -dependencies = [ - "proc-macro2", - "quote", -] - -[[package]] -name = "indexmap" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" -dependencies = [ - "autocfg", - "hashbrown", -] - -[[package]] -name = "itertools" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "itoa" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" - -[[package]] -name = "karlsen-miner" 
-version = "0.0.1-GPU-0.1" -dependencies = [ - "async-trait", - "blake2b_simd", - "blake3", - "bytes", - "cc", - "clap", - "env_logger", - "futures", - "futures-util", - "hex", - "keccak", - "kernel32-sys", - "libloading", - "log", - "nix", - "num", - "num_cpus", - "once_cell", - "parking_lot", - "prost", - "rand 0.8.4", - "semver", - "serde", - "serde_json", - "serde_repr", - "sha3", - "shuttle", - "time", - "tokio", - "tokio-stream", - "tokio-util 0.7.0", - "tonic", - "tonic-build", - "win32console", -] - -[[package]] -name = "kaspacuda" -version = "0.1.0" -dependencies = [ - "clap", - "cust", - "env_logger", - "karlsen-miner", - "log", - "nvml-wrapper", - "rand 0.8.4", -] - -[[package]] -name = "kaspaopencl" -version = "0.1.0" -dependencies = [ - "clap", - "env_logger", - "include_dir", - "karlsen-miner", - "log", - "opencl3", - "rand 0.8.4", -] - -[[package]] -name = "keccak" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67c21572b4949434e4fc1e1978b99c5f77064153c59d998bf13ecd96fb5ecba7" - -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "libc" -version = "0.2.150" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" - -[[package]] -name = "libloading" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afe203d669ec979b7128619bae5a63b7b42e9203c1b29146079ee05e2f604b52" -dependencies = [ - "cfg-if", - "winapi 0.3.9", -] - -[[package]] -name = "libm" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33a33a362ce288760ec6a508b94caaec573ae7d3bbbd91b87aa0bad4456839db" - -[[package]] -name = "lock_api" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "matchit" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" - -[[package]] -name = "memchr" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" - -[[package]] -name = "memoffset" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" -dependencies = [ - "autocfg", -] - -[[package]] -name = "mime" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" - -[[package]] -name = "miniz_oxide" -version = "0.7.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" -dependencies = [ - "adler", -] - -[[package]] -name = "mint" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e53debba6bda7a793e5f99b8dacf19e626084f525f7829104ba9898f367d85ff" - -[[package]] -name = "mio" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" -dependencies = [ - "libc", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", -] - -[[package]] -name = "multimap" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" - -[[package]] -name = "nix" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" -dependencies = [ - "autocfg", - "bitflags", - "cfg-if", - "libc", - "memoffset", - "pin-utils", -] - -[[package]] -name = "num" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - -[[package]] -name = "num-bigint" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-iter" -version = "0.1.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" -dependencies = [ - "autocfg", - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg", - "libm", -] - -[[package]] -name = "num_cpus" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "num_threads" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" -dependencies = [ - "libc", -] - -[[package]] -name = "nvml-wrapper" -version = "0.7.0" -source = 
"git+https://github.com/benrod3k/nvml-wrapper?branch=495.29.05#53f80372fdfdca4616a7cc65f005360b7afb1ca0" -dependencies = [ - "bitflags", - "libloading", - "nvml-wrapper-sys", - "static_assertions", - "thiserror", - "wrapcenum-derive", -] - -[[package]] -name = "nvml-wrapper-sys" -version = "0.5.0" -source = "git+https://github.com/benrod3k/nvml-wrapper?branch=495.29.05#53f80372fdfdca4616a7cc65f005360b7afb1ca0" -dependencies = [ - "libloading", -] - -[[package]] -name = "object" -version = "0.32.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" -dependencies = [ - "memchr", -] - -[[package]] -name = "once_cell" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "opencl3" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "931cc2ab3068142384dbdaba142681c11b315cf3b96c7a59e8480d062363387f" -dependencies = [ - "cl3", - "libc", -] - -[[package]] -name = "os_str_bytes" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" -dependencies = [ - "memchr", -] - -[[package]] -name = "parking_lot" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "995f667a6c822200b0433ac218e05582f0e2efa1b922a3fd2fbaadc5f87bab37" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-sys 0.34.0", -] - -[[package]] -name = "percent-encoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" - -[[package]] -name = "petgraph" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" -dependencies = [ - "fixedbitset", - "indexmap", -] - -[[package]] -name = "pin-project" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1622113ce508488160cff04e6abc60960e676d330e1ca0f77c0b8df17c81438f" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95af56fee93df76d721d356ac1ca41fccf168bc448eb14049234df764ba3e76" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "ppv-lite86" -version = 
"0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" - -[[package]] -name = "prettyplease" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9" -dependencies = [ - "proc-macro2", - "syn 1.0.99", -] - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.99", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - -[[package]] -name = "proc-macro2" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "399c3c31cdec40583bb68f0b18403400d01ec4289c383aa047560439952c4dd7" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f835c582e6bd972ba8347313300219fed5bfa52caf175298d860b61ff6069bb" -dependencies = [ - "bytes", - "heck", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prost", - "prost-types", - "regex", - "tempfile", - "which", -] - -[[package]] -name = "prost-derive" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7345d5f0e08c0536d7ac7229952590239e77abf0a0100a1b1d890add6ea96364" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "prost-types" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dfaa718ad76a44b3415e6c4d53b17c8f99160dcb3a99b10470fce8ad43f6e3e" -dependencies = [ - "bytes", - "prost", -] - -[[package]] -name = "quote" -version = "1.0.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "radium" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643f8f41a8ebc4c5dc4515c82bb8abd397b527fc20fd681b7c011c2aee5d44fb" - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc 0.2.0", -] - -[[package]] -name = "rand" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.3", - "rand_hc 0.3.1", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.3", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", -] - -[[package]] -name = "rand_core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" -dependencies = [ - "getrandom 0.2.3", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core 0.6.3", -] - -[[package]] -name = "rand_pcg" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "redox_syscall" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.6.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" - -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver", -] - -[[package]] -name = "rustversion" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" - -[[package]] -name = "ryu" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" - -[[package]] -name = "scoped-tls" -version = "1.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "semver" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a3381e03edd24287172047536f20cabde766e2cd3e65e6b00fb3af51c4f38d" - -[[package]] -name = "serde" -version = "1.0.136" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.136" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "serde_json" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95" -dependencies = [ - "itoa 1.0.1", - "ryu", - "serde", -] - -[[package]] -name = "serde_repr" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98d0516900518c29efa217c298fa1f4e6c6ffc85ae29fd7f4ee48f176e1a9ed5" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "sha3" -version = "0.9.1" -source = "git+https://github.com/elichai/hashes?branch=cSHAKE#c3847d04c37f4d486db591ce1bc43c9e41a39e36" -dependencies = [ - "block-buffer", - "digest", - "keccak", - "opaque-debug", -] - -[[package]] -name = "shuttle" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9530a86401642e98211ee26f7bdfe12d9b580fef6d4f94b9416321c284e7632" -dependencies = [ - "ansi_term", - "bitvec", - "generator", - "hex", - "rand 0.7.3", - "rand_core 0.5.1", - "rand_pcg", - "scoped-tls", - "smallvec", - "tracing", - "varmint", -] - -[[package]] -name = "slab" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" - -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - -[[package]] -name = "socket2" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7916fc008ca5542385b89a3d3ce689953c143e9304a9bf8beec1de48994c0d" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "strsim" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c" - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "syn" -version = "1.0.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.39" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "sync_wrapper" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" - -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - -[[package]] -name = "tempfile" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" -dependencies = [ - "cfg-if", - "libc", - "rand 0.8.4", - "redox_syscall", - "remove_dir_all", - "winapi 0.3.9", -] - -[[package]] -name = "termcolor" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "textwrap" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" - -[[package]] -name = "thiserror" -version = "1.0.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "time" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" -dependencies = [ - "itoa 1.0.1", - "libc", - "num_threads", - "time-macros", -] - -[[package]] -name = "time-macros" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" - -[[package]] -name = "tokio" -version = "1.29.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da" -dependencies = [ - "autocfg", - "backtrace", - "bytes", - "libc", - "mio", - "num_cpus", - "parking_lot", - "pin-project-lite", - "socket2", - "tokio-macros", - "windows-sys 0.48.0", -] - -[[package]] -name = "tokio-io-timeout" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90c49f106be240de154571dd31fbe48acb10ba6c6dd6f6517ad603abffa42de9" -dependencies = [ - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-macros" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.39", -] - -[[package]] -name = "tokio-stream" -version = "0.1.8" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64910e1b9c1901aaf5375561e35b9c057d95ff41a44ede043a03e09279eabaf1" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tonic" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "498f271adc46acce75d66f639e4d35b31b2394c295c82496727dafa16d465dd2" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - "tokio-util 0.7.0", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic-build" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fbcd2800e34e743b9ae795867d5f77b535d3a3be69fd731e39145719752df8c" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "tower" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5651b5f6860a99bd1adb59dbfe1db8beb433e73709d9032b413a77e2fb7c066a" -dependencies = [ - "futures-core", - "futures-util", - "indexmap", - "pin-project", - "pin-project-lite", - "rand 0.8.4", - "slab", - "tokio", - "tokio-stream", - "tokio-util 0.6.9", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-http" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d342c6d58709c0a6d48d48dabbb62d4ef955cf5f0f3bbfd845838e7ae88dbae" -dependencies = [ - "bitflags", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" - -[[package]] -name = "tower-service" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" - -[[package]] -name = "tracing" -version = "0.1.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105" -dependencies = [ - "cfg-if", - "log", - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = 
"tracing-core" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - -[[package]] -name = "try-lock" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" - -[[package]] -name = "typenum" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" - -[[package]] -name = "unicode-ident" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf" - -[[package]] -name = "varmint" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37964b04ce5402f3c72374add45209115a44782d83a1ddfd4ca372d12ddd45d7" - -[[package]] -name = "vek" -version = "0.15.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dcfb4368fdf4143fe9fe414293e7228b30e75a866ac94464d19824ca5c491df" -dependencies = [ - "approx", - "num-integer", - "num-traits", - "rustc_version", - "static_assertions", -] - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "wasi" -version = "0.10.2+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "which" -version = "4.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea187a8ef279bc014ec368c27a920da2024d2a711109bfbe3440585d5cf27ad9" -dependencies = [ - "either", - "lazy_static", - "libc", -] - -[[package]] -name = "win32console" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e69bbdf01990d3e8b9f5a7c4667feda30c63be20aa2f8e66b2f4efb6c06f673" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 
-dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-sys" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5acdd78cb4ba54c0045ac14f62d8f94a03d10047904ae2a40afa1e99d8f70825" -dependencies = [ - "windows_aarch64_msvc 0.34.0", - "windows_i686_gnu 0.34.0", - "windows_i686_msvc 0.34.0", - "windows_x86_64_gnu 0.34.0", - "windows_x86_64_msvc 0.34.0", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc 0.48.5", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - -[[package]] -name = "windows_i686_gnu" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - -[[package]] -name = "windows_i686_msvc" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.34.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - -[[package]] -name = "wrapcenum-derive" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bcc065c85ad2c3bd12aa4118bf164835712e25080c392557801a13292c60aec" -dependencies = [ - "darling", - "proc-macro2", - "quote", - "syn 1.0.99", -] - -[[package]] -name = "wyz" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" diff --git a/Cargo.toml b/Cargo.toml index 40f19df..48b2f1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,80 +1,79 @@ -[package] -name = "karlsen-miner" -version = "0.0.3-GPU-0.1" -edition = "2021" -license = "MIT/Apache-2.0" -authors = ["Elichai "] -repository = "https://github.com/karlsen-network/karlsen-miner" -readme = "README.md" -description = "A fast CPU & GPU miner for Karlsen" -categories = ["command-line-utilities"] -keywords = ["blockchain", "cli"] -include = [ - "src/**/*.rs", - "src/**/*.s", - "proto/**", - "Cargo.toml", - "Cargo.lock", - "build.rs", - "LICENSE-MIT", - "LICENSE-APACHE", - "README.md", -] - -[dependencies] -blake3 = "1.5.0" -tonic = "0.8" -tokio = { version = "1.28.0", features = ["macros", "rt-multi-thread"] } -prost = "0.11" -futures-util = "0.3" -tokio-stream = {version = "0.1", features = ["net"]} -once_cell = "1" -num_cpus = "1" -rand = "0.8" -blake2b_simd = "1.0.0" -clap = { version = "3.0", features = ["color", "derive"]} -log = "0.4" -env_logger = "0.9" -keccak = { version = "0.1", optional = true } -parking = { package = "parking_lot", version = "0.12", optional = true } -shuttle = { version = "0.2.0", optional = true } -libloading = "0.7" -tokio-util = {version = "0.7.0", features = ["codec"]} -serde_json = "1.0" -serde_repr = "0.1" -serde = {version="1.0", features=["derive"]} -futures = "0.3.21" -bytes = "1.1.0" -async-trait = "0.1" -num = "0.4" -nix = "0.25" -hex = "0.4" -semver = "1.0" -time = { version = "0.3", features = ["formatting", "macros"] } - -[features] -default = ["parking_lot"] -parking_lot = ["parking", "tokio/parking_lot"] -bench = [] -no-asm = ["keccak"] - -[target.'cfg(target_os = "windows")'.dependencies] -keccak = "0.1" -kernel32-sys = "0.2" -win32console = "0.1" - -[profile.release] -lto = true -codegen-units = 1 - -[build-dependencies] -tonic-build = { version = "0.8", default-features = false, features = ["prost", "transport"] } -cc = "1" -time = { version = "0.3", features = ["formatting"] } - -[dev-dependencies] -sha3 = { git = "https://github.com/elichai/hashes", branch = "cSHAKE" } - 
-[workspace] -members = ["plugins/*"] -default-members = [".", "plugins/cuda", "plugins/opencl"] \ No newline at end of file +[package] +name = "karlsen-miner" +version = "2.0.0" +edition = "2021" +license = "MIT/Apache-2.0" +repository = "https://github.com/karlsen-network/karlsen-miner" +readme = "README.md" +description = "A fast CPU & GPU miner for Karlsen" +categories = ["command-line-utilities"] +keywords = ["blockchain", "cli"] +include = [ + "src/**/*.rs", + "src/**/*.s", + "proto/**", + "Cargo.toml", + "Cargo.lock", + "build.rs", + "LICENSE-MIT", + "LICENSE-APACHE", + "README.md", +] + +[dependencies] +blake3 = "1.5.0" +tonic = "0.8" +tokio = { version = "1.28.0", features = ["macros", "rt-multi-thread"] } +prost = "0.11" +futures-util = "0.3" +tokio-stream = {version = "0.1", features = ["net"]} +once_cell = "1" +num_cpus = "1" +rand = "0.8" +blake2b_simd = "1.0.0" +clap = { version = "3.0", features = ["color", "derive"]} +log = "0.4" +env_logger = "0.9" +keccak = { version = "0.1", optional = true } +parking = { package = "parking_lot", version = "0.12", optional = true } +shuttle = { version = "0.4.1", optional = true } +libloading = "0.7" +tokio-util = {version = "0.7.0", features = ["codec"]} +serde_json = "1.0" +serde_repr = "0.1" +serde = {version="1.0", features=["derive"]} +futures = "0.3.21" +bytes = "1.1.0" +async-trait = "0.1" +num = "0.4" +nix = "0.25" +hex = "0.4" +semver = "1.0" +time = { version = "0.3", features = ["formatting", "macros"] } + +[features] +default = ["parking_lot"] +parking_lot = ["parking", "tokio/parking_lot"] +bench = [] +no-asm = ["keccak"] + +[target.'cfg(target_os = "windows")'.dependencies] +keccak = "0.1" +kernel32-sys = "0.2" +win32console = "0.1" + +[profile.release] +lto = true +codegen-units = 1 + +[build-dependencies] +tonic-build = { version = "0.8", default-features = false, features = ["prost", "transport"] } +cc = "1" +time = { version = "0.3", features = ["formatting"] } + +[dev-dependencies] +sha3 = { git = "https://github.com/elichai/hashes", branch = "cSHAKE" } + +[workspace] +members = ["plugins/*"] +default-members = [".", "plugins/cuda"] diff --git a/README.md b/README.md index 8b867e9..c544871 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,15 @@ # Karlsen-miner +[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/karlsen-network/karlsen-miner/ci.yaml)](https://github.com/karlsen-network/karlsen-miner/actions) +[![Latest Release](https://img.shields.io/github/v/release/karlsen-network/karlsen-miner?display_name=tag&style=flat-square)](https://github.com/karlsen-network/karlsen-miner/releases) +[![Downloads Latest](https://img.shields.io/github/downloads/karlsen-network/karlsen-miner/latest/total?style=flat-square)](https://github.com/karlsen-network/karlsen-miner/releases/latest) +[![dependency status](https://deps.rs/repo/github/karlsen-network/karlsen-miner/status.svg)](https://deps.rs/repo/github/karlsen-network/karlsen-miner) +[![Join the Karlsen Discord Server](https://img.shields.io/discord/1169939685280337930.svg?label=&logo=discord&logoColor=ffffff)](https://discord.gg/ZPZRvgMJDT) This is a modification of Kaspa Rust Miner for the Karlsen network -please consider donate to the original dev : +please consider donating to the original dev: + **Elichai**: `kaspa:qzvqtx5gkvl3tc54up6r8pk5mhuft9rtr0lvn624w9mtv4eqm9rvc9zfdmmpu` + **HauntedCook**: `kaspa:qz4jdyu04hv4hpyy00pl6trzw4gllnhnwy62xattejv2vaj5r0p5quvns058f` @@ -16,7 +23,7 @@ packages in the workspace.
To compile a specific package, you run the following ```sh git clone https://github.com/karlsen-network/karlsen-miner cd karlsen-miner -cargo build --release -p karlsen-miner -p kaspacuda -p kaspaopencl +cargo build --release -p karlsen-miner -p karlsencuda ``` The miner (and plugins) will be in `target/release`. You can replace the last line with ```sh @@ -24,28 +31,28 @@ cargo build --release --all ``` ### From Binaries -The [release page](https://github.com/tmrlvi/karlsen-miner/releases) includes precompiled binaries for Linux, and Windows (for the GPU version). +The [release page](https://github.com/karlsen-network/karlsen-miner/releases) includes precompiled binaries for Linux and Windows (for the GPU version). ### Removing Plugins To remove a plugin, you simply remove the corresponding `dll`/`so` from the directory of the miner. -* `libkaspacuda.so`, `libkaspacuda.dll`: Cuda support for karlsen-miner -* `libkaspaopencl.so`, `libkaspaopencl.dll`: OpenCL support for karlsen-miner +* `libkarlsencuda.so`, `libkarlsencuda.dll`: CUDA support for karlsen-miner +* `libkarlsenopencl.so`, `libkarlsenopencl.dll`: OpenCL support for karlsen-miner (currently disabled) # Usage -To start mining, you need to run [karlsend](https://github.com/karlsen-network/karlsend) and have an address to send the rewards to. -Here is a guidance on how to run a full node and how to generate addresses: https://github.com/kaspanet/docs/blob/main/Getting%20Started/Full%20Node%20Installation.md +To start mining, you need to run [karlsend](https://github.com/karlsen-network/rusty-karlsen) and have an address to send the rewards to. +Here is guidance on how to run a full node and how to generate addresses: https://github.com/karlsen-network/docs/blob/main/Getting%20Started/Rust%20Full%20Node%20Installation.md Help: ``` karlsen-miner -A Kaspa high performance CPU miner +A Karlsen high performance CPU/GPU miner USAGE: karlsen-miner [OPTIONS] --mining-address OPTIONS: - -a, --mining-address The Kaspa address for the miner reward + -a, --mining-address The Karlsen address for the miner reward --cuda-device Which CUDA GPUs to use [default: all] --cuda-disable Disable CUDA workers --cuda-lock-core-clocks Lock core clocks eg: ,1200, [default: 0] @@ -55,41 +62,38 @@ OPTIONS: --cuda-workload Ratio of nonces to GPU possible parallel run [default: 64] --cuda-workload-absolute The values given by workload are not a ratio, but an absolute number of nonces [default: false] -d, --debug Enable debug logging level - --devfund-percent The percentage of blocks to send to the devfund (minimum 2%) [default: 2] - --experimental-amd Uses SMID instructions in AMD. Miner will crash if instruction is not supported + --devfund-percent The percentage of blocks to send to the devfund (minimum 0%) [default: 0] -h, --help Print help information --mine-when-not-synced Mine even when karlsend says it is not synced --nonce-gen The random method used to generate nonces.
Options: (i) xoshiro (ii) lean [default: lean] - --opencl-amd-disable Disables AMD mining (does not override opencl-enable) - --opencl-device Which OpenCL GPUs to use on a specific platform - --opencl-enable Enable opencl, and take all devices of the chosen platform - --opencl-no-amd-binary Disable fetching of precompiled AMD kernel (if exists) - --opencl-platform Which OpenCL platform to use (limited to one per executable) - --opencl-workload Ratio of nonces to GPU possible parrallel run in OpenCL [default: 512] - --opencl-workload-absolute The values given by workload are not ratio, but absolute number of nonces in OpenCL [default: false] - -p, --port karlsend port [default: Mainnet = 16110, Testnet = 16211] - -s, --karlsend-address The IP of the karlsend instance [default: 127.0.0.1] + -p, --port karlsend port [default: Mainnet = 42110, Testnet = 42210, Devnet = 42610] + -s, --karlsend-address IP, pool, or node address of the Karlsend instance. Use stratum+tcp:// for stratum or grpc:// for Karlsend (default: grpc://127.0.0.1) -t, --threads Amount of CPU miner threads to launch [default: 0] --testnet Use testnet instead of mainnet [default: false] + --devnet Use devnet instead of mainnet [default: false] ``` To start mining, you just need to run the following: - -`./karlsen-miner --mining-address kaspa:XXXXX` +``` +./karlsen-miner --mining-address karlsen:XXXXX +``` This will run the miner on all the available GPU devices. # Devfund -The devfund is a fund managed by the Kaspa community in order to fund Kaspa development
+The devfund is a fund managed by the Karlsen community in order to fund Karlsen development.
A miner that wants to mine a higher percentage into the dev-fund can pass the following flag:
`--devfund-percent=XX.YY` to mine only XX.YY% of the blocks into the devfund. -**This version automatically sets the devfund donation to the community designated address. -Due to community decision, the minimum amount in the precompiled binaries is 2%** - -# Donation Addresses +**This version automatically sets the devfund donation to the Karlsen Devfund, with a default donation rate of 0%.** -**Elichai**: `kaspa:qzvqtx5gkvl3tc54up6r8pk5mhuft9rtr0lvn624w9mtv4eqm9rvc9zfdmmpu` +If you would like to support us, run the miner with the following command: +``` +./karlsen-miner --devfund-percent=XX.YY --mining-address karlsen:XXXXX +``` -**HauntedCook**: `kaspa:qz4jdyu04hv4hpyy00pl6trzw4gllnhnwy62xattejv2vaj5r0p5quvns058f` +# Karlsen Dev Fund +``` +karlsen:qzrq7v5jhsc5znvtfdg6vxg7dz5x8dqe4wrh90jkdnwehp6vr8uj7csdss2l7 +``` diff --git a/build_fishlibs.sh b/build_fishlibs.sh new file mode 100644 index 0000000..1fa01a4 --- /dev/null +++ b/build_fishlibs.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# Build the karlsen-cuda PTX kernels for each supported compute capability, then build the miner. Requires nvcc on PATH (gcc-7/gcc-5 for the legacy sm_30/sm_20 targets). +nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_86 --gpu-code=sm_86 -o plugins/cuda/resources/karlsen-cuda-sm86.ptx -Xptxas -O3 -Xcompiler -O3 + +nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_75 --gpu-code=sm_75 -o plugins/cuda/resources/karlsen-cuda-sm75.ptx -Xptxas -O3 -Xcompiler -O3 + +nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_61 --gpu-code=sm_61 -o plugins/cuda/resources/karlsen-cuda-sm61.ptx -Xptxas -O3 -Xcompiler -O3 + +nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -ccbin=gcc-7 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_30 --gpu-code=sm_30 -o plugins/cuda/resources/karlsen-cuda-sm30.ptx + +nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -ccbin=gcc-5 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_20 --gpu-code=sm_20 -o plugins/cuda/resources/karlsen-cuda-sm20.ptx + +cargo build --release + + + diff --git a/integrations/windows/create_bat.sh b/integrations/windows/create_bat.sh index 0d2fdae..d6a8fb9 100644 --- a/integrations/windows/create_bat.sh +++ b/integrations/windows/create_bat.sh @@ -1,7 +1,7 @@ echo REM When mining to a local node, you can drop the -s option. > ${1}/mine.bat echo echo ============================================================ >> ${1}/mine.bat -echo echo = Running Kaspa Miner with default .bat. Edit to configure = >> ${1}/mine.bat +echo echo = Running Karlsen Miner with default .bat. Edit to configure = >> ${1}/mine.bat echo echo ============================================================ >> ${1}/mine.bat echo :start >> ${1}/mine.bat -echo ${1}.exe -a kaspa:qz4jdyu04hv4hpyy00pl6trzw4gllnhnwy62xattejv2vaj5r0p5quvns058f -s n.seeder1.kaspad.net >> ${1}/mine.bat +echo ${1}.exe -a karlsen:qzrq7v5jhsc5znvtfdg6vxg7dz5x8dqe4wrh90jkdnwehp6vr8uj7csdss2l7 >> ${1}/mine.bat echo goto start >> ${1}/mine.bat \ No newline at end of file diff --git a/plugins/cuda/Cargo.lock b/plugins/cuda/Cargo.lock deleted file mode 100644 index 0fb96b3..0000000 --- a/plugins/cuda/Cargo.lock +++ /dev/null @@ -1,1198 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing.
-version = 3 - -[[package]] -name = "aho-corasick" -version = "0.7.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" -dependencies = [ - "memchr", -] - -[[package]] -name = "anyhow" -version = "1.0.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84450d0b4a8bd1ba4144ce8ce718fbc5d071358b1e5384bace6536b3d1f2d5b3" - -[[package]] -name = "arrayref" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" - -[[package]] -name = "arrayvec" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" - -[[package]] -name = "async-stream" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171374e7e3b2504e0e5236e3b59260560f9fe94bfe9ac39ba5e4e929c5590625" -dependencies = [ - "async-stream-impl", - "futures-core", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "648ed8c8d2ce5409ccd57453d9d1b214b342a0d69376a6feda1fd6cae3299308" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-trait" -version = "0.1.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - -[[package]] -name = "autocfg" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" - -[[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "blake2b_simd" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72936ee4afc7f8f736d1c38383b56480b5497b4617b4a77bdbf1d2ababc76127" -dependencies = [ - "arrayref", - "arrayvec", - "constant_time_eq", -] - -[[package]] -name = "bytes" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" - -[[package]] -name = "cc" -version = "1.0.72" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "clap" -version = "3.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f34b09b9ee8c7c7b400fe2f8df39cafc9538b03d6ba7f4ae13e4cb90bfbb7d" -dependencies = [ - 
"atty", - "bitflags", - "clap_derive", - "indexmap", - "lazy_static", - "os_str_bytes", - "strsim", - "termcolor", - "textwrap", -] - -[[package]] -name = "clap_derive" -version = "3.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a0645a430ec9136d2d701e54a95d557de12649a9dd7109ced3187e648ac824" -dependencies = [ - "heck 0.4.0", - "proc-macro-error", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - -[[package]] -name = "cust" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e947a46de036afc50a482cbe88e2b09680b1c8ebea79ce714e4c168371a267c9" -dependencies = [ - "bitflags", - "cust_derive", - "cust_raw", - "find_cuda_helper", -] - -[[package]] -name = "cust_derive" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ca82ac24c045b317909d4722bb7e0dad7ef97bdbdba5b68aebd0e8a79904a6" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "cust_raw" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbf40d6ade12cb9828bbc844b9875c7b93d25e67a3c9bf61c7aa3ae09e402bf8" -dependencies = [ - "find_cuda_helper", -] - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "env_logger" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" -dependencies = [ - "atty", - "humantime", - "log", - "regex", - "termcolor", -] - -[[package]] -name = "find_cuda_helper" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad" -dependencies = [ - "glob", -] - -[[package]] -name = "fixedbitset" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "futures-channel" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3dda0b6588335f360afc675d0564c17a77a2bda81ca178a4b6081bd86c7f0b" -dependencies = [ - "futures-core", -] - -[[package]] -name = "futures-core" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c8ff0461b82559810cdccfde3215c3f373807f5e5232b71479bff7bb2583d7" - -[[package]] -name = "futures-macro" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbd947adfffb0efc70599b3ddcf7b5597bb5fa9e245eb99f62b3a5f7bb8bd3c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3055baccb68d74ff6480350f8d6eb8fcfa3aa11bdc1a1ae3afdd0514617d508" - -[[package]] -name = "futures-task" -version = "0.3.19" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee7c6485c30167ce4dfb83ac568a849fe53274c831081476ee13e0dce1aad72" - -[[package]] -name = "futures-util" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b5cf40b47a271f77a8b1bec03ca09044d99d2372c0de244e66430761127164" -dependencies = [ - "futures-core", - "futures-macro", - "futures-task", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "getrandom" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - -[[package]] -name = "h2" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f072413d126e57991455e0a922b31e4c8ba7c2ffbebf6b78b4f8521397d65cd" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" - -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - -[[package]] -name = "heck" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "http" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" -dependencies = [ - "bytes", - "fnv", - "itoa 1.0.1", -] - -[[package]] -name = "http-body" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" - -[[package]] -name = "httpdate" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" - -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - -[[package]] -name = "hyper" -version = "0.14.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7ec3e62bdc98a2f0393a5048e4c30ef659440ea6e0e572965103e72bd836f55" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - 
"http-body", - "httparse", - "httpdate", - "itoa 0.4.8", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - "pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "indexmap" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" -dependencies = [ - "autocfg", - "hashbrown", -] - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "itertools" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "itoa" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" - -[[package]] -name = "karlsen-miner" -version = "0.2.1-GPU-0.1" -dependencies = [ - "blake2b_simd", - "cc", - "clap", - "env_logger", - "futures-util", - "keccak", - "libloading", - "log", - "num_cpus", - "once_cell", - "parking_lot", - "prost", - "rand", - "tokio", - "tokio-stream", - "tonic", - "tonic-build", -] - -[[package]] -name = "kaspacuda" -version = "0.1.0" -dependencies = [ - "clap", - "cust", - "env_logger", - "karlsen-miner", - "log", - "rand", -] - -[[package]] -name = "keccak" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67c21572b4949434e4fc1e1978b99c5f77064153c59d998bf13ecd96fb5ecba7" - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "libc" -version = "0.2.112" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b03d17f364a3a042d5e5d46b053bbbf82c92c9430c592dd4c064dc6ee997125" - -[[package]] -name = "libloading" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afe203d669ec979b7128619bae5a63b7b42e9203c1b29146079ee05e2f604b52" -dependencies = [ - "cfg-if", - "winapi", -] - -[[package]] -name = "lock_api" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "memchr" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" - -[[package]] -name = "mio" -version = "0.7.14" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" -dependencies = [ - "libc", - "log", - "miow", - "ntapi", - "winapi", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi", -] - -[[package]] -name = "multimap" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" - -[[package]] -name = "ntapi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" -dependencies = [ - "winapi", -] - -[[package]] -name = "num_cpus" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "once_cell" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" - -[[package]] -name = "os_str_bytes" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" -dependencies = [ - "memchr", -] - -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall", - "smallvec", - "winapi", -] - -[[package]] -name = "percent-encoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" - -[[package]] -name = "petgraph" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" -dependencies = [ - "fixedbitset", - "indexmap", -] - -[[package]] -name = "pin-project" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "ppv-lite86" -version 
= "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - -[[package]] -name = "proc-macro2" -version = "1.0.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" -dependencies = [ - "unicode-xid", -] - -[[package]] -name = "prost" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" -dependencies = [ - "bytes", - "heck 0.3.3", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prost", - "prost-types", - "regex", - "tempfile", - "which", -] - -[[package]] -name = "prost-derive" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "prost-types" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" -dependencies = [ - "bytes", - "prost", -] - -[[package]] -name = "quote" -version = "1.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - -[[package]] -name = "redox_syscall" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.6.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "slab" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" - -[[package]] -name = "smallvec" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" - -[[package]] -name = "socket2" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dc90fe6c7be1a323296982db1836d1ea9e47b6839496dde9a541bc496df3516" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "syn" -version = "1.0.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecb2e6da8ee5eb9a61068762a32fa9619cc591ceb055b3687f4cd4051ec2e06b" -dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", -] - -[[package]] -name = "tempfile" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" -dependencies = [ - "cfg-if", - "libc", - "rand", - "redox_syscall", - "remove_dir_all", - "winapi", -] - -[[package]] -name = "termcolor" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "textwrap" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" - -[[package]] -name = "tokio" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbbf1c778ec206785635ce8ad57fe52b3009ae9e0c9f574a728f3049d3e55838" -dependencies = [ - "bytes", - "libc", - "memchr", - "mio", - "num_cpus", - "parking_lot", - "pin-project-lite", - "tokio-macros", - "winapi", -] - -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-macros" -version = "1.7.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tokio-stream" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tonic" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a" -dependencies = [ - "async-stream", - "async-trait", - "base64", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - "tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic-build" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757" -dependencies = [ - "proc-macro2", - "prost-build", - "quote", - "syn", -] - -[[package]] -name = "tower" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5651b5f6860a99bd1adb59dbfe1db8beb433e73709d9032b413a77e2fb7c066a" -dependencies = [ - "futures-core", - "futures-util", - "indexmap", - "pin-project", - "pin-project-lite", - "rand", - "slab", - "tokio", - "tokio-stream", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" - -[[package]] -name = "tower-service" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" - -[[package]] -name = "tracing" -version = "0.1.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105" -dependencies = [ - "cfg-if", - "log", - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tracing-core" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - -[[package]] -name = "try-lock" -version 
= "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" - -[[package]] -name = "unicode-segmentation" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" - -[[package]] -name = "unicode-xid" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "wasi" -version = "0.10.2+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" - -[[package]] -name = "which" -version = "4.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea187a8ef279bc014ec368c27a920da2024d2a711109bfbe3440585d5cf27ad9" -dependencies = [ - "either", - "lazy_static", - "libc", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/plugins/cuda/Cargo.toml b/plugins/cuda/Cargo.toml index 59a2e9d..46a3d1b 100644 --- a/plugins/cuda/Cargo.toml +++ b/plugins/cuda/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "kaspacuda" +name = "karlsencuda" version = "0.1.0" edition = "2021" @@ -13,6 +13,8 @@ rand = "0.8" clap = { version = "3.0", features = ["color", "derive"]} env_logger = "0.9" nvml-wrapper = { git = "https://github.com/benrod3k/nvml-wrapper", branch = "495.29.05", optional = true } +tiny-keccak = { version = "2.0.2", features = ["keccak"] } +memmap = "0.7.0" [lib] crate-type = ["cdylib", "rlib"] diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md index 2220f12..6118f3d 100644 --- a/plugins/cuda/README.md +++ b/plugins/cuda/README.md @@ -5,25 +5,25 @@ The plugin is a shared library file that resides in the same library as the miner. You can build the library by running ```sh -cargo build -p kaspacuda +cargo build -p karlsencuda ``` This version includes a precompiled PTX, which would work with most modern GPUs. 
To compile the PTX yourself, you have to clone the project: ```sh -git clone https://github.com/tmrlvi/karlsen-miner.git +git clone https://github.com/karlsen-network/karlsen-miner.git cd karlsen-miner # Compute version 8.6 -/usr/local/cuda-11.5/bin/nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_86 --gpu-code=sm_86 -o plugins/cuda/resources/kaspa-cuda-sm86.ptx -Xptxas -O3 -Xcompiler -O3 +/usr/local/cuda-11.5/bin/nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_86 --gpu-code=sm_86 -o plugins/cuda/resources/karlsen-cuda-sm86.ptx -Xptxas -O3 -Xcompiler -O3 # Compute version 7.5 -/usr/local/cuda-11.5/bin/nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_75 --gpu-code=sm_75 -o plugins/cuda/resources/kaspa-cuda-sm75.ptx -Xptxas -O3 -Xcompiler -O3 +/usr/local/cuda-11.5/bin/nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_75 --gpu-code=sm_75 -o plugins/cuda/resources/karlsen-cuda-sm75.ptx -Xptxas -O3 -Xcompiler -O3 # Compute version 6.1 -/usr/local/cuda-11.2/bin/nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_61 --gpu-code=sm_61 -o plugins/cuda/resources/kaspa-cuda-sm61.ptx -Xptxas -O3 -Xcompiler -O3 +/usr/local/cuda-11.2/bin/nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_61 --gpu-code=sm_61 -o plugins/cuda/resources/karlsen-cuda-sm61.ptx -Xptxas -O3 -Xcompiler -O3 # Compute version 3.0 -/usr/local/cuda-9.2/bin/nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -ccbin=gcc-7 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_30 --gpu-code=sm_30 -o plugins/cuda/resources/kaspa-cuda-sm30.ptx +/usr/local/cuda-9.2/bin/nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -ccbin=gcc-7 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_30 --gpu-code=sm_30 -o plugins/cuda/resources/karlsen-cuda-sm30.ptx # Compute version 2.0 -/usr/local/cuda-8.0/bin/nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -ccbin=gcc-5 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_20 --gpu-code=sm_20 -o plugins/cuda/resources/kaspa-cuda-sm20.ptx +/usr/local/cuda-8.0/bin/nvcc plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu -ccbin=gcc-5 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_20 --gpu-code=sm_20 -o plugins/cuda/resources/karlsen-cuda-sm20.ptx cargo build --release ``` diff --git a/plugins/cuda/kaspa-cuda-native/src/blake3_compact.h b/plugins/cuda/karlsen-cuda-native/src/blake3_compact.h similarity index 97% rename from plugins/cuda/kaspa-cuda-native/src/blake3_compact.h rename to plugins/cuda/karlsen-cuda-native/src/blake3_compact.h index 66f7b1b..85e21b1 100644 --- a/plugins/cuda/kaspa-cuda-native/src/blake3_compact.h +++ b/plugins/cuda/karlsen-cuda-native/src/blake3_compact.h @@ -1,693 +1,693 @@ -#pragma once - -//@Credit - https://github.com/BLAKE3-team/BLAKE3/tree/master - -#define INLINE __forceinline__ - -#define BLAKE3_KEY_LEN 32 -#define BLAKE3_OUT_LEN 32 -#define BLAKE3_BLOCK_LEN 64 -#define BLAKE3_CHUNK_LEN 1024 -#define BLAKE3_MAX_DEPTH 54 - -#define MAX_SIMD_DEGREE_OR_2 2 - -__device__ unsigned int highest_one(uint64_t x) -{ - unsigned int c = 0; - if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } - if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } - if (x & 
0x000000000000ff00ULL) { x >>= 8; c += 8; } - if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } - if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } - if (x & 0x0000000000000002ULL) { c += 1; } - return c; -} - -enum blake3_flags -{ - CHUNK_START = 1 << 0, - CHUNK_END = 1 << 1, - PARENT = 1 << 2, - ROOT = 1 << 3, - KEYED_HASH = 1 << 4, - DERIVE_KEY_CONTEXT = 1 << 5, - DERIVE_KEY_MATERIAL = 1 << 6, -}; - -__device__ unsigned int popcnt(uint64_t x) -{ - unsigned int count = 0; - while (x != 0) { - count += 1; - x &= x - 1; - } - return count; -} - -__device__ uint64_t round_down_to_power_of_2(uint64_t x) -{ - return 1ULL << highest_one(x | 1); -} - -__device__ uint32_t counter_low(uint64_t counter) -{ - return (uint32_t)counter; -} - -__device__ uint32_t counter_high(uint64_t counter) -{ - return (uint32_t)(counter >> 32); -} - -__device__ uint32_t load32(const void* src) -{ - const uint8_t* p = (const uint8_t*)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | - ((uint32_t)(p[3]) << 24); -} - -__device__ void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) -{ - key_words[0] = load32(&key[0 * 4]); - key_words[1] = load32(&key[1 * 4]); - key_words[2] = load32(&key[2 * 4]); - key_words[3] = load32(&key[3 * 4]); - key_words[4] = load32(&key[4 * 4]); - key_words[5] = load32(&key[5 * 4]); - key_words[6] = load32(&key[6 * 4]); - key_words[7] = load32(&key[7 * 4]); -} - -__device__ void store32(void* dst, uint32_t w) -{ - uint8_t* p = (uint8_t*)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -__device__ void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) -{ - store32(&bytes_out[0 * 4], cv_words[0]); - store32(&bytes_out[1 * 4], cv_words[1]); - store32(&bytes_out[2 * 4], cv_words[2]); - store32(&bytes_out[3 * 4], cv_words[3]); - store32(&bytes_out[4 * 4], cv_words[4]); - store32(&bytes_out[5 * 4], cv_words[5]); - store32(&bytes_out[6 * 4], cv_words[6]); - store32(&bytes_out[7 * 4], cv_words[7]); -} - -__device__ const uint32_t IV[8] = { - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -__device__ const uint8_t MSG_SCHEDULE[7][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, - {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, - {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, - {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, - {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, - {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, -}; - -typedef struct -{ - uint32_t cv[8]; - uint64_t chunk_counter; - uint8_t buf[BLAKE3_BLOCK_LEN]; - uint8_t buf_len; - uint8_t blocks_compressed; - uint8_t flags; -} blake3_chunk_state; - -typedef struct -{ - uint32_t key[8]; - blake3_chunk_state chunk; - uint8_t cv_stack_len; - uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; -} blake3_hasher; - -__device__ void chunk_state_init(blake3_chunk_state* self, const uint32_t key[8], uint8_t flags) -{ - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; - self->blocks_compressed = 0; - self->flags = flags; -} - -__device__ void chunk_state_reset( - blake3_chunk_state* self, const uint32_t key[8], uint64_t chunk_counter) -{ - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = chunk_counter; - 
self->blocks_compressed = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; -} - -__device__ size_t chunk_state_len(const blake3_chunk_state* self) -{ - return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len); -} - -__device__ size_t chunk_state_fill_buf( - blake3_chunk_state* self, const uint8_t* input, size_t input_len) -{ - size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); - if (take > input_len) { - take = input_len; - } - uint8_t* dest = self->buf + ((size_t)self->buf_len); - memcpy(dest, input, take); - self->buf_len += (uint8_t)take; - return take; -} - -__device__ uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state* self) -{ - if (self->blocks_compressed == 0) { - return CHUNK_START; - } else { - return 0; - } -} - -typedef struct -{ - uint32_t input_cv[8]; - uint64_t counter; - uint8_t block[BLAKE3_BLOCK_LEN]; - uint8_t block_len; - uint8_t flags; -} output_t; - -__device__ output_t make_output(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) -{ - output_t ret; - memcpy(ret.input_cv, input_cv, 32); - memcpy(ret.block, block, BLAKE3_BLOCK_LEN); - ret.block_len = block_len; - ret.counter = counter; - ret.flags = flags; - return ret; -} - -__device__ uint32_t rotr32(uint32_t w, uint32_t c) -{ - return (w >> c) | (w << (32 - c)); -} - -__device__ void g(uint32_t* state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y) -{ - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -__device__ void round_fn(uint32_t state[16], const uint32_t* msg, size_t round) -{ - // Select the message schedule based on the round. - const uint8_t* schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. 
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -__device__ void compress_pre(uint32_t state[16], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) -{ - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - state[0] = cv[0]; - state[1] = cv[1]; - state[2] = cv[2]; - state[3] = cv[3]; - state[4] = cv[4]; - state[5] = cv[5]; - state[6] = cv[6]; - state[7] = cv[7]; - state[8] = IV[0]; - state[9] = IV[1]; - state[10] = IV[2]; - state[11] = IV[3]; - state[12] = counter_low(counter); - state[13] = counter_high(counter); - state[14] = (uint32_t)block_len; - state[15] = (uint32_t)flags; - - round_fn(state, &block_words[0], 0); - round_fn(state, &block_words[0], 1); - round_fn(state, &block_words[0], 2); - round_fn(state, &block_words[0], 3); - round_fn(state, &block_words[0], 4); - round_fn(state, &block_words[0], 5); - round_fn(state, &block_words[0], 6); -} - -__device__ void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) -{ - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -__device__ void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) -{ - blake3_compress_in_place_portable(cv, block, block_len, counter, flags); -} - -__device__ void output_chaining_value(const output_t* self, uint8_t cv[32]) -{ - uint32_t cv_words[8]; - memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); - store_cv_words(cv, cv_words); -} - -__device__ void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) -{ - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], 
state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -__device__ void hash_one(const uint8_t* input, size_t blocks, const uint32_t key[8], - uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, - uint8_t out[BLAKE3_OUT_LEN]) -{ - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - store_cv_words(out, cv); -} - -__device__ void blake3_hash_many(const uint8_t* const* inputs, size_t num_inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t* out) -{ - while (num_inputs > 0) { - hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} - -__device__ void output_root_bytes(const output_t* self, uint64_t seek, uint8_t* out, size_t out_len) -{ - uint64_t output_block_counter = seek / 64; - size_t offset_within_block = seek % 64; - uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, - self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; - } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; - output_block_counter += 1; - offset_within_block = 0; - } -} - -__device__ void chunk_state_update(blake3_chunk_state* self, const uint8_t* input, size_t input_len) -{ - if (self->buf_len > 0) { - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; - if (input_len > 0) { - blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - self->buf_len = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - } - } - - while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - input += BLAKE3_BLOCK_LEN; - input_len -= BLAKE3_BLOCK_LEN; - } - - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; -} - -__device__ output_t chunk_state_output(const blake3_chunk_state* self) -{ - uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; - return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); -} - -__device__ output_t parent_output( - const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) -{ - return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); -} - -__device__ size_t left_len(size_t content_len) -{ - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; - return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; -} - -__device__ size_t 
compress_chunks_parallel(const uint8_t* input, size_t input_len, - const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) -{ - const uint8_t* chunks_array[1]; - - size_t input_position = 0; - size_t chunks_array_len = 0; - while (input_len - input_position >= BLAKE3_CHUNK_LEN) { - chunks_array[chunks_array_len] = &input[input_position]; - input_position += BLAKE3_CHUNK_LEN; - chunks_array_len += 1; - } - - blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, - chunk_counter, true, flags, CHUNK_START, CHUNK_END, out); - - - if (input_len > input_position) { - uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, key, flags); - chunk_state.chunk_counter = counter; - chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); - output_t output = chunk_state_output(&chunk_state); - output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); - return chunks_array_len + 1; - } else { - return chunks_array_len; - } -} - -__device__ size_t compress_parents_parallel(const uint8_t* child_chaining_values, - size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t* out) -{ - const uint8_t* parents_array[MAX_SIMD_DEGREE_OR_2]; - size_t parents_array_len = 0; - while (num_chaining_values - (2 * parents_array_len) >= 2) { - parents_array[parents_array_len] = - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; - parents_array_len += 1; - } - - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); - - // If there's an odd child left over, it becomes an output. - if (num_chaining_values > 2 * parents_array_len) { - memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); - return parents_array_len + 1; - } else { - return parents_array_len; - } -} - -__device__ static size_t blake3_compress_subtree_wide(const uint8_t* input, size_t input_len, - const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) -{ - if (input_len <= 1 * BLAKE3_CHUNK_LEN) { - return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); - } - size_t left_input_len = left_len(input_len); - size_t right_input_len = input_len - left_input_len; - const uint8_t* right_input = &input[left_input_len]; - uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); - - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = 1; // todo they hard coded to 1?? - if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { - degree = 2; - } - uint8_t* right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - - size_t left_n = - blake3_compress_subtree_wide(input, left_input_len, key, chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); - - if (left_n == 1) { - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); - return 2; - } - - // Otherwise, do one layer of parent node compression. 
- size_t num_chaining_values = left_n + right_n; - return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); -} - -__device__ void compress_subtree_to_parent_node(const uint8_t* input, size_t input_len, - const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) -{ - uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = - blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); - - uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - - while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { - num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); - memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); - } - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); -} - -__device__ void hasher_init_base(blake3_hasher* self, const uint32_t key[8], uint8_t flags) -{ - memcpy(self->key, key, BLAKE3_KEY_LEN); - chunk_state_init(&self->chunk, key, flags); - self->cv_stack_len = 0; -} - -__device__ void blake3_hasher_init(blake3_hasher* self) -{ - hasher_init_base(self, IV, 0); -} - -__device__ void blake3_hasher_init_keyed(blake3_hasher* self, const uint8_t key[BLAKE3_KEY_LEN]) -{ - uint32_t key_words[8]; - load_key_words(key, key_words); - hasher_init_base(self, key_words, KEYED_HASH); -} - -__device__ void hasher_merge_cv_stack(blake3_hasher* self, uint64_t total_len) -{ - size_t post_merge_stack_len = (size_t)popcnt(total_len); - while (self->cv_stack_len > post_merge_stack_len) { - uint8_t* parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; - output_t output = parent_output(parent_node, self->key, self->chunk.flags); - output_chaining_value(&output, parent_node); - self->cv_stack_len -= 1; - } -} - -__device__ void hasher_push_cv( - blake3_hasher* self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) -{ - hasher_merge_cv_stack(self, chunk_counter); - memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); - self->cv_stack_len += 1; -} - -__device__ void blake3_hasher_update(blake3_hasher* self, const void* input, size_t input_len) -{ - if (input_len == 0) { - return; - } - - const uint8_t* input_bytes = (const uint8_t*)input; - - if (chunk_state_len(&self->chunk) > 0) { - size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); - if (take > input_len) { - take = input_len; - } - chunk_state_update(&self->chunk, input_bytes, take); - input_bytes += take; - input_len -= take; - - if (input_len > 0) { - output_t output = chunk_state_output(&self->chunk); - uint8_t chunk_cv[32]; - output_chaining_value(&output, chunk_cv); - hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); - chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); - } else { - return; - } - } - - while (input_len > BLAKE3_CHUNK_LEN) { - size_t subtree_len = round_down_to_power_of_2(input_len); - uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; - - while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { - subtree_len /= 2; - } - - uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; - if (subtree_len <= BLAKE3_CHUNK_LEN) { - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, self->key, self->chunk.flags); - chunk_state.chunk_counter = self->chunk.chunk_counter; - chunk_state_update(&chunk_state, input_bytes, subtree_len); - output_t output = chunk_state_output(&chunk_state); - uint8_t cv[BLAKE3_OUT_LEN]; - output_chaining_value(&output, cv); - 
hasher_push_cv(self, cv, chunk_state.chunk_counter); - } else { - uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; - compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, - self->chunk.chunk_counter, self->chunk.flags, cv_pair); - hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); - hasher_push_cv( - self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); - } - self->chunk.chunk_counter += subtree_chunks; - input_bytes += subtree_len; - input_len -= subtree_len; - } - - if (input_len > 0) { - chunk_state_update(&self->chunk, input_bytes, input_len); - hasher_merge_cv_stack(self, self->chunk.chunk_counter); - } -} - -__device__ void blake3_hasher_finalize_seek( - const blake3_hasher* self, uint64_t seek, uint8_t* out, size_t out_len) -{ - if (out_len == 0) { - return; - } - - if (self->cv_stack_len == 0) { - output_t output = chunk_state_output(&self->chunk); - output_root_bytes(&output, seek, out, out_len); - return; - } - output_t output; - size_t cvs_remaining; - if (chunk_state_len(&self->chunk) > 0) { - cvs_remaining = self->cv_stack_len; - output = chunk_state_output(&self->chunk); - } else { - cvs_remaining = self->cv_stack_len - 2; - output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); - } - while (cvs_remaining > 0) { - cvs_remaining -= 1; - uint8_t parent_block[BLAKE3_BLOCK_LEN]; - memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); - output_chaining_value(&output, &parent_block[32]); - output = parent_output(parent_block, self->key, self->chunk.flags); - } - output_root_bytes(&output, seek, out, out_len); -} - -__device__ void blake3_hasher_finalize(const blake3_hasher* self, uint8_t* out, size_t out_len) -{ - blake3_hasher_finalize_seek(self, 0, out, out_len); -} - -__device__ void blake3_hasher_reset(blake3_hasher* self) -{ - chunk_state_reset(&self->chunk, self->key, 0); - self->cv_stack_len = 0; +#pragma once + +//@Credit - https://github.com/BLAKE3-team/BLAKE3/tree/master + +#define INLINE __forceinline__ + +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +#define MAX_SIMD_DEGREE_OR_2 2 + +__device__ unsigned int highest_one(uint64_t x) +{ + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if (x & 0x0000000000000002ULL) { c += 1; } + return c; +} + +enum blake3_flags +{ + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +__device__ unsigned int popcnt(uint64_t x) +{ + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +} + +__device__ uint64_t round_down_to_power_of_2(uint64_t x) +{ + return 1ULL << highest_one(x | 1); +} + +__device__ uint32_t counter_low(uint64_t counter) +{ + return (uint32_t)counter; +} + +__device__ uint32_t counter_high(uint64_t counter) +{ + return (uint32_t)(counter >> 32); +} + +__device__ uint32_t load32(const void* src) +{ + const uint8_t* p = (const uint8_t*)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | + ((uint32_t)(p[3]) << 24); +} + +__device__ void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t 
key_words[8]) +{ + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +__device__ void store32(void* dst, uint32_t w) +{ + uint8_t* p = (uint8_t*)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +__device__ void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) +{ + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +__device__ const uint32_t IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +__device__ const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +typedef struct +{ + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct +{ + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +__device__ void chunk_state_init(blake3_chunk_state* self, const uint32_t key[8], uint8_t flags) +{ + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +__device__ void chunk_state_reset( + blake3_chunk_state* self, const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +__device__ size_t chunk_state_len(const blake3_chunk_state* self) +{ + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len); +} + +__device__ size_t chunk_state_fill_buf( + blake3_chunk_state* self, const uint8_t* input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t* dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +__device__ uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state* self) +{ + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct +{ + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +__device__ output_t make_output(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) +{ + output_t ret; + 
memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +__device__ uint32_t rotr32(uint32_t w, uint32_t c) +{ + return (w >> c) | (w << (32 - c)); +} + +__device__ void g(uint32_t* state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +__device__ void round_fn(uint32_t state[16], const uint32_t* msg, size_t round) +{ + // Select the message schedule based on the round. + const uint8_t* schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +__device__ void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +__device__ void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +__device__ void blake3_compress_in_place(uint32_t cv[8], const 
uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) +{ + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +__device__ void output_chaining_value(const output_t* self, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +__device__ void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +__device__ void hash_one(const uint8_t* input, size_t blocks, const uint32_t key[8], + uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, + uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +__device__ void blake3_hash_many(const uint8_t* const* inputs, size_t num_inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t* out) +{ + while (num_inputs > 0) { + hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + +__device__ void output_root_bytes(const output_t* self, uint64_t seek, uint8_t* out, size_t out_len) +{ + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, + self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +__device__ void chunk_state_update(blake3_chunk_state* self, const uint8_t* input, size_t input_len) +{ + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | 
chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +__device__ output_t chunk_state_output(const blake3_chunk_state* self) +{ + uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); +} + +__device__ output_t parent_output( + const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) +{ + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +__device__ size_t left_len(size_t content_len) +{ + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +__device__ size_t compress_chunks_parallel(const uint8_t* input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) +{ + const uint8_t* chunks_array[1]; + + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, + chunk_counter, true, flags, CHUNK_START, CHUNK_END, out); + + + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +__device__ size_t compress_parents_parallel(const uint8_t* child_chaining_values, + size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t* out) +{ + const uint8_t* parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. 
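+ // It is copied through unchanged here and paired into a parent one level higher in the tree.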
+ if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +__device__ static size_t blake3_compress_subtree_wide(const uint8_t* input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) +{ + if (input_len <= 1 * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); + } + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t* right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = 1; // todo they hard coded to 1?? + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + degree = 2; + } + uint8_t* right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + size_t left_n = + blake3_compress_subtree_wide(input, left_input_len, key, chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); +} + +__device__ void compress_subtree_to_parent_node(const uint8_t* input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = + blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); + + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + + while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +__device__ void hasher_init_base(blake3_hasher* self, const uint32_t key[8], uint8_t flags) +{ + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +__device__ void blake3_hasher_init(blake3_hasher* self) +{ + hasher_init_base(self, IV, 0); +} + +__device__ void blake3_hasher_init_keyed(blake3_hasher* self, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +__device__ void hasher_merge_cv_stack(blake3_hasher* self, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t* parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +__device__ void hasher_push_cv( + blake3_hasher* self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) +{ + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +__device__ void 
blake3_hasher_update(blake3_hasher* self, const void* input, size_t input_len) +{ + if (input_len == 0) { + return; + } + + const uint8_t* input_bytes = (const uint8_t*)input; + + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv( + self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +__device__ void blake3_hasher_finalize_seek( + const blake3_hasher* self, uint64_t seek, uint8_t* out, size_t out_len) +{ + if (out_len == 0) { + return; + } + + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +__device__ void blake3_hasher_finalize(const blake3_hasher* self, uint8_t* out, size_t out_len) +{ + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +__device__ void blake3_hasher_reset(blake3_hasher* self) +{ + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; } \ No newline at end of file diff --git a/plugins/cuda/karlsen-cuda-native/src/cuda_helper.h b/plugins/cuda/karlsen-cuda-native/src/cuda_helper.h new file mode 100644 
index 0000000..7a48709 --- /dev/null +++ b/plugins/cuda/karlsen-cuda-native/src/cuda_helper.h @@ -0,0 +1,989 @@ +#pragma once + +#include <cuda_runtime.h> + +#include <stdint.h> + +#define DEV_INLINE __device__ __forceinline__ + +#ifdef __INTELLISENSE__ +/* reduce vstudio warnings (__byteperm, blockIdx...) */ +#include <device_launch_parameters.h> +#include <device_functions.h> +#define __launch_bounds__(max_tpb, min_blocks) +#define asm("a" : "=l"(result) : "l"(a)) +#define __CUDA_ARCH__ 520 // highlight shuffle code by default. + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t* x, uint32_t y); +uint32_t atomicAdd(uint32_t* x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); +#endif + +#include <stdbool.h> + +#ifndef MAX_GPUS +#define MAX_GPUS 32 +#endif + +extern "C" int device_map[MAX_GPUS]; +extern "C" long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; + +// common functions +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_setTarget(const void* ptarget); +extern void cuda_check_cpu_setTarget_mod(const void* ptarget, const void* ptarget2); +extern uint32_t cuda_check_hash( + int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash); +extern uint32_t cuda_check_hash_suppl( + int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash, uint32_t foundnonce); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + + +#ifndef SPH_C32 +#define SPH_C32(x) ((x##U)) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) ((x##ULL)) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#define ROTL32c(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#else +// Compute 3.5 and newer (funnel shift) +DEV_INLINE uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return (__funnelshift_l((x), (x), (n))); +} +#endif +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else +DEV_INLINE uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return (__funnelshift_r((x), (x), (n))); +} +#endif + +DEV_INLINE uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(LO), "r"(HI)); + return result; +} + +// Endian swap for 32-bit types +#ifdef __CUDA_ARCH__ +DEV_INLINE uint32_t cuda_swab32(const uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else +/* host */ +#define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | \ + (((x) >> 24) & 0x000000ffu)) +#endif + +#ifdef __CUDA_ARCH__ +DEV_INLINE uint64_t cuda_swab64(const uint64_t x) +{ + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x = __byte_perm(t.x, 0, 0x0123); + t.y = __byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; +} +#else +/* host */ +#define cuda_swab64(x) \
((uint64_t)((((uint64_t)(x)&0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x)&0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x)&0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x)&0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x)&0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x)&0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x)&0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x)&0x00000000000000ffULL) << 56))) +#endif + + +#ifdef _WIN64 +#define USE_XOR_ASM_OPTS 0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirlpool +DEV_INLINE uint64_t xor1(const uint64_t a, const uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} +#else +#define xor1(a, b) (a ^ b) +#endif + +/* +#if USE_XOR_ASM_OPTS +// device asm for whirlpool +DEV_INLINE +uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + //output : input registers + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} +#else +#define xor3(a,b,c) (a ^ b ^ c) +#endif +*/ + +#if USE_XOR_ASM_OPTS +// device asm for whirlpool +DEV_INLINE uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, + const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g), "l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a, b, c, d, e, f, g, h) ((a ^ b) ^ (c ^ d) ^ (e ^ f) ^ (g ^ h)) +#endif + +// device asm for x17 +DEV_INLINE uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;" + "}\n" + : "=l"(result) + : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +DEV_INLINE uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) + : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +DEV_INLINE uint64_t shr_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +// device asm for ? 
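+ // (mirror of shr_t64 above: a 64-bit left shift through a single PTX shl.b64)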
+DEV_INLINE uint64_t shl_t64(uint64_t x, uint32_t n)
+{
+    uint64_t result;
+    asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n));
+    return result;
+}
+
+#ifndef USE_ROT_ASM_OPT
+#define USE_ROT_ASM_OPT 2
+#endif
+
+// 64-bit ROTATE RIGHT
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
+/* more involved sm >= 3.5 variant (accelerated with the funnel shifter), still to benchmark */
+DEV_INLINE uint64_t ROTR64(const uint64_t value, const int offset)
+{
+    uint2 result;
+    if (offset < 32)
+    {
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2loint(__longlong_as_double(value))),
+              "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+              "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+    }
+    else
+    {
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+              "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2loint(__longlong_as_double(value))),
+              "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+    }
+    return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
+DEV_INLINE uint64_t ROTR64(const uint64_t x, const int offset)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .b64 lhs;\n\t"
+        ".reg .u32 roff;\n\t"
+        "shr.b64 lhs, %1, %2;\n\t"
+        "sub.u32 roff, 64, %2;\n\t"
+        "shl.b64 %0, %1, roff;\n\t"
+        "add.u64 %0, %0, lhs;\n\t"
+        "}\n"
+        : "=l"(result)
+        : "l"(x), "r"(offset));
+    return result;
+}
+#else
+/* host */
+#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+#endif
+
+// 64-bit ROTATE LEFT
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
+DEV_INLINE uint64_t ROTL64(const uint64_t value, const int offset)
+{
+    uint2 result;
+    if (offset >= 32)
+    {
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2loint(__longlong_as_double(value))),
+              "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+              "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+    }
+    else
+    {
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+              "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2loint(__longlong_as_double(value))),
+              "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+    }
+    return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
+DEV_INLINE uint64_t ROTL64(const uint64_t x, const int offset)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .b64 lhs;\n\t"
+        ".reg .u32 roff;\n\t"
+        "shl.b64 lhs, %1, %2;\n\t"
+        "sub.u32 roff, 64, %2;\n\t"
+        "shr.b64 %0, %1, roff;\n\t"
+        "add.u64 %0, lhs, %0;\n\t"
+        "}\n"
+        : "=l"(result)
+        : "l"(x), "r"(offset));
+    return result;
+}
+#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
+__device__ uint64_t ROTL64(const uint64_t x, const int offset)
+{
+    uint64_t res;
+    asm("{\n\t"
+        ".reg .u32 tl,th,vl,vh;\n\t"
+        ".reg .pred p;\n\t"
+        "mov.b64 {tl,th}, %1;\n\t"
+        "shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+        "shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+        "setp.lt.u32 p, %2, 32;\n\t"
+        "@!p mov.b64 %0, {vl,vh};\n\t"
+        "@p mov.b64 %0, {vh,vl};\n\t"
+        "}"
+        : 
"=l"(res) + : "l"(x), "r"(offset)); + return res; +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +DEV_INLINE uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 + uint2 temp; + asm("mov.b64 {%0, %1}, %2; " : "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; " : "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2 - int2 operators */ + +DEV_INLINE void LOHI(uint32_t& lo, uint32_t& hi, uint64_t x) +{ + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(lo), "=r"(hi) : "l"(x)); +} + +DEV_INLINE uint64_t devectorize(uint2 x) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(x.x), "r"(x.y)); + return result; +} + + +DEV_INLINE uint2 vectorize(const uint64_t x) +{ + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x)); + return result; +} +DEV_INLINE void devectorize2(uint4 inn, uint2& x, uint2& y) +{ + x.x = inn.x; + x.y = inn.y; + y.x = inn.z; + y.y = inn.w; +} + + +DEV_INLINE uint4 vectorize2(uint2 x, uint2 y) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = y.x; + result.w = y.y; + + return result; +} + +DEV_INLINE uint4 vectorize2(uint2 x) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = x.x; + result.w = x.y; + return result; +} + + +DEV_INLINE uint4 vectorize4(uint64_t x, uint64_t y) +{ + uint4 result; + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x)); + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.z), "=r"(result.w) : "l"(y)); + return result; +} +DEV_INLINE void devectorize4(uint4 inn, uint64_t& x, uint64_t& y) +{ + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(x) : "r"(inn.x), "r"(inn.y)); + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(y) : "r"(inn.z), "r"(inn.w)); +} + + +static DEV_INLINE uint2 vectorizelow(uint32_t v) +{ + uint2 result; + result.x = v; + result.y = 0; + return result; +} +static DEV_INLINE uint2 vectorizehigh(uint32_t v) +{ + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +static DEV_INLINE uint2 operator^(uint2 a, uint32_t b) +{ + return make_uint2(a.x ^ b, a.y); +} +static DEV_INLINE uint2 operator^(uint2 a, uint2 b) +{ + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} +static DEV_INLINE uint2 operator&(uint2 a, uint2 b) +{ + return make_uint2(a.x & b.x, a.y & b.y); +} +static DEV_INLINE uint2 operator|(uint2 a, uint2 b) +{ + return make_uint2(a.x | b.x, a.y | b.y); +} +static DEV_INLINE uint2 operator~(uint2 a) +{ + return make_uint2(~a.x, ~a.y); +} +static DEV_INLINE void operator^=(uint2& a, uint2 b) +{ + a = a ^ b; +} +static DEV_INLINE uint2 operator+(uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +static DEV_INLINE uint2 operator+(uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static DEV_INLINE uint2 operator-(uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static DEV_INLINE uint2 operator-(uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; 
\n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + + +static DEV_INLINE uint4 operator^(uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static DEV_INLINE uint4 operator&(uint4 a, uint4 b) +{ + return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); +} +static DEV_INLINE uint4 operator|(uint4 a, uint4 b) +{ + return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); +} +static DEV_INLINE uint4 operator~(uint4 a) +{ + return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); +} +static DEV_INLINE void operator^=(uint4& a, uint4 b) +{ + a = a ^ b; +} +static DEV_INLINE uint4 operator^(uint4 a, uint2 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y); +} + + +static DEV_INLINE void operator+=(uint2& a, uint2 b) +{ + a = a + b; +} + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static DEV_INLINE uint2 operator*(uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +// uint2 method +#if __CUDA_ARCH__ >= 350 +DEV_INLINE uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + if (offset < 32) + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +DEV_INLINE uint2 ROR2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y >> (n)) | (v.x << (32 - n))); + result.x = ((v.x >> (n)) | (v.y << (32 - n))); + } + else + { + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); + } + return result; +} +#endif + + +DEV_INLINE uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +DEV_INLINE uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +DEV_INLINE uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +DEV_INLINE uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +DEV_INLINE uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +DEV_INLINE uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +DEV_INLINE uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +DEV_INLINE uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +DEV_INLINE uint2 ROL24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + + +#if __CUDA_ARCH__ >= 350 +__inline__ 
__device__ uint2 ROL2(const uint2 a, const int offset) +{ + uint2 result; + if (offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else + { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return result; +} +#endif + +DEV_INLINE uint64_t ROTR16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; " + : "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) + : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; " + : "=l"(x) + : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); + return x; +#else + return ROTR64(x, 16); +#endif +} +DEV_INLINE uint64_t ROTL16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; " + : "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) + : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; " + : "=l"(x) + : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); + return x; +#else + return ROTL64(x, 16); +#endif +} + +static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset < 32) + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(offset)); + } + else + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset <= 32) + { + a.y = (a.y << offset) | (a.x >> (32 - offset)); + a.x = (a.x << offset); + } + else + { + a.y = (a.x << (offset - 32)); + a.x = 0; + } + return a; +#endif +} +static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset < 32) + { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(offset)); + } + else + { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset <= 32) + { + a.x = (a.x >> offset) | (a.y << (32 - offset)); + a.y = (a.y >> offset); + } + else + { + a.x = (a.y >> (offset - 32)); + a.y = 0; + } + return a; +#endif +} + +static DEV_INLINE uint64_t devectorizeswap(uint2 v) +{ + return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x)); +} +static DEV_INLINE uint2 vectorizeswap(uint64_t v) +{ + uint2 result; + LOHI(result.y, result.x, v); + result.x = cuda_swab32(result.x); + result.y = cuda_swab32(result.y); + return result; +} + + +DEV_INLINE uint32_t devectorize16(ushort2 x) +{ + uint32_t result; + asm("mov.b32 %0,{%1,%2}; \n\t" : "=r"(result) : "h"(x.x), "h"(x.y)); + return result; +} + + +DEV_INLINE ushort2 vectorize16(uint32_t x) +{ + ushort2 result; + asm("mov.b32 {%0,%1},%2; \n\t" : "=h"(result.x), 
"=h"(result.y) : "r"(x)); + return result; +} + + +static DEV_INLINE uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) + : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static DEV_INLINE uint4 add4(uint4 a, uint4 b) +{ + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) + : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; +} + +static DEV_INLINE uint4 madd4(uint4 a, uint4 b) +{ + uint4 result; + asm("{\n\t" + "mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) + : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; +} + +static DEV_INLINE ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) +{ + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) + : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; +} +static DEV_INLINE void madd4long2(ulonglong2& a, ulonglong2 b) +{ + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) + : "l"(b.x), "l"(b.y)); +} + +DEV_INLINE uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t result; + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) + : "r"(a), "r"(b), "r"(c)); + return result; +} + +DEV_INLINE uint32_t shr_t32(uint32_t x, uint32_t n) +{ + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +DEV_INLINE uint32_t shl_t32(uint32_t x, uint32_t n) +{ + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +// device asm 32 for pluck +DEV_INLINE uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t result; + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) + : "r"(a), "r"(b), "r"(c)); + return result; +} + +DEV_INLINE uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits) +{ + uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits)); + return ret; +} + +DEV_INLINE uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) +{ + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} \ No newline at end of 
file diff --git a/plugins/cuda/karlsen-cuda-native/src/fishhash_cuda_kernel.cuh b/plugins/cuda/karlsen-cuda-native/src/fishhash_cuda_kernel.cuh
new file mode 100644
index 0000000..f077553
--- /dev/null
+++ b/plugins/cuda/karlsen-cuda-native/src/fishhash_cuda_kernel.cuh
@@ -0,0 +1,227 @@
+#include "fishhash_cuda_kernel.h"
+#include "keccak.cuh"
+
+#define FNV_PRIME 0x01000193
+
+//change these in #define
+//static int full_dataset_item_parents = 512;
+#define full_dataset_item_parents 512
+//static int num_dataset_accesses = 32;
+#define num_dataset_accesses 32
+//static int light_cache_rounds = 3;
+#define light_cache_rounds 3
+
+const int light_cache_num_items = 1179641;
+//#define light_cache_num_items 1179641
+const int full_dataset_num_items = 37748717;
+//#define full_dataset_num_items 37748717
+
+#define DEV_INLINE __device__ __forceinline__
+
+#define copy(dst, src, count) \
+    for (int i = 0; i != count; ++i) \
+    { \
+        (dst)[i] = (src)[i]; \
+    }
+
+
+static DEV_INLINE uint32_t fnv1(uint32_t u, uint32_t v) noexcept {
+    return (u * FNV_PRIME) ^ v;
+}
+
+DEV_INLINE hash512 fnv1(const hash512& u, const hash512& v) noexcept {
+    hash512 r;
+    for (size_t i = 0; i < sizeof(r) / sizeof(r.word32s[0]); ++i)
+        r.word32s[i] = fnv1(u.word32s[i], v.word32s[i]);
+    return r;
+}
+
+typedef struct item_state
+    {
+    const hash512* const cache;
+    const int64_t num_cache_items;
+    const uint32_t seed;
+
+    hash512 mix;
+
+    DEV_INLINE item_state(const fishhash_context& ctx, int64_t index) noexcept
+      : cache{ctx.light_cache},
+        num_cache_items{ctx.light_cache_num_items},
+        seed{static_cast<uint32_t>(index)} {
+        //printf("item_state debug 1 %p - %d", &cache, num_cache_items);
+        mix = cache[index % num_cache_items];
+        //printf("item_state debug 2");
+        mix.word32s[0] ^= seed;
+        //keccak(mix.word64s, 512, mix.bytes, 64);
+        //printf("item_state debug 3");
+        SHA3_512(mix.uint2s);
+    }
+
+    DEV_INLINE void update(uint32_t round) noexcept {
+        static constexpr size_t num_words = sizeof(mix) / sizeof(uint32_t);
+        const uint32_t t = fnv1(seed ^ round, mix.word32s[round % num_words]);
+        const int64_t parent_index = t % num_cache_items;
+        mix = fnv1(mix, cache[parent_index]);
+    }
+
+    DEV_INLINE hash512 final() noexcept {
+        //keccak(mix.word64s, 512, mix.bytes, 64);
+        SHA3_512(mix.uint2s);
+        return mix;
+    }
+    } item_state;
+
+
+
+DEV_INLINE hash1024 calculate_dataset_item_1024(const fishhash_context& ctx, uint32_t index) noexcept {
+    //printf("heavy_hash Thread %d, Block %d\n", threadIdx.x, blockIdx.x);
+    //printf("calculate_dataset_item_1024 debug 1");
+    item_state item0{ctx, int64_t(index) * 2};
+    //printf("calculate_dataset_item_1024 debug 2");
+    item_state item1{ctx, int64_t(index) * 2 + 1};
+
+    //printf("calculate_dataset_item_1024 debug 3");
+    for (uint32_t j = 0; j < full_dataset_item_parents; ++j) {
+        item0.update(j);
+        item1.update(j);
+    }
+
+    hash512 it0 = item0.final();
+    hash512 it1 = item1.final();
+
+    return hash1024{{it0, it1}};
+}
+
+DEV_INLINE hash1024 lookup(const fishhash_context& ctx, uint32_t index) {
+    if (ctx.full_dataset != NULL) {
+        //printf("lookup debug 1");
+        hash1024 * item = &ctx.full_dataset[index];
+
+        // Ability to handle lazy lookup
+        if (item->word64s[0] == 0) {
+            *item = calculate_dataset_item_1024(ctx, index);
+        }
+
+        return *item;
+    } else {
+        //printf("lookup debug 2");
+        return calculate_dataset_item_1024(ctx, index);
+    }
+}
+
+DEV_INLINE hash256 fishhash_kernel( const fishhash_context& ctx, const hash512& seed) noexcept {
+    //printf("fishhash_kernel debug 1");
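+    // Shape of the loop below: the 128-byte mix runs num_dataset_accesses (32)
+    // rounds; each round folds the mix into eight 32-bit words, derives three
+    // dataset indices p0/p1/p2 from them, fetches three 1024-bit items, merges
+    // fetch1 into the mix with fnv1 and fetch2 with xor, then recombines the
+    // lanes with a 64-bit multiply-add. The mix is finally collapsed to 32 bytes.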
+    const uint32_t index_limit = static_cast<uint32_t>(ctx.full_dataset_num_items);
+    //printf("fishhash_kernel debug 1.1");
+    //const uint32_t seed_init = seed.word32s[0];
+    //printf("fishhash_kernel debug 2");
+    hash1024 mix{seed, seed};
+    //printf("fishhash_kernel debug 3");
+    //printf("The index_limit is : %d \n", index_limit);
+    for (uint32_t i = 0; i < num_dataset_accesses; ++i) {
+
+        //printf("fishhash_kernel debug 4, %d", index_limit);
+        //printf("fishhash_kernel debug 4.1, %032x", mix.word32s[0]);
+        // Calculate new fetching indexes
+        //const uint32_t p0 = mix.word32s[0] % index_limit;
+        //printf("fishhash_kernel debug 4.2, %032x", mix.word32s[4]);
+        //const uint32_t p1 = mix.word32s[4] % index_limit;
+        //printf("fishhash_kernel debug 4.3, %032x", mix.word32s[8]);
+        //const uint32_t p2 = mix.word32s[8] % index_limit;
+
+        /*
+        const uint32_t p0 = mix.word32s[0] % index_limit;
+        const uint32_t p1 = mix.word32s[4] % index_limit;
+        const uint32_t p2 = mix.word32s[8] % index_limit;
+        */
+
+        uint32_t mixGroup[8];
+        for (uint32_t c=0; c<8; c++) {
+            mixGroup[c] = (mix.word32s[4*c + 0] ^ mix.word32s[4*c + 1] ^ mix.word32s[4*c + 2] ^ mix.word32s[4*c + 3]);
+        }
+
+        const uint32_t p0 = (mixGroup[0] ^ mixGroup[3] ^ mixGroup[6]) % index_limit;
+        const uint32_t p1 = (mixGroup[1] ^ mixGroup[4] ^ mixGroup[7]) % index_limit;
+        const uint32_t p2 = (mixGroup[2] ^ mixGroup[5] ^ i) % index_limit;
+
+
+        //printf("fishhash_kernel debug 5");
+        hash1024 fetch0 = lookup(ctx, p0);
+        hash1024 fetch1 = lookup(ctx, p1);
+        hash1024 fetch2 = lookup(ctx, p2);
+
+        //printf("fishhash_kernel debug 6");
+        // Modify fetch1 and fetch2
+        for (size_t j = 0; j < 32; ++j) {
+            fetch1.word32s[j] = fnv1(mix.word32s[j], fetch1.word32s[j]);
+            fetch2.word32s[j] = mix.word32s[j] ^ fetch2.word32s[j];
+        }
+
+        //printf("fishhash_kernel debug 7");
+        // Final computation of new mix
+        for (size_t j = 0; j < 16; ++j)
+            mix.word64s[j] = fetch0.word64s[j] * fetch1.word64s[j] + fetch2.word64s[j];
+    }
+
+    //printf("fishhash_kernel debug 8");
+    // Collapse the result into 32 bytes
+    hash256 mix_hash;
+    static constexpr size_t num_words = sizeof(mix) / sizeof(uint32_t);
+    //printf("fishhash_kernel debug 9");
+    for (size_t i = 0; i < num_words; i += 4) {
+        const uint32_t h1 = fnv1(mix.word32s[i], mix.word32s[i + 1]);
+        const uint32_t h2 = fnv1(h1, mix.word32s[i + 2]);
+        const uint32_t h3 = fnv1(h2, mix.word32s[i + 3]);
+        mix_hash.word32s[i / 4] = h3;
+    }
+
+    //printf("fishhash_kernel debug 10");
+    return mix_hash;
+    }
+
+DEV_INLINE void printHash(const char* msg, const uint8_t* hash, int size) {
+    printf("%s", msg);
+    for(int i = 0; i < size; i++) {
+        //printf("%02x", output[i]);
+        printf("%02x", hash[i]);
+    }
+    printf("\n");
+    }
+
+//DEV_INLINE void hashFish(uint8_t * output, const fishhash_context * ctx, const uint8_t * header, uint64_t header_size) noexcept {
+DEV_INLINE void hashFish(
+    const fishhash_context * ctx,
+    uint8_t* out,
+    const uint8_t* in) {
+    hash512 seed;
+    //*seed.bytes = *in;
+    memset(seed.bytes, 0, 64);
+    memcpy(seed.bytes, in, 32);
+    /*
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        printHash("hashFish-1 in is : ", in, 32);
+        printHash("hashFish-1 in is : ", seed.bytes, 32);
+    }
+    */
+    //printf("hashFish debug 1");
+    const hash256 mix_hash = fishhash_kernel(*ctx, seed);
+    //*out = *mix_hash.bytes;
+
+    memcpy(out, mix_hash.bytes, 32);
+    /*
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        printHash("hashFish-2 in is : ", mix_hash.bytes, 32);
+        printHash("hashFish-2 in is : ", out, 32);
+    }
+    */
+    }
+
+
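+// hashFish (used by heavy_hash below): the 32-byte input is zero-padded into a
+// 64-byte hash512 seed, run through fishhash_kernel against the DAG, and the
+// resulting 32-byte digest is copied out.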
+DEV_INLINE hash512 bitwise_xor(const hash512& x, const hash512& y) noexcept
+    {
+    hash512 z;
+    for (size_t i = 0; i < sizeof(z) / sizeof(z.word64s[0]); ++i)
+        z.word64s[i] = x.word64s[i] ^ y.word64s[i];
+    return z;
+    }
+
diff --git a/plugins/cuda/karlsen-cuda-native/src/fishhash_cuda_kernel.h b/plugins/cuda/karlsen-cuda-native/src/fishhash_cuda_kernel.h
new file mode 100644
index 0000000..9dbb03b
--- /dev/null
+++ b/plugins/cuda/karlsen-cuda-native/src/fishhash_cuda_kernel.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <stdint.h>
+
+typedef union {
+    uint64_t word64s[4];
+    uint32_t word32s[8];
+    uint8_t bytes[32];
+    char str[32];
+    uint2 uint2s[4];
+} hash256;
+
+typedef union {
+    uint64_t word64s[8];
+    uint32_t word32s[16];
+    uint8_t bytes[64];
+    char str[64];
+    uint2 uint2s[8];
+} hash512;
+
+typedef union {
+    //union hash512 hash512s[2];
+    hash512 hash512s[2];
+    uint64_t word64s[16];
+    uint32_t word32s[32];
+    uint8_t bytes[128];
+    char str[128];
+    uint2 uint2s[16];
+} hash1024;
+
+typedef struct {
+    const int light_cache_num_items;
+    //hash512* const light_cache;
+    hash512* light_cache;
+    const int full_dataset_num_items;
+    hash1024* full_dataset;
+} fishhash_context;
+
+
+#define CUDA_SAFE_CALL(call) \
+    do \
+    { \
+        cudaError_t err = call; \
+        if (cudaSuccess != err) \
+        { \
+            std::stringstream ss; \
+            ss << "CUDA error in func " << __FUNCTION__ << " at line " << __LINE__ << ' ' \
+               << cudaGetErrorString(err); \
+            throw cuda_runtime_error(ss.str()); \
+        } \
+    } while (0)
\ No newline at end of file
diff --git a/plugins/cuda/karlsen-cuda-native/src/fnv.cuh b/plugins/cuda/karlsen-cuda-native/src/fnv.cuh
new file mode 100644
index 0000000..6526521
--- /dev/null
+++ b/plugins/cuda/karlsen-cuda-native/src/fnv.cuh
@@ -0,0 +1,18 @@
+#define FNV_PRIME 0x01000193
+
+#define fnv(x, y) ((x)*FNV_PRIME ^ (y))
+
+DEV_INLINE uint4 fnv4(uint4 a, uint4 b)
+{
+    uint4 c;
+    c.x = a.x * FNV_PRIME ^ b.x;
+    c.y = a.y * FNV_PRIME ^ b.y;
+    c.z = a.z * FNV_PRIME ^ b.z;
+    c.w = a.w * FNV_PRIME ^ b.w;
+    return c;
+}
+
+DEV_INLINE uint32_t fnv_reduce(uint4 v)
+{
+    return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
\ No newline at end of file
diff --git a/plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu b/plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu
new file mode 100644
index 0000000..298f088
--- /dev/null
+++ b/plugins/cuda/karlsen-cuda-native/src/karlsen-cuda.cu
@@ -0,0 +1,149 @@
+#include <stdint.h>
+#include <assert.h>
+#include "keccak-tiny.c"
+#include "xoshiro256starstar.c"
+
+
+
+typedef uint8_t Hash[32];
+
+typedef union _uint256_t {
+    uint64_t number[4];
+    uint8_t hash[32];
+} uint256_t;
+
+#define BLOCKDIM 1024
+#define MATRIX_SIZE 64
+#define HALF_MATRIX_SIZE 32
+#define QUARTER_MATRIX_SIZE 16
+#define HASH_HEADER_SIZE 72
+
+#define RANDOM_LEAN 0
+#define RANDOM_XOSHIRO 1
+
+#define LT_U256(X,Y) (X.number[3] != Y.number[3] ? X.number[3] < Y.number[3] : X.number[2] != Y.number[2] ? X.number[2] < Y.number[2] : X.number[1] != Y.number[1] ? X.number[1] < Y.number[1] : X.number[0] < Y.number[0])
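+// LT_U256 compares two 256-bit numbers stored as four little-endian 64-bit limbs,
+// scanning from the most-significant limb down and deciding at the first limb
+// that differs.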
+
+__constant__ uint8_t matrix[MATRIX_SIZE][MATRIX_SIZE];
+__constant__ uint8_t hash_header[HASH_HEADER_SIZE];
+__constant__ uint256_t target;
+//__constant__ static const uint8_t powP[Plen] = { 0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02 };
+//__constant__ static const uint8_t heavyP[Plen] = { 0x09, 0x85, 0x24, 0xb2, 0x52, 0x4c, 0xd7, 0x3a, 0x16, 0x42, 0x9f, 0x2f, 0x0e, 0x9b, 0x62, 0x79, 0xee, 0xf8, 0xc7, 0x16, 0x48, 0xff, 0x14, 0x7a, 0x98, 0x64, 0x05, 0x80, 0x4c, 0x5f, 0xa7, 0x11, 0xda, 0xce, 0xee, 0x44, 0xdf, 0xe0, 0x20, 0xe7, 0x69, 0x40, 0xf3, 0x14, 0x2e, 0xd8, 0xc7, 0x72, 0xba, 0x35, 0x89, 0x93, 0x2a, 0xff, 0x00, 0xc1, 0x62, 0xc4, 0x0f, 0x25, 0x40, 0x90, 0x21, 0x5e, 0x48, 0x6a, 0xcf, 0x0d, 0xa6, 0xf9, 0x39, 0x80, 0x0c, 0x3d, 0x2a, 0x79, 0x9f, 0xaa, 0xbc, 0xa0, 0x26, 0xa2, 0xa9, 0xd0, 0x5d, 0xc0, 0x31, 0xf4, 0x3f, 0x8c, 0xc1, 0x54, 0xc3, 0x4c, 0x1f, 0xd3, 0x3d, 0xcc, 0x69, 0xa7, 0x01, 0x7d, 0x6b, 0x6c, 0xe4, 0x93, 0x24, 0x56, 0xd3, 0x5b, 0xc6, 0x2e, 0x44, 0xb0, 0xcd, 0x99, 0x3a, 0x4b, 0xf7, 0x4e, 0xb0, 0xf2, 0x34, 0x54, 0x83, 0x86, 0x4c, 0x77, 0x16, 0x94, 0xbc, 0x36, 0xb0, 0x61, 0xe9, 0x07, 0x07, 0xcc, 0x65, 0x77, 0xb1, 0x1d, 0x8f, 0x7e, 0x39, 0x6d, 0xc4, 0xba, 0x80, 0xdb, 0x8f, 0xea, 0x58, 0xca, 0x34, 0x7b, 0xd3, 0xf2, 0x92, 0xb9, 0x57, 0xb9, 0x81, 0x84, 0x04, 0xc5, 0x76, 0xc7, 0x2e, 0xc2, 0x12, 0x51, 0x67, 0x9f, 0xc3, 0x47, 0x0a, 0x0c, 0x29, 0xb5, 0x9d, 0x39, 0xbb, 0x92, 0x15, 0xc6, 0x9f, 0x2f, 0x31, 0xe0, 0x9a, 0x54, 0x35, 0xda, 0xb9, 0x10, 0x7d, 0x32, 0x19, 0x16 };
+
+ __shared__ hash512* light_cache;
+ __shared__ hash1024* full_dataset;
+
+__shared__ uint8_t cache_test[10];
+
+__device__ __inline__ void amul4bit(uint32_t packed_vec1[32], uint32_t packed_vec2[32], uint32_t *ret) {
+    // We assume each 32 bits have four values: A0 B0 C0 D0
+    unsigned int res = 0;
+    #if __CUDA_ARCH__ < 610
+    char4 *a4 = (char4*)packed_vec1;
+    char4 *b4 = (char4*)packed_vec2;
+    #endif
+    #pragma unroll
+    for (int i=0; i<QUARTER_MATRIX_SIZE; i++) {
+        #if __CUDA_ARCH__ >= 610
+        // __dp4a: 4-way 8-bit dot product accumulated into a 32-bit integer (sm_61+)
+        res = __dp4a(packed_vec1[i], packed_vec2[i], res);
+        #else
+        res += a4[i].x*b4[i].x;
+        res += a4[i].y*b4[i].y;
+        res += a4[i].z*b4[i].z;
+        res += a4[i].w*b4[i].w;
+        #endif
+    }
+
+    *ret = res;
+}
+
+extern "C" {
+
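+    // Per-nonce pipeline in heavy_hash below: draw a candidate nonce (single-seed
+    // xor for RANDOM_LEAN, xoshiro256** otherwise), apply mask/fixed bits, then
+    // blake3(header || nonce) -> FishHash over the dataset -> blake3 again, and
+    // publish the first nonce whose hash beats the target via atomicCAS.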
+    __global__ void heavy_hash(
+        const uint64_t nonce_mask,
+        const uint64_t nonce_fixed,
+        const uint64_t nonces_len,
+        uint8_t random_type,
+        void* states,
+        uint64_t *final_nonce,
+        hash1024* dataset,
+        hash512* cache
+        ) {
+
+        // assuming header_len is 72
+        /*
+        if (threadIdx.x == 0 && blockIdx.x == 0) {
+            printf("heavy_hash Thread %d, Block %d\n", threadIdx.x, blockIdx.x);
+            printHash("The cache[10] is : ", cache[10].bytes, 128);
+            printHash("The cache[42] is : ", cache[42].bytes, 128);
+            printHash("The dataset[10] is : ", dataset[10].bytes, 128);
+            printHash("The dataset[42] is : ", dataset[42].bytes, 128);
+            printHash("The dataset[12345] is : ", dataset[12345].bytes, 128);
+        }
+        */
+
+
+        int nonceId = threadIdx.x + blockIdx.x*blockDim.x;
+        if (nonceId < nonces_len) {
+            if (nonceId == 0) *final_nonce = 0;
+            uint64_t nonce;
+            switch (random_type) {
+                case RANDOM_LEAN:
+                    nonce = ((uint64_t *)states)[0] ^ nonceId;
+                    break;
+                case RANDOM_XOSHIRO:
+                default:
+                    nonce = xoshiro256_next(((ulonglong4 *)states) + nonceId);
+                    break;
+            }
+            nonce = (nonce & nonce_mask) | nonce_fixed;
+            // header
+            uint8_t input[80];
+            memcpy(input, hash_header, HASH_HEADER_SIZE);
+            // data
+            // TODO: check endianity?
+            uint256_t hash_;
+            memcpy(input + HASH_HEADER_SIZE, (uint8_t *)(&nonce), 8);
+            hashB3(hash_.hash, input, 80);
+
+            /*
+            if (threadIdx.x == 0 && blockIdx.x == 0) {
+                printHash("hashb3-1 is : ", hash_.hash, 32);
+            }
+            */
+
+            fishhash_context ctx {
+                light_cache_num_items,
+                cache,
+                full_dataset_num_items,
+                dataset
+            };
+
+            memset(input, 0, 80);
+            memcpy(input, hash_.hash, 32);
+            hashFish(&ctx, hash_.hash, input);
+
+            /*
+            if (threadIdx.x == 0 && blockIdx.x == 0) {
+                printHash("hashFish is : ", hash_.hash, 32);
+            }
+            */
+
+            memset(input, 0, 80);
+            memcpy(input, hash_.hash, 32);
+            hashB3(hash_.hash, input, 32);
+
+            /*
+            if (threadIdx.x == 0 && blockIdx.x == 0) {
+                printHash("hashb3-2 is : ", hash_.hash, 32);
+            }
+            */
+
+
+            if (LT_U256(hash_, target)){
+                atomicCAS((unsigned long long int*) final_nonce, 0, (unsigned long long int) nonce);
+            }
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/plugins/cuda/kaspa-cuda-native/src/keccak-tiny-unrolled.c b/plugins/cuda/karlsen-cuda-native/src/keccak-tiny-unrolled.c
similarity index 100%
rename from plugins/cuda/kaspa-cuda-native/src/keccak-tiny-unrolled.c
rename to plugins/cuda/karlsen-cuda-native/src/keccak-tiny-unrolled.c
diff --git a/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c b/plugins/cuda/karlsen-cuda-native/src/keccak-tiny.c
similarity index 86%
rename from plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c
rename to plugins/cuda/karlsen-cuda-native/src/keccak-tiny.c
index fd05241..ffed568 100644
--- a/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c
+++ b/plugins/cuda/karlsen-cuda-native/src/keccak-tiny.c
@@ -1,122 +1,138 @@
-/** libkeccak-tiny
- *
- * A single-file implementation of SHA-3 and SHAKE.
- *
- * Implementor: David Leon Gil
- * License: CC0, attribution kindly requested. Blame taken too,
- * but not liability.
- */
-#define __STDC_WANT_LIB_EXT1__ 1
-#include "blake3_compact.h"
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-/******** The Keccak-f[1600] permutation ********/
-
-/*** Constants. ***/
-__device__ static const uint8_t rho[24] = \
-    { 1, 3, 6, 10, 15, 21,
-      28, 36, 45, 55, 2, 14,
-      27, 41, 56, 8, 25, 43,
-      62, 18, 39, 61, 20, 44};
-__device__ static const uint8_t pi[24] = \
-    {10, 7, 11, 17, 18, 3,
-      5, 16, 8, 21, 24, 4,
-      15, 23, 19, 13, 12, 2,
-      20, 14, 22, 9, 6, 1};
-__device__ static const uint64_t RC[24] = \
-    {1ULL, 0x8082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
-     0x808bULL, 0x80000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
-     0x8aULL, 0x88ULL, 0x80008009ULL, 0x8000000aULL,
-     0x8000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
-     0x8000000000008002ULL, 0x8000000000000080ULL, 0x800aULL, 0x800000008000000aULL,
-     0x8000000080008081ULL, 0x8000000000008080ULL, 0x80000001ULL, 0x8000000080008008ULL};
-
-/*** Helper macros to unroll the permutation. ***/
-#define rol(x, s) (((x) << s) | ((x) >> (64 - s)))
-#define REPEAT6(e) e e e e e e
-#define REPEAT24(e) REPEAT6(e e e e)
-#define REPEAT5(e) e e e e e
-#define FOR5(v, s, e) \
-    v = 0; \
-    REPEAT5(e; v += s;)
-
-/*** Keccak-f[1600] ***/
-__device__ static inline void keccakf(void* state) {
-    uint64_t* a = (uint64_t*)state;
-    uint64_t b[5] = {0};
-    uint64_t t = 0;
-    uint8_t x, y;
-
-    for (int i = 0; i < 24; i++) {
-        // Theta
-        FOR5(x, 1,
-             b[x] = 0;
-             FOR5(y, 5,
-                  b[x] ^= a[x + y]; ))
-        FOR5(x, 1,
-             FOR5(y, 5,
-                  a[y + x] ^= b[(x + 4) % 5] ^ rol(b[(x + 1) % 5], 1); ))
-        // Rho and pi
-        t = a[1];
-        x = 0;
-        REPEAT24(b[0] = a[pi[x]];
-                 a[pi[x]] = rol(t, rho[x]);
-                 t = b[0];
-                 x++; )
-        // Chi
-        FOR5(y,
-             5,
-             FOR5(x, 1,
-                  b[x] = a[y + x];)
-             FOR5(x, 1,
-                  a[y + x] = b[x] ^ ((~b[(x + 1) % 5]) & b[(x + 2) % 5]); ))
-        // Iota
-        a[0] ^= RC[i];
-    }
-}
-
-/******** The FIPS202-defined functions. ********/
-
-/*** Some helper macros. ***/
-#define P keccakf
-#define Plen 200
-
-
-/** The sponge-based hash construction. **/
-__device__ __forceinline__ static void hash(
-    const uint8_t initP[Plen],
-    uint8_t* out,
-    const uint8_t* in) {
-    uint8_t a[Plen] = {0};
-
-    #pragma unroll
-    for (int i=0; i<10; i++) ((uint64_t *)a)[i] = ((uint64_t *)initP)[i] ^ ((uint64_t *)in)[i];
-    #pragma unroll
-    for (int i=10; i<25; i++) ((uint64_t *)a)[i] = ((uint64_t *)initP)[i];
-
-    // Apply P
-    P(a);
-    // Squeeze output.
-    #pragma unroll
-    for (int i=0; i<4; i++) ((uint64_t *)out)[i] = ((uint64_t *)a)[i];
-
-}
-
-/** The sponge-based hash construction. **/
-__device__ __forceinline__ static void hashB3(
-    const uint8_t initP[Plen],
-    uint8_t* out,
-    const uint8_t* in) {
-
-    blake3_hasher hasher;
-    blake3_hasher_init(&hasher);
-    blake3_hasher_update(&hasher, in, 80);
-    blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN);
-
-
-}
-
+/** libkeccak-tiny
+ *
+ * A single-file implementation of SHA-3 and SHAKE.
+ *
+ * Implementor: David Leon Gil
+ * License: CC0, attribution kindly requested. Blame taken too,
+ * but not liability.
+ */
+#define __STDC_WANT_LIB_EXT1__ 1
+#include "blake3_compact.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/******** The Keccak-f[1600] permutation ********/
+
+/*** Constants. 
***/ +__device__ static const uint8_t rho[24] = \ + { 1, 3, 6, 10, 15, 21, + 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, + 62, 18, 39, 61, 20, 44}; +__device__ static const uint8_t pi[24] = \ + {10, 7, 11, 17, 18, 3, + 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, + 20, 14, 22, 9, 6, 1}; +__device__ static const uint64_t RC[24] = \ + {1ULL, 0x8082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, + 0x808bULL, 0x80000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x8aULL, 0x88ULL, 0x80008009ULL, 0x8000000aULL, + 0x8000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, 0x800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, 0x80000001ULL, 0x8000000080008008ULL}; + +/*** Helper macros to unroll the permutation. ***/ +#define rol(x, s) (((x) << s) | ((x) >> (64 - s))) +#define REPEAT6(e) e e e e e e +#define REPEAT24(e) REPEAT6(e e e e) +#define REPEAT5(e) e e e e e +#define FOR5(v, s, e) \ + v = 0; \ + REPEAT5(e; v += s;) + +/*** Keccak-f[1600] ***/ +__device__ static inline void keccakf(void* state) { + uint64_t* a = (uint64_t*)state; + uint64_t b[5] = {0}; + uint64_t t = 0; + uint8_t x, y; + + for (int i = 0; i < 24; i++) { + // Theta + FOR5(x, 1, + b[x] = 0; + FOR5(y, 5, + b[x] ^= a[x + y]; )) + FOR5(x, 1, + FOR5(y, 5, + a[y + x] ^= b[(x + 4) % 5] ^ rol(b[(x + 1) % 5], 1); )) + // Rho and pi + t = a[1]; + x = 0; + REPEAT24(b[0] = a[pi[x]]; + a[pi[x]] = rol(t, rho[x]); + t = b[0]; + x++; ) + // Chi + FOR5(y, + 5, + FOR5(x, 1, + b[x] = a[y + x];) + FOR5(x, 1, + a[y + x] = b[x] ^ ((~b[(x + 1) % 5]) & b[(x + 2) % 5]); )) + // Iota + a[0] ^= RC[i]; + } +} + +/******** The FIPS202-defined functions. ********/ + +/*** Some helper macros. ***/ +#define P keccakf +#define Plen 200 + + +/** The sponge-based hash construction. **/ +__device__ __forceinline__ static void hash( + const uint8_t initP[Plen], + uint8_t* out, + const uint8_t* in) { + uint8_t a[Plen] = {0}; + + #pragma unroll + for (int i=0; i<10; i++) ((uint64_t *)a)[i] = ((uint64_t *)initP)[i] ^ ((uint64_t *)in)[i]; + #pragma unroll + for (int i=10; i<25; i++) ((uint64_t *)a)[i] = ((uint64_t *)initP)[i]; + + // Apply P + P(a); + // Squeeze output. + #pragma unroll + for (int i=0; i<4; i++) ((uint64_t *)out)[i] = ((uint64_t *)a)[i]; + +} + +/** The sponge-based hash construction. **/ +__device__ __forceinline__ static void hashK( + uint8_t* out, + const uint8_t* in) { + + uint8_t a[Plen] = {0}; + + #pragma unroll + for (int i=0; i<10; i++) ((uint64_t *)a)[i] = ((uint64_t *)in)[i]; + // Apply P + P(a); + // Squeeze output. + #pragma unroll + for (int i=0; i<4; i++) ((uint64_t *)out)[i] = ((uint64_t *)a)[i]; +} + +/** The sponge-based hash construction. 
**/ +__device__ __forceinline__ static void hashB3( + uint8_t* out, + const uint8_t* in, + size_t len) { + + blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, in, len); + blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN); + + +} + diff --git a/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.h b/plugins/cuda/karlsen-cuda-native/src/keccak-tiny.h similarity index 100% rename from plugins/cuda/kaspa-cuda-native/src/keccak-tiny.h rename to plugins/cuda/karlsen-cuda-native/src/keccak-tiny.h diff --git a/plugins/cuda/karlsen-cuda-native/src/keccak.cuh b/plugins/cuda/karlsen-cuda-native/src/keccak.cuh new file mode 100644 index 0000000..a710b8f --- /dev/null +++ b/plugins/cuda/karlsen-cuda-native/src/keccak.cuh @@ -0,0 +1,260 @@ +#include "cuda_helper.h" + +__device__ __constant__ uint2 const keccak_round_constants[24] = { + { 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 }, + { 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 }, + { 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 }, + { 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 }, + { 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 }, + { 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 } +}; + +DEV_INLINE uint2 xor5( + const uint2 a, const uint2 b, const uint2 c, const uint2 d, const uint2 e) +{ +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint2 result; + asm volatile ( + "// xor5\n\t" + "lop3.b32 %0, %2, %3, %4, 0x96;\n\t" + "lop3.b32 %0, %0, %5, %6, 0x96;\n\t" + "lop3.b32 %1, %7, %8, %9, 0x96;\n\t" + "lop3.b32 %1, %1, %10, %11, 0x96;" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(b.x), "r"(c.x),"r"(d.x),"r"(e.x), + "r"(a.y), "r"(b.y), "r"(c.y),"r"(d.y),"r"(e.y)); + return result; +#else + return a ^ b ^ c ^ d ^ e; +#endif +} + +DEV_INLINE uint2 xor3(const uint2 a, const uint2 b, const uint2 c) +{ +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint2 result; + asm volatile ( + "// xor3\n\t" + "lop3.b32 %0, %2, %3, %4, 0x96;\n\t" + "lop3.b32 %1, %5, %6, %7, 0x96;" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(b.x), "r"(c.x), "r"(a.y), "r"(b.y), "r"(c.y)); + return result; +#else + return a ^ b ^ c; +#endif +} + +DEV_INLINE uint2 chi(const uint2 a, const uint2 b, const uint2 c) +{ +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint2 result; + asm volatile ( + "// chi\n\t" + "lop3.b32 %0, %2, %3, %4, 0xD2;\n\t" + "lop3.b32 %1, %5, %6, %7, 0xD2;" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(b.x), "r"(c.x), // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + "r"(a.y), "r"(b.y), "r"(c.y)); // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + return result; +#else + return a ^ (~b) & c; +#endif +} + +#if (__CUDA_ARCH__ >= 320) +#define LDG(x) __ldg(&(x)) +#else +#define LDG(x) (x) +#endif + +DEV_INLINE void SHA3_512(uint2* s) +{ + uint2 t[5], u, v; + + for (uint32_t i = 8; i < 25; i++) + { + s[i] = make_uint2(0, 0); + } + s[8].x = 1; + s[8].y = 0x80000000; + + for (int i = 0; i < 23; i++) + { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = t[4] ^ ROL2(t[1], 1); + s[0] ^= u; + s[5] ^= u; + s[10] ^= u; + s[15] ^= u; + s[20] ^= u; + + u = t[0] ^ ROL2(t[2], 1); + s[1] ^= u; + s[6] ^= u; + s[11] ^= u; + s[16] ^= u; + s[21] ^= u; + + u = t[1] ^ ROL2(t[3], 1); + s[2] ^= u; + s[7] ^= u; + s[12] ^= u; + s[17] ^= u; + s[22] ^= u; + + u = t[2] ^ ROL2(t[4], 1); + s[3] ^= u; + s[8] ^= u; + s[13] ^= u; + s[18] ^= u; + s[23] ^= u; + + u = t[3] ^ ROL2(t[0], 1); + s[4] ^= u; + s[9] ^= u; + s[14] ^= u; + s[19] ^= u; + s[24] ^= u; + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(u, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + u = s[0]; + v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + + u = s[5]; + v = s[6]; + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + s[8] = chi(s[8], s[9], u); + s[9] = chi(s[9], u, v); + + u = s[10]; + v = s[11]; + s[10] = chi(s[10], s[11], s[12]); + s[11] = chi(s[11], s[12], s[13]); + s[12] = chi(s[12], s[13], s[14]); + s[13] = chi(s[13], s[14], u); + s[14] = chi(s[14], u, v); + + u = s[15]; + v = s[16]; + s[15] = chi(s[15], s[16], s[17]); + s[16] = chi(s[16], s[17], s[18]); + s[17] = chi(s[17], s[18], s[19]); + s[18] = chi(s[18], s[19], u); + s[19] = chi(s[19], u, v); + + u = s[20]; + v = s[21]; + s[20] = chi(s[20], s[21], s[22]); + s[21] = chi(s[21], s[22], s[23]); + s[22] = chi(s[22], s[23], s[24]); + s[23] = chi(s[23], s[24], u); + s[24] = chi(s[24], u, v); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= LDG(keccak_round_constants[i]); + } + + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = t[4] ^ ROL2(t[1], 1); + s[0] ^= u; + s[10] ^= u; + + u = t[0] ^ ROL2(t[2], 1); + s[6] ^= u; + s[16] ^= u; + + u = t[1] ^ ROL2(t[3], 1); + s[12] ^= u; + s[22] ^= u; + + u = t[2] ^ ROL2(t[4], 1); + s[3] ^= u; + s[18] ^= u; + + u = t[3] ^ ROL2(t[0], 1); + s[9] ^= u; + s[24] ^= u; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[2] = ROL2(s[12], 43); + s[4] = ROL2(s[24], 14); + s[8] = ROL2(s[16], 45); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[7] = ROL2(s[10], 3); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + + u = s[0]; + v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= LDG(keccak_round_constants[23]); +} \ No newline at end of file diff --git a/plugins/cuda/kaspa-cuda-native/src/xoshiro256starstar.c b/plugins/cuda/karlsen-cuda-native/src/xoshiro256starstar.c similarity index 100% rename from plugins/cuda/kaspa-cuda-native/src/xoshiro256starstar.c rename to plugins/cuda/karlsen-cuda-native/src/xoshiro256starstar.c diff --git a/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu b/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu deleted file mode 100644 index fa703ab..0000000 --- a/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include "keccak-tiny.c" -#include "xoshiro256starstar.c" - - - -typedef uint8_t Hash[32]; - -typedef union _uint256_t { - uint64_t number[4]; - uint8_t hash[32]; -} uint256_t; - -#define BLOCKDIM 1024 -#define MATRIX_SIZE 64 -#define HALF_MATRIX_SIZE 32 -#define QUARTER_MATRIX_SIZE 16 -#define HASH_HEADER_SIZE 72 - -#define RANDOM_LEAN 0 -#define RANDOM_XOSHIRO 1 - -#define LT_U256(X,Y) (X.number[3] != Y.number[3] ? X.number[3] < Y.number[3] : X.number[2] != Y.number[2] ? X.number[2] < Y.number[2] : X.number[1] != Y.number[1] ? X.number[1] < Y.number[1] : X.number[0] < Y.number[0]) - -__constant__ uint8_t matrix[MATRIX_SIZE][MATRIX_SIZE]; -__constant__ uint8_t hash_header[HASH_HEADER_SIZE]; -__constant__ uint256_t target; -__constant__ static const uint8_t powP[Plen] = { 0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02 }; -__constant__ static const uint8_t heavyP[Plen] = { 0x09, 0x85, 0x24, 0xb2, 0x52, 0x4c, 0xd7, 0x3a, 0x16, 0x42, 0x9f, 0x2f, 0x0e, 0x9b, 0x62, 0x79, 0xee, 0xf8, 0xc7, 0x16, 0x48, 0xff, 0x14, 0x7a, 0x98, 0x64, 0x05, 0x80, 0x4c, 0x5f, 0xa7, 0x11, 0xda, 0xce, 0xee, 0x44, 0xdf, 0xe0, 0x20, 0xe7, 0x69, 0x40, 0xf3, 0x14, 0x2e, 0xd8, 
0xc7, 0x72, 0xba, 0x35, 0x89, 0x93, 0x2a, 0xff, 0x00, 0xc1, 0x62, 0xc4, 0x0f, 0x25, 0x40, 0x90, 0x21, 0x5e, 0x48, 0x6a, 0xcf, 0x0d, 0xa6, 0xf9, 0x39, 0x80, 0x0c, 0x3d, 0x2a, 0x79, 0x9f, 0xaa, 0xbc, 0xa0, 0x26, 0xa2, 0xa9, 0xd0, 0x5d, 0xc0, 0x31, 0xf4, 0x3f, 0x8c, 0xc1, 0x54, 0xc3, 0x4c, 0x1f, 0xd3, 0x3d, 0xcc, 0x69, 0xa7, 0x01, 0x7d, 0x6b, 0x6c, 0xe4, 0x93, 0x24, 0x56, 0xd3, 0x5b, 0xc6, 0x2e, 0x44, 0xb0, 0xcd, 0x99, 0x3a, 0x4b, 0xf7, 0x4e, 0xb0, 0xf2, 0x34, 0x54, 0x83, 0x86, 0x4c, 0x77, 0x16, 0x94, 0xbc, 0x36, 0xb0, 0x61, 0xe9, 0x07, 0x07, 0xcc, 0x65, 0x77, 0xb1, 0x1d, 0x8f, 0x7e, 0x39, 0x6d, 0xc4, 0xba, 0x80, 0xdb, 0x8f, 0xea, 0x58, 0xca, 0x34, 0x7b, 0xd3, 0xf2, 0x92, 0xb9, 0x57, 0xb9, 0x81, 0x84, 0x04, 0xc5, 0x76, 0xc7, 0x2e, 0xc2, 0x12, 0x51, 0x67, 0x9f, 0xc3, 0x47, 0x0a, 0x0c, 0x29, 0xb5, 0x9d, 0x39, 0xbb, 0x92, 0x15, 0xc6, 0x9f, 0x2f, 0x31, 0xe0, 0x9a, 0x54, 0x35, 0xda, 0xb9, 0x10, 0x7d, 0x32, 0x19, 0x16 }; - -__device__ __inline__ void amul4bit(uint32_t packed_vec1[32], uint32_t packed_vec2[32], uint32_t *ret) { - // We assume each 32 bits have four values: A0 B0 C0 D0 - unsigned int res = 0; - #if __CUDA_ARCH__ < 610 - char4 *a4 = (char4*)packed_vec1; - char4 *b4 = (char4*)packed_vec2; - #endif - #pragma unroll - for (int i=0; i= 610 - res = __dp4a(packed_vec1[i], packed_vec2[i], res); - #else - res += a4[i].x*b4[i].x; - res += a4[i].y*b4[i].y; - res += a4[i].z*b4[i].z; - res += a4[i].w*b4[i].w; - #endif - } - - *ret = res; -} - - -extern "C" { - - - __global__ void heavy_hash(const uint64_t nonce_mask, const uint64_t nonce_fixed, const uint64_t nonces_len, uint8_t random_type, void* states, uint64_t *final_nonce) { - // assuming header_len is 72 - int nonceId = threadIdx.x + blockIdx.x*blockDim.x; - if (nonceId < nonces_len) { - if (nonceId == 0) *final_nonce = 0; - uint64_t nonce; - switch (random_type) { - case RANDOM_LEAN: - nonce = ((uint64_t *)states)[0] ^ nonceId; - break; - case RANDOM_XOSHIRO: - default: - nonce = xoshiro256_next(((ulonglong4 *)states) + nonceId); - break; - } - nonce = (nonce & nonce_mask) | nonce_fixed; - // header - uint8_t input[80]; - memcpy(input, hash_header, HASH_HEADER_SIZE); - // data - // TODO: check endianity? 
- uint256_t hash_; - memcpy(input + HASH_HEADER_SIZE, (uint8_t *)(&nonce), 8); - hashB3(powP, hash_.hash, input); - - //assert((rowId != 0) || (hashId != 0) ); - uchar4 packed_hash[QUARTER_MATRIX_SIZE] = {0}; - #pragma unroll - for (int i=0; i> 4 , - (hash_.hash[2*i] & 0x0F), - (hash_.hash[2*i+1] & 0xF0) >> 4, - (hash_.hash[2*i+1] & 0x0F) - ); - } - uint32_t product1, product2; - #pragma unroll - for (int rowId=0; rowId>= 6; - product1 &= 0xF0; - product2 >>= 10; - #if __CUDA_ARCH__ < 500 || __CUDA_ARCH__ > 700 - hash_.hash[rowId] = hash_.hash[rowId] ^ ((uint8_t)(product1) | (uint8_t)(product2)); - #else - uint32_t lop_temp = hash_.hash[rowId]; - asm("lop3.b32" " %0, %1, %2, %3, 0x56;": "=r" (lop_temp): "r" (product1), "r" (product2), "r" (lop_temp)); - hash_.hash[rowId] = lop_temp; - #endif - } - memset(input, 0, 80); - memcpy(input, hash_.hash, 32); - hash(heavyP, hash_.hash, input); - if (LT_U256(hash_, target)){ - atomicCAS((unsigned long long int*) final_nonce, 0, (unsigned long long int) nonce); - } - } - } - -} \ No newline at end of file diff --git a/plugins/cuda/resources/kaspa-cuda-sm20.ptx b/plugins/cuda/resources/karlsen-cuda-sm20.ptx similarity index 97% rename from plugins/cuda/resources/kaspa-cuda-sm20.ptx rename to plugins/cuda/resources/karlsen-cuda-sm20.ptx index 932cec1..192e7a8 100644 --- a/plugins/cuda/resources/kaspa-cuda-sm20.ptx +++ b/plugins/cuda/resources/karlsen-cuda-sm20.ptx @@ -1,14623 +1,14623 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-21554848 -// Cuda compilation tools, release 8.0, V8.0.61 -// Based on LLVM 3.4svn -// - -.version 5.0 -.target sm_20 -.address_size 64 - - // .globl heavy_hash -.const .align 1 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; -.const .align 8 .b8 target[32]; -.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; -.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; -.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; -.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; -.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 
diff --git a/plugins/cuda/resources/kaspa-cuda-sm20.ptx b/plugins/cuda/resources/karlsen-cuda-sm20.ptx
similarity index 97%
rename from plugins/cuda/resources/kaspa-cuda-sm20.ptx
rename to plugins/cuda/resources/karlsen-cuda-sm20.ptx
index 932cec1..192e7a8 100644
--- a/plugins/cuda/resources/kaspa-cuda-sm20.ptx
+++ b/plugins/cuda/resources/karlsen-cuda-sm20.ptx
@@ -1,14623 +1,14623 @@
-//
-// Generated by NVIDIA NVVM Compiler
-//
-// Compiler Build ID: CL-21554848
-// Cuda compilation tools, release 8.0, V8.0.61
-// Based on LLVM 3.4svn
-//
-
-.version 5.0
-.target sm_20
-.address_size 64
-
-	// .globl	heavy_hash
-.const .align 1 .b8 matrix[4096];
-.const .align 8 .b8 hash_header[72];
-.const .align 8 .b8 target[32];
[~14,600 further lines of the generated sm_20 PTX listing are omitted here: the rho/pi/RC Keccak round tables, the xoshiro256 jump tables, the powP/heavyP constant blocks, and the fully unrolled heavy_hash kernel body.]
cvt.u32.u16 %r1533, %rs930; - cvt.s32.s8 %r1534, %r1533; - mad.lo.s32 %r1535, %r173, %r1534, %r1526; - mad.lo.s32 %r1536, %r53, %r1532, %r1535; - mad.lo.s32 %r1537, %r54, %r1530, %r1536; - mad.lo.s32 %r1538, %r55, %r1528, %r1537; - ld.const.v4.u8 {%rs938, %rs939, %rs940, %rs941}, [matrix+468]; - cvt.u32.u16 %r1539, %rs941; - cvt.s32.s8 %r1540, %r1539; - cvt.u32.u16 %r1541, %rs940; - cvt.s32.s8 %r1542, %r1541; - cvt.u32.u16 %r1543, %rs939; - cvt.s32.s8 %r1544, %r1543; - cvt.u32.u16 %r1545, %rs938; - cvt.s32.s8 %r1546, %r1545; - mad.lo.s32 %r1547, %r56, %r1546, %r1538; - mad.lo.s32 %r1548, %r57, %r1544, %r1547; - mad.lo.s32 %r1549, %r58, %r1542, %r1548; - mad.lo.s32 %r1550, %r59, %r1540, %r1549; - ld.const.v4.u8 {%rs946, %rs947, %rs948, %rs949}, [matrix+472]; - cvt.u32.u16 %r1551, %rs949; - cvt.s32.s8 %r1552, %r1551; - cvt.u32.u16 %r1553, %rs948; - cvt.s32.s8 %r1554, %r1553; - cvt.u32.u16 %r1555, %rs947; - cvt.s32.s8 %r1556, %r1555; - cvt.u32.u16 %r1557, %rs946; - cvt.s32.s8 %r1558, %r1557; - mad.lo.s32 %r1559, %r61, %r1558, %r1550; - mad.lo.s32 %r1560, %r62, %r1556, %r1559; - mad.lo.s32 %r1561, %r64, %r1554, %r1560; - mad.lo.s32 %r1562, %r65, %r1552, %r1561; - ld.const.v4.u8 {%rs954, %rs955, %rs956, %rs957}, [matrix+476]; - cvt.u32.u16 %r1563, %rs957; - cvt.s32.s8 %r1564, %r1563; - cvt.u32.u16 %r1565, %rs956; - cvt.s32.s8 %r1566, %r1565; - cvt.u32.u16 %r1567, %rs955; - cvt.s32.s8 %r1568, %r1567; - cvt.u32.u16 %r1569, %rs954; - cvt.s32.s8 %r1570, %r1569; - mad.lo.s32 %r1571, %r67, %r1570, %r1562; - mad.lo.s32 %r1572, %r68, %r1568, %r1571; - mad.lo.s32 %r1573, %r69, %r1566, %r1572; - mad.lo.s32 %r1574, %r70, %r1564, %r1573; - ld.const.v4.u8 {%rs962, %rs963, %rs964, %rs965}, [matrix+480]; - cvt.u32.u16 %r1575, %rs965; - cvt.s32.s8 %r1576, %r1575; - cvt.u32.u16 %r1577, %rs964; - cvt.s32.s8 %r1578, %r1577; - cvt.u32.u16 %r1579, %rs963; - cvt.s32.s8 %r1580, %r1579; - cvt.u32.u16 %r1581, %rs962; - cvt.s32.s8 %r1582, %r1581; - mad.lo.s32 %r1583, %r222, %r1582, %r1574; - mad.lo.s32 %r1584, %r72, %r1580, %r1583; - mad.lo.s32 %r1585, %r73, %r1578, %r1584; - mad.lo.s32 %r1586, %r74, %r1576, %r1585; - ld.const.v4.u8 {%rs970, %rs971, %rs972, %rs973}, [matrix+484]; - cvt.u32.u16 %r1587, %rs973; - cvt.s32.s8 %r1588, %r1587; - cvt.u32.u16 %r1589, %rs972; - cvt.s32.s8 %r1590, %r1589; - cvt.u32.u16 %r1591, %rs971; - cvt.s32.s8 %r1592, %r1591; - cvt.u32.u16 %r1593, %rs970; - cvt.s32.s8 %r1594, %r1593; - mad.lo.s32 %r1595, %r75, %r1594, %r1586; - mad.lo.s32 %r1596, %r76, %r1592, %r1595; - mad.lo.s32 %r1597, %r77, %r1590, %r1596; - mad.lo.s32 %r1598, %r78, %r1588, %r1597; - ld.const.v4.u8 {%rs978, %rs979, %rs980, %rs981}, [matrix+488]; - cvt.u32.u16 %r1599, %rs981; - cvt.s32.s8 %r1600, %r1599; - cvt.u32.u16 %r1601, %rs980; - cvt.s32.s8 %r1602, %r1601; - cvt.u32.u16 %r1603, %rs979; - cvt.s32.s8 %r1604, %r1603; - cvt.u32.u16 %r1605, %rs978; - cvt.s32.s8 %r1606, %r1605; - mad.lo.s32 %r1607, %r80, %r1606, %r1598; - mad.lo.s32 %r1608, %r81, %r1604, %r1607; - mad.lo.s32 %r1609, %r83, %r1602, %r1608; - mad.lo.s32 %r1610, %r84, %r1600, %r1609; - ld.const.v4.u8 {%rs986, %rs987, %rs988, %rs989}, [matrix+492]; - cvt.u32.u16 %r1611, %rs989; - cvt.s32.s8 %r1612, %r1611; - cvt.u32.u16 %r1613, %rs988; - cvt.s32.s8 %r1614, %r1613; - cvt.u32.u16 %r1615, %rs987; - cvt.s32.s8 %r1616, %r1615; - cvt.u32.u16 %r1617, %rs986; - cvt.s32.s8 %r1618, %r1617; - mad.lo.s32 %r1619, %r86, %r1618, %r1610; - mad.lo.s32 %r1620, %r87, %r1616, %r1619; - mad.lo.s32 %r1621, %r88, %r1614, %r1620; - mad.lo.s32 %r1622, %r89, %r1612, %r1621; - 
ld.const.v4.u8 {%rs994, %rs995, %rs996, %rs997}, [matrix+496]; - cvt.u32.u16 %r1623, %rs997; - cvt.s32.s8 %r1624, %r1623; - cvt.u32.u16 %r1625, %rs996; - cvt.s32.s8 %r1626, %r1625; - cvt.u32.u16 %r1627, %rs995; - cvt.s32.s8 %r1628, %r1627; - cvt.u32.u16 %r1629, %rs994; - cvt.s32.s8 %r1630, %r1629; - mad.lo.s32 %r1631, %r271, %r1630, %r1622; - mad.lo.s32 %r1632, %r91, %r1628, %r1631; - mad.lo.s32 %r1633, %r93, %r1626, %r1632; - mad.lo.s32 %r1634, %r94, %r1624, %r1633; - ld.const.v4.u8 {%rs1002, %rs1003, %rs1004, %rs1005}, [matrix+500]; - cvt.u32.u16 %r1635, %rs1005; - cvt.s32.s8 %r1636, %r1635; - cvt.u32.u16 %r1637, %rs1004; - cvt.s32.s8 %r1638, %r1637; - cvt.u32.u16 %r1639, %rs1003; - cvt.s32.s8 %r1640, %r1639; - cvt.u32.u16 %r1641, %rs1002; - cvt.s32.s8 %r1642, %r1641; - mad.lo.s32 %r1643, %r96, %r1642, %r1634; - mad.lo.s32 %r1644, %r97, %r1640, %r1643; - mad.lo.s32 %r1645, %r99, %r1638, %r1644; - mad.lo.s32 %r1646, %r100, %r1636, %r1645; - ld.const.v4.u8 {%rs1010, %rs1011, %rs1012, %rs1013}, [matrix+504]; - cvt.u32.u16 %r1647, %rs1013; - cvt.s32.s8 %r1648, %r1647; - cvt.u32.u16 %r1649, %rs1012; - cvt.s32.s8 %r1650, %r1649; - cvt.u32.u16 %r1651, %rs1011; - cvt.s32.s8 %r1652, %r1651; - cvt.u32.u16 %r1653, %rs1010; - cvt.s32.s8 %r1654, %r1653; - mad.lo.s32 %r1655, %r103, %r1654, %r1646; - mad.lo.s32 %r1656, %r104, %r1652, %r1655; - mad.lo.s32 %r1657, %r107, %r1650, %r1656; - mad.lo.s32 %r1658, %r108, %r1648, %r1657; - ld.const.v4.u8 {%rs1018, %rs1019, %rs1020, %rs1021}, [matrix+508]; - cvt.u32.u16 %r1659, %rs1021; - cvt.s32.s8 %r1660, %r1659; - cvt.u32.u16 %r1661, %rs1020; - cvt.s32.s8 %r1662, %r1661; - cvt.u32.u16 %r1663, %rs1019; - cvt.s32.s8 %r1664, %r1663; - cvt.u32.u16 %r1665, %rs1018; - cvt.s32.s8 %r1666, %r1665; - mad.lo.s32 %r1667, %r111, %r1666, %r1658; - mad.lo.s32 %r1668, %r112, %r1664, %r1667; - mad.lo.s32 %r1669, %r114, %r1662, %r1668; - mad.lo.s32 %r1670, %r115, %r1660, %r1669; - shr.u32 %r1671, %r1478, 6; - and.b32 %r1672, %r1671, 240; - shr.u32 %r1673, %r1670, 10; - or.b32 %r1674, %r1673, %r1672; - xor.b32 %r1675, %r14, %r1674; - cvt.u64.u32 %rd383, %r1675; - ld.const.v4.u8 {%rs1026, %rs1027, %rs1028, %rs1029}, [matrix+512]; - cvt.u32.u16 %r1676, %rs1029; - cvt.s32.s8 %r1677, %r1676; - cvt.u32.u16 %r1678, %rs1028; - cvt.s32.s8 %r1679, %r1678; - cvt.u32.u16 %r1680, %rs1026; - cvt.s32.s8 %r1681, %r1680; - cvt.u32.u16 %r1682, %rs1027; - cvt.s32.s8 %r1683, %r1682; - mul.lo.s32 %r1684, %r34, %r1683; - mad.lo.s32 %r1685, %r124, %r1681, %r1684; - mad.lo.s32 %r1686, %r35, %r1679, %r1685; - mad.lo.s32 %r1687, %r36, %r1677, %r1686; - ld.const.v4.u8 {%rs1034, %rs1035, %rs1036, %rs1037}, [matrix+516]; - cvt.u32.u16 %r1688, %rs1037; - cvt.s32.s8 %r1689, %r1688; - cvt.u32.u16 %r1690, %rs1036; - cvt.s32.s8 %r1691, %r1690; - cvt.u32.u16 %r1692, %rs1035; - cvt.s32.s8 %r1693, %r1692; - cvt.u32.u16 %r1694, %rs1034; - cvt.s32.s8 %r1695, %r1694; - mad.lo.s32 %r1696, %r37, %r1695, %r1687; - mad.lo.s32 %r1697, %r38, %r1693, %r1696; - mad.lo.s32 %r1698, %r39, %r1691, %r1697; - mad.lo.s32 %r1699, %r40, %r1689, %r1698; - ld.const.v4.u8 {%rs1042, %rs1043, %rs1044, %rs1045}, [matrix+520]; - cvt.u32.u16 %r1700, %rs1045; - cvt.s32.s8 %r1701, %r1700; - cvt.u32.u16 %r1702, %rs1044; - cvt.s32.s8 %r1703, %r1702; - cvt.u32.u16 %r1704, %rs1043; - cvt.s32.s8 %r1705, %r1704; - cvt.u32.u16 %r1706, %rs1042; - cvt.s32.s8 %r1707, %r1706; - mad.lo.s32 %r1708, %r42, %r1707, %r1699; - mad.lo.s32 %r1709, %r43, %r1705, %r1708; - mad.lo.s32 %r1710, %r45, %r1703, %r1709; - mad.lo.s32 %r1711, %r46, %r1701, %r1710; - 
ld.const.v4.u8 {%rs1050, %rs1051, %rs1052, %rs1053}, [matrix+524]; - cvt.u32.u16 %r1712, %rs1053; - cvt.s32.s8 %r1713, %r1712; - cvt.u32.u16 %r1714, %rs1052; - cvt.s32.s8 %r1715, %r1714; - cvt.u32.u16 %r1716, %rs1051; - cvt.s32.s8 %r1717, %r1716; - cvt.u32.u16 %r1718, %rs1050; - cvt.s32.s8 %r1719, %r1718; - mad.lo.s32 %r1720, %r48, %r1719, %r1711; - mad.lo.s32 %r1721, %r49, %r1717, %r1720; - mad.lo.s32 %r1722, %r50, %r1715, %r1721; - mad.lo.s32 %r1723, %r51, %r1713, %r1722; - ld.const.v4.u8 {%rs1058, %rs1059, %rs1060, %rs1061}, [matrix+528]; - cvt.u32.u16 %r1724, %rs1061; - cvt.s32.s8 %r1725, %r1724; - cvt.u32.u16 %r1726, %rs1060; - cvt.s32.s8 %r1727, %r1726; - cvt.u32.u16 %r1728, %rs1059; - cvt.s32.s8 %r1729, %r1728; - cvt.u32.u16 %r1730, %rs1058; - cvt.s32.s8 %r1731, %r1730; - mad.lo.s32 %r1732, %r173, %r1731, %r1723; - mad.lo.s32 %r1733, %r53, %r1729, %r1732; - mad.lo.s32 %r1734, %r54, %r1727, %r1733; - mad.lo.s32 %r1735, %r55, %r1725, %r1734; - ld.const.v4.u8 {%rs1066, %rs1067, %rs1068, %rs1069}, [matrix+532]; - cvt.u32.u16 %r1736, %rs1069; - cvt.s32.s8 %r1737, %r1736; - cvt.u32.u16 %r1738, %rs1068; - cvt.s32.s8 %r1739, %r1738; - cvt.u32.u16 %r1740, %rs1067; - cvt.s32.s8 %r1741, %r1740; - cvt.u32.u16 %r1742, %rs1066; - cvt.s32.s8 %r1743, %r1742; - mad.lo.s32 %r1744, %r56, %r1743, %r1735; - mad.lo.s32 %r1745, %r57, %r1741, %r1744; - mad.lo.s32 %r1746, %r58, %r1739, %r1745; - mad.lo.s32 %r1747, %r59, %r1737, %r1746; - ld.const.v4.u8 {%rs1074, %rs1075, %rs1076, %rs1077}, [matrix+536]; - cvt.u32.u16 %r1748, %rs1077; - cvt.s32.s8 %r1749, %r1748; - cvt.u32.u16 %r1750, %rs1076; - cvt.s32.s8 %r1751, %r1750; - cvt.u32.u16 %r1752, %rs1075; - cvt.s32.s8 %r1753, %r1752; - cvt.u32.u16 %r1754, %rs1074; - cvt.s32.s8 %r1755, %r1754; - mad.lo.s32 %r1756, %r61, %r1755, %r1747; - mad.lo.s32 %r1757, %r62, %r1753, %r1756; - mad.lo.s32 %r1758, %r64, %r1751, %r1757; - mad.lo.s32 %r1759, %r65, %r1749, %r1758; - ld.const.v4.u8 {%rs1082, %rs1083, %rs1084, %rs1085}, [matrix+540]; - cvt.u32.u16 %r1760, %rs1085; - cvt.s32.s8 %r1761, %r1760; - cvt.u32.u16 %r1762, %rs1084; - cvt.s32.s8 %r1763, %r1762; - cvt.u32.u16 %r1764, %rs1083; - cvt.s32.s8 %r1765, %r1764; - cvt.u32.u16 %r1766, %rs1082; - cvt.s32.s8 %r1767, %r1766; - mad.lo.s32 %r1768, %r67, %r1767, %r1759; - mad.lo.s32 %r1769, %r68, %r1765, %r1768; - mad.lo.s32 %r1770, %r69, %r1763, %r1769; - mad.lo.s32 %r1771, %r70, %r1761, %r1770; - ld.const.v4.u8 {%rs1090, %rs1091, %rs1092, %rs1093}, [matrix+544]; - cvt.u32.u16 %r1772, %rs1093; - cvt.s32.s8 %r1773, %r1772; - cvt.u32.u16 %r1774, %rs1092; - cvt.s32.s8 %r1775, %r1774; - cvt.u32.u16 %r1776, %rs1091; - cvt.s32.s8 %r1777, %r1776; - cvt.u32.u16 %r1778, %rs1090; - cvt.s32.s8 %r1779, %r1778; - mad.lo.s32 %r1780, %r222, %r1779, %r1771; - mad.lo.s32 %r1781, %r72, %r1777, %r1780; - mad.lo.s32 %r1782, %r73, %r1775, %r1781; - mad.lo.s32 %r1783, %r74, %r1773, %r1782; - ld.const.v4.u8 {%rs1098, %rs1099, %rs1100, %rs1101}, [matrix+548]; - cvt.u32.u16 %r1784, %rs1101; - cvt.s32.s8 %r1785, %r1784; - cvt.u32.u16 %r1786, %rs1100; - cvt.s32.s8 %r1787, %r1786; - cvt.u32.u16 %r1788, %rs1099; - cvt.s32.s8 %r1789, %r1788; - cvt.u32.u16 %r1790, %rs1098; - cvt.s32.s8 %r1791, %r1790; - mad.lo.s32 %r1792, %r75, %r1791, %r1783; - mad.lo.s32 %r1793, %r76, %r1789, %r1792; - mad.lo.s32 %r1794, %r77, %r1787, %r1793; - mad.lo.s32 %r1795, %r78, %r1785, %r1794; - ld.const.v4.u8 {%rs1106, %rs1107, %rs1108, %rs1109}, [matrix+552]; - cvt.u32.u16 %r1796, %rs1109; - cvt.s32.s8 %r1797, %r1796; - cvt.u32.u16 %r1798, %rs1108; - cvt.s32.s8 %r1799, 
%r1798; - cvt.u32.u16 %r1800, %rs1107; - cvt.s32.s8 %r1801, %r1800; - cvt.u32.u16 %r1802, %rs1106; - cvt.s32.s8 %r1803, %r1802; - mad.lo.s32 %r1804, %r80, %r1803, %r1795; - mad.lo.s32 %r1805, %r81, %r1801, %r1804; - mad.lo.s32 %r1806, %r83, %r1799, %r1805; - mad.lo.s32 %r1807, %r84, %r1797, %r1806; - ld.const.v4.u8 {%rs1114, %rs1115, %rs1116, %rs1117}, [matrix+556]; - cvt.u32.u16 %r1808, %rs1117; - cvt.s32.s8 %r1809, %r1808; - cvt.u32.u16 %r1810, %rs1116; - cvt.s32.s8 %r1811, %r1810; - cvt.u32.u16 %r1812, %rs1115; - cvt.s32.s8 %r1813, %r1812; - cvt.u32.u16 %r1814, %rs1114; - cvt.s32.s8 %r1815, %r1814; - mad.lo.s32 %r1816, %r86, %r1815, %r1807; - mad.lo.s32 %r1817, %r87, %r1813, %r1816; - mad.lo.s32 %r1818, %r88, %r1811, %r1817; - mad.lo.s32 %r1819, %r89, %r1809, %r1818; - ld.const.v4.u8 {%rs1122, %rs1123, %rs1124, %rs1125}, [matrix+560]; - cvt.u32.u16 %r1820, %rs1125; - cvt.s32.s8 %r1821, %r1820; - cvt.u32.u16 %r1822, %rs1124; - cvt.s32.s8 %r1823, %r1822; - cvt.u32.u16 %r1824, %rs1123; - cvt.s32.s8 %r1825, %r1824; - cvt.u32.u16 %r1826, %rs1122; - cvt.s32.s8 %r1827, %r1826; - mad.lo.s32 %r1828, %r271, %r1827, %r1819; - mad.lo.s32 %r1829, %r91, %r1825, %r1828; - mad.lo.s32 %r1830, %r93, %r1823, %r1829; - mad.lo.s32 %r1831, %r94, %r1821, %r1830; - ld.const.v4.u8 {%rs1130, %rs1131, %rs1132, %rs1133}, [matrix+564]; - cvt.u32.u16 %r1832, %rs1133; - cvt.s32.s8 %r1833, %r1832; - cvt.u32.u16 %r1834, %rs1132; - cvt.s32.s8 %r1835, %r1834; - cvt.u32.u16 %r1836, %rs1131; - cvt.s32.s8 %r1837, %r1836; - cvt.u32.u16 %r1838, %rs1130; - cvt.s32.s8 %r1839, %r1838; - mad.lo.s32 %r1840, %r96, %r1839, %r1831; - mad.lo.s32 %r1841, %r97, %r1837, %r1840; - mad.lo.s32 %r1842, %r99, %r1835, %r1841; - mad.lo.s32 %r1843, %r100, %r1833, %r1842; - ld.const.v4.u8 {%rs1138, %rs1139, %rs1140, %rs1141}, [matrix+568]; - cvt.u32.u16 %r1844, %rs1141; - cvt.s32.s8 %r1845, %r1844; - cvt.u32.u16 %r1846, %rs1140; - cvt.s32.s8 %r1847, %r1846; - cvt.u32.u16 %r1848, %rs1139; - cvt.s32.s8 %r1849, %r1848; - cvt.u32.u16 %r1850, %rs1138; - cvt.s32.s8 %r1851, %r1850; - mad.lo.s32 %r1852, %r103, %r1851, %r1843; - mad.lo.s32 %r1853, %r104, %r1849, %r1852; - mad.lo.s32 %r1854, %r107, %r1847, %r1853; - mad.lo.s32 %r1855, %r108, %r1845, %r1854; - ld.const.v4.u8 {%rs1146, %rs1147, %rs1148, %rs1149}, [matrix+572]; - cvt.u32.u16 %r1856, %rs1149; - cvt.s32.s8 %r1857, %r1856; - cvt.u32.u16 %r1858, %rs1148; - cvt.s32.s8 %r1859, %r1858; - cvt.u32.u16 %r1860, %rs1147; - cvt.s32.s8 %r1861, %r1860; - cvt.u32.u16 %r1862, %rs1146; - cvt.s32.s8 %r1863, %r1862; - mad.lo.s32 %r1864, %r111, %r1863, %r1855; - mad.lo.s32 %r1865, %r112, %r1861, %r1864; - mad.lo.s32 %r1866, %r114, %r1859, %r1865; - mad.lo.s32 %r1867, %r115, %r1857, %r1866; - ld.const.v4.u8 {%rs1154, %rs1155, %rs1156, %rs1157}, [matrix+576]; - cvt.u32.u16 %r1868, %rs1157; - cvt.s32.s8 %r1869, %r1868; - cvt.u32.u16 %r1870, %rs1156; - cvt.s32.s8 %r1871, %r1870; - cvt.u32.u16 %r1872, %rs1154; - cvt.s32.s8 %r1873, %r1872; - cvt.u32.u16 %r1874, %rs1155; - cvt.s32.s8 %r1875, %r1874; - mul.lo.s32 %r1876, %r34, %r1875; - mad.lo.s32 %r1877, %r124, %r1873, %r1876; - mad.lo.s32 %r1878, %r35, %r1871, %r1877; - mad.lo.s32 %r1879, %r36, %r1869, %r1878; - ld.const.v4.u8 {%rs1162, %rs1163, %rs1164, %rs1165}, [matrix+580]; - cvt.u32.u16 %r1880, %rs1165; - cvt.s32.s8 %r1881, %r1880; - cvt.u32.u16 %r1882, %rs1164; - cvt.s32.s8 %r1883, %r1882; - cvt.u32.u16 %r1884, %rs1163; - cvt.s32.s8 %r1885, %r1884; - cvt.u32.u16 %r1886, %rs1162; - cvt.s32.s8 %r1887, %r1886; - mad.lo.s32 %r1888, %r37, %r1887, %r1879; - mad.lo.s32 
%r1889, %r38, %r1885, %r1888; - mad.lo.s32 %r1890, %r39, %r1883, %r1889; - mad.lo.s32 %r1891, %r40, %r1881, %r1890; - ld.const.v4.u8 {%rs1170, %rs1171, %rs1172, %rs1173}, [matrix+584]; - cvt.u32.u16 %r1892, %rs1173; - cvt.s32.s8 %r1893, %r1892; - cvt.u32.u16 %r1894, %rs1172; - cvt.s32.s8 %r1895, %r1894; - cvt.u32.u16 %r1896, %rs1171; - cvt.s32.s8 %r1897, %r1896; - cvt.u32.u16 %r1898, %rs1170; - cvt.s32.s8 %r1899, %r1898; - mad.lo.s32 %r1900, %r42, %r1899, %r1891; - mad.lo.s32 %r1901, %r43, %r1897, %r1900; - mad.lo.s32 %r1902, %r45, %r1895, %r1901; - mad.lo.s32 %r1903, %r46, %r1893, %r1902; - ld.const.v4.u8 {%rs1178, %rs1179, %rs1180, %rs1181}, [matrix+588]; - cvt.u32.u16 %r1904, %rs1181; - cvt.s32.s8 %r1905, %r1904; - cvt.u32.u16 %r1906, %rs1180; - cvt.s32.s8 %r1907, %r1906; - cvt.u32.u16 %r1908, %rs1179; - cvt.s32.s8 %r1909, %r1908; - cvt.u32.u16 %r1910, %rs1178; - cvt.s32.s8 %r1911, %r1910; - mad.lo.s32 %r1912, %r48, %r1911, %r1903; - mad.lo.s32 %r1913, %r49, %r1909, %r1912; - mad.lo.s32 %r1914, %r50, %r1907, %r1913; - mad.lo.s32 %r1915, %r51, %r1905, %r1914; - ld.const.v4.u8 {%rs1186, %rs1187, %rs1188, %rs1189}, [matrix+592]; - cvt.u32.u16 %r1916, %rs1189; - cvt.s32.s8 %r1917, %r1916; - cvt.u32.u16 %r1918, %rs1188; - cvt.s32.s8 %r1919, %r1918; - cvt.u32.u16 %r1920, %rs1187; - cvt.s32.s8 %r1921, %r1920; - cvt.u32.u16 %r1922, %rs1186; - cvt.s32.s8 %r1923, %r1922; - mad.lo.s32 %r1924, %r173, %r1923, %r1915; - mad.lo.s32 %r1925, %r53, %r1921, %r1924; - mad.lo.s32 %r1926, %r54, %r1919, %r1925; - mad.lo.s32 %r1927, %r55, %r1917, %r1926; - ld.const.v4.u8 {%rs1194, %rs1195, %rs1196, %rs1197}, [matrix+596]; - cvt.u32.u16 %r1928, %rs1197; - cvt.s32.s8 %r1929, %r1928; - cvt.u32.u16 %r1930, %rs1196; - cvt.s32.s8 %r1931, %r1930; - cvt.u32.u16 %r1932, %rs1195; - cvt.s32.s8 %r1933, %r1932; - cvt.u32.u16 %r1934, %rs1194; - cvt.s32.s8 %r1935, %r1934; - mad.lo.s32 %r1936, %r56, %r1935, %r1927; - mad.lo.s32 %r1937, %r57, %r1933, %r1936; - mad.lo.s32 %r1938, %r58, %r1931, %r1937; - mad.lo.s32 %r1939, %r59, %r1929, %r1938; - ld.const.v4.u8 {%rs1202, %rs1203, %rs1204, %rs1205}, [matrix+600]; - cvt.u32.u16 %r1940, %rs1205; - cvt.s32.s8 %r1941, %r1940; - cvt.u32.u16 %r1942, %rs1204; - cvt.s32.s8 %r1943, %r1942; - cvt.u32.u16 %r1944, %rs1203; - cvt.s32.s8 %r1945, %r1944; - cvt.u32.u16 %r1946, %rs1202; - cvt.s32.s8 %r1947, %r1946; - mad.lo.s32 %r1948, %r61, %r1947, %r1939; - mad.lo.s32 %r1949, %r62, %r1945, %r1948; - mad.lo.s32 %r1950, %r64, %r1943, %r1949; - mad.lo.s32 %r1951, %r65, %r1941, %r1950; - ld.const.v4.u8 {%rs1210, %rs1211, %rs1212, %rs1213}, [matrix+604]; - cvt.u32.u16 %r1952, %rs1213; - cvt.s32.s8 %r1953, %r1952; - cvt.u32.u16 %r1954, %rs1212; - cvt.s32.s8 %r1955, %r1954; - cvt.u32.u16 %r1956, %rs1211; - cvt.s32.s8 %r1957, %r1956; - cvt.u32.u16 %r1958, %rs1210; - cvt.s32.s8 %r1959, %r1958; - mad.lo.s32 %r1960, %r67, %r1959, %r1951; - mad.lo.s32 %r1961, %r68, %r1957, %r1960; - mad.lo.s32 %r1962, %r69, %r1955, %r1961; - mad.lo.s32 %r1963, %r70, %r1953, %r1962; - ld.const.v4.u8 {%rs1218, %rs1219, %rs1220, %rs1221}, [matrix+608]; - cvt.u32.u16 %r1964, %rs1221; - cvt.s32.s8 %r1965, %r1964; - cvt.u32.u16 %r1966, %rs1220; - cvt.s32.s8 %r1967, %r1966; - cvt.u32.u16 %r1968, %rs1219; - cvt.s32.s8 %r1969, %r1968; - cvt.u32.u16 %r1970, %rs1218; - cvt.s32.s8 %r1971, %r1970; - mad.lo.s32 %r1972, %r222, %r1971, %r1963; - mad.lo.s32 %r1973, %r72, %r1969, %r1972; - mad.lo.s32 %r1974, %r73, %r1967, %r1973; - mad.lo.s32 %r1975, %r74, %r1965, %r1974; - ld.const.v4.u8 {%rs1226, %rs1227, %rs1228, %rs1229}, [matrix+612]; 
- cvt.u32.u16 %r1976, %rs1229; - cvt.s32.s8 %r1977, %r1976; - cvt.u32.u16 %r1978, %rs1228; - cvt.s32.s8 %r1979, %r1978; - cvt.u32.u16 %r1980, %rs1227; - cvt.s32.s8 %r1981, %r1980; - cvt.u32.u16 %r1982, %rs1226; - cvt.s32.s8 %r1983, %r1982; - mad.lo.s32 %r1984, %r75, %r1983, %r1975; - mad.lo.s32 %r1985, %r76, %r1981, %r1984; - mad.lo.s32 %r1986, %r77, %r1979, %r1985; - mad.lo.s32 %r1987, %r78, %r1977, %r1986; - ld.const.v4.u8 {%rs1234, %rs1235, %rs1236, %rs1237}, [matrix+616]; - cvt.u32.u16 %r1988, %rs1237; - cvt.s32.s8 %r1989, %r1988; - cvt.u32.u16 %r1990, %rs1236; - cvt.s32.s8 %r1991, %r1990; - cvt.u32.u16 %r1992, %rs1235; - cvt.s32.s8 %r1993, %r1992; - cvt.u32.u16 %r1994, %rs1234; - cvt.s32.s8 %r1995, %r1994; - mad.lo.s32 %r1996, %r80, %r1995, %r1987; - mad.lo.s32 %r1997, %r81, %r1993, %r1996; - mad.lo.s32 %r1998, %r83, %r1991, %r1997; - mad.lo.s32 %r1999, %r84, %r1989, %r1998; - ld.const.v4.u8 {%rs1242, %rs1243, %rs1244, %rs1245}, [matrix+620]; - cvt.u32.u16 %r2000, %rs1245; - cvt.s32.s8 %r2001, %r2000; - cvt.u32.u16 %r2002, %rs1244; - cvt.s32.s8 %r2003, %r2002; - cvt.u32.u16 %r2004, %rs1243; - cvt.s32.s8 %r2005, %r2004; - cvt.u32.u16 %r2006, %rs1242; - cvt.s32.s8 %r2007, %r2006; - mad.lo.s32 %r2008, %r86, %r2007, %r1999; - mad.lo.s32 %r2009, %r87, %r2005, %r2008; - mad.lo.s32 %r2010, %r88, %r2003, %r2009; - mad.lo.s32 %r2011, %r89, %r2001, %r2010; - ld.const.v4.u8 {%rs1250, %rs1251, %rs1252, %rs1253}, [matrix+624]; - cvt.u32.u16 %r2012, %rs1253; - cvt.s32.s8 %r2013, %r2012; - cvt.u32.u16 %r2014, %rs1252; - cvt.s32.s8 %r2015, %r2014; - cvt.u32.u16 %r2016, %rs1251; - cvt.s32.s8 %r2017, %r2016; - cvt.u32.u16 %r2018, %rs1250; - cvt.s32.s8 %r2019, %r2018; - mad.lo.s32 %r2020, %r271, %r2019, %r2011; - mad.lo.s32 %r2021, %r91, %r2017, %r2020; - mad.lo.s32 %r2022, %r93, %r2015, %r2021; - mad.lo.s32 %r2023, %r94, %r2013, %r2022; - ld.const.v4.u8 {%rs1258, %rs1259, %rs1260, %rs1261}, [matrix+628]; - cvt.u32.u16 %r2024, %rs1261; - cvt.s32.s8 %r2025, %r2024; - cvt.u32.u16 %r2026, %rs1260; - cvt.s32.s8 %r2027, %r2026; - cvt.u32.u16 %r2028, %rs1259; - cvt.s32.s8 %r2029, %r2028; - cvt.u32.u16 %r2030, %rs1258; - cvt.s32.s8 %r2031, %r2030; - mad.lo.s32 %r2032, %r96, %r2031, %r2023; - mad.lo.s32 %r2033, %r97, %r2029, %r2032; - mad.lo.s32 %r2034, %r99, %r2027, %r2033; - mad.lo.s32 %r2035, %r100, %r2025, %r2034; - ld.const.v4.u8 {%rs1266, %rs1267, %rs1268, %rs1269}, [matrix+632]; - cvt.u32.u16 %r2036, %rs1269; - cvt.s32.s8 %r2037, %r2036; - cvt.u32.u16 %r2038, %rs1268; - cvt.s32.s8 %r2039, %r2038; - cvt.u32.u16 %r2040, %rs1267; - cvt.s32.s8 %r2041, %r2040; - cvt.u32.u16 %r2042, %rs1266; - cvt.s32.s8 %r2043, %r2042; - mad.lo.s32 %r2044, %r103, %r2043, %r2035; - mad.lo.s32 %r2045, %r104, %r2041, %r2044; - mad.lo.s32 %r2046, %r107, %r2039, %r2045; - mad.lo.s32 %r2047, %r108, %r2037, %r2046; - ld.const.v4.u8 {%rs1274, %rs1275, %rs1276, %rs1277}, [matrix+636]; - cvt.u32.u16 %r2048, %rs1277; - cvt.s32.s8 %r2049, %r2048; - cvt.u32.u16 %r2050, %rs1276; - cvt.s32.s8 %r2051, %r2050; - cvt.u32.u16 %r2052, %rs1275; - cvt.s32.s8 %r2053, %r2052; - cvt.u32.u16 %r2054, %rs1274; - cvt.s32.s8 %r2055, %r2054; - mad.lo.s32 %r2056, %r111, %r2055, %r2047; - mad.lo.s32 %r2057, %r112, %r2053, %r2056; - mad.lo.s32 %r2058, %r114, %r2051, %r2057; - mad.lo.s32 %r2059, %r115, %r2049, %r2058; - shr.u32 %r2060, %r1867, 6; - and.b32 %r2061, %r2060, 240; - shr.u32 %r2062, %r2059, 10; - or.b32 %r2063, %r2062, %r2061; - xor.b32 %r2064, %r15, %r2063; - cvt.u64.u32 %rd384, %r2064; - ld.const.v4.u8 {%rs1282, %rs1283, %rs1284, %rs1285}, 
[matrix+640]; - cvt.u32.u16 %r2065, %rs1285; - cvt.s32.s8 %r2066, %r2065; - cvt.u32.u16 %r2067, %rs1284; - cvt.s32.s8 %r2068, %r2067; - cvt.u32.u16 %r2069, %rs1282; - cvt.s32.s8 %r2070, %r2069; - cvt.u32.u16 %r2071, %rs1283; - cvt.s32.s8 %r2072, %r2071; - mul.lo.s32 %r2073, %r34, %r2072; - mad.lo.s32 %r2074, %r124, %r2070, %r2073; - mad.lo.s32 %r2075, %r35, %r2068, %r2074; - mad.lo.s32 %r2076, %r36, %r2066, %r2075; - ld.const.v4.u8 {%rs1290, %rs1291, %rs1292, %rs1293}, [matrix+644]; - cvt.u32.u16 %r2077, %rs1293; - cvt.s32.s8 %r2078, %r2077; - cvt.u32.u16 %r2079, %rs1292; - cvt.s32.s8 %r2080, %r2079; - cvt.u32.u16 %r2081, %rs1291; - cvt.s32.s8 %r2082, %r2081; - cvt.u32.u16 %r2083, %rs1290; - cvt.s32.s8 %r2084, %r2083; - mad.lo.s32 %r2085, %r37, %r2084, %r2076; - mad.lo.s32 %r2086, %r38, %r2082, %r2085; - mad.lo.s32 %r2087, %r39, %r2080, %r2086; - mad.lo.s32 %r2088, %r40, %r2078, %r2087; - ld.const.v4.u8 {%rs1298, %rs1299, %rs1300, %rs1301}, [matrix+648]; - cvt.u32.u16 %r2089, %rs1301; - cvt.s32.s8 %r2090, %r2089; - cvt.u32.u16 %r2091, %rs1300; - cvt.s32.s8 %r2092, %r2091; - cvt.u32.u16 %r2093, %rs1299; - cvt.s32.s8 %r2094, %r2093; - cvt.u32.u16 %r2095, %rs1298; - cvt.s32.s8 %r2096, %r2095; - mad.lo.s32 %r2097, %r42, %r2096, %r2088; - mad.lo.s32 %r2098, %r43, %r2094, %r2097; - mad.lo.s32 %r2099, %r45, %r2092, %r2098; - mad.lo.s32 %r2100, %r46, %r2090, %r2099; - ld.const.v4.u8 {%rs1306, %rs1307, %rs1308, %rs1309}, [matrix+652]; - cvt.u32.u16 %r2101, %rs1309; - cvt.s32.s8 %r2102, %r2101; - cvt.u32.u16 %r2103, %rs1308; - cvt.s32.s8 %r2104, %r2103; - cvt.u32.u16 %r2105, %rs1307; - cvt.s32.s8 %r2106, %r2105; - cvt.u32.u16 %r2107, %rs1306; - cvt.s32.s8 %r2108, %r2107; - mad.lo.s32 %r2109, %r48, %r2108, %r2100; - mad.lo.s32 %r2110, %r49, %r2106, %r2109; - mad.lo.s32 %r2111, %r50, %r2104, %r2110; - mad.lo.s32 %r2112, %r51, %r2102, %r2111; - ld.const.v4.u8 {%rs1314, %rs1315, %rs1316, %rs1317}, [matrix+656]; - cvt.u32.u16 %r2113, %rs1317; - cvt.s32.s8 %r2114, %r2113; - cvt.u32.u16 %r2115, %rs1316; - cvt.s32.s8 %r2116, %r2115; - cvt.u32.u16 %r2117, %rs1315; - cvt.s32.s8 %r2118, %r2117; - cvt.u32.u16 %r2119, %rs1314; - cvt.s32.s8 %r2120, %r2119; - mad.lo.s32 %r2121, %r173, %r2120, %r2112; - mad.lo.s32 %r2122, %r53, %r2118, %r2121; - mad.lo.s32 %r2123, %r54, %r2116, %r2122; - mad.lo.s32 %r2124, %r55, %r2114, %r2123; - ld.const.v4.u8 {%rs1322, %rs1323, %rs1324, %rs1325}, [matrix+660]; - cvt.u32.u16 %r2125, %rs1325; - cvt.s32.s8 %r2126, %r2125; - cvt.u32.u16 %r2127, %rs1324; - cvt.s32.s8 %r2128, %r2127; - cvt.u32.u16 %r2129, %rs1323; - cvt.s32.s8 %r2130, %r2129; - cvt.u32.u16 %r2131, %rs1322; - cvt.s32.s8 %r2132, %r2131; - mad.lo.s32 %r2133, %r56, %r2132, %r2124; - mad.lo.s32 %r2134, %r57, %r2130, %r2133; - mad.lo.s32 %r2135, %r58, %r2128, %r2134; - mad.lo.s32 %r2136, %r59, %r2126, %r2135; - ld.const.v4.u8 {%rs1330, %rs1331, %rs1332, %rs1333}, [matrix+664]; - cvt.u32.u16 %r2137, %rs1333; - cvt.s32.s8 %r2138, %r2137; - cvt.u32.u16 %r2139, %rs1332; - cvt.s32.s8 %r2140, %r2139; - cvt.u32.u16 %r2141, %rs1331; - cvt.s32.s8 %r2142, %r2141; - cvt.u32.u16 %r2143, %rs1330; - cvt.s32.s8 %r2144, %r2143; - mad.lo.s32 %r2145, %r61, %r2144, %r2136; - mad.lo.s32 %r2146, %r62, %r2142, %r2145; - mad.lo.s32 %r2147, %r64, %r2140, %r2146; - mad.lo.s32 %r2148, %r65, %r2138, %r2147; - ld.const.v4.u8 {%rs1338, %rs1339, %rs1340, %rs1341}, [matrix+668]; - cvt.u32.u16 %r2149, %rs1341; - cvt.s32.s8 %r2150, %r2149; - cvt.u32.u16 %r2151, %rs1340; - cvt.s32.s8 %r2152, %r2151; - cvt.u32.u16 %r2153, %rs1339; - cvt.s32.s8 %r2154, %r2153; 
- cvt.u32.u16 %r2155, %rs1338; - cvt.s32.s8 %r2156, %r2155; - mad.lo.s32 %r2157, %r67, %r2156, %r2148; - mad.lo.s32 %r2158, %r68, %r2154, %r2157; - mad.lo.s32 %r2159, %r69, %r2152, %r2158; - mad.lo.s32 %r2160, %r70, %r2150, %r2159; - ld.const.v4.u8 {%rs1346, %rs1347, %rs1348, %rs1349}, [matrix+672]; - cvt.u32.u16 %r2161, %rs1349; - cvt.s32.s8 %r2162, %r2161; - cvt.u32.u16 %r2163, %rs1348; - cvt.s32.s8 %r2164, %r2163; - cvt.u32.u16 %r2165, %rs1347; - cvt.s32.s8 %r2166, %r2165; - cvt.u32.u16 %r2167, %rs1346; - cvt.s32.s8 %r2168, %r2167; - mad.lo.s32 %r2169, %r222, %r2168, %r2160; - mad.lo.s32 %r2170, %r72, %r2166, %r2169; - mad.lo.s32 %r2171, %r73, %r2164, %r2170; - mad.lo.s32 %r2172, %r74, %r2162, %r2171; - ld.const.v4.u8 {%rs1354, %rs1355, %rs1356, %rs1357}, [matrix+676]; - cvt.u32.u16 %r2173, %rs1357; - cvt.s32.s8 %r2174, %r2173; - cvt.u32.u16 %r2175, %rs1356; - cvt.s32.s8 %r2176, %r2175; - cvt.u32.u16 %r2177, %rs1355; - cvt.s32.s8 %r2178, %r2177; - cvt.u32.u16 %r2179, %rs1354; - cvt.s32.s8 %r2180, %r2179; - mad.lo.s32 %r2181, %r75, %r2180, %r2172; - mad.lo.s32 %r2182, %r76, %r2178, %r2181; - mad.lo.s32 %r2183, %r77, %r2176, %r2182; - mad.lo.s32 %r2184, %r78, %r2174, %r2183; - ld.const.v4.u8 {%rs1362, %rs1363, %rs1364, %rs1365}, [matrix+680]; - cvt.u32.u16 %r2185, %rs1365; - cvt.s32.s8 %r2186, %r2185; - cvt.u32.u16 %r2187, %rs1364; - cvt.s32.s8 %r2188, %r2187; - cvt.u32.u16 %r2189, %rs1363; - cvt.s32.s8 %r2190, %r2189; - cvt.u32.u16 %r2191, %rs1362; - cvt.s32.s8 %r2192, %r2191; - mad.lo.s32 %r2193, %r80, %r2192, %r2184; - mad.lo.s32 %r2194, %r81, %r2190, %r2193; - mad.lo.s32 %r2195, %r83, %r2188, %r2194; - mad.lo.s32 %r2196, %r84, %r2186, %r2195; - ld.const.v4.u8 {%rs1370, %rs1371, %rs1372, %rs1373}, [matrix+684]; - cvt.u32.u16 %r2197, %rs1373; - cvt.s32.s8 %r2198, %r2197; - cvt.u32.u16 %r2199, %rs1372; - cvt.s32.s8 %r2200, %r2199; - cvt.u32.u16 %r2201, %rs1371; - cvt.s32.s8 %r2202, %r2201; - cvt.u32.u16 %r2203, %rs1370; - cvt.s32.s8 %r2204, %r2203; - mad.lo.s32 %r2205, %r86, %r2204, %r2196; - mad.lo.s32 %r2206, %r87, %r2202, %r2205; - mad.lo.s32 %r2207, %r88, %r2200, %r2206; - mad.lo.s32 %r2208, %r89, %r2198, %r2207; - ld.const.v4.u8 {%rs1378, %rs1379, %rs1380, %rs1381}, [matrix+688]; - cvt.u32.u16 %r2209, %rs1381; - cvt.s32.s8 %r2210, %r2209; - cvt.u32.u16 %r2211, %rs1380; - cvt.s32.s8 %r2212, %r2211; - cvt.u32.u16 %r2213, %rs1379; - cvt.s32.s8 %r2214, %r2213; - cvt.u32.u16 %r2215, %rs1378; - cvt.s32.s8 %r2216, %r2215; - mad.lo.s32 %r2217, %r271, %r2216, %r2208; - mad.lo.s32 %r2218, %r91, %r2214, %r2217; - mad.lo.s32 %r2219, %r93, %r2212, %r2218; - mad.lo.s32 %r2220, %r94, %r2210, %r2219; - ld.const.v4.u8 {%rs1386, %rs1387, %rs1388, %rs1389}, [matrix+692]; - cvt.u32.u16 %r2221, %rs1389; - cvt.s32.s8 %r2222, %r2221; - cvt.u32.u16 %r2223, %rs1388; - cvt.s32.s8 %r2224, %r2223; - cvt.u32.u16 %r2225, %rs1387; - cvt.s32.s8 %r2226, %r2225; - cvt.u32.u16 %r2227, %rs1386; - cvt.s32.s8 %r2228, %r2227; - mad.lo.s32 %r2229, %r96, %r2228, %r2220; - mad.lo.s32 %r2230, %r97, %r2226, %r2229; - mad.lo.s32 %r2231, %r99, %r2224, %r2230; - mad.lo.s32 %r2232, %r100, %r2222, %r2231; - ld.const.v4.u8 {%rs1394, %rs1395, %rs1396, %rs1397}, [matrix+696]; - cvt.u32.u16 %r2233, %rs1397; - cvt.s32.s8 %r2234, %r2233; - cvt.u32.u16 %r2235, %rs1396; - cvt.s32.s8 %r2236, %r2235; - cvt.u32.u16 %r2237, %rs1395; - cvt.s32.s8 %r2238, %r2237; - cvt.u32.u16 %r2239, %rs1394; - cvt.s32.s8 %r2240, %r2239; - mad.lo.s32 %r2241, %r103, %r2240, %r2232; - mad.lo.s32 %r2242, %r104, %r2238, %r2241; - mad.lo.s32 %r2243, %r107, %r2236, 
%r2242; - mad.lo.s32 %r2244, %r108, %r2234, %r2243; - ld.const.v4.u8 {%rs1402, %rs1403, %rs1404, %rs1405}, [matrix+700]; - cvt.u32.u16 %r2245, %rs1405; - cvt.s32.s8 %r2246, %r2245; - cvt.u32.u16 %r2247, %rs1404; - cvt.s32.s8 %r2248, %r2247; - cvt.u32.u16 %r2249, %rs1403; - cvt.s32.s8 %r2250, %r2249; - cvt.u32.u16 %r2251, %rs1402; - cvt.s32.s8 %r2252, %r2251; - mad.lo.s32 %r2253, %r111, %r2252, %r2244; - mad.lo.s32 %r2254, %r112, %r2250, %r2253; - mad.lo.s32 %r2255, %r114, %r2248, %r2254; - mad.lo.s32 %r2256, %r115, %r2246, %r2255; - ld.const.v4.u8 {%rs1410, %rs1411, %rs1412, %rs1413}, [matrix+704]; - cvt.u32.u16 %r2257, %rs1413; - cvt.s32.s8 %r2258, %r2257; - cvt.u32.u16 %r2259, %rs1412; - cvt.s32.s8 %r2260, %r2259; - cvt.u32.u16 %r2261, %rs1410; - cvt.s32.s8 %r2262, %r2261; - cvt.u32.u16 %r2263, %rs1411; - cvt.s32.s8 %r2264, %r2263; - mul.lo.s32 %r2265, %r34, %r2264; - mad.lo.s32 %r2266, %r124, %r2262, %r2265; - mad.lo.s32 %r2267, %r35, %r2260, %r2266; - mad.lo.s32 %r2268, %r36, %r2258, %r2267; - ld.const.v4.u8 {%rs1418, %rs1419, %rs1420, %rs1421}, [matrix+708]; - cvt.u32.u16 %r2269, %rs1421; - cvt.s32.s8 %r2270, %r2269; - cvt.u32.u16 %r2271, %rs1420; - cvt.s32.s8 %r2272, %r2271; - cvt.u32.u16 %r2273, %rs1419; - cvt.s32.s8 %r2274, %r2273; - cvt.u32.u16 %r2275, %rs1418; - cvt.s32.s8 %r2276, %r2275; - mad.lo.s32 %r2277, %r37, %r2276, %r2268; - mad.lo.s32 %r2278, %r38, %r2274, %r2277; - mad.lo.s32 %r2279, %r39, %r2272, %r2278; - mad.lo.s32 %r2280, %r40, %r2270, %r2279; - ld.const.v4.u8 {%rs1426, %rs1427, %rs1428, %rs1429}, [matrix+712]; - cvt.u32.u16 %r2281, %rs1429; - cvt.s32.s8 %r2282, %r2281; - cvt.u32.u16 %r2283, %rs1428; - cvt.s32.s8 %r2284, %r2283; - cvt.u32.u16 %r2285, %rs1427; - cvt.s32.s8 %r2286, %r2285; - cvt.u32.u16 %r2287, %rs1426; - cvt.s32.s8 %r2288, %r2287; - mad.lo.s32 %r2289, %r42, %r2288, %r2280; - mad.lo.s32 %r2290, %r43, %r2286, %r2289; - mad.lo.s32 %r2291, %r45, %r2284, %r2290; - mad.lo.s32 %r2292, %r46, %r2282, %r2291; - ld.const.v4.u8 {%rs1434, %rs1435, %rs1436, %rs1437}, [matrix+716]; - cvt.u32.u16 %r2293, %rs1437; - cvt.s32.s8 %r2294, %r2293; - cvt.u32.u16 %r2295, %rs1436; - cvt.s32.s8 %r2296, %r2295; - cvt.u32.u16 %r2297, %rs1435; - cvt.s32.s8 %r2298, %r2297; - cvt.u32.u16 %r2299, %rs1434; - cvt.s32.s8 %r2300, %r2299; - mad.lo.s32 %r2301, %r48, %r2300, %r2292; - mad.lo.s32 %r2302, %r49, %r2298, %r2301; - mad.lo.s32 %r2303, %r50, %r2296, %r2302; - mad.lo.s32 %r2304, %r51, %r2294, %r2303; - ld.const.v4.u8 {%rs1442, %rs1443, %rs1444, %rs1445}, [matrix+720]; - cvt.u32.u16 %r2305, %rs1445; - cvt.s32.s8 %r2306, %r2305; - cvt.u32.u16 %r2307, %rs1444; - cvt.s32.s8 %r2308, %r2307; - cvt.u32.u16 %r2309, %rs1443; - cvt.s32.s8 %r2310, %r2309; - cvt.u32.u16 %r2311, %rs1442; - cvt.s32.s8 %r2312, %r2311; - mad.lo.s32 %r2313, %r173, %r2312, %r2304; - mad.lo.s32 %r2314, %r53, %r2310, %r2313; - mad.lo.s32 %r2315, %r54, %r2308, %r2314; - mad.lo.s32 %r2316, %r55, %r2306, %r2315; - ld.const.v4.u8 {%rs1450, %rs1451, %rs1452, %rs1453}, [matrix+724]; - cvt.u32.u16 %r2317, %rs1453; - cvt.s32.s8 %r2318, %r2317; - cvt.u32.u16 %r2319, %rs1452; - cvt.s32.s8 %r2320, %r2319; - cvt.u32.u16 %r2321, %rs1451; - cvt.s32.s8 %r2322, %r2321; - cvt.u32.u16 %r2323, %rs1450; - cvt.s32.s8 %r2324, %r2323; - mad.lo.s32 %r2325, %r56, %r2324, %r2316; - mad.lo.s32 %r2326, %r57, %r2322, %r2325; - mad.lo.s32 %r2327, %r58, %r2320, %r2326; - mad.lo.s32 %r2328, %r59, %r2318, %r2327; - ld.const.v4.u8 {%rs1458, %rs1459, %rs1460, %rs1461}, [matrix+728]; - cvt.u32.u16 %r2329, %rs1461; - cvt.s32.s8 %r2330, %r2329; - 
cvt.u32.u16 %r2331, %rs1460; - cvt.s32.s8 %r2332, %r2331; - cvt.u32.u16 %r2333, %rs1459; - cvt.s32.s8 %r2334, %r2333; - cvt.u32.u16 %r2335, %rs1458; - cvt.s32.s8 %r2336, %r2335; - mad.lo.s32 %r2337, %r61, %r2336, %r2328; - mad.lo.s32 %r2338, %r62, %r2334, %r2337; - mad.lo.s32 %r2339, %r64, %r2332, %r2338; - mad.lo.s32 %r2340, %r65, %r2330, %r2339; - ld.const.v4.u8 {%rs1466, %rs1467, %rs1468, %rs1469}, [matrix+732]; - cvt.u32.u16 %r2341, %rs1469; - cvt.s32.s8 %r2342, %r2341; - cvt.u32.u16 %r2343, %rs1468; - cvt.s32.s8 %r2344, %r2343; - cvt.u32.u16 %r2345, %rs1467; - cvt.s32.s8 %r2346, %r2345; - cvt.u32.u16 %r2347, %rs1466; - cvt.s32.s8 %r2348, %r2347; - mad.lo.s32 %r2349, %r67, %r2348, %r2340; - mad.lo.s32 %r2350, %r68, %r2346, %r2349; - mad.lo.s32 %r2351, %r69, %r2344, %r2350; - mad.lo.s32 %r2352, %r70, %r2342, %r2351; - ld.const.v4.u8 {%rs1474, %rs1475, %rs1476, %rs1477}, [matrix+736]; - cvt.u32.u16 %r2353, %rs1477; - cvt.s32.s8 %r2354, %r2353; - cvt.u32.u16 %r2355, %rs1476; - cvt.s32.s8 %r2356, %r2355; - cvt.u32.u16 %r2357, %rs1475; - cvt.s32.s8 %r2358, %r2357; - cvt.u32.u16 %r2359, %rs1474; - cvt.s32.s8 %r2360, %r2359; - mad.lo.s32 %r2361, %r222, %r2360, %r2352; - mad.lo.s32 %r2362, %r72, %r2358, %r2361; - mad.lo.s32 %r2363, %r73, %r2356, %r2362; - mad.lo.s32 %r2364, %r74, %r2354, %r2363; - ld.const.v4.u8 {%rs1482, %rs1483, %rs1484, %rs1485}, [matrix+740]; - cvt.u32.u16 %r2365, %rs1485; - cvt.s32.s8 %r2366, %r2365; - cvt.u32.u16 %r2367, %rs1484; - cvt.s32.s8 %r2368, %r2367; - cvt.u32.u16 %r2369, %rs1483; - cvt.s32.s8 %r2370, %r2369; - cvt.u32.u16 %r2371, %rs1482; - cvt.s32.s8 %r2372, %r2371; - mad.lo.s32 %r2373, %r75, %r2372, %r2364; - mad.lo.s32 %r2374, %r76, %r2370, %r2373; - mad.lo.s32 %r2375, %r77, %r2368, %r2374; - mad.lo.s32 %r2376, %r78, %r2366, %r2375; - ld.const.v4.u8 {%rs1490, %rs1491, %rs1492, %rs1493}, [matrix+744]; - cvt.u32.u16 %r2377, %rs1493; - cvt.s32.s8 %r2378, %r2377; - cvt.u32.u16 %r2379, %rs1492; - cvt.s32.s8 %r2380, %r2379; - cvt.u32.u16 %r2381, %rs1491; - cvt.s32.s8 %r2382, %r2381; - cvt.u32.u16 %r2383, %rs1490; - cvt.s32.s8 %r2384, %r2383; - mad.lo.s32 %r2385, %r80, %r2384, %r2376; - mad.lo.s32 %r2386, %r81, %r2382, %r2385; - mad.lo.s32 %r2387, %r83, %r2380, %r2386; - mad.lo.s32 %r2388, %r84, %r2378, %r2387; - ld.const.v4.u8 {%rs1498, %rs1499, %rs1500, %rs1501}, [matrix+748]; - cvt.u32.u16 %r2389, %rs1501; - cvt.s32.s8 %r2390, %r2389; - cvt.u32.u16 %r2391, %rs1500; - cvt.s32.s8 %r2392, %r2391; - cvt.u32.u16 %r2393, %rs1499; - cvt.s32.s8 %r2394, %r2393; - cvt.u32.u16 %r2395, %rs1498; - cvt.s32.s8 %r2396, %r2395; - mad.lo.s32 %r2397, %r86, %r2396, %r2388; - mad.lo.s32 %r2398, %r87, %r2394, %r2397; - mad.lo.s32 %r2399, %r88, %r2392, %r2398; - mad.lo.s32 %r2400, %r89, %r2390, %r2399; - ld.const.v4.u8 {%rs1506, %rs1507, %rs1508, %rs1509}, [matrix+752]; - cvt.u32.u16 %r2401, %rs1509; - cvt.s32.s8 %r2402, %r2401; - cvt.u32.u16 %r2403, %rs1508; - cvt.s32.s8 %r2404, %r2403; - cvt.u32.u16 %r2405, %rs1507; - cvt.s32.s8 %r2406, %r2405; - cvt.u32.u16 %r2407, %rs1506; - cvt.s32.s8 %r2408, %r2407; - mad.lo.s32 %r2409, %r271, %r2408, %r2400; - mad.lo.s32 %r2410, %r91, %r2406, %r2409; - mad.lo.s32 %r2411, %r93, %r2404, %r2410; - mad.lo.s32 %r2412, %r94, %r2402, %r2411; - ld.const.v4.u8 {%rs1514, %rs1515, %rs1516, %rs1517}, [matrix+756]; - cvt.u32.u16 %r2413, %rs1517; - cvt.s32.s8 %r2414, %r2413; - cvt.u32.u16 %r2415, %rs1516; - cvt.s32.s8 %r2416, %r2415; - cvt.u32.u16 %r2417, %rs1515; - cvt.s32.s8 %r2418, %r2417; - cvt.u32.u16 %r2419, %rs1514; - cvt.s32.s8 %r2420, %r2419; - 
mad.lo.s32 %r2421, %r96, %r2420, %r2412; - mad.lo.s32 %r2422, %r97, %r2418, %r2421; - mad.lo.s32 %r2423, %r99, %r2416, %r2422; - mad.lo.s32 %r2424, %r100, %r2414, %r2423; - ld.const.v4.u8 {%rs1522, %rs1523, %rs1524, %rs1525}, [matrix+760]; - cvt.u32.u16 %r2425, %rs1525; - cvt.s32.s8 %r2426, %r2425; - cvt.u32.u16 %r2427, %rs1524; - cvt.s32.s8 %r2428, %r2427; - cvt.u32.u16 %r2429, %rs1523; - cvt.s32.s8 %r2430, %r2429; - cvt.u32.u16 %r2431, %rs1522; - cvt.s32.s8 %r2432, %r2431; - mad.lo.s32 %r2433, %r103, %r2432, %r2424; - mad.lo.s32 %r2434, %r104, %r2430, %r2433; - mad.lo.s32 %r2435, %r107, %r2428, %r2434; - mad.lo.s32 %r2436, %r108, %r2426, %r2435; - ld.const.v4.u8 {%rs1530, %rs1531, %rs1532, %rs1533}, [matrix+764]; - cvt.u32.u16 %r2437, %rs1533; - cvt.s32.s8 %r2438, %r2437; - cvt.u32.u16 %r2439, %rs1532; - cvt.s32.s8 %r2440, %r2439; - cvt.u32.u16 %r2441, %rs1531; - cvt.s32.s8 %r2442, %r2441; - cvt.u32.u16 %r2443, %rs1530; - cvt.s32.s8 %r2444, %r2443; - mad.lo.s32 %r2445, %r111, %r2444, %r2436; - mad.lo.s32 %r2446, %r112, %r2442, %r2445; - mad.lo.s32 %r2447, %r114, %r2440, %r2446; - mad.lo.s32 %r2448, %r115, %r2438, %r2447; - shr.u32 %r2449, %r2256, 6; - and.b32 %r2450, %r2449, 240; - shr.u32 %r2451, %r2448, 10; - or.b32 %r2452, %r2451, %r2450; - xor.b32 %r2453, %r16, %r2452; - cvt.u64.u32 %rd385, %r2453; - ld.const.v4.u8 {%rs1538, %rs1539, %rs1540, %rs1541}, [matrix+768]; - cvt.u32.u16 %r2454, %rs1541; - cvt.s32.s8 %r2455, %r2454; - cvt.u32.u16 %r2456, %rs1540; - cvt.s32.s8 %r2457, %r2456; - cvt.u32.u16 %r2458, %rs1538; - cvt.s32.s8 %r2459, %r2458; - cvt.u32.u16 %r2460, %rs1539; - cvt.s32.s8 %r2461, %r2460; - mul.lo.s32 %r2462, %r34, %r2461; - mad.lo.s32 %r2463, %r124, %r2459, %r2462; - mad.lo.s32 %r2464, %r35, %r2457, %r2463; - mad.lo.s32 %r2465, %r36, %r2455, %r2464; - ld.const.v4.u8 {%rs1546, %rs1547, %rs1548, %rs1549}, [matrix+772]; - cvt.u32.u16 %r2466, %rs1549; - cvt.s32.s8 %r2467, %r2466; - cvt.u32.u16 %r2468, %rs1548; - cvt.s32.s8 %r2469, %r2468; - cvt.u32.u16 %r2470, %rs1547; - cvt.s32.s8 %r2471, %r2470; - cvt.u32.u16 %r2472, %rs1546; - cvt.s32.s8 %r2473, %r2472; - mad.lo.s32 %r2474, %r37, %r2473, %r2465; - mad.lo.s32 %r2475, %r38, %r2471, %r2474; - mad.lo.s32 %r2476, %r39, %r2469, %r2475; - mad.lo.s32 %r2477, %r40, %r2467, %r2476; - ld.const.v4.u8 {%rs1554, %rs1555, %rs1556, %rs1557}, [matrix+776]; - cvt.u32.u16 %r2478, %rs1557; - cvt.s32.s8 %r2479, %r2478; - cvt.u32.u16 %r2480, %rs1556; - cvt.s32.s8 %r2481, %r2480; - cvt.u32.u16 %r2482, %rs1555; - cvt.s32.s8 %r2483, %r2482; - cvt.u32.u16 %r2484, %rs1554; - cvt.s32.s8 %r2485, %r2484; - mad.lo.s32 %r2486, %r42, %r2485, %r2477; - mad.lo.s32 %r2487, %r43, %r2483, %r2486; - mad.lo.s32 %r2488, %r45, %r2481, %r2487; - mad.lo.s32 %r2489, %r46, %r2479, %r2488; - ld.const.v4.u8 {%rs1562, %rs1563, %rs1564, %rs1565}, [matrix+780]; - cvt.u32.u16 %r2490, %rs1565; - cvt.s32.s8 %r2491, %r2490; - cvt.u32.u16 %r2492, %rs1564; - cvt.s32.s8 %r2493, %r2492; - cvt.u32.u16 %r2494, %rs1563; - cvt.s32.s8 %r2495, %r2494; - cvt.u32.u16 %r2496, %rs1562; - cvt.s32.s8 %r2497, %r2496; - mad.lo.s32 %r2498, %r48, %r2497, %r2489; - mad.lo.s32 %r2499, %r49, %r2495, %r2498; - mad.lo.s32 %r2500, %r50, %r2493, %r2499; - mad.lo.s32 %r2501, %r51, %r2491, %r2500; - ld.const.v4.u8 {%rs1570, %rs1571, %rs1572, %rs1573}, [matrix+784]; - cvt.u32.u16 %r2502, %rs1573; - cvt.s32.s8 %r2503, %r2502; - cvt.u32.u16 %r2504, %rs1572; - cvt.s32.s8 %r2505, %r2504; - cvt.u32.u16 %r2506, %rs1571; - cvt.s32.s8 %r2507, %r2506; - cvt.u32.u16 %r2508, %rs1570; - cvt.s32.s8 %r2509, %r2508; - 
mad.lo.s32 %r2510, %r173, %r2509, %r2501; - mad.lo.s32 %r2511, %r53, %r2507, %r2510; - mad.lo.s32 %r2512, %r54, %r2505, %r2511; - mad.lo.s32 %r2513, %r55, %r2503, %r2512; - ld.const.v4.u8 {%rs1578, %rs1579, %rs1580, %rs1581}, [matrix+788]; - cvt.u32.u16 %r2514, %rs1581; - cvt.s32.s8 %r2515, %r2514; - cvt.u32.u16 %r2516, %rs1580; - cvt.s32.s8 %r2517, %r2516; - cvt.u32.u16 %r2518, %rs1579; - cvt.s32.s8 %r2519, %r2518; - cvt.u32.u16 %r2520, %rs1578; - cvt.s32.s8 %r2521, %r2520; - mad.lo.s32 %r2522, %r56, %r2521, %r2513; - mad.lo.s32 %r2523, %r57, %r2519, %r2522; - mad.lo.s32 %r2524, %r58, %r2517, %r2523; - mad.lo.s32 %r2525, %r59, %r2515, %r2524; - ld.const.v4.u8 {%rs1586, %rs1587, %rs1588, %rs1589}, [matrix+792]; - cvt.u32.u16 %r2526, %rs1589; - cvt.s32.s8 %r2527, %r2526; - cvt.u32.u16 %r2528, %rs1588; - cvt.s32.s8 %r2529, %r2528; - cvt.u32.u16 %r2530, %rs1587; - cvt.s32.s8 %r2531, %r2530; - cvt.u32.u16 %r2532, %rs1586; - cvt.s32.s8 %r2533, %r2532; - mad.lo.s32 %r2534, %r61, %r2533, %r2525; - mad.lo.s32 %r2535, %r62, %r2531, %r2534; - mad.lo.s32 %r2536, %r64, %r2529, %r2535; - mad.lo.s32 %r2537, %r65, %r2527, %r2536; - ld.const.v4.u8 {%rs1594, %rs1595, %rs1596, %rs1597}, [matrix+796]; - cvt.u32.u16 %r2538, %rs1597; - cvt.s32.s8 %r2539, %r2538; - cvt.u32.u16 %r2540, %rs1596; - cvt.s32.s8 %r2541, %r2540; - cvt.u32.u16 %r2542, %rs1595; - cvt.s32.s8 %r2543, %r2542; - cvt.u32.u16 %r2544, %rs1594; - cvt.s32.s8 %r2545, %r2544; - mad.lo.s32 %r2546, %r67, %r2545, %r2537; - mad.lo.s32 %r2547, %r68, %r2543, %r2546; - mad.lo.s32 %r2548, %r69, %r2541, %r2547; - mad.lo.s32 %r2549, %r70, %r2539, %r2548; - ld.const.v4.u8 {%rs1602, %rs1603, %rs1604, %rs1605}, [matrix+800]; - cvt.u32.u16 %r2550, %rs1605; - cvt.s32.s8 %r2551, %r2550; - cvt.u32.u16 %r2552, %rs1604; - cvt.s32.s8 %r2553, %r2552; - cvt.u32.u16 %r2554, %rs1603; - cvt.s32.s8 %r2555, %r2554; - cvt.u32.u16 %r2556, %rs1602; - cvt.s32.s8 %r2557, %r2556; - mad.lo.s32 %r2558, %r222, %r2557, %r2549; - mad.lo.s32 %r2559, %r72, %r2555, %r2558; - mad.lo.s32 %r2560, %r73, %r2553, %r2559; - mad.lo.s32 %r2561, %r74, %r2551, %r2560; - ld.const.v4.u8 {%rs1610, %rs1611, %rs1612, %rs1613}, [matrix+804]; - cvt.u32.u16 %r2562, %rs1613; - cvt.s32.s8 %r2563, %r2562; - cvt.u32.u16 %r2564, %rs1612; - cvt.s32.s8 %r2565, %r2564; - cvt.u32.u16 %r2566, %rs1611; - cvt.s32.s8 %r2567, %r2566; - cvt.u32.u16 %r2568, %rs1610; - cvt.s32.s8 %r2569, %r2568; - mad.lo.s32 %r2570, %r75, %r2569, %r2561; - mad.lo.s32 %r2571, %r76, %r2567, %r2570; - mad.lo.s32 %r2572, %r77, %r2565, %r2571; - mad.lo.s32 %r2573, %r78, %r2563, %r2572; - ld.const.v4.u8 {%rs1618, %rs1619, %rs1620, %rs1621}, [matrix+808]; - cvt.u32.u16 %r2574, %rs1621; - cvt.s32.s8 %r2575, %r2574; - cvt.u32.u16 %r2576, %rs1620; - cvt.s32.s8 %r2577, %r2576; - cvt.u32.u16 %r2578, %rs1619; - cvt.s32.s8 %r2579, %r2578; - cvt.u32.u16 %r2580, %rs1618; - cvt.s32.s8 %r2581, %r2580; - mad.lo.s32 %r2582, %r80, %r2581, %r2573; - mad.lo.s32 %r2583, %r81, %r2579, %r2582; - mad.lo.s32 %r2584, %r83, %r2577, %r2583; - mad.lo.s32 %r2585, %r84, %r2575, %r2584; - ld.const.v4.u8 {%rs1626, %rs1627, %rs1628, %rs1629}, [matrix+812]; - cvt.u32.u16 %r2586, %rs1629; - cvt.s32.s8 %r2587, %r2586; - cvt.u32.u16 %r2588, %rs1628; - cvt.s32.s8 %r2589, %r2588; - cvt.u32.u16 %r2590, %rs1627; - cvt.s32.s8 %r2591, %r2590; - cvt.u32.u16 %r2592, %rs1626; - cvt.s32.s8 %r2593, %r2592; - mad.lo.s32 %r2594, %r86, %r2593, %r2585; - mad.lo.s32 %r2595, %r87, %r2591, %r2594; - mad.lo.s32 %r2596, %r88, %r2589, %r2595; - mad.lo.s32 %r2597, %r89, %r2587, %r2596; - 
ld.const.v4.u8 {%rs1634, %rs1635, %rs1636, %rs1637}, [matrix+816]; - cvt.u32.u16 %r2598, %rs1637; - cvt.s32.s8 %r2599, %r2598; - cvt.u32.u16 %r2600, %rs1636; - cvt.s32.s8 %r2601, %r2600; - cvt.u32.u16 %r2602, %rs1635; - cvt.s32.s8 %r2603, %r2602; - cvt.u32.u16 %r2604, %rs1634; - cvt.s32.s8 %r2605, %r2604; - mad.lo.s32 %r2606, %r271, %r2605, %r2597; - mad.lo.s32 %r2607, %r91, %r2603, %r2606; - mad.lo.s32 %r2608, %r93, %r2601, %r2607; - mad.lo.s32 %r2609, %r94, %r2599, %r2608; - ld.const.v4.u8 {%rs1642, %rs1643, %rs1644, %rs1645}, [matrix+820]; - cvt.u32.u16 %r2610, %rs1645; - cvt.s32.s8 %r2611, %r2610; - cvt.u32.u16 %r2612, %rs1644; - cvt.s32.s8 %r2613, %r2612; - cvt.u32.u16 %r2614, %rs1643; - cvt.s32.s8 %r2615, %r2614; - cvt.u32.u16 %r2616, %rs1642; - cvt.s32.s8 %r2617, %r2616; - mad.lo.s32 %r2618, %r96, %r2617, %r2609; - mad.lo.s32 %r2619, %r97, %r2615, %r2618; - mad.lo.s32 %r2620, %r99, %r2613, %r2619; - mad.lo.s32 %r2621, %r100, %r2611, %r2620; - ld.const.v4.u8 {%rs1650, %rs1651, %rs1652, %rs1653}, [matrix+824]; - cvt.u32.u16 %r2622, %rs1653; - cvt.s32.s8 %r2623, %r2622; - cvt.u32.u16 %r2624, %rs1652; - cvt.s32.s8 %r2625, %r2624; - cvt.u32.u16 %r2626, %rs1651; - cvt.s32.s8 %r2627, %r2626; - cvt.u32.u16 %r2628, %rs1650; - cvt.s32.s8 %r2629, %r2628; - mad.lo.s32 %r2630, %r103, %r2629, %r2621; - mad.lo.s32 %r2631, %r104, %r2627, %r2630; - mad.lo.s32 %r2632, %r107, %r2625, %r2631; - mad.lo.s32 %r2633, %r108, %r2623, %r2632; - ld.const.v4.u8 {%rs1658, %rs1659, %rs1660, %rs1661}, [matrix+828]; - cvt.u32.u16 %r2634, %rs1661; - cvt.s32.s8 %r2635, %r2634; - cvt.u32.u16 %r2636, %rs1660; - cvt.s32.s8 %r2637, %r2636; - cvt.u32.u16 %r2638, %rs1659; - cvt.s32.s8 %r2639, %r2638; - cvt.u32.u16 %r2640, %rs1658; - cvt.s32.s8 %r2641, %r2640; - mad.lo.s32 %r2642, %r111, %r2641, %r2633; - mad.lo.s32 %r2643, %r112, %r2639, %r2642; - mad.lo.s32 %r2644, %r114, %r2637, %r2643; - mad.lo.s32 %r2645, %r115, %r2635, %r2644; - ld.const.v4.u8 {%rs1666, %rs1667, %rs1668, %rs1669}, [matrix+832]; - cvt.u32.u16 %r2646, %rs1669; - cvt.s32.s8 %r2647, %r2646; - cvt.u32.u16 %r2648, %rs1668; - cvt.s32.s8 %r2649, %r2648; - cvt.u32.u16 %r2650, %rs1666; - cvt.s32.s8 %r2651, %r2650; - cvt.u32.u16 %r2652, %rs1667; - cvt.s32.s8 %r2653, %r2652; - mul.lo.s32 %r2654, %r34, %r2653; - mad.lo.s32 %r2655, %r124, %r2651, %r2654; - mad.lo.s32 %r2656, %r35, %r2649, %r2655; - mad.lo.s32 %r2657, %r36, %r2647, %r2656; - ld.const.v4.u8 {%rs1674, %rs1675, %rs1676, %rs1677}, [matrix+836]; - cvt.u32.u16 %r2658, %rs1677; - cvt.s32.s8 %r2659, %r2658; - cvt.u32.u16 %r2660, %rs1676; - cvt.s32.s8 %r2661, %r2660; - cvt.u32.u16 %r2662, %rs1675; - cvt.s32.s8 %r2663, %r2662; - cvt.u32.u16 %r2664, %rs1674; - cvt.s32.s8 %r2665, %r2664; - mad.lo.s32 %r2666, %r37, %r2665, %r2657; - mad.lo.s32 %r2667, %r38, %r2663, %r2666; - mad.lo.s32 %r2668, %r39, %r2661, %r2667; - mad.lo.s32 %r2669, %r40, %r2659, %r2668; - ld.const.v4.u8 {%rs1682, %rs1683, %rs1684, %rs1685}, [matrix+840]; - cvt.u32.u16 %r2670, %rs1685; - cvt.s32.s8 %r2671, %r2670; - cvt.u32.u16 %r2672, %rs1684; - cvt.s32.s8 %r2673, %r2672; - cvt.u32.u16 %r2674, %rs1683; - cvt.s32.s8 %r2675, %r2674; - cvt.u32.u16 %r2676, %rs1682; - cvt.s32.s8 %r2677, %r2676; - mad.lo.s32 %r2678, %r42, %r2677, %r2669; - mad.lo.s32 %r2679, %r43, %r2675, %r2678; - mad.lo.s32 %r2680, %r45, %r2673, %r2679; - mad.lo.s32 %r2681, %r46, %r2671, %r2680; - ld.const.v4.u8 {%rs1690, %rs1691, %rs1692, %rs1693}, [matrix+844]; - cvt.u32.u16 %r2682, %rs1693; - cvt.s32.s8 %r2683, %r2682; - cvt.u32.u16 %r2684, %rs1692; - cvt.s32.s8 %r2685, 
%r2684; - cvt.u32.u16 %r2686, %rs1691; - cvt.s32.s8 %r2687, %r2686; - cvt.u32.u16 %r2688, %rs1690; - cvt.s32.s8 %r2689, %r2688; - mad.lo.s32 %r2690, %r48, %r2689, %r2681; - mad.lo.s32 %r2691, %r49, %r2687, %r2690; - mad.lo.s32 %r2692, %r50, %r2685, %r2691; - mad.lo.s32 %r2693, %r51, %r2683, %r2692; - ld.const.v4.u8 {%rs1698, %rs1699, %rs1700, %rs1701}, [matrix+848]; - cvt.u32.u16 %r2694, %rs1701; - cvt.s32.s8 %r2695, %r2694; - cvt.u32.u16 %r2696, %rs1700; - cvt.s32.s8 %r2697, %r2696; - cvt.u32.u16 %r2698, %rs1699; - cvt.s32.s8 %r2699, %r2698; - cvt.u32.u16 %r2700, %rs1698; - cvt.s32.s8 %r2701, %r2700; - mad.lo.s32 %r2702, %r173, %r2701, %r2693; - mad.lo.s32 %r2703, %r53, %r2699, %r2702; - mad.lo.s32 %r2704, %r54, %r2697, %r2703; - mad.lo.s32 %r2705, %r55, %r2695, %r2704; - ld.const.v4.u8 {%rs1706, %rs1707, %rs1708, %rs1709}, [matrix+852]; - cvt.u32.u16 %r2706, %rs1709; - cvt.s32.s8 %r2707, %r2706; - cvt.u32.u16 %r2708, %rs1708; - cvt.s32.s8 %r2709, %r2708; - cvt.u32.u16 %r2710, %rs1707; - cvt.s32.s8 %r2711, %r2710; - cvt.u32.u16 %r2712, %rs1706; - cvt.s32.s8 %r2713, %r2712; - mad.lo.s32 %r2714, %r56, %r2713, %r2705; - mad.lo.s32 %r2715, %r57, %r2711, %r2714; - mad.lo.s32 %r2716, %r58, %r2709, %r2715; - mad.lo.s32 %r2717, %r59, %r2707, %r2716; - ld.const.v4.u8 {%rs1714, %rs1715, %rs1716, %rs1717}, [matrix+856]; - cvt.u32.u16 %r2718, %rs1717; - cvt.s32.s8 %r2719, %r2718; - cvt.u32.u16 %r2720, %rs1716; - cvt.s32.s8 %r2721, %r2720; - cvt.u32.u16 %r2722, %rs1715; - cvt.s32.s8 %r2723, %r2722; - cvt.u32.u16 %r2724, %rs1714; - cvt.s32.s8 %r2725, %r2724; - mad.lo.s32 %r2726, %r61, %r2725, %r2717; - mad.lo.s32 %r2727, %r62, %r2723, %r2726; - mad.lo.s32 %r2728, %r64, %r2721, %r2727; - mad.lo.s32 %r2729, %r65, %r2719, %r2728; - ld.const.v4.u8 {%rs1722, %rs1723, %rs1724, %rs1725}, [matrix+860]; - cvt.u32.u16 %r2730, %rs1725; - cvt.s32.s8 %r2731, %r2730; - cvt.u32.u16 %r2732, %rs1724; - cvt.s32.s8 %r2733, %r2732; - cvt.u32.u16 %r2734, %rs1723; - cvt.s32.s8 %r2735, %r2734; - cvt.u32.u16 %r2736, %rs1722; - cvt.s32.s8 %r2737, %r2736; - mad.lo.s32 %r2738, %r67, %r2737, %r2729; - mad.lo.s32 %r2739, %r68, %r2735, %r2738; - mad.lo.s32 %r2740, %r69, %r2733, %r2739; - mad.lo.s32 %r2741, %r70, %r2731, %r2740; - ld.const.v4.u8 {%rs1730, %rs1731, %rs1732, %rs1733}, [matrix+864]; - cvt.u32.u16 %r2742, %rs1733; - cvt.s32.s8 %r2743, %r2742; - cvt.u32.u16 %r2744, %rs1732; - cvt.s32.s8 %r2745, %r2744; - cvt.u32.u16 %r2746, %rs1731; - cvt.s32.s8 %r2747, %r2746; - cvt.u32.u16 %r2748, %rs1730; - cvt.s32.s8 %r2749, %r2748; - mad.lo.s32 %r2750, %r222, %r2749, %r2741; - mad.lo.s32 %r2751, %r72, %r2747, %r2750; - mad.lo.s32 %r2752, %r73, %r2745, %r2751; - mad.lo.s32 %r2753, %r74, %r2743, %r2752; - ld.const.v4.u8 {%rs1738, %rs1739, %rs1740, %rs1741}, [matrix+868]; - cvt.u32.u16 %r2754, %rs1741; - cvt.s32.s8 %r2755, %r2754; - cvt.u32.u16 %r2756, %rs1740; - cvt.s32.s8 %r2757, %r2756; - cvt.u32.u16 %r2758, %rs1739; - cvt.s32.s8 %r2759, %r2758; - cvt.u32.u16 %r2760, %rs1738; - cvt.s32.s8 %r2761, %r2760; - mad.lo.s32 %r2762, %r75, %r2761, %r2753; - mad.lo.s32 %r2763, %r76, %r2759, %r2762; - mad.lo.s32 %r2764, %r77, %r2757, %r2763; - mad.lo.s32 %r2765, %r78, %r2755, %r2764; - ld.const.v4.u8 {%rs1746, %rs1747, %rs1748, %rs1749}, [matrix+872]; - cvt.u32.u16 %r2766, %rs1749; - cvt.s32.s8 %r2767, %r2766; - cvt.u32.u16 %r2768, %rs1748; - cvt.s32.s8 %r2769, %r2768; - cvt.u32.u16 %r2770, %rs1747; - cvt.s32.s8 %r2771, %r2770; - cvt.u32.u16 %r2772, %rs1746; - cvt.s32.s8 %r2773, %r2772; - mad.lo.s32 %r2774, %r80, %r2773, %r2765; - mad.lo.s32 
%r2775, %r81, %r2771, %r2774;
[... several hundred further deleted lines of compiler-generated PTX elided: the same unrolled pattern repeats for constant-matrix offsets [matrix+876] through [matrix+1424]; each ld.const.v4.u8 load is sign-extended byte-by-byte via cvt.u32.u16/cvt.s32.s8 and folded into the running dot products with mad.lo.s32, and every 64-byte stripe ends in a shr.u32/and.b32/or.b32 nibble-packing step whose result is xor.b32'ed against %r17, %r18, %r52, %r19, %r20 and widened into %rd386, %rd387, %rd388 ...]
- mad.lo.s32
%r4458, %r55, %r4448, %r4457; - ld.const.v4.u8 {%rs2858, %rs2859, %rs2860, %rs2861}, [matrix+1428]; - cvt.u32.u16 %r4459, %rs2861; - cvt.s32.s8 %r4460, %r4459; - cvt.u32.u16 %r4461, %rs2860; - cvt.s32.s8 %r4462, %r4461; - cvt.u32.u16 %r4463, %rs2859; - cvt.s32.s8 %r4464, %r4463; - cvt.u32.u16 %r4465, %rs2858; - cvt.s32.s8 %r4466, %r4465; - mad.lo.s32 %r4467, %r56, %r4466, %r4458; - mad.lo.s32 %r4468, %r57, %r4464, %r4467; - mad.lo.s32 %r4469, %r58, %r4462, %r4468; - mad.lo.s32 %r4470, %r59, %r4460, %r4469; - ld.const.v4.u8 {%rs2866, %rs2867, %rs2868, %rs2869}, [matrix+1432]; - cvt.u32.u16 %r4471, %rs2869; - cvt.s32.s8 %r4472, %r4471; - cvt.u32.u16 %r4473, %rs2868; - cvt.s32.s8 %r4474, %r4473; - cvt.u32.u16 %r4475, %rs2867; - cvt.s32.s8 %r4476, %r4475; - cvt.u32.u16 %r4477, %rs2866; - cvt.s32.s8 %r4478, %r4477; - mad.lo.s32 %r4479, %r61, %r4478, %r4470; - mad.lo.s32 %r4480, %r62, %r4476, %r4479; - mad.lo.s32 %r4481, %r64, %r4474, %r4480; - mad.lo.s32 %r4482, %r65, %r4472, %r4481; - ld.const.v4.u8 {%rs2874, %rs2875, %rs2876, %rs2877}, [matrix+1436]; - cvt.u32.u16 %r4483, %rs2877; - cvt.s32.s8 %r4484, %r4483; - cvt.u32.u16 %r4485, %rs2876; - cvt.s32.s8 %r4486, %r4485; - cvt.u32.u16 %r4487, %rs2875; - cvt.s32.s8 %r4488, %r4487; - cvt.u32.u16 %r4489, %rs2874; - cvt.s32.s8 %r4490, %r4489; - mad.lo.s32 %r4491, %r67, %r4490, %r4482; - mad.lo.s32 %r4492, %r68, %r4488, %r4491; - mad.lo.s32 %r4493, %r69, %r4486, %r4492; - mad.lo.s32 %r4494, %r70, %r4484, %r4493; - ld.const.v4.u8 {%rs2882, %rs2883, %rs2884, %rs2885}, [matrix+1440]; - cvt.u32.u16 %r4495, %rs2885; - cvt.s32.s8 %r4496, %r4495; - cvt.u32.u16 %r4497, %rs2884; - cvt.s32.s8 %r4498, %r4497; - cvt.u32.u16 %r4499, %rs2883; - cvt.s32.s8 %r4500, %r4499; - cvt.u32.u16 %r4501, %rs2882; - cvt.s32.s8 %r4502, %r4501; - mad.lo.s32 %r4503, %r222, %r4502, %r4494; - mad.lo.s32 %r4504, %r72, %r4500, %r4503; - mad.lo.s32 %r4505, %r73, %r4498, %r4504; - mad.lo.s32 %r4506, %r74, %r4496, %r4505; - ld.const.v4.u8 {%rs2890, %rs2891, %rs2892, %rs2893}, [matrix+1444]; - cvt.u32.u16 %r4507, %rs2893; - cvt.s32.s8 %r4508, %r4507; - cvt.u32.u16 %r4509, %rs2892; - cvt.s32.s8 %r4510, %r4509; - cvt.u32.u16 %r4511, %rs2891; - cvt.s32.s8 %r4512, %r4511; - cvt.u32.u16 %r4513, %rs2890; - cvt.s32.s8 %r4514, %r4513; - mad.lo.s32 %r4515, %r75, %r4514, %r4506; - mad.lo.s32 %r4516, %r76, %r4512, %r4515; - mad.lo.s32 %r4517, %r77, %r4510, %r4516; - mad.lo.s32 %r4518, %r78, %r4508, %r4517; - ld.const.v4.u8 {%rs2898, %rs2899, %rs2900, %rs2901}, [matrix+1448]; - cvt.u32.u16 %r4519, %rs2901; - cvt.s32.s8 %r4520, %r4519; - cvt.u32.u16 %r4521, %rs2900; - cvt.s32.s8 %r4522, %r4521; - cvt.u32.u16 %r4523, %rs2899; - cvt.s32.s8 %r4524, %r4523; - cvt.u32.u16 %r4525, %rs2898; - cvt.s32.s8 %r4526, %r4525; - mad.lo.s32 %r4527, %r80, %r4526, %r4518; - mad.lo.s32 %r4528, %r81, %r4524, %r4527; - mad.lo.s32 %r4529, %r83, %r4522, %r4528; - mad.lo.s32 %r4530, %r84, %r4520, %r4529; - ld.const.v4.u8 {%rs2906, %rs2907, %rs2908, %rs2909}, [matrix+1452]; - cvt.u32.u16 %r4531, %rs2909; - cvt.s32.s8 %r4532, %r4531; - cvt.u32.u16 %r4533, %rs2908; - cvt.s32.s8 %r4534, %r4533; - cvt.u32.u16 %r4535, %rs2907; - cvt.s32.s8 %r4536, %r4535; - cvt.u32.u16 %r4537, %rs2906; - cvt.s32.s8 %r4538, %r4537; - mad.lo.s32 %r4539, %r86, %r4538, %r4530; - mad.lo.s32 %r4540, %r87, %r4536, %r4539; - mad.lo.s32 %r4541, %r88, %r4534, %r4540; - mad.lo.s32 %r4542, %r89, %r4532, %r4541; - ld.const.v4.u8 {%rs2914, %rs2915, %rs2916, %rs2917}, [matrix+1456]; - cvt.u32.u16 %r4543, %rs2917; - cvt.s32.s8 %r4544, %r4543; - cvt.u32.u16 
%r4545, %rs2916; - cvt.s32.s8 %r4546, %r4545; - cvt.u32.u16 %r4547, %rs2915; - cvt.s32.s8 %r4548, %r4547; - cvt.u32.u16 %r4549, %rs2914; - cvt.s32.s8 %r4550, %r4549; - mad.lo.s32 %r4551, %r271, %r4550, %r4542; - mad.lo.s32 %r4552, %r91, %r4548, %r4551; - mad.lo.s32 %r4553, %r93, %r4546, %r4552; - mad.lo.s32 %r4554, %r94, %r4544, %r4553; - ld.const.v4.u8 {%rs2922, %rs2923, %rs2924, %rs2925}, [matrix+1460]; - cvt.u32.u16 %r4555, %rs2925; - cvt.s32.s8 %r4556, %r4555; - cvt.u32.u16 %r4557, %rs2924; - cvt.s32.s8 %r4558, %r4557; - cvt.u32.u16 %r4559, %rs2923; - cvt.s32.s8 %r4560, %r4559; - cvt.u32.u16 %r4561, %rs2922; - cvt.s32.s8 %r4562, %r4561; - mad.lo.s32 %r4563, %r96, %r4562, %r4554; - mad.lo.s32 %r4564, %r97, %r4560, %r4563; - mad.lo.s32 %r4565, %r99, %r4558, %r4564; - mad.lo.s32 %r4566, %r100, %r4556, %r4565; - ld.const.v4.u8 {%rs2930, %rs2931, %rs2932, %rs2933}, [matrix+1464]; - cvt.u32.u16 %r4567, %rs2933; - cvt.s32.s8 %r4568, %r4567; - cvt.u32.u16 %r4569, %rs2932; - cvt.s32.s8 %r4570, %r4569; - cvt.u32.u16 %r4571, %rs2931; - cvt.s32.s8 %r4572, %r4571; - cvt.u32.u16 %r4573, %rs2930; - cvt.s32.s8 %r4574, %r4573; - mad.lo.s32 %r4575, %r103, %r4574, %r4566; - mad.lo.s32 %r4576, %r104, %r4572, %r4575; - mad.lo.s32 %r4577, %r107, %r4570, %r4576; - mad.lo.s32 %r4578, %r108, %r4568, %r4577; - ld.const.v4.u8 {%rs2938, %rs2939, %rs2940, %rs2941}, [matrix+1468]; - cvt.u32.u16 %r4579, %rs2941; - cvt.s32.s8 %r4580, %r4579; - cvt.u32.u16 %r4581, %rs2940; - cvt.s32.s8 %r4582, %r4581; - cvt.u32.u16 %r4583, %rs2939; - cvt.s32.s8 %r4584, %r4583; - cvt.u32.u16 %r4585, %rs2938; - cvt.s32.s8 %r4586, %r4585; - mad.lo.s32 %r4587, %r111, %r4586, %r4578; - mad.lo.s32 %r4588, %r112, %r4584, %r4587; - mad.lo.s32 %r4589, %r114, %r4582, %r4588; - mad.lo.s32 %r4590, %r115, %r4580, %r4589; - ld.const.v4.u8 {%rs2946, %rs2947, %rs2948, %rs2949}, [matrix+1472]; - cvt.u32.u16 %r4591, %rs2949; - cvt.s32.s8 %r4592, %r4591; - cvt.u32.u16 %r4593, %rs2948; - cvt.s32.s8 %r4594, %r4593; - cvt.u32.u16 %r4595, %rs2946; - cvt.s32.s8 %r4596, %r4595; - cvt.u32.u16 %r4597, %rs2947; - cvt.s32.s8 %r4598, %r4597; - mul.lo.s32 %r4599, %r34, %r4598; - mad.lo.s32 %r4600, %r124, %r4596, %r4599; - mad.lo.s32 %r4601, %r35, %r4594, %r4600; - mad.lo.s32 %r4602, %r36, %r4592, %r4601; - ld.const.v4.u8 {%rs2954, %rs2955, %rs2956, %rs2957}, [matrix+1476]; - cvt.u32.u16 %r4603, %rs2957; - cvt.s32.s8 %r4604, %r4603; - cvt.u32.u16 %r4605, %rs2956; - cvt.s32.s8 %r4606, %r4605; - cvt.u32.u16 %r4607, %rs2955; - cvt.s32.s8 %r4608, %r4607; - cvt.u32.u16 %r4609, %rs2954; - cvt.s32.s8 %r4610, %r4609; - mad.lo.s32 %r4611, %r37, %r4610, %r4602; - mad.lo.s32 %r4612, %r38, %r4608, %r4611; - mad.lo.s32 %r4613, %r39, %r4606, %r4612; - mad.lo.s32 %r4614, %r40, %r4604, %r4613; - ld.const.v4.u8 {%rs2962, %rs2963, %rs2964, %rs2965}, [matrix+1480]; - cvt.u32.u16 %r4615, %rs2965; - cvt.s32.s8 %r4616, %r4615; - cvt.u32.u16 %r4617, %rs2964; - cvt.s32.s8 %r4618, %r4617; - cvt.u32.u16 %r4619, %rs2963; - cvt.s32.s8 %r4620, %r4619; - cvt.u32.u16 %r4621, %rs2962; - cvt.s32.s8 %r4622, %r4621; - mad.lo.s32 %r4623, %r42, %r4622, %r4614; - mad.lo.s32 %r4624, %r43, %r4620, %r4623; - mad.lo.s32 %r4625, %r45, %r4618, %r4624; - mad.lo.s32 %r4626, %r46, %r4616, %r4625; - ld.const.v4.u8 {%rs2970, %rs2971, %rs2972, %rs2973}, [matrix+1484]; - cvt.u32.u16 %r4627, %rs2973; - cvt.s32.s8 %r4628, %r4627; - cvt.u32.u16 %r4629, %rs2972; - cvt.s32.s8 %r4630, %r4629; - cvt.u32.u16 %r4631, %rs2971; - cvt.s32.s8 %r4632, %r4631; - cvt.u32.u16 %r4633, %rs2970; - cvt.s32.s8 %r4634, %r4633; - 
mad.lo.s32 %r4635, %r48, %r4634, %r4626; - mad.lo.s32 %r4636, %r49, %r4632, %r4635; - mad.lo.s32 %r4637, %r50, %r4630, %r4636; - mad.lo.s32 %r4638, %r51, %r4628, %r4637; - ld.const.v4.u8 {%rs2978, %rs2979, %rs2980, %rs2981}, [matrix+1488]; - cvt.u32.u16 %r4639, %rs2981; - cvt.s32.s8 %r4640, %r4639; - cvt.u32.u16 %r4641, %rs2980; - cvt.s32.s8 %r4642, %r4641; - cvt.u32.u16 %r4643, %rs2979; - cvt.s32.s8 %r4644, %r4643; - cvt.u32.u16 %r4645, %rs2978; - cvt.s32.s8 %r4646, %r4645; - mad.lo.s32 %r4647, %r173, %r4646, %r4638; - mad.lo.s32 %r4648, %r53, %r4644, %r4647; - mad.lo.s32 %r4649, %r54, %r4642, %r4648; - mad.lo.s32 %r4650, %r55, %r4640, %r4649; - ld.const.v4.u8 {%rs2986, %rs2987, %rs2988, %rs2989}, [matrix+1492]; - cvt.u32.u16 %r4651, %rs2989; - cvt.s32.s8 %r4652, %r4651; - cvt.u32.u16 %r4653, %rs2988; - cvt.s32.s8 %r4654, %r4653; - cvt.u32.u16 %r4655, %rs2987; - cvt.s32.s8 %r4656, %r4655; - cvt.u32.u16 %r4657, %rs2986; - cvt.s32.s8 %r4658, %r4657; - mad.lo.s32 %r4659, %r56, %r4658, %r4650; - mad.lo.s32 %r4660, %r57, %r4656, %r4659; - mad.lo.s32 %r4661, %r58, %r4654, %r4660; - mad.lo.s32 %r4662, %r59, %r4652, %r4661; - ld.const.v4.u8 {%rs2994, %rs2995, %rs2996, %rs2997}, [matrix+1496]; - cvt.u32.u16 %r4663, %rs2997; - cvt.s32.s8 %r4664, %r4663; - cvt.u32.u16 %r4665, %rs2996; - cvt.s32.s8 %r4666, %r4665; - cvt.u32.u16 %r4667, %rs2995; - cvt.s32.s8 %r4668, %r4667; - cvt.u32.u16 %r4669, %rs2994; - cvt.s32.s8 %r4670, %r4669; - mad.lo.s32 %r4671, %r61, %r4670, %r4662; - mad.lo.s32 %r4672, %r62, %r4668, %r4671; - mad.lo.s32 %r4673, %r64, %r4666, %r4672; - mad.lo.s32 %r4674, %r65, %r4664, %r4673; - ld.const.v4.u8 {%rs3002, %rs3003, %rs3004, %rs3005}, [matrix+1500]; - cvt.u32.u16 %r4675, %rs3005; - cvt.s32.s8 %r4676, %r4675; - cvt.u32.u16 %r4677, %rs3004; - cvt.s32.s8 %r4678, %r4677; - cvt.u32.u16 %r4679, %rs3003; - cvt.s32.s8 %r4680, %r4679; - cvt.u32.u16 %r4681, %rs3002; - cvt.s32.s8 %r4682, %r4681; - mad.lo.s32 %r4683, %r67, %r4682, %r4674; - mad.lo.s32 %r4684, %r68, %r4680, %r4683; - mad.lo.s32 %r4685, %r69, %r4678, %r4684; - mad.lo.s32 %r4686, %r70, %r4676, %r4685; - ld.const.v4.u8 {%rs3010, %rs3011, %rs3012, %rs3013}, [matrix+1504]; - cvt.u32.u16 %r4687, %rs3013; - cvt.s32.s8 %r4688, %r4687; - cvt.u32.u16 %r4689, %rs3012; - cvt.s32.s8 %r4690, %r4689; - cvt.u32.u16 %r4691, %rs3011; - cvt.s32.s8 %r4692, %r4691; - cvt.u32.u16 %r4693, %rs3010; - cvt.s32.s8 %r4694, %r4693; - mad.lo.s32 %r4695, %r222, %r4694, %r4686; - mad.lo.s32 %r4696, %r72, %r4692, %r4695; - mad.lo.s32 %r4697, %r73, %r4690, %r4696; - mad.lo.s32 %r4698, %r74, %r4688, %r4697; - ld.const.v4.u8 {%rs3018, %rs3019, %rs3020, %rs3021}, [matrix+1508]; - cvt.u32.u16 %r4699, %rs3021; - cvt.s32.s8 %r4700, %r4699; - cvt.u32.u16 %r4701, %rs3020; - cvt.s32.s8 %r4702, %r4701; - cvt.u32.u16 %r4703, %rs3019; - cvt.s32.s8 %r4704, %r4703; - cvt.u32.u16 %r4705, %rs3018; - cvt.s32.s8 %r4706, %r4705; - mad.lo.s32 %r4707, %r75, %r4706, %r4698; - mad.lo.s32 %r4708, %r76, %r4704, %r4707; - mad.lo.s32 %r4709, %r77, %r4702, %r4708; - mad.lo.s32 %r4710, %r78, %r4700, %r4709; - ld.const.v4.u8 {%rs3026, %rs3027, %rs3028, %rs3029}, [matrix+1512]; - cvt.u32.u16 %r4711, %rs3029; - cvt.s32.s8 %r4712, %r4711; - cvt.u32.u16 %r4713, %rs3028; - cvt.s32.s8 %r4714, %r4713; - cvt.u32.u16 %r4715, %rs3027; - cvt.s32.s8 %r4716, %r4715; - cvt.u32.u16 %r4717, %rs3026; - cvt.s32.s8 %r4718, %r4717; - mad.lo.s32 %r4719, %r80, %r4718, %r4710; - mad.lo.s32 %r4720, %r81, %r4716, %r4719; - mad.lo.s32 %r4721, %r83, %r4714, %r4720; - mad.lo.s32 %r4722, %r84, %r4712, %r4721; - 
ld.const.v4.u8 {%rs3034, %rs3035, %rs3036, %rs3037}, [matrix+1516]; - cvt.u32.u16 %r4723, %rs3037; - cvt.s32.s8 %r4724, %r4723; - cvt.u32.u16 %r4725, %rs3036; - cvt.s32.s8 %r4726, %r4725; - cvt.u32.u16 %r4727, %rs3035; - cvt.s32.s8 %r4728, %r4727; - cvt.u32.u16 %r4729, %rs3034; - cvt.s32.s8 %r4730, %r4729; - mad.lo.s32 %r4731, %r86, %r4730, %r4722; - mad.lo.s32 %r4732, %r87, %r4728, %r4731; - mad.lo.s32 %r4733, %r88, %r4726, %r4732; - mad.lo.s32 %r4734, %r89, %r4724, %r4733; - ld.const.v4.u8 {%rs3042, %rs3043, %rs3044, %rs3045}, [matrix+1520]; - cvt.u32.u16 %r4735, %rs3045; - cvt.s32.s8 %r4736, %r4735; - cvt.u32.u16 %r4737, %rs3044; - cvt.s32.s8 %r4738, %r4737; - cvt.u32.u16 %r4739, %rs3043; - cvt.s32.s8 %r4740, %r4739; - cvt.u32.u16 %r4741, %rs3042; - cvt.s32.s8 %r4742, %r4741; - mad.lo.s32 %r4743, %r271, %r4742, %r4734; - mad.lo.s32 %r4744, %r91, %r4740, %r4743; - mad.lo.s32 %r4745, %r93, %r4738, %r4744; - mad.lo.s32 %r4746, %r94, %r4736, %r4745; - ld.const.v4.u8 {%rs3050, %rs3051, %rs3052, %rs3053}, [matrix+1524]; - cvt.u32.u16 %r4747, %rs3053; - cvt.s32.s8 %r4748, %r4747; - cvt.u32.u16 %r4749, %rs3052; - cvt.s32.s8 %r4750, %r4749; - cvt.u32.u16 %r4751, %rs3051; - cvt.s32.s8 %r4752, %r4751; - cvt.u32.u16 %r4753, %rs3050; - cvt.s32.s8 %r4754, %r4753; - mad.lo.s32 %r4755, %r96, %r4754, %r4746; - mad.lo.s32 %r4756, %r97, %r4752, %r4755; - mad.lo.s32 %r4757, %r99, %r4750, %r4756; - mad.lo.s32 %r4758, %r100, %r4748, %r4757; - ld.const.v4.u8 {%rs3058, %rs3059, %rs3060, %rs3061}, [matrix+1528]; - cvt.u32.u16 %r4759, %rs3061; - cvt.s32.s8 %r4760, %r4759; - cvt.u32.u16 %r4761, %rs3060; - cvt.s32.s8 %r4762, %r4761; - cvt.u32.u16 %r4763, %rs3059; - cvt.s32.s8 %r4764, %r4763; - cvt.u32.u16 %r4765, %rs3058; - cvt.s32.s8 %r4766, %r4765; - mad.lo.s32 %r4767, %r103, %r4766, %r4758; - mad.lo.s32 %r4768, %r104, %r4764, %r4767; - mad.lo.s32 %r4769, %r107, %r4762, %r4768; - mad.lo.s32 %r4770, %r108, %r4760, %r4769; - ld.const.v4.u8 {%rs3066, %rs3067, %rs3068, %rs3069}, [matrix+1532]; - cvt.u32.u16 %r4771, %rs3069; - cvt.s32.s8 %r4772, %r4771; - cvt.u32.u16 %r4773, %rs3068; - cvt.s32.s8 %r4774, %r4773; - cvt.u32.u16 %r4775, %rs3067; - cvt.s32.s8 %r4776, %r4775; - cvt.u32.u16 %r4777, %rs3066; - cvt.s32.s8 %r4778, %r4777; - mad.lo.s32 %r4779, %r111, %r4778, %r4770; - mad.lo.s32 %r4780, %r112, %r4776, %r4779; - mad.lo.s32 %r4781, %r114, %r4774, %r4780; - mad.lo.s32 %r4782, %r115, %r4772, %r4781; - shr.u32 %r4783, %r4590, 6; - and.b32 %r4784, %r4783, 240; - shr.u32 %r4785, %r4782, 10; - or.b32 %r4786, %r4785, %r4784; - xor.b32 %r4787, %r21, %r4786; - cvt.u64.u32 %rd389, %r4787; - ld.const.v4.u8 {%rs3074, %rs3075, %rs3076, %rs3077}, [matrix+1536]; - cvt.u32.u16 %r4788, %rs3077; - cvt.s32.s8 %r4789, %r4788; - cvt.u32.u16 %r4790, %rs3076; - cvt.s32.s8 %r4791, %r4790; - cvt.u32.u16 %r4792, %rs3074; - cvt.s32.s8 %r4793, %r4792; - cvt.u32.u16 %r4794, %rs3075; - cvt.s32.s8 %r4795, %r4794; - mul.lo.s32 %r4796, %r34, %r4795; - mad.lo.s32 %r4797, %r124, %r4793, %r4796; - mad.lo.s32 %r4798, %r35, %r4791, %r4797; - mad.lo.s32 %r4799, %r36, %r4789, %r4798; - ld.const.v4.u8 {%rs3082, %rs3083, %rs3084, %rs3085}, [matrix+1540]; - cvt.u32.u16 %r4800, %rs3085; - cvt.s32.s8 %r4801, %r4800; - cvt.u32.u16 %r4802, %rs3084; - cvt.s32.s8 %r4803, %r4802; - cvt.u32.u16 %r4804, %rs3083; - cvt.s32.s8 %r4805, %r4804; - cvt.u32.u16 %r4806, %rs3082; - cvt.s32.s8 %r4807, %r4806; - mad.lo.s32 %r4808, %r37, %r4807, %r4799; - mad.lo.s32 %r4809, %r38, %r4805, %r4808; - mad.lo.s32 %r4810, %r39, %r4803, %r4809; - mad.lo.s32 %r4811, %r40, %r4801, 
%r4810; - ld.const.v4.u8 {%rs3090, %rs3091, %rs3092, %rs3093}, [matrix+1544]; - cvt.u32.u16 %r4812, %rs3093; - cvt.s32.s8 %r4813, %r4812; - cvt.u32.u16 %r4814, %rs3092; - cvt.s32.s8 %r4815, %r4814; - cvt.u32.u16 %r4816, %rs3091; - cvt.s32.s8 %r4817, %r4816; - cvt.u32.u16 %r4818, %rs3090; - cvt.s32.s8 %r4819, %r4818; - mad.lo.s32 %r4820, %r42, %r4819, %r4811; - mad.lo.s32 %r4821, %r43, %r4817, %r4820; - mad.lo.s32 %r4822, %r45, %r4815, %r4821; - mad.lo.s32 %r4823, %r46, %r4813, %r4822; - ld.const.v4.u8 {%rs3098, %rs3099, %rs3100, %rs3101}, [matrix+1548]; - cvt.u32.u16 %r4824, %rs3101; - cvt.s32.s8 %r4825, %r4824; - cvt.u32.u16 %r4826, %rs3100; - cvt.s32.s8 %r4827, %r4826; - cvt.u32.u16 %r4828, %rs3099; - cvt.s32.s8 %r4829, %r4828; - cvt.u32.u16 %r4830, %rs3098; - cvt.s32.s8 %r4831, %r4830; - mad.lo.s32 %r4832, %r48, %r4831, %r4823; - mad.lo.s32 %r4833, %r49, %r4829, %r4832; - mad.lo.s32 %r4834, %r50, %r4827, %r4833; - mad.lo.s32 %r4835, %r51, %r4825, %r4834; - ld.const.v4.u8 {%rs3106, %rs3107, %rs3108, %rs3109}, [matrix+1552]; - cvt.u32.u16 %r4836, %rs3109; - cvt.s32.s8 %r4837, %r4836; - cvt.u32.u16 %r4838, %rs3108; - cvt.s32.s8 %r4839, %r4838; - cvt.u32.u16 %r4840, %rs3107; - cvt.s32.s8 %r4841, %r4840; - cvt.u32.u16 %r4842, %rs3106; - cvt.s32.s8 %r4843, %r4842; - mad.lo.s32 %r4844, %r173, %r4843, %r4835; - mad.lo.s32 %r4845, %r53, %r4841, %r4844; - mad.lo.s32 %r4846, %r54, %r4839, %r4845; - mad.lo.s32 %r4847, %r55, %r4837, %r4846; - ld.const.v4.u8 {%rs3114, %rs3115, %rs3116, %rs3117}, [matrix+1556]; - cvt.u32.u16 %r4848, %rs3117; - cvt.s32.s8 %r4849, %r4848; - cvt.u32.u16 %r4850, %rs3116; - cvt.s32.s8 %r4851, %r4850; - cvt.u32.u16 %r4852, %rs3115; - cvt.s32.s8 %r4853, %r4852; - cvt.u32.u16 %r4854, %rs3114; - cvt.s32.s8 %r4855, %r4854; - mad.lo.s32 %r4856, %r56, %r4855, %r4847; - mad.lo.s32 %r4857, %r57, %r4853, %r4856; - mad.lo.s32 %r4858, %r58, %r4851, %r4857; - mad.lo.s32 %r4859, %r59, %r4849, %r4858; - ld.const.v4.u8 {%rs3122, %rs3123, %rs3124, %rs3125}, [matrix+1560]; - cvt.u32.u16 %r4860, %rs3125; - cvt.s32.s8 %r4861, %r4860; - cvt.u32.u16 %r4862, %rs3124; - cvt.s32.s8 %r4863, %r4862; - cvt.u32.u16 %r4864, %rs3123; - cvt.s32.s8 %r4865, %r4864; - cvt.u32.u16 %r4866, %rs3122; - cvt.s32.s8 %r4867, %r4866; - mad.lo.s32 %r4868, %r61, %r4867, %r4859; - mad.lo.s32 %r4869, %r62, %r4865, %r4868; - mad.lo.s32 %r4870, %r64, %r4863, %r4869; - mad.lo.s32 %r4871, %r65, %r4861, %r4870; - ld.const.v4.u8 {%rs3130, %rs3131, %rs3132, %rs3133}, [matrix+1564]; - cvt.u32.u16 %r4872, %rs3133; - cvt.s32.s8 %r4873, %r4872; - cvt.u32.u16 %r4874, %rs3132; - cvt.s32.s8 %r4875, %r4874; - cvt.u32.u16 %r4876, %rs3131; - cvt.s32.s8 %r4877, %r4876; - cvt.u32.u16 %r4878, %rs3130; - cvt.s32.s8 %r4879, %r4878; - mad.lo.s32 %r4880, %r67, %r4879, %r4871; - mad.lo.s32 %r4881, %r68, %r4877, %r4880; - mad.lo.s32 %r4882, %r69, %r4875, %r4881; - mad.lo.s32 %r4883, %r70, %r4873, %r4882; - ld.const.v4.u8 {%rs3138, %rs3139, %rs3140, %rs3141}, [matrix+1568]; - cvt.u32.u16 %r4884, %rs3141; - cvt.s32.s8 %r4885, %r4884; - cvt.u32.u16 %r4886, %rs3140; - cvt.s32.s8 %r4887, %r4886; - cvt.u32.u16 %r4888, %rs3139; - cvt.s32.s8 %r4889, %r4888; - cvt.u32.u16 %r4890, %rs3138; - cvt.s32.s8 %r4891, %r4890; - mad.lo.s32 %r4892, %r222, %r4891, %r4883; - mad.lo.s32 %r4893, %r72, %r4889, %r4892; - mad.lo.s32 %r4894, %r73, %r4887, %r4893; - mad.lo.s32 %r4895, %r74, %r4885, %r4894; - ld.const.v4.u8 {%rs3146, %rs3147, %rs3148, %rs3149}, [matrix+1572]; - cvt.u32.u16 %r4896, %rs3149; - cvt.s32.s8 %r4897, %r4896; - cvt.u32.u16 %r4898, %rs3148; - 
cvt.s32.s8 %r4899, %r4898; - cvt.u32.u16 %r4900, %rs3147; - cvt.s32.s8 %r4901, %r4900; - cvt.u32.u16 %r4902, %rs3146; - cvt.s32.s8 %r4903, %r4902; - mad.lo.s32 %r4904, %r75, %r4903, %r4895; - mad.lo.s32 %r4905, %r76, %r4901, %r4904; - mad.lo.s32 %r4906, %r77, %r4899, %r4905; - mad.lo.s32 %r4907, %r78, %r4897, %r4906; - ld.const.v4.u8 {%rs3154, %rs3155, %rs3156, %rs3157}, [matrix+1576]; - cvt.u32.u16 %r4908, %rs3157; - cvt.s32.s8 %r4909, %r4908; - cvt.u32.u16 %r4910, %rs3156; - cvt.s32.s8 %r4911, %r4910; - cvt.u32.u16 %r4912, %rs3155; - cvt.s32.s8 %r4913, %r4912; - cvt.u32.u16 %r4914, %rs3154; - cvt.s32.s8 %r4915, %r4914; - mad.lo.s32 %r4916, %r80, %r4915, %r4907; - mad.lo.s32 %r4917, %r81, %r4913, %r4916; - mad.lo.s32 %r4918, %r83, %r4911, %r4917; - mad.lo.s32 %r4919, %r84, %r4909, %r4918; - ld.const.v4.u8 {%rs3162, %rs3163, %rs3164, %rs3165}, [matrix+1580]; - cvt.u32.u16 %r4920, %rs3165; - cvt.s32.s8 %r4921, %r4920; - cvt.u32.u16 %r4922, %rs3164; - cvt.s32.s8 %r4923, %r4922; - cvt.u32.u16 %r4924, %rs3163; - cvt.s32.s8 %r4925, %r4924; - cvt.u32.u16 %r4926, %rs3162; - cvt.s32.s8 %r4927, %r4926; - mad.lo.s32 %r4928, %r86, %r4927, %r4919; - mad.lo.s32 %r4929, %r87, %r4925, %r4928; - mad.lo.s32 %r4930, %r88, %r4923, %r4929; - mad.lo.s32 %r4931, %r89, %r4921, %r4930; - ld.const.v4.u8 {%rs3170, %rs3171, %rs3172, %rs3173}, [matrix+1584]; - cvt.u32.u16 %r4932, %rs3173; - cvt.s32.s8 %r4933, %r4932; - cvt.u32.u16 %r4934, %rs3172; - cvt.s32.s8 %r4935, %r4934; - cvt.u32.u16 %r4936, %rs3171; - cvt.s32.s8 %r4937, %r4936; - cvt.u32.u16 %r4938, %rs3170; - cvt.s32.s8 %r4939, %r4938; - mad.lo.s32 %r4940, %r271, %r4939, %r4931; - mad.lo.s32 %r4941, %r91, %r4937, %r4940; - mad.lo.s32 %r4942, %r93, %r4935, %r4941; - mad.lo.s32 %r4943, %r94, %r4933, %r4942; - ld.const.v4.u8 {%rs3178, %rs3179, %rs3180, %rs3181}, [matrix+1588]; - cvt.u32.u16 %r4944, %rs3181; - cvt.s32.s8 %r4945, %r4944; - cvt.u32.u16 %r4946, %rs3180; - cvt.s32.s8 %r4947, %r4946; - cvt.u32.u16 %r4948, %rs3179; - cvt.s32.s8 %r4949, %r4948; - cvt.u32.u16 %r4950, %rs3178; - cvt.s32.s8 %r4951, %r4950; - mad.lo.s32 %r4952, %r96, %r4951, %r4943; - mad.lo.s32 %r4953, %r97, %r4949, %r4952; - mad.lo.s32 %r4954, %r99, %r4947, %r4953; - mad.lo.s32 %r4955, %r100, %r4945, %r4954; - ld.const.v4.u8 {%rs3186, %rs3187, %rs3188, %rs3189}, [matrix+1592]; - cvt.u32.u16 %r4956, %rs3189; - cvt.s32.s8 %r4957, %r4956; - cvt.u32.u16 %r4958, %rs3188; - cvt.s32.s8 %r4959, %r4958; - cvt.u32.u16 %r4960, %rs3187; - cvt.s32.s8 %r4961, %r4960; - cvt.u32.u16 %r4962, %rs3186; - cvt.s32.s8 %r4963, %r4962; - mad.lo.s32 %r4964, %r103, %r4963, %r4955; - mad.lo.s32 %r4965, %r104, %r4961, %r4964; - mad.lo.s32 %r4966, %r107, %r4959, %r4965; - mad.lo.s32 %r4967, %r108, %r4957, %r4966; - ld.const.v4.u8 {%rs3194, %rs3195, %rs3196, %rs3197}, [matrix+1596]; - cvt.u32.u16 %r4968, %rs3197; - cvt.s32.s8 %r4969, %r4968; - cvt.u32.u16 %r4970, %rs3196; - cvt.s32.s8 %r4971, %r4970; - cvt.u32.u16 %r4972, %rs3195; - cvt.s32.s8 %r4973, %r4972; - cvt.u32.u16 %r4974, %rs3194; - cvt.s32.s8 %r4975, %r4974; - mad.lo.s32 %r4976, %r111, %r4975, %r4967; - mad.lo.s32 %r4977, %r112, %r4973, %r4976; - mad.lo.s32 %r4978, %r114, %r4971, %r4977; - mad.lo.s32 %r4979, %r115, %r4969, %r4978; - ld.const.v4.u8 {%rs3202, %rs3203, %rs3204, %rs3205}, [matrix+1600]; - cvt.u32.u16 %r4980, %rs3205; - cvt.s32.s8 %r4981, %r4980; - cvt.u32.u16 %r4982, %rs3204; - cvt.s32.s8 %r4983, %r4982; - cvt.u32.u16 %r4984, %rs3202; - cvt.s32.s8 %r4985, %r4984; - cvt.u32.u16 %r4986, %rs3203; - cvt.s32.s8 %r4987, %r4986; - mul.lo.s32 %r4988, 
%r34, %r4987; - mad.lo.s32 %r4989, %r124, %r4985, %r4988; - mad.lo.s32 %r4990, %r35, %r4983, %r4989; - mad.lo.s32 %r4991, %r36, %r4981, %r4990; - ld.const.v4.u8 {%rs3210, %rs3211, %rs3212, %rs3213}, [matrix+1604]; - cvt.u32.u16 %r4992, %rs3213; - cvt.s32.s8 %r4993, %r4992; - cvt.u32.u16 %r4994, %rs3212; - cvt.s32.s8 %r4995, %r4994; - cvt.u32.u16 %r4996, %rs3211; - cvt.s32.s8 %r4997, %r4996; - cvt.u32.u16 %r4998, %rs3210; - cvt.s32.s8 %r4999, %r4998; - mad.lo.s32 %r5000, %r37, %r4999, %r4991; - mad.lo.s32 %r5001, %r38, %r4997, %r5000; - mad.lo.s32 %r5002, %r39, %r4995, %r5001; - mad.lo.s32 %r5003, %r40, %r4993, %r5002; - ld.const.v4.u8 {%rs3218, %rs3219, %rs3220, %rs3221}, [matrix+1608]; - cvt.u32.u16 %r5004, %rs3221; - cvt.s32.s8 %r5005, %r5004; - cvt.u32.u16 %r5006, %rs3220; - cvt.s32.s8 %r5007, %r5006; - cvt.u32.u16 %r5008, %rs3219; - cvt.s32.s8 %r5009, %r5008; - cvt.u32.u16 %r5010, %rs3218; - cvt.s32.s8 %r5011, %r5010; - mad.lo.s32 %r5012, %r42, %r5011, %r5003; - mad.lo.s32 %r5013, %r43, %r5009, %r5012; - mad.lo.s32 %r5014, %r45, %r5007, %r5013; - mad.lo.s32 %r5015, %r46, %r5005, %r5014; - ld.const.v4.u8 {%rs3226, %rs3227, %rs3228, %rs3229}, [matrix+1612]; - cvt.u32.u16 %r5016, %rs3229; - cvt.s32.s8 %r5017, %r5016; - cvt.u32.u16 %r5018, %rs3228; - cvt.s32.s8 %r5019, %r5018; - cvt.u32.u16 %r5020, %rs3227; - cvt.s32.s8 %r5021, %r5020; - cvt.u32.u16 %r5022, %rs3226; - cvt.s32.s8 %r5023, %r5022; - mad.lo.s32 %r5024, %r48, %r5023, %r5015; - mad.lo.s32 %r5025, %r49, %r5021, %r5024; - mad.lo.s32 %r5026, %r50, %r5019, %r5025; - mad.lo.s32 %r5027, %r51, %r5017, %r5026; - ld.const.v4.u8 {%rs3234, %rs3235, %rs3236, %rs3237}, [matrix+1616]; - cvt.u32.u16 %r5028, %rs3237; - cvt.s32.s8 %r5029, %r5028; - cvt.u32.u16 %r5030, %rs3236; - cvt.s32.s8 %r5031, %r5030; - cvt.u32.u16 %r5032, %rs3235; - cvt.s32.s8 %r5033, %r5032; - cvt.u32.u16 %r5034, %rs3234; - cvt.s32.s8 %r5035, %r5034; - mad.lo.s32 %r5036, %r173, %r5035, %r5027; - mad.lo.s32 %r5037, %r53, %r5033, %r5036; - mad.lo.s32 %r5038, %r54, %r5031, %r5037; - mad.lo.s32 %r5039, %r55, %r5029, %r5038; - ld.const.v4.u8 {%rs3242, %rs3243, %rs3244, %rs3245}, [matrix+1620]; - cvt.u32.u16 %r5040, %rs3245; - cvt.s32.s8 %r5041, %r5040; - cvt.u32.u16 %r5042, %rs3244; - cvt.s32.s8 %r5043, %r5042; - cvt.u32.u16 %r5044, %rs3243; - cvt.s32.s8 %r5045, %r5044; - cvt.u32.u16 %r5046, %rs3242; - cvt.s32.s8 %r5047, %r5046; - mad.lo.s32 %r5048, %r56, %r5047, %r5039; - mad.lo.s32 %r5049, %r57, %r5045, %r5048; - mad.lo.s32 %r5050, %r58, %r5043, %r5049; - mad.lo.s32 %r5051, %r59, %r5041, %r5050; - ld.const.v4.u8 {%rs3250, %rs3251, %rs3252, %rs3253}, [matrix+1624]; - cvt.u32.u16 %r5052, %rs3253; - cvt.s32.s8 %r5053, %r5052; - cvt.u32.u16 %r5054, %rs3252; - cvt.s32.s8 %r5055, %r5054; - cvt.u32.u16 %r5056, %rs3251; - cvt.s32.s8 %r5057, %r5056; - cvt.u32.u16 %r5058, %rs3250; - cvt.s32.s8 %r5059, %r5058; - mad.lo.s32 %r5060, %r61, %r5059, %r5051; - mad.lo.s32 %r5061, %r62, %r5057, %r5060; - mad.lo.s32 %r5062, %r64, %r5055, %r5061; - mad.lo.s32 %r5063, %r65, %r5053, %r5062; - ld.const.v4.u8 {%rs3258, %rs3259, %rs3260, %rs3261}, [matrix+1628]; - cvt.u32.u16 %r5064, %rs3261; - cvt.s32.s8 %r5065, %r5064; - cvt.u32.u16 %r5066, %rs3260; - cvt.s32.s8 %r5067, %r5066; - cvt.u32.u16 %r5068, %rs3259; - cvt.s32.s8 %r5069, %r5068; - cvt.u32.u16 %r5070, %rs3258; - cvt.s32.s8 %r5071, %r5070; - mad.lo.s32 %r5072, %r67, %r5071, %r5063; - mad.lo.s32 %r5073, %r68, %r5069, %r5072; - mad.lo.s32 %r5074, %r69, %r5067, %r5073; - mad.lo.s32 %r5075, %r70, %r5065, %r5074; - ld.const.v4.u8 {%rs3266, %rs3267, 
%rs3268, %rs3269}, [matrix+1632]; - cvt.u32.u16 %r5076, %rs3269; - cvt.s32.s8 %r5077, %r5076; - cvt.u32.u16 %r5078, %rs3268; - cvt.s32.s8 %r5079, %r5078; - cvt.u32.u16 %r5080, %rs3267; - cvt.s32.s8 %r5081, %r5080; - cvt.u32.u16 %r5082, %rs3266; - cvt.s32.s8 %r5083, %r5082; - mad.lo.s32 %r5084, %r222, %r5083, %r5075; - mad.lo.s32 %r5085, %r72, %r5081, %r5084; - mad.lo.s32 %r5086, %r73, %r5079, %r5085; - mad.lo.s32 %r5087, %r74, %r5077, %r5086; - ld.const.v4.u8 {%rs3274, %rs3275, %rs3276, %rs3277}, [matrix+1636]; - cvt.u32.u16 %r5088, %rs3277; - cvt.s32.s8 %r5089, %r5088; - cvt.u32.u16 %r5090, %rs3276; - cvt.s32.s8 %r5091, %r5090; - cvt.u32.u16 %r5092, %rs3275; - cvt.s32.s8 %r5093, %r5092; - cvt.u32.u16 %r5094, %rs3274; - cvt.s32.s8 %r5095, %r5094; - mad.lo.s32 %r5096, %r75, %r5095, %r5087; - mad.lo.s32 %r5097, %r76, %r5093, %r5096; - mad.lo.s32 %r5098, %r77, %r5091, %r5097; - mad.lo.s32 %r5099, %r78, %r5089, %r5098; - ld.const.v4.u8 {%rs3282, %rs3283, %rs3284, %rs3285}, [matrix+1640]; - cvt.u32.u16 %r5100, %rs3285; - cvt.s32.s8 %r5101, %r5100; - cvt.u32.u16 %r5102, %rs3284; - cvt.s32.s8 %r5103, %r5102; - cvt.u32.u16 %r5104, %rs3283; - cvt.s32.s8 %r5105, %r5104; - cvt.u32.u16 %r5106, %rs3282; - cvt.s32.s8 %r5107, %r5106; - mad.lo.s32 %r5108, %r80, %r5107, %r5099; - mad.lo.s32 %r5109, %r81, %r5105, %r5108; - mad.lo.s32 %r5110, %r83, %r5103, %r5109; - mad.lo.s32 %r5111, %r84, %r5101, %r5110; - ld.const.v4.u8 {%rs3290, %rs3291, %rs3292, %rs3293}, [matrix+1644]; - cvt.u32.u16 %r5112, %rs3293; - cvt.s32.s8 %r5113, %r5112; - cvt.u32.u16 %r5114, %rs3292; - cvt.s32.s8 %r5115, %r5114; - cvt.u32.u16 %r5116, %rs3291; - cvt.s32.s8 %r5117, %r5116; - cvt.u32.u16 %r5118, %rs3290; - cvt.s32.s8 %r5119, %r5118; - mad.lo.s32 %r5120, %r86, %r5119, %r5111; - mad.lo.s32 %r5121, %r87, %r5117, %r5120; - mad.lo.s32 %r5122, %r88, %r5115, %r5121; - mad.lo.s32 %r5123, %r89, %r5113, %r5122; - ld.const.v4.u8 {%rs3298, %rs3299, %rs3300, %rs3301}, [matrix+1648]; - cvt.u32.u16 %r5124, %rs3301; - cvt.s32.s8 %r5125, %r5124; - cvt.u32.u16 %r5126, %rs3300; - cvt.s32.s8 %r5127, %r5126; - cvt.u32.u16 %r5128, %rs3299; - cvt.s32.s8 %r5129, %r5128; - cvt.u32.u16 %r5130, %rs3298; - cvt.s32.s8 %r5131, %r5130; - mad.lo.s32 %r5132, %r271, %r5131, %r5123; - mad.lo.s32 %r5133, %r91, %r5129, %r5132; - mad.lo.s32 %r5134, %r93, %r5127, %r5133; - mad.lo.s32 %r5135, %r94, %r5125, %r5134; - ld.const.v4.u8 {%rs3306, %rs3307, %rs3308, %rs3309}, [matrix+1652]; - cvt.u32.u16 %r5136, %rs3309; - cvt.s32.s8 %r5137, %r5136; - cvt.u32.u16 %r5138, %rs3308; - cvt.s32.s8 %r5139, %r5138; - cvt.u32.u16 %r5140, %rs3307; - cvt.s32.s8 %r5141, %r5140; - cvt.u32.u16 %r5142, %rs3306; - cvt.s32.s8 %r5143, %r5142; - mad.lo.s32 %r5144, %r96, %r5143, %r5135; - mad.lo.s32 %r5145, %r97, %r5141, %r5144; - mad.lo.s32 %r5146, %r99, %r5139, %r5145; - mad.lo.s32 %r5147, %r100, %r5137, %r5146; - ld.const.v4.u8 {%rs3314, %rs3315, %rs3316, %rs3317}, [matrix+1656]; - cvt.u32.u16 %r5148, %rs3317; - cvt.s32.s8 %r5149, %r5148; - cvt.u32.u16 %r5150, %rs3316; - cvt.s32.s8 %r5151, %r5150; - cvt.u32.u16 %r5152, %rs3315; - cvt.s32.s8 %r5153, %r5152; - cvt.u32.u16 %r5154, %rs3314; - cvt.s32.s8 %r5155, %r5154; - mad.lo.s32 %r5156, %r103, %r5155, %r5147; - mad.lo.s32 %r5157, %r104, %r5153, %r5156; - mad.lo.s32 %r5158, %r107, %r5151, %r5157; - mad.lo.s32 %r5159, %r108, %r5149, %r5158; - ld.const.v4.u8 {%rs3322, %rs3323, %rs3324, %rs3325}, [matrix+1660]; - cvt.u32.u16 %r5160, %rs3325; - cvt.s32.s8 %r5161, %r5160; - cvt.u32.u16 %r5162, %rs3324; - cvt.s32.s8 %r5163, %r5162; - cvt.u32.u16 
%r5164, %rs3323; - cvt.s32.s8 %r5165, %r5164; - cvt.u32.u16 %r5166, %rs3322; - cvt.s32.s8 %r5167, %r5166; - mad.lo.s32 %r5168, %r111, %r5167, %r5159; - mad.lo.s32 %r5169, %r112, %r5165, %r5168; - mad.lo.s32 %r5170, %r114, %r5163, %r5169; - mad.lo.s32 %r5171, %r115, %r5161, %r5170; - shr.u32 %r5172, %r4979, 6; - and.b32 %r5173, %r5172, 240; - shr.u32 %r5174, %r5171, 10; - or.b32 %r5175, %r5174, %r5173; - xor.b32 %r5176, %r22, %r5175; - cvt.u64.u32 %rd390, %r5176; - ld.const.v4.u8 {%rs3330, %rs3331, %rs3332, %rs3333}, [matrix+1664]; - cvt.u32.u16 %r5177, %rs3333; - cvt.s32.s8 %r5178, %r5177; - cvt.u32.u16 %r5179, %rs3332; - cvt.s32.s8 %r5180, %r5179; - cvt.u32.u16 %r5181, %rs3330; - cvt.s32.s8 %r5182, %r5181; - cvt.u32.u16 %r5183, %rs3331; - cvt.s32.s8 %r5184, %r5183; - mul.lo.s32 %r5185, %r34, %r5184; - mad.lo.s32 %r5186, %r124, %r5182, %r5185; - mad.lo.s32 %r5187, %r35, %r5180, %r5186; - mad.lo.s32 %r5188, %r36, %r5178, %r5187; - ld.const.v4.u8 {%rs3338, %rs3339, %rs3340, %rs3341}, [matrix+1668]; - cvt.u32.u16 %r5189, %rs3341; - cvt.s32.s8 %r5190, %r5189; - cvt.u32.u16 %r5191, %rs3340; - cvt.s32.s8 %r5192, %r5191; - cvt.u32.u16 %r5193, %rs3339; - cvt.s32.s8 %r5194, %r5193; - cvt.u32.u16 %r5195, %rs3338; - cvt.s32.s8 %r5196, %r5195; - mad.lo.s32 %r5197, %r37, %r5196, %r5188; - mad.lo.s32 %r5198, %r38, %r5194, %r5197; - mad.lo.s32 %r5199, %r39, %r5192, %r5198; - mad.lo.s32 %r5200, %r40, %r5190, %r5199; - ld.const.v4.u8 {%rs3346, %rs3347, %rs3348, %rs3349}, [matrix+1672]; - cvt.u32.u16 %r5201, %rs3349; - cvt.s32.s8 %r5202, %r5201; - cvt.u32.u16 %r5203, %rs3348; - cvt.s32.s8 %r5204, %r5203; - cvt.u32.u16 %r5205, %rs3347; - cvt.s32.s8 %r5206, %r5205; - cvt.u32.u16 %r5207, %rs3346; - cvt.s32.s8 %r5208, %r5207; - mad.lo.s32 %r5209, %r42, %r5208, %r5200; - mad.lo.s32 %r5210, %r43, %r5206, %r5209; - mad.lo.s32 %r5211, %r45, %r5204, %r5210; - mad.lo.s32 %r5212, %r46, %r5202, %r5211; - ld.const.v4.u8 {%rs3354, %rs3355, %rs3356, %rs3357}, [matrix+1676]; - cvt.u32.u16 %r5213, %rs3357; - cvt.s32.s8 %r5214, %r5213; - cvt.u32.u16 %r5215, %rs3356; - cvt.s32.s8 %r5216, %r5215; - cvt.u32.u16 %r5217, %rs3355; - cvt.s32.s8 %r5218, %r5217; - cvt.u32.u16 %r5219, %rs3354; - cvt.s32.s8 %r5220, %r5219; - mad.lo.s32 %r5221, %r48, %r5220, %r5212; - mad.lo.s32 %r5222, %r49, %r5218, %r5221; - mad.lo.s32 %r5223, %r50, %r5216, %r5222; - mad.lo.s32 %r5224, %r51, %r5214, %r5223; - ld.const.v4.u8 {%rs3362, %rs3363, %rs3364, %rs3365}, [matrix+1680]; - cvt.u32.u16 %r5225, %rs3365; - cvt.s32.s8 %r5226, %r5225; - cvt.u32.u16 %r5227, %rs3364; - cvt.s32.s8 %r5228, %r5227; - cvt.u32.u16 %r5229, %rs3363; - cvt.s32.s8 %r5230, %r5229; - cvt.u32.u16 %r5231, %rs3362; - cvt.s32.s8 %r5232, %r5231; - mad.lo.s32 %r5233, %r173, %r5232, %r5224; - mad.lo.s32 %r5234, %r53, %r5230, %r5233; - mad.lo.s32 %r5235, %r54, %r5228, %r5234; - mad.lo.s32 %r5236, %r55, %r5226, %r5235; - ld.const.v4.u8 {%rs3370, %rs3371, %rs3372, %rs3373}, [matrix+1684]; - cvt.u32.u16 %r5237, %rs3373; - cvt.s32.s8 %r5238, %r5237; - cvt.u32.u16 %r5239, %rs3372; - cvt.s32.s8 %r5240, %r5239; - cvt.u32.u16 %r5241, %rs3371; - cvt.s32.s8 %r5242, %r5241; - cvt.u32.u16 %r5243, %rs3370; - cvt.s32.s8 %r5244, %r5243; - mad.lo.s32 %r5245, %r56, %r5244, %r5236; - mad.lo.s32 %r5246, %r57, %r5242, %r5245; - mad.lo.s32 %r5247, %r58, %r5240, %r5246; - mad.lo.s32 %r5248, %r59, %r5238, %r5247; - ld.const.v4.u8 {%rs3378, %rs3379, %rs3380, %rs3381}, [matrix+1688]; - cvt.u32.u16 %r5249, %rs3381; - cvt.s32.s8 %r5250, %r5249; - cvt.u32.u16 %r5251, %rs3380; - cvt.s32.s8 %r5252, %r5251; - 
cvt.u32.u16 %r5253, %rs3379; - cvt.s32.s8 %r5254, %r5253; - cvt.u32.u16 %r5255, %rs3378; - cvt.s32.s8 %r5256, %r5255; - mad.lo.s32 %r5257, %r61, %r5256, %r5248; - mad.lo.s32 %r5258, %r62, %r5254, %r5257; - mad.lo.s32 %r5259, %r64, %r5252, %r5258; - mad.lo.s32 %r5260, %r65, %r5250, %r5259; - ld.const.v4.u8 {%rs3386, %rs3387, %rs3388, %rs3389}, [matrix+1692]; - cvt.u32.u16 %r5261, %rs3389; - cvt.s32.s8 %r5262, %r5261; - cvt.u32.u16 %r5263, %rs3388; - cvt.s32.s8 %r5264, %r5263; - cvt.u32.u16 %r5265, %rs3387; - cvt.s32.s8 %r5266, %r5265; - cvt.u32.u16 %r5267, %rs3386; - cvt.s32.s8 %r5268, %r5267; - mad.lo.s32 %r5269, %r67, %r5268, %r5260; - mad.lo.s32 %r5270, %r68, %r5266, %r5269; - mad.lo.s32 %r5271, %r69, %r5264, %r5270; - mad.lo.s32 %r5272, %r70, %r5262, %r5271; - ld.const.v4.u8 {%rs3394, %rs3395, %rs3396, %rs3397}, [matrix+1696]; - cvt.u32.u16 %r5273, %rs3397; - cvt.s32.s8 %r5274, %r5273; - cvt.u32.u16 %r5275, %rs3396; - cvt.s32.s8 %r5276, %r5275; - cvt.u32.u16 %r5277, %rs3395; - cvt.s32.s8 %r5278, %r5277; - cvt.u32.u16 %r5279, %rs3394; - cvt.s32.s8 %r5280, %r5279; - mad.lo.s32 %r5281, %r222, %r5280, %r5272; - mad.lo.s32 %r5282, %r72, %r5278, %r5281; - mad.lo.s32 %r5283, %r73, %r5276, %r5282; - mad.lo.s32 %r5284, %r74, %r5274, %r5283; - ld.const.v4.u8 {%rs3402, %rs3403, %rs3404, %rs3405}, [matrix+1700]; - cvt.u32.u16 %r5285, %rs3405; - cvt.s32.s8 %r5286, %r5285; - cvt.u32.u16 %r5287, %rs3404; - cvt.s32.s8 %r5288, %r5287; - cvt.u32.u16 %r5289, %rs3403; - cvt.s32.s8 %r5290, %r5289; - cvt.u32.u16 %r5291, %rs3402; - cvt.s32.s8 %r5292, %r5291; - mad.lo.s32 %r5293, %r75, %r5292, %r5284; - mad.lo.s32 %r5294, %r76, %r5290, %r5293; - mad.lo.s32 %r5295, %r77, %r5288, %r5294; - mad.lo.s32 %r5296, %r78, %r5286, %r5295; - ld.const.v4.u8 {%rs3410, %rs3411, %rs3412, %rs3413}, [matrix+1704]; - cvt.u32.u16 %r5297, %rs3413; - cvt.s32.s8 %r5298, %r5297; - cvt.u32.u16 %r5299, %rs3412; - cvt.s32.s8 %r5300, %r5299; - cvt.u32.u16 %r5301, %rs3411; - cvt.s32.s8 %r5302, %r5301; - cvt.u32.u16 %r5303, %rs3410; - cvt.s32.s8 %r5304, %r5303; - mad.lo.s32 %r5305, %r80, %r5304, %r5296; - mad.lo.s32 %r5306, %r81, %r5302, %r5305; - mad.lo.s32 %r5307, %r83, %r5300, %r5306; - mad.lo.s32 %r5308, %r84, %r5298, %r5307; - ld.const.v4.u8 {%rs3418, %rs3419, %rs3420, %rs3421}, [matrix+1708]; - cvt.u32.u16 %r5309, %rs3421; - cvt.s32.s8 %r5310, %r5309; - cvt.u32.u16 %r5311, %rs3420; - cvt.s32.s8 %r5312, %r5311; - cvt.u32.u16 %r5313, %rs3419; - cvt.s32.s8 %r5314, %r5313; - cvt.u32.u16 %r5315, %rs3418; - cvt.s32.s8 %r5316, %r5315; - mad.lo.s32 %r5317, %r86, %r5316, %r5308; - mad.lo.s32 %r5318, %r87, %r5314, %r5317; - mad.lo.s32 %r5319, %r88, %r5312, %r5318; - mad.lo.s32 %r5320, %r89, %r5310, %r5319; - ld.const.v4.u8 {%rs3426, %rs3427, %rs3428, %rs3429}, [matrix+1712]; - cvt.u32.u16 %r5321, %rs3429; - cvt.s32.s8 %r5322, %r5321; - cvt.u32.u16 %r5323, %rs3428; - cvt.s32.s8 %r5324, %r5323; - cvt.u32.u16 %r5325, %rs3427; - cvt.s32.s8 %r5326, %r5325; - cvt.u32.u16 %r5327, %rs3426; - cvt.s32.s8 %r5328, %r5327; - mad.lo.s32 %r5329, %r271, %r5328, %r5320; - mad.lo.s32 %r5330, %r91, %r5326, %r5329; - mad.lo.s32 %r5331, %r93, %r5324, %r5330; - mad.lo.s32 %r5332, %r94, %r5322, %r5331; - ld.const.v4.u8 {%rs3434, %rs3435, %rs3436, %rs3437}, [matrix+1716]; - cvt.u32.u16 %r5333, %rs3437; - cvt.s32.s8 %r5334, %r5333; - cvt.u32.u16 %r5335, %rs3436; - cvt.s32.s8 %r5336, %r5335; - cvt.u32.u16 %r5337, %rs3435; - cvt.s32.s8 %r5338, %r5337; - cvt.u32.u16 %r5339, %rs3434; - cvt.s32.s8 %r5340, %r5339; - mad.lo.s32 %r5341, %r96, %r5340, %r5332; - mad.lo.s32 
%r5342, %r97, %r5338, %r5341; - mad.lo.s32 %r5343, %r99, %r5336, %r5342; - mad.lo.s32 %r5344, %r100, %r5334, %r5343; - ld.const.v4.u8 {%rs3442, %rs3443, %rs3444, %rs3445}, [matrix+1720]; - cvt.u32.u16 %r5345, %rs3445; - cvt.s32.s8 %r5346, %r5345; - cvt.u32.u16 %r5347, %rs3444; - cvt.s32.s8 %r5348, %r5347; - cvt.u32.u16 %r5349, %rs3443; - cvt.s32.s8 %r5350, %r5349; - cvt.u32.u16 %r5351, %rs3442; - cvt.s32.s8 %r5352, %r5351; - mad.lo.s32 %r5353, %r103, %r5352, %r5344; - mad.lo.s32 %r5354, %r104, %r5350, %r5353; - mad.lo.s32 %r5355, %r107, %r5348, %r5354; - mad.lo.s32 %r5356, %r108, %r5346, %r5355; - ld.const.v4.u8 {%rs3450, %rs3451, %rs3452, %rs3453}, [matrix+1724]; - cvt.u32.u16 %r5357, %rs3453; - cvt.s32.s8 %r5358, %r5357; - cvt.u32.u16 %r5359, %rs3452; - cvt.s32.s8 %r5360, %r5359; - cvt.u32.u16 %r5361, %rs3451; - cvt.s32.s8 %r5362, %r5361; - cvt.u32.u16 %r5363, %rs3450; - cvt.s32.s8 %r5364, %r5363; - mad.lo.s32 %r5365, %r111, %r5364, %r5356; - mad.lo.s32 %r5366, %r112, %r5362, %r5365; - mad.lo.s32 %r5367, %r114, %r5360, %r5366; - mad.lo.s32 %r5368, %r115, %r5358, %r5367; - ld.const.v4.u8 {%rs3458, %rs3459, %rs3460, %rs3461}, [matrix+1728]; - cvt.u32.u16 %r5369, %rs3461; - cvt.s32.s8 %r5370, %r5369; - cvt.u32.u16 %r5371, %rs3460; - cvt.s32.s8 %r5372, %r5371; - cvt.u32.u16 %r5373, %rs3458; - cvt.s32.s8 %r5374, %r5373; - cvt.u32.u16 %r5375, %rs3459; - cvt.s32.s8 %r5376, %r5375; - mul.lo.s32 %r5377, %r34, %r5376; - mad.lo.s32 %r5378, %r124, %r5374, %r5377; - mad.lo.s32 %r5379, %r35, %r5372, %r5378; - mad.lo.s32 %r5380, %r36, %r5370, %r5379; - ld.const.v4.u8 {%rs3466, %rs3467, %rs3468, %rs3469}, [matrix+1732]; - cvt.u32.u16 %r5381, %rs3469; - cvt.s32.s8 %r5382, %r5381; - cvt.u32.u16 %r5383, %rs3468; - cvt.s32.s8 %r5384, %r5383; - cvt.u32.u16 %r5385, %rs3467; - cvt.s32.s8 %r5386, %r5385; - cvt.u32.u16 %r5387, %rs3466; - cvt.s32.s8 %r5388, %r5387; - mad.lo.s32 %r5389, %r37, %r5388, %r5380; - mad.lo.s32 %r5390, %r38, %r5386, %r5389; - mad.lo.s32 %r5391, %r39, %r5384, %r5390; - mad.lo.s32 %r5392, %r40, %r5382, %r5391; - ld.const.v4.u8 {%rs3474, %rs3475, %rs3476, %rs3477}, [matrix+1736]; - cvt.u32.u16 %r5393, %rs3477; - cvt.s32.s8 %r5394, %r5393; - cvt.u32.u16 %r5395, %rs3476; - cvt.s32.s8 %r5396, %r5395; - cvt.u32.u16 %r5397, %rs3475; - cvt.s32.s8 %r5398, %r5397; - cvt.u32.u16 %r5399, %rs3474; - cvt.s32.s8 %r5400, %r5399; - mad.lo.s32 %r5401, %r42, %r5400, %r5392; - mad.lo.s32 %r5402, %r43, %r5398, %r5401; - mad.lo.s32 %r5403, %r45, %r5396, %r5402; - mad.lo.s32 %r5404, %r46, %r5394, %r5403; - ld.const.v4.u8 {%rs3482, %rs3483, %rs3484, %rs3485}, [matrix+1740]; - cvt.u32.u16 %r5405, %rs3485; - cvt.s32.s8 %r5406, %r5405; - cvt.u32.u16 %r5407, %rs3484; - cvt.s32.s8 %r5408, %r5407; - cvt.u32.u16 %r5409, %rs3483; - cvt.s32.s8 %r5410, %r5409; - cvt.u32.u16 %r5411, %rs3482; - cvt.s32.s8 %r5412, %r5411; - mad.lo.s32 %r5413, %r48, %r5412, %r5404; - mad.lo.s32 %r5414, %r49, %r5410, %r5413; - mad.lo.s32 %r5415, %r50, %r5408, %r5414; - mad.lo.s32 %r5416, %r51, %r5406, %r5415; - ld.const.v4.u8 {%rs3490, %rs3491, %rs3492, %rs3493}, [matrix+1744]; - cvt.u32.u16 %r5417, %rs3493; - cvt.s32.s8 %r5418, %r5417; - cvt.u32.u16 %r5419, %rs3492; - cvt.s32.s8 %r5420, %r5419; - cvt.u32.u16 %r5421, %rs3491; - cvt.s32.s8 %r5422, %r5421; - cvt.u32.u16 %r5423, %rs3490; - cvt.s32.s8 %r5424, %r5423; - mad.lo.s32 %r5425, %r173, %r5424, %r5416; - mad.lo.s32 %r5426, %r53, %r5422, %r5425; - mad.lo.s32 %r5427, %r54, %r5420, %r5426; - mad.lo.s32 %r5428, %r55, %r5418, %r5427; - ld.const.v4.u8 {%rs3498, %rs3499, %rs3500, %rs3501}, 
[matrix+1748]; - cvt.u32.u16 %r5429, %rs3501; - cvt.s32.s8 %r5430, %r5429; - cvt.u32.u16 %r5431, %rs3500; - cvt.s32.s8 %r5432, %r5431; - cvt.u32.u16 %r5433, %rs3499; - cvt.s32.s8 %r5434, %r5433; - cvt.u32.u16 %r5435, %rs3498; - cvt.s32.s8 %r5436, %r5435; - mad.lo.s32 %r5437, %r56, %r5436, %r5428; - mad.lo.s32 %r5438, %r57, %r5434, %r5437; - mad.lo.s32 %r5439, %r58, %r5432, %r5438; - mad.lo.s32 %r5440, %r59, %r5430, %r5439; - ld.const.v4.u8 {%rs3506, %rs3507, %rs3508, %rs3509}, [matrix+1752]; - cvt.u32.u16 %r5441, %rs3509; - cvt.s32.s8 %r5442, %r5441; - cvt.u32.u16 %r5443, %rs3508; - cvt.s32.s8 %r5444, %r5443; - cvt.u32.u16 %r5445, %rs3507; - cvt.s32.s8 %r5446, %r5445; - cvt.u32.u16 %r5447, %rs3506; - cvt.s32.s8 %r5448, %r5447; - mad.lo.s32 %r5449, %r61, %r5448, %r5440; - mad.lo.s32 %r5450, %r62, %r5446, %r5449; - mad.lo.s32 %r5451, %r64, %r5444, %r5450; - mad.lo.s32 %r5452, %r65, %r5442, %r5451; - ld.const.v4.u8 {%rs3514, %rs3515, %rs3516, %rs3517}, [matrix+1756]; - cvt.u32.u16 %r5453, %rs3517; - cvt.s32.s8 %r5454, %r5453; - cvt.u32.u16 %r5455, %rs3516; - cvt.s32.s8 %r5456, %r5455; - cvt.u32.u16 %r5457, %rs3515; - cvt.s32.s8 %r5458, %r5457; - cvt.u32.u16 %r5459, %rs3514; - cvt.s32.s8 %r5460, %r5459; - mad.lo.s32 %r5461, %r67, %r5460, %r5452; - mad.lo.s32 %r5462, %r68, %r5458, %r5461; - mad.lo.s32 %r5463, %r69, %r5456, %r5462; - mad.lo.s32 %r5464, %r70, %r5454, %r5463; - ld.const.v4.u8 {%rs3522, %rs3523, %rs3524, %rs3525}, [matrix+1760]; - cvt.u32.u16 %r5465, %rs3525; - cvt.s32.s8 %r5466, %r5465; - cvt.u32.u16 %r5467, %rs3524; - cvt.s32.s8 %r5468, %r5467; - cvt.u32.u16 %r5469, %rs3523; - cvt.s32.s8 %r5470, %r5469; - cvt.u32.u16 %r5471, %rs3522; - cvt.s32.s8 %r5472, %r5471; - mad.lo.s32 %r5473, %r222, %r5472, %r5464; - mad.lo.s32 %r5474, %r72, %r5470, %r5473; - mad.lo.s32 %r5475, %r73, %r5468, %r5474; - mad.lo.s32 %r5476, %r74, %r5466, %r5475; - ld.const.v4.u8 {%rs3530, %rs3531, %rs3532, %rs3533}, [matrix+1764]; - cvt.u32.u16 %r5477, %rs3533; - cvt.s32.s8 %r5478, %r5477; - cvt.u32.u16 %r5479, %rs3532; - cvt.s32.s8 %r5480, %r5479; - cvt.u32.u16 %r5481, %rs3531; - cvt.s32.s8 %r5482, %r5481; - cvt.u32.u16 %r5483, %rs3530; - cvt.s32.s8 %r5484, %r5483; - mad.lo.s32 %r5485, %r75, %r5484, %r5476; - mad.lo.s32 %r5486, %r76, %r5482, %r5485; - mad.lo.s32 %r5487, %r77, %r5480, %r5486; - mad.lo.s32 %r5488, %r78, %r5478, %r5487; - ld.const.v4.u8 {%rs3538, %rs3539, %rs3540, %rs3541}, [matrix+1768]; - cvt.u32.u16 %r5489, %rs3541; - cvt.s32.s8 %r5490, %r5489; - cvt.u32.u16 %r5491, %rs3540; - cvt.s32.s8 %r5492, %r5491; - cvt.u32.u16 %r5493, %rs3539; - cvt.s32.s8 %r5494, %r5493; - cvt.u32.u16 %r5495, %rs3538; - cvt.s32.s8 %r5496, %r5495; - mad.lo.s32 %r5497, %r80, %r5496, %r5488; - mad.lo.s32 %r5498, %r81, %r5494, %r5497; - mad.lo.s32 %r5499, %r83, %r5492, %r5498; - mad.lo.s32 %r5500, %r84, %r5490, %r5499; - ld.const.v4.u8 {%rs3546, %rs3547, %rs3548, %rs3549}, [matrix+1772]; - cvt.u32.u16 %r5501, %rs3549; - cvt.s32.s8 %r5502, %r5501; - cvt.u32.u16 %r5503, %rs3548; - cvt.s32.s8 %r5504, %r5503; - cvt.u32.u16 %r5505, %rs3547; - cvt.s32.s8 %r5506, %r5505; - cvt.u32.u16 %r5507, %rs3546; - cvt.s32.s8 %r5508, %r5507; - mad.lo.s32 %r5509, %r86, %r5508, %r5500; - mad.lo.s32 %r5510, %r87, %r5506, %r5509; - mad.lo.s32 %r5511, %r88, %r5504, %r5510; - mad.lo.s32 %r5512, %r89, %r5502, %r5511; - ld.const.v4.u8 {%rs3554, %rs3555, %rs3556, %rs3557}, [matrix+1776]; - cvt.u32.u16 %r5513, %rs3557; - cvt.s32.s8 %r5514, %r5513; - cvt.u32.u16 %r5515, %rs3556; - cvt.s32.s8 %r5516, %r5515; - cvt.u32.u16 %r5517, %rs3555; - cvt.s32.s8 
%r5518, %r5517; - cvt.u32.u16 %r5519, %rs3554; - cvt.s32.s8 %r5520, %r5519; - mad.lo.s32 %r5521, %r271, %r5520, %r5512; - mad.lo.s32 %r5522, %r91, %r5518, %r5521; - mad.lo.s32 %r5523, %r93, %r5516, %r5522; - mad.lo.s32 %r5524, %r94, %r5514, %r5523; - ld.const.v4.u8 {%rs3562, %rs3563, %rs3564, %rs3565}, [matrix+1780]; - cvt.u32.u16 %r5525, %rs3565; - cvt.s32.s8 %r5526, %r5525; - cvt.u32.u16 %r5527, %rs3564; - cvt.s32.s8 %r5528, %r5527; - cvt.u32.u16 %r5529, %rs3563; - cvt.s32.s8 %r5530, %r5529; - cvt.u32.u16 %r5531, %rs3562; - cvt.s32.s8 %r5532, %r5531; - mad.lo.s32 %r5533, %r96, %r5532, %r5524; - mad.lo.s32 %r5534, %r97, %r5530, %r5533; - mad.lo.s32 %r5535, %r99, %r5528, %r5534; - mad.lo.s32 %r5536, %r100, %r5526, %r5535; - ld.const.v4.u8 {%rs3570, %rs3571, %rs3572, %rs3573}, [matrix+1784]; - cvt.u32.u16 %r5537, %rs3573; - cvt.s32.s8 %r5538, %r5537; - cvt.u32.u16 %r5539, %rs3572; - cvt.s32.s8 %r5540, %r5539; - cvt.u32.u16 %r5541, %rs3571; - cvt.s32.s8 %r5542, %r5541; - cvt.u32.u16 %r5543, %rs3570; - cvt.s32.s8 %r5544, %r5543; - mad.lo.s32 %r5545, %r103, %r5544, %r5536; - mad.lo.s32 %r5546, %r104, %r5542, %r5545; - mad.lo.s32 %r5547, %r107, %r5540, %r5546; - mad.lo.s32 %r5548, %r108, %r5538, %r5547; - ld.const.v4.u8 {%rs3578, %rs3579, %rs3580, %rs3581}, [matrix+1788]; - cvt.u32.u16 %r5549, %rs3581; - cvt.s32.s8 %r5550, %r5549; - cvt.u32.u16 %r5551, %rs3580; - cvt.s32.s8 %r5552, %r5551; - cvt.u32.u16 %r5553, %rs3579; - cvt.s32.s8 %r5554, %r5553; - cvt.u32.u16 %r5555, %rs3578; - cvt.s32.s8 %r5556, %r5555; - mad.lo.s32 %r5557, %r111, %r5556, %r5548; - mad.lo.s32 %r5558, %r112, %r5554, %r5557; - mad.lo.s32 %r5559, %r114, %r5552, %r5558; - mad.lo.s32 %r5560, %r115, %r5550, %r5559; - shr.u32 %r5561, %r5368, 6; - and.b32 %r5562, %r5561, 240; - shr.u32 %r5563, %r5560, 10; - or.b32 %r5564, %r5563, %r5562; - xor.b32 %r5565, %r23, %r5564; - cvt.u64.u32 %rd391, %r5565; - ld.const.v4.u8 {%rs3586, %rs3587, %rs3588, %rs3589}, [matrix+1792]; - cvt.u32.u16 %r5566, %rs3589; - cvt.s32.s8 %r5567, %r5566; - cvt.u32.u16 %r5568, %rs3588; - cvt.s32.s8 %r5569, %r5568; - cvt.u32.u16 %r5570, %rs3586; - cvt.s32.s8 %r5571, %r5570; - cvt.u32.u16 %r5572, %rs3587; - cvt.s32.s8 %r5573, %r5572; - mul.lo.s32 %r5574, %r34, %r5573; - mad.lo.s32 %r5575, %r124, %r5571, %r5574; - mad.lo.s32 %r5576, %r35, %r5569, %r5575; - mad.lo.s32 %r5577, %r36, %r5567, %r5576; - ld.const.v4.u8 {%rs3594, %rs3595, %rs3596, %rs3597}, [matrix+1796]; - cvt.u32.u16 %r5578, %rs3597; - cvt.s32.s8 %r5579, %r5578; - cvt.u32.u16 %r5580, %rs3596; - cvt.s32.s8 %r5581, %r5580; - cvt.u32.u16 %r5582, %rs3595; - cvt.s32.s8 %r5583, %r5582; - cvt.u32.u16 %r5584, %rs3594; - cvt.s32.s8 %r5585, %r5584; - mad.lo.s32 %r5586, %r37, %r5585, %r5577; - mad.lo.s32 %r5587, %r38, %r5583, %r5586; - mad.lo.s32 %r5588, %r39, %r5581, %r5587; - mad.lo.s32 %r5589, %r40, %r5579, %r5588; - ld.const.v4.u8 {%rs3602, %rs3603, %rs3604, %rs3605}, [matrix+1800]; - cvt.u32.u16 %r5590, %rs3605; - cvt.s32.s8 %r5591, %r5590; - cvt.u32.u16 %r5592, %rs3604; - cvt.s32.s8 %r5593, %r5592; - cvt.u32.u16 %r5594, %rs3603; - cvt.s32.s8 %r5595, %r5594; - cvt.u32.u16 %r5596, %rs3602; - cvt.s32.s8 %r5597, %r5596; - mad.lo.s32 %r5598, %r42, %r5597, %r5589; - mad.lo.s32 %r5599, %r43, %r5595, %r5598; - mad.lo.s32 %r5600, %r45, %r5593, %r5599; - mad.lo.s32 %r5601, %r46, %r5591, %r5600; - ld.const.v4.u8 {%rs3610, %rs3611, %rs3612, %rs3613}, [matrix+1804]; - cvt.u32.u16 %r5602, %rs3613; - cvt.s32.s8 %r5603, %r5602; - cvt.u32.u16 %r5604, %rs3612; - cvt.s32.s8 %r5605, %r5604; - cvt.u32.u16 %r5606, %rs3611; - 
cvt.s32.s8 %r5607, %r5606; - cvt.u32.u16 %r5608, %rs3610; - cvt.s32.s8 %r5609, %r5608; - mad.lo.s32 %r5610, %r48, %r5609, %r5601; - mad.lo.s32 %r5611, %r49, %r5607, %r5610; - mad.lo.s32 %r5612, %r50, %r5605, %r5611; - mad.lo.s32 %r5613, %r51, %r5603, %r5612; - ld.const.v4.u8 {%rs3618, %rs3619, %rs3620, %rs3621}, [matrix+1808]; - cvt.u32.u16 %r5614, %rs3621; - cvt.s32.s8 %r5615, %r5614; - cvt.u32.u16 %r5616, %rs3620; - cvt.s32.s8 %r5617, %r5616; - cvt.u32.u16 %r5618, %rs3619; - cvt.s32.s8 %r5619, %r5618; - cvt.u32.u16 %r5620, %rs3618; - cvt.s32.s8 %r5621, %r5620; - mad.lo.s32 %r5622, %r173, %r5621, %r5613; - mad.lo.s32 %r5623, %r53, %r5619, %r5622; - mad.lo.s32 %r5624, %r54, %r5617, %r5623; - mad.lo.s32 %r5625, %r55, %r5615, %r5624; - ld.const.v4.u8 {%rs3626, %rs3627, %rs3628, %rs3629}, [matrix+1812]; - cvt.u32.u16 %r5626, %rs3629; - cvt.s32.s8 %r5627, %r5626; - cvt.u32.u16 %r5628, %rs3628; - cvt.s32.s8 %r5629, %r5628; - cvt.u32.u16 %r5630, %rs3627; - cvt.s32.s8 %r5631, %r5630; - cvt.u32.u16 %r5632, %rs3626; - cvt.s32.s8 %r5633, %r5632; - mad.lo.s32 %r5634, %r56, %r5633, %r5625; - mad.lo.s32 %r5635, %r57, %r5631, %r5634; - mad.lo.s32 %r5636, %r58, %r5629, %r5635; - mad.lo.s32 %r5637, %r59, %r5627, %r5636; - ld.const.v4.u8 {%rs3634, %rs3635, %rs3636, %rs3637}, [matrix+1816]; - cvt.u32.u16 %r5638, %rs3637; - cvt.s32.s8 %r5639, %r5638; - cvt.u32.u16 %r5640, %rs3636; - cvt.s32.s8 %r5641, %r5640; - cvt.u32.u16 %r5642, %rs3635; - cvt.s32.s8 %r5643, %r5642; - cvt.u32.u16 %r5644, %rs3634; - cvt.s32.s8 %r5645, %r5644; - mad.lo.s32 %r5646, %r61, %r5645, %r5637; - mad.lo.s32 %r5647, %r62, %r5643, %r5646; - mad.lo.s32 %r5648, %r64, %r5641, %r5647; - mad.lo.s32 %r5649, %r65, %r5639, %r5648; - ld.const.v4.u8 {%rs3642, %rs3643, %rs3644, %rs3645}, [matrix+1820]; - cvt.u32.u16 %r5650, %rs3645; - cvt.s32.s8 %r5651, %r5650; - cvt.u32.u16 %r5652, %rs3644; - cvt.s32.s8 %r5653, %r5652; - cvt.u32.u16 %r5654, %rs3643; - cvt.s32.s8 %r5655, %r5654; - cvt.u32.u16 %r5656, %rs3642; - cvt.s32.s8 %r5657, %r5656; - mad.lo.s32 %r5658, %r67, %r5657, %r5649; - mad.lo.s32 %r5659, %r68, %r5655, %r5658; - mad.lo.s32 %r5660, %r69, %r5653, %r5659; - mad.lo.s32 %r5661, %r70, %r5651, %r5660; - ld.const.v4.u8 {%rs3650, %rs3651, %rs3652, %rs3653}, [matrix+1824]; - cvt.u32.u16 %r5662, %rs3653; - cvt.s32.s8 %r5663, %r5662; - cvt.u32.u16 %r5664, %rs3652; - cvt.s32.s8 %r5665, %r5664; - cvt.u32.u16 %r5666, %rs3651; - cvt.s32.s8 %r5667, %r5666; - cvt.u32.u16 %r5668, %rs3650; - cvt.s32.s8 %r5669, %r5668; - mad.lo.s32 %r5670, %r222, %r5669, %r5661; - mad.lo.s32 %r5671, %r72, %r5667, %r5670; - mad.lo.s32 %r5672, %r73, %r5665, %r5671; - mad.lo.s32 %r5673, %r74, %r5663, %r5672; - ld.const.v4.u8 {%rs3658, %rs3659, %rs3660, %rs3661}, [matrix+1828]; - cvt.u32.u16 %r5674, %rs3661; - cvt.s32.s8 %r5675, %r5674; - cvt.u32.u16 %r5676, %rs3660; - cvt.s32.s8 %r5677, %r5676; - cvt.u32.u16 %r5678, %rs3659; - cvt.s32.s8 %r5679, %r5678; - cvt.u32.u16 %r5680, %rs3658; - cvt.s32.s8 %r5681, %r5680; - mad.lo.s32 %r5682, %r75, %r5681, %r5673; - mad.lo.s32 %r5683, %r76, %r5679, %r5682; - mad.lo.s32 %r5684, %r77, %r5677, %r5683; - mad.lo.s32 %r5685, %r78, %r5675, %r5684; - ld.const.v4.u8 {%rs3666, %rs3667, %rs3668, %rs3669}, [matrix+1832]; - cvt.u32.u16 %r5686, %rs3669; - cvt.s32.s8 %r5687, %r5686; - cvt.u32.u16 %r5688, %rs3668; - cvt.s32.s8 %r5689, %r5688; - cvt.u32.u16 %r5690, %rs3667; - cvt.s32.s8 %r5691, %r5690; - cvt.u32.u16 %r5692, %rs3666; - cvt.s32.s8 %r5693, %r5692; - mad.lo.s32 %r5694, %r80, %r5693, %r5685; - mad.lo.s32 %r5695, %r81, %r5691, %r5694; - 
-	mad.lo.s32 %r5696, %r83, %r5689, %r5695;
-	mad.lo.s32 %r5697, %r84, %r5687, %r5696;
-	ld.const.v4.u8 {%rs3674, %rs3675, %rs3676, %rs3677}, [matrix+1836];
-	cvt.u32.u16 %r5698, %rs3677;
-	cvt.s32.s8 %r5699, %r5698;
-	cvt.u32.u16 %r5700, %rs3676;
-	cvt.s32.s8 %r5701, %r5700;
-	cvt.u32.u16 %r5702, %rs3675;
-	cvt.s32.s8 %r5703, %r5702;
-	cvt.u32.u16 %r5704, %rs3674;
-	cvt.s32.s8 %r5705, %r5704;
-	mad.lo.s32 %r5706, %r86, %r5705, %r5697;
-	mad.lo.s32 %r5707, %r87, %r5703, %r5706;
-	mad.lo.s32 %r5708, %r88, %r5701, %r5707;
-	mad.lo.s32 %r5709, %r89, %r5699, %r5708;

[... several thousand further deleted lines of generated PTX elided. The same pattern repeats for constant-memory offsets [matrix+1836] through [matrix+2416]: a v4.u8 load, sign extension of each byte (cvt.u32.u16 then cvt.s32.s8), and four mad.lo.s32 multiply-accumulates per load. Every pair of completed 64-term dot products is then packed into a byte and xor'ed into the running state, e.g.: ...]

-	shr.u32 %r5950, %r5757, 6;
-	and.b32 %r5951, %r5950, 240;
-	shr.u32 %r5952, %r5949, 10;
-	or.b32 %r5953, %r5952, %r5951;
-	xor.b32 %r5954, %r24, %r5953;
-	cvt.u64.u32 %rd392, %r5954;
mad.lo.s32 %r7466, %r271, %r7465, %r7457; - mad.lo.s32 %r7467, %r91, %r7463, %r7466; - mad.lo.s32 %r7468, %r93, %r7461, %r7467; - mad.lo.s32 %r7469, %r94, %r7459, %r7468; - ld.const.v4.u8 {%rs4842, %rs4843, %rs4844, %rs4845}, [matrix+2420]; - cvt.u32.u16 %r7470, %rs4845; - cvt.s32.s8 %r7471, %r7470; - cvt.u32.u16 %r7472, %rs4844; - cvt.s32.s8 %r7473, %r7472; - cvt.u32.u16 %r7474, %rs4843; - cvt.s32.s8 %r7475, %r7474; - cvt.u32.u16 %r7476, %rs4842; - cvt.s32.s8 %r7477, %r7476; - mad.lo.s32 %r7478, %r96, %r7477, %r7469; - mad.lo.s32 %r7479, %r97, %r7475, %r7478; - mad.lo.s32 %r7480, %r99, %r7473, %r7479; - mad.lo.s32 %r7481, %r100, %r7471, %r7480; - ld.const.v4.u8 {%rs4850, %rs4851, %rs4852, %rs4853}, [matrix+2424]; - cvt.u32.u16 %r7482, %rs4853; - cvt.s32.s8 %r7483, %r7482; - cvt.u32.u16 %r7484, %rs4852; - cvt.s32.s8 %r7485, %r7484; - cvt.u32.u16 %r7486, %rs4851; - cvt.s32.s8 %r7487, %r7486; - cvt.u32.u16 %r7488, %rs4850; - cvt.s32.s8 %r7489, %r7488; - mad.lo.s32 %r7490, %r103, %r7489, %r7481; - mad.lo.s32 %r7491, %r104, %r7487, %r7490; - mad.lo.s32 %r7492, %r107, %r7485, %r7491; - mad.lo.s32 %r7493, %r108, %r7483, %r7492; - ld.const.v4.u8 {%rs4858, %rs4859, %rs4860, %rs4861}, [matrix+2428]; - cvt.u32.u16 %r7494, %rs4861; - cvt.s32.s8 %r7495, %r7494; - cvt.u32.u16 %r7496, %rs4860; - cvt.s32.s8 %r7497, %r7496; - cvt.u32.u16 %r7498, %rs4859; - cvt.s32.s8 %r7499, %r7498; - cvt.u32.u16 %r7500, %rs4858; - cvt.s32.s8 %r7501, %r7500; - mad.lo.s32 %r7502, %r111, %r7501, %r7493; - mad.lo.s32 %r7503, %r112, %r7499, %r7502; - mad.lo.s32 %r7504, %r114, %r7497, %r7503; - mad.lo.s32 %r7505, %r115, %r7495, %r7504; - shr.u32 %r7506, %r7313, 6; - and.b32 %r7507, %r7506, 240; - shr.u32 %r7508, %r7505, 10; - or.b32 %r7509, %r7508, %r7507; - xor.b32 %r7510, %r27, %r7509; - cvt.u64.u32 %rd394, %r7510; - ld.const.v4.u8 {%rs4866, %rs4867, %rs4868, %rs4869}, [matrix+2432]; - cvt.u32.u16 %r7511, %rs4869; - cvt.s32.s8 %r7512, %r7511; - cvt.u32.u16 %r7513, %rs4868; - cvt.s32.s8 %r7514, %r7513; - cvt.u32.u16 %r7515, %rs4866; - cvt.s32.s8 %r7516, %r7515; - cvt.u32.u16 %r7517, %rs4867; - cvt.s32.s8 %r7518, %r7517; - mul.lo.s32 %r7519, %r34, %r7518; - mad.lo.s32 %r7520, %r124, %r7516, %r7519; - mad.lo.s32 %r7521, %r35, %r7514, %r7520; - mad.lo.s32 %r7522, %r36, %r7512, %r7521; - ld.const.v4.u8 {%rs4874, %rs4875, %rs4876, %rs4877}, [matrix+2436]; - cvt.u32.u16 %r7523, %rs4877; - cvt.s32.s8 %r7524, %r7523; - cvt.u32.u16 %r7525, %rs4876; - cvt.s32.s8 %r7526, %r7525; - cvt.u32.u16 %r7527, %rs4875; - cvt.s32.s8 %r7528, %r7527; - cvt.u32.u16 %r7529, %rs4874; - cvt.s32.s8 %r7530, %r7529; - mad.lo.s32 %r7531, %r37, %r7530, %r7522; - mad.lo.s32 %r7532, %r38, %r7528, %r7531; - mad.lo.s32 %r7533, %r39, %r7526, %r7532; - mad.lo.s32 %r7534, %r40, %r7524, %r7533; - ld.const.v4.u8 {%rs4882, %rs4883, %rs4884, %rs4885}, [matrix+2440]; - cvt.u32.u16 %r7535, %rs4885; - cvt.s32.s8 %r7536, %r7535; - cvt.u32.u16 %r7537, %rs4884; - cvt.s32.s8 %r7538, %r7537; - cvt.u32.u16 %r7539, %rs4883; - cvt.s32.s8 %r7540, %r7539; - cvt.u32.u16 %r7541, %rs4882; - cvt.s32.s8 %r7542, %r7541; - mad.lo.s32 %r7543, %r42, %r7542, %r7534; - mad.lo.s32 %r7544, %r43, %r7540, %r7543; - mad.lo.s32 %r7545, %r45, %r7538, %r7544; - mad.lo.s32 %r7546, %r46, %r7536, %r7545; - ld.const.v4.u8 {%rs4890, %rs4891, %rs4892, %rs4893}, [matrix+2444]; - cvt.u32.u16 %r7547, %rs4893; - cvt.s32.s8 %r7548, %r7547; - cvt.u32.u16 %r7549, %rs4892; - cvt.s32.s8 %r7550, %r7549; - cvt.u32.u16 %r7551, %rs4891; - cvt.s32.s8 %r7552, %r7551; - cvt.u32.u16 %r7553, %rs4890; - cvt.s32.s8 %r7554, 
%r7553; - mad.lo.s32 %r7555, %r48, %r7554, %r7546; - mad.lo.s32 %r7556, %r49, %r7552, %r7555; - mad.lo.s32 %r7557, %r50, %r7550, %r7556; - mad.lo.s32 %r7558, %r51, %r7548, %r7557; - ld.const.v4.u8 {%rs4898, %rs4899, %rs4900, %rs4901}, [matrix+2448]; - cvt.u32.u16 %r7559, %rs4901; - cvt.s32.s8 %r7560, %r7559; - cvt.u32.u16 %r7561, %rs4900; - cvt.s32.s8 %r7562, %r7561; - cvt.u32.u16 %r7563, %rs4899; - cvt.s32.s8 %r7564, %r7563; - cvt.u32.u16 %r7565, %rs4898; - cvt.s32.s8 %r7566, %r7565; - mad.lo.s32 %r7567, %r173, %r7566, %r7558; - mad.lo.s32 %r7568, %r53, %r7564, %r7567; - mad.lo.s32 %r7569, %r54, %r7562, %r7568; - mad.lo.s32 %r7570, %r55, %r7560, %r7569; - ld.const.v4.u8 {%rs4906, %rs4907, %rs4908, %rs4909}, [matrix+2452]; - cvt.u32.u16 %r7571, %rs4909; - cvt.s32.s8 %r7572, %r7571; - cvt.u32.u16 %r7573, %rs4908; - cvt.s32.s8 %r7574, %r7573; - cvt.u32.u16 %r7575, %rs4907; - cvt.s32.s8 %r7576, %r7575; - cvt.u32.u16 %r7577, %rs4906; - cvt.s32.s8 %r7578, %r7577; - mad.lo.s32 %r7579, %r56, %r7578, %r7570; - mad.lo.s32 %r7580, %r57, %r7576, %r7579; - mad.lo.s32 %r7581, %r58, %r7574, %r7580; - mad.lo.s32 %r7582, %r59, %r7572, %r7581; - ld.const.v4.u8 {%rs4914, %rs4915, %rs4916, %rs4917}, [matrix+2456]; - cvt.u32.u16 %r7583, %rs4917; - cvt.s32.s8 %r7584, %r7583; - cvt.u32.u16 %r7585, %rs4916; - cvt.s32.s8 %r7586, %r7585; - cvt.u32.u16 %r7587, %rs4915; - cvt.s32.s8 %r7588, %r7587; - cvt.u32.u16 %r7589, %rs4914; - cvt.s32.s8 %r7590, %r7589; - mad.lo.s32 %r7591, %r61, %r7590, %r7582; - mad.lo.s32 %r7592, %r62, %r7588, %r7591; - mad.lo.s32 %r7593, %r64, %r7586, %r7592; - mad.lo.s32 %r7594, %r65, %r7584, %r7593; - ld.const.v4.u8 {%rs4922, %rs4923, %rs4924, %rs4925}, [matrix+2460]; - cvt.u32.u16 %r7595, %rs4925; - cvt.s32.s8 %r7596, %r7595; - cvt.u32.u16 %r7597, %rs4924; - cvt.s32.s8 %r7598, %r7597; - cvt.u32.u16 %r7599, %rs4923; - cvt.s32.s8 %r7600, %r7599; - cvt.u32.u16 %r7601, %rs4922; - cvt.s32.s8 %r7602, %r7601; - mad.lo.s32 %r7603, %r67, %r7602, %r7594; - mad.lo.s32 %r7604, %r68, %r7600, %r7603; - mad.lo.s32 %r7605, %r69, %r7598, %r7604; - mad.lo.s32 %r7606, %r70, %r7596, %r7605; - ld.const.v4.u8 {%rs4930, %rs4931, %rs4932, %rs4933}, [matrix+2464]; - cvt.u32.u16 %r7607, %rs4933; - cvt.s32.s8 %r7608, %r7607; - cvt.u32.u16 %r7609, %rs4932; - cvt.s32.s8 %r7610, %r7609; - cvt.u32.u16 %r7611, %rs4931; - cvt.s32.s8 %r7612, %r7611; - cvt.u32.u16 %r7613, %rs4930; - cvt.s32.s8 %r7614, %r7613; - mad.lo.s32 %r7615, %r222, %r7614, %r7606; - mad.lo.s32 %r7616, %r72, %r7612, %r7615; - mad.lo.s32 %r7617, %r73, %r7610, %r7616; - mad.lo.s32 %r7618, %r74, %r7608, %r7617; - ld.const.v4.u8 {%rs4938, %rs4939, %rs4940, %rs4941}, [matrix+2468]; - cvt.u32.u16 %r7619, %rs4941; - cvt.s32.s8 %r7620, %r7619; - cvt.u32.u16 %r7621, %rs4940; - cvt.s32.s8 %r7622, %r7621; - cvt.u32.u16 %r7623, %rs4939; - cvt.s32.s8 %r7624, %r7623; - cvt.u32.u16 %r7625, %rs4938; - cvt.s32.s8 %r7626, %r7625; - mad.lo.s32 %r7627, %r75, %r7626, %r7618; - mad.lo.s32 %r7628, %r76, %r7624, %r7627; - mad.lo.s32 %r7629, %r77, %r7622, %r7628; - mad.lo.s32 %r7630, %r78, %r7620, %r7629; - ld.const.v4.u8 {%rs4946, %rs4947, %rs4948, %rs4949}, [matrix+2472]; - cvt.u32.u16 %r7631, %rs4949; - cvt.s32.s8 %r7632, %r7631; - cvt.u32.u16 %r7633, %rs4948; - cvt.s32.s8 %r7634, %r7633; - cvt.u32.u16 %r7635, %rs4947; - cvt.s32.s8 %r7636, %r7635; - cvt.u32.u16 %r7637, %rs4946; - cvt.s32.s8 %r7638, %r7637; - mad.lo.s32 %r7639, %r80, %r7638, %r7630; - mad.lo.s32 %r7640, %r81, %r7636, %r7639; - mad.lo.s32 %r7641, %r83, %r7634, %r7640; - mad.lo.s32 %r7642, %r84, %r7632, 
%r7641; - ld.const.v4.u8 {%rs4954, %rs4955, %rs4956, %rs4957}, [matrix+2476]; - cvt.u32.u16 %r7643, %rs4957; - cvt.s32.s8 %r7644, %r7643; - cvt.u32.u16 %r7645, %rs4956; - cvt.s32.s8 %r7646, %r7645; - cvt.u32.u16 %r7647, %rs4955; - cvt.s32.s8 %r7648, %r7647; - cvt.u32.u16 %r7649, %rs4954; - cvt.s32.s8 %r7650, %r7649; - mad.lo.s32 %r7651, %r86, %r7650, %r7642; - mad.lo.s32 %r7652, %r87, %r7648, %r7651; - mad.lo.s32 %r7653, %r88, %r7646, %r7652; - mad.lo.s32 %r7654, %r89, %r7644, %r7653; - ld.const.v4.u8 {%rs4962, %rs4963, %rs4964, %rs4965}, [matrix+2480]; - cvt.u32.u16 %r7655, %rs4965; - cvt.s32.s8 %r7656, %r7655; - cvt.u32.u16 %r7657, %rs4964; - cvt.s32.s8 %r7658, %r7657; - cvt.u32.u16 %r7659, %rs4963; - cvt.s32.s8 %r7660, %r7659; - cvt.u32.u16 %r7661, %rs4962; - cvt.s32.s8 %r7662, %r7661; - mad.lo.s32 %r7663, %r271, %r7662, %r7654; - mad.lo.s32 %r7664, %r91, %r7660, %r7663; - mad.lo.s32 %r7665, %r93, %r7658, %r7664; - mad.lo.s32 %r7666, %r94, %r7656, %r7665; - ld.const.v4.u8 {%rs4970, %rs4971, %rs4972, %rs4973}, [matrix+2484]; - cvt.u32.u16 %r7667, %rs4973; - cvt.s32.s8 %r7668, %r7667; - cvt.u32.u16 %r7669, %rs4972; - cvt.s32.s8 %r7670, %r7669; - cvt.u32.u16 %r7671, %rs4971; - cvt.s32.s8 %r7672, %r7671; - cvt.u32.u16 %r7673, %rs4970; - cvt.s32.s8 %r7674, %r7673; - mad.lo.s32 %r7675, %r96, %r7674, %r7666; - mad.lo.s32 %r7676, %r97, %r7672, %r7675; - mad.lo.s32 %r7677, %r99, %r7670, %r7676; - mad.lo.s32 %r7678, %r100, %r7668, %r7677; - ld.const.v4.u8 {%rs4978, %rs4979, %rs4980, %rs4981}, [matrix+2488]; - cvt.u32.u16 %r7679, %rs4981; - cvt.s32.s8 %r7680, %r7679; - cvt.u32.u16 %r7681, %rs4980; - cvt.s32.s8 %r7682, %r7681; - cvt.u32.u16 %r7683, %rs4979; - cvt.s32.s8 %r7684, %r7683; - cvt.u32.u16 %r7685, %rs4978; - cvt.s32.s8 %r7686, %r7685; - mad.lo.s32 %r7687, %r103, %r7686, %r7678; - mad.lo.s32 %r7688, %r104, %r7684, %r7687; - mad.lo.s32 %r7689, %r107, %r7682, %r7688; - mad.lo.s32 %r7690, %r108, %r7680, %r7689; - ld.const.v4.u8 {%rs4986, %rs4987, %rs4988, %rs4989}, [matrix+2492]; - cvt.u32.u16 %r7691, %rs4989; - cvt.s32.s8 %r7692, %r7691; - cvt.u32.u16 %r7693, %rs4988; - cvt.s32.s8 %r7694, %r7693; - cvt.u32.u16 %r7695, %rs4987; - cvt.s32.s8 %r7696, %r7695; - cvt.u32.u16 %r7697, %rs4986; - cvt.s32.s8 %r7698, %r7697; - mad.lo.s32 %r7699, %r111, %r7698, %r7690; - mad.lo.s32 %r7700, %r112, %r7696, %r7699; - mad.lo.s32 %r7701, %r114, %r7694, %r7700; - mad.lo.s32 %r7702, %r115, %r7692, %r7701; - ld.const.v4.u8 {%rs4994, %rs4995, %rs4996, %rs4997}, [matrix+2496]; - cvt.u32.u16 %r7703, %rs4997; - cvt.s32.s8 %r7704, %r7703; - cvt.u32.u16 %r7705, %rs4996; - cvt.s32.s8 %r7706, %r7705; - cvt.u32.u16 %r7707, %rs4994; - cvt.s32.s8 %r7708, %r7707; - cvt.u32.u16 %r7709, %rs4995; - cvt.s32.s8 %r7710, %r7709; - mul.lo.s32 %r7711, %r34, %r7710; - mad.lo.s32 %r7712, %r124, %r7708, %r7711; - mad.lo.s32 %r7713, %r35, %r7706, %r7712; - mad.lo.s32 %r7714, %r36, %r7704, %r7713; - ld.const.v4.u8 {%rs5002, %rs5003, %rs5004, %rs5005}, [matrix+2500]; - cvt.u32.u16 %r7715, %rs5005; - cvt.s32.s8 %r7716, %r7715; - cvt.u32.u16 %r7717, %rs5004; - cvt.s32.s8 %r7718, %r7717; - cvt.u32.u16 %r7719, %rs5003; - cvt.s32.s8 %r7720, %r7719; - cvt.u32.u16 %r7721, %rs5002; - cvt.s32.s8 %r7722, %r7721; - mad.lo.s32 %r7723, %r37, %r7722, %r7714; - mad.lo.s32 %r7724, %r38, %r7720, %r7723; - mad.lo.s32 %r7725, %r39, %r7718, %r7724; - mad.lo.s32 %r7726, %r40, %r7716, %r7725; - ld.const.v4.u8 {%rs5010, %rs5011, %rs5012, %rs5013}, [matrix+2504]; - cvt.u32.u16 %r7727, %rs5013; - cvt.s32.s8 %r7728, %r7727; - cvt.u32.u16 %r7729, %rs5012; - 
cvt.s32.s8 %r7730, %r7729; - cvt.u32.u16 %r7731, %rs5011; - cvt.s32.s8 %r7732, %r7731; - cvt.u32.u16 %r7733, %rs5010; - cvt.s32.s8 %r7734, %r7733; - mad.lo.s32 %r7735, %r42, %r7734, %r7726; - mad.lo.s32 %r7736, %r43, %r7732, %r7735; - mad.lo.s32 %r7737, %r45, %r7730, %r7736; - mad.lo.s32 %r7738, %r46, %r7728, %r7737; - ld.const.v4.u8 {%rs5018, %rs5019, %rs5020, %rs5021}, [matrix+2508]; - cvt.u32.u16 %r7739, %rs5021; - cvt.s32.s8 %r7740, %r7739; - cvt.u32.u16 %r7741, %rs5020; - cvt.s32.s8 %r7742, %r7741; - cvt.u32.u16 %r7743, %rs5019; - cvt.s32.s8 %r7744, %r7743; - cvt.u32.u16 %r7745, %rs5018; - cvt.s32.s8 %r7746, %r7745; - mad.lo.s32 %r7747, %r48, %r7746, %r7738; - mad.lo.s32 %r7748, %r49, %r7744, %r7747; - mad.lo.s32 %r7749, %r50, %r7742, %r7748; - mad.lo.s32 %r7750, %r51, %r7740, %r7749; - ld.const.v4.u8 {%rs5026, %rs5027, %rs5028, %rs5029}, [matrix+2512]; - cvt.u32.u16 %r7751, %rs5029; - cvt.s32.s8 %r7752, %r7751; - cvt.u32.u16 %r7753, %rs5028; - cvt.s32.s8 %r7754, %r7753; - cvt.u32.u16 %r7755, %rs5027; - cvt.s32.s8 %r7756, %r7755; - cvt.u32.u16 %r7757, %rs5026; - cvt.s32.s8 %r7758, %r7757; - mad.lo.s32 %r7759, %r173, %r7758, %r7750; - mad.lo.s32 %r7760, %r53, %r7756, %r7759; - mad.lo.s32 %r7761, %r54, %r7754, %r7760; - mad.lo.s32 %r7762, %r55, %r7752, %r7761; - ld.const.v4.u8 {%rs5034, %rs5035, %rs5036, %rs5037}, [matrix+2516]; - cvt.u32.u16 %r7763, %rs5037; - cvt.s32.s8 %r7764, %r7763; - cvt.u32.u16 %r7765, %rs5036; - cvt.s32.s8 %r7766, %r7765; - cvt.u32.u16 %r7767, %rs5035; - cvt.s32.s8 %r7768, %r7767; - cvt.u32.u16 %r7769, %rs5034; - cvt.s32.s8 %r7770, %r7769; - mad.lo.s32 %r7771, %r56, %r7770, %r7762; - mad.lo.s32 %r7772, %r57, %r7768, %r7771; - mad.lo.s32 %r7773, %r58, %r7766, %r7772; - mad.lo.s32 %r7774, %r59, %r7764, %r7773; - ld.const.v4.u8 {%rs5042, %rs5043, %rs5044, %rs5045}, [matrix+2520]; - cvt.u32.u16 %r7775, %rs5045; - cvt.s32.s8 %r7776, %r7775; - cvt.u32.u16 %r7777, %rs5044; - cvt.s32.s8 %r7778, %r7777; - cvt.u32.u16 %r7779, %rs5043; - cvt.s32.s8 %r7780, %r7779; - cvt.u32.u16 %r7781, %rs5042; - cvt.s32.s8 %r7782, %r7781; - mad.lo.s32 %r7783, %r61, %r7782, %r7774; - mad.lo.s32 %r7784, %r62, %r7780, %r7783; - mad.lo.s32 %r7785, %r64, %r7778, %r7784; - mad.lo.s32 %r7786, %r65, %r7776, %r7785; - ld.const.v4.u8 {%rs5050, %rs5051, %rs5052, %rs5053}, [matrix+2524]; - cvt.u32.u16 %r7787, %rs5053; - cvt.s32.s8 %r7788, %r7787; - cvt.u32.u16 %r7789, %rs5052; - cvt.s32.s8 %r7790, %r7789; - cvt.u32.u16 %r7791, %rs5051; - cvt.s32.s8 %r7792, %r7791; - cvt.u32.u16 %r7793, %rs5050; - cvt.s32.s8 %r7794, %r7793; - mad.lo.s32 %r7795, %r67, %r7794, %r7786; - mad.lo.s32 %r7796, %r68, %r7792, %r7795; - mad.lo.s32 %r7797, %r69, %r7790, %r7796; - mad.lo.s32 %r7798, %r70, %r7788, %r7797; - ld.const.v4.u8 {%rs5058, %rs5059, %rs5060, %rs5061}, [matrix+2528]; - cvt.u32.u16 %r7799, %rs5061; - cvt.s32.s8 %r7800, %r7799; - cvt.u32.u16 %r7801, %rs5060; - cvt.s32.s8 %r7802, %r7801; - cvt.u32.u16 %r7803, %rs5059; - cvt.s32.s8 %r7804, %r7803; - cvt.u32.u16 %r7805, %rs5058; - cvt.s32.s8 %r7806, %r7805; - mad.lo.s32 %r7807, %r222, %r7806, %r7798; - mad.lo.s32 %r7808, %r72, %r7804, %r7807; - mad.lo.s32 %r7809, %r73, %r7802, %r7808; - mad.lo.s32 %r7810, %r74, %r7800, %r7809; - ld.const.v4.u8 {%rs5066, %rs5067, %rs5068, %rs5069}, [matrix+2532]; - cvt.u32.u16 %r7811, %rs5069; - cvt.s32.s8 %r7812, %r7811; - cvt.u32.u16 %r7813, %rs5068; - cvt.s32.s8 %r7814, %r7813; - cvt.u32.u16 %r7815, %rs5067; - cvt.s32.s8 %r7816, %r7815; - cvt.u32.u16 %r7817, %rs5066; - cvt.s32.s8 %r7818, %r7817; - mad.lo.s32 %r7819, %r75, 
%r7818, %r7810; - mad.lo.s32 %r7820, %r76, %r7816, %r7819; - mad.lo.s32 %r7821, %r77, %r7814, %r7820; - mad.lo.s32 %r7822, %r78, %r7812, %r7821; - ld.const.v4.u8 {%rs5074, %rs5075, %rs5076, %rs5077}, [matrix+2536]; - cvt.u32.u16 %r7823, %rs5077; - cvt.s32.s8 %r7824, %r7823; - cvt.u32.u16 %r7825, %rs5076; - cvt.s32.s8 %r7826, %r7825; - cvt.u32.u16 %r7827, %rs5075; - cvt.s32.s8 %r7828, %r7827; - cvt.u32.u16 %r7829, %rs5074; - cvt.s32.s8 %r7830, %r7829; - mad.lo.s32 %r7831, %r80, %r7830, %r7822; - mad.lo.s32 %r7832, %r81, %r7828, %r7831; - mad.lo.s32 %r7833, %r83, %r7826, %r7832; - mad.lo.s32 %r7834, %r84, %r7824, %r7833; - ld.const.v4.u8 {%rs5082, %rs5083, %rs5084, %rs5085}, [matrix+2540]; - cvt.u32.u16 %r7835, %rs5085; - cvt.s32.s8 %r7836, %r7835; - cvt.u32.u16 %r7837, %rs5084; - cvt.s32.s8 %r7838, %r7837; - cvt.u32.u16 %r7839, %rs5083; - cvt.s32.s8 %r7840, %r7839; - cvt.u32.u16 %r7841, %rs5082; - cvt.s32.s8 %r7842, %r7841; - mad.lo.s32 %r7843, %r86, %r7842, %r7834; - mad.lo.s32 %r7844, %r87, %r7840, %r7843; - mad.lo.s32 %r7845, %r88, %r7838, %r7844; - mad.lo.s32 %r7846, %r89, %r7836, %r7845; - ld.const.v4.u8 {%rs5090, %rs5091, %rs5092, %rs5093}, [matrix+2544]; - cvt.u32.u16 %r7847, %rs5093; - cvt.s32.s8 %r7848, %r7847; - cvt.u32.u16 %r7849, %rs5092; - cvt.s32.s8 %r7850, %r7849; - cvt.u32.u16 %r7851, %rs5091; - cvt.s32.s8 %r7852, %r7851; - cvt.u32.u16 %r7853, %rs5090; - cvt.s32.s8 %r7854, %r7853; - mad.lo.s32 %r7855, %r271, %r7854, %r7846; - mad.lo.s32 %r7856, %r91, %r7852, %r7855; - mad.lo.s32 %r7857, %r93, %r7850, %r7856; - mad.lo.s32 %r7858, %r94, %r7848, %r7857; - ld.const.v4.u8 {%rs5098, %rs5099, %rs5100, %rs5101}, [matrix+2548]; - cvt.u32.u16 %r7859, %rs5101; - cvt.s32.s8 %r7860, %r7859; - cvt.u32.u16 %r7861, %rs5100; - cvt.s32.s8 %r7862, %r7861; - cvt.u32.u16 %r7863, %rs5099; - cvt.s32.s8 %r7864, %r7863; - cvt.u32.u16 %r7865, %rs5098; - cvt.s32.s8 %r7866, %r7865; - mad.lo.s32 %r7867, %r96, %r7866, %r7858; - mad.lo.s32 %r7868, %r97, %r7864, %r7867; - mad.lo.s32 %r7869, %r99, %r7862, %r7868; - mad.lo.s32 %r7870, %r100, %r7860, %r7869; - ld.const.v4.u8 {%rs5106, %rs5107, %rs5108, %rs5109}, [matrix+2552]; - cvt.u32.u16 %r7871, %rs5109; - cvt.s32.s8 %r7872, %r7871; - cvt.u32.u16 %r7873, %rs5108; - cvt.s32.s8 %r7874, %r7873; - cvt.u32.u16 %r7875, %rs5107; - cvt.s32.s8 %r7876, %r7875; - cvt.u32.u16 %r7877, %rs5106; - cvt.s32.s8 %r7878, %r7877; - mad.lo.s32 %r7879, %r103, %r7878, %r7870; - mad.lo.s32 %r7880, %r104, %r7876, %r7879; - mad.lo.s32 %r7881, %r107, %r7874, %r7880; - mad.lo.s32 %r7882, %r108, %r7872, %r7881; - ld.const.v4.u8 {%rs5114, %rs5115, %rs5116, %rs5117}, [matrix+2556]; - cvt.u32.u16 %r7883, %rs5117; - cvt.s32.s8 %r7884, %r7883; - cvt.u32.u16 %r7885, %rs5116; - cvt.s32.s8 %r7886, %r7885; - cvt.u32.u16 %r7887, %rs5115; - cvt.s32.s8 %r7888, %r7887; - cvt.u32.u16 %r7889, %rs5114; - cvt.s32.s8 %r7890, %r7889; - mad.lo.s32 %r7891, %r111, %r7890, %r7882; - mad.lo.s32 %r7892, %r112, %r7888, %r7891; - mad.lo.s32 %r7893, %r114, %r7886, %r7892; - mad.lo.s32 %r7894, %r115, %r7884, %r7893; - shr.u32 %r7895, %r7702, 6; - and.b32 %r7896, %r7895, 240; - shr.u32 %r7897, %r7894, 10; - or.b32 %r7898, %r7897, %r7896; - xor.b32 %r7899, %r28, %r7898; - cvt.u64.u32 %rd395, %r7899; - ld.const.v4.u8 {%rs5122, %rs5123, %rs5124, %rs5125}, [matrix+2560]; - cvt.u32.u16 %r7900, %rs5125; - cvt.s32.s8 %r7901, %r7900; - cvt.u32.u16 %r7902, %rs5124; - cvt.s32.s8 %r7903, %r7902; - cvt.u32.u16 %r7904, %rs5122; - cvt.s32.s8 %r7905, %r7904; - cvt.u32.u16 %r7906, %rs5123; - cvt.s32.s8 %r7907, %r7906; - mul.lo.s32 
%r7908, %r34, %r7907; - mad.lo.s32 %r7909, %r124, %r7905, %r7908; - mad.lo.s32 %r7910, %r35, %r7903, %r7909; - mad.lo.s32 %r7911, %r36, %r7901, %r7910; - ld.const.v4.u8 {%rs5130, %rs5131, %rs5132, %rs5133}, [matrix+2564]; - cvt.u32.u16 %r7912, %rs5133; - cvt.s32.s8 %r7913, %r7912; - cvt.u32.u16 %r7914, %rs5132; - cvt.s32.s8 %r7915, %r7914; - cvt.u32.u16 %r7916, %rs5131; - cvt.s32.s8 %r7917, %r7916; - cvt.u32.u16 %r7918, %rs5130; - cvt.s32.s8 %r7919, %r7918; - mad.lo.s32 %r7920, %r37, %r7919, %r7911; - mad.lo.s32 %r7921, %r38, %r7917, %r7920; - mad.lo.s32 %r7922, %r39, %r7915, %r7921; - mad.lo.s32 %r7923, %r40, %r7913, %r7922; - ld.const.v4.u8 {%rs5138, %rs5139, %rs5140, %rs5141}, [matrix+2568]; - cvt.u32.u16 %r7924, %rs5141; - cvt.s32.s8 %r7925, %r7924; - cvt.u32.u16 %r7926, %rs5140; - cvt.s32.s8 %r7927, %r7926; - cvt.u32.u16 %r7928, %rs5139; - cvt.s32.s8 %r7929, %r7928; - cvt.u32.u16 %r7930, %rs5138; - cvt.s32.s8 %r7931, %r7930; - mad.lo.s32 %r7932, %r42, %r7931, %r7923; - mad.lo.s32 %r7933, %r43, %r7929, %r7932; - mad.lo.s32 %r7934, %r45, %r7927, %r7933; - mad.lo.s32 %r7935, %r46, %r7925, %r7934; - ld.const.v4.u8 {%rs5146, %rs5147, %rs5148, %rs5149}, [matrix+2572]; - cvt.u32.u16 %r7936, %rs5149; - cvt.s32.s8 %r7937, %r7936; - cvt.u32.u16 %r7938, %rs5148; - cvt.s32.s8 %r7939, %r7938; - cvt.u32.u16 %r7940, %rs5147; - cvt.s32.s8 %r7941, %r7940; - cvt.u32.u16 %r7942, %rs5146; - cvt.s32.s8 %r7943, %r7942; - mad.lo.s32 %r7944, %r48, %r7943, %r7935; - mad.lo.s32 %r7945, %r49, %r7941, %r7944; - mad.lo.s32 %r7946, %r50, %r7939, %r7945; - mad.lo.s32 %r7947, %r51, %r7937, %r7946; - ld.const.v4.u8 {%rs5154, %rs5155, %rs5156, %rs5157}, [matrix+2576]; - cvt.u32.u16 %r7948, %rs5157; - cvt.s32.s8 %r7949, %r7948; - cvt.u32.u16 %r7950, %rs5156; - cvt.s32.s8 %r7951, %r7950; - cvt.u32.u16 %r7952, %rs5155; - cvt.s32.s8 %r7953, %r7952; - cvt.u32.u16 %r7954, %rs5154; - cvt.s32.s8 %r7955, %r7954; - mad.lo.s32 %r7956, %r173, %r7955, %r7947; - mad.lo.s32 %r7957, %r53, %r7953, %r7956; - mad.lo.s32 %r7958, %r54, %r7951, %r7957; - mad.lo.s32 %r7959, %r55, %r7949, %r7958; - ld.const.v4.u8 {%rs5162, %rs5163, %rs5164, %rs5165}, [matrix+2580]; - cvt.u32.u16 %r7960, %rs5165; - cvt.s32.s8 %r7961, %r7960; - cvt.u32.u16 %r7962, %rs5164; - cvt.s32.s8 %r7963, %r7962; - cvt.u32.u16 %r7964, %rs5163; - cvt.s32.s8 %r7965, %r7964; - cvt.u32.u16 %r7966, %rs5162; - cvt.s32.s8 %r7967, %r7966; - mad.lo.s32 %r7968, %r56, %r7967, %r7959; - mad.lo.s32 %r7969, %r57, %r7965, %r7968; - mad.lo.s32 %r7970, %r58, %r7963, %r7969; - mad.lo.s32 %r7971, %r59, %r7961, %r7970; - ld.const.v4.u8 {%rs5170, %rs5171, %rs5172, %rs5173}, [matrix+2584]; - cvt.u32.u16 %r7972, %rs5173; - cvt.s32.s8 %r7973, %r7972; - cvt.u32.u16 %r7974, %rs5172; - cvt.s32.s8 %r7975, %r7974; - cvt.u32.u16 %r7976, %rs5171; - cvt.s32.s8 %r7977, %r7976; - cvt.u32.u16 %r7978, %rs5170; - cvt.s32.s8 %r7979, %r7978; - mad.lo.s32 %r7980, %r61, %r7979, %r7971; - mad.lo.s32 %r7981, %r62, %r7977, %r7980; - mad.lo.s32 %r7982, %r64, %r7975, %r7981; - mad.lo.s32 %r7983, %r65, %r7973, %r7982; - ld.const.v4.u8 {%rs5178, %rs5179, %rs5180, %rs5181}, [matrix+2588]; - cvt.u32.u16 %r7984, %rs5181; - cvt.s32.s8 %r7985, %r7984; - cvt.u32.u16 %r7986, %rs5180; - cvt.s32.s8 %r7987, %r7986; - cvt.u32.u16 %r7988, %rs5179; - cvt.s32.s8 %r7989, %r7988; - cvt.u32.u16 %r7990, %rs5178; - cvt.s32.s8 %r7991, %r7990; - mad.lo.s32 %r7992, %r67, %r7991, %r7983; - mad.lo.s32 %r7993, %r68, %r7989, %r7992; - mad.lo.s32 %r7994, %r69, %r7987, %r7993; - mad.lo.s32 %r7995, %r70, %r7985, %r7994; - ld.const.v4.u8 {%rs5186, 
%rs5187, %rs5188, %rs5189}, [matrix+2592]; - cvt.u32.u16 %r7996, %rs5189; - cvt.s32.s8 %r7997, %r7996; - cvt.u32.u16 %r7998, %rs5188; - cvt.s32.s8 %r7999, %r7998; - cvt.u32.u16 %r8000, %rs5187; - cvt.s32.s8 %r8001, %r8000; - cvt.u32.u16 %r8002, %rs5186; - cvt.s32.s8 %r8003, %r8002; - mad.lo.s32 %r8004, %r222, %r8003, %r7995; - mad.lo.s32 %r8005, %r72, %r8001, %r8004; - mad.lo.s32 %r8006, %r73, %r7999, %r8005; - mad.lo.s32 %r8007, %r74, %r7997, %r8006; - ld.const.v4.u8 {%rs5194, %rs5195, %rs5196, %rs5197}, [matrix+2596]; - cvt.u32.u16 %r8008, %rs5197; - cvt.s32.s8 %r8009, %r8008; - cvt.u32.u16 %r8010, %rs5196; - cvt.s32.s8 %r8011, %r8010; - cvt.u32.u16 %r8012, %rs5195; - cvt.s32.s8 %r8013, %r8012; - cvt.u32.u16 %r8014, %rs5194; - cvt.s32.s8 %r8015, %r8014; - mad.lo.s32 %r8016, %r75, %r8015, %r8007; - mad.lo.s32 %r8017, %r76, %r8013, %r8016; - mad.lo.s32 %r8018, %r77, %r8011, %r8017; - mad.lo.s32 %r8019, %r78, %r8009, %r8018; - ld.const.v4.u8 {%rs5202, %rs5203, %rs5204, %rs5205}, [matrix+2600]; - cvt.u32.u16 %r8020, %rs5205; - cvt.s32.s8 %r8021, %r8020; - cvt.u32.u16 %r8022, %rs5204; - cvt.s32.s8 %r8023, %r8022; - cvt.u32.u16 %r8024, %rs5203; - cvt.s32.s8 %r8025, %r8024; - cvt.u32.u16 %r8026, %rs5202; - cvt.s32.s8 %r8027, %r8026; - mad.lo.s32 %r8028, %r80, %r8027, %r8019; - mad.lo.s32 %r8029, %r81, %r8025, %r8028; - mad.lo.s32 %r8030, %r83, %r8023, %r8029; - mad.lo.s32 %r8031, %r84, %r8021, %r8030; - ld.const.v4.u8 {%rs5210, %rs5211, %rs5212, %rs5213}, [matrix+2604]; - cvt.u32.u16 %r8032, %rs5213; - cvt.s32.s8 %r8033, %r8032; - cvt.u32.u16 %r8034, %rs5212; - cvt.s32.s8 %r8035, %r8034; - cvt.u32.u16 %r8036, %rs5211; - cvt.s32.s8 %r8037, %r8036; - cvt.u32.u16 %r8038, %rs5210; - cvt.s32.s8 %r8039, %r8038; - mad.lo.s32 %r8040, %r86, %r8039, %r8031; - mad.lo.s32 %r8041, %r87, %r8037, %r8040; - mad.lo.s32 %r8042, %r88, %r8035, %r8041; - mad.lo.s32 %r8043, %r89, %r8033, %r8042; - ld.const.v4.u8 {%rs5218, %rs5219, %rs5220, %rs5221}, [matrix+2608]; - cvt.u32.u16 %r8044, %rs5221; - cvt.s32.s8 %r8045, %r8044; - cvt.u32.u16 %r8046, %rs5220; - cvt.s32.s8 %r8047, %r8046; - cvt.u32.u16 %r8048, %rs5219; - cvt.s32.s8 %r8049, %r8048; - cvt.u32.u16 %r8050, %rs5218; - cvt.s32.s8 %r8051, %r8050; - mad.lo.s32 %r8052, %r271, %r8051, %r8043; - mad.lo.s32 %r8053, %r91, %r8049, %r8052; - mad.lo.s32 %r8054, %r93, %r8047, %r8053; - mad.lo.s32 %r8055, %r94, %r8045, %r8054; - ld.const.v4.u8 {%rs5226, %rs5227, %rs5228, %rs5229}, [matrix+2612]; - cvt.u32.u16 %r8056, %rs5229; - cvt.s32.s8 %r8057, %r8056; - cvt.u32.u16 %r8058, %rs5228; - cvt.s32.s8 %r8059, %r8058; - cvt.u32.u16 %r8060, %rs5227; - cvt.s32.s8 %r8061, %r8060; - cvt.u32.u16 %r8062, %rs5226; - cvt.s32.s8 %r8063, %r8062; - mad.lo.s32 %r8064, %r96, %r8063, %r8055; - mad.lo.s32 %r8065, %r97, %r8061, %r8064; - mad.lo.s32 %r8066, %r99, %r8059, %r8065; - mad.lo.s32 %r8067, %r100, %r8057, %r8066; - ld.const.v4.u8 {%rs5234, %rs5235, %rs5236, %rs5237}, [matrix+2616]; - cvt.u32.u16 %r8068, %rs5237; - cvt.s32.s8 %r8069, %r8068; - cvt.u32.u16 %r8070, %rs5236; - cvt.s32.s8 %r8071, %r8070; - cvt.u32.u16 %r8072, %rs5235; - cvt.s32.s8 %r8073, %r8072; - cvt.u32.u16 %r8074, %rs5234; - cvt.s32.s8 %r8075, %r8074; - mad.lo.s32 %r8076, %r103, %r8075, %r8067; - mad.lo.s32 %r8077, %r104, %r8073, %r8076; - mad.lo.s32 %r8078, %r107, %r8071, %r8077; - mad.lo.s32 %r8079, %r108, %r8069, %r8078; - ld.const.v4.u8 {%rs5242, %rs5243, %rs5244, %rs5245}, [matrix+2620]; - cvt.u32.u16 %r8080, %rs5245; - cvt.s32.s8 %r8081, %r8080; - cvt.u32.u16 %r8082, %rs5244; - cvt.s32.s8 %r8083, %r8082; - 
cvt.u32.u16 %r8084, %rs5243; - cvt.s32.s8 %r8085, %r8084; - cvt.u32.u16 %r8086, %rs5242; - cvt.s32.s8 %r8087, %r8086; - mad.lo.s32 %r8088, %r111, %r8087, %r8079; - mad.lo.s32 %r8089, %r112, %r8085, %r8088; - mad.lo.s32 %r8090, %r114, %r8083, %r8089; - mad.lo.s32 %r8091, %r115, %r8081, %r8090; - ld.const.v4.u8 {%rs5250, %rs5251, %rs5252, %rs5253}, [matrix+2624]; - cvt.u32.u16 %r8092, %rs5253; - cvt.s32.s8 %r8093, %r8092; - cvt.u32.u16 %r8094, %rs5252; - cvt.s32.s8 %r8095, %r8094; - cvt.u32.u16 %r8096, %rs5250; - cvt.s32.s8 %r8097, %r8096; - cvt.u32.u16 %r8098, %rs5251; - cvt.s32.s8 %r8099, %r8098; - mul.lo.s32 %r8100, %r34, %r8099; - mad.lo.s32 %r8101, %r124, %r8097, %r8100; - mad.lo.s32 %r8102, %r35, %r8095, %r8101; - mad.lo.s32 %r8103, %r36, %r8093, %r8102; - ld.const.v4.u8 {%rs5258, %rs5259, %rs5260, %rs5261}, [matrix+2628]; - cvt.u32.u16 %r8104, %rs5261; - cvt.s32.s8 %r8105, %r8104; - cvt.u32.u16 %r8106, %rs5260; - cvt.s32.s8 %r8107, %r8106; - cvt.u32.u16 %r8108, %rs5259; - cvt.s32.s8 %r8109, %r8108; - cvt.u32.u16 %r8110, %rs5258; - cvt.s32.s8 %r8111, %r8110; - mad.lo.s32 %r8112, %r37, %r8111, %r8103; - mad.lo.s32 %r8113, %r38, %r8109, %r8112; - mad.lo.s32 %r8114, %r39, %r8107, %r8113; - mad.lo.s32 %r8115, %r40, %r8105, %r8114; - ld.const.v4.u8 {%rs5266, %rs5267, %rs5268, %rs5269}, [matrix+2632]; - cvt.u32.u16 %r8116, %rs5269; - cvt.s32.s8 %r8117, %r8116; - cvt.u32.u16 %r8118, %rs5268; - cvt.s32.s8 %r8119, %r8118; - cvt.u32.u16 %r8120, %rs5267; - cvt.s32.s8 %r8121, %r8120; - cvt.u32.u16 %r8122, %rs5266; - cvt.s32.s8 %r8123, %r8122; - mad.lo.s32 %r8124, %r42, %r8123, %r8115; - mad.lo.s32 %r8125, %r43, %r8121, %r8124; - mad.lo.s32 %r8126, %r45, %r8119, %r8125; - mad.lo.s32 %r8127, %r46, %r8117, %r8126; - ld.const.v4.u8 {%rs5274, %rs5275, %rs5276, %rs5277}, [matrix+2636]; - cvt.u32.u16 %r8128, %rs5277; - cvt.s32.s8 %r8129, %r8128; - cvt.u32.u16 %r8130, %rs5276; - cvt.s32.s8 %r8131, %r8130; - cvt.u32.u16 %r8132, %rs5275; - cvt.s32.s8 %r8133, %r8132; - cvt.u32.u16 %r8134, %rs5274; - cvt.s32.s8 %r8135, %r8134; - mad.lo.s32 %r8136, %r48, %r8135, %r8127; - mad.lo.s32 %r8137, %r49, %r8133, %r8136; - mad.lo.s32 %r8138, %r50, %r8131, %r8137; - mad.lo.s32 %r8139, %r51, %r8129, %r8138; - ld.const.v4.u8 {%rs5282, %rs5283, %rs5284, %rs5285}, [matrix+2640]; - cvt.u32.u16 %r8140, %rs5285; - cvt.s32.s8 %r8141, %r8140; - cvt.u32.u16 %r8142, %rs5284; - cvt.s32.s8 %r8143, %r8142; - cvt.u32.u16 %r8144, %rs5283; - cvt.s32.s8 %r8145, %r8144; - cvt.u32.u16 %r8146, %rs5282; - cvt.s32.s8 %r8147, %r8146; - mad.lo.s32 %r8148, %r173, %r8147, %r8139; - mad.lo.s32 %r8149, %r53, %r8145, %r8148; - mad.lo.s32 %r8150, %r54, %r8143, %r8149; - mad.lo.s32 %r8151, %r55, %r8141, %r8150; - ld.const.v4.u8 {%rs5290, %rs5291, %rs5292, %rs5293}, [matrix+2644]; - cvt.u32.u16 %r8152, %rs5293; - cvt.s32.s8 %r8153, %r8152; - cvt.u32.u16 %r8154, %rs5292; - cvt.s32.s8 %r8155, %r8154; - cvt.u32.u16 %r8156, %rs5291; - cvt.s32.s8 %r8157, %r8156; - cvt.u32.u16 %r8158, %rs5290; - cvt.s32.s8 %r8159, %r8158; - mad.lo.s32 %r8160, %r56, %r8159, %r8151; - mad.lo.s32 %r8161, %r57, %r8157, %r8160; - mad.lo.s32 %r8162, %r58, %r8155, %r8161; - mad.lo.s32 %r8163, %r59, %r8153, %r8162; - ld.const.v4.u8 {%rs5298, %rs5299, %rs5300, %rs5301}, [matrix+2648]; - cvt.u32.u16 %r8164, %rs5301; - cvt.s32.s8 %r8165, %r8164; - cvt.u32.u16 %r8166, %rs5300; - cvt.s32.s8 %r8167, %r8166; - cvt.u32.u16 %r8168, %rs5299; - cvt.s32.s8 %r8169, %r8168; - cvt.u32.u16 %r8170, %rs5298; - cvt.s32.s8 %r8171, %r8170; - mad.lo.s32 %r8172, %r61, %r8171, %r8163; - mad.lo.s32 %r8173, 
%r62, %r8169, %r8172; - mad.lo.s32 %r8174, %r64, %r8167, %r8173; - mad.lo.s32 %r8175, %r65, %r8165, %r8174; - ld.const.v4.u8 {%rs5306, %rs5307, %rs5308, %rs5309}, [matrix+2652]; - cvt.u32.u16 %r8176, %rs5309; - cvt.s32.s8 %r8177, %r8176; - cvt.u32.u16 %r8178, %rs5308; - cvt.s32.s8 %r8179, %r8178; - cvt.u32.u16 %r8180, %rs5307; - cvt.s32.s8 %r8181, %r8180; - cvt.u32.u16 %r8182, %rs5306; - cvt.s32.s8 %r8183, %r8182; - mad.lo.s32 %r8184, %r67, %r8183, %r8175; - mad.lo.s32 %r8185, %r68, %r8181, %r8184; - mad.lo.s32 %r8186, %r69, %r8179, %r8185; - mad.lo.s32 %r8187, %r70, %r8177, %r8186; - ld.const.v4.u8 {%rs5314, %rs5315, %rs5316, %rs5317}, [matrix+2656]; - cvt.u32.u16 %r8188, %rs5317; - cvt.s32.s8 %r8189, %r8188; - cvt.u32.u16 %r8190, %rs5316; - cvt.s32.s8 %r8191, %r8190; - cvt.u32.u16 %r8192, %rs5315; - cvt.s32.s8 %r8193, %r8192; - cvt.u32.u16 %r8194, %rs5314; - cvt.s32.s8 %r8195, %r8194; - mad.lo.s32 %r8196, %r222, %r8195, %r8187; - mad.lo.s32 %r8197, %r72, %r8193, %r8196; - mad.lo.s32 %r8198, %r73, %r8191, %r8197; - mad.lo.s32 %r8199, %r74, %r8189, %r8198; - ld.const.v4.u8 {%rs5322, %rs5323, %rs5324, %rs5325}, [matrix+2660]; - cvt.u32.u16 %r8200, %rs5325; - cvt.s32.s8 %r8201, %r8200; - cvt.u32.u16 %r8202, %rs5324; - cvt.s32.s8 %r8203, %r8202; - cvt.u32.u16 %r8204, %rs5323; - cvt.s32.s8 %r8205, %r8204; - cvt.u32.u16 %r8206, %rs5322; - cvt.s32.s8 %r8207, %r8206; - mad.lo.s32 %r8208, %r75, %r8207, %r8199; - mad.lo.s32 %r8209, %r76, %r8205, %r8208; - mad.lo.s32 %r8210, %r77, %r8203, %r8209; - mad.lo.s32 %r8211, %r78, %r8201, %r8210; - ld.const.v4.u8 {%rs5330, %rs5331, %rs5332, %rs5333}, [matrix+2664]; - cvt.u32.u16 %r8212, %rs5333; - cvt.s32.s8 %r8213, %r8212; - cvt.u32.u16 %r8214, %rs5332; - cvt.s32.s8 %r8215, %r8214; - cvt.u32.u16 %r8216, %rs5331; - cvt.s32.s8 %r8217, %r8216; - cvt.u32.u16 %r8218, %rs5330; - cvt.s32.s8 %r8219, %r8218; - mad.lo.s32 %r8220, %r80, %r8219, %r8211; - mad.lo.s32 %r8221, %r81, %r8217, %r8220; - mad.lo.s32 %r8222, %r83, %r8215, %r8221; - mad.lo.s32 %r8223, %r84, %r8213, %r8222; - ld.const.v4.u8 {%rs5338, %rs5339, %rs5340, %rs5341}, [matrix+2668]; - cvt.u32.u16 %r8224, %rs5341; - cvt.s32.s8 %r8225, %r8224; - cvt.u32.u16 %r8226, %rs5340; - cvt.s32.s8 %r8227, %r8226; - cvt.u32.u16 %r8228, %rs5339; - cvt.s32.s8 %r8229, %r8228; - cvt.u32.u16 %r8230, %rs5338; - cvt.s32.s8 %r8231, %r8230; - mad.lo.s32 %r8232, %r86, %r8231, %r8223; - mad.lo.s32 %r8233, %r87, %r8229, %r8232; - mad.lo.s32 %r8234, %r88, %r8227, %r8233; - mad.lo.s32 %r8235, %r89, %r8225, %r8234; - ld.const.v4.u8 {%rs5346, %rs5347, %rs5348, %rs5349}, [matrix+2672]; - cvt.u32.u16 %r8236, %rs5349; - cvt.s32.s8 %r8237, %r8236; - cvt.u32.u16 %r8238, %rs5348; - cvt.s32.s8 %r8239, %r8238; - cvt.u32.u16 %r8240, %rs5347; - cvt.s32.s8 %r8241, %r8240; - cvt.u32.u16 %r8242, %rs5346; - cvt.s32.s8 %r8243, %r8242; - mad.lo.s32 %r8244, %r271, %r8243, %r8235; - mad.lo.s32 %r8245, %r91, %r8241, %r8244; - mad.lo.s32 %r8246, %r93, %r8239, %r8245; - mad.lo.s32 %r8247, %r94, %r8237, %r8246; - ld.const.v4.u8 {%rs5354, %rs5355, %rs5356, %rs5357}, [matrix+2676]; - cvt.u32.u16 %r8248, %rs5357; - cvt.s32.s8 %r8249, %r8248; - cvt.u32.u16 %r8250, %rs5356; - cvt.s32.s8 %r8251, %r8250; - cvt.u32.u16 %r8252, %rs5355; - cvt.s32.s8 %r8253, %r8252; - cvt.u32.u16 %r8254, %rs5354; - cvt.s32.s8 %r8255, %r8254; - mad.lo.s32 %r8256, %r96, %r8255, %r8247; - mad.lo.s32 %r8257, %r97, %r8253, %r8256; - mad.lo.s32 %r8258, %r99, %r8251, %r8257; - mad.lo.s32 %r8259, %r100, %r8249, %r8258; - ld.const.v4.u8 {%rs5362, %rs5363, %rs5364, %rs5365}, [matrix+2680]; 
- cvt.u32.u16 %r8260, %rs5365; - cvt.s32.s8 %r8261, %r8260; - cvt.u32.u16 %r8262, %rs5364; - cvt.s32.s8 %r8263, %r8262; - cvt.u32.u16 %r8264, %rs5363; - cvt.s32.s8 %r8265, %r8264; - cvt.u32.u16 %r8266, %rs5362; - cvt.s32.s8 %r8267, %r8266; - mad.lo.s32 %r8268, %r103, %r8267, %r8259; - mad.lo.s32 %r8269, %r104, %r8265, %r8268; - mad.lo.s32 %r8270, %r107, %r8263, %r8269; - mad.lo.s32 %r8271, %r108, %r8261, %r8270; - ld.const.v4.u8 {%rs5370, %rs5371, %rs5372, %rs5373}, [matrix+2684]; - cvt.u32.u16 %r8272, %rs5373; - cvt.s32.s8 %r8273, %r8272; - cvt.u32.u16 %r8274, %rs5372; - cvt.s32.s8 %r8275, %r8274; - cvt.u32.u16 %r8276, %rs5371; - cvt.s32.s8 %r8277, %r8276; - cvt.u32.u16 %r8278, %rs5370; - cvt.s32.s8 %r8279, %r8278; - mad.lo.s32 %r8280, %r111, %r8279, %r8271; - mad.lo.s32 %r8281, %r112, %r8277, %r8280; - mad.lo.s32 %r8282, %r114, %r8275, %r8281; - mad.lo.s32 %r8283, %r115, %r8273, %r8282; - shr.u32 %r8284, %r8091, 6; - and.b32 %r8285, %r8284, 240; - shr.u32 %r8286, %r8283, 10; - or.b32 %r8287, %r8286, %r8285; - xor.b32 %r8288, %r29, %r8287; - cvt.u64.u32 %rd396, %r8288; - ld.const.v4.u8 {%rs5378, %rs5379, %rs5380, %rs5381}, [matrix+2688]; - cvt.u32.u16 %r8289, %rs5381; - cvt.s32.s8 %r8290, %r8289; - cvt.u32.u16 %r8291, %rs5380; - cvt.s32.s8 %r8292, %r8291; - cvt.u32.u16 %r8293, %rs5378; - cvt.s32.s8 %r8294, %r8293; - cvt.u32.u16 %r8295, %rs5379; - cvt.s32.s8 %r8296, %r8295; - mul.lo.s32 %r8297, %r34, %r8296; - mad.lo.s32 %r8298, %r124, %r8294, %r8297; - mad.lo.s32 %r8299, %r35, %r8292, %r8298; - mad.lo.s32 %r8300, %r36, %r8290, %r8299; - ld.const.v4.u8 {%rs5386, %rs5387, %rs5388, %rs5389}, [matrix+2692]; - cvt.u32.u16 %r8301, %rs5389; - cvt.s32.s8 %r8302, %r8301; - cvt.u32.u16 %r8303, %rs5388; - cvt.s32.s8 %r8304, %r8303; - cvt.u32.u16 %r8305, %rs5387; - cvt.s32.s8 %r8306, %r8305; - cvt.u32.u16 %r8307, %rs5386; - cvt.s32.s8 %r8308, %r8307; - mad.lo.s32 %r8309, %r37, %r8308, %r8300; - mad.lo.s32 %r8310, %r38, %r8306, %r8309; - mad.lo.s32 %r8311, %r39, %r8304, %r8310; - mad.lo.s32 %r8312, %r40, %r8302, %r8311; - ld.const.v4.u8 {%rs5394, %rs5395, %rs5396, %rs5397}, [matrix+2696]; - cvt.u32.u16 %r8313, %rs5397; - cvt.s32.s8 %r8314, %r8313; - cvt.u32.u16 %r8315, %rs5396; - cvt.s32.s8 %r8316, %r8315; - cvt.u32.u16 %r8317, %rs5395; - cvt.s32.s8 %r8318, %r8317; - cvt.u32.u16 %r8319, %rs5394; - cvt.s32.s8 %r8320, %r8319; - mad.lo.s32 %r8321, %r42, %r8320, %r8312; - mad.lo.s32 %r8322, %r43, %r8318, %r8321; - mad.lo.s32 %r8323, %r45, %r8316, %r8322; - mad.lo.s32 %r8324, %r46, %r8314, %r8323; - ld.const.v4.u8 {%rs5402, %rs5403, %rs5404, %rs5405}, [matrix+2700]; - cvt.u32.u16 %r8325, %rs5405; - cvt.s32.s8 %r8326, %r8325; - cvt.u32.u16 %r8327, %rs5404; - cvt.s32.s8 %r8328, %r8327; - cvt.u32.u16 %r8329, %rs5403; - cvt.s32.s8 %r8330, %r8329; - cvt.u32.u16 %r8331, %rs5402; - cvt.s32.s8 %r8332, %r8331; - mad.lo.s32 %r8333, %r48, %r8332, %r8324; - mad.lo.s32 %r8334, %r49, %r8330, %r8333; - mad.lo.s32 %r8335, %r50, %r8328, %r8334; - mad.lo.s32 %r8336, %r51, %r8326, %r8335; - ld.const.v4.u8 {%rs5410, %rs5411, %rs5412, %rs5413}, [matrix+2704]; - cvt.u32.u16 %r8337, %rs5413; - cvt.s32.s8 %r8338, %r8337; - cvt.u32.u16 %r8339, %rs5412; - cvt.s32.s8 %r8340, %r8339; - cvt.u32.u16 %r8341, %rs5411; - cvt.s32.s8 %r8342, %r8341; - cvt.u32.u16 %r8343, %rs5410; - cvt.s32.s8 %r8344, %r8343; - mad.lo.s32 %r8345, %r173, %r8344, %r8336; - mad.lo.s32 %r8346, %r53, %r8342, %r8345; - mad.lo.s32 %r8347, %r54, %r8340, %r8346; - mad.lo.s32 %r8348, %r55, %r8338, %r8347; - ld.const.v4.u8 {%rs5418, %rs5419, %rs5420, %rs5421}, 
[matrix+2708]; - cvt.u32.u16 %r8349, %rs5421; - cvt.s32.s8 %r8350, %r8349; - cvt.u32.u16 %r8351, %rs5420; - cvt.s32.s8 %r8352, %r8351; - cvt.u32.u16 %r8353, %rs5419; - cvt.s32.s8 %r8354, %r8353; - cvt.u32.u16 %r8355, %rs5418; - cvt.s32.s8 %r8356, %r8355; - mad.lo.s32 %r8357, %r56, %r8356, %r8348; - mad.lo.s32 %r8358, %r57, %r8354, %r8357; - mad.lo.s32 %r8359, %r58, %r8352, %r8358; - mad.lo.s32 %r8360, %r59, %r8350, %r8359; - ld.const.v4.u8 {%rs5426, %rs5427, %rs5428, %rs5429}, [matrix+2712]; - cvt.u32.u16 %r8361, %rs5429; - cvt.s32.s8 %r8362, %r8361; - cvt.u32.u16 %r8363, %rs5428; - cvt.s32.s8 %r8364, %r8363; - cvt.u32.u16 %r8365, %rs5427; - cvt.s32.s8 %r8366, %r8365; - cvt.u32.u16 %r8367, %rs5426; - cvt.s32.s8 %r8368, %r8367; - mad.lo.s32 %r8369, %r61, %r8368, %r8360; - mad.lo.s32 %r8370, %r62, %r8366, %r8369; - mad.lo.s32 %r8371, %r64, %r8364, %r8370; - mad.lo.s32 %r8372, %r65, %r8362, %r8371; - ld.const.v4.u8 {%rs5434, %rs5435, %rs5436, %rs5437}, [matrix+2716]; - cvt.u32.u16 %r8373, %rs5437; - cvt.s32.s8 %r8374, %r8373; - cvt.u32.u16 %r8375, %rs5436; - cvt.s32.s8 %r8376, %r8375; - cvt.u32.u16 %r8377, %rs5435; - cvt.s32.s8 %r8378, %r8377; - cvt.u32.u16 %r8379, %rs5434; - cvt.s32.s8 %r8380, %r8379; - mad.lo.s32 %r8381, %r67, %r8380, %r8372; - mad.lo.s32 %r8382, %r68, %r8378, %r8381; - mad.lo.s32 %r8383, %r69, %r8376, %r8382; - mad.lo.s32 %r8384, %r70, %r8374, %r8383; - ld.const.v4.u8 {%rs5442, %rs5443, %rs5444, %rs5445}, [matrix+2720]; - cvt.u32.u16 %r8385, %rs5445; - cvt.s32.s8 %r8386, %r8385; - cvt.u32.u16 %r8387, %rs5444; - cvt.s32.s8 %r8388, %r8387; - cvt.u32.u16 %r8389, %rs5443; - cvt.s32.s8 %r8390, %r8389; - cvt.u32.u16 %r8391, %rs5442; - cvt.s32.s8 %r8392, %r8391; - mad.lo.s32 %r8393, %r222, %r8392, %r8384; - mad.lo.s32 %r8394, %r72, %r8390, %r8393; - mad.lo.s32 %r8395, %r73, %r8388, %r8394; - mad.lo.s32 %r8396, %r74, %r8386, %r8395; - ld.const.v4.u8 {%rs5450, %rs5451, %rs5452, %rs5453}, [matrix+2724]; - cvt.u32.u16 %r8397, %rs5453; - cvt.s32.s8 %r8398, %r8397; - cvt.u32.u16 %r8399, %rs5452; - cvt.s32.s8 %r8400, %r8399; - cvt.u32.u16 %r8401, %rs5451; - cvt.s32.s8 %r8402, %r8401; - cvt.u32.u16 %r8403, %rs5450; - cvt.s32.s8 %r8404, %r8403; - mad.lo.s32 %r8405, %r75, %r8404, %r8396; - mad.lo.s32 %r8406, %r76, %r8402, %r8405; - mad.lo.s32 %r8407, %r77, %r8400, %r8406; - mad.lo.s32 %r8408, %r78, %r8398, %r8407; - ld.const.v4.u8 {%rs5458, %rs5459, %rs5460, %rs5461}, [matrix+2728]; - cvt.u32.u16 %r8409, %rs5461; - cvt.s32.s8 %r8410, %r8409; - cvt.u32.u16 %r8411, %rs5460; - cvt.s32.s8 %r8412, %r8411; - cvt.u32.u16 %r8413, %rs5459; - cvt.s32.s8 %r8414, %r8413; - cvt.u32.u16 %r8415, %rs5458; - cvt.s32.s8 %r8416, %r8415; - mad.lo.s32 %r8417, %r80, %r8416, %r8408; - mad.lo.s32 %r8418, %r81, %r8414, %r8417; - mad.lo.s32 %r8419, %r83, %r8412, %r8418; - mad.lo.s32 %r8420, %r84, %r8410, %r8419; - ld.const.v4.u8 {%rs5466, %rs5467, %rs5468, %rs5469}, [matrix+2732]; - cvt.u32.u16 %r8421, %rs5469; - cvt.s32.s8 %r8422, %r8421; - cvt.u32.u16 %r8423, %rs5468; - cvt.s32.s8 %r8424, %r8423; - cvt.u32.u16 %r8425, %rs5467; - cvt.s32.s8 %r8426, %r8425; - cvt.u32.u16 %r8427, %rs5466; - cvt.s32.s8 %r8428, %r8427; - mad.lo.s32 %r8429, %r86, %r8428, %r8420; - mad.lo.s32 %r8430, %r87, %r8426, %r8429; - mad.lo.s32 %r8431, %r88, %r8424, %r8430; - mad.lo.s32 %r8432, %r89, %r8422, %r8431; - ld.const.v4.u8 {%rs5474, %rs5475, %rs5476, %rs5477}, [matrix+2736]; - cvt.u32.u16 %r8433, %rs5477; - cvt.s32.s8 %r8434, %r8433; - cvt.u32.u16 %r8435, %rs5476; - cvt.s32.s8 %r8436, %r8435; - cvt.u32.u16 %r8437, %rs5475; - cvt.s32.s8 
%r8438, %r8437; - cvt.u32.u16 %r8439, %rs5474; - cvt.s32.s8 %r8440, %r8439; - mad.lo.s32 %r8441, %r271, %r8440, %r8432; - mad.lo.s32 %r8442, %r91, %r8438, %r8441; - mad.lo.s32 %r8443, %r93, %r8436, %r8442; - mad.lo.s32 %r8444, %r94, %r8434, %r8443; - ld.const.v4.u8 {%rs5482, %rs5483, %rs5484, %rs5485}, [matrix+2740]; - cvt.u32.u16 %r8445, %rs5485; - cvt.s32.s8 %r8446, %r8445; - cvt.u32.u16 %r8447, %rs5484; - cvt.s32.s8 %r8448, %r8447; - cvt.u32.u16 %r8449, %rs5483; - cvt.s32.s8 %r8450, %r8449; - cvt.u32.u16 %r8451, %rs5482; - cvt.s32.s8 %r8452, %r8451; - mad.lo.s32 %r8453, %r96, %r8452, %r8444; - mad.lo.s32 %r8454, %r97, %r8450, %r8453; - mad.lo.s32 %r8455, %r99, %r8448, %r8454; - mad.lo.s32 %r8456, %r100, %r8446, %r8455; - ld.const.v4.u8 {%rs5490, %rs5491, %rs5492, %rs5493}, [matrix+2744]; - cvt.u32.u16 %r8457, %rs5493; - cvt.s32.s8 %r8458, %r8457; - cvt.u32.u16 %r8459, %rs5492; - cvt.s32.s8 %r8460, %r8459; - cvt.u32.u16 %r8461, %rs5491; - cvt.s32.s8 %r8462, %r8461; - cvt.u32.u16 %r8463, %rs5490; - cvt.s32.s8 %r8464, %r8463; - mad.lo.s32 %r8465, %r103, %r8464, %r8456; - mad.lo.s32 %r8466, %r104, %r8462, %r8465; - mad.lo.s32 %r8467, %r107, %r8460, %r8466; - mad.lo.s32 %r8468, %r108, %r8458, %r8467; - ld.const.v4.u8 {%rs5498, %rs5499, %rs5500, %rs5501}, [matrix+2748]; - cvt.u32.u16 %r8469, %rs5501; - cvt.s32.s8 %r8470, %r8469; - cvt.u32.u16 %r8471, %rs5500; - cvt.s32.s8 %r8472, %r8471; - cvt.u32.u16 %r8473, %rs5499; - cvt.s32.s8 %r8474, %r8473; - cvt.u32.u16 %r8475, %rs5498; - cvt.s32.s8 %r8476, %r8475; - mad.lo.s32 %r8477, %r111, %r8476, %r8468; - mad.lo.s32 %r8478, %r112, %r8474, %r8477; - mad.lo.s32 %r8479, %r114, %r8472, %r8478; - mad.lo.s32 %r8480, %r115, %r8470, %r8479; - ld.const.v4.u8 {%rs5506, %rs5507, %rs5508, %rs5509}, [matrix+2752]; - cvt.u32.u16 %r8481, %rs5509; - cvt.s32.s8 %r8482, %r8481; - cvt.u32.u16 %r8483, %rs5508; - cvt.s32.s8 %r8484, %r8483; - cvt.u32.u16 %r8485, %rs5506; - cvt.s32.s8 %r8486, %r8485; - cvt.u32.u16 %r8487, %rs5507; - cvt.s32.s8 %r8488, %r8487; - mul.lo.s32 %r8489, %r34, %r8488; - mad.lo.s32 %r8490, %r124, %r8486, %r8489; - mad.lo.s32 %r8491, %r35, %r8484, %r8490; - mad.lo.s32 %r8492, %r36, %r8482, %r8491; - ld.const.v4.u8 {%rs5514, %rs5515, %rs5516, %rs5517}, [matrix+2756]; - cvt.u32.u16 %r8493, %rs5517; - cvt.s32.s8 %r8494, %r8493; - cvt.u32.u16 %r8495, %rs5516; - cvt.s32.s8 %r8496, %r8495; - cvt.u32.u16 %r8497, %rs5515; - cvt.s32.s8 %r8498, %r8497; - cvt.u32.u16 %r8499, %rs5514; - cvt.s32.s8 %r8500, %r8499; - mad.lo.s32 %r8501, %r37, %r8500, %r8492; - mad.lo.s32 %r8502, %r38, %r8498, %r8501; - mad.lo.s32 %r8503, %r39, %r8496, %r8502; - mad.lo.s32 %r8504, %r40, %r8494, %r8503; - ld.const.v4.u8 {%rs5522, %rs5523, %rs5524, %rs5525}, [matrix+2760]; - cvt.u32.u16 %r8505, %rs5525; - cvt.s32.s8 %r8506, %r8505; - cvt.u32.u16 %r8507, %rs5524; - cvt.s32.s8 %r8508, %r8507; - cvt.u32.u16 %r8509, %rs5523; - cvt.s32.s8 %r8510, %r8509; - cvt.u32.u16 %r8511, %rs5522; - cvt.s32.s8 %r8512, %r8511; - mad.lo.s32 %r8513, %r42, %r8512, %r8504; - mad.lo.s32 %r8514, %r43, %r8510, %r8513; - mad.lo.s32 %r8515, %r45, %r8508, %r8514; - mad.lo.s32 %r8516, %r46, %r8506, %r8515; - ld.const.v4.u8 {%rs5530, %rs5531, %rs5532, %rs5533}, [matrix+2764]; - cvt.u32.u16 %r8517, %rs5533; - cvt.s32.s8 %r8518, %r8517; - cvt.u32.u16 %r8519, %rs5532; - cvt.s32.s8 %r8520, %r8519; - cvt.u32.u16 %r8521, %rs5531; - cvt.s32.s8 %r8522, %r8521; - cvt.u32.u16 %r8523, %rs5530; - cvt.s32.s8 %r8524, %r8523; - mad.lo.s32 %r8525, %r48, %r8524, %r8516; - mad.lo.s32 %r8526, %r49, %r8522, %r8525; - mad.lo.s32 
%r8527, %r50, %r8520, %r8526; - mad.lo.s32 %r8528, %r51, %r8518, %r8527; - ld.const.v4.u8 {%rs5538, %rs5539, %rs5540, %rs5541}, [matrix+2768]; - cvt.u32.u16 %r8529, %rs5541; - cvt.s32.s8 %r8530, %r8529; - cvt.u32.u16 %r8531, %rs5540; - cvt.s32.s8 %r8532, %r8531; - cvt.u32.u16 %r8533, %rs5539; - cvt.s32.s8 %r8534, %r8533; - cvt.u32.u16 %r8535, %rs5538; - cvt.s32.s8 %r8536, %r8535; - mad.lo.s32 %r8537, %r173, %r8536, %r8528; - mad.lo.s32 %r8538, %r53, %r8534, %r8537; - mad.lo.s32 %r8539, %r54, %r8532, %r8538; - mad.lo.s32 %r8540, %r55, %r8530, %r8539; - ld.const.v4.u8 {%rs5546, %rs5547, %rs5548, %rs5549}, [matrix+2772]; - cvt.u32.u16 %r8541, %rs5549; - cvt.s32.s8 %r8542, %r8541; - cvt.u32.u16 %r8543, %rs5548; - cvt.s32.s8 %r8544, %r8543; - cvt.u32.u16 %r8545, %rs5547; - cvt.s32.s8 %r8546, %r8545; - cvt.u32.u16 %r8547, %rs5546; - cvt.s32.s8 %r8548, %r8547; - mad.lo.s32 %r8549, %r56, %r8548, %r8540; - mad.lo.s32 %r8550, %r57, %r8546, %r8549; - mad.lo.s32 %r8551, %r58, %r8544, %r8550; - mad.lo.s32 %r8552, %r59, %r8542, %r8551; - ld.const.v4.u8 {%rs5554, %rs5555, %rs5556, %rs5557}, [matrix+2776]; - cvt.u32.u16 %r8553, %rs5557; - cvt.s32.s8 %r8554, %r8553; - cvt.u32.u16 %r8555, %rs5556; - cvt.s32.s8 %r8556, %r8555; - cvt.u32.u16 %r8557, %rs5555; - cvt.s32.s8 %r8558, %r8557; - cvt.u32.u16 %r8559, %rs5554; - cvt.s32.s8 %r8560, %r8559; - mad.lo.s32 %r8561, %r61, %r8560, %r8552; - mad.lo.s32 %r8562, %r62, %r8558, %r8561; - mad.lo.s32 %r8563, %r64, %r8556, %r8562; - mad.lo.s32 %r8564, %r65, %r8554, %r8563; - ld.const.v4.u8 {%rs5562, %rs5563, %rs5564, %rs5565}, [matrix+2780]; - cvt.u32.u16 %r8565, %rs5565; - cvt.s32.s8 %r8566, %r8565; - cvt.u32.u16 %r8567, %rs5564; - cvt.s32.s8 %r8568, %r8567; - cvt.u32.u16 %r8569, %rs5563; - cvt.s32.s8 %r8570, %r8569; - cvt.u32.u16 %r8571, %rs5562; - cvt.s32.s8 %r8572, %r8571; - mad.lo.s32 %r8573, %r67, %r8572, %r8564; - mad.lo.s32 %r8574, %r68, %r8570, %r8573; - mad.lo.s32 %r8575, %r69, %r8568, %r8574; - mad.lo.s32 %r8576, %r70, %r8566, %r8575; - ld.const.v4.u8 {%rs5570, %rs5571, %rs5572, %rs5573}, [matrix+2784]; - cvt.u32.u16 %r8577, %rs5573; - cvt.s32.s8 %r8578, %r8577; - cvt.u32.u16 %r8579, %rs5572; - cvt.s32.s8 %r8580, %r8579; - cvt.u32.u16 %r8581, %rs5571; - cvt.s32.s8 %r8582, %r8581; - cvt.u32.u16 %r8583, %rs5570; - cvt.s32.s8 %r8584, %r8583; - mad.lo.s32 %r8585, %r222, %r8584, %r8576; - mad.lo.s32 %r8586, %r72, %r8582, %r8585; - mad.lo.s32 %r8587, %r73, %r8580, %r8586; - mad.lo.s32 %r8588, %r74, %r8578, %r8587; - ld.const.v4.u8 {%rs5578, %rs5579, %rs5580, %rs5581}, [matrix+2788]; - cvt.u32.u16 %r8589, %rs5581; - cvt.s32.s8 %r8590, %r8589; - cvt.u32.u16 %r8591, %rs5580; - cvt.s32.s8 %r8592, %r8591; - cvt.u32.u16 %r8593, %rs5579; - cvt.s32.s8 %r8594, %r8593; - cvt.u32.u16 %r8595, %rs5578; - cvt.s32.s8 %r8596, %r8595; - mad.lo.s32 %r8597, %r75, %r8596, %r8588; - mad.lo.s32 %r8598, %r76, %r8594, %r8597; - mad.lo.s32 %r8599, %r77, %r8592, %r8598; - mad.lo.s32 %r8600, %r78, %r8590, %r8599; - ld.const.v4.u8 {%rs5586, %rs5587, %rs5588, %rs5589}, [matrix+2792]; - cvt.u32.u16 %r8601, %rs5589; - cvt.s32.s8 %r8602, %r8601; - cvt.u32.u16 %r8603, %rs5588; - cvt.s32.s8 %r8604, %r8603; - cvt.u32.u16 %r8605, %rs5587; - cvt.s32.s8 %r8606, %r8605; - cvt.u32.u16 %r8607, %rs5586; - cvt.s32.s8 %r8608, %r8607; - mad.lo.s32 %r8609, %r80, %r8608, %r8600; - mad.lo.s32 %r8610, %r81, %r8606, %r8609; - mad.lo.s32 %r8611, %r83, %r8604, %r8610; - mad.lo.s32 %r8612, %r84, %r8602, %r8611; - ld.const.v4.u8 {%rs5594, %rs5595, %rs5596, %rs5597}, [matrix+2796]; - cvt.u32.u16 %r8613, %rs5597; - 
cvt.s32.s8 %r8614, %r8613; - cvt.u32.u16 %r8615, %rs5596; - cvt.s32.s8 %r8616, %r8615; - cvt.u32.u16 %r8617, %rs5595; - cvt.s32.s8 %r8618, %r8617; - cvt.u32.u16 %r8619, %rs5594; - cvt.s32.s8 %r8620, %r8619; - mad.lo.s32 %r8621, %r86, %r8620, %r8612; - mad.lo.s32 %r8622, %r87, %r8618, %r8621; - mad.lo.s32 %r8623, %r88, %r8616, %r8622; - mad.lo.s32 %r8624, %r89, %r8614, %r8623; - ld.const.v4.u8 {%rs5602, %rs5603, %rs5604, %rs5605}, [matrix+2800]; - cvt.u32.u16 %r8625, %rs5605; - cvt.s32.s8 %r8626, %r8625; - cvt.u32.u16 %r8627, %rs5604; - cvt.s32.s8 %r8628, %r8627; - cvt.u32.u16 %r8629, %rs5603; - cvt.s32.s8 %r8630, %r8629; - cvt.u32.u16 %r8631, %rs5602; - cvt.s32.s8 %r8632, %r8631; - mad.lo.s32 %r8633, %r271, %r8632, %r8624; - mad.lo.s32 %r8634, %r91, %r8630, %r8633; - mad.lo.s32 %r8635, %r93, %r8628, %r8634; - mad.lo.s32 %r8636, %r94, %r8626, %r8635; - ld.const.v4.u8 {%rs5610, %rs5611, %rs5612, %rs5613}, [matrix+2804]; - cvt.u32.u16 %r8637, %rs5613; - cvt.s32.s8 %r8638, %r8637; - cvt.u32.u16 %r8639, %rs5612; - cvt.s32.s8 %r8640, %r8639; - cvt.u32.u16 %r8641, %rs5611; - cvt.s32.s8 %r8642, %r8641; - cvt.u32.u16 %r8643, %rs5610; - cvt.s32.s8 %r8644, %r8643; - mad.lo.s32 %r8645, %r96, %r8644, %r8636; - mad.lo.s32 %r8646, %r97, %r8642, %r8645; - mad.lo.s32 %r8647, %r99, %r8640, %r8646; - mad.lo.s32 %r8648, %r100, %r8638, %r8647; - ld.const.v4.u8 {%rs5618, %rs5619, %rs5620, %rs5621}, [matrix+2808]; - cvt.u32.u16 %r8649, %rs5621; - cvt.s32.s8 %r8650, %r8649; - cvt.u32.u16 %r8651, %rs5620; - cvt.s32.s8 %r8652, %r8651; - cvt.u32.u16 %r8653, %rs5619; - cvt.s32.s8 %r8654, %r8653; - cvt.u32.u16 %r8655, %rs5618; - cvt.s32.s8 %r8656, %r8655; - mad.lo.s32 %r8657, %r103, %r8656, %r8648; - mad.lo.s32 %r8658, %r104, %r8654, %r8657; - mad.lo.s32 %r8659, %r107, %r8652, %r8658; - mad.lo.s32 %r8660, %r108, %r8650, %r8659; - ld.const.v4.u8 {%rs5626, %rs5627, %rs5628, %rs5629}, [matrix+2812]; - cvt.u32.u16 %r8661, %rs5629; - cvt.s32.s8 %r8662, %r8661; - cvt.u32.u16 %r8663, %rs5628; - cvt.s32.s8 %r8664, %r8663; - cvt.u32.u16 %r8665, %rs5627; - cvt.s32.s8 %r8666, %r8665; - cvt.u32.u16 %r8667, %rs5626; - cvt.s32.s8 %r8668, %r8667; - mad.lo.s32 %r8669, %r111, %r8668, %r8660; - mad.lo.s32 %r8670, %r112, %r8666, %r8669; - mad.lo.s32 %r8671, %r114, %r8664, %r8670; - mad.lo.s32 %r8672, %r115, %r8662, %r8671; - shr.u32 %r8673, %r8480, 6; - and.b32 %r8674, %r8673, 240; - shr.u32 %r8675, %r8672, 10; - or.b32 %r8676, %r8675, %r8674; - xor.b32 %r8677, %r30, %r8676; - cvt.u64.u32 %rd397, %r8677; - ld.const.v4.u8 {%rs5634, %rs5635, %rs5636, %rs5637}, [matrix+2816]; - cvt.u32.u16 %r8678, %rs5637; - cvt.s32.s8 %r8679, %r8678; - cvt.u32.u16 %r8680, %rs5636; - cvt.s32.s8 %r8681, %r8680; - cvt.u32.u16 %r8682, %rs5634; - cvt.s32.s8 %r8683, %r8682; - cvt.u32.u16 %r8684, %rs5635; - cvt.s32.s8 %r8685, %r8684; - mul.lo.s32 %r8686, %r34, %r8685; - mad.lo.s32 %r8687, %r124, %r8683, %r8686; - mad.lo.s32 %r8688, %r35, %r8681, %r8687; - mad.lo.s32 %r8689, %r36, %r8679, %r8688; - ld.const.v4.u8 {%rs5642, %rs5643, %rs5644, %rs5645}, [matrix+2820]; - cvt.u32.u16 %r8690, %rs5645; - cvt.s32.s8 %r8691, %r8690; - cvt.u32.u16 %r8692, %rs5644; - cvt.s32.s8 %r8693, %r8692; - cvt.u32.u16 %r8694, %rs5643; - cvt.s32.s8 %r8695, %r8694; - cvt.u32.u16 %r8696, %rs5642; - cvt.s32.s8 %r8697, %r8696; - mad.lo.s32 %r8698, %r37, %r8697, %r8689; - mad.lo.s32 %r8699, %r38, %r8695, %r8698; - mad.lo.s32 %r8700, %r39, %r8693, %r8699; - mad.lo.s32 %r8701, %r40, %r8691, %r8700; - ld.const.v4.u8 {%rs5650, %rs5651, %rs5652, %rs5653}, [matrix+2824]; - cvt.u32.u16 %r8702, 
%rs5653; - cvt.s32.s8 %r8703, %r8702; - cvt.u32.u16 %r8704, %rs5652; - cvt.s32.s8 %r8705, %r8704; - cvt.u32.u16 %r8706, %rs5651; - cvt.s32.s8 %r8707, %r8706; - cvt.u32.u16 %r8708, %rs5650; - cvt.s32.s8 %r8709, %r8708; - mad.lo.s32 %r8710, %r42, %r8709, %r8701; - mad.lo.s32 %r8711, %r43, %r8707, %r8710; - mad.lo.s32 %r8712, %r45, %r8705, %r8711; - mad.lo.s32 %r8713, %r46, %r8703, %r8712; - ld.const.v4.u8 {%rs5658, %rs5659, %rs5660, %rs5661}, [matrix+2828]; - cvt.u32.u16 %r8714, %rs5661; - cvt.s32.s8 %r8715, %r8714; - cvt.u32.u16 %r8716, %rs5660; - cvt.s32.s8 %r8717, %r8716; - cvt.u32.u16 %r8718, %rs5659; - cvt.s32.s8 %r8719, %r8718; - cvt.u32.u16 %r8720, %rs5658; - cvt.s32.s8 %r8721, %r8720; - mad.lo.s32 %r8722, %r48, %r8721, %r8713; - mad.lo.s32 %r8723, %r49, %r8719, %r8722; - mad.lo.s32 %r8724, %r50, %r8717, %r8723; - mad.lo.s32 %r8725, %r51, %r8715, %r8724; - ld.const.v4.u8 {%rs5666, %rs5667, %rs5668, %rs5669}, [matrix+2832]; - cvt.u32.u16 %r8726, %rs5669; - cvt.s32.s8 %r8727, %r8726; - cvt.u32.u16 %r8728, %rs5668; - cvt.s32.s8 %r8729, %r8728; - cvt.u32.u16 %r8730, %rs5667; - cvt.s32.s8 %r8731, %r8730; - cvt.u32.u16 %r8732, %rs5666; - cvt.s32.s8 %r8733, %r8732; - mad.lo.s32 %r8734, %r173, %r8733, %r8725; - mad.lo.s32 %r8735, %r53, %r8731, %r8734; - mad.lo.s32 %r8736, %r54, %r8729, %r8735; - mad.lo.s32 %r8737, %r55, %r8727, %r8736; - ld.const.v4.u8 {%rs5674, %rs5675, %rs5676, %rs5677}, [matrix+2836]; - cvt.u32.u16 %r8738, %rs5677; - cvt.s32.s8 %r8739, %r8738; - cvt.u32.u16 %r8740, %rs5676; - cvt.s32.s8 %r8741, %r8740; - cvt.u32.u16 %r8742, %rs5675; - cvt.s32.s8 %r8743, %r8742; - cvt.u32.u16 %r8744, %rs5674; - cvt.s32.s8 %r8745, %r8744; - mad.lo.s32 %r8746, %r56, %r8745, %r8737; - mad.lo.s32 %r8747, %r57, %r8743, %r8746; - mad.lo.s32 %r8748, %r58, %r8741, %r8747; - mad.lo.s32 %r8749, %r59, %r8739, %r8748; - ld.const.v4.u8 {%rs5682, %rs5683, %rs5684, %rs5685}, [matrix+2840]; - cvt.u32.u16 %r8750, %rs5685; - cvt.s32.s8 %r8751, %r8750; - cvt.u32.u16 %r8752, %rs5684; - cvt.s32.s8 %r8753, %r8752; - cvt.u32.u16 %r8754, %rs5683; - cvt.s32.s8 %r8755, %r8754; - cvt.u32.u16 %r8756, %rs5682; - cvt.s32.s8 %r8757, %r8756; - mad.lo.s32 %r8758, %r61, %r8757, %r8749; - mad.lo.s32 %r8759, %r62, %r8755, %r8758; - mad.lo.s32 %r8760, %r64, %r8753, %r8759; - mad.lo.s32 %r8761, %r65, %r8751, %r8760; - ld.const.v4.u8 {%rs5690, %rs5691, %rs5692, %rs5693}, [matrix+2844]; - cvt.u32.u16 %r8762, %rs5693; - cvt.s32.s8 %r8763, %r8762; - cvt.u32.u16 %r8764, %rs5692; - cvt.s32.s8 %r8765, %r8764; - cvt.u32.u16 %r8766, %rs5691; - cvt.s32.s8 %r8767, %r8766; - cvt.u32.u16 %r8768, %rs5690; - cvt.s32.s8 %r8769, %r8768; - mad.lo.s32 %r8770, %r67, %r8769, %r8761; - mad.lo.s32 %r8771, %r68, %r8767, %r8770; - mad.lo.s32 %r8772, %r69, %r8765, %r8771; - mad.lo.s32 %r8773, %r70, %r8763, %r8772; - ld.const.v4.u8 {%rs5698, %rs5699, %rs5700, %rs5701}, [matrix+2848]; - cvt.u32.u16 %r8774, %rs5701; - cvt.s32.s8 %r8775, %r8774; - cvt.u32.u16 %r8776, %rs5700; - cvt.s32.s8 %r8777, %r8776; - cvt.u32.u16 %r8778, %rs5699; - cvt.s32.s8 %r8779, %r8778; - cvt.u32.u16 %r8780, %rs5698; - cvt.s32.s8 %r8781, %r8780; - mad.lo.s32 %r8782, %r222, %r8781, %r8773; - mad.lo.s32 %r8783, %r72, %r8779, %r8782; - mad.lo.s32 %r8784, %r73, %r8777, %r8783; - mad.lo.s32 %r8785, %r74, %r8775, %r8784; - ld.const.v4.u8 {%rs5706, %rs5707, %rs5708, %rs5709}, [matrix+2852]; - cvt.u32.u16 %r8786, %rs5709; - cvt.s32.s8 %r8787, %r8786; - cvt.u32.u16 %r8788, %rs5708; - cvt.s32.s8 %r8789, %r8788; - cvt.u32.u16 %r8790, %rs5707; - cvt.s32.s8 %r8791, %r8790; - cvt.u32.u16 
[... roughly two thousand further deleted lines of compiler-generated PTX are collapsed here.
The fully unrolled heavyhash matrix multiply repeats one fixed pattern for constant-memory
offsets matrix+2856 onward: an ld.const.v4.u8 load of four packed matrix bytes, per-byte
cvt.u32.u16 / cvt.s32.s8 sign-extension, and mad.lo.s32 dot-product accumulation against the
input values cached in registers (%r34 .. %r115). After each pair of 64-byte row sums, a
shr / and / or sequence packs the top four bits of both sums into one byte, xor-es it into a
hash-state register (e.g. %r31, %r32, %r90, %r92), and widens the result via cvt.u64.u32 into
%rd398 / %rd399 for the final store. The deleted text continues in this same pattern. ...]
[matrix+3428]; - cvt.u32.u16 %r10534, %rs6861; - cvt.s32.s8 %r10535, %r10534; - cvt.u32.u16 %r10536, %rs6860; - cvt.s32.s8 %r10537, %r10536; - cvt.u32.u16 %r10538, %rs6859; - cvt.s32.s8 %r10539, %r10538; - cvt.u32.u16 %r10540, %rs6858; - cvt.s32.s8 %r10541, %r10540; - mad.lo.s32 %r10542, %r75, %r10541, %r10533; - mad.lo.s32 %r10543, %r76, %r10539, %r10542; - mad.lo.s32 %r10544, %r77, %r10537, %r10543; - mad.lo.s32 %r10545, %r78, %r10535, %r10544; - ld.const.v4.u8 {%rs6866, %rs6867, %rs6868, %rs6869}, [matrix+3432]; - cvt.u32.u16 %r10546, %rs6869; - cvt.s32.s8 %r10547, %r10546; - cvt.u32.u16 %r10548, %rs6868; - cvt.s32.s8 %r10549, %r10548; - cvt.u32.u16 %r10550, %rs6867; - cvt.s32.s8 %r10551, %r10550; - cvt.u32.u16 %r10552, %rs6866; - cvt.s32.s8 %r10553, %r10552; - mad.lo.s32 %r10554, %r80, %r10553, %r10545; - mad.lo.s32 %r10555, %r81, %r10551, %r10554; - mad.lo.s32 %r10556, %r83, %r10549, %r10555; - mad.lo.s32 %r10557, %r84, %r10547, %r10556; - ld.const.v4.u8 {%rs6874, %rs6875, %rs6876, %rs6877}, [matrix+3436]; - cvt.u32.u16 %r10558, %rs6877; - cvt.s32.s8 %r10559, %r10558; - cvt.u32.u16 %r10560, %rs6876; - cvt.s32.s8 %r10561, %r10560; - cvt.u32.u16 %r10562, %rs6875; - cvt.s32.s8 %r10563, %r10562; - cvt.u32.u16 %r10564, %rs6874; - cvt.s32.s8 %r10565, %r10564; - mad.lo.s32 %r10566, %r86, %r10565, %r10557; - mad.lo.s32 %r10567, %r87, %r10563, %r10566; - mad.lo.s32 %r10568, %r88, %r10561, %r10567; - mad.lo.s32 %r10569, %r89, %r10559, %r10568; - ld.const.v4.u8 {%rs6882, %rs6883, %rs6884, %rs6885}, [matrix+3440]; - cvt.u32.u16 %r10570, %rs6885; - cvt.s32.s8 %r10571, %r10570; - cvt.u32.u16 %r10572, %rs6884; - cvt.s32.s8 %r10573, %r10572; - cvt.u32.u16 %r10574, %rs6883; - cvt.s32.s8 %r10575, %r10574; - cvt.u32.u16 %r10576, %rs6882; - cvt.s32.s8 %r10577, %r10576; - mad.lo.s32 %r10578, %r271, %r10577, %r10569; - mad.lo.s32 %r10579, %r91, %r10575, %r10578; - mad.lo.s32 %r10580, %r93, %r10573, %r10579; - mad.lo.s32 %r10581, %r94, %r10571, %r10580; - ld.const.v4.u8 {%rs6890, %rs6891, %rs6892, %rs6893}, [matrix+3444]; - cvt.u32.u16 %r10582, %rs6893; - cvt.s32.s8 %r10583, %r10582; - cvt.u32.u16 %r10584, %rs6892; - cvt.s32.s8 %r10585, %r10584; - cvt.u32.u16 %r10586, %rs6891; - cvt.s32.s8 %r10587, %r10586; - cvt.u32.u16 %r10588, %rs6890; - cvt.s32.s8 %r10589, %r10588; - mad.lo.s32 %r10590, %r96, %r10589, %r10581; - mad.lo.s32 %r10591, %r97, %r10587, %r10590; - mad.lo.s32 %r10592, %r99, %r10585, %r10591; - mad.lo.s32 %r10593, %r100, %r10583, %r10592; - ld.const.v4.u8 {%rs6898, %rs6899, %rs6900, %rs6901}, [matrix+3448]; - cvt.u32.u16 %r10594, %rs6901; - cvt.s32.s8 %r10595, %r10594; - cvt.u32.u16 %r10596, %rs6900; - cvt.s32.s8 %r10597, %r10596; - cvt.u32.u16 %r10598, %rs6899; - cvt.s32.s8 %r10599, %r10598; - cvt.u32.u16 %r10600, %rs6898; - cvt.s32.s8 %r10601, %r10600; - mad.lo.s32 %r10602, %r103, %r10601, %r10593; - mad.lo.s32 %r10603, %r104, %r10599, %r10602; - mad.lo.s32 %r10604, %r107, %r10597, %r10603; - mad.lo.s32 %r10605, %r108, %r10595, %r10604; - ld.const.v4.u8 {%rs6906, %rs6907, %rs6908, %rs6909}, [matrix+3452]; - cvt.u32.u16 %r10606, %rs6909; - cvt.s32.s8 %r10607, %r10606; - cvt.u32.u16 %r10608, %rs6908; - cvt.s32.s8 %r10609, %r10608; - cvt.u32.u16 %r10610, %rs6907; - cvt.s32.s8 %r10611, %r10610; - cvt.u32.u16 %r10612, %rs6906; - cvt.s32.s8 %r10613, %r10612; - mad.lo.s32 %r10614, %r111, %r10613, %r10605; - mad.lo.s32 %r10615, %r112, %r10611, %r10614; - mad.lo.s32 %r10616, %r114, %r10609, %r10615; - mad.lo.s32 %r10617, %r115, %r10607, %r10616; - shr.u32 %r10618, %r10425, 6; - and.b32 %r10619, 
%r10618, 240; - shr.u32 %r10620, %r10617, 10; - or.b32 %r10621, %r10620, %r10619; - xor.b32 %r10622, %r95, %r10621; - cvt.u64.u32 %rd400, %r10622; - ld.const.v4.u8 {%rs6914, %rs6915, %rs6916, %rs6917}, [matrix+3456]; - cvt.u32.u16 %r10623, %rs6917; - cvt.s32.s8 %r10624, %r10623; - cvt.u32.u16 %r10625, %rs6916; - cvt.s32.s8 %r10626, %r10625; - cvt.u32.u16 %r10627, %rs6914; - cvt.s32.s8 %r10628, %r10627; - cvt.u32.u16 %r10629, %rs6915; - cvt.s32.s8 %r10630, %r10629; - mul.lo.s32 %r10631, %r34, %r10630; - mad.lo.s32 %r10632, %r124, %r10628, %r10631; - mad.lo.s32 %r10633, %r35, %r10626, %r10632; - mad.lo.s32 %r10634, %r36, %r10624, %r10633; - ld.const.v4.u8 {%rs6922, %rs6923, %rs6924, %rs6925}, [matrix+3460]; - cvt.u32.u16 %r10635, %rs6925; - cvt.s32.s8 %r10636, %r10635; - cvt.u32.u16 %r10637, %rs6924; - cvt.s32.s8 %r10638, %r10637; - cvt.u32.u16 %r10639, %rs6923; - cvt.s32.s8 %r10640, %r10639; - cvt.u32.u16 %r10641, %rs6922; - cvt.s32.s8 %r10642, %r10641; - mad.lo.s32 %r10643, %r37, %r10642, %r10634; - mad.lo.s32 %r10644, %r38, %r10640, %r10643; - mad.lo.s32 %r10645, %r39, %r10638, %r10644; - mad.lo.s32 %r10646, %r40, %r10636, %r10645; - ld.const.v4.u8 {%rs6930, %rs6931, %rs6932, %rs6933}, [matrix+3464]; - cvt.u32.u16 %r10647, %rs6933; - cvt.s32.s8 %r10648, %r10647; - cvt.u32.u16 %r10649, %rs6932; - cvt.s32.s8 %r10650, %r10649; - cvt.u32.u16 %r10651, %rs6931; - cvt.s32.s8 %r10652, %r10651; - cvt.u32.u16 %r10653, %rs6930; - cvt.s32.s8 %r10654, %r10653; - mad.lo.s32 %r10655, %r42, %r10654, %r10646; - mad.lo.s32 %r10656, %r43, %r10652, %r10655; - mad.lo.s32 %r10657, %r45, %r10650, %r10656; - mad.lo.s32 %r10658, %r46, %r10648, %r10657; - ld.const.v4.u8 {%rs6938, %rs6939, %rs6940, %rs6941}, [matrix+3468]; - cvt.u32.u16 %r10659, %rs6941; - cvt.s32.s8 %r10660, %r10659; - cvt.u32.u16 %r10661, %rs6940; - cvt.s32.s8 %r10662, %r10661; - cvt.u32.u16 %r10663, %rs6939; - cvt.s32.s8 %r10664, %r10663; - cvt.u32.u16 %r10665, %rs6938; - cvt.s32.s8 %r10666, %r10665; - mad.lo.s32 %r10667, %r48, %r10666, %r10658; - mad.lo.s32 %r10668, %r49, %r10664, %r10667; - mad.lo.s32 %r10669, %r50, %r10662, %r10668; - mad.lo.s32 %r10670, %r51, %r10660, %r10669; - ld.const.v4.u8 {%rs6946, %rs6947, %rs6948, %rs6949}, [matrix+3472]; - cvt.u32.u16 %r10671, %rs6949; - cvt.s32.s8 %r10672, %r10671; - cvt.u32.u16 %r10673, %rs6948; - cvt.s32.s8 %r10674, %r10673; - cvt.u32.u16 %r10675, %rs6947; - cvt.s32.s8 %r10676, %r10675; - cvt.u32.u16 %r10677, %rs6946; - cvt.s32.s8 %r10678, %r10677; - mad.lo.s32 %r10679, %r173, %r10678, %r10670; - mad.lo.s32 %r10680, %r53, %r10676, %r10679; - mad.lo.s32 %r10681, %r54, %r10674, %r10680; - mad.lo.s32 %r10682, %r55, %r10672, %r10681; - ld.const.v4.u8 {%rs6954, %rs6955, %rs6956, %rs6957}, [matrix+3476]; - cvt.u32.u16 %r10683, %rs6957; - cvt.s32.s8 %r10684, %r10683; - cvt.u32.u16 %r10685, %rs6956; - cvt.s32.s8 %r10686, %r10685; - cvt.u32.u16 %r10687, %rs6955; - cvt.s32.s8 %r10688, %r10687; - cvt.u32.u16 %r10689, %rs6954; - cvt.s32.s8 %r10690, %r10689; - mad.lo.s32 %r10691, %r56, %r10690, %r10682; - mad.lo.s32 %r10692, %r57, %r10688, %r10691; - mad.lo.s32 %r10693, %r58, %r10686, %r10692; - mad.lo.s32 %r10694, %r59, %r10684, %r10693; - ld.const.v4.u8 {%rs6962, %rs6963, %rs6964, %rs6965}, [matrix+3480]; - cvt.u32.u16 %r10695, %rs6965; - cvt.s32.s8 %r10696, %r10695; - cvt.u32.u16 %r10697, %rs6964; - cvt.s32.s8 %r10698, %r10697; - cvt.u32.u16 %r10699, %rs6963; - cvt.s32.s8 %r10700, %r10699; - cvt.u32.u16 %r10701, %rs6962; - cvt.s32.s8 %r10702, %r10701; - mad.lo.s32 %r10703, %r61, %r10702, %r10694; - 
mad.lo.s32 %r10704, %r62, %r10700, %r10703; - mad.lo.s32 %r10705, %r64, %r10698, %r10704; - mad.lo.s32 %r10706, %r65, %r10696, %r10705; - ld.const.v4.u8 {%rs6970, %rs6971, %rs6972, %rs6973}, [matrix+3484]; - cvt.u32.u16 %r10707, %rs6973; - cvt.s32.s8 %r10708, %r10707; - cvt.u32.u16 %r10709, %rs6972; - cvt.s32.s8 %r10710, %r10709; - cvt.u32.u16 %r10711, %rs6971; - cvt.s32.s8 %r10712, %r10711; - cvt.u32.u16 %r10713, %rs6970; - cvt.s32.s8 %r10714, %r10713; - mad.lo.s32 %r10715, %r67, %r10714, %r10706; - mad.lo.s32 %r10716, %r68, %r10712, %r10715; - mad.lo.s32 %r10717, %r69, %r10710, %r10716; - mad.lo.s32 %r10718, %r70, %r10708, %r10717; - ld.const.v4.u8 {%rs6978, %rs6979, %rs6980, %rs6981}, [matrix+3488]; - cvt.u32.u16 %r10719, %rs6981; - cvt.s32.s8 %r10720, %r10719; - cvt.u32.u16 %r10721, %rs6980; - cvt.s32.s8 %r10722, %r10721; - cvt.u32.u16 %r10723, %rs6979; - cvt.s32.s8 %r10724, %r10723; - cvt.u32.u16 %r10725, %rs6978; - cvt.s32.s8 %r10726, %r10725; - mad.lo.s32 %r10727, %r222, %r10726, %r10718; - mad.lo.s32 %r10728, %r72, %r10724, %r10727; - mad.lo.s32 %r10729, %r73, %r10722, %r10728; - mad.lo.s32 %r10730, %r74, %r10720, %r10729; - ld.const.v4.u8 {%rs6986, %rs6987, %rs6988, %rs6989}, [matrix+3492]; - cvt.u32.u16 %r10731, %rs6989; - cvt.s32.s8 %r10732, %r10731; - cvt.u32.u16 %r10733, %rs6988; - cvt.s32.s8 %r10734, %r10733; - cvt.u32.u16 %r10735, %rs6987; - cvt.s32.s8 %r10736, %r10735; - cvt.u32.u16 %r10737, %rs6986; - cvt.s32.s8 %r10738, %r10737; - mad.lo.s32 %r10739, %r75, %r10738, %r10730; - mad.lo.s32 %r10740, %r76, %r10736, %r10739; - mad.lo.s32 %r10741, %r77, %r10734, %r10740; - mad.lo.s32 %r10742, %r78, %r10732, %r10741; - ld.const.v4.u8 {%rs6994, %rs6995, %rs6996, %rs6997}, [matrix+3496]; - cvt.u32.u16 %r10743, %rs6997; - cvt.s32.s8 %r10744, %r10743; - cvt.u32.u16 %r10745, %rs6996; - cvt.s32.s8 %r10746, %r10745; - cvt.u32.u16 %r10747, %rs6995; - cvt.s32.s8 %r10748, %r10747; - cvt.u32.u16 %r10749, %rs6994; - cvt.s32.s8 %r10750, %r10749; - mad.lo.s32 %r10751, %r80, %r10750, %r10742; - mad.lo.s32 %r10752, %r81, %r10748, %r10751; - mad.lo.s32 %r10753, %r83, %r10746, %r10752; - mad.lo.s32 %r10754, %r84, %r10744, %r10753; - ld.const.v4.u8 {%rs7002, %rs7003, %rs7004, %rs7005}, [matrix+3500]; - cvt.u32.u16 %r10755, %rs7005; - cvt.s32.s8 %r10756, %r10755; - cvt.u32.u16 %r10757, %rs7004; - cvt.s32.s8 %r10758, %r10757; - cvt.u32.u16 %r10759, %rs7003; - cvt.s32.s8 %r10760, %r10759; - cvt.u32.u16 %r10761, %rs7002; - cvt.s32.s8 %r10762, %r10761; - mad.lo.s32 %r10763, %r86, %r10762, %r10754; - mad.lo.s32 %r10764, %r87, %r10760, %r10763; - mad.lo.s32 %r10765, %r88, %r10758, %r10764; - mad.lo.s32 %r10766, %r89, %r10756, %r10765; - ld.const.v4.u8 {%rs7010, %rs7011, %rs7012, %rs7013}, [matrix+3504]; - cvt.u32.u16 %r10767, %rs7013; - cvt.s32.s8 %r10768, %r10767; - cvt.u32.u16 %r10769, %rs7012; - cvt.s32.s8 %r10770, %r10769; - cvt.u32.u16 %r10771, %rs7011; - cvt.s32.s8 %r10772, %r10771; - cvt.u32.u16 %r10773, %rs7010; - cvt.s32.s8 %r10774, %r10773; - mad.lo.s32 %r10775, %r271, %r10774, %r10766; - mad.lo.s32 %r10776, %r91, %r10772, %r10775; - mad.lo.s32 %r10777, %r93, %r10770, %r10776; - mad.lo.s32 %r10778, %r94, %r10768, %r10777; - ld.const.v4.u8 {%rs7018, %rs7019, %rs7020, %rs7021}, [matrix+3508]; - cvt.u32.u16 %r10779, %rs7021; - cvt.s32.s8 %r10780, %r10779; - cvt.u32.u16 %r10781, %rs7020; - cvt.s32.s8 %r10782, %r10781; - cvt.u32.u16 %r10783, %rs7019; - cvt.s32.s8 %r10784, %r10783; - cvt.u32.u16 %r10785, %rs7018; - cvt.s32.s8 %r10786, %r10785; - mad.lo.s32 %r10787, %r96, %r10786, %r10778; - mad.lo.s32 
%r10788, %r97, %r10784, %r10787; - mad.lo.s32 %r10789, %r99, %r10782, %r10788; - mad.lo.s32 %r10790, %r100, %r10780, %r10789; - ld.const.v4.u8 {%rs7026, %rs7027, %rs7028, %rs7029}, [matrix+3512]; - cvt.u32.u16 %r10791, %rs7029; - cvt.s32.s8 %r10792, %r10791; - cvt.u32.u16 %r10793, %rs7028; - cvt.s32.s8 %r10794, %r10793; - cvt.u32.u16 %r10795, %rs7027; - cvt.s32.s8 %r10796, %r10795; - cvt.u32.u16 %r10797, %rs7026; - cvt.s32.s8 %r10798, %r10797; - mad.lo.s32 %r10799, %r103, %r10798, %r10790; - mad.lo.s32 %r10800, %r104, %r10796, %r10799; - mad.lo.s32 %r10801, %r107, %r10794, %r10800; - mad.lo.s32 %r10802, %r108, %r10792, %r10801; - ld.const.v4.u8 {%rs7034, %rs7035, %rs7036, %rs7037}, [matrix+3516]; - cvt.u32.u16 %r10803, %rs7037; - cvt.s32.s8 %r10804, %r10803; - cvt.u32.u16 %r10805, %rs7036; - cvt.s32.s8 %r10806, %r10805; - cvt.u32.u16 %r10807, %rs7035; - cvt.s32.s8 %r10808, %r10807; - cvt.u32.u16 %r10809, %rs7034; - cvt.s32.s8 %r10810, %r10809; - mad.lo.s32 %r10811, %r111, %r10810, %r10802; - mad.lo.s32 %r10812, %r112, %r10808, %r10811; - mad.lo.s32 %r10813, %r114, %r10806, %r10812; - mad.lo.s32 %r10814, %r115, %r10804, %r10813; - ld.const.v4.u8 {%rs7042, %rs7043, %rs7044, %rs7045}, [matrix+3520]; - cvt.u32.u16 %r10815, %rs7045; - cvt.s32.s8 %r10816, %r10815; - cvt.u32.u16 %r10817, %rs7044; - cvt.s32.s8 %r10818, %r10817; - cvt.u32.u16 %r10819, %rs7042; - cvt.s32.s8 %r10820, %r10819; - cvt.u32.u16 %r10821, %rs7043; - cvt.s32.s8 %r10822, %r10821; - mul.lo.s32 %r10823, %r34, %r10822; - mad.lo.s32 %r10824, %r124, %r10820, %r10823; - mad.lo.s32 %r10825, %r35, %r10818, %r10824; - mad.lo.s32 %r10826, %r36, %r10816, %r10825; - ld.const.v4.u8 {%rs7050, %rs7051, %rs7052, %rs7053}, [matrix+3524]; - cvt.u32.u16 %r10827, %rs7053; - cvt.s32.s8 %r10828, %r10827; - cvt.u32.u16 %r10829, %rs7052; - cvt.s32.s8 %r10830, %r10829; - cvt.u32.u16 %r10831, %rs7051; - cvt.s32.s8 %r10832, %r10831; - cvt.u32.u16 %r10833, %rs7050; - cvt.s32.s8 %r10834, %r10833; - mad.lo.s32 %r10835, %r37, %r10834, %r10826; - mad.lo.s32 %r10836, %r38, %r10832, %r10835; - mad.lo.s32 %r10837, %r39, %r10830, %r10836; - mad.lo.s32 %r10838, %r40, %r10828, %r10837; - ld.const.v4.u8 {%rs7058, %rs7059, %rs7060, %rs7061}, [matrix+3528]; - cvt.u32.u16 %r10839, %rs7061; - cvt.s32.s8 %r10840, %r10839; - cvt.u32.u16 %r10841, %rs7060; - cvt.s32.s8 %r10842, %r10841; - cvt.u32.u16 %r10843, %rs7059; - cvt.s32.s8 %r10844, %r10843; - cvt.u32.u16 %r10845, %rs7058; - cvt.s32.s8 %r10846, %r10845; - mad.lo.s32 %r10847, %r42, %r10846, %r10838; - mad.lo.s32 %r10848, %r43, %r10844, %r10847; - mad.lo.s32 %r10849, %r45, %r10842, %r10848; - mad.lo.s32 %r10850, %r46, %r10840, %r10849; - ld.const.v4.u8 {%rs7066, %rs7067, %rs7068, %rs7069}, [matrix+3532]; - cvt.u32.u16 %r10851, %rs7069; - cvt.s32.s8 %r10852, %r10851; - cvt.u32.u16 %r10853, %rs7068; - cvt.s32.s8 %r10854, %r10853; - cvt.u32.u16 %r10855, %rs7067; - cvt.s32.s8 %r10856, %r10855; - cvt.u32.u16 %r10857, %rs7066; - cvt.s32.s8 %r10858, %r10857; - mad.lo.s32 %r10859, %r48, %r10858, %r10850; - mad.lo.s32 %r10860, %r49, %r10856, %r10859; - mad.lo.s32 %r10861, %r50, %r10854, %r10860; - mad.lo.s32 %r10862, %r51, %r10852, %r10861; - ld.const.v4.u8 {%rs7074, %rs7075, %rs7076, %rs7077}, [matrix+3536]; - cvt.u32.u16 %r10863, %rs7077; - cvt.s32.s8 %r10864, %r10863; - cvt.u32.u16 %r10865, %rs7076; - cvt.s32.s8 %r10866, %r10865; - cvt.u32.u16 %r10867, %rs7075; - cvt.s32.s8 %r10868, %r10867; - cvt.u32.u16 %r10869, %rs7074; - cvt.s32.s8 %r10870, %r10869; - mad.lo.s32 %r10871, %r173, %r10870, %r10862; - mad.lo.s32 %r10872, 
%r53, %r10868, %r10871; - mad.lo.s32 %r10873, %r54, %r10866, %r10872; - mad.lo.s32 %r10874, %r55, %r10864, %r10873; - ld.const.v4.u8 {%rs7082, %rs7083, %rs7084, %rs7085}, [matrix+3540]; - cvt.u32.u16 %r10875, %rs7085; - cvt.s32.s8 %r10876, %r10875; - cvt.u32.u16 %r10877, %rs7084; - cvt.s32.s8 %r10878, %r10877; - cvt.u32.u16 %r10879, %rs7083; - cvt.s32.s8 %r10880, %r10879; - cvt.u32.u16 %r10881, %rs7082; - cvt.s32.s8 %r10882, %r10881; - mad.lo.s32 %r10883, %r56, %r10882, %r10874; - mad.lo.s32 %r10884, %r57, %r10880, %r10883; - mad.lo.s32 %r10885, %r58, %r10878, %r10884; - mad.lo.s32 %r10886, %r59, %r10876, %r10885; - ld.const.v4.u8 {%rs7090, %rs7091, %rs7092, %rs7093}, [matrix+3544]; - cvt.u32.u16 %r10887, %rs7093; - cvt.s32.s8 %r10888, %r10887; - cvt.u32.u16 %r10889, %rs7092; - cvt.s32.s8 %r10890, %r10889; - cvt.u32.u16 %r10891, %rs7091; - cvt.s32.s8 %r10892, %r10891; - cvt.u32.u16 %r10893, %rs7090; - cvt.s32.s8 %r10894, %r10893; - mad.lo.s32 %r10895, %r61, %r10894, %r10886; - mad.lo.s32 %r10896, %r62, %r10892, %r10895; - mad.lo.s32 %r10897, %r64, %r10890, %r10896; - mad.lo.s32 %r10898, %r65, %r10888, %r10897; - ld.const.v4.u8 {%rs7098, %rs7099, %rs7100, %rs7101}, [matrix+3548]; - cvt.u32.u16 %r10899, %rs7101; - cvt.s32.s8 %r10900, %r10899; - cvt.u32.u16 %r10901, %rs7100; - cvt.s32.s8 %r10902, %r10901; - cvt.u32.u16 %r10903, %rs7099; - cvt.s32.s8 %r10904, %r10903; - cvt.u32.u16 %r10905, %rs7098; - cvt.s32.s8 %r10906, %r10905; - mad.lo.s32 %r10907, %r67, %r10906, %r10898; - mad.lo.s32 %r10908, %r68, %r10904, %r10907; - mad.lo.s32 %r10909, %r69, %r10902, %r10908; - mad.lo.s32 %r10910, %r70, %r10900, %r10909; - ld.const.v4.u8 {%rs7106, %rs7107, %rs7108, %rs7109}, [matrix+3552]; - cvt.u32.u16 %r10911, %rs7109; - cvt.s32.s8 %r10912, %r10911; - cvt.u32.u16 %r10913, %rs7108; - cvt.s32.s8 %r10914, %r10913; - cvt.u32.u16 %r10915, %rs7107; - cvt.s32.s8 %r10916, %r10915; - cvt.u32.u16 %r10917, %rs7106; - cvt.s32.s8 %r10918, %r10917; - mad.lo.s32 %r10919, %r222, %r10918, %r10910; - mad.lo.s32 %r10920, %r72, %r10916, %r10919; - mad.lo.s32 %r10921, %r73, %r10914, %r10920; - mad.lo.s32 %r10922, %r74, %r10912, %r10921; - ld.const.v4.u8 {%rs7114, %rs7115, %rs7116, %rs7117}, [matrix+3556]; - cvt.u32.u16 %r10923, %rs7117; - cvt.s32.s8 %r10924, %r10923; - cvt.u32.u16 %r10925, %rs7116; - cvt.s32.s8 %r10926, %r10925; - cvt.u32.u16 %r10927, %rs7115; - cvt.s32.s8 %r10928, %r10927; - cvt.u32.u16 %r10929, %rs7114; - cvt.s32.s8 %r10930, %r10929; - mad.lo.s32 %r10931, %r75, %r10930, %r10922; - mad.lo.s32 %r10932, %r76, %r10928, %r10931; - mad.lo.s32 %r10933, %r77, %r10926, %r10932; - mad.lo.s32 %r10934, %r78, %r10924, %r10933; - ld.const.v4.u8 {%rs7122, %rs7123, %rs7124, %rs7125}, [matrix+3560]; - cvt.u32.u16 %r10935, %rs7125; - cvt.s32.s8 %r10936, %r10935; - cvt.u32.u16 %r10937, %rs7124; - cvt.s32.s8 %r10938, %r10937; - cvt.u32.u16 %r10939, %rs7123; - cvt.s32.s8 %r10940, %r10939; - cvt.u32.u16 %r10941, %rs7122; - cvt.s32.s8 %r10942, %r10941; - mad.lo.s32 %r10943, %r80, %r10942, %r10934; - mad.lo.s32 %r10944, %r81, %r10940, %r10943; - mad.lo.s32 %r10945, %r83, %r10938, %r10944; - mad.lo.s32 %r10946, %r84, %r10936, %r10945; - ld.const.v4.u8 {%rs7130, %rs7131, %rs7132, %rs7133}, [matrix+3564]; - cvt.u32.u16 %r10947, %rs7133; - cvt.s32.s8 %r10948, %r10947; - cvt.u32.u16 %r10949, %rs7132; - cvt.s32.s8 %r10950, %r10949; - cvt.u32.u16 %r10951, %rs7131; - cvt.s32.s8 %r10952, %r10951; - cvt.u32.u16 %r10953, %rs7130; - cvt.s32.s8 %r10954, %r10953; - mad.lo.s32 %r10955, %r86, %r10954, %r10946; - mad.lo.s32 %r10956, %r87, 
%r10952, %r10955; - mad.lo.s32 %r10957, %r88, %r10950, %r10956; - mad.lo.s32 %r10958, %r89, %r10948, %r10957; - ld.const.v4.u8 {%rs7138, %rs7139, %rs7140, %rs7141}, [matrix+3568]; - cvt.u32.u16 %r10959, %rs7141; - cvt.s32.s8 %r10960, %r10959; - cvt.u32.u16 %r10961, %rs7140; - cvt.s32.s8 %r10962, %r10961; - cvt.u32.u16 %r10963, %rs7139; - cvt.s32.s8 %r10964, %r10963; - cvt.u32.u16 %r10965, %rs7138; - cvt.s32.s8 %r10966, %r10965; - mad.lo.s32 %r10967, %r271, %r10966, %r10958; - mad.lo.s32 %r10968, %r91, %r10964, %r10967; - mad.lo.s32 %r10969, %r93, %r10962, %r10968; - mad.lo.s32 %r10970, %r94, %r10960, %r10969; - ld.const.v4.u8 {%rs7146, %rs7147, %rs7148, %rs7149}, [matrix+3572]; - cvt.u32.u16 %r10971, %rs7149; - cvt.s32.s8 %r10972, %r10971; - cvt.u32.u16 %r10973, %rs7148; - cvt.s32.s8 %r10974, %r10973; - cvt.u32.u16 %r10975, %rs7147; - cvt.s32.s8 %r10976, %r10975; - cvt.u32.u16 %r10977, %rs7146; - cvt.s32.s8 %r10978, %r10977; - mad.lo.s32 %r10979, %r96, %r10978, %r10970; - mad.lo.s32 %r10980, %r97, %r10976, %r10979; - mad.lo.s32 %r10981, %r99, %r10974, %r10980; - mad.lo.s32 %r10982, %r100, %r10972, %r10981; - ld.const.v4.u8 {%rs7154, %rs7155, %rs7156, %rs7157}, [matrix+3576]; - cvt.u32.u16 %r10983, %rs7157; - cvt.s32.s8 %r10984, %r10983; - cvt.u32.u16 %r10985, %rs7156; - cvt.s32.s8 %r10986, %r10985; - cvt.u32.u16 %r10987, %rs7155; - cvt.s32.s8 %r10988, %r10987; - cvt.u32.u16 %r10989, %rs7154; - cvt.s32.s8 %r10990, %r10989; - mad.lo.s32 %r10991, %r103, %r10990, %r10982; - mad.lo.s32 %r10992, %r104, %r10988, %r10991; - mad.lo.s32 %r10993, %r107, %r10986, %r10992; - mad.lo.s32 %r10994, %r108, %r10984, %r10993; - ld.const.v4.u8 {%rs7162, %rs7163, %rs7164, %rs7165}, [matrix+3580]; - cvt.u32.u16 %r10995, %rs7165; - cvt.s32.s8 %r10996, %r10995; - cvt.u32.u16 %r10997, %rs7164; - cvt.s32.s8 %r10998, %r10997; - cvt.u32.u16 %r10999, %rs7163; - cvt.s32.s8 %r11000, %r10999; - cvt.u32.u16 %r11001, %rs7162; - cvt.s32.s8 %r11002, %r11001; - mad.lo.s32 %r11003, %r111, %r11002, %r10994; - mad.lo.s32 %r11004, %r112, %r11000, %r11003; - mad.lo.s32 %r11005, %r114, %r10998, %r11004; - mad.lo.s32 %r11006, %r115, %r10996, %r11005; - shr.u32 %r11007, %r10814, 6; - and.b32 %r11008, %r11007, 240; - shr.u32 %r11009, %r11006, 10; - or.b32 %r11010, %r11009, %r11008; - xor.b32 %r11011, %r98, %r11010; - cvt.u64.u32 %rd401, %r11011; - ld.const.v4.u8 {%rs7170, %rs7171, %rs7172, %rs7173}, [matrix+3584]; - cvt.u32.u16 %r11012, %rs7173; - cvt.s32.s8 %r11013, %r11012; - cvt.u32.u16 %r11014, %rs7172; - cvt.s32.s8 %r11015, %r11014; - cvt.u32.u16 %r11016, %rs7170; - cvt.s32.s8 %r11017, %r11016; - cvt.u32.u16 %r11018, %rs7171; - cvt.s32.s8 %r11019, %r11018; - mul.lo.s32 %r11020, %r34, %r11019; - mad.lo.s32 %r11021, %r124, %r11017, %r11020; - mad.lo.s32 %r11022, %r35, %r11015, %r11021; - mad.lo.s32 %r11023, %r36, %r11013, %r11022; - ld.const.v4.u8 {%rs7178, %rs7179, %rs7180, %rs7181}, [matrix+3588]; - cvt.u32.u16 %r11024, %rs7181; - cvt.s32.s8 %r11025, %r11024; - cvt.u32.u16 %r11026, %rs7180; - cvt.s32.s8 %r11027, %r11026; - cvt.u32.u16 %r11028, %rs7179; - cvt.s32.s8 %r11029, %r11028; - cvt.u32.u16 %r11030, %rs7178; - cvt.s32.s8 %r11031, %r11030; - mad.lo.s32 %r11032, %r37, %r11031, %r11023; - mad.lo.s32 %r11033, %r38, %r11029, %r11032; - mad.lo.s32 %r11034, %r39, %r11027, %r11033; - mad.lo.s32 %r11035, %r40, %r11025, %r11034; - ld.const.v4.u8 {%rs7186, %rs7187, %rs7188, %rs7189}, [matrix+3592]; - cvt.u32.u16 %r11036, %rs7189; - cvt.s32.s8 %r11037, %r11036; - cvt.u32.u16 %r11038, %rs7188; - cvt.s32.s8 %r11039, %r11038; - cvt.u32.u16 
%r11040, %rs7187; - cvt.s32.s8 %r11041, %r11040; - cvt.u32.u16 %r11042, %rs7186; - cvt.s32.s8 %r11043, %r11042; - mad.lo.s32 %r11044, %r42, %r11043, %r11035; - mad.lo.s32 %r11045, %r43, %r11041, %r11044; - mad.lo.s32 %r11046, %r45, %r11039, %r11045; - mad.lo.s32 %r11047, %r46, %r11037, %r11046; - ld.const.v4.u8 {%rs7194, %rs7195, %rs7196, %rs7197}, [matrix+3596]; - cvt.u32.u16 %r11048, %rs7197; - cvt.s32.s8 %r11049, %r11048; - cvt.u32.u16 %r11050, %rs7196; - cvt.s32.s8 %r11051, %r11050; - cvt.u32.u16 %r11052, %rs7195; - cvt.s32.s8 %r11053, %r11052; - cvt.u32.u16 %r11054, %rs7194; - cvt.s32.s8 %r11055, %r11054; - mad.lo.s32 %r11056, %r48, %r11055, %r11047; - mad.lo.s32 %r11057, %r49, %r11053, %r11056; - mad.lo.s32 %r11058, %r50, %r11051, %r11057; - mad.lo.s32 %r11059, %r51, %r11049, %r11058; - ld.const.v4.u8 {%rs7202, %rs7203, %rs7204, %rs7205}, [matrix+3600]; - cvt.u32.u16 %r11060, %rs7205; - cvt.s32.s8 %r11061, %r11060; - cvt.u32.u16 %r11062, %rs7204; - cvt.s32.s8 %r11063, %r11062; - cvt.u32.u16 %r11064, %rs7203; - cvt.s32.s8 %r11065, %r11064; - cvt.u32.u16 %r11066, %rs7202; - cvt.s32.s8 %r11067, %r11066; - mad.lo.s32 %r11068, %r173, %r11067, %r11059; - mad.lo.s32 %r11069, %r53, %r11065, %r11068; - mad.lo.s32 %r11070, %r54, %r11063, %r11069; - mad.lo.s32 %r11071, %r55, %r11061, %r11070; - ld.const.v4.u8 {%rs7210, %rs7211, %rs7212, %rs7213}, [matrix+3604]; - cvt.u32.u16 %r11072, %rs7213; - cvt.s32.s8 %r11073, %r11072; - cvt.u32.u16 %r11074, %rs7212; - cvt.s32.s8 %r11075, %r11074; - cvt.u32.u16 %r11076, %rs7211; - cvt.s32.s8 %r11077, %r11076; - cvt.u32.u16 %r11078, %rs7210; - cvt.s32.s8 %r11079, %r11078; - mad.lo.s32 %r11080, %r56, %r11079, %r11071; - mad.lo.s32 %r11081, %r57, %r11077, %r11080; - mad.lo.s32 %r11082, %r58, %r11075, %r11081; - mad.lo.s32 %r11083, %r59, %r11073, %r11082; - ld.const.v4.u8 {%rs7218, %rs7219, %rs7220, %rs7221}, [matrix+3608]; - cvt.u32.u16 %r11084, %rs7221; - cvt.s32.s8 %r11085, %r11084; - cvt.u32.u16 %r11086, %rs7220; - cvt.s32.s8 %r11087, %r11086; - cvt.u32.u16 %r11088, %rs7219; - cvt.s32.s8 %r11089, %r11088; - cvt.u32.u16 %r11090, %rs7218; - cvt.s32.s8 %r11091, %r11090; - mad.lo.s32 %r11092, %r61, %r11091, %r11083; - mad.lo.s32 %r11093, %r62, %r11089, %r11092; - mad.lo.s32 %r11094, %r64, %r11087, %r11093; - mad.lo.s32 %r11095, %r65, %r11085, %r11094; - ld.const.v4.u8 {%rs7226, %rs7227, %rs7228, %rs7229}, [matrix+3612]; - cvt.u32.u16 %r11096, %rs7229; - cvt.s32.s8 %r11097, %r11096; - cvt.u32.u16 %r11098, %rs7228; - cvt.s32.s8 %r11099, %r11098; - cvt.u32.u16 %r11100, %rs7227; - cvt.s32.s8 %r11101, %r11100; - cvt.u32.u16 %r11102, %rs7226; - cvt.s32.s8 %r11103, %r11102; - mad.lo.s32 %r11104, %r67, %r11103, %r11095; - mad.lo.s32 %r11105, %r68, %r11101, %r11104; - mad.lo.s32 %r11106, %r69, %r11099, %r11105; - mad.lo.s32 %r11107, %r70, %r11097, %r11106; - ld.const.v4.u8 {%rs7234, %rs7235, %rs7236, %rs7237}, [matrix+3616]; - cvt.u32.u16 %r11108, %rs7237; - cvt.s32.s8 %r11109, %r11108; - cvt.u32.u16 %r11110, %rs7236; - cvt.s32.s8 %r11111, %r11110; - cvt.u32.u16 %r11112, %rs7235; - cvt.s32.s8 %r11113, %r11112; - cvt.u32.u16 %r11114, %rs7234; - cvt.s32.s8 %r11115, %r11114; - mad.lo.s32 %r11116, %r222, %r11115, %r11107; - mad.lo.s32 %r11117, %r72, %r11113, %r11116; - mad.lo.s32 %r11118, %r73, %r11111, %r11117; - mad.lo.s32 %r11119, %r74, %r11109, %r11118; - ld.const.v4.u8 {%rs7242, %rs7243, %rs7244, %rs7245}, [matrix+3620]; - cvt.u32.u16 %r11120, %rs7245; - cvt.s32.s8 %r11121, %r11120; - cvt.u32.u16 %r11122, %rs7244; - cvt.s32.s8 %r11123, %r11122; - cvt.u32.u16 %r11124, 
%rs7243; - cvt.s32.s8 %r11125, %r11124; - cvt.u32.u16 %r11126, %rs7242; - cvt.s32.s8 %r11127, %r11126; - mad.lo.s32 %r11128, %r75, %r11127, %r11119; - mad.lo.s32 %r11129, %r76, %r11125, %r11128; - mad.lo.s32 %r11130, %r77, %r11123, %r11129; - mad.lo.s32 %r11131, %r78, %r11121, %r11130; - ld.const.v4.u8 {%rs7250, %rs7251, %rs7252, %rs7253}, [matrix+3624]; - cvt.u32.u16 %r11132, %rs7253; - cvt.s32.s8 %r11133, %r11132; - cvt.u32.u16 %r11134, %rs7252; - cvt.s32.s8 %r11135, %r11134; - cvt.u32.u16 %r11136, %rs7251; - cvt.s32.s8 %r11137, %r11136; - cvt.u32.u16 %r11138, %rs7250; - cvt.s32.s8 %r11139, %r11138; - mad.lo.s32 %r11140, %r80, %r11139, %r11131; - mad.lo.s32 %r11141, %r81, %r11137, %r11140; - mad.lo.s32 %r11142, %r83, %r11135, %r11141; - mad.lo.s32 %r11143, %r84, %r11133, %r11142; - ld.const.v4.u8 {%rs7258, %rs7259, %rs7260, %rs7261}, [matrix+3628]; - cvt.u32.u16 %r11144, %rs7261; - cvt.s32.s8 %r11145, %r11144; - cvt.u32.u16 %r11146, %rs7260; - cvt.s32.s8 %r11147, %r11146; - cvt.u32.u16 %r11148, %rs7259; - cvt.s32.s8 %r11149, %r11148; - cvt.u32.u16 %r11150, %rs7258; - cvt.s32.s8 %r11151, %r11150; - mad.lo.s32 %r11152, %r86, %r11151, %r11143; - mad.lo.s32 %r11153, %r87, %r11149, %r11152; - mad.lo.s32 %r11154, %r88, %r11147, %r11153; - mad.lo.s32 %r11155, %r89, %r11145, %r11154; - ld.const.v4.u8 {%rs7266, %rs7267, %rs7268, %rs7269}, [matrix+3632]; - cvt.u32.u16 %r11156, %rs7269; - cvt.s32.s8 %r11157, %r11156; - cvt.u32.u16 %r11158, %rs7268; - cvt.s32.s8 %r11159, %r11158; - cvt.u32.u16 %r11160, %rs7267; - cvt.s32.s8 %r11161, %r11160; - cvt.u32.u16 %r11162, %rs7266; - cvt.s32.s8 %r11163, %r11162; - mad.lo.s32 %r11164, %r271, %r11163, %r11155; - mad.lo.s32 %r11165, %r91, %r11161, %r11164; - mad.lo.s32 %r11166, %r93, %r11159, %r11165; - mad.lo.s32 %r11167, %r94, %r11157, %r11166; - ld.const.v4.u8 {%rs7274, %rs7275, %rs7276, %rs7277}, [matrix+3636]; - cvt.u32.u16 %r11168, %rs7277; - cvt.s32.s8 %r11169, %r11168; - cvt.u32.u16 %r11170, %rs7276; - cvt.s32.s8 %r11171, %r11170; - cvt.u32.u16 %r11172, %rs7275; - cvt.s32.s8 %r11173, %r11172; - cvt.u32.u16 %r11174, %rs7274; - cvt.s32.s8 %r11175, %r11174; - mad.lo.s32 %r11176, %r96, %r11175, %r11167; - mad.lo.s32 %r11177, %r97, %r11173, %r11176; - mad.lo.s32 %r11178, %r99, %r11171, %r11177; - mad.lo.s32 %r11179, %r100, %r11169, %r11178; - ld.const.v4.u8 {%rs7282, %rs7283, %rs7284, %rs7285}, [matrix+3640]; - cvt.u32.u16 %r11180, %rs7285; - cvt.s32.s8 %r11181, %r11180; - cvt.u32.u16 %r11182, %rs7284; - cvt.s32.s8 %r11183, %r11182; - cvt.u32.u16 %r11184, %rs7283; - cvt.s32.s8 %r11185, %r11184; - cvt.u32.u16 %r11186, %rs7282; - cvt.s32.s8 %r11187, %r11186; - mad.lo.s32 %r11188, %r103, %r11187, %r11179; - mad.lo.s32 %r11189, %r104, %r11185, %r11188; - mad.lo.s32 %r11190, %r107, %r11183, %r11189; - mad.lo.s32 %r11191, %r108, %r11181, %r11190; - ld.const.v4.u8 {%rs7290, %rs7291, %rs7292, %rs7293}, [matrix+3644]; - cvt.u32.u16 %r11192, %rs7293; - cvt.s32.s8 %r11193, %r11192; - cvt.u32.u16 %r11194, %rs7292; - cvt.s32.s8 %r11195, %r11194; - cvt.u32.u16 %r11196, %rs7291; - cvt.s32.s8 %r11197, %r11196; - cvt.u32.u16 %r11198, %rs7290; - cvt.s32.s8 %r11199, %r11198; - mad.lo.s32 %r11200, %r111, %r11199, %r11191; - mad.lo.s32 %r11201, %r112, %r11197, %r11200; - mad.lo.s32 %r11202, %r114, %r11195, %r11201; - mad.lo.s32 %r11203, %r115, %r11193, %r11202; - ld.const.v4.u8 {%rs7298, %rs7299, %rs7300, %rs7301}, [matrix+3648]; - cvt.u32.u16 %r11204, %rs7301; - cvt.s32.s8 %r11205, %r11204; - cvt.u32.u16 %r11206, %rs7300; - cvt.s32.s8 %r11207, %r11206; - cvt.u32.u16 %r11208, 
%rs7298; - cvt.s32.s8 %r11209, %r11208; - cvt.u32.u16 %r11210, %rs7299; - cvt.s32.s8 %r11211, %r11210; - mul.lo.s32 %r11212, %r34, %r11211; - mad.lo.s32 %r11213, %r124, %r11209, %r11212; - mad.lo.s32 %r11214, %r35, %r11207, %r11213; - mad.lo.s32 %r11215, %r36, %r11205, %r11214; - ld.const.v4.u8 {%rs7306, %rs7307, %rs7308, %rs7309}, [matrix+3652]; - cvt.u32.u16 %r11216, %rs7309; - cvt.s32.s8 %r11217, %r11216; - cvt.u32.u16 %r11218, %rs7308; - cvt.s32.s8 %r11219, %r11218; - cvt.u32.u16 %r11220, %rs7307; - cvt.s32.s8 %r11221, %r11220; - cvt.u32.u16 %r11222, %rs7306; - cvt.s32.s8 %r11223, %r11222; - mad.lo.s32 %r11224, %r37, %r11223, %r11215; - mad.lo.s32 %r11225, %r38, %r11221, %r11224; - mad.lo.s32 %r11226, %r39, %r11219, %r11225; - mad.lo.s32 %r11227, %r40, %r11217, %r11226; - ld.const.v4.u8 {%rs7314, %rs7315, %rs7316, %rs7317}, [matrix+3656]; - cvt.u32.u16 %r11228, %rs7317; - cvt.s32.s8 %r11229, %r11228; - cvt.u32.u16 %r11230, %rs7316; - cvt.s32.s8 %r11231, %r11230; - cvt.u32.u16 %r11232, %rs7315; - cvt.s32.s8 %r11233, %r11232; - cvt.u32.u16 %r11234, %rs7314; - cvt.s32.s8 %r11235, %r11234; - mad.lo.s32 %r11236, %r42, %r11235, %r11227; - mad.lo.s32 %r11237, %r43, %r11233, %r11236; - mad.lo.s32 %r11238, %r45, %r11231, %r11237; - mad.lo.s32 %r11239, %r46, %r11229, %r11238; - ld.const.v4.u8 {%rs7322, %rs7323, %rs7324, %rs7325}, [matrix+3660]; - cvt.u32.u16 %r11240, %rs7325; - cvt.s32.s8 %r11241, %r11240; - cvt.u32.u16 %r11242, %rs7324; - cvt.s32.s8 %r11243, %r11242; - cvt.u32.u16 %r11244, %rs7323; - cvt.s32.s8 %r11245, %r11244; - cvt.u32.u16 %r11246, %rs7322; - cvt.s32.s8 %r11247, %r11246; - mad.lo.s32 %r11248, %r48, %r11247, %r11239; - mad.lo.s32 %r11249, %r49, %r11245, %r11248; - mad.lo.s32 %r11250, %r50, %r11243, %r11249; - mad.lo.s32 %r11251, %r51, %r11241, %r11250; - ld.const.v4.u8 {%rs7330, %rs7331, %rs7332, %rs7333}, [matrix+3664]; - cvt.u32.u16 %r11252, %rs7333; - cvt.s32.s8 %r11253, %r11252; - cvt.u32.u16 %r11254, %rs7332; - cvt.s32.s8 %r11255, %r11254; - cvt.u32.u16 %r11256, %rs7331; - cvt.s32.s8 %r11257, %r11256; - cvt.u32.u16 %r11258, %rs7330; - cvt.s32.s8 %r11259, %r11258; - mad.lo.s32 %r11260, %r173, %r11259, %r11251; - mad.lo.s32 %r11261, %r53, %r11257, %r11260; - mad.lo.s32 %r11262, %r54, %r11255, %r11261; - mad.lo.s32 %r11263, %r55, %r11253, %r11262; - ld.const.v4.u8 {%rs7338, %rs7339, %rs7340, %rs7341}, [matrix+3668]; - cvt.u32.u16 %r11264, %rs7341; - cvt.s32.s8 %r11265, %r11264; - cvt.u32.u16 %r11266, %rs7340; - cvt.s32.s8 %r11267, %r11266; - cvt.u32.u16 %r11268, %rs7339; - cvt.s32.s8 %r11269, %r11268; - cvt.u32.u16 %r11270, %rs7338; - cvt.s32.s8 %r11271, %r11270; - mad.lo.s32 %r11272, %r56, %r11271, %r11263; - mad.lo.s32 %r11273, %r57, %r11269, %r11272; - mad.lo.s32 %r11274, %r58, %r11267, %r11273; - mad.lo.s32 %r11275, %r59, %r11265, %r11274; - ld.const.v4.u8 {%rs7346, %rs7347, %rs7348, %rs7349}, [matrix+3672]; - cvt.u32.u16 %r11276, %rs7349; - cvt.s32.s8 %r11277, %r11276; - cvt.u32.u16 %r11278, %rs7348; - cvt.s32.s8 %r11279, %r11278; - cvt.u32.u16 %r11280, %rs7347; - cvt.s32.s8 %r11281, %r11280; - cvt.u32.u16 %r11282, %rs7346; - cvt.s32.s8 %r11283, %r11282; - mad.lo.s32 %r11284, %r61, %r11283, %r11275; - mad.lo.s32 %r11285, %r62, %r11281, %r11284; - mad.lo.s32 %r11286, %r64, %r11279, %r11285; - mad.lo.s32 %r11287, %r65, %r11277, %r11286; - ld.const.v4.u8 {%rs7354, %rs7355, %rs7356, %rs7357}, [matrix+3676]; - cvt.u32.u16 %r11288, %rs7357; - cvt.s32.s8 %r11289, %r11288; - cvt.u32.u16 %r11290, %rs7356; - cvt.s32.s8 %r11291, %r11290; - cvt.u32.u16 %r11292, %rs7355; - 
cvt.s32.s8 %r11293, %r11292; - cvt.u32.u16 %r11294, %rs7354; - cvt.s32.s8 %r11295, %r11294; - mad.lo.s32 %r11296, %r67, %r11295, %r11287; - mad.lo.s32 %r11297, %r68, %r11293, %r11296; - mad.lo.s32 %r11298, %r69, %r11291, %r11297; - mad.lo.s32 %r11299, %r70, %r11289, %r11298; - ld.const.v4.u8 {%rs7362, %rs7363, %rs7364, %rs7365}, [matrix+3680]; - cvt.u32.u16 %r11300, %rs7365; - cvt.s32.s8 %r11301, %r11300; - cvt.u32.u16 %r11302, %rs7364; - cvt.s32.s8 %r11303, %r11302; - cvt.u32.u16 %r11304, %rs7363; - cvt.s32.s8 %r11305, %r11304; - cvt.u32.u16 %r11306, %rs7362; - cvt.s32.s8 %r11307, %r11306; - mad.lo.s32 %r11308, %r222, %r11307, %r11299; - mad.lo.s32 %r11309, %r72, %r11305, %r11308; - mad.lo.s32 %r11310, %r73, %r11303, %r11309; - mad.lo.s32 %r11311, %r74, %r11301, %r11310; - ld.const.v4.u8 {%rs7370, %rs7371, %rs7372, %rs7373}, [matrix+3684]; - cvt.u32.u16 %r11312, %rs7373; - cvt.s32.s8 %r11313, %r11312; - cvt.u32.u16 %r11314, %rs7372; - cvt.s32.s8 %r11315, %r11314; - cvt.u32.u16 %r11316, %rs7371; - cvt.s32.s8 %r11317, %r11316; - cvt.u32.u16 %r11318, %rs7370; - cvt.s32.s8 %r11319, %r11318; - mad.lo.s32 %r11320, %r75, %r11319, %r11311; - mad.lo.s32 %r11321, %r76, %r11317, %r11320; - mad.lo.s32 %r11322, %r77, %r11315, %r11321; - mad.lo.s32 %r11323, %r78, %r11313, %r11322; - ld.const.v4.u8 {%rs7378, %rs7379, %rs7380, %rs7381}, [matrix+3688]; - cvt.u32.u16 %r11324, %rs7381; - cvt.s32.s8 %r11325, %r11324; - cvt.u32.u16 %r11326, %rs7380; - cvt.s32.s8 %r11327, %r11326; - cvt.u32.u16 %r11328, %rs7379; - cvt.s32.s8 %r11329, %r11328; - cvt.u32.u16 %r11330, %rs7378; - cvt.s32.s8 %r11331, %r11330; - mad.lo.s32 %r11332, %r80, %r11331, %r11323; - mad.lo.s32 %r11333, %r81, %r11329, %r11332; - mad.lo.s32 %r11334, %r83, %r11327, %r11333; - mad.lo.s32 %r11335, %r84, %r11325, %r11334; - ld.const.v4.u8 {%rs7386, %rs7387, %rs7388, %rs7389}, [matrix+3692]; - cvt.u32.u16 %r11336, %rs7389; - cvt.s32.s8 %r11337, %r11336; - cvt.u32.u16 %r11338, %rs7388; - cvt.s32.s8 %r11339, %r11338; - cvt.u32.u16 %r11340, %rs7387; - cvt.s32.s8 %r11341, %r11340; - cvt.u32.u16 %r11342, %rs7386; - cvt.s32.s8 %r11343, %r11342; - mad.lo.s32 %r11344, %r86, %r11343, %r11335; - mad.lo.s32 %r11345, %r87, %r11341, %r11344; - mad.lo.s32 %r11346, %r88, %r11339, %r11345; - mad.lo.s32 %r11347, %r89, %r11337, %r11346; - ld.const.v4.u8 {%rs7394, %rs7395, %rs7396, %rs7397}, [matrix+3696]; - cvt.u32.u16 %r11348, %rs7397; - cvt.s32.s8 %r11349, %r11348; - cvt.u32.u16 %r11350, %rs7396; - cvt.s32.s8 %r11351, %r11350; - cvt.u32.u16 %r11352, %rs7395; - cvt.s32.s8 %r11353, %r11352; - cvt.u32.u16 %r11354, %rs7394; - cvt.s32.s8 %r11355, %r11354; - mad.lo.s32 %r11356, %r271, %r11355, %r11347; - mad.lo.s32 %r11357, %r91, %r11353, %r11356; - mad.lo.s32 %r11358, %r93, %r11351, %r11357; - mad.lo.s32 %r11359, %r94, %r11349, %r11358; - ld.const.v4.u8 {%rs7402, %rs7403, %rs7404, %rs7405}, [matrix+3700]; - cvt.u32.u16 %r11360, %rs7405; - cvt.s32.s8 %r11361, %r11360; - cvt.u32.u16 %r11362, %rs7404; - cvt.s32.s8 %r11363, %r11362; - cvt.u32.u16 %r11364, %rs7403; - cvt.s32.s8 %r11365, %r11364; - cvt.u32.u16 %r11366, %rs7402; - cvt.s32.s8 %r11367, %r11366; - mad.lo.s32 %r11368, %r96, %r11367, %r11359; - mad.lo.s32 %r11369, %r97, %r11365, %r11368; - mad.lo.s32 %r11370, %r99, %r11363, %r11369; - mad.lo.s32 %r11371, %r100, %r11361, %r11370; - ld.const.v4.u8 {%rs7410, %rs7411, %rs7412, %rs7413}, [matrix+3704]; - cvt.u32.u16 %r11372, %rs7413; - cvt.s32.s8 %r11373, %r11372; - cvt.u32.u16 %r11374, %rs7412; - cvt.s32.s8 %r11375, %r11374; - cvt.u32.u16 %r11376, %rs7411; - 
cvt.s32.s8 %r11377, %r11376; - cvt.u32.u16 %r11378, %rs7410; - cvt.s32.s8 %r11379, %r11378; - mad.lo.s32 %r11380, %r103, %r11379, %r11371; - mad.lo.s32 %r11381, %r104, %r11377, %r11380; - mad.lo.s32 %r11382, %r107, %r11375, %r11381; - mad.lo.s32 %r11383, %r108, %r11373, %r11382; - ld.const.v4.u8 {%rs7418, %rs7419, %rs7420, %rs7421}, [matrix+3708]; - cvt.u32.u16 %r11384, %rs7421; - cvt.s32.s8 %r11385, %r11384; - cvt.u32.u16 %r11386, %rs7420; - cvt.s32.s8 %r11387, %r11386; - cvt.u32.u16 %r11388, %rs7419; - cvt.s32.s8 %r11389, %r11388; - cvt.u32.u16 %r11390, %rs7418; - cvt.s32.s8 %r11391, %r11390; - mad.lo.s32 %r11392, %r111, %r11391, %r11383; - mad.lo.s32 %r11393, %r112, %r11389, %r11392; - mad.lo.s32 %r11394, %r114, %r11387, %r11393; - mad.lo.s32 %r11395, %r115, %r11385, %r11394; - shr.u32 %r11396, %r11203, 6; - and.b32 %r11397, %r11396, 240; - shr.u32 %r11398, %r11395, 10; - or.b32 %r11399, %r11398, %r11397; - xor.b32 %r11400, %r101, %r11399; - cvt.u64.u32 %rd402, %r11400; - and.b64 %rd403, %rd402, 255; - ld.const.v4.u8 {%rs7426, %rs7427, %rs7428, %rs7429}, [matrix+3712]; - cvt.u32.u16 %r11401, %rs7429; - cvt.s32.s8 %r11402, %r11401; - cvt.u32.u16 %r11403, %rs7428; - cvt.s32.s8 %r11404, %r11403; - cvt.u32.u16 %r11405, %rs7426; - cvt.s32.s8 %r11406, %r11405; - cvt.u32.u16 %r11407, %rs7427; - cvt.s32.s8 %r11408, %r11407; - mul.lo.s32 %r11409, %r34, %r11408; - mad.lo.s32 %r11410, %r124, %r11406, %r11409; - mad.lo.s32 %r11411, %r35, %r11404, %r11410; - mad.lo.s32 %r11412, %r36, %r11402, %r11411; - ld.const.v4.u8 {%rs7434, %rs7435, %rs7436, %rs7437}, [matrix+3716]; - cvt.u32.u16 %r11413, %rs7437; - cvt.s32.s8 %r11414, %r11413; - cvt.u32.u16 %r11415, %rs7436; - cvt.s32.s8 %r11416, %r11415; - cvt.u32.u16 %r11417, %rs7435; - cvt.s32.s8 %r11418, %r11417; - cvt.u32.u16 %r11419, %rs7434; - cvt.s32.s8 %r11420, %r11419; - mad.lo.s32 %r11421, %r37, %r11420, %r11412; - mad.lo.s32 %r11422, %r38, %r11418, %r11421; - mad.lo.s32 %r11423, %r39, %r11416, %r11422; - mad.lo.s32 %r11424, %r40, %r11414, %r11423; - ld.const.v4.u8 {%rs7442, %rs7443, %rs7444, %rs7445}, [matrix+3720]; - cvt.u32.u16 %r11425, %rs7445; - cvt.s32.s8 %r11426, %r11425; - cvt.u32.u16 %r11427, %rs7444; - cvt.s32.s8 %r11428, %r11427; - cvt.u32.u16 %r11429, %rs7443; - cvt.s32.s8 %r11430, %r11429; - cvt.u32.u16 %r11431, %rs7442; - cvt.s32.s8 %r11432, %r11431; - mad.lo.s32 %r11433, %r42, %r11432, %r11424; - mad.lo.s32 %r11434, %r43, %r11430, %r11433; - mad.lo.s32 %r11435, %r45, %r11428, %r11434; - mad.lo.s32 %r11436, %r46, %r11426, %r11435; - ld.const.v4.u8 {%rs7450, %rs7451, %rs7452, %rs7453}, [matrix+3724]; - cvt.u32.u16 %r11437, %rs7453; - cvt.s32.s8 %r11438, %r11437; - cvt.u32.u16 %r11439, %rs7452; - cvt.s32.s8 %r11440, %r11439; - cvt.u32.u16 %r11441, %rs7451; - cvt.s32.s8 %r11442, %r11441; - cvt.u32.u16 %r11443, %rs7450; - cvt.s32.s8 %r11444, %r11443; - mad.lo.s32 %r11445, %r48, %r11444, %r11436; - mad.lo.s32 %r11446, %r49, %r11442, %r11445; - mad.lo.s32 %r11447, %r50, %r11440, %r11446; - mad.lo.s32 %r11448, %r51, %r11438, %r11447; - ld.const.v4.u8 {%rs7458, %rs7459, %rs7460, %rs7461}, [matrix+3728]; - cvt.u32.u16 %r11449, %rs7461; - cvt.s32.s8 %r11450, %r11449; - cvt.u32.u16 %r11451, %rs7460; - cvt.s32.s8 %r11452, %r11451; - cvt.u32.u16 %r11453, %rs7459; - cvt.s32.s8 %r11454, %r11453; - cvt.u32.u16 %r11455, %rs7458; - cvt.s32.s8 %r11456, %r11455; - mad.lo.s32 %r11457, %r173, %r11456, %r11448; - mad.lo.s32 %r11458, %r53, %r11454, %r11457; - mad.lo.s32 %r11459, %r54, %r11452, %r11458; - mad.lo.s32 %r11460, %r55, %r11450, %r11459; - 
ld.const.v4.u8 {%rs7466, %rs7467, %rs7468, %rs7469}, [matrix+3732]; - cvt.u32.u16 %r11461, %rs7469; - cvt.s32.s8 %r11462, %r11461; - cvt.u32.u16 %r11463, %rs7468; - cvt.s32.s8 %r11464, %r11463; - cvt.u32.u16 %r11465, %rs7467; - cvt.s32.s8 %r11466, %r11465; - cvt.u32.u16 %r11467, %rs7466; - cvt.s32.s8 %r11468, %r11467; - mad.lo.s32 %r11469, %r56, %r11468, %r11460; - mad.lo.s32 %r11470, %r57, %r11466, %r11469; - mad.lo.s32 %r11471, %r58, %r11464, %r11470; - mad.lo.s32 %r11472, %r59, %r11462, %r11471; - ld.const.v4.u8 {%rs7474, %rs7475, %rs7476, %rs7477}, [matrix+3736]; - cvt.u32.u16 %r11473, %rs7477; - cvt.s32.s8 %r11474, %r11473; - cvt.u32.u16 %r11475, %rs7476; - cvt.s32.s8 %r11476, %r11475; - cvt.u32.u16 %r11477, %rs7475; - cvt.s32.s8 %r11478, %r11477; - cvt.u32.u16 %r11479, %rs7474; - cvt.s32.s8 %r11480, %r11479; - mad.lo.s32 %r11481, %r61, %r11480, %r11472; - mad.lo.s32 %r11482, %r62, %r11478, %r11481; - mad.lo.s32 %r11483, %r64, %r11476, %r11482; - mad.lo.s32 %r11484, %r65, %r11474, %r11483; - ld.const.v4.u8 {%rs7482, %rs7483, %rs7484, %rs7485}, [matrix+3740]; - cvt.u32.u16 %r11485, %rs7485; - cvt.s32.s8 %r11486, %r11485; - cvt.u32.u16 %r11487, %rs7484; - cvt.s32.s8 %r11488, %r11487; - cvt.u32.u16 %r11489, %rs7483; - cvt.s32.s8 %r11490, %r11489; - cvt.u32.u16 %r11491, %rs7482; - cvt.s32.s8 %r11492, %r11491; - mad.lo.s32 %r11493, %r67, %r11492, %r11484; - mad.lo.s32 %r11494, %r68, %r11490, %r11493; - mad.lo.s32 %r11495, %r69, %r11488, %r11494; - mad.lo.s32 %r11496, %r70, %r11486, %r11495; - ld.const.v4.u8 {%rs7490, %rs7491, %rs7492, %rs7493}, [matrix+3744]; - cvt.u32.u16 %r11497, %rs7493; - cvt.s32.s8 %r11498, %r11497; - cvt.u32.u16 %r11499, %rs7492; - cvt.s32.s8 %r11500, %r11499; - cvt.u32.u16 %r11501, %rs7491; - cvt.s32.s8 %r11502, %r11501; - cvt.u32.u16 %r11503, %rs7490; - cvt.s32.s8 %r11504, %r11503; - mad.lo.s32 %r11505, %r222, %r11504, %r11496; - mad.lo.s32 %r11506, %r72, %r11502, %r11505; - mad.lo.s32 %r11507, %r73, %r11500, %r11506; - mad.lo.s32 %r11508, %r74, %r11498, %r11507; - ld.const.v4.u8 {%rs7498, %rs7499, %rs7500, %rs7501}, [matrix+3748]; - cvt.u32.u16 %r11509, %rs7501; - cvt.s32.s8 %r11510, %r11509; - cvt.u32.u16 %r11511, %rs7500; - cvt.s32.s8 %r11512, %r11511; - cvt.u32.u16 %r11513, %rs7499; - cvt.s32.s8 %r11514, %r11513; - cvt.u32.u16 %r11515, %rs7498; - cvt.s32.s8 %r11516, %r11515; - mad.lo.s32 %r11517, %r75, %r11516, %r11508; - mad.lo.s32 %r11518, %r76, %r11514, %r11517; - mad.lo.s32 %r11519, %r77, %r11512, %r11518; - mad.lo.s32 %r11520, %r78, %r11510, %r11519; - ld.const.v4.u8 {%rs7506, %rs7507, %rs7508, %rs7509}, [matrix+3752]; - cvt.u32.u16 %r11521, %rs7509; - cvt.s32.s8 %r11522, %r11521; - cvt.u32.u16 %r11523, %rs7508; - cvt.s32.s8 %r11524, %r11523; - cvt.u32.u16 %r11525, %rs7507; - cvt.s32.s8 %r11526, %r11525; - cvt.u32.u16 %r11527, %rs7506; - cvt.s32.s8 %r11528, %r11527; - mad.lo.s32 %r11529, %r80, %r11528, %r11520; - mad.lo.s32 %r11530, %r81, %r11526, %r11529; - mad.lo.s32 %r11531, %r83, %r11524, %r11530; - mad.lo.s32 %r11532, %r84, %r11522, %r11531; - ld.const.v4.u8 {%rs7514, %rs7515, %rs7516, %rs7517}, [matrix+3756]; - cvt.u32.u16 %r11533, %rs7517; - cvt.s32.s8 %r11534, %r11533; - cvt.u32.u16 %r11535, %rs7516; - cvt.s32.s8 %r11536, %r11535; - cvt.u32.u16 %r11537, %rs7515; - cvt.s32.s8 %r11538, %r11537; - cvt.u32.u16 %r11539, %rs7514; - cvt.s32.s8 %r11540, %r11539; - mad.lo.s32 %r11541, %r86, %r11540, %r11532; - mad.lo.s32 %r11542, %r87, %r11538, %r11541; - mad.lo.s32 %r11543, %r88, %r11536, %r11542; - mad.lo.s32 %r11544, %r89, %r11534, %r11543; - 
ld.const.v4.u8 {%rs7522, %rs7523, %rs7524, %rs7525}, [matrix+3760]; - cvt.u32.u16 %r11545, %rs7525; - cvt.s32.s8 %r11546, %r11545; - cvt.u32.u16 %r11547, %rs7524; - cvt.s32.s8 %r11548, %r11547; - cvt.u32.u16 %r11549, %rs7523; - cvt.s32.s8 %r11550, %r11549; - cvt.u32.u16 %r11551, %rs7522; - cvt.s32.s8 %r11552, %r11551; - mad.lo.s32 %r11553, %r271, %r11552, %r11544; - mad.lo.s32 %r11554, %r91, %r11550, %r11553; - mad.lo.s32 %r11555, %r93, %r11548, %r11554; - mad.lo.s32 %r11556, %r94, %r11546, %r11555; - ld.const.v4.u8 {%rs7530, %rs7531, %rs7532, %rs7533}, [matrix+3764]; - cvt.u32.u16 %r11557, %rs7533; - cvt.s32.s8 %r11558, %r11557; - cvt.u32.u16 %r11559, %rs7532; - cvt.s32.s8 %r11560, %r11559; - cvt.u32.u16 %r11561, %rs7531; - cvt.s32.s8 %r11562, %r11561; - cvt.u32.u16 %r11563, %rs7530; - cvt.s32.s8 %r11564, %r11563; - mad.lo.s32 %r11565, %r96, %r11564, %r11556; - mad.lo.s32 %r11566, %r97, %r11562, %r11565; - mad.lo.s32 %r11567, %r99, %r11560, %r11566; - mad.lo.s32 %r11568, %r100, %r11558, %r11567; - ld.const.v4.u8 {%rs7538, %rs7539, %rs7540, %rs7541}, [matrix+3768]; - cvt.u32.u16 %r11569, %rs7541; - cvt.s32.s8 %r11570, %r11569; - cvt.u32.u16 %r11571, %rs7540; - cvt.s32.s8 %r11572, %r11571; - cvt.u32.u16 %r11573, %rs7539; - cvt.s32.s8 %r11574, %r11573; - cvt.u32.u16 %r11575, %rs7538; - cvt.s32.s8 %r11576, %r11575; - mad.lo.s32 %r11577, %r103, %r11576, %r11568; - mad.lo.s32 %r11578, %r104, %r11574, %r11577; - mad.lo.s32 %r11579, %r107, %r11572, %r11578; - mad.lo.s32 %r11580, %r108, %r11570, %r11579; - ld.const.v4.u8 {%rs7546, %rs7547, %rs7548, %rs7549}, [matrix+3772]; - cvt.u32.u16 %r11581, %rs7549; - cvt.s32.s8 %r11582, %r11581; - cvt.u32.u16 %r11583, %rs7548; - cvt.s32.s8 %r11584, %r11583; - cvt.u32.u16 %r11585, %rs7547; - cvt.s32.s8 %r11586, %r11585; - cvt.u32.u16 %r11587, %rs7546; - cvt.s32.s8 %r11588, %r11587; - mad.lo.s32 %r11589, %r111, %r11588, %r11580; - mad.lo.s32 %r11590, %r112, %r11586, %r11589; - mad.lo.s32 %r11591, %r114, %r11584, %r11590; - mad.lo.s32 %r11592, %r115, %r11582, %r11591; - ld.const.v4.u8 {%rs7554, %rs7555, %rs7556, %rs7557}, [matrix+3776]; - cvt.u32.u16 %r11593, %rs7557; - cvt.s32.s8 %r11594, %r11593; - cvt.u32.u16 %r11595, %rs7556; - cvt.s32.s8 %r11596, %r11595; - cvt.u32.u16 %r11597, %rs7554; - cvt.s32.s8 %r11598, %r11597; - cvt.u32.u16 %r11599, %rs7555; - cvt.s32.s8 %r11600, %r11599; - mul.lo.s32 %r11601, %r34, %r11600; - mad.lo.s32 %r11602, %r124, %r11598, %r11601; - mad.lo.s32 %r11603, %r35, %r11596, %r11602; - mad.lo.s32 %r11604, %r36, %r11594, %r11603; - ld.const.v4.u8 {%rs7562, %rs7563, %rs7564, %rs7565}, [matrix+3780]; - cvt.u32.u16 %r11605, %rs7565; - cvt.s32.s8 %r11606, %r11605; - cvt.u32.u16 %r11607, %rs7564; - cvt.s32.s8 %r11608, %r11607; - cvt.u32.u16 %r11609, %rs7563; - cvt.s32.s8 %r11610, %r11609; - cvt.u32.u16 %r11611, %rs7562; - cvt.s32.s8 %r11612, %r11611; - mad.lo.s32 %r11613, %r37, %r11612, %r11604; - mad.lo.s32 %r11614, %r38, %r11610, %r11613; - mad.lo.s32 %r11615, %r39, %r11608, %r11614; - mad.lo.s32 %r11616, %r40, %r11606, %r11615; - ld.const.v4.u8 {%rs7570, %rs7571, %rs7572, %rs7573}, [matrix+3784]; - cvt.u32.u16 %r11617, %rs7573; - cvt.s32.s8 %r11618, %r11617; - cvt.u32.u16 %r11619, %rs7572; - cvt.s32.s8 %r11620, %r11619; - cvt.u32.u16 %r11621, %rs7571; - cvt.s32.s8 %r11622, %r11621; - cvt.u32.u16 %r11623, %rs7570; - cvt.s32.s8 %r11624, %r11623; - mad.lo.s32 %r11625, %r42, %r11624, %r11616; - mad.lo.s32 %r11626, %r43, %r11622, %r11625; - mad.lo.s32 %r11627, %r45, %r11620, %r11626; - mad.lo.s32 %r11628, %r46, %r11618, %r11627; - 
ld.const.v4.u8 {%rs7578, %rs7579, %rs7580, %rs7581}, [matrix+3788]; - cvt.u32.u16 %r11629, %rs7581; - cvt.s32.s8 %r11630, %r11629; - cvt.u32.u16 %r11631, %rs7580; - cvt.s32.s8 %r11632, %r11631; - cvt.u32.u16 %r11633, %rs7579; - cvt.s32.s8 %r11634, %r11633; - cvt.u32.u16 %r11635, %rs7578; - cvt.s32.s8 %r11636, %r11635; - mad.lo.s32 %r11637, %r48, %r11636, %r11628; - mad.lo.s32 %r11638, %r49, %r11634, %r11637; - mad.lo.s32 %r11639, %r50, %r11632, %r11638; - mad.lo.s32 %r11640, %r51, %r11630, %r11639; - ld.const.v4.u8 {%rs7586, %rs7587, %rs7588, %rs7589}, [matrix+3792]; - cvt.u32.u16 %r11641, %rs7589; - cvt.s32.s8 %r11642, %r11641; - cvt.u32.u16 %r11643, %rs7588; - cvt.s32.s8 %r11644, %r11643; - cvt.u32.u16 %r11645, %rs7587; - cvt.s32.s8 %r11646, %r11645; - cvt.u32.u16 %r11647, %rs7586; - cvt.s32.s8 %r11648, %r11647; - mad.lo.s32 %r11649, %r173, %r11648, %r11640; - mad.lo.s32 %r11650, %r53, %r11646, %r11649; - mad.lo.s32 %r11651, %r54, %r11644, %r11650; - mad.lo.s32 %r11652, %r55, %r11642, %r11651; - ld.const.v4.u8 {%rs7594, %rs7595, %rs7596, %rs7597}, [matrix+3796]; - cvt.u32.u16 %r11653, %rs7597; - cvt.s32.s8 %r11654, %r11653; - cvt.u32.u16 %r11655, %rs7596; - cvt.s32.s8 %r11656, %r11655; - cvt.u32.u16 %r11657, %rs7595; - cvt.s32.s8 %r11658, %r11657; - cvt.u32.u16 %r11659, %rs7594; - cvt.s32.s8 %r11660, %r11659; - mad.lo.s32 %r11661, %r56, %r11660, %r11652; - mad.lo.s32 %r11662, %r57, %r11658, %r11661; - mad.lo.s32 %r11663, %r58, %r11656, %r11662; - mad.lo.s32 %r11664, %r59, %r11654, %r11663; - ld.const.v4.u8 {%rs7602, %rs7603, %rs7604, %rs7605}, [matrix+3800]; - cvt.u32.u16 %r11665, %rs7605; - cvt.s32.s8 %r11666, %r11665; - cvt.u32.u16 %r11667, %rs7604; - cvt.s32.s8 %r11668, %r11667; - cvt.u32.u16 %r11669, %rs7603; - cvt.s32.s8 %r11670, %r11669; - cvt.u32.u16 %r11671, %rs7602; - cvt.s32.s8 %r11672, %r11671; - mad.lo.s32 %r11673, %r61, %r11672, %r11664; - mad.lo.s32 %r11674, %r62, %r11670, %r11673; - mad.lo.s32 %r11675, %r64, %r11668, %r11674; - mad.lo.s32 %r11676, %r65, %r11666, %r11675; - ld.const.v4.u8 {%rs7610, %rs7611, %rs7612, %rs7613}, [matrix+3804]; - cvt.u32.u16 %r11677, %rs7613; - cvt.s32.s8 %r11678, %r11677; - cvt.u32.u16 %r11679, %rs7612; - cvt.s32.s8 %r11680, %r11679; - cvt.u32.u16 %r11681, %rs7611; - cvt.s32.s8 %r11682, %r11681; - cvt.u32.u16 %r11683, %rs7610; - cvt.s32.s8 %r11684, %r11683; - mad.lo.s32 %r11685, %r67, %r11684, %r11676; - mad.lo.s32 %r11686, %r68, %r11682, %r11685; - mad.lo.s32 %r11687, %r69, %r11680, %r11686; - mad.lo.s32 %r11688, %r70, %r11678, %r11687; - ld.const.v4.u8 {%rs7618, %rs7619, %rs7620, %rs7621}, [matrix+3808]; - cvt.u32.u16 %r11689, %rs7621; - cvt.s32.s8 %r11690, %r11689; - cvt.u32.u16 %r11691, %rs7620; - cvt.s32.s8 %r11692, %r11691; - cvt.u32.u16 %r11693, %rs7619; - cvt.s32.s8 %r11694, %r11693; - cvt.u32.u16 %r11695, %rs7618; - cvt.s32.s8 %r11696, %r11695; - mad.lo.s32 %r11697, %r222, %r11696, %r11688; - mad.lo.s32 %r11698, %r72, %r11694, %r11697; - mad.lo.s32 %r11699, %r73, %r11692, %r11698; - mad.lo.s32 %r11700, %r74, %r11690, %r11699; - ld.const.v4.u8 {%rs7626, %rs7627, %rs7628, %rs7629}, [matrix+3812]; - cvt.u32.u16 %r11701, %rs7629; - cvt.s32.s8 %r11702, %r11701; - cvt.u32.u16 %r11703, %rs7628; - cvt.s32.s8 %r11704, %r11703; - cvt.u32.u16 %r11705, %rs7627; - cvt.s32.s8 %r11706, %r11705; - cvt.u32.u16 %r11707, %rs7626; - cvt.s32.s8 %r11708, %r11707; - mad.lo.s32 %r11709, %r75, %r11708, %r11700; - mad.lo.s32 %r11710, %r76, %r11706, %r11709; - mad.lo.s32 %r11711, %r77, %r11704, %r11710; - mad.lo.s32 %r11712, %r78, %r11702, %r11711; - 
ld.const.v4.u8 {%rs7634, %rs7635, %rs7636, %rs7637}, [matrix+3816]; - cvt.u32.u16 %r11713, %rs7637; - cvt.s32.s8 %r11714, %r11713; - cvt.u32.u16 %r11715, %rs7636; - cvt.s32.s8 %r11716, %r11715; - cvt.u32.u16 %r11717, %rs7635; - cvt.s32.s8 %r11718, %r11717; - cvt.u32.u16 %r11719, %rs7634; - cvt.s32.s8 %r11720, %r11719; - mad.lo.s32 %r11721, %r80, %r11720, %r11712; - mad.lo.s32 %r11722, %r81, %r11718, %r11721; - mad.lo.s32 %r11723, %r83, %r11716, %r11722; - mad.lo.s32 %r11724, %r84, %r11714, %r11723; - ld.const.v4.u8 {%rs7642, %rs7643, %rs7644, %rs7645}, [matrix+3820]; - cvt.u32.u16 %r11725, %rs7645; - cvt.s32.s8 %r11726, %r11725; - cvt.u32.u16 %r11727, %rs7644; - cvt.s32.s8 %r11728, %r11727; - cvt.u32.u16 %r11729, %rs7643; - cvt.s32.s8 %r11730, %r11729; - cvt.u32.u16 %r11731, %rs7642; - cvt.s32.s8 %r11732, %r11731; - mad.lo.s32 %r11733, %r86, %r11732, %r11724; - mad.lo.s32 %r11734, %r87, %r11730, %r11733; - mad.lo.s32 %r11735, %r88, %r11728, %r11734; - mad.lo.s32 %r11736, %r89, %r11726, %r11735; - ld.const.v4.u8 {%rs7650, %rs7651, %rs7652, %rs7653}, [matrix+3824]; - cvt.u32.u16 %r11737, %rs7653; - cvt.s32.s8 %r11738, %r11737; - cvt.u32.u16 %r11739, %rs7652; - cvt.s32.s8 %r11740, %r11739; - cvt.u32.u16 %r11741, %rs7651; - cvt.s32.s8 %r11742, %r11741; - cvt.u32.u16 %r11743, %rs7650; - cvt.s32.s8 %r11744, %r11743; - mad.lo.s32 %r11745, %r271, %r11744, %r11736; - mad.lo.s32 %r11746, %r91, %r11742, %r11745; - mad.lo.s32 %r11747, %r93, %r11740, %r11746; - mad.lo.s32 %r11748, %r94, %r11738, %r11747; - ld.const.v4.u8 {%rs7658, %rs7659, %rs7660, %rs7661}, [matrix+3828]; - cvt.u32.u16 %r11749, %rs7661; - cvt.s32.s8 %r11750, %r11749; - cvt.u32.u16 %r11751, %rs7660; - cvt.s32.s8 %r11752, %r11751; - cvt.u32.u16 %r11753, %rs7659; - cvt.s32.s8 %r11754, %r11753; - cvt.u32.u16 %r11755, %rs7658; - cvt.s32.s8 %r11756, %r11755; - mad.lo.s32 %r11757, %r96, %r11756, %r11748; - mad.lo.s32 %r11758, %r97, %r11754, %r11757; - mad.lo.s32 %r11759, %r99, %r11752, %r11758; - mad.lo.s32 %r11760, %r100, %r11750, %r11759; - ld.const.v4.u8 {%rs7666, %rs7667, %rs7668, %rs7669}, [matrix+3832]; - cvt.u32.u16 %r11761, %rs7669; - cvt.s32.s8 %r11762, %r11761; - cvt.u32.u16 %r11763, %rs7668; - cvt.s32.s8 %r11764, %r11763; - cvt.u32.u16 %r11765, %rs7667; - cvt.s32.s8 %r11766, %r11765; - cvt.u32.u16 %r11767, %rs7666; - cvt.s32.s8 %r11768, %r11767; - mad.lo.s32 %r11769, %r103, %r11768, %r11760; - mad.lo.s32 %r11770, %r104, %r11766, %r11769; - mad.lo.s32 %r11771, %r107, %r11764, %r11770; - mad.lo.s32 %r11772, %r108, %r11762, %r11771; - ld.const.v4.u8 {%rs7674, %rs7675, %rs7676, %rs7677}, [matrix+3836]; - cvt.u32.u16 %r11773, %rs7677; - cvt.s32.s8 %r11774, %r11773; - cvt.u32.u16 %r11775, %rs7676; - cvt.s32.s8 %r11776, %r11775; - cvt.u32.u16 %r11777, %rs7675; - cvt.s32.s8 %r11778, %r11777; - cvt.u32.u16 %r11779, %rs7674; - cvt.s32.s8 %r11780, %r11779; - mad.lo.s32 %r11781, %r111, %r11780, %r11772; - mad.lo.s32 %r11782, %r112, %r11778, %r11781; - mad.lo.s32 %r11783, %r114, %r11776, %r11782; - mad.lo.s32 %r11784, %r115, %r11774, %r11783; - shr.u32 %r11785, %r11592, 6; - and.b32 %r11786, %r11785, 240; - shr.u32 %r11787, %r11784, 10; - or.b32 %r11788, %r11787, %r11786; - xor.b32 %r11789, %r105, %r11788; - cvt.u64.u32 %rd404, %r11789; - ld.const.v4.u8 {%rs7682, %rs7683, %rs7684, %rs7685}, [matrix+3840]; - cvt.u32.u16 %r11790, %rs7685; - cvt.s32.s8 %r11791, %r11790; - cvt.u32.u16 %r11792, %rs7684; - cvt.s32.s8 %r11793, %r11792; - cvt.u32.u16 %r11794, %rs7682; - cvt.s32.s8 %r11795, %r11794; - cvt.u32.u16 %r11796, %rs7683; - cvt.s32.s8 %r11797, 
%r11796; - mul.lo.s32 %r11798, %r34, %r11797; - mad.lo.s32 %r11799, %r124, %r11795, %r11798; - mad.lo.s32 %r11800, %r35, %r11793, %r11799; - mad.lo.s32 %r11801, %r36, %r11791, %r11800; - ld.const.v4.u8 {%rs7690, %rs7691, %rs7692, %rs7693}, [matrix+3844]; - cvt.u32.u16 %r11802, %rs7693; - cvt.s32.s8 %r11803, %r11802; - cvt.u32.u16 %r11804, %rs7692; - cvt.s32.s8 %r11805, %r11804; - cvt.u32.u16 %r11806, %rs7691; - cvt.s32.s8 %r11807, %r11806; - cvt.u32.u16 %r11808, %rs7690; - cvt.s32.s8 %r11809, %r11808; - mad.lo.s32 %r11810, %r37, %r11809, %r11801; - mad.lo.s32 %r11811, %r38, %r11807, %r11810; - mad.lo.s32 %r11812, %r39, %r11805, %r11811; - mad.lo.s32 %r11813, %r40, %r11803, %r11812; - ld.const.v4.u8 {%rs7698, %rs7699, %rs7700, %rs7701}, [matrix+3848]; - cvt.u32.u16 %r11814, %rs7701; - cvt.s32.s8 %r11815, %r11814; - cvt.u32.u16 %r11816, %rs7700; - cvt.s32.s8 %r11817, %r11816; - cvt.u32.u16 %r11818, %rs7699; - cvt.s32.s8 %r11819, %r11818; - cvt.u32.u16 %r11820, %rs7698; - cvt.s32.s8 %r11821, %r11820; - mad.lo.s32 %r11822, %r42, %r11821, %r11813; - mad.lo.s32 %r11823, %r43, %r11819, %r11822; - mad.lo.s32 %r11824, %r45, %r11817, %r11823; - mad.lo.s32 %r11825, %r46, %r11815, %r11824; - ld.const.v4.u8 {%rs7706, %rs7707, %rs7708, %rs7709}, [matrix+3852]; - cvt.u32.u16 %r11826, %rs7709; - cvt.s32.s8 %r11827, %r11826; - cvt.u32.u16 %r11828, %rs7708; - cvt.s32.s8 %r11829, %r11828; - cvt.u32.u16 %r11830, %rs7707; - cvt.s32.s8 %r11831, %r11830; - cvt.u32.u16 %r11832, %rs7706; - cvt.s32.s8 %r11833, %r11832; - mad.lo.s32 %r11834, %r48, %r11833, %r11825; - mad.lo.s32 %r11835, %r49, %r11831, %r11834; - mad.lo.s32 %r11836, %r50, %r11829, %r11835; - mad.lo.s32 %r11837, %r51, %r11827, %r11836; - ld.const.v4.u8 {%rs7714, %rs7715, %rs7716, %rs7717}, [matrix+3856]; - cvt.u32.u16 %r11838, %rs7717; - cvt.s32.s8 %r11839, %r11838; - cvt.u32.u16 %r11840, %rs7716; - cvt.s32.s8 %r11841, %r11840; - cvt.u32.u16 %r11842, %rs7715; - cvt.s32.s8 %r11843, %r11842; - cvt.u32.u16 %r11844, %rs7714; - cvt.s32.s8 %r11845, %r11844; - mad.lo.s32 %r11846, %r173, %r11845, %r11837; - mad.lo.s32 %r11847, %r53, %r11843, %r11846; - mad.lo.s32 %r11848, %r54, %r11841, %r11847; - mad.lo.s32 %r11849, %r55, %r11839, %r11848; - ld.const.v4.u8 {%rs7722, %rs7723, %rs7724, %rs7725}, [matrix+3860]; - cvt.u32.u16 %r11850, %rs7725; - cvt.s32.s8 %r11851, %r11850; - cvt.u32.u16 %r11852, %rs7724; - cvt.s32.s8 %r11853, %r11852; - cvt.u32.u16 %r11854, %rs7723; - cvt.s32.s8 %r11855, %r11854; - cvt.u32.u16 %r11856, %rs7722; - cvt.s32.s8 %r11857, %r11856; - mad.lo.s32 %r11858, %r56, %r11857, %r11849; - mad.lo.s32 %r11859, %r57, %r11855, %r11858; - mad.lo.s32 %r11860, %r58, %r11853, %r11859; - mad.lo.s32 %r11861, %r59, %r11851, %r11860; - ld.const.v4.u8 {%rs7730, %rs7731, %rs7732, %rs7733}, [matrix+3864]; - cvt.u32.u16 %r11862, %rs7733; - cvt.s32.s8 %r11863, %r11862; - cvt.u32.u16 %r11864, %rs7732; - cvt.s32.s8 %r11865, %r11864; - cvt.u32.u16 %r11866, %rs7731; - cvt.s32.s8 %r11867, %r11866; - cvt.u32.u16 %r11868, %rs7730; - cvt.s32.s8 %r11869, %r11868; - mad.lo.s32 %r11870, %r61, %r11869, %r11861; - mad.lo.s32 %r11871, %r62, %r11867, %r11870; - mad.lo.s32 %r11872, %r64, %r11865, %r11871; - mad.lo.s32 %r11873, %r65, %r11863, %r11872; - ld.const.v4.u8 {%rs7738, %rs7739, %rs7740, %rs7741}, [matrix+3868]; - cvt.u32.u16 %r11874, %rs7741; - cvt.s32.s8 %r11875, %r11874; - cvt.u32.u16 %r11876, %rs7740; - cvt.s32.s8 %r11877, %r11876; - cvt.u32.u16 %r11878, %rs7739; - cvt.s32.s8 %r11879, %r11878; - cvt.u32.u16 %r11880, %rs7738; - cvt.s32.s8 %r11881, %r11880; - 
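
Each 12-instruction group in the deleted hunks above follows one template: an ld.const.v4.u8 pulls four matrix bytes from constant memory, four cvt.u32.u16/cvt.s32.s8 pairs sign-extend them to 32 bits, and four mad.lo.s32 instructions fold them into a running dot product against 4-bit digits of the hash. A minimal CUDA sketch of one such 4-wide slice, assuming the 4096-byte constant buffer holds signed bytes as the sign-extensions suggest (names like dot4, row_sum, and nib are illustrative, not from the source):

    // One 4-wide slice of a row dot product. Assumes the constant buffer
    // declared as `matrix` holds signed bytes, as the cvt.s32.s8
    // sign-extensions in the PTX imply.
    __constant__ signed char matrix[4096];

    __device__ int dot4(int row_sum, const unsigned char* nib, int base) {
        // ld.const.v4.u8: one vectorized 4-byte load (base must be 4-aligned).
        const char4 m = *reinterpret_cast<const char4*>(&matrix[base]);
        // cvt.s32.s8 + mad.lo.s32: sign-extend, multiply, accumulate.
        row_sum += nib[0] * (int)m.x;
        row_sum += nib[1] * (int)m.y;
        row_sum += nib[2] * (int)m.z;
        row_sum += nib[3] * (int)m.w;
        return row_sum;
    }
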
mad.lo.s32 %r11882, %r67, %r11881, %r11873; - mad.lo.s32 %r11883, %r68, %r11879, %r11882; - mad.lo.s32 %r11884, %r69, %r11877, %r11883; - mad.lo.s32 %r11885, %r70, %r11875, %r11884; - ld.const.v4.u8 {%rs7746, %rs7747, %rs7748, %rs7749}, [matrix+3872]; - cvt.u32.u16 %r11886, %rs7749; - cvt.s32.s8 %r11887, %r11886; - cvt.u32.u16 %r11888, %rs7748; - cvt.s32.s8 %r11889, %r11888; - cvt.u32.u16 %r11890, %rs7747; - cvt.s32.s8 %r11891, %r11890; - cvt.u32.u16 %r11892, %rs7746; - cvt.s32.s8 %r11893, %r11892; - mad.lo.s32 %r11894, %r222, %r11893, %r11885; - mad.lo.s32 %r11895, %r72, %r11891, %r11894; - mad.lo.s32 %r11896, %r73, %r11889, %r11895; - mad.lo.s32 %r11897, %r74, %r11887, %r11896; - ld.const.v4.u8 {%rs7754, %rs7755, %rs7756, %rs7757}, [matrix+3876]; - cvt.u32.u16 %r11898, %rs7757; - cvt.s32.s8 %r11899, %r11898; - cvt.u32.u16 %r11900, %rs7756; - cvt.s32.s8 %r11901, %r11900; - cvt.u32.u16 %r11902, %rs7755; - cvt.s32.s8 %r11903, %r11902; - cvt.u32.u16 %r11904, %rs7754; - cvt.s32.s8 %r11905, %r11904; - mad.lo.s32 %r11906, %r75, %r11905, %r11897; - mad.lo.s32 %r11907, %r76, %r11903, %r11906; - mad.lo.s32 %r11908, %r77, %r11901, %r11907; - mad.lo.s32 %r11909, %r78, %r11899, %r11908; - ld.const.v4.u8 {%rs7762, %rs7763, %rs7764, %rs7765}, [matrix+3880]; - cvt.u32.u16 %r11910, %rs7765; - cvt.s32.s8 %r11911, %r11910; - cvt.u32.u16 %r11912, %rs7764; - cvt.s32.s8 %r11913, %r11912; - cvt.u32.u16 %r11914, %rs7763; - cvt.s32.s8 %r11915, %r11914; - cvt.u32.u16 %r11916, %rs7762; - cvt.s32.s8 %r11917, %r11916; - mad.lo.s32 %r11918, %r80, %r11917, %r11909; - mad.lo.s32 %r11919, %r81, %r11915, %r11918; - mad.lo.s32 %r11920, %r83, %r11913, %r11919; - mad.lo.s32 %r11921, %r84, %r11911, %r11920; - ld.const.v4.u8 {%rs7770, %rs7771, %rs7772, %rs7773}, [matrix+3884]; - cvt.u32.u16 %r11922, %rs7773; - cvt.s32.s8 %r11923, %r11922; - cvt.u32.u16 %r11924, %rs7772; - cvt.s32.s8 %r11925, %r11924; - cvt.u32.u16 %r11926, %rs7771; - cvt.s32.s8 %r11927, %r11926; - cvt.u32.u16 %r11928, %rs7770; - cvt.s32.s8 %r11929, %r11928; - mad.lo.s32 %r11930, %r86, %r11929, %r11921; - mad.lo.s32 %r11931, %r87, %r11927, %r11930; - mad.lo.s32 %r11932, %r88, %r11925, %r11931; - mad.lo.s32 %r11933, %r89, %r11923, %r11932; - ld.const.v4.u8 {%rs7778, %rs7779, %rs7780, %rs7781}, [matrix+3888]; - cvt.u32.u16 %r11934, %rs7781; - cvt.s32.s8 %r11935, %r11934; - cvt.u32.u16 %r11936, %rs7780; - cvt.s32.s8 %r11937, %r11936; - cvt.u32.u16 %r11938, %rs7779; - cvt.s32.s8 %r11939, %r11938; - cvt.u32.u16 %r11940, %rs7778; - cvt.s32.s8 %r11941, %r11940; - mad.lo.s32 %r11942, %r271, %r11941, %r11933; - mad.lo.s32 %r11943, %r91, %r11939, %r11942; - mad.lo.s32 %r11944, %r93, %r11937, %r11943; - mad.lo.s32 %r11945, %r94, %r11935, %r11944; - ld.const.v4.u8 {%rs7786, %rs7787, %rs7788, %rs7789}, [matrix+3892]; - cvt.u32.u16 %r11946, %rs7789; - cvt.s32.s8 %r11947, %r11946; - cvt.u32.u16 %r11948, %rs7788; - cvt.s32.s8 %r11949, %r11948; - cvt.u32.u16 %r11950, %rs7787; - cvt.s32.s8 %r11951, %r11950; - cvt.u32.u16 %r11952, %rs7786; - cvt.s32.s8 %r11953, %r11952; - mad.lo.s32 %r11954, %r96, %r11953, %r11945; - mad.lo.s32 %r11955, %r97, %r11951, %r11954; - mad.lo.s32 %r11956, %r99, %r11949, %r11955; - mad.lo.s32 %r11957, %r100, %r11947, %r11956; - ld.const.v4.u8 {%rs7794, %rs7795, %rs7796, %rs7797}, [matrix+3896]; - cvt.u32.u16 %r11958, %rs7797; - cvt.s32.s8 %r11959, %r11958; - cvt.u32.u16 %r11960, %rs7796; - cvt.s32.s8 %r11961, %r11960; - cvt.u32.u16 %r11962, %rs7795; - cvt.s32.s8 %r11963, %r11962; - cvt.u32.u16 %r11964, %rs7794; - cvt.s32.s8 %r11965, %r11964; - 
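
After every second row the accumulation closes with the shr/and/or/xor sequence visible at the start of this section (%r11785 through %rd404): the first row's sum keeps only its top nibble, shifted into bits 4-7 ((sum >> 6) & 0xF0 is ((sum >> 10) & 0xF) << 4), the second row's sum supplies bits 0-3, and the packed byte is XORed with a byte of the pre-multiplication hash. A hedged CUDA equivalent, assuming (as 4-bit operands would imply) that each sum stays below 2^14 so that sum >> 10 is already a nibble:

    // Fold two row sums into one product byte and XOR it with the matching
    // input-hash byte, mirroring the shr.u32/and.b32/or.b32/xor.b32 sequence.
    // Assumption: 64 products of two 4-bit values keep each sum under 2^14.
    __device__ unsigned char pack_rows(unsigned int sum_hi, unsigned int sum_lo,
                                       unsigned char hash_byte) {
        unsigned int hi = (sum_hi >> 6) & 0xF0u; // top nibble into bits 4..7
        unsigned int lo = sum_lo >> 10;          // top nibble into bits 0..3
        return (unsigned char)((hi | lo) ^ hash_byte);
    }
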
mad.lo.s32 %r11966, %r103, %r11965, %r11957; - mad.lo.s32 %r11967, %r104, %r11963, %r11966; - mad.lo.s32 %r11968, %r107, %r11961, %r11967; - mad.lo.s32 %r11969, %r108, %r11959, %r11968; - ld.const.v4.u8 {%rs7802, %rs7803, %rs7804, %rs7805}, [matrix+3900]; - cvt.u32.u16 %r11970, %rs7805; - cvt.s32.s8 %r11971, %r11970; - cvt.u32.u16 %r11972, %rs7804; - cvt.s32.s8 %r11973, %r11972; - cvt.u32.u16 %r11974, %rs7803; - cvt.s32.s8 %r11975, %r11974; - cvt.u32.u16 %r11976, %rs7802; - cvt.s32.s8 %r11977, %r11976; - mad.lo.s32 %r11978, %r111, %r11977, %r11969; - mad.lo.s32 %r11979, %r112, %r11975, %r11978; - mad.lo.s32 %r11980, %r114, %r11973, %r11979; - mad.lo.s32 %r11981, %r115, %r11971, %r11980; - ld.const.v4.u8 {%rs7810, %rs7811, %rs7812, %rs7813}, [matrix+3904]; - cvt.u32.u16 %r11982, %rs7813; - cvt.s32.s8 %r11983, %r11982; - cvt.u32.u16 %r11984, %rs7812; - cvt.s32.s8 %r11985, %r11984; - cvt.u32.u16 %r11986, %rs7810; - cvt.s32.s8 %r11987, %r11986; - cvt.u32.u16 %r11988, %rs7811; - cvt.s32.s8 %r11989, %r11988; - mul.lo.s32 %r11990, %r34, %r11989; - mad.lo.s32 %r11991, %r124, %r11987, %r11990; - mad.lo.s32 %r11992, %r35, %r11985, %r11991; - mad.lo.s32 %r11993, %r36, %r11983, %r11992; - ld.const.v4.u8 {%rs7818, %rs7819, %rs7820, %rs7821}, [matrix+3908]; - cvt.u32.u16 %r11994, %rs7821; - cvt.s32.s8 %r11995, %r11994; - cvt.u32.u16 %r11996, %rs7820; - cvt.s32.s8 %r11997, %r11996; - cvt.u32.u16 %r11998, %rs7819; - cvt.s32.s8 %r11999, %r11998; - cvt.u32.u16 %r12000, %rs7818; - cvt.s32.s8 %r12001, %r12000; - mad.lo.s32 %r12002, %r37, %r12001, %r11993; - mad.lo.s32 %r12003, %r38, %r11999, %r12002; - mad.lo.s32 %r12004, %r39, %r11997, %r12003; - mad.lo.s32 %r12005, %r40, %r11995, %r12004; - ld.const.v4.u8 {%rs7826, %rs7827, %rs7828, %rs7829}, [matrix+3912]; - cvt.u32.u16 %r12006, %rs7829; - cvt.s32.s8 %r12007, %r12006; - cvt.u32.u16 %r12008, %rs7828; - cvt.s32.s8 %r12009, %r12008; - cvt.u32.u16 %r12010, %rs7827; - cvt.s32.s8 %r12011, %r12010; - cvt.u32.u16 %r12012, %rs7826; - cvt.s32.s8 %r12013, %r12012; - mad.lo.s32 %r12014, %r42, %r12013, %r12005; - mad.lo.s32 %r12015, %r43, %r12011, %r12014; - mad.lo.s32 %r12016, %r45, %r12009, %r12015; - mad.lo.s32 %r12017, %r46, %r12007, %r12016; - ld.const.v4.u8 {%rs7834, %rs7835, %rs7836, %rs7837}, [matrix+3916]; - cvt.u32.u16 %r12018, %rs7837; - cvt.s32.s8 %r12019, %r12018; - cvt.u32.u16 %r12020, %rs7836; - cvt.s32.s8 %r12021, %r12020; - cvt.u32.u16 %r12022, %rs7835; - cvt.s32.s8 %r12023, %r12022; - cvt.u32.u16 %r12024, %rs7834; - cvt.s32.s8 %r12025, %r12024; - mad.lo.s32 %r12026, %r48, %r12025, %r12017; - mad.lo.s32 %r12027, %r49, %r12023, %r12026; - mad.lo.s32 %r12028, %r50, %r12021, %r12027; - mad.lo.s32 %r12029, %r51, %r12019, %r12028; - ld.const.v4.u8 {%rs7842, %rs7843, %rs7844, %rs7845}, [matrix+3920]; - cvt.u32.u16 %r12030, %rs7845; - cvt.s32.s8 %r12031, %r12030; - cvt.u32.u16 %r12032, %rs7844; - cvt.s32.s8 %r12033, %r12032; - cvt.u32.u16 %r12034, %rs7843; - cvt.s32.s8 %r12035, %r12034; - cvt.u32.u16 %r12036, %rs7842; - cvt.s32.s8 %r12037, %r12036; - mad.lo.s32 %r12038, %r173, %r12037, %r12029; - mad.lo.s32 %r12039, %r53, %r12035, %r12038; - mad.lo.s32 %r12040, %r54, %r12033, %r12039; - mad.lo.s32 %r12041, %r55, %r12031, %r12040; - ld.const.v4.u8 {%rs7850, %rs7851, %rs7852, %rs7853}, [matrix+3924]; - cvt.u32.u16 %r12042, %rs7853; - cvt.s32.s8 %r12043, %r12042; - cvt.u32.u16 %r12044, %rs7852; - cvt.s32.s8 %r12045, %r12044; - cvt.u32.u16 %r12046, %rs7851; - cvt.s32.s8 %r12047, %r12046; - cvt.u32.u16 %r12048, %rs7850; - cvt.s32.s8 %r12049, %r12048; - mad.lo.s32 
%r12050, %r56, %r12049, %r12041; - mad.lo.s32 %r12051, %r57, %r12047, %r12050; - mad.lo.s32 %r12052, %r58, %r12045, %r12051; - mad.lo.s32 %r12053, %r59, %r12043, %r12052; - ld.const.v4.u8 {%rs7858, %rs7859, %rs7860, %rs7861}, [matrix+3928]; - cvt.u32.u16 %r12054, %rs7861; - cvt.s32.s8 %r12055, %r12054; - cvt.u32.u16 %r12056, %rs7860; - cvt.s32.s8 %r12057, %r12056; - cvt.u32.u16 %r12058, %rs7859; - cvt.s32.s8 %r12059, %r12058; - cvt.u32.u16 %r12060, %rs7858; - cvt.s32.s8 %r12061, %r12060; - mad.lo.s32 %r12062, %r61, %r12061, %r12053; - mad.lo.s32 %r12063, %r62, %r12059, %r12062; - mad.lo.s32 %r12064, %r64, %r12057, %r12063; - mad.lo.s32 %r12065, %r65, %r12055, %r12064; - ld.const.v4.u8 {%rs7866, %rs7867, %rs7868, %rs7869}, [matrix+3932]; - cvt.u32.u16 %r12066, %rs7869; - cvt.s32.s8 %r12067, %r12066; - cvt.u32.u16 %r12068, %rs7868; - cvt.s32.s8 %r12069, %r12068; - cvt.u32.u16 %r12070, %rs7867; - cvt.s32.s8 %r12071, %r12070; - cvt.u32.u16 %r12072, %rs7866; - cvt.s32.s8 %r12073, %r12072; - mad.lo.s32 %r12074, %r67, %r12073, %r12065; - mad.lo.s32 %r12075, %r68, %r12071, %r12074; - mad.lo.s32 %r12076, %r69, %r12069, %r12075; - mad.lo.s32 %r12077, %r70, %r12067, %r12076; - ld.const.v4.u8 {%rs7874, %rs7875, %rs7876, %rs7877}, [matrix+3936]; - cvt.u32.u16 %r12078, %rs7877; - cvt.s32.s8 %r12079, %r12078; - cvt.u32.u16 %r12080, %rs7876; - cvt.s32.s8 %r12081, %r12080; - cvt.u32.u16 %r12082, %rs7875; - cvt.s32.s8 %r12083, %r12082; - cvt.u32.u16 %r12084, %rs7874; - cvt.s32.s8 %r12085, %r12084; - mad.lo.s32 %r12086, %r222, %r12085, %r12077; - mad.lo.s32 %r12087, %r72, %r12083, %r12086; - mad.lo.s32 %r12088, %r73, %r12081, %r12087; - mad.lo.s32 %r12089, %r74, %r12079, %r12088; - ld.const.v4.u8 {%rs7882, %rs7883, %rs7884, %rs7885}, [matrix+3940]; - cvt.u32.u16 %r12090, %rs7885; - cvt.s32.s8 %r12091, %r12090; - cvt.u32.u16 %r12092, %rs7884; - cvt.s32.s8 %r12093, %r12092; - cvt.u32.u16 %r12094, %rs7883; - cvt.s32.s8 %r12095, %r12094; - cvt.u32.u16 %r12096, %rs7882; - cvt.s32.s8 %r12097, %r12096; - mad.lo.s32 %r12098, %r75, %r12097, %r12089; - mad.lo.s32 %r12099, %r76, %r12095, %r12098; - mad.lo.s32 %r12100, %r77, %r12093, %r12099; - mad.lo.s32 %r12101, %r78, %r12091, %r12100; - ld.const.v4.u8 {%rs7890, %rs7891, %rs7892, %rs7893}, [matrix+3944]; - cvt.u32.u16 %r12102, %rs7893; - cvt.s32.s8 %r12103, %r12102; - cvt.u32.u16 %r12104, %rs7892; - cvt.s32.s8 %r12105, %r12104; - cvt.u32.u16 %r12106, %rs7891; - cvt.s32.s8 %r12107, %r12106; - cvt.u32.u16 %r12108, %rs7890; - cvt.s32.s8 %r12109, %r12108; - mad.lo.s32 %r12110, %r80, %r12109, %r12101; - mad.lo.s32 %r12111, %r81, %r12107, %r12110; - mad.lo.s32 %r12112, %r83, %r12105, %r12111; - mad.lo.s32 %r12113, %r84, %r12103, %r12112; - ld.const.v4.u8 {%rs7898, %rs7899, %rs7900, %rs7901}, [matrix+3948]; - cvt.u32.u16 %r12114, %rs7901; - cvt.s32.s8 %r12115, %r12114; - cvt.u32.u16 %r12116, %rs7900; - cvt.s32.s8 %r12117, %r12116; - cvt.u32.u16 %r12118, %rs7899; - cvt.s32.s8 %r12119, %r12118; - cvt.u32.u16 %r12120, %rs7898; - cvt.s32.s8 %r12121, %r12120; - mad.lo.s32 %r12122, %r86, %r12121, %r12113; - mad.lo.s32 %r12123, %r87, %r12119, %r12122; - mad.lo.s32 %r12124, %r88, %r12117, %r12123; - mad.lo.s32 %r12125, %r89, %r12115, %r12124; - ld.const.v4.u8 {%rs7906, %rs7907, %rs7908, %rs7909}, [matrix+3952]; - cvt.u32.u16 %r12126, %rs7909; - cvt.s32.s8 %r12127, %r12126; - cvt.u32.u16 %r12128, %rs7908; - cvt.s32.s8 %r12129, %r12128; - cvt.u32.u16 %r12130, %rs7907; - cvt.s32.s8 %r12131, %r12130; - cvt.u32.u16 %r12132, %rs7906; - cvt.s32.s8 %r12133, %r12132; - mad.lo.s32 %r12134, 
%r271, %r12133, %r12125; - mad.lo.s32 %r12135, %r91, %r12131, %r12134; - mad.lo.s32 %r12136, %r93, %r12129, %r12135; - mad.lo.s32 %r12137, %r94, %r12127, %r12136; - ld.const.v4.u8 {%rs7914, %rs7915, %rs7916, %rs7917}, [matrix+3956]; - cvt.u32.u16 %r12138, %rs7917; - cvt.s32.s8 %r12139, %r12138; - cvt.u32.u16 %r12140, %rs7916; - cvt.s32.s8 %r12141, %r12140; - cvt.u32.u16 %r12142, %rs7915; - cvt.s32.s8 %r12143, %r12142; - cvt.u32.u16 %r12144, %rs7914; - cvt.s32.s8 %r12145, %r12144; - mad.lo.s32 %r12146, %r96, %r12145, %r12137; - mad.lo.s32 %r12147, %r97, %r12143, %r12146; - mad.lo.s32 %r12148, %r99, %r12141, %r12147; - mad.lo.s32 %r12149, %r100, %r12139, %r12148; - ld.const.v4.u8 {%rs7922, %rs7923, %rs7924, %rs7925}, [matrix+3960]; - cvt.u32.u16 %r12150, %rs7925; - cvt.s32.s8 %r12151, %r12150; - cvt.u32.u16 %r12152, %rs7924; - cvt.s32.s8 %r12153, %r12152; - cvt.u32.u16 %r12154, %rs7923; - cvt.s32.s8 %r12155, %r12154; - cvt.u32.u16 %r12156, %rs7922; - cvt.s32.s8 %r12157, %r12156; - mad.lo.s32 %r12158, %r103, %r12157, %r12149; - mad.lo.s32 %r12159, %r104, %r12155, %r12158; - mad.lo.s32 %r12160, %r107, %r12153, %r12159; - mad.lo.s32 %r12161, %r108, %r12151, %r12160; - ld.const.v4.u8 {%rs7930, %rs7931, %rs7932, %rs7933}, [matrix+3964]; - cvt.u32.u16 %r12162, %rs7933; - cvt.s32.s8 %r12163, %r12162; - cvt.u32.u16 %r12164, %rs7932; - cvt.s32.s8 %r12165, %r12164; - cvt.u32.u16 %r12166, %rs7931; - cvt.s32.s8 %r12167, %r12166; - cvt.u32.u16 %r12168, %rs7930; - cvt.s32.s8 %r12169, %r12168; - mad.lo.s32 %r12170, %r111, %r12169, %r12161; - mad.lo.s32 %r12171, %r112, %r12167, %r12170; - mad.lo.s32 %r12172, %r114, %r12165, %r12171; - mad.lo.s32 %r12173, %r115, %r12163, %r12172; - shr.u32 %r12174, %r11981, 6; - and.b32 %r12175, %r12174, 240; - shr.u32 %r12176, %r12173, 10; - or.b32 %r12177, %r12176, %r12175; - xor.b32 %r12178, %r109, %r12177; - cvt.u64.u32 %rd405, %r12178; - ld.const.v4.u8 {%rs7938, %rs7939, %rs7940, %rs7941}, [matrix+3968]; - cvt.u32.u16 %r12179, %rs7941; - cvt.s32.s8 %r12180, %r12179; - cvt.u32.u16 %r12181, %rs7940; - cvt.s32.s8 %r12182, %r12181; - cvt.u32.u16 %r12183, %rs7938; - cvt.s32.s8 %r12184, %r12183; - cvt.u32.u16 %r12185, %rs7939; - cvt.s32.s8 %r12186, %r12185; - mul.lo.s32 %r12187, %r34, %r12186; - mad.lo.s32 %r12188, %r124, %r12184, %r12187; - mad.lo.s32 %r12189, %r35, %r12182, %r12188; - mad.lo.s32 %r12190, %r36, %r12180, %r12189; - ld.const.v4.u8 {%rs7946, %rs7947, %rs7948, %rs7949}, [matrix+3972]; - cvt.u32.u16 %r12191, %rs7949; - cvt.s32.s8 %r12192, %r12191; - cvt.u32.u16 %r12193, %rs7948; - cvt.s32.s8 %r12194, %r12193; - cvt.u32.u16 %r12195, %rs7947; - cvt.s32.s8 %r12196, %r12195; - cvt.u32.u16 %r12197, %rs7946; - cvt.s32.s8 %r12198, %r12197; - mad.lo.s32 %r12199, %r37, %r12198, %r12190; - mad.lo.s32 %r12200, %r38, %r12196, %r12199; - mad.lo.s32 %r12201, %r39, %r12194, %r12200; - mad.lo.s32 %r12202, %r40, %r12192, %r12201; - ld.const.v4.u8 {%rs7954, %rs7955, %rs7956, %rs7957}, [matrix+3976]; - cvt.u32.u16 %r12203, %rs7957; - cvt.s32.s8 %r12204, %r12203; - cvt.u32.u16 %r12205, %rs7956; - cvt.s32.s8 %r12206, %r12205; - cvt.u32.u16 %r12207, %rs7955; - cvt.s32.s8 %r12208, %r12207; - cvt.u32.u16 %r12209, %rs7954; - cvt.s32.s8 %r12210, %r12209; - mad.lo.s32 %r12211, %r42, %r12210, %r12202; - mad.lo.s32 %r12212, %r43, %r12208, %r12211; - mad.lo.s32 %r12213, %r45, %r12206, %r12212; - mad.lo.s32 %r12214, %r46, %r12204, %r12213; - ld.const.v4.u8 {%rs7962, %rs7963, %rs7964, %rs7965}, [matrix+3980]; - cvt.u32.u16 %r12215, %rs7965; - cvt.s32.s8 %r12216, %r12215; - cvt.u32.u16 %r12217, 
%rs7964; - cvt.s32.s8 %r12218, %r12217; - cvt.u32.u16 %r12219, %rs7963; - cvt.s32.s8 %r12220, %r12219; - cvt.u32.u16 %r12221, %rs7962; - cvt.s32.s8 %r12222, %r12221; - mad.lo.s32 %r12223, %r48, %r12222, %r12214; - mad.lo.s32 %r12224, %r49, %r12220, %r12223; - mad.lo.s32 %r12225, %r50, %r12218, %r12224; - mad.lo.s32 %r12226, %r51, %r12216, %r12225; - ld.const.v4.u8 {%rs7970, %rs7971, %rs7972, %rs7973}, [matrix+3984]; - cvt.u32.u16 %r12227, %rs7973; - cvt.s32.s8 %r12228, %r12227; - cvt.u32.u16 %r12229, %rs7972; - cvt.s32.s8 %r12230, %r12229; - cvt.u32.u16 %r12231, %rs7971; - cvt.s32.s8 %r12232, %r12231; - cvt.u32.u16 %r12233, %rs7970; - cvt.s32.s8 %r12234, %r12233; - mad.lo.s32 %r12235, %r173, %r12234, %r12226; - mad.lo.s32 %r12236, %r53, %r12232, %r12235; - mad.lo.s32 %r12237, %r54, %r12230, %r12236; - mad.lo.s32 %r12238, %r55, %r12228, %r12237; - ld.const.v4.u8 {%rs7978, %rs7979, %rs7980, %rs7981}, [matrix+3988]; - cvt.u32.u16 %r12239, %rs7981; - cvt.s32.s8 %r12240, %r12239; - cvt.u32.u16 %r12241, %rs7980; - cvt.s32.s8 %r12242, %r12241; - cvt.u32.u16 %r12243, %rs7979; - cvt.s32.s8 %r12244, %r12243; - cvt.u32.u16 %r12245, %rs7978; - cvt.s32.s8 %r12246, %r12245; - mad.lo.s32 %r12247, %r56, %r12246, %r12238; - mad.lo.s32 %r12248, %r57, %r12244, %r12247; - mad.lo.s32 %r12249, %r58, %r12242, %r12248; - mad.lo.s32 %r12250, %r59, %r12240, %r12249; - ld.const.v4.u8 {%rs7986, %rs7987, %rs7988, %rs7989}, [matrix+3992]; - cvt.u32.u16 %r12251, %rs7989; - cvt.s32.s8 %r12252, %r12251; - cvt.u32.u16 %r12253, %rs7988; - cvt.s32.s8 %r12254, %r12253; - cvt.u32.u16 %r12255, %rs7987; - cvt.s32.s8 %r12256, %r12255; - cvt.u32.u16 %r12257, %rs7986; - cvt.s32.s8 %r12258, %r12257; - mad.lo.s32 %r12259, %r61, %r12258, %r12250; - mad.lo.s32 %r12260, %r62, %r12256, %r12259; - mad.lo.s32 %r12261, %r64, %r12254, %r12260; - mad.lo.s32 %r12262, %r65, %r12252, %r12261; - ld.const.v4.u8 {%rs7994, %rs7995, %rs7996, %rs7997}, [matrix+3996]; - cvt.u32.u16 %r12263, %rs7997; - cvt.s32.s8 %r12264, %r12263; - cvt.u32.u16 %r12265, %rs7996; - cvt.s32.s8 %r12266, %r12265; - cvt.u32.u16 %r12267, %rs7995; - cvt.s32.s8 %r12268, %r12267; - cvt.u32.u16 %r12269, %rs7994; - cvt.s32.s8 %r12270, %r12269; - mad.lo.s32 %r12271, %r67, %r12270, %r12262; - mad.lo.s32 %r12272, %r68, %r12268, %r12271; - mad.lo.s32 %r12273, %r69, %r12266, %r12272; - mad.lo.s32 %r12274, %r70, %r12264, %r12273; - ld.const.v4.u8 {%rs8002, %rs8003, %rs8004, %rs8005}, [matrix+4000]; - cvt.u32.u16 %r12275, %rs8005; - cvt.s32.s8 %r12276, %r12275; - cvt.u32.u16 %r12277, %rs8004; - cvt.s32.s8 %r12278, %r12277; - cvt.u32.u16 %r12279, %rs8003; - cvt.s32.s8 %r12280, %r12279; - cvt.u32.u16 %r12281, %rs8002; - cvt.s32.s8 %r12282, %r12281; - mad.lo.s32 %r12283, %r222, %r12282, %r12274; - mad.lo.s32 %r12284, %r72, %r12280, %r12283; - mad.lo.s32 %r12285, %r73, %r12278, %r12284; - mad.lo.s32 %r12286, %r74, %r12276, %r12285; - ld.const.v4.u8 {%rs8010, %rs8011, %rs8012, %rs8013}, [matrix+4004]; - cvt.u32.u16 %r12287, %rs8013; - cvt.s32.s8 %r12288, %r12287; - cvt.u32.u16 %r12289, %rs8012; - cvt.s32.s8 %r12290, %r12289; - cvt.u32.u16 %r12291, %rs8011; - cvt.s32.s8 %r12292, %r12291; - cvt.u32.u16 %r12293, %rs8010; - cvt.s32.s8 %r12294, %r12293; - mad.lo.s32 %r12295, %r75, %r12294, %r12286; - mad.lo.s32 %r12296, %r76, %r12292, %r12295; - mad.lo.s32 %r12297, %r77, %r12290, %r12296; - mad.lo.s32 %r12298, %r78, %r12288, %r12297; - ld.const.v4.u8 {%rs8018, %rs8019, %rs8020, %rs8021}, [matrix+4008]; - cvt.u32.u16 %r12299, %rs8021; - cvt.s32.s8 %r12300, %r12299; - cvt.u32.u16 %r12301, %rs8020; - 
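
Every hunk from [matrix] up to [matrix+4092] is this same row template stamped out with fresh register numbers: the compiler has fully unrolled a 64x64 matrix times 64-nibble vector product, two rows per output byte. A speculative source-level shape it could have unrolled from, inferred only from the 4096-byte matrix, the 32 packed result bytes, and the pairing above (loop bounds and names are assumptions, not the original kernel source):

    // Speculative reconstruction of the unrolled loop: 64 rows of 64 signed
    // coefficients; rows 2i and 2i+1 are packed into output byte i.
    __constant__ signed char matrix[64 * 64];

    __device__ void heavy_matmul(const unsigned char nib[64],  // 4-bit digits
                                 const unsigned char in[32],   // pre-image bytes
                                 unsigned char out[32]) {
        for (int i = 0; i < 32; ++i) {
            int even = 0, odd = 0;             // the mad.lo.s32 accumulators
            for (int j = 0; j < 64; ++j) {
                even += nib[j] * matrix[(2 * i)     * 64 + j];
                odd  += nib[j] * matrix[(2 * i + 1) * 64 + j];
            }
            unsigned int e = (unsigned int)even, o = (unsigned int)odd;
            out[i] = (unsigned char)((((e >> 6) & 0xF0u) | ((o >> 10) & 0x0Fu)) ^ in[i]);
        }
    }
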
cvt.s32.s8 %r12302, %r12301; - cvt.u32.u16 %r12303, %rs8019; - cvt.s32.s8 %r12304, %r12303; - cvt.u32.u16 %r12305, %rs8018; - cvt.s32.s8 %r12306, %r12305; - mad.lo.s32 %r12307, %r80, %r12306, %r12298; - mad.lo.s32 %r12308, %r81, %r12304, %r12307; - mad.lo.s32 %r12309, %r83, %r12302, %r12308; - mad.lo.s32 %r12310, %r84, %r12300, %r12309; - ld.const.v4.u8 {%rs8026, %rs8027, %rs8028, %rs8029}, [matrix+4012]; - cvt.u32.u16 %r12311, %rs8029; - cvt.s32.s8 %r12312, %r12311; - cvt.u32.u16 %r12313, %rs8028; - cvt.s32.s8 %r12314, %r12313; - cvt.u32.u16 %r12315, %rs8027; - cvt.s32.s8 %r12316, %r12315; - cvt.u32.u16 %r12317, %rs8026; - cvt.s32.s8 %r12318, %r12317; - mad.lo.s32 %r12319, %r86, %r12318, %r12310; - mad.lo.s32 %r12320, %r87, %r12316, %r12319; - mad.lo.s32 %r12321, %r88, %r12314, %r12320; - mad.lo.s32 %r12322, %r89, %r12312, %r12321; - ld.const.v4.u8 {%rs8034, %rs8035, %rs8036, %rs8037}, [matrix+4016]; - cvt.u32.u16 %r12323, %rs8037; - cvt.s32.s8 %r12324, %r12323; - cvt.u32.u16 %r12325, %rs8036; - cvt.s32.s8 %r12326, %r12325; - cvt.u32.u16 %r12327, %rs8035; - cvt.s32.s8 %r12328, %r12327; - cvt.u32.u16 %r12329, %rs8034; - cvt.s32.s8 %r12330, %r12329; - mad.lo.s32 %r12331, %r271, %r12330, %r12322; - mad.lo.s32 %r12332, %r91, %r12328, %r12331; - mad.lo.s32 %r12333, %r93, %r12326, %r12332; - mad.lo.s32 %r12334, %r94, %r12324, %r12333; - ld.const.v4.u8 {%rs8042, %rs8043, %rs8044, %rs8045}, [matrix+4020]; - cvt.u32.u16 %r12335, %rs8045; - cvt.s32.s8 %r12336, %r12335; - cvt.u32.u16 %r12337, %rs8044; - cvt.s32.s8 %r12338, %r12337; - cvt.u32.u16 %r12339, %rs8043; - cvt.s32.s8 %r12340, %r12339; - cvt.u32.u16 %r12341, %rs8042; - cvt.s32.s8 %r12342, %r12341; - mad.lo.s32 %r12343, %r96, %r12342, %r12334; - mad.lo.s32 %r12344, %r97, %r12340, %r12343; - mad.lo.s32 %r12345, %r99, %r12338, %r12344; - mad.lo.s32 %r12346, %r100, %r12336, %r12345; - ld.const.v4.u8 {%rs8050, %rs8051, %rs8052, %rs8053}, [matrix+4024]; - cvt.u32.u16 %r12347, %rs8053; - cvt.s32.s8 %r12348, %r12347; - cvt.u32.u16 %r12349, %rs8052; - cvt.s32.s8 %r12350, %r12349; - cvt.u32.u16 %r12351, %rs8051; - cvt.s32.s8 %r12352, %r12351; - cvt.u32.u16 %r12353, %rs8050; - cvt.s32.s8 %r12354, %r12353; - mad.lo.s32 %r12355, %r103, %r12354, %r12346; - mad.lo.s32 %r12356, %r104, %r12352, %r12355; - mad.lo.s32 %r12357, %r107, %r12350, %r12356; - mad.lo.s32 %r12358, %r108, %r12348, %r12357; - ld.const.v4.u8 {%rs8058, %rs8059, %rs8060, %rs8061}, [matrix+4028]; - cvt.u32.u16 %r12359, %rs8061; - cvt.s32.s8 %r12360, %r12359; - cvt.u32.u16 %r12361, %rs8060; - cvt.s32.s8 %r12362, %r12361; - cvt.u32.u16 %r12363, %rs8059; - cvt.s32.s8 %r12364, %r12363; - cvt.u32.u16 %r12365, %rs8058; - cvt.s32.s8 %r12366, %r12365; - mad.lo.s32 %r12367, %r111, %r12366, %r12358; - mad.lo.s32 %r12368, %r112, %r12364, %r12367; - mad.lo.s32 %r12369, %r114, %r12362, %r12368; - mad.lo.s32 %r12370, %r115, %r12360, %r12369; - ld.const.v4.u8 {%rs8066, %rs8067, %rs8068, %rs8069}, [matrix+4032]; - cvt.u32.u16 %r12371, %rs8069; - cvt.s32.s8 %r12372, %r12371; - cvt.u32.u16 %r12373, %rs8068; - cvt.s32.s8 %r12374, %r12373; - cvt.u32.u16 %r12375, %rs8066; - cvt.s32.s8 %r12376, %r12375; - cvt.u32.u16 %r12377, %rs8067; - cvt.s32.s8 %r12378, %r12377; - mul.lo.s32 %r12379, %r34, %r12378; - mad.lo.s32 %r12380, %r124, %r12376, %r12379; - mad.lo.s32 %r12381, %r35, %r12374, %r12380; - mad.lo.s32 %r12382, %r36, %r12372, %r12381; - ld.const.v4.u8 {%rs8074, %rs8075, %rs8076, %rs8077}, [matrix+4036]; - cvt.u32.u16 %r12383, %rs8077; - cvt.s32.s8 %r12384, %r12383; - cvt.u32.u16 %r12385, %rs8076; - cvt.s32.s8 
%r12386, %r12385; - cvt.u32.u16 %r12387, %rs8075; - cvt.s32.s8 %r12388, %r12387; - cvt.u32.u16 %r12389, %rs8074; - cvt.s32.s8 %r12390, %r12389; - mad.lo.s32 %r12391, %r37, %r12390, %r12382; - mad.lo.s32 %r12392, %r38, %r12388, %r12391; - mad.lo.s32 %r12393, %r39, %r12386, %r12392; - mad.lo.s32 %r12394, %r40, %r12384, %r12393; - ld.const.v4.u8 {%rs8082, %rs8083, %rs8084, %rs8085}, [matrix+4040]; - cvt.u32.u16 %r12395, %rs8085; - cvt.s32.s8 %r12396, %r12395; - cvt.u32.u16 %r12397, %rs8084; - cvt.s32.s8 %r12398, %r12397; - cvt.u32.u16 %r12399, %rs8083; - cvt.s32.s8 %r12400, %r12399; - cvt.u32.u16 %r12401, %rs8082; - cvt.s32.s8 %r12402, %r12401; - mad.lo.s32 %r12403, %r42, %r12402, %r12394; - mad.lo.s32 %r12404, %r43, %r12400, %r12403; - mad.lo.s32 %r12405, %r45, %r12398, %r12404; - mad.lo.s32 %r12406, %r46, %r12396, %r12405; - ld.const.v4.u8 {%rs8090, %rs8091, %rs8092, %rs8093}, [matrix+4044]; - cvt.u32.u16 %r12407, %rs8093; - cvt.s32.s8 %r12408, %r12407; - cvt.u32.u16 %r12409, %rs8092; - cvt.s32.s8 %r12410, %r12409; - cvt.u32.u16 %r12411, %rs8091; - cvt.s32.s8 %r12412, %r12411; - cvt.u32.u16 %r12413, %rs8090; - cvt.s32.s8 %r12414, %r12413; - mad.lo.s32 %r12415, %r48, %r12414, %r12406; - mad.lo.s32 %r12416, %r49, %r12412, %r12415; - mad.lo.s32 %r12417, %r50, %r12410, %r12416; - mad.lo.s32 %r12418, %r51, %r12408, %r12417; - ld.const.v4.u8 {%rs8098, %rs8099, %rs8100, %rs8101}, [matrix+4048]; - cvt.u32.u16 %r12419, %rs8101; - cvt.s32.s8 %r12420, %r12419; - cvt.u32.u16 %r12421, %rs8100; - cvt.s32.s8 %r12422, %r12421; - cvt.u32.u16 %r12423, %rs8099; - cvt.s32.s8 %r12424, %r12423; - cvt.u32.u16 %r12425, %rs8098; - cvt.s32.s8 %r12426, %r12425; - mad.lo.s32 %r12427, %r173, %r12426, %r12418; - mad.lo.s32 %r12428, %r53, %r12424, %r12427; - mad.lo.s32 %r12429, %r54, %r12422, %r12428; - mad.lo.s32 %r12430, %r55, %r12420, %r12429; - ld.const.v4.u8 {%rs8106, %rs8107, %rs8108, %rs8109}, [matrix+4052]; - cvt.u32.u16 %r12431, %rs8109; - cvt.s32.s8 %r12432, %r12431; - cvt.u32.u16 %r12433, %rs8108; - cvt.s32.s8 %r12434, %r12433; - cvt.u32.u16 %r12435, %rs8107; - cvt.s32.s8 %r12436, %r12435; - cvt.u32.u16 %r12437, %rs8106; - cvt.s32.s8 %r12438, %r12437; - mad.lo.s32 %r12439, %r56, %r12438, %r12430; - mad.lo.s32 %r12440, %r57, %r12436, %r12439; - mad.lo.s32 %r12441, %r58, %r12434, %r12440; - mad.lo.s32 %r12442, %r59, %r12432, %r12441; - ld.const.v4.u8 {%rs8114, %rs8115, %rs8116, %rs8117}, [matrix+4056]; - cvt.u32.u16 %r12443, %rs8117; - cvt.s32.s8 %r12444, %r12443; - cvt.u32.u16 %r12445, %rs8116; - cvt.s32.s8 %r12446, %r12445; - cvt.u32.u16 %r12447, %rs8115; - cvt.s32.s8 %r12448, %r12447; - cvt.u32.u16 %r12449, %rs8114; - cvt.s32.s8 %r12450, %r12449; - mad.lo.s32 %r12451, %r61, %r12450, %r12442; - mad.lo.s32 %r12452, %r62, %r12448, %r12451; - mad.lo.s32 %r12453, %r64, %r12446, %r12452; - mad.lo.s32 %r12454, %r65, %r12444, %r12453; - ld.const.v4.u8 {%rs8122, %rs8123, %rs8124, %rs8125}, [matrix+4060]; - cvt.u32.u16 %r12455, %rs8125; - cvt.s32.s8 %r12456, %r12455; - cvt.u32.u16 %r12457, %rs8124; - cvt.s32.s8 %r12458, %r12457; - cvt.u32.u16 %r12459, %rs8123; - cvt.s32.s8 %r12460, %r12459; - cvt.u32.u16 %r12461, %rs8122; - cvt.s32.s8 %r12462, %r12461; - mad.lo.s32 %r12463, %r67, %r12462, %r12454; - mad.lo.s32 %r12464, %r68, %r12460, %r12463; - mad.lo.s32 %r12465, %r69, %r12458, %r12464; - mad.lo.s32 %r12466, %r70, %r12456, %r12465; - ld.const.v4.u8 {%rs8130, %rs8131, %rs8132, %rs8133}, [matrix+4064]; - cvt.u32.u16 %r12467, %rs8133; - cvt.s32.s8 %r12468, %r12467; - cvt.u32.u16 %r12469, %rs8132; - cvt.s32.s8 %r12470, 
%r12469; - cvt.u32.u16 %r12471, %rs8131; - cvt.s32.s8 %r12472, %r12471; - cvt.u32.u16 %r12473, %rs8130; - cvt.s32.s8 %r12474, %r12473; - mad.lo.s32 %r12475, %r222, %r12474, %r12466; - mad.lo.s32 %r12476, %r72, %r12472, %r12475; - mad.lo.s32 %r12477, %r73, %r12470, %r12476; - mad.lo.s32 %r12478, %r74, %r12468, %r12477; - ld.const.v4.u8 {%rs8138, %rs8139, %rs8140, %rs8141}, [matrix+4068]; - cvt.u32.u16 %r12479, %rs8141; - cvt.s32.s8 %r12480, %r12479; - cvt.u32.u16 %r12481, %rs8140; - cvt.s32.s8 %r12482, %r12481; - cvt.u32.u16 %r12483, %rs8139; - cvt.s32.s8 %r12484, %r12483; - cvt.u32.u16 %r12485, %rs8138; - cvt.s32.s8 %r12486, %r12485; - mad.lo.s32 %r12487, %r75, %r12486, %r12478; - mad.lo.s32 %r12488, %r76, %r12484, %r12487; - mad.lo.s32 %r12489, %r77, %r12482, %r12488; - mad.lo.s32 %r12490, %r78, %r12480, %r12489; - ld.const.v4.u8 {%rs8146, %rs8147, %rs8148, %rs8149}, [matrix+4072]; - cvt.u32.u16 %r12491, %rs8149; - cvt.s32.s8 %r12492, %r12491; - cvt.u32.u16 %r12493, %rs8148; - cvt.s32.s8 %r12494, %r12493; - cvt.u32.u16 %r12495, %rs8147; - cvt.s32.s8 %r12496, %r12495; - cvt.u32.u16 %r12497, %rs8146; - cvt.s32.s8 %r12498, %r12497; - mad.lo.s32 %r12499, %r80, %r12498, %r12490; - mad.lo.s32 %r12500, %r81, %r12496, %r12499; - mad.lo.s32 %r12501, %r83, %r12494, %r12500; - mad.lo.s32 %r12502, %r84, %r12492, %r12501; - ld.const.v4.u8 {%rs8154, %rs8155, %rs8156, %rs8157}, [matrix+4076]; - cvt.u32.u16 %r12503, %rs8157; - cvt.s32.s8 %r12504, %r12503; - cvt.u32.u16 %r12505, %rs8156; - cvt.s32.s8 %r12506, %r12505; - cvt.u32.u16 %r12507, %rs8155; - cvt.s32.s8 %r12508, %r12507; - cvt.u32.u16 %r12509, %rs8154; - cvt.s32.s8 %r12510, %r12509; - mad.lo.s32 %r12511, %r86, %r12510, %r12502; - mad.lo.s32 %r12512, %r87, %r12508, %r12511; - mad.lo.s32 %r12513, %r88, %r12506, %r12512; - mad.lo.s32 %r12514, %r89, %r12504, %r12513; - ld.const.v4.u8 {%rs8162, %rs8163, %rs8164, %rs8165}, [matrix+4080]; - cvt.u32.u16 %r12515, %rs8165; - cvt.s32.s8 %r12516, %r12515; - cvt.u32.u16 %r12517, %rs8164; - cvt.s32.s8 %r12518, %r12517; - cvt.u32.u16 %r12519, %rs8163; - cvt.s32.s8 %r12520, %r12519; - cvt.u32.u16 %r12521, %rs8162; - cvt.s32.s8 %r12522, %r12521; - mad.lo.s32 %r12523, %r271, %r12522, %r12514; - mad.lo.s32 %r12524, %r91, %r12520, %r12523; - mad.lo.s32 %r12525, %r93, %r12518, %r12524; - mad.lo.s32 %r12526, %r94, %r12516, %r12525; - ld.const.v4.u8 {%rs8170, %rs8171, %rs8172, %rs8173}, [matrix+4084]; - cvt.u32.u16 %r12527, %rs8173; - cvt.s32.s8 %r12528, %r12527; - cvt.u32.u16 %r12529, %rs8172; - cvt.s32.s8 %r12530, %r12529; - cvt.u32.u16 %r12531, %rs8171; - cvt.s32.s8 %r12532, %r12531; - cvt.u32.u16 %r12533, %rs8170; - cvt.s32.s8 %r12534, %r12533; - mad.lo.s32 %r12535, %r96, %r12534, %r12526; - mad.lo.s32 %r12536, %r97, %r12532, %r12535; - mad.lo.s32 %r12537, %r99, %r12530, %r12536; - mad.lo.s32 %r12538, %r100, %r12528, %r12537; - ld.const.v4.u8 {%rs8178, %rs8179, %rs8180, %rs8181}, [matrix+4088]; - cvt.u32.u16 %r12539, %rs8181; - cvt.s32.s8 %r12540, %r12539; - cvt.u32.u16 %r12541, %rs8180; - cvt.s32.s8 %r12542, %r12541; - cvt.u32.u16 %r12543, %rs8179; - cvt.s32.s8 %r12544, %r12543; - cvt.u32.u16 %r12545, %rs8178; - cvt.s32.s8 %r12546, %r12545; - mad.lo.s32 %r12547, %r103, %r12546, %r12538; - mad.lo.s32 %r12548, %r104, %r12544, %r12547; - mad.lo.s32 %r12549, %r107, %r12542, %r12548; - mad.lo.s32 %r12550, %r108, %r12540, %r12549; - ld.const.v4.u8 {%rs8186, %rs8187, %rs8188, %rs8189}, [matrix+4092]; - cvt.u32.u16 %r12551, %rs8189; - cvt.s32.s8 %r12552, %r12551; - cvt.u32.u16 %r12553, %rs8188; - cvt.s32.s8 %r12554, 
%r12553; - cvt.u32.u16 %r12555, %rs8187; - cvt.s32.s8 %r12556, %r12555; - cvt.u32.u16 %r12557, %rs8186; - cvt.s32.s8 %r12558, %r12557; - mad.lo.s32 %r12559, %r111, %r12558, %r12550; - mad.lo.s32 %r12560, %r112, %r12556, %r12559; - mad.lo.s32 %r12561, %r114, %r12554, %r12560; - mad.lo.s32 %r12562, %r115, %r12552, %r12561; - shr.u32 %r12563, %r12370, 6; - and.b32 %r12564, %r12563, 240; - shr.u32 %r12565, %r12562, 10; - or.b32 %r12566, %r12565, %r12564; - xor.b32 %r12567, %r113, %r12566; - and.b64 %rd406, %rd394, 255; - and.b64 %rd407, %rd395, 255; - and.b64 %rd408, %rd388, 255; - and.b64 %rd409, %rd389, 255; - and.b64 %rd410, %rd382, 255; - and.b64 %rd411, %rd383, 255; - mul.wide.u32 %rd412, %r7121, 256; - shl.b64 %rd413, %rd406, 16; - shl.b64 %rd414, %rd407, 24; - and.b64 %rd415, %rd398, 255; - and.b64 %rd416, %rd397, 255; - and.b64 %rd417, %rd396, 255; - mul.wide.u32 %rd418, %r4009, 256; - shl.b64 %rd419, %rd408, 16; - shl.b64 %rd420, %rd409, 24; - and.b64 %rd421, %rd392, 255; - and.b64 %rd422, %rd391, 255; - and.b64 %rd423, %rd390, 255; - mul.wide.u32 %rd424, %r897, 256; - shl.b64 %rd425, %rd410, 16; - shl.b64 %rd426, %rd411, 24; - and.b64 %rd427, %rd386, 255; - and.b64 %rd428, %rd385, 255; - and.b64 %rd429, %rd384, 255; - cvt.u64.u32 %rd430, %r12567; - cvt.u64.u32 %rd431, %r9844; - cvt.u64.u32 %rd432, %r9455; - cvt.u64.u32 %rd433, %r6343; - cvt.u64.u32 %rd434, %r3231; - shl.b64 %rd435, %rd434, 56; - shl.b64 %rd436, %rd427, 48; - or.b64 %rd437, %rd435, %rd436; - shl.b64 %rd438, %rd428, 40; - or.b64 %rd439, %rd437, %rd438; - shl.b64 %rd440, %rd429, 32; - or.b64 %rd441, %rd439, %rd440; - or.b64 %rd442, %rd441, %rd426; - or.b64 %rd443, %rd442, %rd425; - and.b64 %rd444, %rd381, 255; - and.b64 %rd445, %rd424, 65280; - or.b64 %rd446, %rd443, %rd445; - or.b64 %rd447, %rd446, %rd444; - shl.b64 %rd448, %rd433, 56; - shl.b64 %rd449, %rd421, 48; - or.b64 %rd450, %rd448, %rd449; - shl.b64 %rd451, %rd422, 40; - or.b64 %rd452, %rd450, %rd451; - shl.b64 %rd453, %rd423, 32; - or.b64 %rd454, %rd452, %rd453; - or.b64 %rd455, %rd454, %rd420; - or.b64 %rd456, %rd455, %rd419; - and.b64 %rd457, %rd387, 255; - and.b64 %rd458, %rd418, 65280; - or.b64 %rd459, %rd456, %rd458; - or.b64 %rd460, %rd459, %rd457; - shl.b64 %rd461, %rd432, 56; - shl.b64 %rd462, %rd415, 48; - or.b64 %rd463, %rd461, %rd462; - shl.b64 %rd464, %rd416, 40; - or.b64 %rd465, %rd463, %rd464; - shl.b64 %rd466, %rd417, 32; - or.b64 %rd467, %rd465, %rd466; - or.b64 %rd468, %rd467, %rd414; - or.b64 %rd469, %rd468, %rd413; - and.b64 %rd470, %rd393, 255; - and.b64 %rd471, %rd412, 65280; - or.b64 %rd472, %rd469, %rd471; - or.b64 %rd473, %rd472, %rd470; - shl.b64 %rd474, %rd430, 56; - and.b64 %rd475, %rd405, 255; - shl.b64 %rd476, %rd475, 48; - or.b64 %rd477, %rd474, %rd476; - and.b64 %rd478, %rd404, 255; - shl.b64 %rd479, %rd478, 40; - or.b64 %rd480, %rd477, %rd479; - shl.b64 %rd481, %rd403, 32; - or.b64 %rd482, %rd480, %rd481; - and.b64 %rd483, %rd401, 255; - shl.b64 %rd484, %rd483, 24; - or.b64 %rd485, %rd482, %rd484; - and.b64 %rd486, %rd400, 255; - shl.b64 %rd487, %rd486, 16; - and.b64 %rd488, %rd399, 255; - shl.b64 %rd489, %rd488, 8; - or.b64 %rd490, %rd485, %rd487; - and.b64 %rd491, %rd431, 255; - or.b64 %rd492, %rd490, %rd489; - or.b64 %rd493, %rd492, %rd491; - xor.b64 %rd667, %rd493, 1272090201925444760; - xor.b64 %rd672, %rd473, 8796936657246353646; - xor.b64 %rd677, %rd460, 8746723911537738262; - xor.b64 %rd682, %rd447, 4239941492252378377; - mov.u64 %rd681, 8270816933120786537; - mov.u64 %rd680, -850687345431043546; - mov.u64 %rd679, 
8596393687355028144; - mov.u64 %rd678, -4073852189716399785; - mov.u64 %rd676, -4539347866060507718; - mov.u64 %rd675, -3233781605604422593; - mov.u64 %rd674, 570094237299545110; - mov.u64 %rd673, 5171152063242093102; - mov.u64 %rd671, 6782861118970774626; - mov.u64 %rd670, 7812475424661425213; - mov.u64 %rd669, 9119540418498120711; - mov.u64 %rd668, -7873636174015165430; - mov.u64 %rd666, -9207053471590684088; - mov.u64 %rd665, 3370482334374859748; - mov.u64 %rd664, -1544774801229058759; - mov.u64 %rd663, 6096431547456407061; - mov.u64 %rd662, -1792185402154627366; - mov.u64 %rd661, -6864424130110145268; - mov.u64 %rd660, 5690099369266491460; - mov.u64 %rd659, -5074726839974049192; - mov.u64 %rd658, 1592359455985097269; - mov.u64 %rd657, RC; - mov.u32 %r12569, -24; - -BB0_9: - xor.b64 %rd494, %rd681, %rd682; - xor.b64 %rd495, %rd494, %rd680; - xor.b64 %rd496, %rd495, %rd679; - xor.b64 %rd497, %rd496, %rd678; - xor.b64 %rd498, %rd676, %rd677; - xor.b64 %rd499, %rd498, %rd675; - xor.b64 %rd500, %rd499, %rd674; - xor.b64 %rd501, %rd500, %rd673; - xor.b64 %rd502, %rd671, %rd672; - xor.b64 %rd503, %rd502, %rd670; - xor.b64 %rd504, %rd503, %rd669; - xor.b64 %rd505, %rd504, %rd668; - xor.b64 %rd506, %rd666, %rd667; - xor.b64 %rd507, %rd506, %rd665; - xor.b64 %rd508, %rd507, %rd664; - xor.b64 %rd509, %rd508, %rd663; - xor.b64 %rd510, %rd661, %rd662; - xor.b64 %rd511, %rd510, %rd660; - xor.b64 %rd512, %rd511, %rd659; - xor.b64 %rd513, %rd512, %rd658; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd501, 1; - shr.b64 %rhs, %rd501, 63; - add.u64 %rd514, %lhs, %rhs; - } - xor.b64 %rd515, %rd513, %rd514; - xor.b64 %rd516, %rd682, %rd515; - xor.b64 %rd517, %rd681, %rd515; - xor.b64 %rd518, %rd680, %rd515; - xor.b64 %rd519, %rd679, %rd515; - xor.b64 %rd520, %rd678, %rd515; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd505, 1; - shr.b64 %rhs, %rd505, 63; - add.u64 %rd521, %lhs, %rhs; - } - xor.b64 %rd522, %rd521, %rd497; - xor.b64 %rd523, %rd677, %rd522; - xor.b64 %rd524, %rd676, %rd522; - xor.b64 %rd525, %rd675, %rd522; - xor.b64 %rd526, %rd674, %rd522; - xor.b64 %rd527, %rd673, %rd522; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd509, 1; - shr.b64 %rhs, %rd509, 63; - add.u64 %rd528, %lhs, %rhs; - } - xor.b64 %rd529, %rd528, %rd501; - xor.b64 %rd530, %rd672, %rd529; - xor.b64 %rd531, %rd671, %rd529; - xor.b64 %rd532, %rd670, %rd529; - xor.b64 %rd533, %rd669, %rd529; - xor.b64 %rd534, %rd668, %rd529; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd513, 1; - shr.b64 %rhs, %rd513, 63; - add.u64 %rd535, %lhs, %rhs; - } - xor.b64 %rd536, %rd535, %rd505; - xor.b64 %rd537, %rd667, %rd536; - xor.b64 %rd538, %rd666, %rd536; - xor.b64 %rd539, %rd665, %rd536; - xor.b64 %rd540, %rd664, %rd536; - xor.b64 %rd541, %rd663, %rd536; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd497, 1; - shr.b64 %rhs, %rd497, 63; - add.u64 %rd542, %lhs, %rhs; - } - xor.b64 %rd543, %rd542, %rd509; - xor.b64 %rd544, %rd662, %rd543; - xor.b64 %rd545, %rd661, %rd543; - xor.b64 %rd546, %rd660, %rd543; - xor.b64 %rd547, %rd659, %rd543; - xor.b64 %rd548, %rd658, %rd543; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd523, 1; - shr.b64 %rhs, %rd523, 63; - add.u64 %rd549, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd518, 3; - shr.b64 %rhs, %rd518, 61; - add.u64 %rd550, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd531, 6; - shr.b64 %rhs, %rd531, 58; - add.u64 %rd551, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 
%rhs; - shl.b64 %lhs, %rd525, 10; - shr.b64 %rhs, %rd525, 54; - add.u64 %rd552, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd533, 15; - shr.b64 %rhs, %rd533, 49; - add.u64 %rd553, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd540, 21; - shr.b64 %rhs, %rd540, 43; - add.u64 %rd554, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd537, 28; - shr.b64 %rhs, %rd537, 36; - add.u64 %rd555, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd517, 36; - shr.b64 %rhs, %rd517, 28; - add.u64 %rd556, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd526, 45; - shr.b64 %rhs, %rd526, 19; - add.u64 %rd557, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd538, 55; - shr.b64 %rhs, %rd538, 9; - add.u64 %rd558, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd527, 2; - shr.b64 %rhs, %rd527, 62; - add.u64 %rd559, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd548, 14; - shr.b64 %rhs, %rd548, 50; - add.u64 %rd560, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd544, 27; - shr.b64 %rhs, %rd544, 37; - add.u64 %rd561, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd519, 41; - shr.b64 %rhs, %rd519, 23; - add.u64 %rd562, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd541, 56; - shr.b64 %rhs, %rd541, 8; - add.u64 %rd563, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd547, 8; - shr.b64 %rhs, %rd547, 56; - add.u64 %rd564, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd539, 25; - shr.b64 %rhs, %rd539, 39; - add.u64 %rd565, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd532, 43; - shr.b64 %rhs, %rd532, 21; - add.u64 %rd566, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd530, 62; - shr.b64 %rhs, %rd530, 2; - add.u64 %rd567, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd520, 18; - shr.b64 %rhs, %rd520, 46; - add.u64 %rd568, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd546, 39; - shr.b64 %rhs, %rd546, 25; - add.u64 %rd569, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd534, 61; - shr.b64 %rhs, %rd534, 3; - add.u64 %rd570, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd545, 20; - shr.b64 %rhs, %rd545, 44; - add.u64 %rd571, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd524, 44; - shr.b64 %rhs, %rd524, 20; - add.u64 %rd572, %lhs, %rhs; - } - not.b64 %rd573, %rd572; - and.b64 %rd574, %rd566, %rd573; - xor.b64 %rd575, %rd574, %rd516; - not.b64 %rd576, %rd566; - and.b64 %rd577, %rd554, %rd576; - xor.b64 %rd677, %rd577, %rd572; - not.b64 %rd578, %rd554; - and.b64 %rd579, %rd560, %rd578; - xor.b64 %rd672, %rd579, %rd566; - not.b64 %rd580, %rd560; - and.b64 %rd581, %rd516, %rd580; - xor.b64 %rd667, %rd581, %rd554; - not.b64 %rd582, %rd516; - and.b64 %rd583, %rd572, %rd582; - xor.b64 %rd662, %rd560, %rd583; - not.b64 %rd584, %rd571; - and.b64 %rd585, %rd550, %rd584; - xor.b64 %rd681, %rd585, %rd555; - not.b64 %rd586, %rd550; - and.b64 %rd587, %rd557, %rd586; - xor.b64 %rd676, %rd587, %rd571; - not.b64 %rd588, %rd557; - and.b64 %rd589, %rd570, %rd588; - xor.b64 %rd671, %rd589, %rd550; - not.b64 %rd590, %rd570; - and.b64 %rd591, %rd555, %rd590; - xor.b64 %rd666, %rd591, %rd557; - not.b64 %rd592, 
%rd555; - and.b64 %rd593, %rd571, %rd592; - xor.b64 %rd661, %rd570, %rd593; - not.b64 %rd594, %rd551; - and.b64 %rd595, %rd565, %rd594; - xor.b64 %rd680, %rd595, %rd549; - not.b64 %rd596, %rd565; - and.b64 %rd597, %rd564, %rd596; - xor.b64 %rd675, %rd597, %rd551; - not.b64 %rd598, %rd564; - and.b64 %rd599, %rd568, %rd598; - xor.b64 %rd670, %rd599, %rd565; - not.b64 %rd600, %rd568; - and.b64 %rd601, %rd549, %rd600; - xor.b64 %rd665, %rd601, %rd564; - not.b64 %rd602, %rd549; - and.b64 %rd603, %rd551, %rd602; - xor.b64 %rd660, %rd568, %rd603; - not.b64 %rd604, %rd556; - and.b64 %rd605, %rd552, %rd604; - xor.b64 %rd679, %rd605, %rd561; - not.b64 %rd606, %rd552; - and.b64 %rd607, %rd553, %rd606; - xor.b64 %rd674, %rd607, %rd556; - not.b64 %rd608, %rd553; - and.b64 %rd609, %rd563, %rd608; - xor.b64 %rd669, %rd609, %rd552; - not.b64 %rd610, %rd563; - and.b64 %rd611, %rd561, %rd610; - xor.b64 %rd664, %rd611, %rd553; - not.b64 %rd612, %rd561; - and.b64 %rd613, %rd556, %rd612; - xor.b64 %rd659, %rd563, %rd613; - not.b64 %rd614, %rd558; - and.b64 %rd615, %rd569, %rd614; - xor.b64 %rd678, %rd615, %rd567; - not.b64 %rd616, %rd569; - and.b64 %rd617, %rd562, %rd616; - xor.b64 %rd673, %rd617, %rd558; - not.b64 %rd618, %rd562; - and.b64 %rd619, %rd559, %rd618; - xor.b64 %rd668, %rd619, %rd569; - not.b64 %rd620, %rd559; - and.b64 %rd621, %rd567, %rd620; - xor.b64 %rd663, %rd621, %rd562; - not.b64 %rd622, %rd567; - and.b64 %rd623, %rd558, %rd622; - xor.b64 %rd658, %rd559, %rd623; - ld.global.u64 %rd624, [%rd657]; - xor.b64 %rd682, %rd575, %rd624; - add.s64 %rd657, %rd657, 8; - add.s32 %r12569, %r12569, 1; - setp.ne.s32 %p10, %r12569, 0; - @%p10 bra BB0_9; - - ld.const.u64 %rd125, [target+24]; - setp.eq.s64 %p11, %rd667, %rd125; - @%p11 bra BB0_12; - bra.uni BB0_11; - -BB0_12: - ld.const.u64 %rd126, [target+16]; - setp.eq.s64 %p12, %rd672, %rd126; - @%p12 bra BB0_14; - bra.uni BB0_13; - -BB0_14: - ld.const.u64 %rd127, [target+8]; - setp.eq.s64 %p13, %rd677, %rd127; - @%p13 bra BB0_16; - bra.uni BB0_15; - -BB0_16: - ld.const.u64 %rd625, [target]; - setp.lt.u64 %p4, %rd682, %rd625; - @!%p4 bra BB0_18; - bra.uni BB0_17; - -BB0_11: - setp.lt.u64 %p1, %rd667, %rd125; - @!%p1 bra BB0_18; - bra.uni BB0_17; - -BB0_13: - setp.lt.u64 %p2, %rd672, %rd126; - @!%p2 bra BB0_18; - bra.uni BB0_17; - -BB0_15: - setp.lt.u64 %p3, %rd677, %rd127; - @!%p3 bra BB0_18; - bra.uni BB0_17; - -BB0_17: - ld.param.u64 %rd629, [heavy_hash_param_5]; - cvta.to.global.u64 %rd628, %rd629; - mov.u64 %rd626, 0; - atom.global.cas.b64 %rd627, [%rd628], %rd626, %rd6; - -BB0_18: - ret; -} - - +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-21554848 +// Cuda compilation tools, release 8.0, V8.0.61 +// Based on LLVM 3.4svn +// + +.version 5.0 +.target sm_20 +.address_size 64 + + // .globl heavy_hash +.const .align 1 .b8 matrix[4096]; +.const .align 8 .b8 hash_header[72]; +.const .align 8 .b8 target[32]; +.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; +.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; +.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 
0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; +.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; +.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; +.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; + +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5 +) +{ + .reg .pred %p<18>; + .reg .b16 %rs<8194>; + .reg .b32 %r<12570>; + .reg .b64 %rd<683>; + + + ld.param.u64 %rd128, [heavy_hash_param_0]; + ld.param.u64 %rd129, [heavy_hash_param_1]; + ld.param.u64 %rd131, [heavy_hash_param_2]; + ld.param.u64 %rd130, [heavy_hash_param_4]; + ld.param.u64 %rd132, [heavy_hash_param_5]; + ld.param.u8 %rs1, [heavy_hash_param_3]; + cvta.to.global.u64 %rd1, %rd132; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r8, %r5, %r6, %r7; + cvt.s64.s32 %rd2, %r8; + setp.ge.u64 %p6, %rd2, %rd131; + @%p6 bra BB0_18; + + cvt.u32.u64 %r9, %rd2; + setp.ne.s32 %p7, %r9, 0; + @%p7 bra BB0_3; + + mov.u64 %rd133, 0; 
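
The rho, pi, and RC tables declared in the added file above are the standard Keccak-f[1600] rotation offsets, lane permutation, and 24 round constants; both the deleted BB0_9 loop and the added BB0_7 loop below run exactly 24 iterations (the counter starts at -24 and steps to 0), advancing 8 bytes through RC per round, and the brace-wrapped shl.b64/shr.b64/add.u64 groups are 64-bit rotates. A self-contained CUDA sketch of that permutation, using the well-known compact Keccak-f formulation these tables belong to (not code from this PR; RC is shown as u64 words equal to the byte table above):

    // Compact Keccak-f[1600]: theta, rho+pi, chi, iota over 25 64-bit lanes.
    typedef unsigned long long u64;

    __constant__ unsigned char RHO[24] = {
        1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
        27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44};
    __constant__ unsigned char PI[24] = {
        10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
        15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1};
    __constant__ u64 RC64[24] = {
        0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
        0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
        0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
        0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
        0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
        0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
        0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
        0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL};

    __device__ u64 rotl64(u64 x, int n) { return (x << n) | (x >> (64 - n)); }

    __device__ void keccak_f1600(u64 st[25]) {
        u64 bc[5], t;
        for (int round = 0; round < 24; ++round) {
            for (int i = 0; i < 5; ++i)                   // theta
                bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
            for (int i = 0; i < 5; ++i) {
                t = bc[(i + 4) % 5] ^ rotl64(bc[(i + 1) % 5], 1);
                for (int j = 0; j < 25; j += 5) st[j + i] ^= t;
            }
            t = st[1];
            for (int i = 0; i < 24; ++i) {                // rho + pi
                int j = PI[i];
                u64 tmp = st[j];
                st[j] = rotl64(t, RHO[i]);
                t = tmp;
            }
            for (int j = 0; j < 25; j += 5) {             // chi
                for (int i = 0; i < 5; ++i) bc[i] = st[j + i];
                for (int i = 0; i < 5; ++i)
                    st[j + i] = bc[i] ^ (~bc[(i + 1) % 5] & bc[(i + 2) % 5]);
            }
            st[0] ^= RC64[round];                         // iota
        }
    }
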
+ st.global.u64 [%rd1], %rd133; + +BB0_3: + setp.eq.s16 %p8, %rs1, 0; + @%p8 bra BB0_5; + + cvta.to.global.u64 %rd134, %rd130; + shl.b64 %rd135, %rd2, 5; + add.s64 %rd136, %rd134, %rd135; + ld.global.v2.u64 {%rd137, %rd138}, [%rd136]; + mul.lo.s64 %rd141, %rd138, 5; + mul.lo.s64 %rd142, %rd138, 640; + shr.u64 %rd143, %rd141, 57; + or.b64 %rd144, %rd143, %rd142; + mul.lo.s64 %rd630, %rd144, 9; + shl.b64 %rd145, %rd138, 17; + ld.global.v2.u64 {%rd146, %rd147}, [%rd136+16]; + xor.b64 %rd149, %rd146, %rd137; + xor.b64 %rd151, %rd147, %rd138; + xor.b64 %rd152, %rd138, %rd149; + xor.b64 %rd153, %rd137, %rd151; + st.global.v2.u64 [%rd136], {%rd153, %rd152}; + xor.b64 %rd154, %rd149, %rd145; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd151, 45; + shr.b64 %rhs, %rd151, 19; + add.u64 %rd155, %lhs, %rhs; + } + st.global.v2.u64 [%rd136+16], {%rd154, %rd155}; + bra.uni BB0_6; + +BB0_5: + cvta.to.global.u64 %rd156, %rd130; + ld.global.u64 %rd157, [%rd156]; + xor.b64 %rd630, %rd157, %rd2; + +BB0_6: + and.b64 %rd174, %rd630, %rd128; + or.b64 %rd6, %rd174, %rd129; + xor.b64 %rd635, %rd6, 3343109343542796272; + ld.const.u64 %rd175, [hash_header+64]; + xor.b64 %rd640, %rd175, -510048929142394560; + ld.const.u64 %rd176, [hash_header+56]; + xor.b64 %rd645, %rd176, 8605242046444978844; + ld.const.u64 %rd177, [hash_header+48]; + xor.b64 %rd650, %rd177, 1745875063082670864; + ld.const.u64 %rd178, [hash_header+40]; + xor.b64 %rd655, %rd178, 5399642050693751366; + ld.const.u64 %rd179, [hash_header+32]; + xor.b64 %rd636, %rd179, 8876506674959887717; + ld.const.u64 %rd180, [hash_header+24]; + xor.b64 %rd641, %rd180, 1992179434288343456; + ld.const.u64 %rd181, [hash_header+16]; + xor.b64 %rd646, %rd181, 2188519011337848018; + ld.const.u64 %rd182, [hash_header+8]; + xor.b64 %rd651, %rd182, 3008272977830772284; + ld.const.u64 %rd183, [hash_header]; + xor.b64 %rd656, %rd183, 1242148031264380989; + mov.u64 %rd654, 1123092876221303306; + mov.u64 %rd653, 3784524041015224902; + mov.u64 %rd652, -8517909413761200310; + mov.u64 %rd649, 4963925045340115282; + mov.u64 %rd648, 1082795874807940378; + mov.u64 %rd647, 5237849264682708699; + mov.u64 %rd644, -1409360996057663723; + mov.u64 %rd643, -4494027153138273982; + mov.u64 %rd642, -5621391061570334094; + mov.u64 %rd639, -1817099578685924727; + mov.u64 %rd638, -5035616039755945756; + mov.u64 %rd637, 6706187291358897596; + mov.u64 %rd634, -5613068297060437469; + mov.u64 %rd633, -3386048033060200563; + mov.u64 %rd632, 196324915476054915; + mov.u64 %rd631, RC; + mov.u32 %r12568, -24; + +BB0_7: + xor.b64 %rd184, %rd655, %rd656; + xor.b64 %rd185, %rd184, %rd654; + xor.b64 %rd186, %rd185, %rd653; + xor.b64 %rd187, %rd186, %rd652; + xor.b64 %rd188, %rd650, %rd651; + xor.b64 %rd189, %rd188, %rd649; + xor.b64 %rd190, %rd189, %rd648; + xor.b64 %rd191, %rd190, %rd647; + xor.b64 %rd192, %rd645, %rd646; + xor.b64 %rd193, %rd192, %rd644; + xor.b64 %rd194, %rd193, %rd643; + xor.b64 %rd195, %rd194, %rd642; + xor.b64 %rd196, %rd640, %rd641; + xor.b64 %rd197, %rd196, %rd639; + xor.b64 %rd198, %rd197, %rd638; + xor.b64 %rd199, %rd198, %rd637; + xor.b64 %rd200, %rd635, %rd636; + xor.b64 %rd201, %rd200, %rd634; + xor.b64 %rd202, %rd201, %rd633; + xor.b64 %rd203, %rd202, %rd632; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd191, 1; + shr.b64 %rhs, %rd191, 63; + add.u64 %rd204, %lhs, %rhs; + } + xor.b64 %rd205, %rd203, %rd204; + xor.b64 %rd206, %rd656, %rd205; + xor.b64 %rd207, %rd655, %rd205; + xor.b64 %rd208, %rd654, %rd205; + xor.b64 %rd209, %rd653, %rd205; + xor.b64 
%rd210, %rd652, %rd205; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd195, 1; + shr.b64 %rhs, %rd195, 63; + add.u64 %rd211, %lhs, %rhs; + } + xor.b64 %rd212, %rd211, %rd187; + xor.b64 %rd213, %rd651, %rd212; + xor.b64 %rd214, %rd650, %rd212; + xor.b64 %rd215, %rd649, %rd212; + xor.b64 %rd216, %rd648, %rd212; + xor.b64 %rd217, %rd647, %rd212; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd199, 1; + shr.b64 %rhs, %rd199, 63; + add.u64 %rd218, %lhs, %rhs; + } + xor.b64 %rd219, %rd218, %rd191; + xor.b64 %rd220, %rd646, %rd219; + xor.b64 %rd221, %rd645, %rd219; + xor.b64 %rd222, %rd644, %rd219; + xor.b64 %rd223, %rd643, %rd219; + xor.b64 %rd224, %rd642, %rd219; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd203, 1; + shr.b64 %rhs, %rd203, 63; + add.u64 %rd225, %lhs, %rhs; + } + xor.b64 %rd226, %rd225, %rd195; + xor.b64 %rd227, %rd641, %rd226; + xor.b64 %rd228, %rd640, %rd226; + xor.b64 %rd229, %rd639, %rd226; + xor.b64 %rd230, %rd638, %rd226; + xor.b64 %rd231, %rd637, %rd226; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd187, 1; + shr.b64 %rhs, %rd187, 63; + add.u64 %rd232, %lhs, %rhs; + } + xor.b64 %rd233, %rd232, %rd199; + xor.b64 %rd234, %rd636, %rd233; + xor.b64 %rd235, %rd635, %rd233; + xor.b64 %rd236, %rd634, %rd233; + xor.b64 %rd237, %rd633, %rd233; + xor.b64 %rd238, %rd632, %rd233; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd213, 1; + shr.b64 %rhs, %rd213, 63; + add.u64 %rd239, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd208, 3; + shr.b64 %rhs, %rd208, 61; + add.u64 %rd240, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd221, 6; + shr.b64 %rhs, %rd221, 58; + add.u64 %rd241, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd215, 10; + shr.b64 %rhs, %rd215, 54; + add.u64 %rd242, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd223, 15; + shr.b64 %rhs, %rd223, 49; + add.u64 %rd243, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd230, 21; + shr.b64 %rhs, %rd230, 43; + add.u64 %rd244, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd227, 28; + shr.b64 %rhs, %rd227, 36; + add.u64 %rd245, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd207, 36; + shr.b64 %rhs, %rd207, 28; + add.u64 %rd246, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd216, 45; + shr.b64 %rhs, %rd216, 19; + add.u64 %rd247, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd228, 55; + shr.b64 %rhs, %rd228, 9; + add.u64 %rd248, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd217, 2; + shr.b64 %rhs, %rd217, 62; + add.u64 %rd249, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd238, 14; + shr.b64 %rhs, %rd238, 50; + add.u64 %rd250, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd234, 27; + shr.b64 %rhs, %rd234, 37; + add.u64 %rd251, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd209, 41; + shr.b64 %rhs, %rd209, 23; + add.u64 %rd252, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd231, 56; + shr.b64 %rhs, %rd231, 8; + add.u64 %rd253, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd237, 8; + shr.b64 %rhs, %rd237, 56; + add.u64 %rd254, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd229, 25; + shr.b64 %rhs, %rd229, 39; + add.u64 %rd255, %lhs, %rhs; 
+ } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd222, 43; + shr.b64 %rhs, %rd222, 21; + add.u64 %rd256, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd220, 62; + shr.b64 %rhs, %rd220, 2; + add.u64 %rd257, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd210, 18; + shr.b64 %rhs, %rd210, 46; + add.u64 %rd258, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd236, 39; + shr.b64 %rhs, %rd236, 25; + add.u64 %rd259, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd224, 61; + shr.b64 %rhs, %rd224, 3; + add.u64 %rd260, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd235, 20; + shr.b64 %rhs, %rd235, 44; + add.u64 %rd261, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd214, 44; + shr.b64 %rhs, %rd214, 20; + add.u64 %rd262, %lhs, %rhs; + } + not.b64 %rd263, %rd262; + and.b64 %rd264, %rd256, %rd263; + xor.b64 %rd265, %rd264, %rd206; + not.b64 %rd266, %rd256; + and.b64 %rd267, %rd244, %rd266; + xor.b64 %rd651, %rd267, %rd262; + not.b64 %rd268, %rd244; + and.b64 %rd269, %rd250, %rd268; + xor.b64 %rd646, %rd269, %rd256; + not.b64 %rd270, %rd250; + and.b64 %rd271, %rd206, %rd270; + xor.b64 %rd641, %rd271, %rd244; + not.b64 %rd272, %rd206; + and.b64 %rd273, %rd262, %rd272; + xor.b64 %rd636, %rd250, %rd273; + not.b64 %rd274, %rd261; + and.b64 %rd275, %rd240, %rd274; + xor.b64 %rd655, %rd275, %rd245; + not.b64 %rd276, %rd240; + and.b64 %rd277, %rd247, %rd276; + xor.b64 %rd650, %rd277, %rd261; + not.b64 %rd278, %rd247; + and.b64 %rd279, %rd260, %rd278; + xor.b64 %rd645, %rd279, %rd240; + not.b64 %rd280, %rd260; + and.b64 %rd281, %rd245, %rd280; + xor.b64 %rd640, %rd281, %rd247; + not.b64 %rd282, %rd245; + and.b64 %rd283, %rd261, %rd282; + xor.b64 %rd635, %rd260, %rd283; + not.b64 %rd284, %rd241; + and.b64 %rd285, %rd255, %rd284; + xor.b64 %rd654, %rd285, %rd239; + not.b64 %rd286, %rd255; + and.b64 %rd287, %rd254, %rd286; + xor.b64 %rd649, %rd287, %rd241; + not.b64 %rd288, %rd254; + and.b64 %rd289, %rd258, %rd288; + xor.b64 %rd644, %rd289, %rd255; + not.b64 %rd290, %rd258; + and.b64 %rd291, %rd239, %rd290; + xor.b64 %rd639, %rd291, %rd254; + not.b64 %rd292, %rd239; + and.b64 %rd293, %rd241, %rd292; + xor.b64 %rd634, %rd258, %rd293; + not.b64 %rd294, %rd246; + and.b64 %rd295, %rd242, %rd294; + xor.b64 %rd653, %rd295, %rd251; + not.b64 %rd296, %rd242; + and.b64 %rd297, %rd243, %rd296; + xor.b64 %rd648, %rd297, %rd246; + not.b64 %rd298, %rd243; + and.b64 %rd299, %rd253, %rd298; + xor.b64 %rd643, %rd299, %rd242; + not.b64 %rd300, %rd253; + and.b64 %rd301, %rd251, %rd300; + xor.b64 %rd638, %rd301, %rd243; + not.b64 %rd302, %rd251; + and.b64 %rd303, %rd246, %rd302; + xor.b64 %rd633, %rd253, %rd303; + not.b64 %rd304, %rd248; + and.b64 %rd305, %rd259, %rd304; + xor.b64 %rd652, %rd305, %rd257; + not.b64 %rd306, %rd259; + and.b64 %rd307, %rd252, %rd306; + xor.b64 %rd647, %rd307, %rd248; + not.b64 %rd308, %rd252; + and.b64 %rd309, %rd249, %rd308; + xor.b64 %rd642, %rd309, %rd259; + not.b64 %rd310, %rd249; + and.b64 %rd311, %rd257, %rd310; + xor.b64 %rd637, %rd311, %rd252; + not.b64 %rd312, %rd257; + and.b64 %rd313, %rd248, %rd312; + xor.b64 %rd632, %rd249, %rd313; + ld.global.u64 %rd314, [%rd631]; + xor.b64 %rd656, %rd265, %rd314; + add.s64 %rd631, %rd631, 8; + add.s32 %r12568, %r12568, 1; + setp.ne.s32 %p9, %r12568, 0; + @%p9 bra BB0_7; + + shr.u64 %rd337, %rd656, 8; + cvt.u32.u64 %r12, %rd337; + shr.u64 %rd338, %rd656, 16; + cvt.u32.u64 %r13, 
%rd338; + shr.u64 %rd339, %rd656, 24; + cvt.u32.u64 %r14, %rd339; + shr.u64 %rd340, %rd656, 32; + cvt.u32.u64 %r15, %rd340; + shr.u64 %rd341, %rd656, 40; + cvt.u32.u64 %r16, %rd341; + shr.u64 %rd342, %rd656, 48; + cvt.u32.u64 %r17, %rd342; + shr.u64 %rd343, %rd656, 56; + cvt.u32.u64 %r18, %rd343; + shr.u64 %rd344, %rd651, 8; + cvt.u32.u64 %r19, %rd344; + shr.u64 %rd345, %rd651, 16; + cvt.u32.u64 %r20, %rd345; + shr.u64 %rd346, %rd651, 24; + cvt.u32.u64 %r21, %rd346; + shr.u64 %rd347, %rd651, 32; + cvt.u32.u64 %r22, %rd347; + shr.u64 %rd348, %rd651, 40; + cvt.u32.u64 %r23, %rd348; + shr.u64 %rd349, %rd651, 48; + cvt.u32.u64 %r24, %rd349; + shr.u64 %rd350, %rd651, 56; + cvt.u32.u64 %r25, %rd350; + shr.u64 %rd351, %rd646, 8; + cvt.u32.u64 %r26, %rd351; + shr.u64 %rd352, %rd646, 16; + cvt.u32.u64 %r27, %rd352; + shr.u64 %rd353, %rd646, 24; + cvt.u32.u64 %r28, %rd353; + shr.u64 %rd354, %rd646, 32; + cvt.u32.u64 %r29, %rd354; + shr.u64 %rd355, %rd646, 40; + cvt.u32.u64 %r30, %rd355; + shr.u64 %rd356, %rd646, 48; + cvt.u32.u64 %r31, %rd356; + shr.u64 %rd357, %rd646, 56; + cvt.u32.u64 %r32, %rd357; + cvt.u32.u64 %r33, %rd656; + and.b32 %r34, %r33, 15; + bfe.u32 %r35, %r33, 12, 4; + and.b32 %r36, %r12, 15; + bfe.u32 %r37, %r33, 20, 4; + and.b32 %r38, %r13, 15; + shr.u32 %r39, %r33, 28; + and.b32 %r40, %r14, 15; + shr.u64 %rd358, %rd656, 36; + cvt.u32.u64 %r41, %rd358; + and.b32 %r42, %r41, 15; + and.b32 %r43, %r15, 15; + shr.u64 %rd359, %rd656, 44; + cvt.u32.u64 %r44, %rd359; + and.b32 %r45, %r44, 15; + and.b32 %r46, %r16, 15; + shr.u64 %rd360, %rd656, 52; + cvt.u32.u64 %r47, %rd360; + and.b32 %r48, %r47, 15; + and.b32 %r49, %r17, 15; + shr.u64 %rd361, %rd656, 60; + cvt.u32.u64 %r50, %rd361; + and.b32 %r51, %r18, 15; + cvt.u32.u64 %r52, %rd651; + and.b32 %r53, %r52, 15; + bfe.u32 %r54, %r52, 12, 4; + and.b32 %r55, %r19, 15; + bfe.u32 %r56, %r52, 20, 4; + and.b32 %r57, %r20, 15; + shr.u32 %r58, %r52, 28; + and.b32 %r59, %r21, 15; + shr.u64 %rd362, %rd651, 36; + cvt.u32.u64 %r60, %rd362; + and.b32 %r61, %r60, 15; + and.b32 %r62, %r22, 15; + shr.u64 %rd363, %rd651, 44; + cvt.u32.u64 %r63, %rd363; + and.b32 %r64, %r63, 15; + and.b32 %r65, %r23, 15; + shr.u64 %rd364, %rd651, 52; + cvt.u32.u64 %r66, %rd364; + and.b32 %r67, %r66, 15; + and.b32 %r68, %r24, 15; + shr.u64 %rd365, %rd651, 60; + cvt.u32.u64 %r69, %rd365; + and.b32 %r70, %r25, 15; + cvt.u32.u64 %r71, %rd646; + and.b32 %r72, %r71, 15; + bfe.u32 %r73, %r71, 12, 4; + and.b32 %r74, %r26, 15; + bfe.u32 %r75, %r71, 20, 4; + and.b32 %r76, %r27, 15; + shr.u32 %r77, %r71, 28; + and.b32 %r78, %r28, 15; + shr.u64 %rd366, %rd646, 36; + cvt.u32.u64 %r79, %rd366; + and.b32 %r80, %r79, 15; + and.b32 %r81, %r29, 15; + shr.u64 %rd367, %rd646, 44; + cvt.u32.u64 %r82, %rd367; + and.b32 %r83, %r82, 15; + and.b32 %r84, %r30, 15; + shr.u64 %rd368, %rd646, 52; + cvt.u32.u64 %r85, %rd368; + and.b32 %r86, %r85, 15; + and.b32 %r87, %r31, 15; + shr.u64 %rd369, %rd646, 60; + cvt.u32.u64 %r88, %rd369; + and.b32 %r89, %r32, 15; + cvt.u32.u64 %r90, %rd641; + and.b32 %r91, %r90, 15; + shr.u64 %rd370, %rd641, 8; + cvt.u32.u64 %r92, %rd370; + bfe.u32 %r93, %r90, 12, 4; + and.b32 %r94, %r92, 15; + shr.u64 %rd371, %rd641, 16; + cvt.u32.u64 %r95, %rd371; + bfe.u32 %r96, %r90, 20, 4; + and.b32 %r97, %r95, 15; + shr.u64 %rd372, %rd641, 24; + cvt.u32.u64 %r98, %rd372; + shr.u32 %r99, %r90, 28; + and.b32 %r100, %r98, 15; + shr.u64 %rd373, %rd641, 32; + cvt.u32.u64 %r101, %rd373; + shr.u64 %rd374, %rd641, 36; + cvt.u32.u64 %r102, %rd374; + and.b32 %r103, %r102, 15; + and.b32 %r104, 
%r101, 15; + shr.u64 %rd375, %rd641, 40; + cvt.u32.u64 %r105, %rd375; + shr.u64 %rd376, %rd641, 44; + cvt.u32.u64 %r106, %rd376; + and.b32 %r107, %r106, 15; + and.b32 %r108, %r105, 15; + shr.u64 %rd377, %rd641, 48; + cvt.u32.u64 %r109, %rd377; + shr.u64 %rd378, %rd641, 52; + cvt.u32.u64 %r110, %rd378; + and.b32 %r111, %r110, 15; + and.b32 %r112, %r109, 15; + shr.u64 %rd379, %rd641, 56; + cvt.u32.u64 %r113, %rd379; + shr.u64 %rd380, %rd641, 60; + cvt.u32.u64 %r114, %rd380; + and.b32 %r115, %r113, 15; + ld.const.v4.u8 {%rs2, %rs3, %rs4, %rs5}, [matrix]; + cvt.u32.u16 %r116, %rs5; + cvt.s32.s8 %r117, %r116; + cvt.u32.u16 %r118, %rs4; + cvt.s32.s8 %r119, %r118; + cvt.u32.u16 %r120, %rs2; + cvt.s32.s8 %r121, %r120; + cvt.u32.u16 %r122, %rs3; + cvt.s32.s8 %r123, %r122; + bfe.u32 %r124, %r33, 4, 4; + mul.lo.s32 %r125, %r34, %r123; + mad.lo.s32 %r126, %r124, %r121, %r125; + mad.lo.s32 %r127, %r35, %r119, %r126; + mad.lo.s32 %r128, %r36, %r117, %r127; + ld.const.v4.u8 {%rs10, %rs11, %rs12, %rs13}, [matrix+4]; + cvt.u32.u16 %r129, %rs13; + cvt.s32.s8 %r130, %r129; + cvt.u32.u16 %r131, %rs12; + cvt.s32.s8 %r132, %r131; + cvt.u32.u16 %r133, %rs11; + cvt.s32.s8 %r134, %r133; + cvt.u32.u16 %r135, %rs10; + cvt.s32.s8 %r136, %r135; + mad.lo.s32 %r137, %r37, %r136, %r128; + mad.lo.s32 %r138, %r38, %r134, %r137; + mad.lo.s32 %r139, %r39, %r132, %r138; + mad.lo.s32 %r140, %r40, %r130, %r139; + ld.const.v4.u8 {%rs18, %rs19, %rs20, %rs21}, [matrix+8]; + cvt.u32.u16 %r141, %rs21; + cvt.s32.s8 %r142, %r141; + cvt.u32.u16 %r143, %rs20; + cvt.s32.s8 %r144, %r143; + cvt.u32.u16 %r145, %rs19; + cvt.s32.s8 %r146, %r145; + cvt.u32.u16 %r147, %rs18; + cvt.s32.s8 %r148, %r147; + mad.lo.s32 %r149, %r42, %r148, %r140; + mad.lo.s32 %r150, %r43, %r146, %r149; + mad.lo.s32 %r151, %r45, %r144, %r150; + mad.lo.s32 %r152, %r46, %r142, %r151; + ld.const.v4.u8 {%rs26, %rs27, %rs28, %rs29}, [matrix+12]; + cvt.u32.u16 %r153, %rs29; + cvt.s32.s8 %r154, %r153; + cvt.u32.u16 %r155, %rs28; + cvt.s32.s8 %r156, %r155; + cvt.u32.u16 %r157, %rs27; + cvt.s32.s8 %r158, %r157; + cvt.u32.u16 %r159, %rs26; + cvt.s32.s8 %r160, %r159; + mad.lo.s32 %r161, %r48, %r160, %r152; + mad.lo.s32 %r162, %r49, %r158, %r161; + mad.lo.s32 %r163, %r50, %r156, %r162; + mad.lo.s32 %r164, %r51, %r154, %r163; + ld.const.v4.u8 {%rs34, %rs35, %rs36, %rs37}, [matrix+16]; + cvt.u32.u16 %r165, %rs37; + cvt.s32.s8 %r166, %r165; + cvt.u32.u16 %r167, %rs36; + cvt.s32.s8 %r168, %r167; + cvt.u32.u16 %r169, %rs35; + cvt.s32.s8 %r170, %r169; + cvt.u32.u16 %r171, %rs34; + cvt.s32.s8 %r172, %r171; + bfe.u32 %r173, %r52, 4, 4; + mad.lo.s32 %r174, %r173, %r172, %r164; + mad.lo.s32 %r175, %r53, %r170, %r174; + mad.lo.s32 %r176, %r54, %r168, %r175; + mad.lo.s32 %r177, %r55, %r166, %r176; + ld.const.v4.u8 {%rs42, %rs43, %rs44, %rs45}, [matrix+20]; + cvt.u32.u16 %r178, %rs45; + cvt.s32.s8 %r179, %r178; + cvt.u32.u16 %r180, %rs44; + cvt.s32.s8 %r181, %r180; + cvt.u32.u16 %r182, %rs43; + cvt.s32.s8 %r183, %r182; + cvt.u32.u16 %r184, %rs42; + cvt.s32.s8 %r185, %r184; + mad.lo.s32 %r186, %r56, %r185, %r177; + mad.lo.s32 %r187, %r57, %r183, %r186; + mad.lo.s32 %r188, %r58, %r181, %r187; + mad.lo.s32 %r189, %r59, %r179, %r188; + ld.const.v4.u8 {%rs50, %rs51, %rs52, %rs53}, [matrix+24]; + cvt.u32.u16 %r190, %rs53; + cvt.s32.s8 %r191, %r190; + cvt.u32.u16 %r192, %rs52; + cvt.s32.s8 %r193, %r192; + cvt.u32.u16 %r194, %rs51; + cvt.s32.s8 %r195, %r194; + cvt.u32.u16 %r196, %rs50; + cvt.s32.s8 %r197, %r196; + mad.lo.s32 %r198, %r61, %r197, %r189; + mad.lo.s32 %r199, %r62, %r195, %r198; + 
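+ // The { shl.b64 k; shr.b64 64-k; add.u64 } groups above each implement a
+ // 64-bit rotate-left; add behaves as or here because the two shifted halves
+ // share no bits. The rotation counts (1, 3, 6, 10, 15, 21, 28, 36, 45, 55,
+ // 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44) are exactly the
+ // Keccak-f[1600] rho offsets; the not/and/xor block after them is the chi
+ // step, and the ld.global.u64/xor.b64 pair at the loop tail applies the
+ // per-round constant (iota) before @%p9 bra BB0_7 begins the next round.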
mad.lo.s32 %r200, %r64, %r193, %r199; + mad.lo.s32 %r201, %r65, %r191, %r200; + ld.const.v4.u8 {%rs58, %rs59, %rs60, %rs61}, [matrix+28]; + cvt.u32.u16 %r202, %rs61; + cvt.s32.s8 %r203, %r202; + cvt.u32.u16 %r204, %rs60; + cvt.s32.s8 %r205, %r204; + cvt.u32.u16 %r206, %rs59; + cvt.s32.s8 %r207, %r206; + cvt.u32.u16 %r208, %rs58; + cvt.s32.s8 %r209, %r208; + mad.lo.s32 %r210, %r67, %r209, %r201; + mad.lo.s32 %r211, %r68, %r207, %r210; + mad.lo.s32 %r212, %r69, %r205, %r211; + mad.lo.s32 %r213, %r70, %r203, %r212; + ld.const.v4.u8 {%rs66, %rs67, %rs68, %rs69}, [matrix+32]; + cvt.u32.u16 %r214, %rs69; + cvt.s32.s8 %r215, %r214; + cvt.u32.u16 %r216, %rs68; + cvt.s32.s8 %r217, %r216; + cvt.u32.u16 %r218, %rs67; + cvt.s32.s8 %r219, %r218; + cvt.u32.u16 %r220, %rs66; + cvt.s32.s8 %r221, %r220; + bfe.u32 %r222, %r71, 4, 4; + mad.lo.s32 %r223, %r222, %r221, %r213; + mad.lo.s32 %r224, %r72, %r219, %r223; + mad.lo.s32 %r225, %r73, %r217, %r224; + mad.lo.s32 %r226, %r74, %r215, %r225; + ld.const.v4.u8 {%rs74, %rs75, %rs76, %rs77}, [matrix+36]; + cvt.u32.u16 %r227, %rs77; + cvt.s32.s8 %r228, %r227; + cvt.u32.u16 %r229, %rs76; + cvt.s32.s8 %r230, %r229; + cvt.u32.u16 %r231, %rs75; + cvt.s32.s8 %r232, %r231; + cvt.u32.u16 %r233, %rs74; + cvt.s32.s8 %r234, %r233; + mad.lo.s32 %r235, %r75, %r234, %r226; + mad.lo.s32 %r236, %r76, %r232, %r235; + mad.lo.s32 %r237, %r77, %r230, %r236; + mad.lo.s32 %r238, %r78, %r228, %r237; + ld.const.v4.u8 {%rs82, %rs83, %rs84, %rs85}, [matrix+40]; + cvt.u32.u16 %r239, %rs85; + cvt.s32.s8 %r240, %r239; + cvt.u32.u16 %r241, %rs84; + cvt.s32.s8 %r242, %r241; + cvt.u32.u16 %r243, %rs83; + cvt.s32.s8 %r244, %r243; + cvt.u32.u16 %r245, %rs82; + cvt.s32.s8 %r246, %r245; + mad.lo.s32 %r247, %r80, %r246, %r238; + mad.lo.s32 %r248, %r81, %r244, %r247; + mad.lo.s32 %r249, %r83, %r242, %r248; + mad.lo.s32 %r250, %r84, %r240, %r249; + ld.const.v4.u8 {%rs90, %rs91, %rs92, %rs93}, [matrix+44]; + cvt.u32.u16 %r251, %rs93; + cvt.s32.s8 %r252, %r251; + cvt.u32.u16 %r253, %rs92; + cvt.s32.s8 %r254, %r253; + cvt.u32.u16 %r255, %rs91; + cvt.s32.s8 %r256, %r255; + cvt.u32.u16 %r257, %rs90; + cvt.s32.s8 %r258, %r257; + mad.lo.s32 %r259, %r86, %r258, %r250; + mad.lo.s32 %r260, %r87, %r256, %r259; + mad.lo.s32 %r261, %r88, %r254, %r260; + mad.lo.s32 %r262, %r89, %r252, %r261; + ld.const.v4.u8 {%rs98, %rs99, %rs100, %rs101}, [matrix+48]; + cvt.u32.u16 %r263, %rs101; + cvt.s32.s8 %r264, %r263; + cvt.u32.u16 %r265, %rs100; + cvt.s32.s8 %r266, %r265; + cvt.u32.u16 %r267, %rs99; + cvt.s32.s8 %r268, %r267; + cvt.u32.u16 %r269, %rs98; + cvt.s32.s8 %r270, %r269; + bfe.u32 %r271, %r90, 4, 4; + mad.lo.s32 %r272, %r271, %r270, %r262; + mad.lo.s32 %r273, %r91, %r268, %r272; + mad.lo.s32 %r274, %r93, %r266, %r273; + mad.lo.s32 %r275, %r94, %r264, %r274; + ld.const.v4.u8 {%rs106, %rs107, %rs108, %rs109}, [matrix+52]; + cvt.u32.u16 %r276, %rs109; + cvt.s32.s8 %r277, %r276; + cvt.u32.u16 %r278, %rs108; + cvt.s32.s8 %r279, %r278; + cvt.u32.u16 %r280, %rs107; + cvt.s32.s8 %r281, %r280; + cvt.u32.u16 %r282, %rs106; + cvt.s32.s8 %r283, %r282; + mad.lo.s32 %r284, %r96, %r283, %r275; + mad.lo.s32 %r285, %r97, %r281, %r284; + mad.lo.s32 %r286, %r99, %r279, %r285; + mad.lo.s32 %r287, %r100, %r277, %r286; + ld.const.v4.u8 {%rs114, %rs115, %rs116, %rs117}, [matrix+56]; + cvt.u32.u16 %r288, %rs117; + cvt.s32.s8 %r289, %r288; + cvt.u32.u16 %r290, %rs116; + cvt.s32.s8 %r291, %r290; + cvt.u32.u16 %r292, %rs115; + cvt.s32.s8 %r293, %r292; + cvt.u32.u16 %r294, %rs114; + cvt.s32.s8 %r295, %r294; + mad.lo.s32 %r296, %r103, %r295, 
%r287; + mad.lo.s32 %r297, %r104, %r293, %r296; + mad.lo.s32 %r298, %r107, %r291, %r297; + mad.lo.s32 %r299, %r108, %r289, %r298; + ld.const.v4.u8 {%rs122, %rs123, %rs124, %rs125}, [matrix+60]; + cvt.u32.u16 %r300, %rs125; + cvt.s32.s8 %r301, %r300; + cvt.u32.u16 %r302, %rs124; + cvt.s32.s8 %r303, %r302; + cvt.u32.u16 %r304, %rs123; + cvt.s32.s8 %r305, %r304; + cvt.u32.u16 %r306, %rs122; + cvt.s32.s8 %r307, %r306; + mad.lo.s32 %r308, %r111, %r307, %r299; + mad.lo.s32 %r309, %r112, %r305, %r308; + mad.lo.s32 %r310, %r114, %r303, %r309; + mad.lo.s32 %r311, %r115, %r301, %r310; + ld.const.v4.u8 {%rs130, %rs131, %rs132, %rs133}, [matrix+64]; + cvt.u32.u16 %r312, %rs133; + cvt.s32.s8 %r313, %r312; + cvt.u32.u16 %r314, %rs132; + cvt.s32.s8 %r315, %r314; + cvt.u32.u16 %r316, %rs130; + cvt.s32.s8 %r317, %r316; + cvt.u32.u16 %r318, %rs131; + cvt.s32.s8 %r319, %r318; + mul.lo.s32 %r320, %r34, %r319; + mad.lo.s32 %r321, %r124, %r317, %r320; + mad.lo.s32 %r322, %r35, %r315, %r321; + mad.lo.s32 %r323, %r36, %r313, %r322; + ld.const.v4.u8 {%rs138, %rs139, %rs140, %rs141}, [matrix+68]; + cvt.u32.u16 %r324, %rs141; + cvt.s32.s8 %r325, %r324; + cvt.u32.u16 %r326, %rs140; + cvt.s32.s8 %r327, %r326; + cvt.u32.u16 %r328, %rs139; + cvt.s32.s8 %r329, %r328; + cvt.u32.u16 %r330, %rs138; + cvt.s32.s8 %r331, %r330; + mad.lo.s32 %r332, %r37, %r331, %r323; + mad.lo.s32 %r333, %r38, %r329, %r332; + mad.lo.s32 %r334, %r39, %r327, %r333; + mad.lo.s32 %r335, %r40, %r325, %r334; + ld.const.v4.u8 {%rs146, %rs147, %rs148, %rs149}, [matrix+72]; + cvt.u32.u16 %r336, %rs149; + cvt.s32.s8 %r337, %r336; + cvt.u32.u16 %r338, %rs148; + cvt.s32.s8 %r339, %r338; + cvt.u32.u16 %r340, %rs147; + cvt.s32.s8 %r341, %r340; + cvt.u32.u16 %r342, %rs146; + cvt.s32.s8 %r343, %r342; + mad.lo.s32 %r344, %r42, %r343, %r335; + mad.lo.s32 %r345, %r43, %r341, %r344; + mad.lo.s32 %r346, %r45, %r339, %r345; + mad.lo.s32 %r347, %r46, %r337, %r346; + ld.const.v4.u8 {%rs154, %rs155, %rs156, %rs157}, [matrix+76]; + cvt.u32.u16 %r348, %rs157; + cvt.s32.s8 %r349, %r348; + cvt.u32.u16 %r350, %rs156; + cvt.s32.s8 %r351, %r350; + cvt.u32.u16 %r352, %rs155; + cvt.s32.s8 %r353, %r352; + cvt.u32.u16 %r354, %rs154; + cvt.s32.s8 %r355, %r354; + mad.lo.s32 %r356, %r48, %r355, %r347; + mad.lo.s32 %r357, %r49, %r353, %r356; + mad.lo.s32 %r358, %r50, %r351, %r357; + mad.lo.s32 %r359, %r51, %r349, %r358; + ld.const.v4.u8 {%rs162, %rs163, %rs164, %rs165}, [matrix+80]; + cvt.u32.u16 %r360, %rs165; + cvt.s32.s8 %r361, %r360; + cvt.u32.u16 %r362, %rs164; + cvt.s32.s8 %r363, %r362; + cvt.u32.u16 %r364, %rs163; + cvt.s32.s8 %r365, %r364; + cvt.u32.u16 %r366, %rs162; + cvt.s32.s8 %r367, %r366; + mad.lo.s32 %r368, %r173, %r367, %r359; + mad.lo.s32 %r369, %r53, %r365, %r368; + mad.lo.s32 %r370, %r54, %r363, %r369; + mad.lo.s32 %r371, %r55, %r361, %r370; + ld.const.v4.u8 {%rs170, %rs171, %rs172, %rs173}, [matrix+84]; + cvt.u32.u16 %r372, %rs173; + cvt.s32.s8 %r373, %r372; + cvt.u32.u16 %r374, %rs172; + cvt.s32.s8 %r375, %r374; + cvt.u32.u16 %r376, %rs171; + cvt.s32.s8 %r377, %r376; + cvt.u32.u16 %r378, %rs170; + cvt.s32.s8 %r379, %r378; + mad.lo.s32 %r380, %r56, %r379, %r371; + mad.lo.s32 %r381, %r57, %r377, %r380; + mad.lo.s32 %r382, %r58, %r375, %r381; + mad.lo.s32 %r383, %r59, %r373, %r382; + ld.const.v4.u8 {%rs178, %rs179, %rs180, %rs181}, [matrix+88]; + cvt.u32.u16 %r384, %rs181; + cvt.s32.s8 %r385, %r384; + cvt.u32.u16 %r386, %rs180; + cvt.s32.s8 %r387, %r386; + cvt.u32.u16 %r388, %rs179; + cvt.s32.s8 %r389, %r388; + cvt.u32.u16 %r390, %rs178; + cvt.s32.s8 %r391, %r390; + 
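+ // Matrix stage: after the permutation, the four state words used here
+ // (%rd656, %rd651, %rd646, %rd641) are sliced into 4-bit nibbles
+ // (and.b32 ..., 15 and bfe.u32). Each ld.const.v4.u8 fetches four 8-bit
+ // coefficients of a matrix row from constant memory, cvt.s32.s8 treats
+ // them as signed, and mad.lo.s32 accumulates nibble*coefficient into a
+ // 32-bit dot product; a fresh mul.lo.s32 (e.g. %r320) marks the start of
+ // the next row. This reads as a heavyhash-style product of a constant
+ // matrix with the nibble vector of the hash, though that interpretation
+ // is inferred from the instruction pattern, not stated in the source.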
mad.lo.s32 %r392, %r61, %r391, %r383; + mad.lo.s32 %r393, %r62, %r389, %r392; + mad.lo.s32 %r394, %r64, %r387, %r393; + mad.lo.s32 %r395, %r65, %r385, %r394; + ld.const.v4.u8 {%rs186, %rs187, %rs188, %rs189}, [matrix+92]; + cvt.u32.u16 %r396, %rs189; + cvt.s32.s8 %r397, %r396; + cvt.u32.u16 %r398, %rs188; + cvt.s32.s8 %r399, %r398; + cvt.u32.u16 %r400, %rs187; + cvt.s32.s8 %r401, %r400; + cvt.u32.u16 %r402, %rs186; + cvt.s32.s8 %r403, %r402; + mad.lo.s32 %r404, %r67, %r403, %r395; + mad.lo.s32 %r405, %r68, %r401, %r404; + mad.lo.s32 %r406, %r69, %r399, %r405; + mad.lo.s32 %r407, %r70, %r397, %r406; + ld.const.v4.u8 {%rs194, %rs195, %rs196, %rs197}, [matrix+96]; + cvt.u32.u16 %r408, %rs197; + cvt.s32.s8 %r409, %r408; + cvt.u32.u16 %r410, %rs196; + cvt.s32.s8 %r411, %r410; + cvt.u32.u16 %r412, %rs195; + cvt.s32.s8 %r413, %r412; + cvt.u32.u16 %r414, %rs194; + cvt.s32.s8 %r415, %r414; + mad.lo.s32 %r416, %r222, %r415, %r407; + mad.lo.s32 %r417, %r72, %r413, %r416; + mad.lo.s32 %r418, %r73, %r411, %r417; + mad.lo.s32 %r419, %r74, %r409, %r418; + ld.const.v4.u8 {%rs202, %rs203, %rs204, %rs205}, [matrix+100]; + cvt.u32.u16 %r420, %rs205; + cvt.s32.s8 %r421, %r420; + cvt.u32.u16 %r422, %rs204; + cvt.s32.s8 %r423, %r422; + cvt.u32.u16 %r424, %rs203; + cvt.s32.s8 %r425, %r424; + cvt.u32.u16 %r426, %rs202; + cvt.s32.s8 %r427, %r426; + mad.lo.s32 %r428, %r75, %r427, %r419; + mad.lo.s32 %r429, %r76, %r425, %r428; + mad.lo.s32 %r430, %r77, %r423, %r429; + mad.lo.s32 %r431, %r78, %r421, %r430; + ld.const.v4.u8 {%rs210, %rs211, %rs212, %rs213}, [matrix+104]; + cvt.u32.u16 %r432, %rs213; + cvt.s32.s8 %r433, %r432; + cvt.u32.u16 %r434, %rs212; + cvt.s32.s8 %r435, %r434; + cvt.u32.u16 %r436, %rs211; + cvt.s32.s8 %r437, %r436; + cvt.u32.u16 %r438, %rs210; + cvt.s32.s8 %r439, %r438; + mad.lo.s32 %r440, %r80, %r439, %r431; + mad.lo.s32 %r441, %r81, %r437, %r440; + mad.lo.s32 %r442, %r83, %r435, %r441; + mad.lo.s32 %r443, %r84, %r433, %r442; + ld.const.v4.u8 {%rs218, %rs219, %rs220, %rs221}, [matrix+108]; + cvt.u32.u16 %r444, %rs221; + cvt.s32.s8 %r445, %r444; + cvt.u32.u16 %r446, %rs220; + cvt.s32.s8 %r447, %r446; + cvt.u32.u16 %r448, %rs219; + cvt.s32.s8 %r449, %r448; + cvt.u32.u16 %r450, %rs218; + cvt.s32.s8 %r451, %r450; + mad.lo.s32 %r452, %r86, %r451, %r443; + mad.lo.s32 %r453, %r87, %r449, %r452; + mad.lo.s32 %r454, %r88, %r447, %r453; + mad.lo.s32 %r455, %r89, %r445, %r454; + ld.const.v4.u8 {%rs226, %rs227, %rs228, %rs229}, [matrix+112]; + cvt.u32.u16 %r456, %rs229; + cvt.s32.s8 %r457, %r456; + cvt.u32.u16 %r458, %rs228; + cvt.s32.s8 %r459, %r458; + cvt.u32.u16 %r460, %rs227; + cvt.s32.s8 %r461, %r460; + cvt.u32.u16 %r462, %rs226; + cvt.s32.s8 %r463, %r462; + mad.lo.s32 %r464, %r271, %r463, %r455; + mad.lo.s32 %r465, %r91, %r461, %r464; + mad.lo.s32 %r466, %r93, %r459, %r465; + mad.lo.s32 %r467, %r94, %r457, %r466; + ld.const.v4.u8 {%rs234, %rs235, %rs236, %rs237}, [matrix+116]; + cvt.u32.u16 %r468, %rs237; + cvt.s32.s8 %r469, %r468; + cvt.u32.u16 %r470, %rs236; + cvt.s32.s8 %r471, %r470; + cvt.u32.u16 %r472, %rs235; + cvt.s32.s8 %r473, %r472; + cvt.u32.u16 %r474, %rs234; + cvt.s32.s8 %r475, %r474; + mad.lo.s32 %r476, %r96, %r475, %r467; + mad.lo.s32 %r477, %r97, %r473, %r476; + mad.lo.s32 %r478, %r99, %r471, %r477; + mad.lo.s32 %r479, %r100, %r469, %r478; + ld.const.v4.u8 {%rs242, %rs243, %rs244, %rs245}, [matrix+120]; + cvt.u32.u16 %r480, %rs245; + cvt.s32.s8 %r481, %r480; + cvt.u32.u16 %r482, %rs244; + cvt.s32.s8 %r483, %r482; + cvt.u32.u16 %r484, %rs243; + cvt.s32.s8 %r485, %r484; + cvt.u32.u16 
%r486, %rs242; + cvt.s32.s8 %r487, %r486; + mad.lo.s32 %r488, %r103, %r487, %r479; + mad.lo.s32 %r489, %r104, %r485, %r488; + mad.lo.s32 %r490, %r107, %r483, %r489; + mad.lo.s32 %r491, %r108, %r481, %r490; + ld.const.v4.u8 {%rs250, %rs251, %rs252, %rs253}, [matrix+124]; + cvt.u32.u16 %r492, %rs253; + cvt.s32.s8 %r493, %r492; + cvt.u32.u16 %r494, %rs252; + cvt.s32.s8 %r495, %r494; + cvt.u32.u16 %r496, %rs251; + cvt.s32.s8 %r497, %r496; + cvt.u32.u16 %r498, %rs250; + cvt.s32.s8 %r499, %r498; + mad.lo.s32 %r500, %r111, %r499, %r491; + mad.lo.s32 %r501, %r112, %r497, %r500; + mad.lo.s32 %r502, %r114, %r495, %r501; + mad.lo.s32 %r503, %r115, %r493, %r502; + shr.u32 %r504, %r311, 6; + and.b32 %r505, %r504, 240; + shr.u32 %r506, %r503, 10; + or.b32 %r507, %r506, %r505; + xor.b32 %r508, %r33, %r507; + cvt.u64.u32 %rd381, %r508; + ld.const.v4.u8 {%rs258, %rs259, %rs260, %rs261}, [matrix+128]; + cvt.u32.u16 %r509, %rs261; + cvt.s32.s8 %r510, %r509; + cvt.u32.u16 %r511, %rs260; + cvt.s32.s8 %r512, %r511; + cvt.u32.u16 %r513, %rs258; + cvt.s32.s8 %r514, %r513; + cvt.u32.u16 %r515, %rs259; + cvt.s32.s8 %r516, %r515; + mul.lo.s32 %r517, %r34, %r516; + mad.lo.s32 %r518, %r124, %r514, %r517; + mad.lo.s32 %r519, %r35, %r512, %r518; + mad.lo.s32 %r520, %r36, %r510, %r519; + ld.const.v4.u8 {%rs266, %rs267, %rs268, %rs269}, [matrix+132]; + cvt.u32.u16 %r521, %rs269; + cvt.s32.s8 %r522, %r521; + cvt.u32.u16 %r523, %rs268; + cvt.s32.s8 %r524, %r523; + cvt.u32.u16 %r525, %rs267; + cvt.s32.s8 %r526, %r525; + cvt.u32.u16 %r527, %rs266; + cvt.s32.s8 %r528, %r527; + mad.lo.s32 %r529, %r37, %r528, %r520; + mad.lo.s32 %r530, %r38, %r526, %r529; + mad.lo.s32 %r531, %r39, %r524, %r530; + mad.lo.s32 %r532, %r40, %r522, %r531; + ld.const.v4.u8 {%rs274, %rs275, %rs276, %rs277}, [matrix+136]; + cvt.u32.u16 %r533, %rs277; + cvt.s32.s8 %r534, %r533; + cvt.u32.u16 %r535, %rs276; + cvt.s32.s8 %r536, %r535; + cvt.u32.u16 %r537, %rs275; + cvt.s32.s8 %r538, %r537; + cvt.u32.u16 %r539, %rs274; + cvt.s32.s8 %r540, %r539; + mad.lo.s32 %r541, %r42, %r540, %r532; + mad.lo.s32 %r542, %r43, %r538, %r541; + mad.lo.s32 %r543, %r45, %r536, %r542; + mad.lo.s32 %r544, %r46, %r534, %r543; + ld.const.v4.u8 {%rs282, %rs283, %rs284, %rs285}, [matrix+140]; + cvt.u32.u16 %r545, %rs285; + cvt.s32.s8 %r546, %r545; + cvt.u32.u16 %r547, %rs284; + cvt.s32.s8 %r548, %r547; + cvt.u32.u16 %r549, %rs283; + cvt.s32.s8 %r550, %r549; + cvt.u32.u16 %r551, %rs282; + cvt.s32.s8 %r552, %r551; + mad.lo.s32 %r553, %r48, %r552, %r544; + mad.lo.s32 %r554, %r49, %r550, %r553; + mad.lo.s32 %r555, %r50, %r548, %r554; + mad.lo.s32 %r556, %r51, %r546, %r555; + ld.const.v4.u8 {%rs290, %rs291, %rs292, %rs293}, [matrix+144]; + cvt.u32.u16 %r557, %rs293; + cvt.s32.s8 %r558, %r557; + cvt.u32.u16 %r559, %rs292; + cvt.s32.s8 %r560, %r559; + cvt.u32.u16 %r561, %rs291; + cvt.s32.s8 %r562, %r561; + cvt.u32.u16 %r563, %rs290; + cvt.s32.s8 %r564, %r563; + mad.lo.s32 %r565, %r173, %r564, %r556; + mad.lo.s32 %r566, %r53, %r562, %r565; + mad.lo.s32 %r567, %r54, %r560, %r566; + mad.lo.s32 %r568, %r55, %r558, %r567; + ld.const.v4.u8 {%rs298, %rs299, %rs300, %rs301}, [matrix+148]; + cvt.u32.u16 %r569, %rs301; + cvt.s32.s8 %r570, %r569; + cvt.u32.u16 %r571, %rs300; + cvt.s32.s8 %r572, %r571; + cvt.u32.u16 %r573, %rs299; + cvt.s32.s8 %r574, %r573; + cvt.u32.u16 %r575, %rs298; + cvt.s32.s8 %r576, %r575; + mad.lo.s32 %r577, %r56, %r576, %r568; + mad.lo.s32 %r578, %r57, %r574, %r577; + mad.lo.s32 %r579, %r58, %r572, %r578; + mad.lo.s32 %r580, %r59, %r570, %r579; + ld.const.v4.u8 {%rs306, %rs307, 
%rs308, %rs309}, [matrix+152]; + cvt.u32.u16 %r581, %rs309; + cvt.s32.s8 %r582, %r581; + cvt.u32.u16 %r583, %rs308; + cvt.s32.s8 %r584, %r583; + cvt.u32.u16 %r585, %rs307; + cvt.s32.s8 %r586, %r585; + cvt.u32.u16 %r587, %rs306; + cvt.s32.s8 %r588, %r587; + mad.lo.s32 %r589, %r61, %r588, %r580; + mad.lo.s32 %r590, %r62, %r586, %r589; + mad.lo.s32 %r591, %r64, %r584, %r590; + mad.lo.s32 %r592, %r65, %r582, %r591; + ld.const.v4.u8 {%rs314, %rs315, %rs316, %rs317}, [matrix+156]; + cvt.u32.u16 %r593, %rs317; + cvt.s32.s8 %r594, %r593; + cvt.u32.u16 %r595, %rs316; + cvt.s32.s8 %r596, %r595; + cvt.u32.u16 %r597, %rs315; + cvt.s32.s8 %r598, %r597; + cvt.u32.u16 %r599, %rs314; + cvt.s32.s8 %r600, %r599; + mad.lo.s32 %r601, %r67, %r600, %r592; + mad.lo.s32 %r602, %r68, %r598, %r601; + mad.lo.s32 %r603, %r69, %r596, %r602; + mad.lo.s32 %r604, %r70, %r594, %r603; + ld.const.v4.u8 {%rs322, %rs323, %rs324, %rs325}, [matrix+160]; + cvt.u32.u16 %r605, %rs325; + cvt.s32.s8 %r606, %r605; + cvt.u32.u16 %r607, %rs324; + cvt.s32.s8 %r608, %r607; + cvt.u32.u16 %r609, %rs323; + cvt.s32.s8 %r610, %r609; + cvt.u32.u16 %r611, %rs322; + cvt.s32.s8 %r612, %r611; + mad.lo.s32 %r613, %r222, %r612, %r604; + mad.lo.s32 %r614, %r72, %r610, %r613; + mad.lo.s32 %r615, %r73, %r608, %r614; + mad.lo.s32 %r616, %r74, %r606, %r615; + ld.const.v4.u8 {%rs330, %rs331, %rs332, %rs333}, [matrix+164]; + cvt.u32.u16 %r617, %rs333; + cvt.s32.s8 %r618, %r617; + cvt.u32.u16 %r619, %rs332; + cvt.s32.s8 %r620, %r619; + cvt.u32.u16 %r621, %rs331; + cvt.s32.s8 %r622, %r621; + cvt.u32.u16 %r623, %rs330; + cvt.s32.s8 %r624, %r623; + mad.lo.s32 %r625, %r75, %r624, %r616; + mad.lo.s32 %r626, %r76, %r622, %r625; + mad.lo.s32 %r627, %r77, %r620, %r626; + mad.lo.s32 %r628, %r78, %r618, %r627; + ld.const.v4.u8 {%rs338, %rs339, %rs340, %rs341}, [matrix+168]; + cvt.u32.u16 %r629, %rs341; + cvt.s32.s8 %r630, %r629; + cvt.u32.u16 %r631, %rs340; + cvt.s32.s8 %r632, %r631; + cvt.u32.u16 %r633, %rs339; + cvt.s32.s8 %r634, %r633; + cvt.u32.u16 %r635, %rs338; + cvt.s32.s8 %r636, %r635; + mad.lo.s32 %r637, %r80, %r636, %r628; + mad.lo.s32 %r638, %r81, %r634, %r637; + mad.lo.s32 %r639, %r83, %r632, %r638; + mad.lo.s32 %r640, %r84, %r630, %r639; + ld.const.v4.u8 {%rs346, %rs347, %rs348, %rs349}, [matrix+172]; + cvt.u32.u16 %r641, %rs349; + cvt.s32.s8 %r642, %r641; + cvt.u32.u16 %r643, %rs348; + cvt.s32.s8 %r644, %r643; + cvt.u32.u16 %r645, %rs347; + cvt.s32.s8 %r646, %r645; + cvt.u32.u16 %r647, %rs346; + cvt.s32.s8 %r648, %r647; + mad.lo.s32 %r649, %r86, %r648, %r640; + mad.lo.s32 %r650, %r87, %r646, %r649; + mad.lo.s32 %r651, %r88, %r644, %r650; + mad.lo.s32 %r652, %r89, %r642, %r651; + ld.const.v4.u8 {%rs354, %rs355, %rs356, %rs357}, [matrix+176]; + cvt.u32.u16 %r653, %rs357; + cvt.s32.s8 %r654, %r653; + cvt.u32.u16 %r655, %rs356; + cvt.s32.s8 %r656, %r655; + cvt.u32.u16 %r657, %rs355; + cvt.s32.s8 %r658, %r657; + cvt.u32.u16 %r659, %rs354; + cvt.s32.s8 %r660, %r659; + mad.lo.s32 %r661, %r271, %r660, %r652; + mad.lo.s32 %r662, %r91, %r658, %r661; + mad.lo.s32 %r663, %r93, %r656, %r662; + mad.lo.s32 %r664, %r94, %r654, %r663; + ld.const.v4.u8 {%rs362, %rs363, %rs364, %rs365}, [matrix+180]; + cvt.u32.u16 %r665, %rs365; + cvt.s32.s8 %r666, %r665; + cvt.u32.u16 %r667, %rs364; + cvt.s32.s8 %r668, %r667; + cvt.u32.u16 %r669, %rs363; + cvt.s32.s8 %r670, %r669; + cvt.u32.u16 %r671, %rs362; + cvt.s32.s8 %r672, %r671; + mad.lo.s32 %r673, %r96, %r672, %r664; + mad.lo.s32 %r674, %r97, %r670, %r673; + mad.lo.s32 %r675, %r99, %r668, %r674; + mad.lo.s32 %r676, %r100, %r666, 
%r675; + ld.const.v4.u8 {%rs370, %rs371, %rs372, %rs373}, [matrix+184]; + cvt.u32.u16 %r677, %rs373; + cvt.s32.s8 %r678, %r677; + cvt.u32.u16 %r679, %rs372; + cvt.s32.s8 %r680, %r679; + cvt.u32.u16 %r681, %rs371; + cvt.s32.s8 %r682, %r681; + cvt.u32.u16 %r683, %rs370; + cvt.s32.s8 %r684, %r683; + mad.lo.s32 %r685, %r103, %r684, %r676; + mad.lo.s32 %r686, %r104, %r682, %r685; + mad.lo.s32 %r687, %r107, %r680, %r686; + mad.lo.s32 %r688, %r108, %r678, %r687; + ld.const.v4.u8 {%rs378, %rs379, %rs380, %rs381}, [matrix+188]; + cvt.u32.u16 %r689, %rs381; + cvt.s32.s8 %r690, %r689; + cvt.u32.u16 %r691, %rs380; + cvt.s32.s8 %r692, %r691; + cvt.u32.u16 %r693, %rs379; + cvt.s32.s8 %r694, %r693; + cvt.u32.u16 %r695, %rs378; + cvt.s32.s8 %r696, %r695; + mad.lo.s32 %r697, %r111, %r696, %r688; + mad.lo.s32 %r698, %r112, %r694, %r697; + mad.lo.s32 %r699, %r114, %r692, %r698; + mad.lo.s32 %r700, %r115, %r690, %r699; + ld.const.v4.u8 {%rs386, %rs387, %rs388, %rs389}, [matrix+192]; + cvt.u32.u16 %r701, %rs389; + cvt.s32.s8 %r702, %r701; + cvt.u32.u16 %r703, %rs388; + cvt.s32.s8 %r704, %r703; + cvt.u32.u16 %r705, %rs386; + cvt.s32.s8 %r706, %r705; + cvt.u32.u16 %r707, %rs387; + cvt.s32.s8 %r708, %r707; + mul.lo.s32 %r709, %r34, %r708; + mad.lo.s32 %r710, %r124, %r706, %r709; + mad.lo.s32 %r711, %r35, %r704, %r710; + mad.lo.s32 %r712, %r36, %r702, %r711; + ld.const.v4.u8 {%rs394, %rs395, %rs396, %rs397}, [matrix+196]; + cvt.u32.u16 %r713, %rs397; + cvt.s32.s8 %r714, %r713; + cvt.u32.u16 %r715, %rs396; + cvt.s32.s8 %r716, %r715; + cvt.u32.u16 %r717, %rs395; + cvt.s32.s8 %r718, %r717; + cvt.u32.u16 %r719, %rs394; + cvt.s32.s8 %r720, %r719; + mad.lo.s32 %r721, %r37, %r720, %r712; + mad.lo.s32 %r722, %r38, %r718, %r721; + mad.lo.s32 %r723, %r39, %r716, %r722; + mad.lo.s32 %r724, %r40, %r714, %r723; + ld.const.v4.u8 {%rs402, %rs403, %rs404, %rs405}, [matrix+200]; + cvt.u32.u16 %r725, %rs405; + cvt.s32.s8 %r726, %r725; + cvt.u32.u16 %r727, %rs404; + cvt.s32.s8 %r728, %r727; + cvt.u32.u16 %r729, %rs403; + cvt.s32.s8 %r730, %r729; + cvt.u32.u16 %r731, %rs402; + cvt.s32.s8 %r732, %r731; + mad.lo.s32 %r733, %r42, %r732, %r724; + mad.lo.s32 %r734, %r43, %r730, %r733; + mad.lo.s32 %r735, %r45, %r728, %r734; + mad.lo.s32 %r736, %r46, %r726, %r735; + ld.const.v4.u8 {%rs410, %rs411, %rs412, %rs413}, [matrix+204]; + cvt.u32.u16 %r737, %rs413; + cvt.s32.s8 %r738, %r737; + cvt.u32.u16 %r739, %rs412; + cvt.s32.s8 %r740, %r739; + cvt.u32.u16 %r741, %rs411; + cvt.s32.s8 %r742, %r741; + cvt.u32.u16 %r743, %rs410; + cvt.s32.s8 %r744, %r743; + mad.lo.s32 %r745, %r48, %r744, %r736; + mad.lo.s32 %r746, %r49, %r742, %r745; + mad.lo.s32 %r747, %r50, %r740, %r746; + mad.lo.s32 %r748, %r51, %r738, %r747; + ld.const.v4.u8 {%rs418, %rs419, %rs420, %rs421}, [matrix+208]; + cvt.u32.u16 %r749, %rs421; + cvt.s32.s8 %r750, %r749; + cvt.u32.u16 %r751, %rs420; + cvt.s32.s8 %r752, %r751; + cvt.u32.u16 %r753, %rs419; + cvt.s32.s8 %r754, %r753; + cvt.u32.u16 %r755, %rs418; + cvt.s32.s8 %r756, %r755; + mad.lo.s32 %r757, %r173, %r756, %r748; + mad.lo.s32 %r758, %r53, %r754, %r757; + mad.lo.s32 %r759, %r54, %r752, %r758; + mad.lo.s32 %r760, %r55, %r750, %r759; + ld.const.v4.u8 {%rs426, %rs427, %rs428, %rs429}, [matrix+212]; + cvt.u32.u16 %r761, %rs429; + cvt.s32.s8 %r762, %r761; + cvt.u32.u16 %r763, %rs428; + cvt.s32.s8 %r764, %r763; + cvt.u32.u16 %r765, %rs427; + cvt.s32.s8 %r766, %r765; + cvt.u32.u16 %r767, %rs426; + cvt.s32.s8 %r768, %r767; + mad.lo.s32 %r769, %r56, %r768, %r760; + mad.lo.s32 %r770, %r57, %r766, %r769; + mad.lo.s32 %r771, %r58, %r764, 
%r770; + mad.lo.s32 %r772, %r59, %r762, %r771; + ld.const.v4.u8 {%rs434, %rs435, %rs436, %rs437}, [matrix+216]; + cvt.u32.u16 %r773, %rs437; + cvt.s32.s8 %r774, %r773; + cvt.u32.u16 %r775, %rs436; + cvt.s32.s8 %r776, %r775; + cvt.u32.u16 %r777, %rs435; + cvt.s32.s8 %r778, %r777; + cvt.u32.u16 %r779, %rs434; + cvt.s32.s8 %r780, %r779; + mad.lo.s32 %r781, %r61, %r780, %r772; + mad.lo.s32 %r782, %r62, %r778, %r781; + mad.lo.s32 %r783, %r64, %r776, %r782; + mad.lo.s32 %r784, %r65, %r774, %r783; + ld.const.v4.u8 {%rs442, %rs443, %rs444, %rs445}, [matrix+220]; + cvt.u32.u16 %r785, %rs445; + cvt.s32.s8 %r786, %r785; + cvt.u32.u16 %r787, %rs444; + cvt.s32.s8 %r788, %r787; + cvt.u32.u16 %r789, %rs443; + cvt.s32.s8 %r790, %r789; + cvt.u32.u16 %r791, %rs442; + cvt.s32.s8 %r792, %r791; + mad.lo.s32 %r793, %r67, %r792, %r784; + mad.lo.s32 %r794, %r68, %r790, %r793; + mad.lo.s32 %r795, %r69, %r788, %r794; + mad.lo.s32 %r796, %r70, %r786, %r795; + ld.const.v4.u8 {%rs450, %rs451, %rs452, %rs453}, [matrix+224]; + cvt.u32.u16 %r797, %rs453; + cvt.s32.s8 %r798, %r797; + cvt.u32.u16 %r799, %rs452; + cvt.s32.s8 %r800, %r799; + cvt.u32.u16 %r801, %rs451; + cvt.s32.s8 %r802, %r801; + cvt.u32.u16 %r803, %rs450; + cvt.s32.s8 %r804, %r803; + mad.lo.s32 %r805, %r222, %r804, %r796; + mad.lo.s32 %r806, %r72, %r802, %r805; + mad.lo.s32 %r807, %r73, %r800, %r806; + mad.lo.s32 %r808, %r74, %r798, %r807; + ld.const.v4.u8 {%rs458, %rs459, %rs460, %rs461}, [matrix+228]; + cvt.u32.u16 %r809, %rs461; + cvt.s32.s8 %r810, %r809; + cvt.u32.u16 %r811, %rs460; + cvt.s32.s8 %r812, %r811; + cvt.u32.u16 %r813, %rs459; + cvt.s32.s8 %r814, %r813; + cvt.u32.u16 %r815, %rs458; + cvt.s32.s8 %r816, %r815; + mad.lo.s32 %r817, %r75, %r816, %r808; + mad.lo.s32 %r818, %r76, %r814, %r817; + mad.lo.s32 %r819, %r77, %r812, %r818; + mad.lo.s32 %r820, %r78, %r810, %r819; + ld.const.v4.u8 {%rs466, %rs467, %rs468, %rs469}, [matrix+232]; + cvt.u32.u16 %r821, %rs469; + cvt.s32.s8 %r822, %r821; + cvt.u32.u16 %r823, %rs468; + cvt.s32.s8 %r824, %r823; + cvt.u32.u16 %r825, %rs467; + cvt.s32.s8 %r826, %r825; + cvt.u32.u16 %r827, %rs466; + cvt.s32.s8 %r828, %r827; + mad.lo.s32 %r829, %r80, %r828, %r820; + mad.lo.s32 %r830, %r81, %r826, %r829; + mad.lo.s32 %r831, %r83, %r824, %r830; + mad.lo.s32 %r832, %r84, %r822, %r831; + ld.const.v4.u8 {%rs474, %rs475, %rs476, %rs477}, [matrix+236]; + cvt.u32.u16 %r833, %rs477; + cvt.s32.s8 %r834, %r833; + cvt.u32.u16 %r835, %rs476; + cvt.s32.s8 %r836, %r835; + cvt.u32.u16 %r837, %rs475; + cvt.s32.s8 %r838, %r837; + cvt.u32.u16 %r839, %rs474; + cvt.s32.s8 %r840, %r839; + mad.lo.s32 %r841, %r86, %r840, %r832; + mad.lo.s32 %r842, %r87, %r838, %r841; + mad.lo.s32 %r843, %r88, %r836, %r842; + mad.lo.s32 %r844, %r89, %r834, %r843; + ld.const.v4.u8 {%rs482, %rs483, %rs484, %rs485}, [matrix+240]; + cvt.u32.u16 %r845, %rs485; + cvt.s32.s8 %r846, %r845; + cvt.u32.u16 %r847, %rs484; + cvt.s32.s8 %r848, %r847; + cvt.u32.u16 %r849, %rs483; + cvt.s32.s8 %r850, %r849; + cvt.u32.u16 %r851, %rs482; + cvt.s32.s8 %r852, %r851; + mad.lo.s32 %r853, %r271, %r852, %r844; + mad.lo.s32 %r854, %r91, %r850, %r853; + mad.lo.s32 %r855, %r93, %r848, %r854; + mad.lo.s32 %r856, %r94, %r846, %r855; + ld.const.v4.u8 {%rs490, %rs491, %rs492, %rs493}, [matrix+244]; + cvt.u32.u16 %r857, %rs493; + cvt.s32.s8 %r858, %r857; + cvt.u32.u16 %r859, %rs492; + cvt.s32.s8 %r860, %r859; + cvt.u32.u16 %r861, %rs491; + cvt.s32.s8 %r862, %r861; + cvt.u32.u16 %r863, %rs490; + cvt.s32.s8 %r864, %r863; + mad.lo.s32 %r865, %r96, %r864, %r856; + mad.lo.s32 %r866, %r97, %r862, 
%r865; + mad.lo.s32 %r867, %r99, %r860, %r866; + mad.lo.s32 %r868, %r100, %r858, %r867; + ld.const.v4.u8 {%rs498, %rs499, %rs500, %rs501}, [matrix+248]; + cvt.u32.u16 %r869, %rs501; + cvt.s32.s8 %r870, %r869; + cvt.u32.u16 %r871, %rs500; + cvt.s32.s8 %r872, %r871; + cvt.u32.u16 %r873, %rs499; + cvt.s32.s8 %r874, %r873; + cvt.u32.u16 %r875, %rs498; + cvt.s32.s8 %r876, %r875; + mad.lo.s32 %r877, %r103, %r876, %r868; + mad.lo.s32 %r878, %r104, %r874, %r877; + mad.lo.s32 %r879, %r107, %r872, %r878; + mad.lo.s32 %r880, %r108, %r870, %r879; + ld.const.v4.u8 {%rs506, %rs507, %rs508, %rs509}, [matrix+252]; + cvt.u32.u16 %r881, %rs509; + cvt.s32.s8 %r882, %r881; + cvt.u32.u16 %r883, %rs508; + cvt.s32.s8 %r884, %r883; + cvt.u32.u16 %r885, %rs507; + cvt.s32.s8 %r886, %r885; + cvt.u32.u16 %r887, %rs506; + cvt.s32.s8 %r888, %r887; + mad.lo.s32 %r889, %r111, %r888, %r880; + mad.lo.s32 %r890, %r112, %r886, %r889; + mad.lo.s32 %r891, %r114, %r884, %r890; + mad.lo.s32 %r892, %r115, %r882, %r891; + shr.u32 %r893, %r700, 6; + and.b32 %r894, %r893, 240; + shr.u32 %r895, %r892, 10; + or.b32 %r896, %r895, %r894; + xor.b32 %r897, %r12, %r896; + ld.const.v4.u8 {%rs514, %rs515, %rs516, %rs517}, [matrix+256]; + cvt.u32.u16 %r898, %rs517; + cvt.s32.s8 %r899, %r898; + cvt.u32.u16 %r900, %rs516; + cvt.s32.s8 %r901, %r900; + cvt.u32.u16 %r902, %rs514; + cvt.s32.s8 %r903, %r902; + cvt.u32.u16 %r904, %rs515; + cvt.s32.s8 %r905, %r904; + mul.lo.s32 %r906, %r34, %r905; + mad.lo.s32 %r907, %r124, %r903, %r906; + mad.lo.s32 %r908, %r35, %r901, %r907; + mad.lo.s32 %r909, %r36, %r899, %r908; + ld.const.v4.u8 {%rs522, %rs523, %rs524, %rs525}, [matrix+260]; + cvt.u32.u16 %r910, %rs525; + cvt.s32.s8 %r911, %r910; + cvt.u32.u16 %r912, %rs524; + cvt.s32.s8 %r913, %r912; + cvt.u32.u16 %r914, %rs523; + cvt.s32.s8 %r915, %r914; + cvt.u32.u16 %r916, %rs522; + cvt.s32.s8 %r917, %r916; + mad.lo.s32 %r918, %r37, %r917, %r909; + mad.lo.s32 %r919, %r38, %r915, %r918; + mad.lo.s32 %r920, %r39, %r913, %r919; + mad.lo.s32 %r921, %r40, %r911, %r920; + ld.const.v4.u8 {%rs530, %rs531, %rs532, %rs533}, [matrix+264]; + cvt.u32.u16 %r922, %rs533; + cvt.s32.s8 %r923, %r922; + cvt.u32.u16 %r924, %rs532; + cvt.s32.s8 %r925, %r924; + cvt.u32.u16 %r926, %rs531; + cvt.s32.s8 %r927, %r926; + cvt.u32.u16 %r928, %rs530; + cvt.s32.s8 %r929, %r928; + mad.lo.s32 %r930, %r42, %r929, %r921; + mad.lo.s32 %r931, %r43, %r927, %r930; + mad.lo.s32 %r932, %r45, %r925, %r931; + mad.lo.s32 %r933, %r46, %r923, %r932; + ld.const.v4.u8 {%rs538, %rs539, %rs540, %rs541}, [matrix+268]; + cvt.u32.u16 %r934, %rs541; + cvt.s32.s8 %r935, %r934; + cvt.u32.u16 %r936, %rs540; + cvt.s32.s8 %r937, %r936; + cvt.u32.u16 %r938, %rs539; + cvt.s32.s8 %r939, %r938; + cvt.u32.u16 %r940, %rs538; + cvt.s32.s8 %r941, %r940; + mad.lo.s32 %r942, %r48, %r941, %r933; + mad.lo.s32 %r943, %r49, %r939, %r942; + mad.lo.s32 %r944, %r50, %r937, %r943; + mad.lo.s32 %r945, %r51, %r935, %r944; + ld.const.v4.u8 {%rs546, %rs547, %rs548, %rs549}, [matrix+272]; + cvt.u32.u16 %r946, %rs549; + cvt.s32.s8 %r947, %r946; + cvt.u32.u16 %r948, %rs548; + cvt.s32.s8 %r949, %r948; + cvt.u32.u16 %r950, %rs547; + cvt.s32.s8 %r951, %r950; + cvt.u32.u16 %r952, %rs546; + cvt.s32.s8 %r953, %r952; + mad.lo.s32 %r954, %r173, %r953, %r945; + mad.lo.s32 %r955, %r53, %r951, %r954; + mad.lo.s32 %r956, %r54, %r949, %r955; + mad.lo.s32 %r957, %r55, %r947, %r956; + ld.const.v4.u8 {%rs554, %rs555, %rs556, %rs557}, [matrix+276]; + cvt.u32.u16 %r958, %rs557; + cvt.s32.s8 %r959, %r958; + cvt.u32.u16 %r960, %rs556; + cvt.s32.s8 %r961, %r960; 
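+ // Each pair of row dot products is folded into one output byte: for row
+ // results a and b, shr.u32 a, 6 followed by and.b32 ..., 240 computes
+ // ((a >> 10) & 0xF) << 4, shr.u32 b, 10 contributes the low nibble,
+ // or.b32 packs them, and xor.b32 folds the byte into the corresponding
+ // byte of the pre-multiply state (%r33, %r12, %r13, %r14, ...); the
+ // surplus high bits appear to be dropped when cvt.u64.u32 results
+ // (%rd381, %rd382, %rd383, ...) are repacked into 64-bit output words.
+ // The multiply is fully unrolled, one such block per output byte.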
+ cvt.u32.u16 %r962, %rs555; + cvt.s32.s8 %r963, %r962; + cvt.u32.u16 %r964, %rs554; + cvt.s32.s8 %r965, %r964; + mad.lo.s32 %r966, %r56, %r965, %r957; + mad.lo.s32 %r967, %r57, %r963, %r966; + mad.lo.s32 %r968, %r58, %r961, %r967; + mad.lo.s32 %r969, %r59, %r959, %r968; + ld.const.v4.u8 {%rs562, %rs563, %rs564, %rs565}, [matrix+280]; + cvt.u32.u16 %r970, %rs565; + cvt.s32.s8 %r971, %r970; + cvt.u32.u16 %r972, %rs564; + cvt.s32.s8 %r973, %r972; + cvt.u32.u16 %r974, %rs563; + cvt.s32.s8 %r975, %r974; + cvt.u32.u16 %r976, %rs562; + cvt.s32.s8 %r977, %r976; + mad.lo.s32 %r978, %r61, %r977, %r969; + mad.lo.s32 %r979, %r62, %r975, %r978; + mad.lo.s32 %r980, %r64, %r973, %r979; + mad.lo.s32 %r981, %r65, %r971, %r980; + ld.const.v4.u8 {%rs570, %rs571, %rs572, %rs573}, [matrix+284]; + cvt.u32.u16 %r982, %rs573; + cvt.s32.s8 %r983, %r982; + cvt.u32.u16 %r984, %rs572; + cvt.s32.s8 %r985, %r984; + cvt.u32.u16 %r986, %rs571; + cvt.s32.s8 %r987, %r986; + cvt.u32.u16 %r988, %rs570; + cvt.s32.s8 %r989, %r988; + mad.lo.s32 %r990, %r67, %r989, %r981; + mad.lo.s32 %r991, %r68, %r987, %r990; + mad.lo.s32 %r992, %r69, %r985, %r991; + mad.lo.s32 %r993, %r70, %r983, %r992; + ld.const.v4.u8 {%rs578, %rs579, %rs580, %rs581}, [matrix+288]; + cvt.u32.u16 %r994, %rs581; + cvt.s32.s8 %r995, %r994; + cvt.u32.u16 %r996, %rs580; + cvt.s32.s8 %r997, %r996; + cvt.u32.u16 %r998, %rs579; + cvt.s32.s8 %r999, %r998; + cvt.u32.u16 %r1000, %rs578; + cvt.s32.s8 %r1001, %r1000; + mad.lo.s32 %r1002, %r222, %r1001, %r993; + mad.lo.s32 %r1003, %r72, %r999, %r1002; + mad.lo.s32 %r1004, %r73, %r997, %r1003; + mad.lo.s32 %r1005, %r74, %r995, %r1004; + ld.const.v4.u8 {%rs586, %rs587, %rs588, %rs589}, [matrix+292]; + cvt.u32.u16 %r1006, %rs589; + cvt.s32.s8 %r1007, %r1006; + cvt.u32.u16 %r1008, %rs588; + cvt.s32.s8 %r1009, %r1008; + cvt.u32.u16 %r1010, %rs587; + cvt.s32.s8 %r1011, %r1010; + cvt.u32.u16 %r1012, %rs586; + cvt.s32.s8 %r1013, %r1012; + mad.lo.s32 %r1014, %r75, %r1013, %r1005; + mad.lo.s32 %r1015, %r76, %r1011, %r1014; + mad.lo.s32 %r1016, %r77, %r1009, %r1015; + mad.lo.s32 %r1017, %r78, %r1007, %r1016; + ld.const.v4.u8 {%rs594, %rs595, %rs596, %rs597}, [matrix+296]; + cvt.u32.u16 %r1018, %rs597; + cvt.s32.s8 %r1019, %r1018; + cvt.u32.u16 %r1020, %rs596; + cvt.s32.s8 %r1021, %r1020; + cvt.u32.u16 %r1022, %rs595; + cvt.s32.s8 %r1023, %r1022; + cvt.u32.u16 %r1024, %rs594; + cvt.s32.s8 %r1025, %r1024; + mad.lo.s32 %r1026, %r80, %r1025, %r1017; + mad.lo.s32 %r1027, %r81, %r1023, %r1026; + mad.lo.s32 %r1028, %r83, %r1021, %r1027; + mad.lo.s32 %r1029, %r84, %r1019, %r1028; + ld.const.v4.u8 {%rs602, %rs603, %rs604, %rs605}, [matrix+300]; + cvt.u32.u16 %r1030, %rs605; + cvt.s32.s8 %r1031, %r1030; + cvt.u32.u16 %r1032, %rs604; + cvt.s32.s8 %r1033, %r1032; + cvt.u32.u16 %r1034, %rs603; + cvt.s32.s8 %r1035, %r1034; + cvt.u32.u16 %r1036, %rs602; + cvt.s32.s8 %r1037, %r1036; + mad.lo.s32 %r1038, %r86, %r1037, %r1029; + mad.lo.s32 %r1039, %r87, %r1035, %r1038; + mad.lo.s32 %r1040, %r88, %r1033, %r1039; + mad.lo.s32 %r1041, %r89, %r1031, %r1040; + ld.const.v4.u8 {%rs610, %rs611, %rs612, %rs613}, [matrix+304]; + cvt.u32.u16 %r1042, %rs613; + cvt.s32.s8 %r1043, %r1042; + cvt.u32.u16 %r1044, %rs612; + cvt.s32.s8 %r1045, %r1044; + cvt.u32.u16 %r1046, %rs611; + cvt.s32.s8 %r1047, %r1046; + cvt.u32.u16 %r1048, %rs610; + cvt.s32.s8 %r1049, %r1048; + mad.lo.s32 %r1050, %r271, %r1049, %r1041; + mad.lo.s32 %r1051, %r91, %r1047, %r1050; + mad.lo.s32 %r1052, %r93, %r1045, %r1051; + mad.lo.s32 %r1053, %r94, %r1043, %r1052; + ld.const.v4.u8 {%rs618, 
%rs619, %rs620, %rs621}, [matrix+308]; + cvt.u32.u16 %r1054, %rs621; + cvt.s32.s8 %r1055, %r1054; + cvt.u32.u16 %r1056, %rs620; + cvt.s32.s8 %r1057, %r1056; + cvt.u32.u16 %r1058, %rs619; + cvt.s32.s8 %r1059, %r1058; + cvt.u32.u16 %r1060, %rs618; + cvt.s32.s8 %r1061, %r1060; + mad.lo.s32 %r1062, %r96, %r1061, %r1053; + mad.lo.s32 %r1063, %r97, %r1059, %r1062; + mad.lo.s32 %r1064, %r99, %r1057, %r1063; + mad.lo.s32 %r1065, %r100, %r1055, %r1064; + ld.const.v4.u8 {%rs626, %rs627, %rs628, %rs629}, [matrix+312]; + cvt.u32.u16 %r1066, %rs629; + cvt.s32.s8 %r1067, %r1066; + cvt.u32.u16 %r1068, %rs628; + cvt.s32.s8 %r1069, %r1068; + cvt.u32.u16 %r1070, %rs627; + cvt.s32.s8 %r1071, %r1070; + cvt.u32.u16 %r1072, %rs626; + cvt.s32.s8 %r1073, %r1072; + mad.lo.s32 %r1074, %r103, %r1073, %r1065; + mad.lo.s32 %r1075, %r104, %r1071, %r1074; + mad.lo.s32 %r1076, %r107, %r1069, %r1075; + mad.lo.s32 %r1077, %r108, %r1067, %r1076; + ld.const.v4.u8 {%rs634, %rs635, %rs636, %rs637}, [matrix+316]; + cvt.u32.u16 %r1078, %rs637; + cvt.s32.s8 %r1079, %r1078; + cvt.u32.u16 %r1080, %rs636; + cvt.s32.s8 %r1081, %r1080; + cvt.u32.u16 %r1082, %rs635; + cvt.s32.s8 %r1083, %r1082; + cvt.u32.u16 %r1084, %rs634; + cvt.s32.s8 %r1085, %r1084; + mad.lo.s32 %r1086, %r111, %r1085, %r1077; + mad.lo.s32 %r1087, %r112, %r1083, %r1086; + mad.lo.s32 %r1088, %r114, %r1081, %r1087; + mad.lo.s32 %r1089, %r115, %r1079, %r1088; + ld.const.v4.u8 {%rs642, %rs643, %rs644, %rs645}, [matrix+320]; + cvt.u32.u16 %r1090, %rs645; + cvt.s32.s8 %r1091, %r1090; + cvt.u32.u16 %r1092, %rs644; + cvt.s32.s8 %r1093, %r1092; + cvt.u32.u16 %r1094, %rs642; + cvt.s32.s8 %r1095, %r1094; + cvt.u32.u16 %r1096, %rs643; + cvt.s32.s8 %r1097, %r1096; + mul.lo.s32 %r1098, %r34, %r1097; + mad.lo.s32 %r1099, %r124, %r1095, %r1098; + mad.lo.s32 %r1100, %r35, %r1093, %r1099; + mad.lo.s32 %r1101, %r36, %r1091, %r1100; + ld.const.v4.u8 {%rs650, %rs651, %rs652, %rs653}, [matrix+324]; + cvt.u32.u16 %r1102, %rs653; + cvt.s32.s8 %r1103, %r1102; + cvt.u32.u16 %r1104, %rs652; + cvt.s32.s8 %r1105, %r1104; + cvt.u32.u16 %r1106, %rs651; + cvt.s32.s8 %r1107, %r1106; + cvt.u32.u16 %r1108, %rs650; + cvt.s32.s8 %r1109, %r1108; + mad.lo.s32 %r1110, %r37, %r1109, %r1101; + mad.lo.s32 %r1111, %r38, %r1107, %r1110; + mad.lo.s32 %r1112, %r39, %r1105, %r1111; + mad.lo.s32 %r1113, %r40, %r1103, %r1112; + ld.const.v4.u8 {%rs658, %rs659, %rs660, %rs661}, [matrix+328]; + cvt.u32.u16 %r1114, %rs661; + cvt.s32.s8 %r1115, %r1114; + cvt.u32.u16 %r1116, %rs660; + cvt.s32.s8 %r1117, %r1116; + cvt.u32.u16 %r1118, %rs659; + cvt.s32.s8 %r1119, %r1118; + cvt.u32.u16 %r1120, %rs658; + cvt.s32.s8 %r1121, %r1120; + mad.lo.s32 %r1122, %r42, %r1121, %r1113; + mad.lo.s32 %r1123, %r43, %r1119, %r1122; + mad.lo.s32 %r1124, %r45, %r1117, %r1123; + mad.lo.s32 %r1125, %r46, %r1115, %r1124; + ld.const.v4.u8 {%rs666, %rs667, %rs668, %rs669}, [matrix+332]; + cvt.u32.u16 %r1126, %rs669; + cvt.s32.s8 %r1127, %r1126; + cvt.u32.u16 %r1128, %rs668; + cvt.s32.s8 %r1129, %r1128; + cvt.u32.u16 %r1130, %rs667; + cvt.s32.s8 %r1131, %r1130; + cvt.u32.u16 %r1132, %rs666; + cvt.s32.s8 %r1133, %r1132; + mad.lo.s32 %r1134, %r48, %r1133, %r1125; + mad.lo.s32 %r1135, %r49, %r1131, %r1134; + mad.lo.s32 %r1136, %r50, %r1129, %r1135; + mad.lo.s32 %r1137, %r51, %r1127, %r1136; + ld.const.v4.u8 {%rs674, %rs675, %rs676, %rs677}, [matrix+336]; + cvt.u32.u16 %r1138, %rs677; + cvt.s32.s8 %r1139, %r1138; + cvt.u32.u16 %r1140, %rs676; + cvt.s32.s8 %r1141, %r1140; + cvt.u32.u16 %r1142, %rs675; + cvt.s32.s8 %r1143, %r1142; + cvt.u32.u16 %r1144, 
%rs674; + cvt.s32.s8 %r1145, %r1144; + mad.lo.s32 %r1146, %r173, %r1145, %r1137; + mad.lo.s32 %r1147, %r53, %r1143, %r1146; + mad.lo.s32 %r1148, %r54, %r1141, %r1147; + mad.lo.s32 %r1149, %r55, %r1139, %r1148; + ld.const.v4.u8 {%rs682, %rs683, %rs684, %rs685}, [matrix+340]; + cvt.u32.u16 %r1150, %rs685; + cvt.s32.s8 %r1151, %r1150; + cvt.u32.u16 %r1152, %rs684; + cvt.s32.s8 %r1153, %r1152; + cvt.u32.u16 %r1154, %rs683; + cvt.s32.s8 %r1155, %r1154; + cvt.u32.u16 %r1156, %rs682; + cvt.s32.s8 %r1157, %r1156; + mad.lo.s32 %r1158, %r56, %r1157, %r1149; + mad.lo.s32 %r1159, %r57, %r1155, %r1158; + mad.lo.s32 %r1160, %r58, %r1153, %r1159; + mad.lo.s32 %r1161, %r59, %r1151, %r1160; + ld.const.v4.u8 {%rs690, %rs691, %rs692, %rs693}, [matrix+344]; + cvt.u32.u16 %r1162, %rs693; + cvt.s32.s8 %r1163, %r1162; + cvt.u32.u16 %r1164, %rs692; + cvt.s32.s8 %r1165, %r1164; + cvt.u32.u16 %r1166, %rs691; + cvt.s32.s8 %r1167, %r1166; + cvt.u32.u16 %r1168, %rs690; + cvt.s32.s8 %r1169, %r1168; + mad.lo.s32 %r1170, %r61, %r1169, %r1161; + mad.lo.s32 %r1171, %r62, %r1167, %r1170; + mad.lo.s32 %r1172, %r64, %r1165, %r1171; + mad.lo.s32 %r1173, %r65, %r1163, %r1172; + ld.const.v4.u8 {%rs698, %rs699, %rs700, %rs701}, [matrix+348]; + cvt.u32.u16 %r1174, %rs701; + cvt.s32.s8 %r1175, %r1174; + cvt.u32.u16 %r1176, %rs700; + cvt.s32.s8 %r1177, %r1176; + cvt.u32.u16 %r1178, %rs699; + cvt.s32.s8 %r1179, %r1178; + cvt.u32.u16 %r1180, %rs698; + cvt.s32.s8 %r1181, %r1180; + mad.lo.s32 %r1182, %r67, %r1181, %r1173; + mad.lo.s32 %r1183, %r68, %r1179, %r1182; + mad.lo.s32 %r1184, %r69, %r1177, %r1183; + mad.lo.s32 %r1185, %r70, %r1175, %r1184; + ld.const.v4.u8 {%rs706, %rs707, %rs708, %rs709}, [matrix+352]; + cvt.u32.u16 %r1186, %rs709; + cvt.s32.s8 %r1187, %r1186; + cvt.u32.u16 %r1188, %rs708; + cvt.s32.s8 %r1189, %r1188; + cvt.u32.u16 %r1190, %rs707; + cvt.s32.s8 %r1191, %r1190; + cvt.u32.u16 %r1192, %rs706; + cvt.s32.s8 %r1193, %r1192; + mad.lo.s32 %r1194, %r222, %r1193, %r1185; + mad.lo.s32 %r1195, %r72, %r1191, %r1194; + mad.lo.s32 %r1196, %r73, %r1189, %r1195; + mad.lo.s32 %r1197, %r74, %r1187, %r1196; + ld.const.v4.u8 {%rs714, %rs715, %rs716, %rs717}, [matrix+356]; + cvt.u32.u16 %r1198, %rs717; + cvt.s32.s8 %r1199, %r1198; + cvt.u32.u16 %r1200, %rs716; + cvt.s32.s8 %r1201, %r1200; + cvt.u32.u16 %r1202, %rs715; + cvt.s32.s8 %r1203, %r1202; + cvt.u32.u16 %r1204, %rs714; + cvt.s32.s8 %r1205, %r1204; + mad.lo.s32 %r1206, %r75, %r1205, %r1197; + mad.lo.s32 %r1207, %r76, %r1203, %r1206; + mad.lo.s32 %r1208, %r77, %r1201, %r1207; + mad.lo.s32 %r1209, %r78, %r1199, %r1208; + ld.const.v4.u8 {%rs722, %rs723, %rs724, %rs725}, [matrix+360]; + cvt.u32.u16 %r1210, %rs725; + cvt.s32.s8 %r1211, %r1210; + cvt.u32.u16 %r1212, %rs724; + cvt.s32.s8 %r1213, %r1212; + cvt.u32.u16 %r1214, %rs723; + cvt.s32.s8 %r1215, %r1214; + cvt.u32.u16 %r1216, %rs722; + cvt.s32.s8 %r1217, %r1216; + mad.lo.s32 %r1218, %r80, %r1217, %r1209; + mad.lo.s32 %r1219, %r81, %r1215, %r1218; + mad.lo.s32 %r1220, %r83, %r1213, %r1219; + mad.lo.s32 %r1221, %r84, %r1211, %r1220; + ld.const.v4.u8 {%rs730, %rs731, %rs732, %rs733}, [matrix+364]; + cvt.u32.u16 %r1222, %rs733; + cvt.s32.s8 %r1223, %r1222; + cvt.u32.u16 %r1224, %rs732; + cvt.s32.s8 %r1225, %r1224; + cvt.u32.u16 %r1226, %rs731; + cvt.s32.s8 %r1227, %r1226; + cvt.u32.u16 %r1228, %rs730; + cvt.s32.s8 %r1229, %r1228; + mad.lo.s32 %r1230, %r86, %r1229, %r1221; + mad.lo.s32 %r1231, %r87, %r1227, %r1230; + mad.lo.s32 %r1232, %r88, %r1225, %r1231; + mad.lo.s32 %r1233, %r89, %r1223, %r1232; + ld.const.v4.u8 {%rs738, 
%rs739, %rs740, %rs741}, [matrix+368]; + cvt.u32.u16 %r1234, %rs741; + cvt.s32.s8 %r1235, %r1234; + cvt.u32.u16 %r1236, %rs740; + cvt.s32.s8 %r1237, %r1236; + cvt.u32.u16 %r1238, %rs739; + cvt.s32.s8 %r1239, %r1238; + cvt.u32.u16 %r1240, %rs738; + cvt.s32.s8 %r1241, %r1240; + mad.lo.s32 %r1242, %r271, %r1241, %r1233; + mad.lo.s32 %r1243, %r91, %r1239, %r1242; + mad.lo.s32 %r1244, %r93, %r1237, %r1243; + mad.lo.s32 %r1245, %r94, %r1235, %r1244; + ld.const.v4.u8 {%rs746, %rs747, %rs748, %rs749}, [matrix+372]; + cvt.u32.u16 %r1246, %rs749; + cvt.s32.s8 %r1247, %r1246; + cvt.u32.u16 %r1248, %rs748; + cvt.s32.s8 %r1249, %r1248; + cvt.u32.u16 %r1250, %rs747; + cvt.s32.s8 %r1251, %r1250; + cvt.u32.u16 %r1252, %rs746; + cvt.s32.s8 %r1253, %r1252; + mad.lo.s32 %r1254, %r96, %r1253, %r1245; + mad.lo.s32 %r1255, %r97, %r1251, %r1254; + mad.lo.s32 %r1256, %r99, %r1249, %r1255; + mad.lo.s32 %r1257, %r100, %r1247, %r1256; + ld.const.v4.u8 {%rs754, %rs755, %rs756, %rs757}, [matrix+376]; + cvt.u32.u16 %r1258, %rs757; + cvt.s32.s8 %r1259, %r1258; + cvt.u32.u16 %r1260, %rs756; + cvt.s32.s8 %r1261, %r1260; + cvt.u32.u16 %r1262, %rs755; + cvt.s32.s8 %r1263, %r1262; + cvt.u32.u16 %r1264, %rs754; + cvt.s32.s8 %r1265, %r1264; + mad.lo.s32 %r1266, %r103, %r1265, %r1257; + mad.lo.s32 %r1267, %r104, %r1263, %r1266; + mad.lo.s32 %r1268, %r107, %r1261, %r1267; + mad.lo.s32 %r1269, %r108, %r1259, %r1268; + ld.const.v4.u8 {%rs762, %rs763, %rs764, %rs765}, [matrix+380]; + cvt.u32.u16 %r1270, %rs765; + cvt.s32.s8 %r1271, %r1270; + cvt.u32.u16 %r1272, %rs764; + cvt.s32.s8 %r1273, %r1272; + cvt.u32.u16 %r1274, %rs763; + cvt.s32.s8 %r1275, %r1274; + cvt.u32.u16 %r1276, %rs762; + cvt.s32.s8 %r1277, %r1276; + mad.lo.s32 %r1278, %r111, %r1277, %r1269; + mad.lo.s32 %r1279, %r112, %r1275, %r1278; + mad.lo.s32 %r1280, %r114, %r1273, %r1279; + mad.lo.s32 %r1281, %r115, %r1271, %r1280; + shr.u32 %r1282, %r1089, 6; + and.b32 %r1283, %r1282, 240; + shr.u32 %r1284, %r1281, 10; + or.b32 %r1285, %r1284, %r1283; + xor.b32 %r1286, %r13, %r1285; + cvt.u64.u32 %rd382, %r1286; + ld.const.v4.u8 {%rs770, %rs771, %rs772, %rs773}, [matrix+384]; + cvt.u32.u16 %r1287, %rs773; + cvt.s32.s8 %r1288, %r1287; + cvt.u32.u16 %r1289, %rs772; + cvt.s32.s8 %r1290, %r1289; + cvt.u32.u16 %r1291, %rs770; + cvt.s32.s8 %r1292, %r1291; + cvt.u32.u16 %r1293, %rs771; + cvt.s32.s8 %r1294, %r1293; + mul.lo.s32 %r1295, %r34, %r1294; + mad.lo.s32 %r1296, %r124, %r1292, %r1295; + mad.lo.s32 %r1297, %r35, %r1290, %r1296; + mad.lo.s32 %r1298, %r36, %r1288, %r1297; + ld.const.v4.u8 {%rs778, %rs779, %rs780, %rs781}, [matrix+388]; + cvt.u32.u16 %r1299, %rs781; + cvt.s32.s8 %r1300, %r1299; + cvt.u32.u16 %r1301, %rs780; + cvt.s32.s8 %r1302, %r1301; + cvt.u32.u16 %r1303, %rs779; + cvt.s32.s8 %r1304, %r1303; + cvt.u32.u16 %r1305, %rs778; + cvt.s32.s8 %r1306, %r1305; + mad.lo.s32 %r1307, %r37, %r1306, %r1298; + mad.lo.s32 %r1308, %r38, %r1304, %r1307; + mad.lo.s32 %r1309, %r39, %r1302, %r1308; + mad.lo.s32 %r1310, %r40, %r1300, %r1309; + ld.const.v4.u8 {%rs786, %rs787, %rs788, %rs789}, [matrix+392]; + cvt.u32.u16 %r1311, %rs789; + cvt.s32.s8 %r1312, %r1311; + cvt.u32.u16 %r1313, %rs788; + cvt.s32.s8 %r1314, %r1313; + cvt.u32.u16 %r1315, %rs787; + cvt.s32.s8 %r1316, %r1315; + cvt.u32.u16 %r1317, %rs786; + cvt.s32.s8 %r1318, %r1317; + mad.lo.s32 %r1319, %r42, %r1318, %r1310; + mad.lo.s32 %r1320, %r43, %r1316, %r1319; + mad.lo.s32 %r1321, %r45, %r1314, %r1320; + mad.lo.s32 %r1322, %r46, %r1312, %r1321; + ld.const.v4.u8 {%rs794, %rs795, %rs796, %rs797}, [matrix+396]; + cvt.u32.u16 
%r1323, %rs797; + cvt.s32.s8 %r1324, %r1323; + cvt.u32.u16 %r1325, %rs796; + cvt.s32.s8 %r1326, %r1325; + cvt.u32.u16 %r1327, %rs795; + cvt.s32.s8 %r1328, %r1327; + cvt.u32.u16 %r1329, %rs794; + cvt.s32.s8 %r1330, %r1329; + mad.lo.s32 %r1331, %r48, %r1330, %r1322; + mad.lo.s32 %r1332, %r49, %r1328, %r1331; + mad.lo.s32 %r1333, %r50, %r1326, %r1332; + mad.lo.s32 %r1334, %r51, %r1324, %r1333; + ld.const.v4.u8 {%rs802, %rs803, %rs804, %rs805}, [matrix+400]; + cvt.u32.u16 %r1335, %rs805; + cvt.s32.s8 %r1336, %r1335; + cvt.u32.u16 %r1337, %rs804; + cvt.s32.s8 %r1338, %r1337; + cvt.u32.u16 %r1339, %rs803; + cvt.s32.s8 %r1340, %r1339; + cvt.u32.u16 %r1341, %rs802; + cvt.s32.s8 %r1342, %r1341; + mad.lo.s32 %r1343, %r173, %r1342, %r1334; + mad.lo.s32 %r1344, %r53, %r1340, %r1343; + mad.lo.s32 %r1345, %r54, %r1338, %r1344; + mad.lo.s32 %r1346, %r55, %r1336, %r1345; + ld.const.v4.u8 {%rs810, %rs811, %rs812, %rs813}, [matrix+404]; + cvt.u32.u16 %r1347, %rs813; + cvt.s32.s8 %r1348, %r1347; + cvt.u32.u16 %r1349, %rs812; + cvt.s32.s8 %r1350, %r1349; + cvt.u32.u16 %r1351, %rs811; + cvt.s32.s8 %r1352, %r1351; + cvt.u32.u16 %r1353, %rs810; + cvt.s32.s8 %r1354, %r1353; + mad.lo.s32 %r1355, %r56, %r1354, %r1346; + mad.lo.s32 %r1356, %r57, %r1352, %r1355; + mad.lo.s32 %r1357, %r58, %r1350, %r1356; + mad.lo.s32 %r1358, %r59, %r1348, %r1357; + ld.const.v4.u8 {%rs818, %rs819, %rs820, %rs821}, [matrix+408]; + cvt.u32.u16 %r1359, %rs821; + cvt.s32.s8 %r1360, %r1359; + cvt.u32.u16 %r1361, %rs820; + cvt.s32.s8 %r1362, %r1361; + cvt.u32.u16 %r1363, %rs819; + cvt.s32.s8 %r1364, %r1363; + cvt.u32.u16 %r1365, %rs818; + cvt.s32.s8 %r1366, %r1365; + mad.lo.s32 %r1367, %r61, %r1366, %r1358; + mad.lo.s32 %r1368, %r62, %r1364, %r1367; + mad.lo.s32 %r1369, %r64, %r1362, %r1368; + mad.lo.s32 %r1370, %r65, %r1360, %r1369; + ld.const.v4.u8 {%rs826, %rs827, %rs828, %rs829}, [matrix+412]; + cvt.u32.u16 %r1371, %rs829; + cvt.s32.s8 %r1372, %r1371; + cvt.u32.u16 %r1373, %rs828; + cvt.s32.s8 %r1374, %r1373; + cvt.u32.u16 %r1375, %rs827; + cvt.s32.s8 %r1376, %r1375; + cvt.u32.u16 %r1377, %rs826; + cvt.s32.s8 %r1378, %r1377; + mad.lo.s32 %r1379, %r67, %r1378, %r1370; + mad.lo.s32 %r1380, %r68, %r1376, %r1379; + mad.lo.s32 %r1381, %r69, %r1374, %r1380; + mad.lo.s32 %r1382, %r70, %r1372, %r1381; + ld.const.v4.u8 {%rs834, %rs835, %rs836, %rs837}, [matrix+416]; + cvt.u32.u16 %r1383, %rs837; + cvt.s32.s8 %r1384, %r1383; + cvt.u32.u16 %r1385, %rs836; + cvt.s32.s8 %r1386, %r1385; + cvt.u32.u16 %r1387, %rs835; + cvt.s32.s8 %r1388, %r1387; + cvt.u32.u16 %r1389, %rs834; + cvt.s32.s8 %r1390, %r1389; + mad.lo.s32 %r1391, %r222, %r1390, %r1382; + mad.lo.s32 %r1392, %r72, %r1388, %r1391; + mad.lo.s32 %r1393, %r73, %r1386, %r1392; + mad.lo.s32 %r1394, %r74, %r1384, %r1393; + ld.const.v4.u8 {%rs842, %rs843, %rs844, %rs845}, [matrix+420]; + cvt.u32.u16 %r1395, %rs845; + cvt.s32.s8 %r1396, %r1395; + cvt.u32.u16 %r1397, %rs844; + cvt.s32.s8 %r1398, %r1397; + cvt.u32.u16 %r1399, %rs843; + cvt.s32.s8 %r1400, %r1399; + cvt.u32.u16 %r1401, %rs842; + cvt.s32.s8 %r1402, %r1401; + mad.lo.s32 %r1403, %r75, %r1402, %r1394; + mad.lo.s32 %r1404, %r76, %r1400, %r1403; + mad.lo.s32 %r1405, %r77, %r1398, %r1404; + mad.lo.s32 %r1406, %r78, %r1396, %r1405; + ld.const.v4.u8 {%rs850, %rs851, %rs852, %rs853}, [matrix+424]; + cvt.u32.u16 %r1407, %rs853; + cvt.s32.s8 %r1408, %r1407; + cvt.u32.u16 %r1409, %rs852; + cvt.s32.s8 %r1410, %r1409; + cvt.u32.u16 %r1411, %rs851; + cvt.s32.s8 %r1412, %r1411; + cvt.u32.u16 %r1413, %rs850; + cvt.s32.s8 %r1414, %r1413; + mad.lo.s32 
%r1415, %r80, %r1414, %r1406;
+ mad.lo.s32 %r1416, %r81, %r1412, %r1415;
+ mad.lo.s32 %r1417, %r83, %r1410, %r1416;
+ mad.lo.s32 %r1418, %r84, %r1408, %r1417;
+ ld.const.v4.u8 {%rs858, %rs859, %rs860, %rs861}, [matrix+428];
+ cvt.u32.u16 %r1419, %rs861;
+ cvt.s32.s8 %r1420, %r1419;
+ cvt.u32.u16 %r1421, %rs860;
+ cvt.s32.s8 %r1422, %r1421;
+ cvt.u32.u16 %r1423, %rs859;
+ cvt.s32.s8 %r1424, %r1423;
+ cvt.u32.u16 %r1425, %rs858;
+ cvt.s32.s8 %r1426, %r1425;
+ mad.lo.s32 %r1427, %r86, %r1426, %r1418;
+ mad.lo.s32 %r1428, %r87, %r1424, %r1427;
+ mad.lo.s32 %r1429, %r88, %r1422, %r1428;
+ mad.lo.s32 %r1430, %r89, %r1420, %r1429;
// ... identical ld.const.v4.u8 / cvt / mad.lo.s32 groups for [matrix+432]..[matrix+444]
// (%rs866-%rs893, %r1431-%r1478) finish the first dot product in %r1478; the second
// restarts at [matrix+448] with mul.lo.s32 %r1487 and runs through [matrix+508]
// (%rs898-%rs1021, %r1479-%r1670), ending in %r1670 ...
+ shr.u32 %r1671, %r1478, 6;
+ and.b32 %r1672, %r1671, 240;
+ shr.u32 %r1673, %r1670, 10;
+ or.b32 %r1674, %r1673, %r1672;
+ xor.b32 %r1675, %r14, %r1674;
+ cvt.u64.u32 %rd383, %r1675;
// ... next output byte: the same pattern over [matrix+512]..[matrix+636] (%rs1026-%rs1277,
// %r1676-%r2059; accumulators restart with mul.lo.s32 at %r1684 and %r1876), leaving the
// two dot products in %r1867 and %r2059 ...
+ shr.u32 %r2060, %r1867, 6;
+ and.b32 %r2061, %r2060, 240;
+ shr.u32 %r2062, %r2059, 10;
+ or.b32 %r2063, %r2062, %r2061;
+ xor.b32 %r2064, %r15, %r2063;
+ cvt.u64.u32 %rd384, %r2064;
// ... [matrix+640]..[matrix+764] (%rs1282-%rs1533, %r2065-%r2448; restarts at %r2073 and
// %r2265), dot products in %r2256 and %r2448 ...
+ shr.u32 %r2449, %r2256, 6;
+ and.b32 %r2450, %r2449, 240;
+ shr.u32 %r2451, %r2448, 10;
+ or.b32 %r2452, %r2451, %r2450;
+ xor.b32 %r2453, %r16, %r2452;
+ cvt.u64.u32 %rd385, %r2453;
// ... [matrix+768]..[matrix+892] (%rs1538-%rs1789, %r2454-%r2837; restarts at %r2462 and
// %r2654), dot products in %r2645 and %r2837 ...
+ shr.u32 %r2838, %r2645, 6;
+ and.b32 %r2839, %r2838, 240;
+ shr.u32 %r2840, %r2837, 10;
+ or.b32 %r2841, %r2840, %r2839;
+ xor.b32 %r2842, %r17, %r2841;
+ cvt.u64.u32 %rd386, %r2842;
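// Editorial note (not ptxas output): every unrolled block in this file follows the
// template above, so a C-like sketch of what each xor/cvt.u64.u32 checkpoint
// (%rd383-%rd386) appears to compute is given here for review. The names M, v, in,
// out and the 64-coefficient row width are assumptions inferred from the register
// pattern, not identifiers present in the source:
//
//   int hi = 0, lo = 0;
//   for (int k = 0; k < 64; ++k) {                   // ld.const.v4.u8 + cvt.s32.s8
//       hi += M[2*i][k]   * v[k];                    // first mad.lo.s32 chain
//       lo += M[2*i+1][k] * v[k];                    // second chain, begun by mul.lo.s32
//   }
//   unsigned packed = (((unsigned)hi >> 6) & 0xF0)   // shr.u32 6; and.b32 240
//                   | ((unsigned)lo >> 10);          // shr.u32 10; or.b32
//   out[i] = in[i] ^ packed;                         // xor.b32 with %r14..%r17, then
//                                                    // widened by cvt.u64.u32
//
// Here v[k] stands for the sign-extended input values the compiler keeps in
// %r34-%r115 (plus %r124, %r173, %r222, %r271), and M for the signed bytes loaded
// from the constant-memory `matrix` array.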
+ ld.const.v4.u8 {%rs1794, %rs1795, %rs1796, %rs1797}, [matrix+896];
+ cvt.u32.u16 %r2843, %rs1797;
+ cvt.s32.s8 %r2844, %r2843;
+ cvt.u32.u16 %r2845, %rs1796;
+ cvt.s32.s8 %r2846, %r2845;
+ cvt.u32.u16 %r2847, %rs1794;
+ cvt.s32.s8 %r2848, %r2847;
+ cvt.u32.u16 %r2849, %rs1795;
+ cvt.s32.s8 %r2850, %r2849;
+ mul.lo.s32 %r2851, %r34, %r2850;
+ mad.lo.s32 %r2852, %r124, %r2848, %r2851;
+ mad.lo.s32 %r2853, %r35, %r2846, %r2852;
+ mad.lo.s32 %r2854, %r36, %r2844, %r2853;
// ... the next byte continues with the same pattern over [matrix+900]..[matrix+1004]
// (%rs1802-%rs2013, %r2855-%r3178); the first dot product ends in %r3034 and the
// second restarts at [matrix+960] with mul.lo.s32 %r3043 ...
+ ld.const.v4.u8 {%rs2018, %rs2019, %rs2020, %rs2021}, [matrix+1008];
+ cvt.u32.u16 %r3179, %rs2021;
+ cvt.s32.s8 %r3180, %r3179;
+ cvt.u32.u16 %r3181, %rs2020;
+ cvt.s32.s8 %r3182, %r3181;
+ cvt.u32.u16 %r3183, %rs2019;
+ cvt.s32.s8 %r3184, %r3183;
+ cvt.u32.u16 %r3185, %rs2018;
+ cvt.s32.s8 %r3186, %r3185;
+ mad.lo.s32 %r3187, %r271, %r3186, %r3178;
+ mad.lo.s32 %r3188, %r91, %r3184, %r3187;
+ mad.lo.s32 %r3189, %r93, %r3182, %r3188;
+ mad.lo.s32 %r3190, %r94,
%r3180, %r3189; + ld.const.v4.u8 {%rs2026, %rs2027, %rs2028, %rs2029}, [matrix+1012]; + cvt.u32.u16 %r3191, %rs2029; + cvt.s32.s8 %r3192, %r3191; + cvt.u32.u16 %r3193, %rs2028; + cvt.s32.s8 %r3194, %r3193; + cvt.u32.u16 %r3195, %rs2027; + cvt.s32.s8 %r3196, %r3195; + cvt.u32.u16 %r3197, %rs2026; + cvt.s32.s8 %r3198, %r3197; + mad.lo.s32 %r3199, %r96, %r3198, %r3190; + mad.lo.s32 %r3200, %r97, %r3196, %r3199; + mad.lo.s32 %r3201, %r99, %r3194, %r3200; + mad.lo.s32 %r3202, %r100, %r3192, %r3201; + ld.const.v4.u8 {%rs2034, %rs2035, %rs2036, %rs2037}, [matrix+1016]; + cvt.u32.u16 %r3203, %rs2037; + cvt.s32.s8 %r3204, %r3203; + cvt.u32.u16 %r3205, %rs2036; + cvt.s32.s8 %r3206, %r3205; + cvt.u32.u16 %r3207, %rs2035; + cvt.s32.s8 %r3208, %r3207; + cvt.u32.u16 %r3209, %rs2034; + cvt.s32.s8 %r3210, %r3209; + mad.lo.s32 %r3211, %r103, %r3210, %r3202; + mad.lo.s32 %r3212, %r104, %r3208, %r3211; + mad.lo.s32 %r3213, %r107, %r3206, %r3212; + mad.lo.s32 %r3214, %r108, %r3204, %r3213; + ld.const.v4.u8 {%rs2042, %rs2043, %rs2044, %rs2045}, [matrix+1020]; + cvt.u32.u16 %r3215, %rs2045; + cvt.s32.s8 %r3216, %r3215; + cvt.u32.u16 %r3217, %rs2044; + cvt.s32.s8 %r3218, %r3217; + cvt.u32.u16 %r3219, %rs2043; + cvt.s32.s8 %r3220, %r3219; + cvt.u32.u16 %r3221, %rs2042; + cvt.s32.s8 %r3222, %r3221; + mad.lo.s32 %r3223, %r111, %r3222, %r3214; + mad.lo.s32 %r3224, %r112, %r3220, %r3223; + mad.lo.s32 %r3225, %r114, %r3218, %r3224; + mad.lo.s32 %r3226, %r115, %r3216, %r3225; + shr.u32 %r3227, %r3034, 6; + and.b32 %r3228, %r3227, 240; + shr.u32 %r3229, %r3226, 10; + or.b32 %r3230, %r3229, %r3228; + xor.b32 %r3231, %r18, %r3230; + ld.const.v4.u8 {%rs2050, %rs2051, %rs2052, %rs2053}, [matrix+1024]; + cvt.u32.u16 %r3232, %rs2053; + cvt.s32.s8 %r3233, %r3232; + cvt.u32.u16 %r3234, %rs2052; + cvt.s32.s8 %r3235, %r3234; + cvt.u32.u16 %r3236, %rs2050; + cvt.s32.s8 %r3237, %r3236; + cvt.u32.u16 %r3238, %rs2051; + cvt.s32.s8 %r3239, %r3238; + mul.lo.s32 %r3240, %r34, %r3239; + mad.lo.s32 %r3241, %r124, %r3237, %r3240; + mad.lo.s32 %r3242, %r35, %r3235, %r3241; + mad.lo.s32 %r3243, %r36, %r3233, %r3242; + ld.const.v4.u8 {%rs2058, %rs2059, %rs2060, %rs2061}, [matrix+1028]; + cvt.u32.u16 %r3244, %rs2061; + cvt.s32.s8 %r3245, %r3244; + cvt.u32.u16 %r3246, %rs2060; + cvt.s32.s8 %r3247, %r3246; + cvt.u32.u16 %r3248, %rs2059; + cvt.s32.s8 %r3249, %r3248; + cvt.u32.u16 %r3250, %rs2058; + cvt.s32.s8 %r3251, %r3250; + mad.lo.s32 %r3252, %r37, %r3251, %r3243; + mad.lo.s32 %r3253, %r38, %r3249, %r3252; + mad.lo.s32 %r3254, %r39, %r3247, %r3253; + mad.lo.s32 %r3255, %r40, %r3245, %r3254; + ld.const.v4.u8 {%rs2066, %rs2067, %rs2068, %rs2069}, [matrix+1032]; + cvt.u32.u16 %r3256, %rs2069; + cvt.s32.s8 %r3257, %r3256; + cvt.u32.u16 %r3258, %rs2068; + cvt.s32.s8 %r3259, %r3258; + cvt.u32.u16 %r3260, %rs2067; + cvt.s32.s8 %r3261, %r3260; + cvt.u32.u16 %r3262, %rs2066; + cvt.s32.s8 %r3263, %r3262; + mad.lo.s32 %r3264, %r42, %r3263, %r3255; + mad.lo.s32 %r3265, %r43, %r3261, %r3264; + mad.lo.s32 %r3266, %r45, %r3259, %r3265; + mad.lo.s32 %r3267, %r46, %r3257, %r3266; + ld.const.v4.u8 {%rs2074, %rs2075, %rs2076, %rs2077}, [matrix+1036]; + cvt.u32.u16 %r3268, %rs2077; + cvt.s32.s8 %r3269, %r3268; + cvt.u32.u16 %r3270, %rs2076; + cvt.s32.s8 %r3271, %r3270; + cvt.u32.u16 %r3272, %rs2075; + cvt.s32.s8 %r3273, %r3272; + cvt.u32.u16 %r3274, %rs2074; + cvt.s32.s8 %r3275, %r3274; + mad.lo.s32 %r3276, %r48, %r3275, %r3267; + mad.lo.s32 %r3277, %r49, %r3273, %r3276; + mad.lo.s32 %r3278, %r50, %r3271, %r3277; + mad.lo.s32 %r3279, %r51, %r3269, %r3278; + 
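+ // Note: this appears to be an unrolled signed 8-bit matrix-vector product.
+ // Each ld.const.v4.u8 pulls four bytes of the constant `matrix`, every
+ // cvt.u32.u16 / cvt.s32.s8 pair sign-extends one byte to 32 bits, and the
+ // four mad.lo.s32 that follow accumulate byte-times-input products (inputs
+ // cached in %r34-%r115, %r124, %r173, %r222, %r271) into a running dot product.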
ld.const.v4.u8 {%rs2082, %rs2083, %rs2084, %rs2085}, [matrix+1040]; + cvt.u32.u16 %r3280, %rs2085; + cvt.s32.s8 %r3281, %r3280; + cvt.u32.u16 %r3282, %rs2084; + cvt.s32.s8 %r3283, %r3282; + cvt.u32.u16 %r3284, %rs2083; + cvt.s32.s8 %r3285, %r3284; + cvt.u32.u16 %r3286, %rs2082; + cvt.s32.s8 %r3287, %r3286; + mad.lo.s32 %r3288, %r173, %r3287, %r3279; + mad.lo.s32 %r3289, %r53, %r3285, %r3288; + mad.lo.s32 %r3290, %r54, %r3283, %r3289; + mad.lo.s32 %r3291, %r55, %r3281, %r3290; + ld.const.v4.u8 {%rs2090, %rs2091, %rs2092, %rs2093}, [matrix+1044]; + cvt.u32.u16 %r3292, %rs2093; + cvt.s32.s8 %r3293, %r3292; + cvt.u32.u16 %r3294, %rs2092; + cvt.s32.s8 %r3295, %r3294; + cvt.u32.u16 %r3296, %rs2091; + cvt.s32.s8 %r3297, %r3296; + cvt.u32.u16 %r3298, %rs2090; + cvt.s32.s8 %r3299, %r3298; + mad.lo.s32 %r3300, %r56, %r3299, %r3291; + mad.lo.s32 %r3301, %r57, %r3297, %r3300; + mad.lo.s32 %r3302, %r58, %r3295, %r3301; + mad.lo.s32 %r3303, %r59, %r3293, %r3302; + ld.const.v4.u8 {%rs2098, %rs2099, %rs2100, %rs2101}, [matrix+1048]; + cvt.u32.u16 %r3304, %rs2101; + cvt.s32.s8 %r3305, %r3304; + cvt.u32.u16 %r3306, %rs2100; + cvt.s32.s8 %r3307, %r3306; + cvt.u32.u16 %r3308, %rs2099; + cvt.s32.s8 %r3309, %r3308; + cvt.u32.u16 %r3310, %rs2098; + cvt.s32.s8 %r3311, %r3310; + mad.lo.s32 %r3312, %r61, %r3311, %r3303; + mad.lo.s32 %r3313, %r62, %r3309, %r3312; + mad.lo.s32 %r3314, %r64, %r3307, %r3313; + mad.lo.s32 %r3315, %r65, %r3305, %r3314; + ld.const.v4.u8 {%rs2106, %rs2107, %rs2108, %rs2109}, [matrix+1052]; + cvt.u32.u16 %r3316, %rs2109; + cvt.s32.s8 %r3317, %r3316; + cvt.u32.u16 %r3318, %rs2108; + cvt.s32.s8 %r3319, %r3318; + cvt.u32.u16 %r3320, %rs2107; + cvt.s32.s8 %r3321, %r3320; + cvt.u32.u16 %r3322, %rs2106; + cvt.s32.s8 %r3323, %r3322; + mad.lo.s32 %r3324, %r67, %r3323, %r3315; + mad.lo.s32 %r3325, %r68, %r3321, %r3324; + mad.lo.s32 %r3326, %r69, %r3319, %r3325; + mad.lo.s32 %r3327, %r70, %r3317, %r3326; + ld.const.v4.u8 {%rs2114, %rs2115, %rs2116, %rs2117}, [matrix+1056]; + cvt.u32.u16 %r3328, %rs2117; + cvt.s32.s8 %r3329, %r3328; + cvt.u32.u16 %r3330, %rs2116; + cvt.s32.s8 %r3331, %r3330; + cvt.u32.u16 %r3332, %rs2115; + cvt.s32.s8 %r3333, %r3332; + cvt.u32.u16 %r3334, %rs2114; + cvt.s32.s8 %r3335, %r3334; + mad.lo.s32 %r3336, %r222, %r3335, %r3327; + mad.lo.s32 %r3337, %r72, %r3333, %r3336; + mad.lo.s32 %r3338, %r73, %r3331, %r3337; + mad.lo.s32 %r3339, %r74, %r3329, %r3338; + ld.const.v4.u8 {%rs2122, %rs2123, %rs2124, %rs2125}, [matrix+1060]; + cvt.u32.u16 %r3340, %rs2125; + cvt.s32.s8 %r3341, %r3340; + cvt.u32.u16 %r3342, %rs2124; + cvt.s32.s8 %r3343, %r3342; + cvt.u32.u16 %r3344, %rs2123; + cvt.s32.s8 %r3345, %r3344; + cvt.u32.u16 %r3346, %rs2122; + cvt.s32.s8 %r3347, %r3346; + mad.lo.s32 %r3348, %r75, %r3347, %r3339; + mad.lo.s32 %r3349, %r76, %r3345, %r3348; + mad.lo.s32 %r3350, %r77, %r3343, %r3349; + mad.lo.s32 %r3351, %r78, %r3341, %r3350; + ld.const.v4.u8 {%rs2130, %rs2131, %rs2132, %rs2133}, [matrix+1064]; + cvt.u32.u16 %r3352, %rs2133; + cvt.s32.s8 %r3353, %r3352; + cvt.u32.u16 %r3354, %rs2132; + cvt.s32.s8 %r3355, %r3354; + cvt.u32.u16 %r3356, %rs2131; + cvt.s32.s8 %r3357, %r3356; + cvt.u32.u16 %r3358, %rs2130; + cvt.s32.s8 %r3359, %r3358; + mad.lo.s32 %r3360, %r80, %r3359, %r3351; + mad.lo.s32 %r3361, %r81, %r3357, %r3360; + mad.lo.s32 %r3362, %r83, %r3355, %r3361; + mad.lo.s32 %r3363, %r84, %r3353, %r3362; + ld.const.v4.u8 {%rs2138, %rs2139, %rs2140, %rs2141}, [matrix+1068]; + cvt.u32.u16 %r3364, %rs2141; + cvt.s32.s8 %r3365, %r3364; + cvt.u32.u16 %r3366, %rs2140; + cvt.s32.s8 
%r3367, %r3366; + cvt.u32.u16 %r3368, %rs2139; + cvt.s32.s8 %r3369, %r3368; + cvt.u32.u16 %r3370, %rs2138; + cvt.s32.s8 %r3371, %r3370; + mad.lo.s32 %r3372, %r86, %r3371, %r3363; + mad.lo.s32 %r3373, %r87, %r3369, %r3372; + mad.lo.s32 %r3374, %r88, %r3367, %r3373; + mad.lo.s32 %r3375, %r89, %r3365, %r3374; + ld.const.v4.u8 {%rs2146, %rs2147, %rs2148, %rs2149}, [matrix+1072]; + cvt.u32.u16 %r3376, %rs2149; + cvt.s32.s8 %r3377, %r3376; + cvt.u32.u16 %r3378, %rs2148; + cvt.s32.s8 %r3379, %r3378; + cvt.u32.u16 %r3380, %rs2147; + cvt.s32.s8 %r3381, %r3380; + cvt.u32.u16 %r3382, %rs2146; + cvt.s32.s8 %r3383, %r3382; + mad.lo.s32 %r3384, %r271, %r3383, %r3375; + mad.lo.s32 %r3385, %r91, %r3381, %r3384; + mad.lo.s32 %r3386, %r93, %r3379, %r3385; + mad.lo.s32 %r3387, %r94, %r3377, %r3386; + ld.const.v4.u8 {%rs2154, %rs2155, %rs2156, %rs2157}, [matrix+1076]; + cvt.u32.u16 %r3388, %rs2157; + cvt.s32.s8 %r3389, %r3388; + cvt.u32.u16 %r3390, %rs2156; + cvt.s32.s8 %r3391, %r3390; + cvt.u32.u16 %r3392, %rs2155; + cvt.s32.s8 %r3393, %r3392; + cvt.u32.u16 %r3394, %rs2154; + cvt.s32.s8 %r3395, %r3394; + mad.lo.s32 %r3396, %r96, %r3395, %r3387; + mad.lo.s32 %r3397, %r97, %r3393, %r3396; + mad.lo.s32 %r3398, %r99, %r3391, %r3397; + mad.lo.s32 %r3399, %r100, %r3389, %r3398; + ld.const.v4.u8 {%rs2162, %rs2163, %rs2164, %rs2165}, [matrix+1080]; + cvt.u32.u16 %r3400, %rs2165; + cvt.s32.s8 %r3401, %r3400; + cvt.u32.u16 %r3402, %rs2164; + cvt.s32.s8 %r3403, %r3402; + cvt.u32.u16 %r3404, %rs2163; + cvt.s32.s8 %r3405, %r3404; + cvt.u32.u16 %r3406, %rs2162; + cvt.s32.s8 %r3407, %r3406; + mad.lo.s32 %r3408, %r103, %r3407, %r3399; + mad.lo.s32 %r3409, %r104, %r3405, %r3408; + mad.lo.s32 %r3410, %r107, %r3403, %r3409; + mad.lo.s32 %r3411, %r108, %r3401, %r3410; + ld.const.v4.u8 {%rs2170, %rs2171, %rs2172, %rs2173}, [matrix+1084]; + cvt.u32.u16 %r3412, %rs2173; + cvt.s32.s8 %r3413, %r3412; + cvt.u32.u16 %r3414, %rs2172; + cvt.s32.s8 %r3415, %r3414; + cvt.u32.u16 %r3416, %rs2171; + cvt.s32.s8 %r3417, %r3416; + cvt.u32.u16 %r3418, %rs2170; + cvt.s32.s8 %r3419, %r3418; + mad.lo.s32 %r3420, %r111, %r3419, %r3411; + mad.lo.s32 %r3421, %r112, %r3417, %r3420; + mad.lo.s32 %r3422, %r114, %r3415, %r3421; + mad.lo.s32 %r3423, %r115, %r3413, %r3422; + ld.const.v4.u8 {%rs2178, %rs2179, %rs2180, %rs2181}, [matrix+1088]; + cvt.u32.u16 %r3424, %rs2181; + cvt.s32.s8 %r3425, %r3424; + cvt.u32.u16 %r3426, %rs2180; + cvt.s32.s8 %r3427, %r3426; + cvt.u32.u16 %r3428, %rs2178; + cvt.s32.s8 %r3429, %r3428; + cvt.u32.u16 %r3430, %rs2179; + cvt.s32.s8 %r3431, %r3430; + mul.lo.s32 %r3432, %r34, %r3431; + mad.lo.s32 %r3433, %r124, %r3429, %r3432; + mad.lo.s32 %r3434, %r35, %r3427, %r3433; + mad.lo.s32 %r3435, %r36, %r3425, %r3434; + ld.const.v4.u8 {%rs2186, %rs2187, %rs2188, %rs2189}, [matrix+1092]; + cvt.u32.u16 %r3436, %rs2189; + cvt.s32.s8 %r3437, %r3436; + cvt.u32.u16 %r3438, %rs2188; + cvt.s32.s8 %r3439, %r3438; + cvt.u32.u16 %r3440, %rs2187; + cvt.s32.s8 %r3441, %r3440; + cvt.u32.u16 %r3442, %rs2186; + cvt.s32.s8 %r3443, %r3442; + mad.lo.s32 %r3444, %r37, %r3443, %r3435; + mad.lo.s32 %r3445, %r38, %r3441, %r3444; + mad.lo.s32 %r3446, %r39, %r3439, %r3445; + mad.lo.s32 %r3447, %r40, %r3437, %r3446; + ld.const.v4.u8 {%rs2194, %rs2195, %rs2196, %rs2197}, [matrix+1096]; + cvt.u32.u16 %r3448, %rs2197; + cvt.s32.s8 %r3449, %r3448; + cvt.u32.u16 %r3450, %rs2196; + cvt.s32.s8 %r3451, %r3450; + cvt.u32.u16 %r3452, %rs2195; + cvt.s32.s8 %r3453, %r3452; + cvt.u32.u16 %r3454, %rs2194; + cvt.s32.s8 %r3455, %r3454; + mad.lo.s32 %r3456, %r42, %r3455, 
%r3447; + mad.lo.s32 %r3457, %r43, %r3453, %r3456; + mad.lo.s32 %r3458, %r45, %r3451, %r3457; + mad.lo.s32 %r3459, %r46, %r3449, %r3458; + ld.const.v4.u8 {%rs2202, %rs2203, %rs2204, %rs2205}, [matrix+1100]; + cvt.u32.u16 %r3460, %rs2205; + cvt.s32.s8 %r3461, %r3460; + cvt.u32.u16 %r3462, %rs2204; + cvt.s32.s8 %r3463, %r3462; + cvt.u32.u16 %r3464, %rs2203; + cvt.s32.s8 %r3465, %r3464; + cvt.u32.u16 %r3466, %rs2202; + cvt.s32.s8 %r3467, %r3466; + mad.lo.s32 %r3468, %r48, %r3467, %r3459; + mad.lo.s32 %r3469, %r49, %r3465, %r3468; + mad.lo.s32 %r3470, %r50, %r3463, %r3469; + mad.lo.s32 %r3471, %r51, %r3461, %r3470; + ld.const.v4.u8 {%rs2210, %rs2211, %rs2212, %rs2213}, [matrix+1104]; + cvt.u32.u16 %r3472, %rs2213; + cvt.s32.s8 %r3473, %r3472; + cvt.u32.u16 %r3474, %rs2212; + cvt.s32.s8 %r3475, %r3474; + cvt.u32.u16 %r3476, %rs2211; + cvt.s32.s8 %r3477, %r3476; + cvt.u32.u16 %r3478, %rs2210; + cvt.s32.s8 %r3479, %r3478; + mad.lo.s32 %r3480, %r173, %r3479, %r3471; + mad.lo.s32 %r3481, %r53, %r3477, %r3480; + mad.lo.s32 %r3482, %r54, %r3475, %r3481; + mad.lo.s32 %r3483, %r55, %r3473, %r3482; + ld.const.v4.u8 {%rs2218, %rs2219, %rs2220, %rs2221}, [matrix+1108]; + cvt.u32.u16 %r3484, %rs2221; + cvt.s32.s8 %r3485, %r3484; + cvt.u32.u16 %r3486, %rs2220; + cvt.s32.s8 %r3487, %r3486; + cvt.u32.u16 %r3488, %rs2219; + cvt.s32.s8 %r3489, %r3488; + cvt.u32.u16 %r3490, %rs2218; + cvt.s32.s8 %r3491, %r3490; + mad.lo.s32 %r3492, %r56, %r3491, %r3483; + mad.lo.s32 %r3493, %r57, %r3489, %r3492; + mad.lo.s32 %r3494, %r58, %r3487, %r3493; + mad.lo.s32 %r3495, %r59, %r3485, %r3494; + ld.const.v4.u8 {%rs2226, %rs2227, %rs2228, %rs2229}, [matrix+1112]; + cvt.u32.u16 %r3496, %rs2229; + cvt.s32.s8 %r3497, %r3496; + cvt.u32.u16 %r3498, %rs2228; + cvt.s32.s8 %r3499, %r3498; + cvt.u32.u16 %r3500, %rs2227; + cvt.s32.s8 %r3501, %r3500; + cvt.u32.u16 %r3502, %rs2226; + cvt.s32.s8 %r3503, %r3502; + mad.lo.s32 %r3504, %r61, %r3503, %r3495; + mad.lo.s32 %r3505, %r62, %r3501, %r3504; + mad.lo.s32 %r3506, %r64, %r3499, %r3505; + mad.lo.s32 %r3507, %r65, %r3497, %r3506; + ld.const.v4.u8 {%rs2234, %rs2235, %rs2236, %rs2237}, [matrix+1116]; + cvt.u32.u16 %r3508, %rs2237; + cvt.s32.s8 %r3509, %r3508; + cvt.u32.u16 %r3510, %rs2236; + cvt.s32.s8 %r3511, %r3510; + cvt.u32.u16 %r3512, %rs2235; + cvt.s32.s8 %r3513, %r3512; + cvt.u32.u16 %r3514, %rs2234; + cvt.s32.s8 %r3515, %r3514; + mad.lo.s32 %r3516, %r67, %r3515, %r3507; + mad.lo.s32 %r3517, %r68, %r3513, %r3516; + mad.lo.s32 %r3518, %r69, %r3511, %r3517; + mad.lo.s32 %r3519, %r70, %r3509, %r3518; + ld.const.v4.u8 {%rs2242, %rs2243, %rs2244, %rs2245}, [matrix+1120]; + cvt.u32.u16 %r3520, %rs2245; + cvt.s32.s8 %r3521, %r3520; + cvt.u32.u16 %r3522, %rs2244; + cvt.s32.s8 %r3523, %r3522; + cvt.u32.u16 %r3524, %rs2243; + cvt.s32.s8 %r3525, %r3524; + cvt.u32.u16 %r3526, %rs2242; + cvt.s32.s8 %r3527, %r3526; + mad.lo.s32 %r3528, %r222, %r3527, %r3519; + mad.lo.s32 %r3529, %r72, %r3525, %r3528; + mad.lo.s32 %r3530, %r73, %r3523, %r3529; + mad.lo.s32 %r3531, %r74, %r3521, %r3530; + ld.const.v4.u8 {%rs2250, %rs2251, %rs2252, %rs2253}, [matrix+1124]; + cvt.u32.u16 %r3532, %rs2253; + cvt.s32.s8 %r3533, %r3532; + cvt.u32.u16 %r3534, %rs2252; + cvt.s32.s8 %r3535, %r3534; + cvt.u32.u16 %r3536, %rs2251; + cvt.s32.s8 %r3537, %r3536; + cvt.u32.u16 %r3538, %rs2250; + cvt.s32.s8 %r3539, %r3538; + mad.lo.s32 %r3540, %r75, %r3539, %r3531; + mad.lo.s32 %r3541, %r76, %r3537, %r3540; + mad.lo.s32 %r3542, %r77, %r3535, %r3541; + mad.lo.s32 %r3543, %r78, %r3533, %r3542; + ld.const.v4.u8 {%rs2258, %rs2259, 
%rs2260, %rs2261}, [matrix+1128]; + cvt.u32.u16 %r3544, %rs2261; + cvt.s32.s8 %r3545, %r3544; + cvt.u32.u16 %r3546, %rs2260; + cvt.s32.s8 %r3547, %r3546; + cvt.u32.u16 %r3548, %rs2259; + cvt.s32.s8 %r3549, %r3548; + cvt.u32.u16 %r3550, %rs2258; + cvt.s32.s8 %r3551, %r3550; + mad.lo.s32 %r3552, %r80, %r3551, %r3543; + mad.lo.s32 %r3553, %r81, %r3549, %r3552; + mad.lo.s32 %r3554, %r83, %r3547, %r3553; + mad.lo.s32 %r3555, %r84, %r3545, %r3554; + ld.const.v4.u8 {%rs2266, %rs2267, %rs2268, %rs2269}, [matrix+1132]; + cvt.u32.u16 %r3556, %rs2269; + cvt.s32.s8 %r3557, %r3556; + cvt.u32.u16 %r3558, %rs2268; + cvt.s32.s8 %r3559, %r3558; + cvt.u32.u16 %r3560, %rs2267; + cvt.s32.s8 %r3561, %r3560; + cvt.u32.u16 %r3562, %rs2266; + cvt.s32.s8 %r3563, %r3562; + mad.lo.s32 %r3564, %r86, %r3563, %r3555; + mad.lo.s32 %r3565, %r87, %r3561, %r3564; + mad.lo.s32 %r3566, %r88, %r3559, %r3565; + mad.lo.s32 %r3567, %r89, %r3557, %r3566; + ld.const.v4.u8 {%rs2274, %rs2275, %rs2276, %rs2277}, [matrix+1136]; + cvt.u32.u16 %r3568, %rs2277; + cvt.s32.s8 %r3569, %r3568; + cvt.u32.u16 %r3570, %rs2276; + cvt.s32.s8 %r3571, %r3570; + cvt.u32.u16 %r3572, %rs2275; + cvt.s32.s8 %r3573, %r3572; + cvt.u32.u16 %r3574, %rs2274; + cvt.s32.s8 %r3575, %r3574; + mad.lo.s32 %r3576, %r271, %r3575, %r3567; + mad.lo.s32 %r3577, %r91, %r3573, %r3576; + mad.lo.s32 %r3578, %r93, %r3571, %r3577; + mad.lo.s32 %r3579, %r94, %r3569, %r3578; + ld.const.v4.u8 {%rs2282, %rs2283, %rs2284, %rs2285}, [matrix+1140]; + cvt.u32.u16 %r3580, %rs2285; + cvt.s32.s8 %r3581, %r3580; + cvt.u32.u16 %r3582, %rs2284; + cvt.s32.s8 %r3583, %r3582; + cvt.u32.u16 %r3584, %rs2283; + cvt.s32.s8 %r3585, %r3584; + cvt.u32.u16 %r3586, %rs2282; + cvt.s32.s8 %r3587, %r3586; + mad.lo.s32 %r3588, %r96, %r3587, %r3579; + mad.lo.s32 %r3589, %r97, %r3585, %r3588; + mad.lo.s32 %r3590, %r99, %r3583, %r3589; + mad.lo.s32 %r3591, %r100, %r3581, %r3590; + ld.const.v4.u8 {%rs2290, %rs2291, %rs2292, %rs2293}, [matrix+1144]; + cvt.u32.u16 %r3592, %rs2293; + cvt.s32.s8 %r3593, %r3592; + cvt.u32.u16 %r3594, %rs2292; + cvt.s32.s8 %r3595, %r3594; + cvt.u32.u16 %r3596, %rs2291; + cvt.s32.s8 %r3597, %r3596; + cvt.u32.u16 %r3598, %rs2290; + cvt.s32.s8 %r3599, %r3598; + mad.lo.s32 %r3600, %r103, %r3599, %r3591; + mad.lo.s32 %r3601, %r104, %r3597, %r3600; + mad.lo.s32 %r3602, %r107, %r3595, %r3601; + mad.lo.s32 %r3603, %r108, %r3593, %r3602; + ld.const.v4.u8 {%rs2298, %rs2299, %rs2300, %rs2301}, [matrix+1148]; + cvt.u32.u16 %r3604, %rs2301; + cvt.s32.s8 %r3605, %r3604; + cvt.u32.u16 %r3606, %rs2300; + cvt.s32.s8 %r3607, %r3606; + cvt.u32.u16 %r3608, %rs2299; + cvt.s32.s8 %r3609, %r3608; + cvt.u32.u16 %r3610, %rs2298; + cvt.s32.s8 %r3611, %r3610; + mad.lo.s32 %r3612, %r111, %r3611, %r3603; + mad.lo.s32 %r3613, %r112, %r3609, %r3612; + mad.lo.s32 %r3614, %r114, %r3607, %r3613; + mad.lo.s32 %r3615, %r115, %r3605, %r3614; + shr.u32 %r3616, %r3423, 6; + and.b32 %r3617, %r3616, 240; + shr.u32 %r3618, %r3615, 10; + or.b32 %r3619, %r3618, %r3617; + xor.b32 %r3620, %r52, %r3619; + cvt.u64.u32 %rd387, %r3620; + ld.const.v4.u8 {%rs2306, %rs2307, %rs2308, %rs2309}, [matrix+1152]; + cvt.u32.u16 %r3621, %rs2309; + cvt.s32.s8 %r3622, %r3621; + cvt.u32.u16 %r3623, %rs2308; + cvt.s32.s8 %r3624, %r3623; + cvt.u32.u16 %r3625, %rs2306; + cvt.s32.s8 %r3626, %r3625; + cvt.u32.u16 %r3627, %rs2307; + cvt.s32.s8 %r3628, %r3627; + mul.lo.s32 %r3629, %r34, %r3628; + mad.lo.s32 %r3630, %r124, %r3626, %r3629; + mad.lo.s32 %r3631, %r35, %r3624, %r3630; + mad.lo.s32 %r3632, %r36, %r3622, %r3631; + ld.const.v4.u8 {%rs2314, 
%rs2315, %rs2316, %rs2317}, [matrix+1156]; + cvt.u32.u16 %r3633, %rs2317; + cvt.s32.s8 %r3634, %r3633; + cvt.u32.u16 %r3635, %rs2316; + cvt.s32.s8 %r3636, %r3635; + cvt.u32.u16 %r3637, %rs2315; + cvt.s32.s8 %r3638, %r3637; + cvt.u32.u16 %r3639, %rs2314; + cvt.s32.s8 %r3640, %r3639; + mad.lo.s32 %r3641, %r37, %r3640, %r3632; + mad.lo.s32 %r3642, %r38, %r3638, %r3641; + mad.lo.s32 %r3643, %r39, %r3636, %r3642; + mad.lo.s32 %r3644, %r40, %r3634, %r3643; + ld.const.v4.u8 {%rs2322, %rs2323, %rs2324, %rs2325}, [matrix+1160]; + cvt.u32.u16 %r3645, %rs2325; + cvt.s32.s8 %r3646, %r3645; + cvt.u32.u16 %r3647, %rs2324; + cvt.s32.s8 %r3648, %r3647; + cvt.u32.u16 %r3649, %rs2323; + cvt.s32.s8 %r3650, %r3649; + cvt.u32.u16 %r3651, %rs2322; + cvt.s32.s8 %r3652, %r3651; + mad.lo.s32 %r3653, %r42, %r3652, %r3644; + mad.lo.s32 %r3654, %r43, %r3650, %r3653; + mad.lo.s32 %r3655, %r45, %r3648, %r3654; + mad.lo.s32 %r3656, %r46, %r3646, %r3655; + ld.const.v4.u8 {%rs2330, %rs2331, %rs2332, %rs2333}, [matrix+1164]; + cvt.u32.u16 %r3657, %rs2333; + cvt.s32.s8 %r3658, %r3657; + cvt.u32.u16 %r3659, %rs2332; + cvt.s32.s8 %r3660, %r3659; + cvt.u32.u16 %r3661, %rs2331; + cvt.s32.s8 %r3662, %r3661; + cvt.u32.u16 %r3663, %rs2330; + cvt.s32.s8 %r3664, %r3663; + mad.lo.s32 %r3665, %r48, %r3664, %r3656; + mad.lo.s32 %r3666, %r49, %r3662, %r3665; + mad.lo.s32 %r3667, %r50, %r3660, %r3666; + mad.lo.s32 %r3668, %r51, %r3658, %r3667; + ld.const.v4.u8 {%rs2338, %rs2339, %rs2340, %rs2341}, [matrix+1168]; + cvt.u32.u16 %r3669, %rs2341; + cvt.s32.s8 %r3670, %r3669; + cvt.u32.u16 %r3671, %rs2340; + cvt.s32.s8 %r3672, %r3671; + cvt.u32.u16 %r3673, %rs2339; + cvt.s32.s8 %r3674, %r3673; + cvt.u32.u16 %r3675, %rs2338; + cvt.s32.s8 %r3676, %r3675; + mad.lo.s32 %r3677, %r173, %r3676, %r3668; + mad.lo.s32 %r3678, %r53, %r3674, %r3677; + mad.lo.s32 %r3679, %r54, %r3672, %r3678; + mad.lo.s32 %r3680, %r55, %r3670, %r3679; + ld.const.v4.u8 {%rs2346, %rs2347, %rs2348, %rs2349}, [matrix+1172]; + cvt.u32.u16 %r3681, %rs2349; + cvt.s32.s8 %r3682, %r3681; + cvt.u32.u16 %r3683, %rs2348; + cvt.s32.s8 %r3684, %r3683; + cvt.u32.u16 %r3685, %rs2347; + cvt.s32.s8 %r3686, %r3685; + cvt.u32.u16 %r3687, %rs2346; + cvt.s32.s8 %r3688, %r3687; + mad.lo.s32 %r3689, %r56, %r3688, %r3680; + mad.lo.s32 %r3690, %r57, %r3686, %r3689; + mad.lo.s32 %r3691, %r58, %r3684, %r3690; + mad.lo.s32 %r3692, %r59, %r3682, %r3691; + ld.const.v4.u8 {%rs2354, %rs2355, %rs2356, %rs2357}, [matrix+1176]; + cvt.u32.u16 %r3693, %rs2357; + cvt.s32.s8 %r3694, %r3693; + cvt.u32.u16 %r3695, %rs2356; + cvt.s32.s8 %r3696, %r3695; + cvt.u32.u16 %r3697, %rs2355; + cvt.s32.s8 %r3698, %r3697; + cvt.u32.u16 %r3699, %rs2354; + cvt.s32.s8 %r3700, %r3699; + mad.lo.s32 %r3701, %r61, %r3700, %r3692; + mad.lo.s32 %r3702, %r62, %r3698, %r3701; + mad.lo.s32 %r3703, %r64, %r3696, %r3702; + mad.lo.s32 %r3704, %r65, %r3694, %r3703; + ld.const.v4.u8 {%rs2362, %rs2363, %rs2364, %rs2365}, [matrix+1180]; + cvt.u32.u16 %r3705, %rs2365; + cvt.s32.s8 %r3706, %r3705; + cvt.u32.u16 %r3707, %rs2364; + cvt.s32.s8 %r3708, %r3707; + cvt.u32.u16 %r3709, %rs2363; + cvt.s32.s8 %r3710, %r3709; + cvt.u32.u16 %r3711, %rs2362; + cvt.s32.s8 %r3712, %r3711; + mad.lo.s32 %r3713, %r67, %r3712, %r3704; + mad.lo.s32 %r3714, %r68, %r3710, %r3713; + mad.lo.s32 %r3715, %r69, %r3708, %r3714; + mad.lo.s32 %r3716, %r70, %r3706, %r3715; + ld.const.v4.u8 {%rs2370, %rs2371, %rs2372, %rs2373}, [matrix+1184]; + cvt.u32.u16 %r3717, %rs2373; + cvt.s32.s8 %r3718, %r3717; + cvt.u32.u16 %r3719, %rs2372; + cvt.s32.s8 %r3720, %r3719; + cvt.u32.u16 
%r3721, %rs2371; + cvt.s32.s8 %r3722, %r3721; + cvt.u32.u16 %r3723, %rs2370; + cvt.s32.s8 %r3724, %r3723; + mad.lo.s32 %r3725, %r222, %r3724, %r3716; + mad.lo.s32 %r3726, %r72, %r3722, %r3725; + mad.lo.s32 %r3727, %r73, %r3720, %r3726; + mad.lo.s32 %r3728, %r74, %r3718, %r3727; + ld.const.v4.u8 {%rs2378, %rs2379, %rs2380, %rs2381}, [matrix+1188]; + cvt.u32.u16 %r3729, %rs2381; + cvt.s32.s8 %r3730, %r3729; + cvt.u32.u16 %r3731, %rs2380; + cvt.s32.s8 %r3732, %r3731; + cvt.u32.u16 %r3733, %rs2379; + cvt.s32.s8 %r3734, %r3733; + cvt.u32.u16 %r3735, %rs2378; + cvt.s32.s8 %r3736, %r3735; + mad.lo.s32 %r3737, %r75, %r3736, %r3728; + mad.lo.s32 %r3738, %r76, %r3734, %r3737; + mad.lo.s32 %r3739, %r77, %r3732, %r3738; + mad.lo.s32 %r3740, %r78, %r3730, %r3739; + ld.const.v4.u8 {%rs2386, %rs2387, %rs2388, %rs2389}, [matrix+1192]; + cvt.u32.u16 %r3741, %rs2389; + cvt.s32.s8 %r3742, %r3741; + cvt.u32.u16 %r3743, %rs2388; + cvt.s32.s8 %r3744, %r3743; + cvt.u32.u16 %r3745, %rs2387; + cvt.s32.s8 %r3746, %r3745; + cvt.u32.u16 %r3747, %rs2386; + cvt.s32.s8 %r3748, %r3747; + mad.lo.s32 %r3749, %r80, %r3748, %r3740; + mad.lo.s32 %r3750, %r81, %r3746, %r3749; + mad.lo.s32 %r3751, %r83, %r3744, %r3750; + mad.lo.s32 %r3752, %r84, %r3742, %r3751; + ld.const.v4.u8 {%rs2394, %rs2395, %rs2396, %rs2397}, [matrix+1196]; + cvt.u32.u16 %r3753, %rs2397; + cvt.s32.s8 %r3754, %r3753; + cvt.u32.u16 %r3755, %rs2396; + cvt.s32.s8 %r3756, %r3755; + cvt.u32.u16 %r3757, %rs2395; + cvt.s32.s8 %r3758, %r3757; + cvt.u32.u16 %r3759, %rs2394; + cvt.s32.s8 %r3760, %r3759; + mad.lo.s32 %r3761, %r86, %r3760, %r3752; + mad.lo.s32 %r3762, %r87, %r3758, %r3761; + mad.lo.s32 %r3763, %r88, %r3756, %r3762; + mad.lo.s32 %r3764, %r89, %r3754, %r3763; + ld.const.v4.u8 {%rs2402, %rs2403, %rs2404, %rs2405}, [matrix+1200]; + cvt.u32.u16 %r3765, %rs2405; + cvt.s32.s8 %r3766, %r3765; + cvt.u32.u16 %r3767, %rs2404; + cvt.s32.s8 %r3768, %r3767; + cvt.u32.u16 %r3769, %rs2403; + cvt.s32.s8 %r3770, %r3769; + cvt.u32.u16 %r3771, %rs2402; + cvt.s32.s8 %r3772, %r3771; + mad.lo.s32 %r3773, %r271, %r3772, %r3764; + mad.lo.s32 %r3774, %r91, %r3770, %r3773; + mad.lo.s32 %r3775, %r93, %r3768, %r3774; + mad.lo.s32 %r3776, %r94, %r3766, %r3775; + ld.const.v4.u8 {%rs2410, %rs2411, %rs2412, %rs2413}, [matrix+1204]; + cvt.u32.u16 %r3777, %rs2413; + cvt.s32.s8 %r3778, %r3777; + cvt.u32.u16 %r3779, %rs2412; + cvt.s32.s8 %r3780, %r3779; + cvt.u32.u16 %r3781, %rs2411; + cvt.s32.s8 %r3782, %r3781; + cvt.u32.u16 %r3783, %rs2410; + cvt.s32.s8 %r3784, %r3783; + mad.lo.s32 %r3785, %r96, %r3784, %r3776; + mad.lo.s32 %r3786, %r97, %r3782, %r3785; + mad.lo.s32 %r3787, %r99, %r3780, %r3786; + mad.lo.s32 %r3788, %r100, %r3778, %r3787; + ld.const.v4.u8 {%rs2418, %rs2419, %rs2420, %rs2421}, [matrix+1208]; + cvt.u32.u16 %r3789, %rs2421; + cvt.s32.s8 %r3790, %r3789; + cvt.u32.u16 %r3791, %rs2420; + cvt.s32.s8 %r3792, %r3791; + cvt.u32.u16 %r3793, %rs2419; + cvt.s32.s8 %r3794, %r3793; + cvt.u32.u16 %r3795, %rs2418; + cvt.s32.s8 %r3796, %r3795; + mad.lo.s32 %r3797, %r103, %r3796, %r3788; + mad.lo.s32 %r3798, %r104, %r3794, %r3797; + mad.lo.s32 %r3799, %r107, %r3792, %r3798; + mad.lo.s32 %r3800, %r108, %r3790, %r3799; + ld.const.v4.u8 {%rs2426, %rs2427, %rs2428, %rs2429}, [matrix+1212]; + cvt.u32.u16 %r3801, %rs2429; + cvt.s32.s8 %r3802, %r3801; + cvt.u32.u16 %r3803, %rs2428; + cvt.s32.s8 %r3804, %r3803; + cvt.u32.u16 %r3805, %rs2427; + cvt.s32.s8 %r3806, %r3805; + cvt.u32.u16 %r3807, %rs2426; + cvt.s32.s8 %r3808, %r3807; + mad.lo.s32 %r3809, %r111, %r3808, %r3800; + mad.lo.s32 %r3810, 
%r112, %r3806, %r3809; + mad.lo.s32 %r3811, %r114, %r3804, %r3810; + mad.lo.s32 %r3812, %r115, %r3802, %r3811; + ld.const.v4.u8 {%rs2434, %rs2435, %rs2436, %rs2437}, [matrix+1216]; + cvt.u32.u16 %r3813, %rs2437; + cvt.s32.s8 %r3814, %r3813; + cvt.u32.u16 %r3815, %rs2436; + cvt.s32.s8 %r3816, %r3815; + cvt.u32.u16 %r3817, %rs2434; + cvt.s32.s8 %r3818, %r3817; + cvt.u32.u16 %r3819, %rs2435; + cvt.s32.s8 %r3820, %r3819; + mul.lo.s32 %r3821, %r34, %r3820; + mad.lo.s32 %r3822, %r124, %r3818, %r3821; + mad.lo.s32 %r3823, %r35, %r3816, %r3822; + mad.lo.s32 %r3824, %r36, %r3814, %r3823; + ld.const.v4.u8 {%rs2442, %rs2443, %rs2444, %rs2445}, [matrix+1220]; + cvt.u32.u16 %r3825, %rs2445; + cvt.s32.s8 %r3826, %r3825; + cvt.u32.u16 %r3827, %rs2444; + cvt.s32.s8 %r3828, %r3827; + cvt.u32.u16 %r3829, %rs2443; + cvt.s32.s8 %r3830, %r3829; + cvt.u32.u16 %r3831, %rs2442; + cvt.s32.s8 %r3832, %r3831; + mad.lo.s32 %r3833, %r37, %r3832, %r3824; + mad.lo.s32 %r3834, %r38, %r3830, %r3833; + mad.lo.s32 %r3835, %r39, %r3828, %r3834; + mad.lo.s32 %r3836, %r40, %r3826, %r3835; + ld.const.v4.u8 {%rs2450, %rs2451, %rs2452, %rs2453}, [matrix+1224]; + cvt.u32.u16 %r3837, %rs2453; + cvt.s32.s8 %r3838, %r3837; + cvt.u32.u16 %r3839, %rs2452; + cvt.s32.s8 %r3840, %r3839; + cvt.u32.u16 %r3841, %rs2451; + cvt.s32.s8 %r3842, %r3841; + cvt.u32.u16 %r3843, %rs2450; + cvt.s32.s8 %r3844, %r3843; + mad.lo.s32 %r3845, %r42, %r3844, %r3836; + mad.lo.s32 %r3846, %r43, %r3842, %r3845; + mad.lo.s32 %r3847, %r45, %r3840, %r3846; + mad.lo.s32 %r3848, %r46, %r3838, %r3847; + ld.const.v4.u8 {%rs2458, %rs2459, %rs2460, %rs2461}, [matrix+1228]; + cvt.u32.u16 %r3849, %rs2461; + cvt.s32.s8 %r3850, %r3849; + cvt.u32.u16 %r3851, %rs2460; + cvt.s32.s8 %r3852, %r3851; + cvt.u32.u16 %r3853, %rs2459; + cvt.s32.s8 %r3854, %r3853; + cvt.u32.u16 %r3855, %rs2458; + cvt.s32.s8 %r3856, %r3855; + mad.lo.s32 %r3857, %r48, %r3856, %r3848; + mad.lo.s32 %r3858, %r49, %r3854, %r3857; + mad.lo.s32 %r3859, %r50, %r3852, %r3858; + mad.lo.s32 %r3860, %r51, %r3850, %r3859; + ld.const.v4.u8 {%rs2466, %rs2467, %rs2468, %rs2469}, [matrix+1232]; + cvt.u32.u16 %r3861, %rs2469; + cvt.s32.s8 %r3862, %r3861; + cvt.u32.u16 %r3863, %rs2468; + cvt.s32.s8 %r3864, %r3863; + cvt.u32.u16 %r3865, %rs2467; + cvt.s32.s8 %r3866, %r3865; + cvt.u32.u16 %r3867, %rs2466; + cvt.s32.s8 %r3868, %r3867; + mad.lo.s32 %r3869, %r173, %r3868, %r3860; + mad.lo.s32 %r3870, %r53, %r3866, %r3869; + mad.lo.s32 %r3871, %r54, %r3864, %r3870; + mad.lo.s32 %r3872, %r55, %r3862, %r3871; + ld.const.v4.u8 {%rs2474, %rs2475, %rs2476, %rs2477}, [matrix+1236]; + cvt.u32.u16 %r3873, %rs2477; + cvt.s32.s8 %r3874, %r3873; + cvt.u32.u16 %r3875, %rs2476; + cvt.s32.s8 %r3876, %r3875; + cvt.u32.u16 %r3877, %rs2475; + cvt.s32.s8 %r3878, %r3877; + cvt.u32.u16 %r3879, %rs2474; + cvt.s32.s8 %r3880, %r3879; + mad.lo.s32 %r3881, %r56, %r3880, %r3872; + mad.lo.s32 %r3882, %r57, %r3878, %r3881; + mad.lo.s32 %r3883, %r58, %r3876, %r3882; + mad.lo.s32 %r3884, %r59, %r3874, %r3883; + ld.const.v4.u8 {%rs2482, %rs2483, %rs2484, %rs2485}, [matrix+1240]; + cvt.u32.u16 %r3885, %rs2485; + cvt.s32.s8 %r3886, %r3885; + cvt.u32.u16 %r3887, %rs2484; + cvt.s32.s8 %r3888, %r3887; + cvt.u32.u16 %r3889, %rs2483; + cvt.s32.s8 %r3890, %r3889; + cvt.u32.u16 %r3891, %rs2482; + cvt.s32.s8 %r3892, %r3891; + mad.lo.s32 %r3893, %r61, %r3892, %r3884; + mad.lo.s32 %r3894, %r62, %r3890, %r3893; + mad.lo.s32 %r3895, %r64, %r3888, %r3894; + mad.lo.s32 %r3896, %r65, %r3886, %r3895; + ld.const.v4.u8 {%rs2490, %rs2491, %rs2492, %rs2493}, [matrix+1244]; + 
cvt.u32.u16 %r3897, %rs2493; + cvt.s32.s8 %r3898, %r3897; + cvt.u32.u16 %r3899, %rs2492; + cvt.s32.s8 %r3900, %r3899; + cvt.u32.u16 %r3901, %rs2491; + cvt.s32.s8 %r3902, %r3901; + cvt.u32.u16 %r3903, %rs2490; + cvt.s32.s8 %r3904, %r3903; + mad.lo.s32 %r3905, %r67, %r3904, %r3896; + mad.lo.s32 %r3906, %r68, %r3902, %r3905; + mad.lo.s32 %r3907, %r69, %r3900, %r3906; + mad.lo.s32 %r3908, %r70, %r3898, %r3907; + ld.const.v4.u8 {%rs2498, %rs2499, %rs2500, %rs2501}, [matrix+1248]; + cvt.u32.u16 %r3909, %rs2501; + cvt.s32.s8 %r3910, %r3909; + cvt.u32.u16 %r3911, %rs2500; + cvt.s32.s8 %r3912, %r3911; + cvt.u32.u16 %r3913, %rs2499; + cvt.s32.s8 %r3914, %r3913; + cvt.u32.u16 %r3915, %rs2498; + cvt.s32.s8 %r3916, %r3915; + mad.lo.s32 %r3917, %r222, %r3916, %r3908; + mad.lo.s32 %r3918, %r72, %r3914, %r3917; + mad.lo.s32 %r3919, %r73, %r3912, %r3918; + mad.lo.s32 %r3920, %r74, %r3910, %r3919; + ld.const.v4.u8 {%rs2506, %rs2507, %rs2508, %rs2509}, [matrix+1252]; + cvt.u32.u16 %r3921, %rs2509; + cvt.s32.s8 %r3922, %r3921; + cvt.u32.u16 %r3923, %rs2508; + cvt.s32.s8 %r3924, %r3923; + cvt.u32.u16 %r3925, %rs2507; + cvt.s32.s8 %r3926, %r3925; + cvt.u32.u16 %r3927, %rs2506; + cvt.s32.s8 %r3928, %r3927; + mad.lo.s32 %r3929, %r75, %r3928, %r3920; + mad.lo.s32 %r3930, %r76, %r3926, %r3929; + mad.lo.s32 %r3931, %r77, %r3924, %r3930; + mad.lo.s32 %r3932, %r78, %r3922, %r3931; + ld.const.v4.u8 {%rs2514, %rs2515, %rs2516, %rs2517}, [matrix+1256]; + cvt.u32.u16 %r3933, %rs2517; + cvt.s32.s8 %r3934, %r3933; + cvt.u32.u16 %r3935, %rs2516; + cvt.s32.s8 %r3936, %r3935; + cvt.u32.u16 %r3937, %rs2515; + cvt.s32.s8 %r3938, %r3937; + cvt.u32.u16 %r3939, %rs2514; + cvt.s32.s8 %r3940, %r3939; + mad.lo.s32 %r3941, %r80, %r3940, %r3932; + mad.lo.s32 %r3942, %r81, %r3938, %r3941; + mad.lo.s32 %r3943, %r83, %r3936, %r3942; + mad.lo.s32 %r3944, %r84, %r3934, %r3943; + ld.const.v4.u8 {%rs2522, %rs2523, %rs2524, %rs2525}, [matrix+1260]; + cvt.u32.u16 %r3945, %rs2525; + cvt.s32.s8 %r3946, %r3945; + cvt.u32.u16 %r3947, %rs2524; + cvt.s32.s8 %r3948, %r3947; + cvt.u32.u16 %r3949, %rs2523; + cvt.s32.s8 %r3950, %r3949; + cvt.u32.u16 %r3951, %rs2522; + cvt.s32.s8 %r3952, %r3951; + mad.lo.s32 %r3953, %r86, %r3952, %r3944; + mad.lo.s32 %r3954, %r87, %r3950, %r3953; + mad.lo.s32 %r3955, %r88, %r3948, %r3954; + mad.lo.s32 %r3956, %r89, %r3946, %r3955; + ld.const.v4.u8 {%rs2530, %rs2531, %rs2532, %rs2533}, [matrix+1264]; + cvt.u32.u16 %r3957, %rs2533; + cvt.s32.s8 %r3958, %r3957; + cvt.u32.u16 %r3959, %rs2532; + cvt.s32.s8 %r3960, %r3959; + cvt.u32.u16 %r3961, %rs2531; + cvt.s32.s8 %r3962, %r3961; + cvt.u32.u16 %r3963, %rs2530; + cvt.s32.s8 %r3964, %r3963; + mad.lo.s32 %r3965, %r271, %r3964, %r3956; + mad.lo.s32 %r3966, %r91, %r3962, %r3965; + mad.lo.s32 %r3967, %r93, %r3960, %r3966; + mad.lo.s32 %r3968, %r94, %r3958, %r3967; + ld.const.v4.u8 {%rs2538, %rs2539, %rs2540, %rs2541}, [matrix+1268]; + cvt.u32.u16 %r3969, %rs2541; + cvt.s32.s8 %r3970, %r3969; + cvt.u32.u16 %r3971, %rs2540; + cvt.s32.s8 %r3972, %r3971; + cvt.u32.u16 %r3973, %rs2539; + cvt.s32.s8 %r3974, %r3973; + cvt.u32.u16 %r3975, %rs2538; + cvt.s32.s8 %r3976, %r3975; + mad.lo.s32 %r3977, %r96, %r3976, %r3968; + mad.lo.s32 %r3978, %r97, %r3974, %r3977; + mad.lo.s32 %r3979, %r99, %r3972, %r3978; + mad.lo.s32 %r3980, %r100, %r3970, %r3979; + ld.const.v4.u8 {%rs2546, %rs2547, %rs2548, %rs2549}, [matrix+1272]; + cvt.u32.u16 %r3981, %rs2549; + cvt.s32.s8 %r3982, %r3981; + cvt.u32.u16 %r3983, %rs2548; + cvt.s32.s8 %r3984, %r3983; + cvt.u32.u16 %r3985, %rs2547; + cvt.s32.s8 %r3986, %r3985; 
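+ // At each 64-byte row boundary of `matrix` (e.g. matrix+1280 below; matrix+1152,
+ // +1216 above) a fresh accumulator chain starts: the first product uses
+ // mul.lo.s32 rather than mad.lo.s32, and bytes 0/1 of that first group are
+ // unpacked in swapped order.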
+ cvt.u32.u16 %r3987, %rs2546; + cvt.s32.s8 %r3988, %r3987; + mad.lo.s32 %r3989, %r103, %r3988, %r3980; + mad.lo.s32 %r3990, %r104, %r3986, %r3989; + mad.lo.s32 %r3991, %r107, %r3984, %r3990; + mad.lo.s32 %r3992, %r108, %r3982, %r3991; + ld.const.v4.u8 {%rs2554, %rs2555, %rs2556, %rs2557}, [matrix+1276]; + cvt.u32.u16 %r3993, %rs2557; + cvt.s32.s8 %r3994, %r3993; + cvt.u32.u16 %r3995, %rs2556; + cvt.s32.s8 %r3996, %r3995; + cvt.u32.u16 %r3997, %rs2555; + cvt.s32.s8 %r3998, %r3997; + cvt.u32.u16 %r3999, %rs2554; + cvt.s32.s8 %r4000, %r3999; + mad.lo.s32 %r4001, %r111, %r4000, %r3992; + mad.lo.s32 %r4002, %r112, %r3998, %r4001; + mad.lo.s32 %r4003, %r114, %r3996, %r4002; + mad.lo.s32 %r4004, %r115, %r3994, %r4003; + shr.u32 %r4005, %r3812, 6; + and.b32 %r4006, %r4005, 240; + shr.u32 %r4007, %r4004, 10; + or.b32 %r4008, %r4007, %r4006; + xor.b32 %r4009, %r19, %r4008; + ld.const.v4.u8 {%rs2562, %rs2563, %rs2564, %rs2565}, [matrix+1280]; + cvt.u32.u16 %r4010, %rs2565; + cvt.s32.s8 %r4011, %r4010; + cvt.u32.u16 %r4012, %rs2564; + cvt.s32.s8 %r4013, %r4012; + cvt.u32.u16 %r4014, %rs2562; + cvt.s32.s8 %r4015, %r4014; + cvt.u32.u16 %r4016, %rs2563; + cvt.s32.s8 %r4017, %r4016; + mul.lo.s32 %r4018, %r34, %r4017; + mad.lo.s32 %r4019, %r124, %r4015, %r4018; + mad.lo.s32 %r4020, %r35, %r4013, %r4019; + mad.lo.s32 %r4021, %r36, %r4011, %r4020; + ld.const.v4.u8 {%rs2570, %rs2571, %rs2572, %rs2573}, [matrix+1284]; + cvt.u32.u16 %r4022, %rs2573; + cvt.s32.s8 %r4023, %r4022; + cvt.u32.u16 %r4024, %rs2572; + cvt.s32.s8 %r4025, %r4024; + cvt.u32.u16 %r4026, %rs2571; + cvt.s32.s8 %r4027, %r4026; + cvt.u32.u16 %r4028, %rs2570; + cvt.s32.s8 %r4029, %r4028; + mad.lo.s32 %r4030, %r37, %r4029, %r4021; + mad.lo.s32 %r4031, %r38, %r4027, %r4030; + mad.lo.s32 %r4032, %r39, %r4025, %r4031; + mad.lo.s32 %r4033, %r40, %r4023, %r4032; + ld.const.v4.u8 {%rs2578, %rs2579, %rs2580, %rs2581}, [matrix+1288]; + cvt.u32.u16 %r4034, %rs2581; + cvt.s32.s8 %r4035, %r4034; + cvt.u32.u16 %r4036, %rs2580; + cvt.s32.s8 %r4037, %r4036; + cvt.u32.u16 %r4038, %rs2579; + cvt.s32.s8 %r4039, %r4038; + cvt.u32.u16 %r4040, %rs2578; + cvt.s32.s8 %r4041, %r4040; + mad.lo.s32 %r4042, %r42, %r4041, %r4033; + mad.lo.s32 %r4043, %r43, %r4039, %r4042; + mad.lo.s32 %r4044, %r45, %r4037, %r4043; + mad.lo.s32 %r4045, %r46, %r4035, %r4044; + ld.const.v4.u8 {%rs2586, %rs2587, %rs2588, %rs2589}, [matrix+1292]; + cvt.u32.u16 %r4046, %rs2589; + cvt.s32.s8 %r4047, %r4046; + cvt.u32.u16 %r4048, %rs2588; + cvt.s32.s8 %r4049, %r4048; + cvt.u32.u16 %r4050, %rs2587; + cvt.s32.s8 %r4051, %r4050; + cvt.u32.u16 %r4052, %rs2586; + cvt.s32.s8 %r4053, %r4052; + mad.lo.s32 %r4054, %r48, %r4053, %r4045; + mad.lo.s32 %r4055, %r49, %r4051, %r4054; + mad.lo.s32 %r4056, %r50, %r4049, %r4055; + mad.lo.s32 %r4057, %r51, %r4047, %r4056; + ld.const.v4.u8 {%rs2594, %rs2595, %rs2596, %rs2597}, [matrix+1296]; + cvt.u32.u16 %r4058, %rs2597; + cvt.s32.s8 %r4059, %r4058; + cvt.u32.u16 %r4060, %rs2596; + cvt.s32.s8 %r4061, %r4060; + cvt.u32.u16 %r4062, %rs2595; + cvt.s32.s8 %r4063, %r4062; + cvt.u32.u16 %r4064, %rs2594; + cvt.s32.s8 %r4065, %r4064; + mad.lo.s32 %r4066, %r173, %r4065, %r4057; + mad.lo.s32 %r4067, %r53, %r4063, %r4066; + mad.lo.s32 %r4068, %r54, %r4061, %r4067; + mad.lo.s32 %r4069, %r55, %r4059, %r4068; + ld.const.v4.u8 {%rs2602, %rs2603, %rs2604, %rs2605}, [matrix+1300]; + cvt.u32.u16 %r4070, %rs2605; + cvt.s32.s8 %r4071, %r4070; + cvt.u32.u16 %r4072, %rs2604; + cvt.s32.s8 %r4073, %r4072; + cvt.u32.u16 %r4074, %rs2603; + cvt.s32.s8 %r4075, %r4074; + cvt.u32.u16 %r4076, 
%rs2602; + cvt.s32.s8 %r4077, %r4076; + mad.lo.s32 %r4078, %r56, %r4077, %r4069; + mad.lo.s32 %r4079, %r57, %r4075, %r4078; + mad.lo.s32 %r4080, %r58, %r4073, %r4079; + mad.lo.s32 %r4081, %r59, %r4071, %r4080; + ld.const.v4.u8 {%rs2610, %rs2611, %rs2612, %rs2613}, [matrix+1304]; + cvt.u32.u16 %r4082, %rs2613; + cvt.s32.s8 %r4083, %r4082; + cvt.u32.u16 %r4084, %rs2612; + cvt.s32.s8 %r4085, %r4084; + cvt.u32.u16 %r4086, %rs2611; + cvt.s32.s8 %r4087, %r4086; + cvt.u32.u16 %r4088, %rs2610; + cvt.s32.s8 %r4089, %r4088; + mad.lo.s32 %r4090, %r61, %r4089, %r4081; + mad.lo.s32 %r4091, %r62, %r4087, %r4090; + mad.lo.s32 %r4092, %r64, %r4085, %r4091; + mad.lo.s32 %r4093, %r65, %r4083, %r4092; + ld.const.v4.u8 {%rs2618, %rs2619, %rs2620, %rs2621}, [matrix+1308]; + cvt.u32.u16 %r4094, %rs2621; + cvt.s32.s8 %r4095, %r4094; + cvt.u32.u16 %r4096, %rs2620; + cvt.s32.s8 %r4097, %r4096; + cvt.u32.u16 %r4098, %rs2619; + cvt.s32.s8 %r4099, %r4098; + cvt.u32.u16 %r4100, %rs2618; + cvt.s32.s8 %r4101, %r4100; + mad.lo.s32 %r4102, %r67, %r4101, %r4093; + mad.lo.s32 %r4103, %r68, %r4099, %r4102; + mad.lo.s32 %r4104, %r69, %r4097, %r4103; + mad.lo.s32 %r4105, %r70, %r4095, %r4104; + ld.const.v4.u8 {%rs2626, %rs2627, %rs2628, %rs2629}, [matrix+1312]; + cvt.u32.u16 %r4106, %rs2629; + cvt.s32.s8 %r4107, %r4106; + cvt.u32.u16 %r4108, %rs2628; + cvt.s32.s8 %r4109, %r4108; + cvt.u32.u16 %r4110, %rs2627; + cvt.s32.s8 %r4111, %r4110; + cvt.u32.u16 %r4112, %rs2626; + cvt.s32.s8 %r4113, %r4112; + mad.lo.s32 %r4114, %r222, %r4113, %r4105; + mad.lo.s32 %r4115, %r72, %r4111, %r4114; + mad.lo.s32 %r4116, %r73, %r4109, %r4115; + mad.lo.s32 %r4117, %r74, %r4107, %r4116; + ld.const.v4.u8 {%rs2634, %rs2635, %rs2636, %rs2637}, [matrix+1316]; + cvt.u32.u16 %r4118, %rs2637; + cvt.s32.s8 %r4119, %r4118; + cvt.u32.u16 %r4120, %rs2636; + cvt.s32.s8 %r4121, %r4120; + cvt.u32.u16 %r4122, %rs2635; + cvt.s32.s8 %r4123, %r4122; + cvt.u32.u16 %r4124, %rs2634; + cvt.s32.s8 %r4125, %r4124; + mad.lo.s32 %r4126, %r75, %r4125, %r4117; + mad.lo.s32 %r4127, %r76, %r4123, %r4126; + mad.lo.s32 %r4128, %r77, %r4121, %r4127; + mad.lo.s32 %r4129, %r78, %r4119, %r4128; + ld.const.v4.u8 {%rs2642, %rs2643, %rs2644, %rs2645}, [matrix+1320]; + cvt.u32.u16 %r4130, %rs2645; + cvt.s32.s8 %r4131, %r4130; + cvt.u32.u16 %r4132, %rs2644; + cvt.s32.s8 %r4133, %r4132; + cvt.u32.u16 %r4134, %rs2643; + cvt.s32.s8 %r4135, %r4134; + cvt.u32.u16 %r4136, %rs2642; + cvt.s32.s8 %r4137, %r4136; + mad.lo.s32 %r4138, %r80, %r4137, %r4129; + mad.lo.s32 %r4139, %r81, %r4135, %r4138; + mad.lo.s32 %r4140, %r83, %r4133, %r4139; + mad.lo.s32 %r4141, %r84, %r4131, %r4140; + ld.const.v4.u8 {%rs2650, %rs2651, %rs2652, %rs2653}, [matrix+1324]; + cvt.u32.u16 %r4142, %rs2653; + cvt.s32.s8 %r4143, %r4142; + cvt.u32.u16 %r4144, %rs2652; + cvt.s32.s8 %r4145, %r4144; + cvt.u32.u16 %r4146, %rs2651; + cvt.s32.s8 %r4147, %r4146; + cvt.u32.u16 %r4148, %rs2650; + cvt.s32.s8 %r4149, %r4148; + mad.lo.s32 %r4150, %r86, %r4149, %r4141; + mad.lo.s32 %r4151, %r87, %r4147, %r4150; + mad.lo.s32 %r4152, %r88, %r4145, %r4151; + mad.lo.s32 %r4153, %r89, %r4143, %r4152; + ld.const.v4.u8 {%rs2658, %rs2659, %rs2660, %rs2661}, [matrix+1328]; + cvt.u32.u16 %r4154, %rs2661; + cvt.s32.s8 %r4155, %r4154; + cvt.u32.u16 %r4156, %rs2660; + cvt.s32.s8 %r4157, %r4156; + cvt.u32.u16 %r4158, %rs2659; + cvt.s32.s8 %r4159, %r4158; + cvt.u32.u16 %r4160, %rs2658; + cvt.s32.s8 %r4161, %r4160; + mad.lo.s32 %r4162, %r271, %r4161, %r4153; + mad.lo.s32 %r4163, %r91, %r4159, %r4162; + mad.lo.s32 %r4164, %r93, %r4157, %r4163; + 
mad.lo.s32 %r4165, %r94, %r4155, %r4164; + ld.const.v4.u8 {%rs2666, %rs2667, %rs2668, %rs2669}, [matrix+1332]; + cvt.u32.u16 %r4166, %rs2669; + cvt.s32.s8 %r4167, %r4166; + cvt.u32.u16 %r4168, %rs2668; + cvt.s32.s8 %r4169, %r4168; + cvt.u32.u16 %r4170, %rs2667; + cvt.s32.s8 %r4171, %r4170; + cvt.u32.u16 %r4172, %rs2666; + cvt.s32.s8 %r4173, %r4172; + mad.lo.s32 %r4174, %r96, %r4173, %r4165; + mad.lo.s32 %r4175, %r97, %r4171, %r4174; + mad.lo.s32 %r4176, %r99, %r4169, %r4175; + mad.lo.s32 %r4177, %r100, %r4167, %r4176; + ld.const.v4.u8 {%rs2674, %rs2675, %rs2676, %rs2677}, [matrix+1336]; + cvt.u32.u16 %r4178, %rs2677; + cvt.s32.s8 %r4179, %r4178; + cvt.u32.u16 %r4180, %rs2676; + cvt.s32.s8 %r4181, %r4180; + cvt.u32.u16 %r4182, %rs2675; + cvt.s32.s8 %r4183, %r4182; + cvt.u32.u16 %r4184, %rs2674; + cvt.s32.s8 %r4185, %r4184; + mad.lo.s32 %r4186, %r103, %r4185, %r4177; + mad.lo.s32 %r4187, %r104, %r4183, %r4186; + mad.lo.s32 %r4188, %r107, %r4181, %r4187; + mad.lo.s32 %r4189, %r108, %r4179, %r4188; + ld.const.v4.u8 {%rs2682, %rs2683, %rs2684, %rs2685}, [matrix+1340]; + cvt.u32.u16 %r4190, %rs2685; + cvt.s32.s8 %r4191, %r4190; + cvt.u32.u16 %r4192, %rs2684; + cvt.s32.s8 %r4193, %r4192; + cvt.u32.u16 %r4194, %rs2683; + cvt.s32.s8 %r4195, %r4194; + cvt.u32.u16 %r4196, %rs2682; + cvt.s32.s8 %r4197, %r4196; + mad.lo.s32 %r4198, %r111, %r4197, %r4189; + mad.lo.s32 %r4199, %r112, %r4195, %r4198; + mad.lo.s32 %r4200, %r114, %r4193, %r4199; + mad.lo.s32 %r4201, %r115, %r4191, %r4200; + ld.const.v4.u8 {%rs2690, %rs2691, %rs2692, %rs2693}, [matrix+1344]; + cvt.u32.u16 %r4202, %rs2693; + cvt.s32.s8 %r4203, %r4202; + cvt.u32.u16 %r4204, %rs2692; + cvt.s32.s8 %r4205, %r4204; + cvt.u32.u16 %r4206, %rs2690; + cvt.s32.s8 %r4207, %r4206; + cvt.u32.u16 %r4208, %rs2691; + cvt.s32.s8 %r4209, %r4208; + mul.lo.s32 %r4210, %r34, %r4209; + mad.lo.s32 %r4211, %r124, %r4207, %r4210; + mad.lo.s32 %r4212, %r35, %r4205, %r4211; + mad.lo.s32 %r4213, %r36, %r4203, %r4212; + ld.const.v4.u8 {%rs2698, %rs2699, %rs2700, %rs2701}, [matrix+1348]; + cvt.u32.u16 %r4214, %rs2701; + cvt.s32.s8 %r4215, %r4214; + cvt.u32.u16 %r4216, %rs2700; + cvt.s32.s8 %r4217, %r4216; + cvt.u32.u16 %r4218, %rs2699; + cvt.s32.s8 %r4219, %r4218; + cvt.u32.u16 %r4220, %rs2698; + cvt.s32.s8 %r4221, %r4220; + mad.lo.s32 %r4222, %r37, %r4221, %r4213; + mad.lo.s32 %r4223, %r38, %r4219, %r4222; + mad.lo.s32 %r4224, %r39, %r4217, %r4223; + mad.lo.s32 %r4225, %r40, %r4215, %r4224; + ld.const.v4.u8 {%rs2706, %rs2707, %rs2708, %rs2709}, [matrix+1352]; + cvt.u32.u16 %r4226, %rs2709; + cvt.s32.s8 %r4227, %r4226; + cvt.u32.u16 %r4228, %rs2708; + cvt.s32.s8 %r4229, %r4228; + cvt.u32.u16 %r4230, %rs2707; + cvt.s32.s8 %r4231, %r4230; + cvt.u32.u16 %r4232, %rs2706; + cvt.s32.s8 %r4233, %r4232; + mad.lo.s32 %r4234, %r42, %r4233, %r4225; + mad.lo.s32 %r4235, %r43, %r4231, %r4234; + mad.lo.s32 %r4236, %r45, %r4229, %r4235; + mad.lo.s32 %r4237, %r46, %r4227, %r4236; + ld.const.v4.u8 {%rs2714, %rs2715, %rs2716, %rs2717}, [matrix+1356]; + cvt.u32.u16 %r4238, %rs2717; + cvt.s32.s8 %r4239, %r4238; + cvt.u32.u16 %r4240, %rs2716; + cvt.s32.s8 %r4241, %r4240; + cvt.u32.u16 %r4242, %rs2715; + cvt.s32.s8 %r4243, %r4242; + cvt.u32.u16 %r4244, %rs2714; + cvt.s32.s8 %r4245, %r4244; + mad.lo.s32 %r4246, %r48, %r4245, %r4237; + mad.lo.s32 %r4247, %r49, %r4243, %r4246; + mad.lo.s32 %r4248, %r50, %r4241, %r4247; + mad.lo.s32 %r4249, %r51, %r4239, %r4248; + ld.const.v4.u8 {%rs2722, %rs2723, %rs2724, %rs2725}, [matrix+1360]; + cvt.u32.u16 %r4250, %rs2725; + cvt.s32.s8 %r4251, %r4250; + 
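+ // Pairs of finished row accumulators a, b are packed into one byte:
+ // (a >> 6) & 240 keeps a's bits 10-13 as the high nibble, b >> 10 supplies
+ // the low nibble, or.b32 merges them, and xor.b32 folds the byte into what
+ // appears to be the hash state (%r18, %r52, %r19 above; %r20 below);
+ // cvt.u64.u32 then widens some results (%rd387 above, %rd388 below) to 64 bits.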
cvt.u32.u16 %r4252, %rs2724; + cvt.s32.s8 %r4253, %r4252; + cvt.u32.u16 %r4254, %rs2723; + cvt.s32.s8 %r4255, %r4254; + cvt.u32.u16 %r4256, %rs2722; + cvt.s32.s8 %r4257, %r4256; + mad.lo.s32 %r4258, %r173, %r4257, %r4249; + mad.lo.s32 %r4259, %r53, %r4255, %r4258; + mad.lo.s32 %r4260, %r54, %r4253, %r4259; + mad.lo.s32 %r4261, %r55, %r4251, %r4260; + ld.const.v4.u8 {%rs2730, %rs2731, %rs2732, %rs2733}, [matrix+1364]; + cvt.u32.u16 %r4262, %rs2733; + cvt.s32.s8 %r4263, %r4262; + cvt.u32.u16 %r4264, %rs2732; + cvt.s32.s8 %r4265, %r4264; + cvt.u32.u16 %r4266, %rs2731; + cvt.s32.s8 %r4267, %r4266; + cvt.u32.u16 %r4268, %rs2730; + cvt.s32.s8 %r4269, %r4268; + mad.lo.s32 %r4270, %r56, %r4269, %r4261; + mad.lo.s32 %r4271, %r57, %r4267, %r4270; + mad.lo.s32 %r4272, %r58, %r4265, %r4271; + mad.lo.s32 %r4273, %r59, %r4263, %r4272; + ld.const.v4.u8 {%rs2738, %rs2739, %rs2740, %rs2741}, [matrix+1368]; + cvt.u32.u16 %r4274, %rs2741; + cvt.s32.s8 %r4275, %r4274; + cvt.u32.u16 %r4276, %rs2740; + cvt.s32.s8 %r4277, %r4276; + cvt.u32.u16 %r4278, %rs2739; + cvt.s32.s8 %r4279, %r4278; + cvt.u32.u16 %r4280, %rs2738; + cvt.s32.s8 %r4281, %r4280; + mad.lo.s32 %r4282, %r61, %r4281, %r4273; + mad.lo.s32 %r4283, %r62, %r4279, %r4282; + mad.lo.s32 %r4284, %r64, %r4277, %r4283; + mad.lo.s32 %r4285, %r65, %r4275, %r4284; + ld.const.v4.u8 {%rs2746, %rs2747, %rs2748, %rs2749}, [matrix+1372]; + cvt.u32.u16 %r4286, %rs2749; + cvt.s32.s8 %r4287, %r4286; + cvt.u32.u16 %r4288, %rs2748; + cvt.s32.s8 %r4289, %r4288; + cvt.u32.u16 %r4290, %rs2747; + cvt.s32.s8 %r4291, %r4290; + cvt.u32.u16 %r4292, %rs2746; + cvt.s32.s8 %r4293, %r4292; + mad.lo.s32 %r4294, %r67, %r4293, %r4285; + mad.lo.s32 %r4295, %r68, %r4291, %r4294; + mad.lo.s32 %r4296, %r69, %r4289, %r4295; + mad.lo.s32 %r4297, %r70, %r4287, %r4296; + ld.const.v4.u8 {%rs2754, %rs2755, %rs2756, %rs2757}, [matrix+1376]; + cvt.u32.u16 %r4298, %rs2757; + cvt.s32.s8 %r4299, %r4298; + cvt.u32.u16 %r4300, %rs2756; + cvt.s32.s8 %r4301, %r4300; + cvt.u32.u16 %r4302, %rs2755; + cvt.s32.s8 %r4303, %r4302; + cvt.u32.u16 %r4304, %rs2754; + cvt.s32.s8 %r4305, %r4304; + mad.lo.s32 %r4306, %r222, %r4305, %r4297; + mad.lo.s32 %r4307, %r72, %r4303, %r4306; + mad.lo.s32 %r4308, %r73, %r4301, %r4307; + mad.lo.s32 %r4309, %r74, %r4299, %r4308; + ld.const.v4.u8 {%rs2762, %rs2763, %rs2764, %rs2765}, [matrix+1380]; + cvt.u32.u16 %r4310, %rs2765; + cvt.s32.s8 %r4311, %r4310; + cvt.u32.u16 %r4312, %rs2764; + cvt.s32.s8 %r4313, %r4312; + cvt.u32.u16 %r4314, %rs2763; + cvt.s32.s8 %r4315, %r4314; + cvt.u32.u16 %r4316, %rs2762; + cvt.s32.s8 %r4317, %r4316; + mad.lo.s32 %r4318, %r75, %r4317, %r4309; + mad.lo.s32 %r4319, %r76, %r4315, %r4318; + mad.lo.s32 %r4320, %r77, %r4313, %r4319; + mad.lo.s32 %r4321, %r78, %r4311, %r4320; + ld.const.v4.u8 {%rs2770, %rs2771, %rs2772, %rs2773}, [matrix+1384]; + cvt.u32.u16 %r4322, %rs2773; + cvt.s32.s8 %r4323, %r4322; + cvt.u32.u16 %r4324, %rs2772; + cvt.s32.s8 %r4325, %r4324; + cvt.u32.u16 %r4326, %rs2771; + cvt.s32.s8 %r4327, %r4326; + cvt.u32.u16 %r4328, %rs2770; + cvt.s32.s8 %r4329, %r4328; + mad.lo.s32 %r4330, %r80, %r4329, %r4321; + mad.lo.s32 %r4331, %r81, %r4327, %r4330; + mad.lo.s32 %r4332, %r83, %r4325, %r4331; + mad.lo.s32 %r4333, %r84, %r4323, %r4332; + ld.const.v4.u8 {%rs2778, %rs2779, %rs2780, %rs2781}, [matrix+1388]; + cvt.u32.u16 %r4334, %rs2781; + cvt.s32.s8 %r4335, %r4334; + cvt.u32.u16 %r4336, %rs2780; + cvt.s32.s8 %r4337, %r4336; + cvt.u32.u16 %r4338, %rs2779; + cvt.s32.s8 %r4339, %r4338; + cvt.u32.u16 %r4340, %rs2778; + cvt.s32.s8 %r4341, %r4340; 
+ mad.lo.s32 %r4342, %r86, %r4341, %r4333; + mad.lo.s32 %r4343, %r87, %r4339, %r4342; + mad.lo.s32 %r4344, %r88, %r4337, %r4343; + mad.lo.s32 %r4345, %r89, %r4335, %r4344; + ld.const.v4.u8 {%rs2786, %rs2787, %rs2788, %rs2789}, [matrix+1392]; + cvt.u32.u16 %r4346, %rs2789; + cvt.s32.s8 %r4347, %r4346; + cvt.u32.u16 %r4348, %rs2788; + cvt.s32.s8 %r4349, %r4348; + cvt.u32.u16 %r4350, %rs2787; + cvt.s32.s8 %r4351, %r4350; + cvt.u32.u16 %r4352, %rs2786; + cvt.s32.s8 %r4353, %r4352; + mad.lo.s32 %r4354, %r271, %r4353, %r4345; + mad.lo.s32 %r4355, %r91, %r4351, %r4354; + mad.lo.s32 %r4356, %r93, %r4349, %r4355; + mad.lo.s32 %r4357, %r94, %r4347, %r4356; + ld.const.v4.u8 {%rs2794, %rs2795, %rs2796, %rs2797}, [matrix+1396]; + cvt.u32.u16 %r4358, %rs2797; + cvt.s32.s8 %r4359, %r4358; + cvt.u32.u16 %r4360, %rs2796; + cvt.s32.s8 %r4361, %r4360; + cvt.u32.u16 %r4362, %rs2795; + cvt.s32.s8 %r4363, %r4362; + cvt.u32.u16 %r4364, %rs2794; + cvt.s32.s8 %r4365, %r4364; + mad.lo.s32 %r4366, %r96, %r4365, %r4357; + mad.lo.s32 %r4367, %r97, %r4363, %r4366; + mad.lo.s32 %r4368, %r99, %r4361, %r4367; + mad.lo.s32 %r4369, %r100, %r4359, %r4368; + ld.const.v4.u8 {%rs2802, %rs2803, %rs2804, %rs2805}, [matrix+1400]; + cvt.u32.u16 %r4370, %rs2805; + cvt.s32.s8 %r4371, %r4370; + cvt.u32.u16 %r4372, %rs2804; + cvt.s32.s8 %r4373, %r4372; + cvt.u32.u16 %r4374, %rs2803; + cvt.s32.s8 %r4375, %r4374; + cvt.u32.u16 %r4376, %rs2802; + cvt.s32.s8 %r4377, %r4376; + mad.lo.s32 %r4378, %r103, %r4377, %r4369; + mad.lo.s32 %r4379, %r104, %r4375, %r4378; + mad.lo.s32 %r4380, %r107, %r4373, %r4379; + mad.lo.s32 %r4381, %r108, %r4371, %r4380; + ld.const.v4.u8 {%rs2810, %rs2811, %rs2812, %rs2813}, [matrix+1404]; + cvt.u32.u16 %r4382, %rs2813; + cvt.s32.s8 %r4383, %r4382; + cvt.u32.u16 %r4384, %rs2812; + cvt.s32.s8 %r4385, %r4384; + cvt.u32.u16 %r4386, %rs2811; + cvt.s32.s8 %r4387, %r4386; + cvt.u32.u16 %r4388, %rs2810; + cvt.s32.s8 %r4389, %r4388; + mad.lo.s32 %r4390, %r111, %r4389, %r4381; + mad.lo.s32 %r4391, %r112, %r4387, %r4390; + mad.lo.s32 %r4392, %r114, %r4385, %r4391; + mad.lo.s32 %r4393, %r115, %r4383, %r4392; + shr.u32 %r4394, %r4201, 6; + and.b32 %r4395, %r4394, 240; + shr.u32 %r4396, %r4393, 10; + or.b32 %r4397, %r4396, %r4395; + xor.b32 %r4398, %r20, %r4397; + cvt.u64.u32 %rd388, %r4398; + ld.const.v4.u8 {%rs2818, %rs2819, %rs2820, %rs2821}, [matrix+1408]; + cvt.u32.u16 %r4399, %rs2821; + cvt.s32.s8 %r4400, %r4399; + cvt.u32.u16 %r4401, %rs2820; + cvt.s32.s8 %r4402, %r4401; + cvt.u32.u16 %r4403, %rs2818; + cvt.s32.s8 %r4404, %r4403; + cvt.u32.u16 %r4405, %rs2819; + cvt.s32.s8 %r4406, %r4405; + mul.lo.s32 %r4407, %r34, %r4406; + mad.lo.s32 %r4408, %r124, %r4404, %r4407; + mad.lo.s32 %r4409, %r35, %r4402, %r4408; + mad.lo.s32 %r4410, %r36, %r4400, %r4409; + ld.const.v4.u8 {%rs2826, %rs2827, %rs2828, %rs2829}, [matrix+1412]; + cvt.u32.u16 %r4411, %rs2829; + cvt.s32.s8 %r4412, %r4411; + cvt.u32.u16 %r4413, %rs2828; + cvt.s32.s8 %r4414, %r4413; + cvt.u32.u16 %r4415, %rs2827; + cvt.s32.s8 %r4416, %r4415; + cvt.u32.u16 %r4417, %rs2826; + cvt.s32.s8 %r4418, %r4417; + mad.lo.s32 %r4419, %r37, %r4418, %r4410; + mad.lo.s32 %r4420, %r38, %r4416, %r4419; + mad.lo.s32 %r4421, %r39, %r4414, %r4420; + mad.lo.s32 %r4422, %r40, %r4412, %r4421; + ld.const.v4.u8 {%rs2834, %rs2835, %rs2836, %rs2837}, [matrix+1416]; + cvt.u32.u16 %r4423, %rs2837; + cvt.s32.s8 %r4424, %r4423; + cvt.u32.u16 %r4425, %rs2836; + cvt.s32.s8 %r4426, %r4425; + cvt.u32.u16 %r4427, %rs2835; + cvt.s32.s8 %r4428, %r4427; + cvt.u32.u16 %r4429, %rs2834; + cvt.s32.s8 %r4430, 
[... unchanged unrolled pattern elided: one `ld.const.v4.u8` / sign-extend (`cvt.u32.u16`, `cvt.s32.s8`) / `mad.lo.s32` group per four bytes of `matrix`, repeated from [matrix+1420] through [matrix+1532], accumulating the two 64-element row sums %r4590 and %r4782; a representative stretch follows ...]
+	shr.u32 	%r4783, %r4590, 6;
+	and.b32  	%r4784, %r4783, 240;
+	shr.u32 	%r4785, %r4782, 10;
+	or.b32  	%r4786, %r4785, %r4784;
+	xor.b32  	%r4787, %r21, %r4786;
+	cvt.u64.u32 	%rd389, %r4787;
+	ld.const.v4.u8 	{%rs3074, %rs3075, %rs3076, %rs3077}, [matrix+1536];
+	cvt.u32.u16 	%r4788, %rs3077;
+	cvt.s32.s8 	%r4789, %r4788;
+	cvt.u32.u16 	%r4790, %rs3076;
+	cvt.s32.s8 	%r4791, %r4790;
+	cvt.u32.u16 	%r4792, %rs3074;
+	cvt.s32.s8 	%r4793, %r4792;
+	cvt.u32.u16 	%r4794, %rs3075;
+	cvt.s32.s8 	%r4795, %r4794;
+	mul.lo.s32 	%r4796, %r34, %r4795;
+	mad.lo.s32 	%r4797, %r124, %r4793, %r4796;
+	mad.lo.s32 	%r4798, %r35, %r4791, %r4797;
+	mad.lo.s32 	%r4799, %r36, %r4789, %r4798;
+	ld.const.v4.u8 	{%rs3082, %rs3083, %rs3084, %rs3085}, [matrix+1540];
+	cvt.u32.u16 	%r4800, %rs3085;
+	cvt.s32.s8 	%r4801, %r4800;
+	cvt.u32.u16 	%r4802, %rs3084;
+	cvt.s32.s8 	%r4803, %r4802;
+	cvt.u32.u16 	%r4804, %rs3083;
+	cvt.s32.s8 	%r4805, %r4804;
+	cvt.u32.u16 	%r4806, %rs3082;
+	cvt.s32.s8 	%r4807, %r4806;
+	mad.lo.s32 	%r4808, %r37, %r4807, %r4799;
+	mad.lo.s32 	%r4809, %r38, %r4805, %r4808;
+	mad.lo.s32 	%r4810, %r39, %r4803, %r4809;
+	mad.lo.s32 	%r4811, %r40, %r4801, %r4810;
[... the same unrolled pattern continues from [matrix+1544] through [matrix+2028]: each pair of 64-byte matrix rows yields two dot-product sums that are packed by the identical shr/and/or/xor sequence against %r22, %r23 and %r24, producing %rd390, %rd391 and %rd392 ...]
%r6286, %r6278; + mad.lo.s32 %r6288, %r87, %r6284, %r6287; + mad.lo.s32 %r6289, %r88, %r6282, %r6288; + mad.lo.s32 %r6290, %r89, %r6280, %r6289; + ld.const.v4.u8 {%rs4066, %rs4067, %rs4068, %rs4069}, [matrix+2032]; + cvt.u32.u16 %r6291, %rs4069; + cvt.s32.s8 %r6292, %r6291; + cvt.u32.u16 %r6293, %rs4068; + cvt.s32.s8 %r6294, %r6293; + cvt.u32.u16 %r6295, %rs4067; + cvt.s32.s8 %r6296, %r6295; + cvt.u32.u16 %r6297, %rs4066; + cvt.s32.s8 %r6298, %r6297; + mad.lo.s32 %r6299, %r271, %r6298, %r6290; + mad.lo.s32 %r6300, %r91, %r6296, %r6299; + mad.lo.s32 %r6301, %r93, %r6294, %r6300; + mad.lo.s32 %r6302, %r94, %r6292, %r6301; + ld.const.v4.u8 {%rs4074, %rs4075, %rs4076, %rs4077}, [matrix+2036]; + cvt.u32.u16 %r6303, %rs4077; + cvt.s32.s8 %r6304, %r6303; + cvt.u32.u16 %r6305, %rs4076; + cvt.s32.s8 %r6306, %r6305; + cvt.u32.u16 %r6307, %rs4075; + cvt.s32.s8 %r6308, %r6307; + cvt.u32.u16 %r6309, %rs4074; + cvt.s32.s8 %r6310, %r6309; + mad.lo.s32 %r6311, %r96, %r6310, %r6302; + mad.lo.s32 %r6312, %r97, %r6308, %r6311; + mad.lo.s32 %r6313, %r99, %r6306, %r6312; + mad.lo.s32 %r6314, %r100, %r6304, %r6313; + ld.const.v4.u8 {%rs4082, %rs4083, %rs4084, %rs4085}, [matrix+2040]; + cvt.u32.u16 %r6315, %rs4085; + cvt.s32.s8 %r6316, %r6315; + cvt.u32.u16 %r6317, %rs4084; + cvt.s32.s8 %r6318, %r6317; + cvt.u32.u16 %r6319, %rs4083; + cvt.s32.s8 %r6320, %r6319; + cvt.u32.u16 %r6321, %rs4082; + cvt.s32.s8 %r6322, %r6321; + mad.lo.s32 %r6323, %r103, %r6322, %r6314; + mad.lo.s32 %r6324, %r104, %r6320, %r6323; + mad.lo.s32 %r6325, %r107, %r6318, %r6324; + mad.lo.s32 %r6326, %r108, %r6316, %r6325; + ld.const.v4.u8 {%rs4090, %rs4091, %rs4092, %rs4093}, [matrix+2044]; + cvt.u32.u16 %r6327, %rs4093; + cvt.s32.s8 %r6328, %r6327; + cvt.u32.u16 %r6329, %rs4092; + cvt.s32.s8 %r6330, %r6329; + cvt.u32.u16 %r6331, %rs4091; + cvt.s32.s8 %r6332, %r6331; + cvt.u32.u16 %r6333, %rs4090; + cvt.s32.s8 %r6334, %r6333; + mad.lo.s32 %r6335, %r111, %r6334, %r6326; + mad.lo.s32 %r6336, %r112, %r6332, %r6335; + mad.lo.s32 %r6337, %r114, %r6330, %r6336; + mad.lo.s32 %r6338, %r115, %r6328, %r6337; + shr.u32 %r6339, %r6146, 6; + and.b32 %r6340, %r6339, 240; + shr.u32 %r6341, %r6338, 10; + or.b32 %r6342, %r6341, %r6340; + xor.b32 %r6343, %r25, %r6342; + ld.const.v4.u8 {%rs4098, %rs4099, %rs4100, %rs4101}, [matrix+2048]; + cvt.u32.u16 %r6344, %rs4101; + cvt.s32.s8 %r6345, %r6344; + cvt.u32.u16 %r6346, %rs4100; + cvt.s32.s8 %r6347, %r6346; + cvt.u32.u16 %r6348, %rs4098; + cvt.s32.s8 %r6349, %r6348; + cvt.u32.u16 %r6350, %rs4099; + cvt.s32.s8 %r6351, %r6350; + mul.lo.s32 %r6352, %r34, %r6351; + mad.lo.s32 %r6353, %r124, %r6349, %r6352; + mad.lo.s32 %r6354, %r35, %r6347, %r6353; + mad.lo.s32 %r6355, %r36, %r6345, %r6354; + ld.const.v4.u8 {%rs4106, %rs4107, %rs4108, %rs4109}, [matrix+2052]; + cvt.u32.u16 %r6356, %rs4109; + cvt.s32.s8 %r6357, %r6356; + cvt.u32.u16 %r6358, %rs4108; + cvt.s32.s8 %r6359, %r6358; + cvt.u32.u16 %r6360, %rs4107; + cvt.s32.s8 %r6361, %r6360; + cvt.u32.u16 %r6362, %rs4106; + cvt.s32.s8 %r6363, %r6362; + mad.lo.s32 %r6364, %r37, %r6363, %r6355; + mad.lo.s32 %r6365, %r38, %r6361, %r6364; + mad.lo.s32 %r6366, %r39, %r6359, %r6365; + mad.lo.s32 %r6367, %r40, %r6357, %r6366; + ld.const.v4.u8 {%rs4114, %rs4115, %rs4116, %rs4117}, [matrix+2056]; + cvt.u32.u16 %r6368, %rs4117; + cvt.s32.s8 %r6369, %r6368; + cvt.u32.u16 %r6370, %rs4116; + cvt.s32.s8 %r6371, %r6370; + cvt.u32.u16 %r6372, %rs4115; + cvt.s32.s8 %r6373, %r6372; + cvt.u32.u16 %r6374, %rs4114; + cvt.s32.s8 %r6375, %r6374; + mad.lo.s32 %r6376, %r42, %r6375, %r6367; + 
mad.lo.s32 %r6377, %r43, %r6373, %r6376; + mad.lo.s32 %r6378, %r45, %r6371, %r6377; + mad.lo.s32 %r6379, %r46, %r6369, %r6378; + ld.const.v4.u8 {%rs4122, %rs4123, %rs4124, %rs4125}, [matrix+2060]; + cvt.u32.u16 %r6380, %rs4125; + cvt.s32.s8 %r6381, %r6380; + cvt.u32.u16 %r6382, %rs4124; + cvt.s32.s8 %r6383, %r6382; + cvt.u32.u16 %r6384, %rs4123; + cvt.s32.s8 %r6385, %r6384; + cvt.u32.u16 %r6386, %rs4122; + cvt.s32.s8 %r6387, %r6386; + mad.lo.s32 %r6388, %r48, %r6387, %r6379; + mad.lo.s32 %r6389, %r49, %r6385, %r6388; + mad.lo.s32 %r6390, %r50, %r6383, %r6389; + mad.lo.s32 %r6391, %r51, %r6381, %r6390; + ld.const.v4.u8 {%rs4130, %rs4131, %rs4132, %rs4133}, [matrix+2064]; + cvt.u32.u16 %r6392, %rs4133; + cvt.s32.s8 %r6393, %r6392; + cvt.u32.u16 %r6394, %rs4132; + cvt.s32.s8 %r6395, %r6394; + cvt.u32.u16 %r6396, %rs4131; + cvt.s32.s8 %r6397, %r6396; + cvt.u32.u16 %r6398, %rs4130; + cvt.s32.s8 %r6399, %r6398; + mad.lo.s32 %r6400, %r173, %r6399, %r6391; + mad.lo.s32 %r6401, %r53, %r6397, %r6400; + mad.lo.s32 %r6402, %r54, %r6395, %r6401; + mad.lo.s32 %r6403, %r55, %r6393, %r6402; + ld.const.v4.u8 {%rs4138, %rs4139, %rs4140, %rs4141}, [matrix+2068]; + cvt.u32.u16 %r6404, %rs4141; + cvt.s32.s8 %r6405, %r6404; + cvt.u32.u16 %r6406, %rs4140; + cvt.s32.s8 %r6407, %r6406; + cvt.u32.u16 %r6408, %rs4139; + cvt.s32.s8 %r6409, %r6408; + cvt.u32.u16 %r6410, %rs4138; + cvt.s32.s8 %r6411, %r6410; + mad.lo.s32 %r6412, %r56, %r6411, %r6403; + mad.lo.s32 %r6413, %r57, %r6409, %r6412; + mad.lo.s32 %r6414, %r58, %r6407, %r6413; + mad.lo.s32 %r6415, %r59, %r6405, %r6414; + ld.const.v4.u8 {%rs4146, %rs4147, %rs4148, %rs4149}, [matrix+2072]; + cvt.u32.u16 %r6416, %rs4149; + cvt.s32.s8 %r6417, %r6416; + cvt.u32.u16 %r6418, %rs4148; + cvt.s32.s8 %r6419, %r6418; + cvt.u32.u16 %r6420, %rs4147; + cvt.s32.s8 %r6421, %r6420; + cvt.u32.u16 %r6422, %rs4146; + cvt.s32.s8 %r6423, %r6422; + mad.lo.s32 %r6424, %r61, %r6423, %r6415; + mad.lo.s32 %r6425, %r62, %r6421, %r6424; + mad.lo.s32 %r6426, %r64, %r6419, %r6425; + mad.lo.s32 %r6427, %r65, %r6417, %r6426; + ld.const.v4.u8 {%rs4154, %rs4155, %rs4156, %rs4157}, [matrix+2076]; + cvt.u32.u16 %r6428, %rs4157; + cvt.s32.s8 %r6429, %r6428; + cvt.u32.u16 %r6430, %rs4156; + cvt.s32.s8 %r6431, %r6430; + cvt.u32.u16 %r6432, %rs4155; + cvt.s32.s8 %r6433, %r6432; + cvt.u32.u16 %r6434, %rs4154; + cvt.s32.s8 %r6435, %r6434; + mad.lo.s32 %r6436, %r67, %r6435, %r6427; + mad.lo.s32 %r6437, %r68, %r6433, %r6436; + mad.lo.s32 %r6438, %r69, %r6431, %r6437; + mad.lo.s32 %r6439, %r70, %r6429, %r6438; + ld.const.v4.u8 {%rs4162, %rs4163, %rs4164, %rs4165}, [matrix+2080]; + cvt.u32.u16 %r6440, %rs4165; + cvt.s32.s8 %r6441, %r6440; + cvt.u32.u16 %r6442, %rs4164; + cvt.s32.s8 %r6443, %r6442; + cvt.u32.u16 %r6444, %rs4163; + cvt.s32.s8 %r6445, %r6444; + cvt.u32.u16 %r6446, %rs4162; + cvt.s32.s8 %r6447, %r6446; + mad.lo.s32 %r6448, %r222, %r6447, %r6439; + mad.lo.s32 %r6449, %r72, %r6445, %r6448; + mad.lo.s32 %r6450, %r73, %r6443, %r6449; + mad.lo.s32 %r6451, %r74, %r6441, %r6450; + ld.const.v4.u8 {%rs4170, %rs4171, %rs4172, %rs4173}, [matrix+2084]; + cvt.u32.u16 %r6452, %rs4173; + cvt.s32.s8 %r6453, %r6452; + cvt.u32.u16 %r6454, %rs4172; + cvt.s32.s8 %r6455, %r6454; + cvt.u32.u16 %r6456, %rs4171; + cvt.s32.s8 %r6457, %r6456; + cvt.u32.u16 %r6458, %rs4170; + cvt.s32.s8 %r6459, %r6458; + mad.lo.s32 %r6460, %r75, %r6459, %r6451; + mad.lo.s32 %r6461, %r76, %r6457, %r6460; + mad.lo.s32 %r6462, %r77, %r6455, %r6461; + mad.lo.s32 %r6463, %r78, %r6453, %r6462; + ld.const.v4.u8 {%rs4178, %rs4179, %rs4180, 
%rs4181}, [matrix+2088]; + cvt.u32.u16 %r6464, %rs4181; + cvt.s32.s8 %r6465, %r6464; + cvt.u32.u16 %r6466, %rs4180; + cvt.s32.s8 %r6467, %r6466; + cvt.u32.u16 %r6468, %rs4179; + cvt.s32.s8 %r6469, %r6468; + cvt.u32.u16 %r6470, %rs4178; + cvt.s32.s8 %r6471, %r6470; + mad.lo.s32 %r6472, %r80, %r6471, %r6463; + mad.lo.s32 %r6473, %r81, %r6469, %r6472; + mad.lo.s32 %r6474, %r83, %r6467, %r6473; + mad.lo.s32 %r6475, %r84, %r6465, %r6474; + ld.const.v4.u8 {%rs4186, %rs4187, %rs4188, %rs4189}, [matrix+2092]; + cvt.u32.u16 %r6476, %rs4189; + cvt.s32.s8 %r6477, %r6476; + cvt.u32.u16 %r6478, %rs4188; + cvt.s32.s8 %r6479, %r6478; + cvt.u32.u16 %r6480, %rs4187; + cvt.s32.s8 %r6481, %r6480; + cvt.u32.u16 %r6482, %rs4186; + cvt.s32.s8 %r6483, %r6482; + mad.lo.s32 %r6484, %r86, %r6483, %r6475; + mad.lo.s32 %r6485, %r87, %r6481, %r6484; + mad.lo.s32 %r6486, %r88, %r6479, %r6485; + mad.lo.s32 %r6487, %r89, %r6477, %r6486; + ld.const.v4.u8 {%rs4194, %rs4195, %rs4196, %rs4197}, [matrix+2096]; + cvt.u32.u16 %r6488, %rs4197; + cvt.s32.s8 %r6489, %r6488; + cvt.u32.u16 %r6490, %rs4196; + cvt.s32.s8 %r6491, %r6490; + cvt.u32.u16 %r6492, %rs4195; + cvt.s32.s8 %r6493, %r6492; + cvt.u32.u16 %r6494, %rs4194; + cvt.s32.s8 %r6495, %r6494; + mad.lo.s32 %r6496, %r271, %r6495, %r6487; + mad.lo.s32 %r6497, %r91, %r6493, %r6496; + mad.lo.s32 %r6498, %r93, %r6491, %r6497; + mad.lo.s32 %r6499, %r94, %r6489, %r6498; + ld.const.v4.u8 {%rs4202, %rs4203, %rs4204, %rs4205}, [matrix+2100]; + cvt.u32.u16 %r6500, %rs4205; + cvt.s32.s8 %r6501, %r6500; + cvt.u32.u16 %r6502, %rs4204; + cvt.s32.s8 %r6503, %r6502; + cvt.u32.u16 %r6504, %rs4203; + cvt.s32.s8 %r6505, %r6504; + cvt.u32.u16 %r6506, %rs4202; + cvt.s32.s8 %r6507, %r6506; + mad.lo.s32 %r6508, %r96, %r6507, %r6499; + mad.lo.s32 %r6509, %r97, %r6505, %r6508; + mad.lo.s32 %r6510, %r99, %r6503, %r6509; + mad.lo.s32 %r6511, %r100, %r6501, %r6510; + ld.const.v4.u8 {%rs4210, %rs4211, %rs4212, %rs4213}, [matrix+2104]; + cvt.u32.u16 %r6512, %rs4213; + cvt.s32.s8 %r6513, %r6512; + cvt.u32.u16 %r6514, %rs4212; + cvt.s32.s8 %r6515, %r6514; + cvt.u32.u16 %r6516, %rs4211; + cvt.s32.s8 %r6517, %r6516; + cvt.u32.u16 %r6518, %rs4210; + cvt.s32.s8 %r6519, %r6518; + mad.lo.s32 %r6520, %r103, %r6519, %r6511; + mad.lo.s32 %r6521, %r104, %r6517, %r6520; + mad.lo.s32 %r6522, %r107, %r6515, %r6521; + mad.lo.s32 %r6523, %r108, %r6513, %r6522; + ld.const.v4.u8 {%rs4218, %rs4219, %rs4220, %rs4221}, [matrix+2108]; + cvt.u32.u16 %r6524, %rs4221; + cvt.s32.s8 %r6525, %r6524; + cvt.u32.u16 %r6526, %rs4220; + cvt.s32.s8 %r6527, %r6526; + cvt.u32.u16 %r6528, %rs4219; + cvt.s32.s8 %r6529, %r6528; + cvt.u32.u16 %r6530, %rs4218; + cvt.s32.s8 %r6531, %r6530; + mad.lo.s32 %r6532, %r111, %r6531, %r6523; + mad.lo.s32 %r6533, %r112, %r6529, %r6532; + mad.lo.s32 %r6534, %r114, %r6527, %r6533; + mad.lo.s32 %r6535, %r115, %r6525, %r6534; + ld.const.v4.u8 {%rs4226, %rs4227, %rs4228, %rs4229}, [matrix+2112]; + cvt.u32.u16 %r6536, %rs4229; + cvt.s32.s8 %r6537, %r6536; + cvt.u32.u16 %r6538, %rs4228; + cvt.s32.s8 %r6539, %r6538; + cvt.u32.u16 %r6540, %rs4226; + cvt.s32.s8 %r6541, %r6540; + cvt.u32.u16 %r6542, %rs4227; + cvt.s32.s8 %r6543, %r6542; + mul.lo.s32 %r6544, %r34, %r6543; + mad.lo.s32 %r6545, %r124, %r6541, %r6544; + mad.lo.s32 %r6546, %r35, %r6539, %r6545; + mad.lo.s32 %r6547, %r36, %r6537, %r6546; + ld.const.v4.u8 {%rs4234, %rs4235, %rs4236, %rs4237}, [matrix+2116]; + cvt.u32.u16 %r6548, %rs4237; + cvt.s32.s8 %r6549, %r6548; + cvt.u32.u16 %r6550, %rs4236; + cvt.s32.s8 %r6551, %r6550; + cvt.u32.u16 %r6552, %rs4235; + 
cvt.s32.s8 %r6553, %r6552; + cvt.u32.u16 %r6554, %rs4234; + cvt.s32.s8 %r6555, %r6554; + mad.lo.s32 %r6556, %r37, %r6555, %r6547; + mad.lo.s32 %r6557, %r38, %r6553, %r6556; + mad.lo.s32 %r6558, %r39, %r6551, %r6557; + mad.lo.s32 %r6559, %r40, %r6549, %r6558; + ld.const.v4.u8 {%rs4242, %rs4243, %rs4244, %rs4245}, [matrix+2120]; + cvt.u32.u16 %r6560, %rs4245; + cvt.s32.s8 %r6561, %r6560; + cvt.u32.u16 %r6562, %rs4244; + cvt.s32.s8 %r6563, %r6562; + cvt.u32.u16 %r6564, %rs4243; + cvt.s32.s8 %r6565, %r6564; + cvt.u32.u16 %r6566, %rs4242; + cvt.s32.s8 %r6567, %r6566; + mad.lo.s32 %r6568, %r42, %r6567, %r6559; + mad.lo.s32 %r6569, %r43, %r6565, %r6568; + mad.lo.s32 %r6570, %r45, %r6563, %r6569; + mad.lo.s32 %r6571, %r46, %r6561, %r6570; + ld.const.v4.u8 {%rs4250, %rs4251, %rs4252, %rs4253}, [matrix+2124]; + cvt.u32.u16 %r6572, %rs4253; + cvt.s32.s8 %r6573, %r6572; + cvt.u32.u16 %r6574, %rs4252; + cvt.s32.s8 %r6575, %r6574; + cvt.u32.u16 %r6576, %rs4251; + cvt.s32.s8 %r6577, %r6576; + cvt.u32.u16 %r6578, %rs4250; + cvt.s32.s8 %r6579, %r6578; + mad.lo.s32 %r6580, %r48, %r6579, %r6571; + mad.lo.s32 %r6581, %r49, %r6577, %r6580; + mad.lo.s32 %r6582, %r50, %r6575, %r6581; + mad.lo.s32 %r6583, %r51, %r6573, %r6582; + ld.const.v4.u8 {%rs4258, %rs4259, %rs4260, %rs4261}, [matrix+2128]; + cvt.u32.u16 %r6584, %rs4261; + cvt.s32.s8 %r6585, %r6584; + cvt.u32.u16 %r6586, %rs4260; + cvt.s32.s8 %r6587, %r6586; + cvt.u32.u16 %r6588, %rs4259; + cvt.s32.s8 %r6589, %r6588; + cvt.u32.u16 %r6590, %rs4258; + cvt.s32.s8 %r6591, %r6590; + mad.lo.s32 %r6592, %r173, %r6591, %r6583; + mad.lo.s32 %r6593, %r53, %r6589, %r6592; + mad.lo.s32 %r6594, %r54, %r6587, %r6593; + mad.lo.s32 %r6595, %r55, %r6585, %r6594; + ld.const.v4.u8 {%rs4266, %rs4267, %rs4268, %rs4269}, [matrix+2132]; + cvt.u32.u16 %r6596, %rs4269; + cvt.s32.s8 %r6597, %r6596; + cvt.u32.u16 %r6598, %rs4268; + cvt.s32.s8 %r6599, %r6598; + cvt.u32.u16 %r6600, %rs4267; + cvt.s32.s8 %r6601, %r6600; + cvt.u32.u16 %r6602, %rs4266; + cvt.s32.s8 %r6603, %r6602; + mad.lo.s32 %r6604, %r56, %r6603, %r6595; + mad.lo.s32 %r6605, %r57, %r6601, %r6604; + mad.lo.s32 %r6606, %r58, %r6599, %r6605; + mad.lo.s32 %r6607, %r59, %r6597, %r6606; + ld.const.v4.u8 {%rs4274, %rs4275, %rs4276, %rs4277}, [matrix+2136]; + cvt.u32.u16 %r6608, %rs4277; + cvt.s32.s8 %r6609, %r6608; + cvt.u32.u16 %r6610, %rs4276; + cvt.s32.s8 %r6611, %r6610; + cvt.u32.u16 %r6612, %rs4275; + cvt.s32.s8 %r6613, %r6612; + cvt.u32.u16 %r6614, %rs4274; + cvt.s32.s8 %r6615, %r6614; + mad.lo.s32 %r6616, %r61, %r6615, %r6607; + mad.lo.s32 %r6617, %r62, %r6613, %r6616; + mad.lo.s32 %r6618, %r64, %r6611, %r6617; + mad.lo.s32 %r6619, %r65, %r6609, %r6618; + ld.const.v4.u8 {%rs4282, %rs4283, %rs4284, %rs4285}, [matrix+2140]; + cvt.u32.u16 %r6620, %rs4285; + cvt.s32.s8 %r6621, %r6620; + cvt.u32.u16 %r6622, %rs4284; + cvt.s32.s8 %r6623, %r6622; + cvt.u32.u16 %r6624, %rs4283; + cvt.s32.s8 %r6625, %r6624; + cvt.u32.u16 %r6626, %rs4282; + cvt.s32.s8 %r6627, %r6626; + mad.lo.s32 %r6628, %r67, %r6627, %r6619; + mad.lo.s32 %r6629, %r68, %r6625, %r6628; + mad.lo.s32 %r6630, %r69, %r6623, %r6629; + mad.lo.s32 %r6631, %r70, %r6621, %r6630; + ld.const.v4.u8 {%rs4290, %rs4291, %rs4292, %rs4293}, [matrix+2144]; + cvt.u32.u16 %r6632, %rs4293; + cvt.s32.s8 %r6633, %r6632; + cvt.u32.u16 %r6634, %rs4292; + cvt.s32.s8 %r6635, %r6634; + cvt.u32.u16 %r6636, %rs4291; + cvt.s32.s8 %r6637, %r6636; + cvt.u32.u16 %r6638, %rs4290; + cvt.s32.s8 %r6639, %r6638; + mad.lo.s32 %r6640, %r222, %r6639, %r6631; + mad.lo.s32 %r6641, %r72, %r6637, %r6640; + 
mad.lo.s32 %r6642, %r73, %r6635, %r6641; + mad.lo.s32 %r6643, %r74, %r6633, %r6642; + ld.const.v4.u8 {%rs4298, %rs4299, %rs4300, %rs4301}, [matrix+2148]; + cvt.u32.u16 %r6644, %rs4301; + cvt.s32.s8 %r6645, %r6644; + cvt.u32.u16 %r6646, %rs4300; + cvt.s32.s8 %r6647, %r6646; + cvt.u32.u16 %r6648, %rs4299; + cvt.s32.s8 %r6649, %r6648; + cvt.u32.u16 %r6650, %rs4298; + cvt.s32.s8 %r6651, %r6650; + mad.lo.s32 %r6652, %r75, %r6651, %r6643; + mad.lo.s32 %r6653, %r76, %r6649, %r6652; + mad.lo.s32 %r6654, %r77, %r6647, %r6653; + mad.lo.s32 %r6655, %r78, %r6645, %r6654; + ld.const.v4.u8 {%rs4306, %rs4307, %rs4308, %rs4309}, [matrix+2152]; + cvt.u32.u16 %r6656, %rs4309; + cvt.s32.s8 %r6657, %r6656; + cvt.u32.u16 %r6658, %rs4308; + cvt.s32.s8 %r6659, %r6658; + cvt.u32.u16 %r6660, %rs4307; + cvt.s32.s8 %r6661, %r6660; + cvt.u32.u16 %r6662, %rs4306; + cvt.s32.s8 %r6663, %r6662; + mad.lo.s32 %r6664, %r80, %r6663, %r6655; + mad.lo.s32 %r6665, %r81, %r6661, %r6664; + mad.lo.s32 %r6666, %r83, %r6659, %r6665; + mad.lo.s32 %r6667, %r84, %r6657, %r6666; + ld.const.v4.u8 {%rs4314, %rs4315, %rs4316, %rs4317}, [matrix+2156]; + cvt.u32.u16 %r6668, %rs4317; + cvt.s32.s8 %r6669, %r6668; + cvt.u32.u16 %r6670, %rs4316; + cvt.s32.s8 %r6671, %r6670; + cvt.u32.u16 %r6672, %rs4315; + cvt.s32.s8 %r6673, %r6672; + cvt.u32.u16 %r6674, %rs4314; + cvt.s32.s8 %r6675, %r6674; + mad.lo.s32 %r6676, %r86, %r6675, %r6667; + mad.lo.s32 %r6677, %r87, %r6673, %r6676; + mad.lo.s32 %r6678, %r88, %r6671, %r6677; + mad.lo.s32 %r6679, %r89, %r6669, %r6678; + ld.const.v4.u8 {%rs4322, %rs4323, %rs4324, %rs4325}, [matrix+2160]; + cvt.u32.u16 %r6680, %rs4325; + cvt.s32.s8 %r6681, %r6680; + cvt.u32.u16 %r6682, %rs4324; + cvt.s32.s8 %r6683, %r6682; + cvt.u32.u16 %r6684, %rs4323; + cvt.s32.s8 %r6685, %r6684; + cvt.u32.u16 %r6686, %rs4322; + cvt.s32.s8 %r6687, %r6686; + mad.lo.s32 %r6688, %r271, %r6687, %r6679; + mad.lo.s32 %r6689, %r91, %r6685, %r6688; + mad.lo.s32 %r6690, %r93, %r6683, %r6689; + mad.lo.s32 %r6691, %r94, %r6681, %r6690; + ld.const.v4.u8 {%rs4330, %rs4331, %rs4332, %rs4333}, [matrix+2164]; + cvt.u32.u16 %r6692, %rs4333; + cvt.s32.s8 %r6693, %r6692; + cvt.u32.u16 %r6694, %rs4332; + cvt.s32.s8 %r6695, %r6694; + cvt.u32.u16 %r6696, %rs4331; + cvt.s32.s8 %r6697, %r6696; + cvt.u32.u16 %r6698, %rs4330; + cvt.s32.s8 %r6699, %r6698; + mad.lo.s32 %r6700, %r96, %r6699, %r6691; + mad.lo.s32 %r6701, %r97, %r6697, %r6700; + mad.lo.s32 %r6702, %r99, %r6695, %r6701; + mad.lo.s32 %r6703, %r100, %r6693, %r6702; + ld.const.v4.u8 {%rs4338, %rs4339, %rs4340, %rs4341}, [matrix+2168]; + cvt.u32.u16 %r6704, %rs4341; + cvt.s32.s8 %r6705, %r6704; + cvt.u32.u16 %r6706, %rs4340; + cvt.s32.s8 %r6707, %r6706; + cvt.u32.u16 %r6708, %rs4339; + cvt.s32.s8 %r6709, %r6708; + cvt.u32.u16 %r6710, %rs4338; + cvt.s32.s8 %r6711, %r6710; + mad.lo.s32 %r6712, %r103, %r6711, %r6703; + mad.lo.s32 %r6713, %r104, %r6709, %r6712; + mad.lo.s32 %r6714, %r107, %r6707, %r6713; + mad.lo.s32 %r6715, %r108, %r6705, %r6714; + ld.const.v4.u8 {%rs4346, %rs4347, %rs4348, %rs4349}, [matrix+2172]; + cvt.u32.u16 %r6716, %rs4349; + cvt.s32.s8 %r6717, %r6716; + cvt.u32.u16 %r6718, %rs4348; + cvt.s32.s8 %r6719, %r6718; + cvt.u32.u16 %r6720, %rs4347; + cvt.s32.s8 %r6721, %r6720; + cvt.u32.u16 %r6722, %rs4346; + cvt.s32.s8 %r6723, %r6722; + mad.lo.s32 %r6724, %r111, %r6723, %r6715; + mad.lo.s32 %r6725, %r112, %r6721, %r6724; + mad.lo.s32 %r6726, %r114, %r6719, %r6725; + mad.lo.s32 %r6727, %r115, %r6717, %r6726; + shr.u32 %r6728, %r6535, 6; + and.b32 %r6729, %r6728, 240; + shr.u32 %r6730, %r6727, 
10; + or.b32 %r6731, %r6730, %r6729; + xor.b32 %r6732, %r71, %r6731; + cvt.u64.u32 %rd393, %r6732; + ld.const.v4.u8 {%rs4354, %rs4355, %rs4356, %rs4357}, [matrix+2176]; + cvt.u32.u16 %r6733, %rs4357; + cvt.s32.s8 %r6734, %r6733; + cvt.u32.u16 %r6735, %rs4356; + cvt.s32.s8 %r6736, %r6735; + cvt.u32.u16 %r6737, %rs4354; + cvt.s32.s8 %r6738, %r6737; + cvt.u32.u16 %r6739, %rs4355; + cvt.s32.s8 %r6740, %r6739; + mul.lo.s32 %r6741, %r34, %r6740; + mad.lo.s32 %r6742, %r124, %r6738, %r6741; + mad.lo.s32 %r6743, %r35, %r6736, %r6742; + mad.lo.s32 %r6744, %r36, %r6734, %r6743; + ld.const.v4.u8 {%rs4362, %rs4363, %rs4364, %rs4365}, [matrix+2180]; + cvt.u32.u16 %r6745, %rs4365; + cvt.s32.s8 %r6746, %r6745; + cvt.u32.u16 %r6747, %rs4364; + cvt.s32.s8 %r6748, %r6747; + cvt.u32.u16 %r6749, %rs4363; + cvt.s32.s8 %r6750, %r6749; + cvt.u32.u16 %r6751, %rs4362; + cvt.s32.s8 %r6752, %r6751; + mad.lo.s32 %r6753, %r37, %r6752, %r6744; + mad.lo.s32 %r6754, %r38, %r6750, %r6753; + mad.lo.s32 %r6755, %r39, %r6748, %r6754; + mad.lo.s32 %r6756, %r40, %r6746, %r6755; + ld.const.v4.u8 {%rs4370, %rs4371, %rs4372, %rs4373}, [matrix+2184]; + cvt.u32.u16 %r6757, %rs4373; + cvt.s32.s8 %r6758, %r6757; + cvt.u32.u16 %r6759, %rs4372; + cvt.s32.s8 %r6760, %r6759; + cvt.u32.u16 %r6761, %rs4371; + cvt.s32.s8 %r6762, %r6761; + cvt.u32.u16 %r6763, %rs4370; + cvt.s32.s8 %r6764, %r6763; + mad.lo.s32 %r6765, %r42, %r6764, %r6756; + mad.lo.s32 %r6766, %r43, %r6762, %r6765; + mad.lo.s32 %r6767, %r45, %r6760, %r6766; + mad.lo.s32 %r6768, %r46, %r6758, %r6767; + ld.const.v4.u8 {%rs4378, %rs4379, %rs4380, %rs4381}, [matrix+2188]; + cvt.u32.u16 %r6769, %rs4381; + cvt.s32.s8 %r6770, %r6769; + cvt.u32.u16 %r6771, %rs4380; + cvt.s32.s8 %r6772, %r6771; + cvt.u32.u16 %r6773, %rs4379; + cvt.s32.s8 %r6774, %r6773; + cvt.u32.u16 %r6775, %rs4378; + cvt.s32.s8 %r6776, %r6775; + mad.lo.s32 %r6777, %r48, %r6776, %r6768; + mad.lo.s32 %r6778, %r49, %r6774, %r6777; + mad.lo.s32 %r6779, %r50, %r6772, %r6778; + mad.lo.s32 %r6780, %r51, %r6770, %r6779; + ld.const.v4.u8 {%rs4386, %rs4387, %rs4388, %rs4389}, [matrix+2192]; + cvt.u32.u16 %r6781, %rs4389; + cvt.s32.s8 %r6782, %r6781; + cvt.u32.u16 %r6783, %rs4388; + cvt.s32.s8 %r6784, %r6783; + cvt.u32.u16 %r6785, %rs4387; + cvt.s32.s8 %r6786, %r6785; + cvt.u32.u16 %r6787, %rs4386; + cvt.s32.s8 %r6788, %r6787; + mad.lo.s32 %r6789, %r173, %r6788, %r6780; + mad.lo.s32 %r6790, %r53, %r6786, %r6789; + mad.lo.s32 %r6791, %r54, %r6784, %r6790; + mad.lo.s32 %r6792, %r55, %r6782, %r6791; + ld.const.v4.u8 {%rs4394, %rs4395, %rs4396, %rs4397}, [matrix+2196]; + cvt.u32.u16 %r6793, %rs4397; + cvt.s32.s8 %r6794, %r6793; + cvt.u32.u16 %r6795, %rs4396; + cvt.s32.s8 %r6796, %r6795; + cvt.u32.u16 %r6797, %rs4395; + cvt.s32.s8 %r6798, %r6797; + cvt.u32.u16 %r6799, %rs4394; + cvt.s32.s8 %r6800, %r6799; + mad.lo.s32 %r6801, %r56, %r6800, %r6792; + mad.lo.s32 %r6802, %r57, %r6798, %r6801; + mad.lo.s32 %r6803, %r58, %r6796, %r6802; + mad.lo.s32 %r6804, %r59, %r6794, %r6803; + ld.const.v4.u8 {%rs4402, %rs4403, %rs4404, %rs4405}, [matrix+2200]; + cvt.u32.u16 %r6805, %rs4405; + cvt.s32.s8 %r6806, %r6805; + cvt.u32.u16 %r6807, %rs4404; + cvt.s32.s8 %r6808, %r6807; + cvt.u32.u16 %r6809, %rs4403; + cvt.s32.s8 %r6810, %r6809; + cvt.u32.u16 %r6811, %rs4402; + cvt.s32.s8 %r6812, %r6811; + mad.lo.s32 %r6813, %r61, %r6812, %r6804; + mad.lo.s32 %r6814, %r62, %r6810, %r6813; + mad.lo.s32 %r6815, %r64, %r6808, %r6814; + mad.lo.s32 %r6816, %r65, %r6806, %r6815; + ld.const.v4.u8 {%rs4410, %rs4411, %rs4412, %rs4413}, [matrix+2204]; + cvt.u32.u16 
%r6817, %rs4413; + cvt.s32.s8 %r6818, %r6817; + cvt.u32.u16 %r6819, %rs4412; + cvt.s32.s8 %r6820, %r6819; + cvt.u32.u16 %r6821, %rs4411; + cvt.s32.s8 %r6822, %r6821; + cvt.u32.u16 %r6823, %rs4410; + cvt.s32.s8 %r6824, %r6823; + mad.lo.s32 %r6825, %r67, %r6824, %r6816; + mad.lo.s32 %r6826, %r68, %r6822, %r6825; + mad.lo.s32 %r6827, %r69, %r6820, %r6826; + mad.lo.s32 %r6828, %r70, %r6818, %r6827; + ld.const.v4.u8 {%rs4418, %rs4419, %rs4420, %rs4421}, [matrix+2208]; + cvt.u32.u16 %r6829, %rs4421; + cvt.s32.s8 %r6830, %r6829; + cvt.u32.u16 %r6831, %rs4420; + cvt.s32.s8 %r6832, %r6831; + cvt.u32.u16 %r6833, %rs4419; + cvt.s32.s8 %r6834, %r6833; + cvt.u32.u16 %r6835, %rs4418; + cvt.s32.s8 %r6836, %r6835; + mad.lo.s32 %r6837, %r222, %r6836, %r6828; + mad.lo.s32 %r6838, %r72, %r6834, %r6837; + mad.lo.s32 %r6839, %r73, %r6832, %r6838; + mad.lo.s32 %r6840, %r74, %r6830, %r6839; + ld.const.v4.u8 {%rs4426, %rs4427, %rs4428, %rs4429}, [matrix+2212]; + cvt.u32.u16 %r6841, %rs4429; + cvt.s32.s8 %r6842, %r6841; + cvt.u32.u16 %r6843, %rs4428; + cvt.s32.s8 %r6844, %r6843; + cvt.u32.u16 %r6845, %rs4427; + cvt.s32.s8 %r6846, %r6845; + cvt.u32.u16 %r6847, %rs4426; + cvt.s32.s8 %r6848, %r6847; + mad.lo.s32 %r6849, %r75, %r6848, %r6840; + mad.lo.s32 %r6850, %r76, %r6846, %r6849; + mad.lo.s32 %r6851, %r77, %r6844, %r6850; + mad.lo.s32 %r6852, %r78, %r6842, %r6851; + ld.const.v4.u8 {%rs4434, %rs4435, %rs4436, %rs4437}, [matrix+2216]; + cvt.u32.u16 %r6853, %rs4437; + cvt.s32.s8 %r6854, %r6853; + cvt.u32.u16 %r6855, %rs4436; + cvt.s32.s8 %r6856, %r6855; + cvt.u32.u16 %r6857, %rs4435; + cvt.s32.s8 %r6858, %r6857; + cvt.u32.u16 %r6859, %rs4434; + cvt.s32.s8 %r6860, %r6859; + mad.lo.s32 %r6861, %r80, %r6860, %r6852; + mad.lo.s32 %r6862, %r81, %r6858, %r6861; + mad.lo.s32 %r6863, %r83, %r6856, %r6862; + mad.lo.s32 %r6864, %r84, %r6854, %r6863; + ld.const.v4.u8 {%rs4442, %rs4443, %rs4444, %rs4445}, [matrix+2220]; + cvt.u32.u16 %r6865, %rs4445; + cvt.s32.s8 %r6866, %r6865; + cvt.u32.u16 %r6867, %rs4444; + cvt.s32.s8 %r6868, %r6867; + cvt.u32.u16 %r6869, %rs4443; + cvt.s32.s8 %r6870, %r6869; + cvt.u32.u16 %r6871, %rs4442; + cvt.s32.s8 %r6872, %r6871; + mad.lo.s32 %r6873, %r86, %r6872, %r6864; + mad.lo.s32 %r6874, %r87, %r6870, %r6873; + mad.lo.s32 %r6875, %r88, %r6868, %r6874; + mad.lo.s32 %r6876, %r89, %r6866, %r6875; + ld.const.v4.u8 {%rs4450, %rs4451, %rs4452, %rs4453}, [matrix+2224]; + cvt.u32.u16 %r6877, %rs4453; + cvt.s32.s8 %r6878, %r6877; + cvt.u32.u16 %r6879, %rs4452; + cvt.s32.s8 %r6880, %r6879; + cvt.u32.u16 %r6881, %rs4451; + cvt.s32.s8 %r6882, %r6881; + cvt.u32.u16 %r6883, %rs4450; + cvt.s32.s8 %r6884, %r6883; + mad.lo.s32 %r6885, %r271, %r6884, %r6876; + mad.lo.s32 %r6886, %r91, %r6882, %r6885; + mad.lo.s32 %r6887, %r93, %r6880, %r6886; + mad.lo.s32 %r6888, %r94, %r6878, %r6887; + ld.const.v4.u8 {%rs4458, %rs4459, %rs4460, %rs4461}, [matrix+2228]; + cvt.u32.u16 %r6889, %rs4461; + cvt.s32.s8 %r6890, %r6889; + cvt.u32.u16 %r6891, %rs4460; + cvt.s32.s8 %r6892, %r6891; + cvt.u32.u16 %r6893, %rs4459; + cvt.s32.s8 %r6894, %r6893; + cvt.u32.u16 %r6895, %rs4458; + cvt.s32.s8 %r6896, %r6895; + mad.lo.s32 %r6897, %r96, %r6896, %r6888; + mad.lo.s32 %r6898, %r97, %r6894, %r6897; + mad.lo.s32 %r6899, %r99, %r6892, %r6898; + mad.lo.s32 %r6900, %r100, %r6890, %r6899; + ld.const.v4.u8 {%rs4466, %rs4467, %rs4468, %rs4469}, [matrix+2232]; + cvt.u32.u16 %r6901, %rs4469; + cvt.s32.s8 %r6902, %r6901; + cvt.u32.u16 %r6903, %rs4468; + cvt.s32.s8 %r6904, %r6903; + cvt.u32.u16 %r6905, %rs4467; + cvt.s32.s8 %r6906, %r6905; + 
cvt.u32.u16 %r6907, %rs4466; + cvt.s32.s8 %r6908, %r6907; + mad.lo.s32 %r6909, %r103, %r6908, %r6900; + mad.lo.s32 %r6910, %r104, %r6906, %r6909; + mad.lo.s32 %r6911, %r107, %r6904, %r6910; + mad.lo.s32 %r6912, %r108, %r6902, %r6911; + ld.const.v4.u8 {%rs4474, %rs4475, %rs4476, %rs4477}, [matrix+2236]; + cvt.u32.u16 %r6913, %rs4477; + cvt.s32.s8 %r6914, %r6913; + cvt.u32.u16 %r6915, %rs4476; + cvt.s32.s8 %r6916, %r6915; + cvt.u32.u16 %r6917, %rs4475; + cvt.s32.s8 %r6918, %r6917; + cvt.u32.u16 %r6919, %rs4474; + cvt.s32.s8 %r6920, %r6919; + mad.lo.s32 %r6921, %r111, %r6920, %r6912; + mad.lo.s32 %r6922, %r112, %r6918, %r6921; + mad.lo.s32 %r6923, %r114, %r6916, %r6922; + mad.lo.s32 %r6924, %r115, %r6914, %r6923; + ld.const.v4.u8 {%rs4482, %rs4483, %rs4484, %rs4485}, [matrix+2240]; + cvt.u32.u16 %r6925, %rs4485; + cvt.s32.s8 %r6926, %r6925; + cvt.u32.u16 %r6927, %rs4484; + cvt.s32.s8 %r6928, %r6927; + cvt.u32.u16 %r6929, %rs4482; + cvt.s32.s8 %r6930, %r6929; + cvt.u32.u16 %r6931, %rs4483; + cvt.s32.s8 %r6932, %r6931; + mul.lo.s32 %r6933, %r34, %r6932; + mad.lo.s32 %r6934, %r124, %r6930, %r6933; + mad.lo.s32 %r6935, %r35, %r6928, %r6934; + mad.lo.s32 %r6936, %r36, %r6926, %r6935; + ld.const.v4.u8 {%rs4490, %rs4491, %rs4492, %rs4493}, [matrix+2244]; + cvt.u32.u16 %r6937, %rs4493; + cvt.s32.s8 %r6938, %r6937; + cvt.u32.u16 %r6939, %rs4492; + cvt.s32.s8 %r6940, %r6939; + cvt.u32.u16 %r6941, %rs4491; + cvt.s32.s8 %r6942, %r6941; + cvt.u32.u16 %r6943, %rs4490; + cvt.s32.s8 %r6944, %r6943; + mad.lo.s32 %r6945, %r37, %r6944, %r6936; + mad.lo.s32 %r6946, %r38, %r6942, %r6945; + mad.lo.s32 %r6947, %r39, %r6940, %r6946; + mad.lo.s32 %r6948, %r40, %r6938, %r6947; + ld.const.v4.u8 {%rs4498, %rs4499, %rs4500, %rs4501}, [matrix+2248]; + cvt.u32.u16 %r6949, %rs4501; + cvt.s32.s8 %r6950, %r6949; + cvt.u32.u16 %r6951, %rs4500; + cvt.s32.s8 %r6952, %r6951; + cvt.u32.u16 %r6953, %rs4499; + cvt.s32.s8 %r6954, %r6953; + cvt.u32.u16 %r6955, %rs4498; + cvt.s32.s8 %r6956, %r6955; + mad.lo.s32 %r6957, %r42, %r6956, %r6948; + mad.lo.s32 %r6958, %r43, %r6954, %r6957; + mad.lo.s32 %r6959, %r45, %r6952, %r6958; + mad.lo.s32 %r6960, %r46, %r6950, %r6959; + ld.const.v4.u8 {%rs4506, %rs4507, %rs4508, %rs4509}, [matrix+2252]; + cvt.u32.u16 %r6961, %rs4509; + cvt.s32.s8 %r6962, %r6961; + cvt.u32.u16 %r6963, %rs4508; + cvt.s32.s8 %r6964, %r6963; + cvt.u32.u16 %r6965, %rs4507; + cvt.s32.s8 %r6966, %r6965; + cvt.u32.u16 %r6967, %rs4506; + cvt.s32.s8 %r6968, %r6967; + mad.lo.s32 %r6969, %r48, %r6968, %r6960; + mad.lo.s32 %r6970, %r49, %r6966, %r6969; + mad.lo.s32 %r6971, %r50, %r6964, %r6970; + mad.lo.s32 %r6972, %r51, %r6962, %r6971; + ld.const.v4.u8 {%rs4514, %rs4515, %rs4516, %rs4517}, [matrix+2256]; + cvt.u32.u16 %r6973, %rs4517; + cvt.s32.s8 %r6974, %r6973; + cvt.u32.u16 %r6975, %rs4516; + cvt.s32.s8 %r6976, %r6975; + cvt.u32.u16 %r6977, %rs4515; + cvt.s32.s8 %r6978, %r6977; + cvt.u32.u16 %r6979, %rs4514; + cvt.s32.s8 %r6980, %r6979; + mad.lo.s32 %r6981, %r173, %r6980, %r6972; + mad.lo.s32 %r6982, %r53, %r6978, %r6981; + mad.lo.s32 %r6983, %r54, %r6976, %r6982; + mad.lo.s32 %r6984, %r55, %r6974, %r6983; + ld.const.v4.u8 {%rs4522, %rs4523, %rs4524, %rs4525}, [matrix+2260]; + cvt.u32.u16 %r6985, %rs4525; + cvt.s32.s8 %r6986, %r6985; + cvt.u32.u16 %r6987, %rs4524; + cvt.s32.s8 %r6988, %r6987; + cvt.u32.u16 %r6989, %rs4523; + cvt.s32.s8 %r6990, %r6989; + cvt.u32.u16 %r6991, %rs4522; + cvt.s32.s8 %r6992, %r6991; + mad.lo.s32 %r6993, %r56, %r6992, %r6984; + mad.lo.s32 %r6994, %r57, %r6990, %r6993; + mad.lo.s32 %r6995, %r58, %r6988, 
%r6994; + mad.lo.s32 %r6996, %r59, %r6986, %r6995; + ld.const.v4.u8 {%rs4530, %rs4531, %rs4532, %rs4533}, [matrix+2264]; + cvt.u32.u16 %r6997, %rs4533; + cvt.s32.s8 %r6998, %r6997; + cvt.u32.u16 %r6999, %rs4532; + cvt.s32.s8 %r7000, %r6999; + cvt.u32.u16 %r7001, %rs4531; + cvt.s32.s8 %r7002, %r7001; + cvt.u32.u16 %r7003, %rs4530; + cvt.s32.s8 %r7004, %r7003; + mad.lo.s32 %r7005, %r61, %r7004, %r6996; + mad.lo.s32 %r7006, %r62, %r7002, %r7005; + mad.lo.s32 %r7007, %r64, %r7000, %r7006; + mad.lo.s32 %r7008, %r65, %r6998, %r7007; + ld.const.v4.u8 {%rs4538, %rs4539, %rs4540, %rs4541}, [matrix+2268]; + cvt.u32.u16 %r7009, %rs4541; + cvt.s32.s8 %r7010, %r7009; + cvt.u32.u16 %r7011, %rs4540; + cvt.s32.s8 %r7012, %r7011; + cvt.u32.u16 %r7013, %rs4539; + cvt.s32.s8 %r7014, %r7013; + cvt.u32.u16 %r7015, %rs4538; + cvt.s32.s8 %r7016, %r7015; + mad.lo.s32 %r7017, %r67, %r7016, %r7008; + mad.lo.s32 %r7018, %r68, %r7014, %r7017; + mad.lo.s32 %r7019, %r69, %r7012, %r7018; + mad.lo.s32 %r7020, %r70, %r7010, %r7019; + ld.const.v4.u8 {%rs4546, %rs4547, %rs4548, %rs4549}, [matrix+2272]; + cvt.u32.u16 %r7021, %rs4549; + cvt.s32.s8 %r7022, %r7021; + cvt.u32.u16 %r7023, %rs4548; + cvt.s32.s8 %r7024, %r7023; + cvt.u32.u16 %r7025, %rs4547; + cvt.s32.s8 %r7026, %r7025; + cvt.u32.u16 %r7027, %rs4546; + cvt.s32.s8 %r7028, %r7027; + mad.lo.s32 %r7029, %r222, %r7028, %r7020; + mad.lo.s32 %r7030, %r72, %r7026, %r7029; + mad.lo.s32 %r7031, %r73, %r7024, %r7030; + mad.lo.s32 %r7032, %r74, %r7022, %r7031; + ld.const.v4.u8 {%rs4554, %rs4555, %rs4556, %rs4557}, [matrix+2276]; + cvt.u32.u16 %r7033, %rs4557; + cvt.s32.s8 %r7034, %r7033; + cvt.u32.u16 %r7035, %rs4556; + cvt.s32.s8 %r7036, %r7035; + cvt.u32.u16 %r7037, %rs4555; + cvt.s32.s8 %r7038, %r7037; + cvt.u32.u16 %r7039, %rs4554; + cvt.s32.s8 %r7040, %r7039; + mad.lo.s32 %r7041, %r75, %r7040, %r7032; + mad.lo.s32 %r7042, %r76, %r7038, %r7041; + mad.lo.s32 %r7043, %r77, %r7036, %r7042; + mad.lo.s32 %r7044, %r78, %r7034, %r7043; + ld.const.v4.u8 {%rs4562, %rs4563, %rs4564, %rs4565}, [matrix+2280]; + cvt.u32.u16 %r7045, %rs4565; + cvt.s32.s8 %r7046, %r7045; + cvt.u32.u16 %r7047, %rs4564; + cvt.s32.s8 %r7048, %r7047; + cvt.u32.u16 %r7049, %rs4563; + cvt.s32.s8 %r7050, %r7049; + cvt.u32.u16 %r7051, %rs4562; + cvt.s32.s8 %r7052, %r7051; + mad.lo.s32 %r7053, %r80, %r7052, %r7044; + mad.lo.s32 %r7054, %r81, %r7050, %r7053; + mad.lo.s32 %r7055, %r83, %r7048, %r7054; + mad.lo.s32 %r7056, %r84, %r7046, %r7055; + ld.const.v4.u8 {%rs4570, %rs4571, %rs4572, %rs4573}, [matrix+2284]; + cvt.u32.u16 %r7057, %rs4573; + cvt.s32.s8 %r7058, %r7057; + cvt.u32.u16 %r7059, %rs4572; + cvt.s32.s8 %r7060, %r7059; + cvt.u32.u16 %r7061, %rs4571; + cvt.s32.s8 %r7062, %r7061; + cvt.u32.u16 %r7063, %rs4570; + cvt.s32.s8 %r7064, %r7063; + mad.lo.s32 %r7065, %r86, %r7064, %r7056; + mad.lo.s32 %r7066, %r87, %r7062, %r7065; + mad.lo.s32 %r7067, %r88, %r7060, %r7066; + mad.lo.s32 %r7068, %r89, %r7058, %r7067; + ld.const.v4.u8 {%rs4578, %rs4579, %rs4580, %rs4581}, [matrix+2288]; + cvt.u32.u16 %r7069, %rs4581; + cvt.s32.s8 %r7070, %r7069; + cvt.u32.u16 %r7071, %rs4580; + cvt.s32.s8 %r7072, %r7071; + cvt.u32.u16 %r7073, %rs4579; + cvt.s32.s8 %r7074, %r7073; + cvt.u32.u16 %r7075, %rs4578; + cvt.s32.s8 %r7076, %r7075; + mad.lo.s32 %r7077, %r271, %r7076, %r7068; + mad.lo.s32 %r7078, %r91, %r7074, %r7077; + mad.lo.s32 %r7079, %r93, %r7072, %r7078; + mad.lo.s32 %r7080, %r94, %r7070, %r7079; + ld.const.v4.u8 {%rs4586, %rs4587, %rs4588, %rs4589}, [matrix+2292]; + cvt.u32.u16 %r7081, %rs4589; + cvt.s32.s8 %r7082, 
%r7081; + cvt.u32.u16 %r7083, %rs4588; + cvt.s32.s8 %r7084, %r7083; + cvt.u32.u16 %r7085, %rs4587; + cvt.s32.s8 %r7086, %r7085; + cvt.u32.u16 %r7087, %rs4586; + cvt.s32.s8 %r7088, %r7087; + mad.lo.s32 %r7089, %r96, %r7088, %r7080; + mad.lo.s32 %r7090, %r97, %r7086, %r7089; + mad.lo.s32 %r7091, %r99, %r7084, %r7090; + mad.lo.s32 %r7092, %r100, %r7082, %r7091; + ld.const.v4.u8 {%rs4594, %rs4595, %rs4596, %rs4597}, [matrix+2296]; + cvt.u32.u16 %r7093, %rs4597; + cvt.s32.s8 %r7094, %r7093; + cvt.u32.u16 %r7095, %rs4596; + cvt.s32.s8 %r7096, %r7095; + cvt.u32.u16 %r7097, %rs4595; + cvt.s32.s8 %r7098, %r7097; + cvt.u32.u16 %r7099, %rs4594; + cvt.s32.s8 %r7100, %r7099; + mad.lo.s32 %r7101, %r103, %r7100, %r7092; + mad.lo.s32 %r7102, %r104, %r7098, %r7101; + mad.lo.s32 %r7103, %r107, %r7096, %r7102; + mad.lo.s32 %r7104, %r108, %r7094, %r7103; + ld.const.v4.u8 {%rs4602, %rs4603, %rs4604, %rs4605}, [matrix+2300]; + cvt.u32.u16 %r7105, %rs4605; + cvt.s32.s8 %r7106, %r7105; + cvt.u32.u16 %r7107, %rs4604; + cvt.s32.s8 %r7108, %r7107; + cvt.u32.u16 %r7109, %rs4603; + cvt.s32.s8 %r7110, %r7109; + cvt.u32.u16 %r7111, %rs4602; + cvt.s32.s8 %r7112, %r7111; + mad.lo.s32 %r7113, %r111, %r7112, %r7104; + mad.lo.s32 %r7114, %r112, %r7110, %r7113; + mad.lo.s32 %r7115, %r114, %r7108, %r7114; + mad.lo.s32 %r7116, %r115, %r7106, %r7115; + shr.u32 %r7117, %r6924, 6; + and.b32 %r7118, %r7117, 240; + shr.u32 %r7119, %r7116, 10; + or.b32 %r7120, %r7119, %r7118; + xor.b32 %r7121, %r26, %r7120; + ld.const.v4.u8 {%rs4610, %rs4611, %rs4612, %rs4613}, [matrix+2304]; + cvt.u32.u16 %r7122, %rs4613; + cvt.s32.s8 %r7123, %r7122; + cvt.u32.u16 %r7124, %rs4612; + cvt.s32.s8 %r7125, %r7124; + cvt.u32.u16 %r7126, %rs4610; + cvt.s32.s8 %r7127, %r7126; + cvt.u32.u16 %r7128, %rs4611; + cvt.s32.s8 %r7129, %r7128; + mul.lo.s32 %r7130, %r34, %r7129; + mad.lo.s32 %r7131, %r124, %r7127, %r7130; + mad.lo.s32 %r7132, %r35, %r7125, %r7131; + mad.lo.s32 %r7133, %r36, %r7123, %r7132; + ld.const.v4.u8 {%rs4618, %rs4619, %rs4620, %rs4621}, [matrix+2308]; + cvt.u32.u16 %r7134, %rs4621; + cvt.s32.s8 %r7135, %r7134; + cvt.u32.u16 %r7136, %rs4620; + cvt.s32.s8 %r7137, %r7136; + cvt.u32.u16 %r7138, %rs4619; + cvt.s32.s8 %r7139, %r7138; + cvt.u32.u16 %r7140, %rs4618; + cvt.s32.s8 %r7141, %r7140; + mad.lo.s32 %r7142, %r37, %r7141, %r7133; + mad.lo.s32 %r7143, %r38, %r7139, %r7142; + mad.lo.s32 %r7144, %r39, %r7137, %r7143; + mad.lo.s32 %r7145, %r40, %r7135, %r7144; + ld.const.v4.u8 {%rs4626, %rs4627, %rs4628, %rs4629}, [matrix+2312]; + cvt.u32.u16 %r7146, %rs4629; + cvt.s32.s8 %r7147, %r7146; + cvt.u32.u16 %r7148, %rs4628; + cvt.s32.s8 %r7149, %r7148; + cvt.u32.u16 %r7150, %rs4627; + cvt.s32.s8 %r7151, %r7150; + cvt.u32.u16 %r7152, %rs4626; + cvt.s32.s8 %r7153, %r7152; + mad.lo.s32 %r7154, %r42, %r7153, %r7145; + mad.lo.s32 %r7155, %r43, %r7151, %r7154; + mad.lo.s32 %r7156, %r45, %r7149, %r7155; + mad.lo.s32 %r7157, %r46, %r7147, %r7156; + ld.const.v4.u8 {%rs4634, %rs4635, %rs4636, %rs4637}, [matrix+2316]; + cvt.u32.u16 %r7158, %rs4637; + cvt.s32.s8 %r7159, %r7158; + cvt.u32.u16 %r7160, %rs4636; + cvt.s32.s8 %r7161, %r7160; + cvt.u32.u16 %r7162, %rs4635; + cvt.s32.s8 %r7163, %r7162; + cvt.u32.u16 %r7164, %rs4634; + cvt.s32.s8 %r7165, %r7164; + mad.lo.s32 %r7166, %r48, %r7165, %r7157; + mad.lo.s32 %r7167, %r49, %r7163, %r7166; + mad.lo.s32 %r7168, %r50, %r7161, %r7167; + mad.lo.s32 %r7169, %r51, %r7159, %r7168; + ld.const.v4.u8 {%rs4642, %rs4643, %rs4644, %rs4645}, [matrix+2320]; + cvt.u32.u16 %r7170, %rs4645; + cvt.s32.s8 %r7171, %r7170; + cvt.u32.u16 
%r7172, %rs4644; + cvt.s32.s8 %r7173, %r7172; + cvt.u32.u16 %r7174, %rs4643; + cvt.s32.s8 %r7175, %r7174; + cvt.u32.u16 %r7176, %rs4642; + cvt.s32.s8 %r7177, %r7176; + mad.lo.s32 %r7178, %r173, %r7177, %r7169; + mad.lo.s32 %r7179, %r53, %r7175, %r7178; + mad.lo.s32 %r7180, %r54, %r7173, %r7179; + mad.lo.s32 %r7181, %r55, %r7171, %r7180; + ld.const.v4.u8 {%rs4650, %rs4651, %rs4652, %rs4653}, [matrix+2324]; + cvt.u32.u16 %r7182, %rs4653; + cvt.s32.s8 %r7183, %r7182; + cvt.u32.u16 %r7184, %rs4652; + cvt.s32.s8 %r7185, %r7184; + cvt.u32.u16 %r7186, %rs4651; + cvt.s32.s8 %r7187, %r7186; + cvt.u32.u16 %r7188, %rs4650; + cvt.s32.s8 %r7189, %r7188; + mad.lo.s32 %r7190, %r56, %r7189, %r7181; + mad.lo.s32 %r7191, %r57, %r7187, %r7190; + mad.lo.s32 %r7192, %r58, %r7185, %r7191; + mad.lo.s32 %r7193, %r59, %r7183, %r7192; + ld.const.v4.u8 {%rs4658, %rs4659, %rs4660, %rs4661}, [matrix+2328]; + cvt.u32.u16 %r7194, %rs4661; + cvt.s32.s8 %r7195, %r7194; + cvt.u32.u16 %r7196, %rs4660; + cvt.s32.s8 %r7197, %r7196; + cvt.u32.u16 %r7198, %rs4659; + cvt.s32.s8 %r7199, %r7198; + cvt.u32.u16 %r7200, %rs4658; + cvt.s32.s8 %r7201, %r7200; + mad.lo.s32 %r7202, %r61, %r7201, %r7193; + mad.lo.s32 %r7203, %r62, %r7199, %r7202; + mad.lo.s32 %r7204, %r64, %r7197, %r7203; + mad.lo.s32 %r7205, %r65, %r7195, %r7204; + ld.const.v4.u8 {%rs4666, %rs4667, %rs4668, %rs4669}, [matrix+2332]; + cvt.u32.u16 %r7206, %rs4669; + cvt.s32.s8 %r7207, %r7206; + cvt.u32.u16 %r7208, %rs4668; + cvt.s32.s8 %r7209, %r7208; + cvt.u32.u16 %r7210, %rs4667; + cvt.s32.s8 %r7211, %r7210; + cvt.u32.u16 %r7212, %rs4666; + cvt.s32.s8 %r7213, %r7212; + mad.lo.s32 %r7214, %r67, %r7213, %r7205; + mad.lo.s32 %r7215, %r68, %r7211, %r7214; + mad.lo.s32 %r7216, %r69, %r7209, %r7215; + mad.lo.s32 %r7217, %r70, %r7207, %r7216; + ld.const.v4.u8 {%rs4674, %rs4675, %rs4676, %rs4677}, [matrix+2336]; + cvt.u32.u16 %r7218, %rs4677; + cvt.s32.s8 %r7219, %r7218; + cvt.u32.u16 %r7220, %rs4676; + cvt.s32.s8 %r7221, %r7220; + cvt.u32.u16 %r7222, %rs4675; + cvt.s32.s8 %r7223, %r7222; + cvt.u32.u16 %r7224, %rs4674; + cvt.s32.s8 %r7225, %r7224; + mad.lo.s32 %r7226, %r222, %r7225, %r7217; + mad.lo.s32 %r7227, %r72, %r7223, %r7226; + mad.lo.s32 %r7228, %r73, %r7221, %r7227; + mad.lo.s32 %r7229, %r74, %r7219, %r7228; + ld.const.v4.u8 {%rs4682, %rs4683, %rs4684, %rs4685}, [matrix+2340]; + cvt.u32.u16 %r7230, %rs4685; + cvt.s32.s8 %r7231, %r7230; + cvt.u32.u16 %r7232, %rs4684; + cvt.s32.s8 %r7233, %r7232; + cvt.u32.u16 %r7234, %rs4683; + cvt.s32.s8 %r7235, %r7234; + cvt.u32.u16 %r7236, %rs4682; + cvt.s32.s8 %r7237, %r7236; + mad.lo.s32 %r7238, %r75, %r7237, %r7229; + mad.lo.s32 %r7239, %r76, %r7235, %r7238; + mad.lo.s32 %r7240, %r77, %r7233, %r7239; + mad.lo.s32 %r7241, %r78, %r7231, %r7240; + ld.const.v4.u8 {%rs4690, %rs4691, %rs4692, %rs4693}, [matrix+2344]; + cvt.u32.u16 %r7242, %rs4693; + cvt.s32.s8 %r7243, %r7242; + cvt.u32.u16 %r7244, %rs4692; + cvt.s32.s8 %r7245, %r7244; + cvt.u32.u16 %r7246, %rs4691; + cvt.s32.s8 %r7247, %r7246; + cvt.u32.u16 %r7248, %rs4690; + cvt.s32.s8 %r7249, %r7248; + mad.lo.s32 %r7250, %r80, %r7249, %r7241; + mad.lo.s32 %r7251, %r81, %r7247, %r7250; + mad.lo.s32 %r7252, %r83, %r7245, %r7251; + mad.lo.s32 %r7253, %r84, %r7243, %r7252; + ld.const.v4.u8 {%rs4698, %rs4699, %rs4700, %rs4701}, [matrix+2348]; + cvt.u32.u16 %r7254, %rs4701; + cvt.s32.s8 %r7255, %r7254; + cvt.u32.u16 %r7256, %rs4700; + cvt.s32.s8 %r7257, %r7256; + cvt.u32.u16 %r7258, %rs4699; + cvt.s32.s8 %r7259, %r7258; + cvt.u32.u16 %r7260, %rs4698; + cvt.s32.s8 %r7261, %r7260; + mad.lo.s32 
%r7262, %r86, %r7261, %r7253; + mad.lo.s32 %r7263, %r87, %r7259, %r7262; + mad.lo.s32 %r7264, %r88, %r7257, %r7263; + mad.lo.s32 %r7265, %r89, %r7255, %r7264; + ld.const.v4.u8 {%rs4706, %rs4707, %rs4708, %rs4709}, [matrix+2352]; + cvt.u32.u16 %r7266, %rs4709; + cvt.s32.s8 %r7267, %r7266; + cvt.u32.u16 %r7268, %rs4708; + cvt.s32.s8 %r7269, %r7268; + cvt.u32.u16 %r7270, %rs4707; + cvt.s32.s8 %r7271, %r7270; + cvt.u32.u16 %r7272, %rs4706; + cvt.s32.s8 %r7273, %r7272; + mad.lo.s32 %r7274, %r271, %r7273, %r7265; + mad.lo.s32 %r7275, %r91, %r7271, %r7274; + mad.lo.s32 %r7276, %r93, %r7269, %r7275; + mad.lo.s32 %r7277, %r94, %r7267, %r7276; + ld.const.v4.u8 {%rs4714, %rs4715, %rs4716, %rs4717}, [matrix+2356]; + cvt.u32.u16 %r7278, %rs4717; + cvt.s32.s8 %r7279, %r7278; + cvt.u32.u16 %r7280, %rs4716; + cvt.s32.s8 %r7281, %r7280; + cvt.u32.u16 %r7282, %rs4715; + cvt.s32.s8 %r7283, %r7282; + cvt.u32.u16 %r7284, %rs4714; + cvt.s32.s8 %r7285, %r7284; + mad.lo.s32 %r7286, %r96, %r7285, %r7277; + mad.lo.s32 %r7287, %r97, %r7283, %r7286; + mad.lo.s32 %r7288, %r99, %r7281, %r7287; + mad.lo.s32 %r7289, %r100, %r7279, %r7288; + ld.const.v4.u8 {%rs4722, %rs4723, %rs4724, %rs4725}, [matrix+2360]; + cvt.u32.u16 %r7290, %rs4725; + cvt.s32.s8 %r7291, %r7290; + cvt.u32.u16 %r7292, %rs4724; + cvt.s32.s8 %r7293, %r7292; + cvt.u32.u16 %r7294, %rs4723; + cvt.s32.s8 %r7295, %r7294; + cvt.u32.u16 %r7296, %rs4722; + cvt.s32.s8 %r7297, %r7296; + mad.lo.s32 %r7298, %r103, %r7297, %r7289; + mad.lo.s32 %r7299, %r104, %r7295, %r7298; + mad.lo.s32 %r7300, %r107, %r7293, %r7299; + mad.lo.s32 %r7301, %r108, %r7291, %r7300; + ld.const.v4.u8 {%rs4730, %rs4731, %rs4732, %rs4733}, [matrix+2364]; + cvt.u32.u16 %r7302, %rs4733; + cvt.s32.s8 %r7303, %r7302; + cvt.u32.u16 %r7304, %rs4732; + cvt.s32.s8 %r7305, %r7304; + cvt.u32.u16 %r7306, %rs4731; + cvt.s32.s8 %r7307, %r7306; + cvt.u32.u16 %r7308, %rs4730; + cvt.s32.s8 %r7309, %r7308; + mad.lo.s32 %r7310, %r111, %r7309, %r7301; + mad.lo.s32 %r7311, %r112, %r7307, %r7310; + mad.lo.s32 %r7312, %r114, %r7305, %r7311; + mad.lo.s32 %r7313, %r115, %r7303, %r7312; + ld.const.v4.u8 {%rs4738, %rs4739, %rs4740, %rs4741}, [matrix+2368]; + cvt.u32.u16 %r7314, %rs4741; + cvt.s32.s8 %r7315, %r7314; + cvt.u32.u16 %r7316, %rs4740; + cvt.s32.s8 %r7317, %r7316; + cvt.u32.u16 %r7318, %rs4738; + cvt.s32.s8 %r7319, %r7318; + cvt.u32.u16 %r7320, %rs4739; + cvt.s32.s8 %r7321, %r7320; + mul.lo.s32 %r7322, %r34, %r7321; + mad.lo.s32 %r7323, %r124, %r7319, %r7322; + mad.lo.s32 %r7324, %r35, %r7317, %r7323; + mad.lo.s32 %r7325, %r36, %r7315, %r7324; + ld.const.v4.u8 {%rs4746, %rs4747, %rs4748, %rs4749}, [matrix+2372]; + cvt.u32.u16 %r7326, %rs4749; + cvt.s32.s8 %r7327, %r7326; + cvt.u32.u16 %r7328, %rs4748; + cvt.s32.s8 %r7329, %r7328; + cvt.u32.u16 %r7330, %rs4747; + cvt.s32.s8 %r7331, %r7330; + cvt.u32.u16 %r7332, %rs4746; + cvt.s32.s8 %r7333, %r7332; + mad.lo.s32 %r7334, %r37, %r7333, %r7325; + mad.lo.s32 %r7335, %r38, %r7331, %r7334; + mad.lo.s32 %r7336, %r39, %r7329, %r7335; + mad.lo.s32 %r7337, %r40, %r7327, %r7336; + ld.const.v4.u8 {%rs4754, %rs4755, %rs4756, %rs4757}, [matrix+2376]; + cvt.u32.u16 %r7338, %rs4757; + cvt.s32.s8 %r7339, %r7338; + cvt.u32.u16 %r7340, %rs4756; + cvt.s32.s8 %r7341, %r7340; + cvt.u32.u16 %r7342, %rs4755; + cvt.s32.s8 %r7343, %r7342; + cvt.u32.u16 %r7344, %rs4754; + cvt.s32.s8 %r7345, %r7344; + mad.lo.s32 %r7346, %r42, %r7345, %r7337; + mad.lo.s32 %r7347, %r43, %r7343, %r7346; + mad.lo.s32 %r7348, %r45, %r7341, %r7347; + mad.lo.s32 %r7349, %r46, %r7339, %r7348; + ld.const.v4.u8 
{%rs4762, %rs4763, %rs4764, %rs4765}, [matrix+2380]; + cvt.u32.u16 %r7350, %rs4765; + cvt.s32.s8 %r7351, %r7350; + cvt.u32.u16 %r7352, %rs4764; + cvt.s32.s8 %r7353, %r7352; + cvt.u32.u16 %r7354, %rs4763; + cvt.s32.s8 %r7355, %r7354; + cvt.u32.u16 %r7356, %rs4762; + cvt.s32.s8 %r7357, %r7356; + mad.lo.s32 %r7358, %r48, %r7357, %r7349; + mad.lo.s32 %r7359, %r49, %r7355, %r7358; + mad.lo.s32 %r7360, %r50, %r7353, %r7359; + mad.lo.s32 %r7361, %r51, %r7351, %r7360; + ld.const.v4.u8 {%rs4770, %rs4771, %rs4772, %rs4773}, [matrix+2384]; + cvt.u32.u16 %r7362, %rs4773; + cvt.s32.s8 %r7363, %r7362; + cvt.u32.u16 %r7364, %rs4772; + cvt.s32.s8 %r7365, %r7364; + cvt.u32.u16 %r7366, %rs4771; + cvt.s32.s8 %r7367, %r7366; + cvt.u32.u16 %r7368, %rs4770; + cvt.s32.s8 %r7369, %r7368; + mad.lo.s32 %r7370, %r173, %r7369, %r7361; + mad.lo.s32 %r7371, %r53, %r7367, %r7370; + mad.lo.s32 %r7372, %r54, %r7365, %r7371; + mad.lo.s32 %r7373, %r55, %r7363, %r7372; + ld.const.v4.u8 {%rs4778, %rs4779, %rs4780, %rs4781}, [matrix+2388]; + cvt.u32.u16 %r7374, %rs4781; + cvt.s32.s8 %r7375, %r7374; + cvt.u32.u16 %r7376, %rs4780; + cvt.s32.s8 %r7377, %r7376; + cvt.u32.u16 %r7378, %rs4779; + cvt.s32.s8 %r7379, %r7378; + cvt.u32.u16 %r7380, %rs4778; + cvt.s32.s8 %r7381, %r7380; + mad.lo.s32 %r7382, %r56, %r7381, %r7373; + mad.lo.s32 %r7383, %r57, %r7379, %r7382; + mad.lo.s32 %r7384, %r58, %r7377, %r7383; + mad.lo.s32 %r7385, %r59, %r7375, %r7384; + ld.const.v4.u8 {%rs4786, %rs4787, %rs4788, %rs4789}, [matrix+2392]; + cvt.u32.u16 %r7386, %rs4789; + cvt.s32.s8 %r7387, %r7386; + cvt.u32.u16 %r7388, %rs4788; + cvt.s32.s8 %r7389, %r7388; + cvt.u32.u16 %r7390, %rs4787; + cvt.s32.s8 %r7391, %r7390; + cvt.u32.u16 %r7392, %rs4786; + cvt.s32.s8 %r7393, %r7392; + mad.lo.s32 %r7394, %r61, %r7393, %r7385; + mad.lo.s32 %r7395, %r62, %r7391, %r7394; + mad.lo.s32 %r7396, %r64, %r7389, %r7395; + mad.lo.s32 %r7397, %r65, %r7387, %r7396; + ld.const.v4.u8 {%rs4794, %rs4795, %rs4796, %rs4797}, [matrix+2396]; + cvt.u32.u16 %r7398, %rs4797; + cvt.s32.s8 %r7399, %r7398; + cvt.u32.u16 %r7400, %rs4796; + cvt.s32.s8 %r7401, %r7400; + cvt.u32.u16 %r7402, %rs4795; + cvt.s32.s8 %r7403, %r7402; + cvt.u32.u16 %r7404, %rs4794; + cvt.s32.s8 %r7405, %r7404; + mad.lo.s32 %r7406, %r67, %r7405, %r7397; + mad.lo.s32 %r7407, %r68, %r7403, %r7406; + mad.lo.s32 %r7408, %r69, %r7401, %r7407; + mad.lo.s32 %r7409, %r70, %r7399, %r7408; + ld.const.v4.u8 {%rs4802, %rs4803, %rs4804, %rs4805}, [matrix+2400]; + cvt.u32.u16 %r7410, %rs4805; + cvt.s32.s8 %r7411, %r7410; + cvt.u32.u16 %r7412, %rs4804; + cvt.s32.s8 %r7413, %r7412; + cvt.u32.u16 %r7414, %rs4803; + cvt.s32.s8 %r7415, %r7414; + cvt.u32.u16 %r7416, %rs4802; + cvt.s32.s8 %r7417, %r7416; + mad.lo.s32 %r7418, %r222, %r7417, %r7409; + mad.lo.s32 %r7419, %r72, %r7415, %r7418; + mad.lo.s32 %r7420, %r73, %r7413, %r7419; + mad.lo.s32 %r7421, %r74, %r7411, %r7420; + ld.const.v4.u8 {%rs4810, %rs4811, %rs4812, %rs4813}, [matrix+2404]; + cvt.u32.u16 %r7422, %rs4813; + cvt.s32.s8 %r7423, %r7422; + cvt.u32.u16 %r7424, %rs4812; + cvt.s32.s8 %r7425, %r7424; + cvt.u32.u16 %r7426, %rs4811; + cvt.s32.s8 %r7427, %r7426; + cvt.u32.u16 %r7428, %rs4810; + cvt.s32.s8 %r7429, %r7428; + mad.lo.s32 %r7430, %r75, %r7429, %r7421; + mad.lo.s32 %r7431, %r76, %r7427, %r7430; + mad.lo.s32 %r7432, %r77, %r7425, %r7431; + mad.lo.s32 %r7433, %r78, %r7423, %r7432; + ld.const.v4.u8 {%rs4818, %rs4819, %rs4820, %rs4821}, [matrix+2408]; + cvt.u32.u16 %r7434, %rs4821; + cvt.s32.s8 %r7435, %r7434; + cvt.u32.u16 %r7436, %rs4820; + cvt.s32.s8 %r7437, %r7436; + 
cvt.u32.u16 %r7438, %rs4819;
+ cvt.s32.s8 %r7439, %r7438;
+ cvt.u32.u16 %r7440, %rs4818;
+ cvt.s32.s8 %r7441, %r7440;
+ mad.lo.s32 %r7442, %r80, %r7441, %r7433;
+ mad.lo.s32 %r7443, %r81, %r7439, %r7442;
+ mad.lo.s32 %r7444, %r83, %r7437, %r7443;
+ mad.lo.s32 %r7445, %r84, %r7435, %r7444;
+ ld.const.v4.u8 {%rs4826, %rs4827, %rs4828, %rs4829}, [matrix+2412];
+ cvt.u32.u16 %r7446, %rs4829;
+ cvt.s32.s8 %r7447, %r7446;
+ cvt.u32.u16 %r7448, %rs4828;
+ cvt.s32.s8 %r7449, %r7448;
+ cvt.u32.u16 %r7450, %rs4827;
+ cvt.s32.s8 %r7451, %r7450;
+ cvt.u32.u16 %r7452, %rs4826;
+ cvt.s32.s8 %r7453, %r7452;
+ mad.lo.s32 %r7454, %r86, %r7453, %r7445;
+ mad.lo.s32 %r7455, %r87, %r7451, %r7454;
+ mad.lo.s32 %r7456, %r88, %r7449, %r7455;
+ mad.lo.s32 %r7457, %r89, %r7447, %r7456;
+ ld.const.v4.u8 {%rs4834, %rs4835, %rs4836, %rs4837}, [matrix+2416];
+ cvt.u32.u16 %r7458, %rs4837;
+ cvt.s32.s8 %r7459, %r7458;
+ cvt.u32.u16 %r7460, %rs4836;
+ cvt.s32.s8 %r7461, %r7460;
+ cvt.u32.u16 %r7462, %rs4835;
+ cvt.s32.s8 %r7463, %r7462;
+ cvt.u32.u16 %r7464, %rs4834;
+ cvt.s32.s8 %r7465, %r7464;
+ mad.lo.s32 %r7466, %r271, %r7465, %r7457;
+ mad.lo.s32 %r7467, %r91, %r7463, %r7466;
+ mad.lo.s32 %r7468, %r93, %r7461, %r7467;
+ mad.lo.s32 %r7469, %r94, %r7459, %r7468;
+ ld.const.v4.u8 {%rs4842, %rs4843, %rs4844, %rs4845}, [matrix+2420];
+ cvt.u32.u16 %r7470, %rs4845;
+ cvt.s32.s8 %r7471, %r7470;
+ cvt.u32.u16 %r7472, %rs4844;
+ cvt.s32.s8 %r7473, %r7472;
+ cvt.u32.u16 %r7474, %rs4843;
+ cvt.s32.s8 %r7475, %r7474;
+ cvt.u32.u16 %r7476, %rs4842;
+ cvt.s32.s8 %r7477, %r7476;
+ mad.lo.s32 %r7478, %r96, %r7477, %r7469;
+ mad.lo.s32 %r7479, %r97, %r7475, %r7478;
+ mad.lo.s32 %r7480, %r99, %r7473, %r7479;
+ mad.lo.s32 %r7481, %r100, %r7471, %r7480;
+ ld.const.v4.u8 {%rs4850, %rs4851, %rs4852, %rs4853}, [matrix+2424];
+ cvt.u32.u16 %r7482, %rs4853;
+ cvt.s32.s8 %r7483, %r7482;
+ cvt.u32.u16 %r7484, %rs4852;
+ cvt.s32.s8 %r7485, %r7484;
+ cvt.u32.u16 %r7486, %rs4851;
+ cvt.s32.s8 %r7487, %r7486;
+ cvt.u32.u16 %r7488, %rs4850;
+ cvt.s32.s8 %r7489, %r7488;
+ mad.lo.s32 %r7490, %r103, %r7489, %r7481;
+ mad.lo.s32 %r7491, %r104, %r7487, %r7490;
+ mad.lo.s32 %r7492, %r107, %r7485, %r7491;
+ mad.lo.s32 %r7493, %r108, %r7483, %r7492;
+ ld.const.v4.u8 {%rs4858, %rs4859, %rs4860, %rs4861}, [matrix+2428];
+ cvt.u32.u16 %r7494, %rs4861;
+ cvt.s32.s8 %r7495, %r7494;
+ cvt.u32.u16 %r7496, %rs4860;
+ cvt.s32.s8 %r7497, %r7496;
+ cvt.u32.u16 %r7498, %rs4859;
+ cvt.s32.s8 %r7499, %r7498;
+ cvt.u32.u16 %r7500, %rs4858;
+ cvt.s32.s8 %r7501, %r7500;
+ mad.lo.s32 %r7502, %r111, %r7501, %r7493;
+ mad.lo.s32 %r7503, %r112, %r7499, %r7502;
+ mad.lo.s32 %r7504, %r114, %r7497, %r7503;
+ mad.lo.s32 %r7505, %r115, %r7495, %r7504;
+ shr.u32 %r7506, %r7313, 6;
+ and.b32 %r7507, %r7506, 240;
+ shr.u32 %r7508, %r7505, 10;
+ or.b32 %r7509, %r7508, %r7507;
+ xor.b32 %r7510, %r27, %r7509;
+ cvt.u64.u32 %rd394, %r7510;
+ // ... remaining rows of this hunk elided: the identical generated pattern
+ // (one ld.const.v4.u8 per four matrix bytes, eight cvt.u32.u16/cvt.s32.s8
+ // sign-extensions, four mul/mad.lo.s32 accumulations) repeats verbatim for
+ // [matrix+2432] through [matrix+3016], and the same shr/and/shr/or/xor/
+ // cvt.u64.u32 packing sequence closes every second 64-byte matrix row,
+ // producing %rd395, %rd396, %rd397 and %rd398 in turn ...
%rs6043, %rs6044, %rs6045}, [matrix+3020]; + cvt.u32.u16 %r9295, %rs6045; + cvt.s32.s8 %r9296, %r9295; + cvt.u32.u16 %r9297, %rs6044; + cvt.s32.s8 %r9298, %r9297; + cvt.u32.u16 %r9299, %rs6043; + cvt.s32.s8 %r9300, %r9299; + cvt.u32.u16 %r9301, %rs6042; + cvt.s32.s8 %r9302, %r9301; + mad.lo.s32 %r9303, %r48, %r9302, %r9294; + mad.lo.s32 %r9304, %r49, %r9300, %r9303; + mad.lo.s32 %r9305, %r50, %r9298, %r9304; + mad.lo.s32 %r9306, %r51, %r9296, %r9305; + ld.const.v4.u8 {%rs6050, %rs6051, %rs6052, %rs6053}, [matrix+3024]; + cvt.u32.u16 %r9307, %rs6053; + cvt.s32.s8 %r9308, %r9307; + cvt.u32.u16 %r9309, %rs6052; + cvt.s32.s8 %r9310, %r9309; + cvt.u32.u16 %r9311, %rs6051; + cvt.s32.s8 %r9312, %r9311; + cvt.u32.u16 %r9313, %rs6050; + cvt.s32.s8 %r9314, %r9313; + mad.lo.s32 %r9315, %r173, %r9314, %r9306; + mad.lo.s32 %r9316, %r53, %r9312, %r9315; + mad.lo.s32 %r9317, %r54, %r9310, %r9316; + mad.lo.s32 %r9318, %r55, %r9308, %r9317; + ld.const.v4.u8 {%rs6058, %rs6059, %rs6060, %rs6061}, [matrix+3028]; + cvt.u32.u16 %r9319, %rs6061; + cvt.s32.s8 %r9320, %r9319; + cvt.u32.u16 %r9321, %rs6060; + cvt.s32.s8 %r9322, %r9321; + cvt.u32.u16 %r9323, %rs6059; + cvt.s32.s8 %r9324, %r9323; + cvt.u32.u16 %r9325, %rs6058; + cvt.s32.s8 %r9326, %r9325; + mad.lo.s32 %r9327, %r56, %r9326, %r9318; + mad.lo.s32 %r9328, %r57, %r9324, %r9327; + mad.lo.s32 %r9329, %r58, %r9322, %r9328; + mad.lo.s32 %r9330, %r59, %r9320, %r9329; + ld.const.v4.u8 {%rs6066, %rs6067, %rs6068, %rs6069}, [matrix+3032]; + cvt.u32.u16 %r9331, %rs6069; + cvt.s32.s8 %r9332, %r9331; + cvt.u32.u16 %r9333, %rs6068; + cvt.s32.s8 %r9334, %r9333; + cvt.u32.u16 %r9335, %rs6067; + cvt.s32.s8 %r9336, %r9335; + cvt.u32.u16 %r9337, %rs6066; + cvt.s32.s8 %r9338, %r9337; + mad.lo.s32 %r9339, %r61, %r9338, %r9330; + mad.lo.s32 %r9340, %r62, %r9336, %r9339; + mad.lo.s32 %r9341, %r64, %r9334, %r9340; + mad.lo.s32 %r9342, %r65, %r9332, %r9341; + ld.const.v4.u8 {%rs6074, %rs6075, %rs6076, %rs6077}, [matrix+3036]; + cvt.u32.u16 %r9343, %rs6077; + cvt.s32.s8 %r9344, %r9343; + cvt.u32.u16 %r9345, %rs6076; + cvt.s32.s8 %r9346, %r9345; + cvt.u32.u16 %r9347, %rs6075; + cvt.s32.s8 %r9348, %r9347; + cvt.u32.u16 %r9349, %rs6074; + cvt.s32.s8 %r9350, %r9349; + mad.lo.s32 %r9351, %r67, %r9350, %r9342; + mad.lo.s32 %r9352, %r68, %r9348, %r9351; + mad.lo.s32 %r9353, %r69, %r9346, %r9352; + mad.lo.s32 %r9354, %r70, %r9344, %r9353; + ld.const.v4.u8 {%rs6082, %rs6083, %rs6084, %rs6085}, [matrix+3040]; + cvt.u32.u16 %r9355, %rs6085; + cvt.s32.s8 %r9356, %r9355; + cvt.u32.u16 %r9357, %rs6084; + cvt.s32.s8 %r9358, %r9357; + cvt.u32.u16 %r9359, %rs6083; + cvt.s32.s8 %r9360, %r9359; + cvt.u32.u16 %r9361, %rs6082; + cvt.s32.s8 %r9362, %r9361; + mad.lo.s32 %r9363, %r222, %r9362, %r9354; + mad.lo.s32 %r9364, %r72, %r9360, %r9363; + mad.lo.s32 %r9365, %r73, %r9358, %r9364; + mad.lo.s32 %r9366, %r74, %r9356, %r9365; + ld.const.v4.u8 {%rs6090, %rs6091, %rs6092, %rs6093}, [matrix+3044]; + cvt.u32.u16 %r9367, %rs6093; + cvt.s32.s8 %r9368, %r9367; + cvt.u32.u16 %r9369, %rs6092; + cvt.s32.s8 %r9370, %r9369; + cvt.u32.u16 %r9371, %rs6091; + cvt.s32.s8 %r9372, %r9371; + cvt.u32.u16 %r9373, %rs6090; + cvt.s32.s8 %r9374, %r9373; + mad.lo.s32 %r9375, %r75, %r9374, %r9366; + mad.lo.s32 %r9376, %r76, %r9372, %r9375; + mad.lo.s32 %r9377, %r77, %r9370, %r9376; + mad.lo.s32 %r9378, %r78, %r9368, %r9377; + ld.const.v4.u8 {%rs6098, %rs6099, %rs6100, %rs6101}, [matrix+3048]; + cvt.u32.u16 %r9379, %rs6101; + cvt.s32.s8 %r9380, %r9379; + cvt.u32.u16 %r9381, %rs6100; + cvt.s32.s8 %r9382, %r9381; + cvt.u32.u16 
%r9383, %rs6099; + cvt.s32.s8 %r9384, %r9383; + cvt.u32.u16 %r9385, %rs6098; + cvt.s32.s8 %r9386, %r9385; + mad.lo.s32 %r9387, %r80, %r9386, %r9378; + mad.lo.s32 %r9388, %r81, %r9384, %r9387; + mad.lo.s32 %r9389, %r83, %r9382, %r9388; + mad.lo.s32 %r9390, %r84, %r9380, %r9389; + ld.const.v4.u8 {%rs6106, %rs6107, %rs6108, %rs6109}, [matrix+3052]; + cvt.u32.u16 %r9391, %rs6109; + cvt.s32.s8 %r9392, %r9391; + cvt.u32.u16 %r9393, %rs6108; + cvt.s32.s8 %r9394, %r9393; + cvt.u32.u16 %r9395, %rs6107; + cvt.s32.s8 %r9396, %r9395; + cvt.u32.u16 %r9397, %rs6106; + cvt.s32.s8 %r9398, %r9397; + mad.lo.s32 %r9399, %r86, %r9398, %r9390; + mad.lo.s32 %r9400, %r87, %r9396, %r9399; + mad.lo.s32 %r9401, %r88, %r9394, %r9400; + mad.lo.s32 %r9402, %r89, %r9392, %r9401; + ld.const.v4.u8 {%rs6114, %rs6115, %rs6116, %rs6117}, [matrix+3056]; + cvt.u32.u16 %r9403, %rs6117; + cvt.s32.s8 %r9404, %r9403; + cvt.u32.u16 %r9405, %rs6116; + cvt.s32.s8 %r9406, %r9405; + cvt.u32.u16 %r9407, %rs6115; + cvt.s32.s8 %r9408, %r9407; + cvt.u32.u16 %r9409, %rs6114; + cvt.s32.s8 %r9410, %r9409; + mad.lo.s32 %r9411, %r271, %r9410, %r9402; + mad.lo.s32 %r9412, %r91, %r9408, %r9411; + mad.lo.s32 %r9413, %r93, %r9406, %r9412; + mad.lo.s32 %r9414, %r94, %r9404, %r9413; + ld.const.v4.u8 {%rs6122, %rs6123, %rs6124, %rs6125}, [matrix+3060]; + cvt.u32.u16 %r9415, %rs6125; + cvt.s32.s8 %r9416, %r9415; + cvt.u32.u16 %r9417, %rs6124; + cvt.s32.s8 %r9418, %r9417; + cvt.u32.u16 %r9419, %rs6123; + cvt.s32.s8 %r9420, %r9419; + cvt.u32.u16 %r9421, %rs6122; + cvt.s32.s8 %r9422, %r9421; + mad.lo.s32 %r9423, %r96, %r9422, %r9414; + mad.lo.s32 %r9424, %r97, %r9420, %r9423; + mad.lo.s32 %r9425, %r99, %r9418, %r9424; + mad.lo.s32 %r9426, %r100, %r9416, %r9425; + ld.const.v4.u8 {%rs6130, %rs6131, %rs6132, %rs6133}, [matrix+3064]; + cvt.u32.u16 %r9427, %rs6133; + cvt.s32.s8 %r9428, %r9427; + cvt.u32.u16 %r9429, %rs6132; + cvt.s32.s8 %r9430, %r9429; + cvt.u32.u16 %r9431, %rs6131; + cvt.s32.s8 %r9432, %r9431; + cvt.u32.u16 %r9433, %rs6130; + cvt.s32.s8 %r9434, %r9433; + mad.lo.s32 %r9435, %r103, %r9434, %r9426; + mad.lo.s32 %r9436, %r104, %r9432, %r9435; + mad.lo.s32 %r9437, %r107, %r9430, %r9436; + mad.lo.s32 %r9438, %r108, %r9428, %r9437; + ld.const.v4.u8 {%rs6138, %rs6139, %rs6140, %rs6141}, [matrix+3068]; + cvt.u32.u16 %r9439, %rs6141; + cvt.s32.s8 %r9440, %r9439; + cvt.u32.u16 %r9441, %rs6140; + cvt.s32.s8 %r9442, %r9441; + cvt.u32.u16 %r9443, %rs6139; + cvt.s32.s8 %r9444, %r9443; + cvt.u32.u16 %r9445, %rs6138; + cvt.s32.s8 %r9446, %r9445; + mad.lo.s32 %r9447, %r111, %r9446, %r9438; + mad.lo.s32 %r9448, %r112, %r9444, %r9447; + mad.lo.s32 %r9449, %r114, %r9442, %r9448; + mad.lo.s32 %r9450, %r115, %r9440, %r9449; + shr.u32 %r9451, %r9258, 6; + and.b32 %r9452, %r9451, 240; + shr.u32 %r9453, %r9450, 10; + or.b32 %r9454, %r9453, %r9452; + xor.b32 %r9455, %r32, %r9454; + ld.const.v4.u8 {%rs6146, %rs6147, %rs6148, %rs6149}, [matrix+3072]; + cvt.u32.u16 %r9456, %rs6149; + cvt.s32.s8 %r9457, %r9456; + cvt.u32.u16 %r9458, %rs6148; + cvt.s32.s8 %r9459, %r9458; + cvt.u32.u16 %r9460, %rs6146; + cvt.s32.s8 %r9461, %r9460; + cvt.u32.u16 %r9462, %rs6147; + cvt.s32.s8 %r9463, %r9462; + mul.lo.s32 %r9464, %r34, %r9463; + mad.lo.s32 %r9465, %r124, %r9461, %r9464; + mad.lo.s32 %r9466, %r35, %r9459, %r9465; + mad.lo.s32 %r9467, %r36, %r9457, %r9466; + ld.const.v4.u8 {%rs6154, %rs6155, %rs6156, %rs6157}, [matrix+3076]; + cvt.u32.u16 %r9468, %rs6157; + cvt.s32.s8 %r9469, %r9468; + cvt.u32.u16 %r9470, %rs6156; + cvt.s32.s8 %r9471, %r9470; + cvt.u32.u16 %r9472, %rs6155; + 
cvt.s32.s8 %r9473, %r9472; + cvt.u32.u16 %r9474, %rs6154; + cvt.s32.s8 %r9475, %r9474; + mad.lo.s32 %r9476, %r37, %r9475, %r9467; + mad.lo.s32 %r9477, %r38, %r9473, %r9476; + mad.lo.s32 %r9478, %r39, %r9471, %r9477; + mad.lo.s32 %r9479, %r40, %r9469, %r9478; + ld.const.v4.u8 {%rs6162, %rs6163, %rs6164, %rs6165}, [matrix+3080]; + cvt.u32.u16 %r9480, %rs6165; + cvt.s32.s8 %r9481, %r9480; + cvt.u32.u16 %r9482, %rs6164; + cvt.s32.s8 %r9483, %r9482; + cvt.u32.u16 %r9484, %rs6163; + cvt.s32.s8 %r9485, %r9484; + cvt.u32.u16 %r9486, %rs6162; + cvt.s32.s8 %r9487, %r9486; + mad.lo.s32 %r9488, %r42, %r9487, %r9479; + mad.lo.s32 %r9489, %r43, %r9485, %r9488; + mad.lo.s32 %r9490, %r45, %r9483, %r9489; + mad.lo.s32 %r9491, %r46, %r9481, %r9490; + ld.const.v4.u8 {%rs6170, %rs6171, %rs6172, %rs6173}, [matrix+3084]; + cvt.u32.u16 %r9492, %rs6173; + cvt.s32.s8 %r9493, %r9492; + cvt.u32.u16 %r9494, %rs6172; + cvt.s32.s8 %r9495, %r9494; + cvt.u32.u16 %r9496, %rs6171; + cvt.s32.s8 %r9497, %r9496; + cvt.u32.u16 %r9498, %rs6170; + cvt.s32.s8 %r9499, %r9498; + mad.lo.s32 %r9500, %r48, %r9499, %r9491; + mad.lo.s32 %r9501, %r49, %r9497, %r9500; + mad.lo.s32 %r9502, %r50, %r9495, %r9501; + mad.lo.s32 %r9503, %r51, %r9493, %r9502; + ld.const.v4.u8 {%rs6178, %rs6179, %rs6180, %rs6181}, [matrix+3088]; + cvt.u32.u16 %r9504, %rs6181; + cvt.s32.s8 %r9505, %r9504; + cvt.u32.u16 %r9506, %rs6180; + cvt.s32.s8 %r9507, %r9506; + cvt.u32.u16 %r9508, %rs6179; + cvt.s32.s8 %r9509, %r9508; + cvt.u32.u16 %r9510, %rs6178; + cvt.s32.s8 %r9511, %r9510; + mad.lo.s32 %r9512, %r173, %r9511, %r9503; + mad.lo.s32 %r9513, %r53, %r9509, %r9512; + mad.lo.s32 %r9514, %r54, %r9507, %r9513; + mad.lo.s32 %r9515, %r55, %r9505, %r9514; + ld.const.v4.u8 {%rs6186, %rs6187, %rs6188, %rs6189}, [matrix+3092]; + cvt.u32.u16 %r9516, %rs6189; + cvt.s32.s8 %r9517, %r9516; + cvt.u32.u16 %r9518, %rs6188; + cvt.s32.s8 %r9519, %r9518; + cvt.u32.u16 %r9520, %rs6187; + cvt.s32.s8 %r9521, %r9520; + cvt.u32.u16 %r9522, %rs6186; + cvt.s32.s8 %r9523, %r9522; + mad.lo.s32 %r9524, %r56, %r9523, %r9515; + mad.lo.s32 %r9525, %r57, %r9521, %r9524; + mad.lo.s32 %r9526, %r58, %r9519, %r9525; + mad.lo.s32 %r9527, %r59, %r9517, %r9526; + ld.const.v4.u8 {%rs6194, %rs6195, %rs6196, %rs6197}, [matrix+3096]; + cvt.u32.u16 %r9528, %rs6197; + cvt.s32.s8 %r9529, %r9528; + cvt.u32.u16 %r9530, %rs6196; + cvt.s32.s8 %r9531, %r9530; + cvt.u32.u16 %r9532, %rs6195; + cvt.s32.s8 %r9533, %r9532; + cvt.u32.u16 %r9534, %rs6194; + cvt.s32.s8 %r9535, %r9534; + mad.lo.s32 %r9536, %r61, %r9535, %r9527; + mad.lo.s32 %r9537, %r62, %r9533, %r9536; + mad.lo.s32 %r9538, %r64, %r9531, %r9537; + mad.lo.s32 %r9539, %r65, %r9529, %r9538; + ld.const.v4.u8 {%rs6202, %rs6203, %rs6204, %rs6205}, [matrix+3100]; + cvt.u32.u16 %r9540, %rs6205; + cvt.s32.s8 %r9541, %r9540; + cvt.u32.u16 %r9542, %rs6204; + cvt.s32.s8 %r9543, %r9542; + cvt.u32.u16 %r9544, %rs6203; + cvt.s32.s8 %r9545, %r9544; + cvt.u32.u16 %r9546, %rs6202; + cvt.s32.s8 %r9547, %r9546; + mad.lo.s32 %r9548, %r67, %r9547, %r9539; + mad.lo.s32 %r9549, %r68, %r9545, %r9548; + mad.lo.s32 %r9550, %r69, %r9543, %r9549; + mad.lo.s32 %r9551, %r70, %r9541, %r9550; + ld.const.v4.u8 {%rs6210, %rs6211, %rs6212, %rs6213}, [matrix+3104]; + cvt.u32.u16 %r9552, %rs6213; + cvt.s32.s8 %r9553, %r9552; + cvt.u32.u16 %r9554, %rs6212; + cvt.s32.s8 %r9555, %r9554; + cvt.u32.u16 %r9556, %rs6211; + cvt.s32.s8 %r9557, %r9556; + cvt.u32.u16 %r9558, %rs6210; + cvt.s32.s8 %r9559, %r9558; + mad.lo.s32 %r9560, %r222, %r9559, %r9551; + mad.lo.s32 %r9561, %r72, %r9557, %r9560; + 
mad.lo.s32 %r9562, %r73, %r9555, %r9561; + mad.lo.s32 %r9563, %r74, %r9553, %r9562; + ld.const.v4.u8 {%rs6218, %rs6219, %rs6220, %rs6221}, [matrix+3108]; + cvt.u32.u16 %r9564, %rs6221; + cvt.s32.s8 %r9565, %r9564; + cvt.u32.u16 %r9566, %rs6220; + cvt.s32.s8 %r9567, %r9566; + cvt.u32.u16 %r9568, %rs6219; + cvt.s32.s8 %r9569, %r9568; + cvt.u32.u16 %r9570, %rs6218; + cvt.s32.s8 %r9571, %r9570; + mad.lo.s32 %r9572, %r75, %r9571, %r9563; + mad.lo.s32 %r9573, %r76, %r9569, %r9572; + mad.lo.s32 %r9574, %r77, %r9567, %r9573; + mad.lo.s32 %r9575, %r78, %r9565, %r9574; + ld.const.v4.u8 {%rs6226, %rs6227, %rs6228, %rs6229}, [matrix+3112]; + cvt.u32.u16 %r9576, %rs6229; + cvt.s32.s8 %r9577, %r9576; + cvt.u32.u16 %r9578, %rs6228; + cvt.s32.s8 %r9579, %r9578; + cvt.u32.u16 %r9580, %rs6227; + cvt.s32.s8 %r9581, %r9580; + cvt.u32.u16 %r9582, %rs6226; + cvt.s32.s8 %r9583, %r9582; + mad.lo.s32 %r9584, %r80, %r9583, %r9575; + mad.lo.s32 %r9585, %r81, %r9581, %r9584; + mad.lo.s32 %r9586, %r83, %r9579, %r9585; + mad.lo.s32 %r9587, %r84, %r9577, %r9586; + ld.const.v4.u8 {%rs6234, %rs6235, %rs6236, %rs6237}, [matrix+3116]; + cvt.u32.u16 %r9588, %rs6237; + cvt.s32.s8 %r9589, %r9588; + cvt.u32.u16 %r9590, %rs6236; + cvt.s32.s8 %r9591, %r9590; + cvt.u32.u16 %r9592, %rs6235; + cvt.s32.s8 %r9593, %r9592; + cvt.u32.u16 %r9594, %rs6234; + cvt.s32.s8 %r9595, %r9594; + mad.lo.s32 %r9596, %r86, %r9595, %r9587; + mad.lo.s32 %r9597, %r87, %r9593, %r9596; + mad.lo.s32 %r9598, %r88, %r9591, %r9597; + mad.lo.s32 %r9599, %r89, %r9589, %r9598; + ld.const.v4.u8 {%rs6242, %rs6243, %rs6244, %rs6245}, [matrix+3120]; + cvt.u32.u16 %r9600, %rs6245; + cvt.s32.s8 %r9601, %r9600; + cvt.u32.u16 %r9602, %rs6244; + cvt.s32.s8 %r9603, %r9602; + cvt.u32.u16 %r9604, %rs6243; + cvt.s32.s8 %r9605, %r9604; + cvt.u32.u16 %r9606, %rs6242; + cvt.s32.s8 %r9607, %r9606; + mad.lo.s32 %r9608, %r271, %r9607, %r9599; + mad.lo.s32 %r9609, %r91, %r9605, %r9608; + mad.lo.s32 %r9610, %r93, %r9603, %r9609; + mad.lo.s32 %r9611, %r94, %r9601, %r9610; + ld.const.v4.u8 {%rs6250, %rs6251, %rs6252, %rs6253}, [matrix+3124]; + cvt.u32.u16 %r9612, %rs6253; + cvt.s32.s8 %r9613, %r9612; + cvt.u32.u16 %r9614, %rs6252; + cvt.s32.s8 %r9615, %r9614; + cvt.u32.u16 %r9616, %rs6251; + cvt.s32.s8 %r9617, %r9616; + cvt.u32.u16 %r9618, %rs6250; + cvt.s32.s8 %r9619, %r9618; + mad.lo.s32 %r9620, %r96, %r9619, %r9611; + mad.lo.s32 %r9621, %r97, %r9617, %r9620; + mad.lo.s32 %r9622, %r99, %r9615, %r9621; + mad.lo.s32 %r9623, %r100, %r9613, %r9622; + ld.const.v4.u8 {%rs6258, %rs6259, %rs6260, %rs6261}, [matrix+3128]; + cvt.u32.u16 %r9624, %rs6261; + cvt.s32.s8 %r9625, %r9624; + cvt.u32.u16 %r9626, %rs6260; + cvt.s32.s8 %r9627, %r9626; + cvt.u32.u16 %r9628, %rs6259; + cvt.s32.s8 %r9629, %r9628; + cvt.u32.u16 %r9630, %rs6258; + cvt.s32.s8 %r9631, %r9630; + mad.lo.s32 %r9632, %r103, %r9631, %r9623; + mad.lo.s32 %r9633, %r104, %r9629, %r9632; + mad.lo.s32 %r9634, %r107, %r9627, %r9633; + mad.lo.s32 %r9635, %r108, %r9625, %r9634; + ld.const.v4.u8 {%rs6266, %rs6267, %rs6268, %rs6269}, [matrix+3132]; + cvt.u32.u16 %r9636, %rs6269; + cvt.s32.s8 %r9637, %r9636; + cvt.u32.u16 %r9638, %rs6268; + cvt.s32.s8 %r9639, %r9638; + cvt.u32.u16 %r9640, %rs6267; + cvt.s32.s8 %r9641, %r9640; + cvt.u32.u16 %r9642, %rs6266; + cvt.s32.s8 %r9643, %r9642; + mad.lo.s32 %r9644, %r111, %r9643, %r9635; + mad.lo.s32 %r9645, %r112, %r9641, %r9644; + mad.lo.s32 %r9646, %r114, %r9639, %r9645; + mad.lo.s32 %r9647, %r115, %r9637, %r9646; + ld.const.v4.u8 {%rs6274, %rs6275, %rs6276, %rs6277}, [matrix+3136]; + cvt.u32.u16 
%r9648, %rs6277; + cvt.s32.s8 %r9649, %r9648; + cvt.u32.u16 %r9650, %rs6276; + cvt.s32.s8 %r9651, %r9650; + cvt.u32.u16 %r9652, %rs6274; + cvt.s32.s8 %r9653, %r9652; + cvt.u32.u16 %r9654, %rs6275; + cvt.s32.s8 %r9655, %r9654; + mul.lo.s32 %r9656, %r34, %r9655; + mad.lo.s32 %r9657, %r124, %r9653, %r9656; + mad.lo.s32 %r9658, %r35, %r9651, %r9657; + mad.lo.s32 %r9659, %r36, %r9649, %r9658; + ld.const.v4.u8 {%rs6282, %rs6283, %rs6284, %rs6285}, [matrix+3140]; + cvt.u32.u16 %r9660, %rs6285; + cvt.s32.s8 %r9661, %r9660; + cvt.u32.u16 %r9662, %rs6284; + cvt.s32.s8 %r9663, %r9662; + cvt.u32.u16 %r9664, %rs6283; + cvt.s32.s8 %r9665, %r9664; + cvt.u32.u16 %r9666, %rs6282; + cvt.s32.s8 %r9667, %r9666; + mad.lo.s32 %r9668, %r37, %r9667, %r9659; + mad.lo.s32 %r9669, %r38, %r9665, %r9668; + mad.lo.s32 %r9670, %r39, %r9663, %r9669; + mad.lo.s32 %r9671, %r40, %r9661, %r9670; + ld.const.v4.u8 {%rs6290, %rs6291, %rs6292, %rs6293}, [matrix+3144]; + cvt.u32.u16 %r9672, %rs6293; + cvt.s32.s8 %r9673, %r9672; + cvt.u32.u16 %r9674, %rs6292; + cvt.s32.s8 %r9675, %r9674; + cvt.u32.u16 %r9676, %rs6291; + cvt.s32.s8 %r9677, %r9676; + cvt.u32.u16 %r9678, %rs6290; + cvt.s32.s8 %r9679, %r9678; + mad.lo.s32 %r9680, %r42, %r9679, %r9671; + mad.lo.s32 %r9681, %r43, %r9677, %r9680; + mad.lo.s32 %r9682, %r45, %r9675, %r9681; + mad.lo.s32 %r9683, %r46, %r9673, %r9682; + ld.const.v4.u8 {%rs6298, %rs6299, %rs6300, %rs6301}, [matrix+3148]; + cvt.u32.u16 %r9684, %rs6301; + cvt.s32.s8 %r9685, %r9684; + cvt.u32.u16 %r9686, %rs6300; + cvt.s32.s8 %r9687, %r9686; + cvt.u32.u16 %r9688, %rs6299; + cvt.s32.s8 %r9689, %r9688; + cvt.u32.u16 %r9690, %rs6298; + cvt.s32.s8 %r9691, %r9690; + mad.lo.s32 %r9692, %r48, %r9691, %r9683; + mad.lo.s32 %r9693, %r49, %r9689, %r9692; + mad.lo.s32 %r9694, %r50, %r9687, %r9693; + mad.lo.s32 %r9695, %r51, %r9685, %r9694; + ld.const.v4.u8 {%rs6306, %rs6307, %rs6308, %rs6309}, [matrix+3152]; + cvt.u32.u16 %r9696, %rs6309; + cvt.s32.s8 %r9697, %r9696; + cvt.u32.u16 %r9698, %rs6308; + cvt.s32.s8 %r9699, %r9698; + cvt.u32.u16 %r9700, %rs6307; + cvt.s32.s8 %r9701, %r9700; + cvt.u32.u16 %r9702, %rs6306; + cvt.s32.s8 %r9703, %r9702; + mad.lo.s32 %r9704, %r173, %r9703, %r9695; + mad.lo.s32 %r9705, %r53, %r9701, %r9704; + mad.lo.s32 %r9706, %r54, %r9699, %r9705; + mad.lo.s32 %r9707, %r55, %r9697, %r9706; + ld.const.v4.u8 {%rs6314, %rs6315, %rs6316, %rs6317}, [matrix+3156]; + cvt.u32.u16 %r9708, %rs6317; + cvt.s32.s8 %r9709, %r9708; + cvt.u32.u16 %r9710, %rs6316; + cvt.s32.s8 %r9711, %r9710; + cvt.u32.u16 %r9712, %rs6315; + cvt.s32.s8 %r9713, %r9712; + cvt.u32.u16 %r9714, %rs6314; + cvt.s32.s8 %r9715, %r9714; + mad.lo.s32 %r9716, %r56, %r9715, %r9707; + mad.lo.s32 %r9717, %r57, %r9713, %r9716; + mad.lo.s32 %r9718, %r58, %r9711, %r9717; + mad.lo.s32 %r9719, %r59, %r9709, %r9718; + ld.const.v4.u8 {%rs6322, %rs6323, %rs6324, %rs6325}, [matrix+3160]; + cvt.u32.u16 %r9720, %rs6325; + cvt.s32.s8 %r9721, %r9720; + cvt.u32.u16 %r9722, %rs6324; + cvt.s32.s8 %r9723, %r9722; + cvt.u32.u16 %r9724, %rs6323; + cvt.s32.s8 %r9725, %r9724; + cvt.u32.u16 %r9726, %rs6322; + cvt.s32.s8 %r9727, %r9726; + mad.lo.s32 %r9728, %r61, %r9727, %r9719; + mad.lo.s32 %r9729, %r62, %r9725, %r9728; + mad.lo.s32 %r9730, %r64, %r9723, %r9729; + mad.lo.s32 %r9731, %r65, %r9721, %r9730; + ld.const.v4.u8 {%rs6330, %rs6331, %rs6332, %rs6333}, [matrix+3164]; + cvt.u32.u16 %r9732, %rs6333; + cvt.s32.s8 %r9733, %r9732; + cvt.u32.u16 %r9734, %rs6332; + cvt.s32.s8 %r9735, %r9734; + cvt.u32.u16 %r9736, %rs6331; + cvt.s32.s8 %r9737, %r9736; + cvt.u32.u16 
%r9738, %rs6330; + cvt.s32.s8 %r9739, %r9738; + mad.lo.s32 %r9740, %r67, %r9739, %r9731; + mad.lo.s32 %r9741, %r68, %r9737, %r9740; + mad.lo.s32 %r9742, %r69, %r9735, %r9741; + mad.lo.s32 %r9743, %r70, %r9733, %r9742; + ld.const.v4.u8 {%rs6338, %rs6339, %rs6340, %rs6341}, [matrix+3168]; + cvt.u32.u16 %r9744, %rs6341; + cvt.s32.s8 %r9745, %r9744; + cvt.u32.u16 %r9746, %rs6340; + cvt.s32.s8 %r9747, %r9746; + cvt.u32.u16 %r9748, %rs6339; + cvt.s32.s8 %r9749, %r9748; + cvt.u32.u16 %r9750, %rs6338; + cvt.s32.s8 %r9751, %r9750; + mad.lo.s32 %r9752, %r222, %r9751, %r9743; + mad.lo.s32 %r9753, %r72, %r9749, %r9752; + mad.lo.s32 %r9754, %r73, %r9747, %r9753; + mad.lo.s32 %r9755, %r74, %r9745, %r9754; + ld.const.v4.u8 {%rs6346, %rs6347, %rs6348, %rs6349}, [matrix+3172]; + cvt.u32.u16 %r9756, %rs6349; + cvt.s32.s8 %r9757, %r9756; + cvt.u32.u16 %r9758, %rs6348; + cvt.s32.s8 %r9759, %r9758; + cvt.u32.u16 %r9760, %rs6347; + cvt.s32.s8 %r9761, %r9760; + cvt.u32.u16 %r9762, %rs6346; + cvt.s32.s8 %r9763, %r9762; + mad.lo.s32 %r9764, %r75, %r9763, %r9755; + mad.lo.s32 %r9765, %r76, %r9761, %r9764; + mad.lo.s32 %r9766, %r77, %r9759, %r9765; + mad.lo.s32 %r9767, %r78, %r9757, %r9766; + ld.const.v4.u8 {%rs6354, %rs6355, %rs6356, %rs6357}, [matrix+3176]; + cvt.u32.u16 %r9768, %rs6357; + cvt.s32.s8 %r9769, %r9768; + cvt.u32.u16 %r9770, %rs6356; + cvt.s32.s8 %r9771, %r9770; + cvt.u32.u16 %r9772, %rs6355; + cvt.s32.s8 %r9773, %r9772; + cvt.u32.u16 %r9774, %rs6354; + cvt.s32.s8 %r9775, %r9774; + mad.lo.s32 %r9776, %r80, %r9775, %r9767; + mad.lo.s32 %r9777, %r81, %r9773, %r9776; + mad.lo.s32 %r9778, %r83, %r9771, %r9777; + mad.lo.s32 %r9779, %r84, %r9769, %r9778; + ld.const.v4.u8 {%rs6362, %rs6363, %rs6364, %rs6365}, [matrix+3180]; + cvt.u32.u16 %r9780, %rs6365; + cvt.s32.s8 %r9781, %r9780; + cvt.u32.u16 %r9782, %rs6364; + cvt.s32.s8 %r9783, %r9782; + cvt.u32.u16 %r9784, %rs6363; + cvt.s32.s8 %r9785, %r9784; + cvt.u32.u16 %r9786, %rs6362; + cvt.s32.s8 %r9787, %r9786; + mad.lo.s32 %r9788, %r86, %r9787, %r9779; + mad.lo.s32 %r9789, %r87, %r9785, %r9788; + mad.lo.s32 %r9790, %r88, %r9783, %r9789; + mad.lo.s32 %r9791, %r89, %r9781, %r9790; + ld.const.v4.u8 {%rs6370, %rs6371, %rs6372, %rs6373}, [matrix+3184]; + cvt.u32.u16 %r9792, %rs6373; + cvt.s32.s8 %r9793, %r9792; + cvt.u32.u16 %r9794, %rs6372; + cvt.s32.s8 %r9795, %r9794; + cvt.u32.u16 %r9796, %rs6371; + cvt.s32.s8 %r9797, %r9796; + cvt.u32.u16 %r9798, %rs6370; + cvt.s32.s8 %r9799, %r9798; + mad.lo.s32 %r9800, %r271, %r9799, %r9791; + mad.lo.s32 %r9801, %r91, %r9797, %r9800; + mad.lo.s32 %r9802, %r93, %r9795, %r9801; + mad.lo.s32 %r9803, %r94, %r9793, %r9802; + ld.const.v4.u8 {%rs6378, %rs6379, %rs6380, %rs6381}, [matrix+3188]; + cvt.u32.u16 %r9804, %rs6381; + cvt.s32.s8 %r9805, %r9804; + cvt.u32.u16 %r9806, %rs6380; + cvt.s32.s8 %r9807, %r9806; + cvt.u32.u16 %r9808, %rs6379; + cvt.s32.s8 %r9809, %r9808; + cvt.u32.u16 %r9810, %rs6378; + cvt.s32.s8 %r9811, %r9810; + mad.lo.s32 %r9812, %r96, %r9811, %r9803; + mad.lo.s32 %r9813, %r97, %r9809, %r9812; + mad.lo.s32 %r9814, %r99, %r9807, %r9813; + mad.lo.s32 %r9815, %r100, %r9805, %r9814; + ld.const.v4.u8 {%rs6386, %rs6387, %rs6388, %rs6389}, [matrix+3192]; + cvt.u32.u16 %r9816, %rs6389; + cvt.s32.s8 %r9817, %r9816; + cvt.u32.u16 %r9818, %rs6388; + cvt.s32.s8 %r9819, %r9818; + cvt.u32.u16 %r9820, %rs6387; + cvt.s32.s8 %r9821, %r9820; + cvt.u32.u16 %r9822, %rs6386; + cvt.s32.s8 %r9823, %r9822; + mad.lo.s32 %r9824, %r103, %r9823, %r9815; + mad.lo.s32 %r9825, %r104, %r9821, %r9824; + mad.lo.s32 %r9826, %r107, %r9819, %r9825; 
+ mad.lo.s32 %r9827, %r108, %r9817, %r9826; + ld.const.v4.u8 {%rs6394, %rs6395, %rs6396, %rs6397}, [matrix+3196]; + cvt.u32.u16 %r9828, %rs6397; + cvt.s32.s8 %r9829, %r9828; + cvt.u32.u16 %r9830, %rs6396; + cvt.s32.s8 %r9831, %r9830; + cvt.u32.u16 %r9832, %rs6395; + cvt.s32.s8 %r9833, %r9832; + cvt.u32.u16 %r9834, %rs6394; + cvt.s32.s8 %r9835, %r9834; + mad.lo.s32 %r9836, %r111, %r9835, %r9827; + mad.lo.s32 %r9837, %r112, %r9833, %r9836; + mad.lo.s32 %r9838, %r114, %r9831, %r9837; + mad.lo.s32 %r9839, %r115, %r9829, %r9838; + shr.u32 %r9840, %r9647, 6; + and.b32 %r9841, %r9840, 240; + shr.u32 %r9842, %r9839, 10; + or.b32 %r9843, %r9842, %r9841; + xor.b32 %r9844, %r90, %r9843; + ld.const.v4.u8 {%rs6402, %rs6403, %rs6404, %rs6405}, [matrix+3200]; + cvt.u32.u16 %r9845, %rs6405; + cvt.s32.s8 %r9846, %r9845; + cvt.u32.u16 %r9847, %rs6404; + cvt.s32.s8 %r9848, %r9847; + cvt.u32.u16 %r9849, %rs6402; + cvt.s32.s8 %r9850, %r9849; + cvt.u32.u16 %r9851, %rs6403; + cvt.s32.s8 %r9852, %r9851; + mul.lo.s32 %r9853, %r34, %r9852; + mad.lo.s32 %r9854, %r124, %r9850, %r9853; + mad.lo.s32 %r9855, %r35, %r9848, %r9854; + mad.lo.s32 %r9856, %r36, %r9846, %r9855; + ld.const.v4.u8 {%rs6410, %rs6411, %rs6412, %rs6413}, [matrix+3204]; + cvt.u32.u16 %r9857, %rs6413; + cvt.s32.s8 %r9858, %r9857; + cvt.u32.u16 %r9859, %rs6412; + cvt.s32.s8 %r9860, %r9859; + cvt.u32.u16 %r9861, %rs6411; + cvt.s32.s8 %r9862, %r9861; + cvt.u32.u16 %r9863, %rs6410; + cvt.s32.s8 %r9864, %r9863; + mad.lo.s32 %r9865, %r37, %r9864, %r9856; + mad.lo.s32 %r9866, %r38, %r9862, %r9865; + mad.lo.s32 %r9867, %r39, %r9860, %r9866; + mad.lo.s32 %r9868, %r40, %r9858, %r9867; + ld.const.v4.u8 {%rs6418, %rs6419, %rs6420, %rs6421}, [matrix+3208]; + cvt.u32.u16 %r9869, %rs6421; + cvt.s32.s8 %r9870, %r9869; + cvt.u32.u16 %r9871, %rs6420; + cvt.s32.s8 %r9872, %r9871; + cvt.u32.u16 %r9873, %rs6419; + cvt.s32.s8 %r9874, %r9873; + cvt.u32.u16 %r9875, %rs6418; + cvt.s32.s8 %r9876, %r9875; + mad.lo.s32 %r9877, %r42, %r9876, %r9868; + mad.lo.s32 %r9878, %r43, %r9874, %r9877; + mad.lo.s32 %r9879, %r45, %r9872, %r9878; + mad.lo.s32 %r9880, %r46, %r9870, %r9879; + ld.const.v4.u8 {%rs6426, %rs6427, %rs6428, %rs6429}, [matrix+3212]; + cvt.u32.u16 %r9881, %rs6429; + cvt.s32.s8 %r9882, %r9881; + cvt.u32.u16 %r9883, %rs6428; + cvt.s32.s8 %r9884, %r9883; + cvt.u32.u16 %r9885, %rs6427; + cvt.s32.s8 %r9886, %r9885; + cvt.u32.u16 %r9887, %rs6426; + cvt.s32.s8 %r9888, %r9887; + mad.lo.s32 %r9889, %r48, %r9888, %r9880; + mad.lo.s32 %r9890, %r49, %r9886, %r9889; + mad.lo.s32 %r9891, %r50, %r9884, %r9890; + mad.lo.s32 %r9892, %r51, %r9882, %r9891; + ld.const.v4.u8 {%rs6434, %rs6435, %rs6436, %rs6437}, [matrix+3216]; + cvt.u32.u16 %r9893, %rs6437; + cvt.s32.s8 %r9894, %r9893; + cvt.u32.u16 %r9895, %rs6436; + cvt.s32.s8 %r9896, %r9895; + cvt.u32.u16 %r9897, %rs6435; + cvt.s32.s8 %r9898, %r9897; + cvt.u32.u16 %r9899, %rs6434; + cvt.s32.s8 %r9900, %r9899; + mad.lo.s32 %r9901, %r173, %r9900, %r9892; + mad.lo.s32 %r9902, %r53, %r9898, %r9901; + mad.lo.s32 %r9903, %r54, %r9896, %r9902; + mad.lo.s32 %r9904, %r55, %r9894, %r9903; + ld.const.v4.u8 {%rs6442, %rs6443, %rs6444, %rs6445}, [matrix+3220]; + cvt.u32.u16 %r9905, %rs6445; + cvt.s32.s8 %r9906, %r9905; + cvt.u32.u16 %r9907, %rs6444; + cvt.s32.s8 %r9908, %r9907; + cvt.u32.u16 %r9909, %rs6443; + cvt.s32.s8 %r9910, %r9909; + cvt.u32.u16 %r9911, %rs6442; + cvt.s32.s8 %r9912, %r9911; + mad.lo.s32 %r9913, %r56, %r9912, %r9904; + mad.lo.s32 %r9914, %r57, %r9910, %r9913; + mad.lo.s32 %r9915, %r58, %r9908, %r9914; + mad.lo.s32 %r9916, %r59, 
%r9906, %r9915; + ld.const.v4.u8 {%rs6450, %rs6451, %rs6452, %rs6453}, [matrix+3224]; + cvt.u32.u16 %r9917, %rs6453; + cvt.s32.s8 %r9918, %r9917; + cvt.u32.u16 %r9919, %rs6452; + cvt.s32.s8 %r9920, %r9919; + cvt.u32.u16 %r9921, %rs6451; + cvt.s32.s8 %r9922, %r9921; + cvt.u32.u16 %r9923, %rs6450; + cvt.s32.s8 %r9924, %r9923; + mad.lo.s32 %r9925, %r61, %r9924, %r9916; + mad.lo.s32 %r9926, %r62, %r9922, %r9925; + mad.lo.s32 %r9927, %r64, %r9920, %r9926; + mad.lo.s32 %r9928, %r65, %r9918, %r9927; + ld.const.v4.u8 {%rs6458, %rs6459, %rs6460, %rs6461}, [matrix+3228]; + cvt.u32.u16 %r9929, %rs6461; + cvt.s32.s8 %r9930, %r9929; + cvt.u32.u16 %r9931, %rs6460; + cvt.s32.s8 %r9932, %r9931; + cvt.u32.u16 %r9933, %rs6459; + cvt.s32.s8 %r9934, %r9933; + cvt.u32.u16 %r9935, %rs6458; + cvt.s32.s8 %r9936, %r9935; + mad.lo.s32 %r9937, %r67, %r9936, %r9928; + mad.lo.s32 %r9938, %r68, %r9934, %r9937; + mad.lo.s32 %r9939, %r69, %r9932, %r9938; + mad.lo.s32 %r9940, %r70, %r9930, %r9939; + ld.const.v4.u8 {%rs6466, %rs6467, %rs6468, %rs6469}, [matrix+3232]; + cvt.u32.u16 %r9941, %rs6469; + cvt.s32.s8 %r9942, %r9941; + cvt.u32.u16 %r9943, %rs6468; + cvt.s32.s8 %r9944, %r9943; + cvt.u32.u16 %r9945, %rs6467; + cvt.s32.s8 %r9946, %r9945; + cvt.u32.u16 %r9947, %rs6466; + cvt.s32.s8 %r9948, %r9947; + mad.lo.s32 %r9949, %r222, %r9948, %r9940; + mad.lo.s32 %r9950, %r72, %r9946, %r9949; + mad.lo.s32 %r9951, %r73, %r9944, %r9950; + mad.lo.s32 %r9952, %r74, %r9942, %r9951; + ld.const.v4.u8 {%rs6474, %rs6475, %rs6476, %rs6477}, [matrix+3236]; + cvt.u32.u16 %r9953, %rs6477; + cvt.s32.s8 %r9954, %r9953; + cvt.u32.u16 %r9955, %rs6476; + cvt.s32.s8 %r9956, %r9955; + cvt.u32.u16 %r9957, %rs6475; + cvt.s32.s8 %r9958, %r9957; + cvt.u32.u16 %r9959, %rs6474; + cvt.s32.s8 %r9960, %r9959; + mad.lo.s32 %r9961, %r75, %r9960, %r9952; + mad.lo.s32 %r9962, %r76, %r9958, %r9961; + mad.lo.s32 %r9963, %r77, %r9956, %r9962; + mad.lo.s32 %r9964, %r78, %r9954, %r9963; + ld.const.v4.u8 {%rs6482, %rs6483, %rs6484, %rs6485}, [matrix+3240]; + cvt.u32.u16 %r9965, %rs6485; + cvt.s32.s8 %r9966, %r9965; + cvt.u32.u16 %r9967, %rs6484; + cvt.s32.s8 %r9968, %r9967; + cvt.u32.u16 %r9969, %rs6483; + cvt.s32.s8 %r9970, %r9969; + cvt.u32.u16 %r9971, %rs6482; + cvt.s32.s8 %r9972, %r9971; + mad.lo.s32 %r9973, %r80, %r9972, %r9964; + mad.lo.s32 %r9974, %r81, %r9970, %r9973; + mad.lo.s32 %r9975, %r83, %r9968, %r9974; + mad.lo.s32 %r9976, %r84, %r9966, %r9975; + ld.const.v4.u8 {%rs6490, %rs6491, %rs6492, %rs6493}, [matrix+3244]; + cvt.u32.u16 %r9977, %rs6493; + cvt.s32.s8 %r9978, %r9977; + cvt.u32.u16 %r9979, %rs6492; + cvt.s32.s8 %r9980, %r9979; + cvt.u32.u16 %r9981, %rs6491; + cvt.s32.s8 %r9982, %r9981; + cvt.u32.u16 %r9983, %rs6490; + cvt.s32.s8 %r9984, %r9983; + mad.lo.s32 %r9985, %r86, %r9984, %r9976; + mad.lo.s32 %r9986, %r87, %r9982, %r9985; + mad.lo.s32 %r9987, %r88, %r9980, %r9986; + mad.lo.s32 %r9988, %r89, %r9978, %r9987; + ld.const.v4.u8 {%rs6498, %rs6499, %rs6500, %rs6501}, [matrix+3248]; + cvt.u32.u16 %r9989, %rs6501; + cvt.s32.s8 %r9990, %r9989; + cvt.u32.u16 %r9991, %rs6500; + cvt.s32.s8 %r9992, %r9991; + cvt.u32.u16 %r9993, %rs6499; + cvt.s32.s8 %r9994, %r9993; + cvt.u32.u16 %r9995, %rs6498; + cvt.s32.s8 %r9996, %r9995; + mad.lo.s32 %r9997, %r271, %r9996, %r9988; + mad.lo.s32 %r9998, %r91, %r9994, %r9997; + mad.lo.s32 %r9999, %r93, %r9992, %r9998; + mad.lo.s32 %r10000, %r94, %r9990, %r9999; + ld.const.v4.u8 {%rs6506, %rs6507, %rs6508, %rs6509}, [matrix+3252]; + cvt.u32.u16 %r10001, %rs6509; + cvt.s32.s8 %r10002, %r10001; + cvt.u32.u16 %r10003, 
%rs6508; + cvt.s32.s8 %r10004, %r10003; + cvt.u32.u16 %r10005, %rs6507; + cvt.s32.s8 %r10006, %r10005; + cvt.u32.u16 %r10007, %rs6506; + cvt.s32.s8 %r10008, %r10007; + mad.lo.s32 %r10009, %r96, %r10008, %r10000; + mad.lo.s32 %r10010, %r97, %r10006, %r10009; + mad.lo.s32 %r10011, %r99, %r10004, %r10010; + mad.lo.s32 %r10012, %r100, %r10002, %r10011; + ld.const.v4.u8 {%rs6514, %rs6515, %rs6516, %rs6517}, [matrix+3256]; + cvt.u32.u16 %r10013, %rs6517; + cvt.s32.s8 %r10014, %r10013; + cvt.u32.u16 %r10015, %rs6516; + cvt.s32.s8 %r10016, %r10015; + cvt.u32.u16 %r10017, %rs6515; + cvt.s32.s8 %r10018, %r10017; + cvt.u32.u16 %r10019, %rs6514; + cvt.s32.s8 %r10020, %r10019; + mad.lo.s32 %r10021, %r103, %r10020, %r10012; + mad.lo.s32 %r10022, %r104, %r10018, %r10021; + mad.lo.s32 %r10023, %r107, %r10016, %r10022; + mad.lo.s32 %r10024, %r108, %r10014, %r10023; + ld.const.v4.u8 {%rs6522, %rs6523, %rs6524, %rs6525}, [matrix+3260]; + cvt.u32.u16 %r10025, %rs6525; + cvt.s32.s8 %r10026, %r10025; + cvt.u32.u16 %r10027, %rs6524; + cvt.s32.s8 %r10028, %r10027; + cvt.u32.u16 %r10029, %rs6523; + cvt.s32.s8 %r10030, %r10029; + cvt.u32.u16 %r10031, %rs6522; + cvt.s32.s8 %r10032, %r10031; + mad.lo.s32 %r10033, %r111, %r10032, %r10024; + mad.lo.s32 %r10034, %r112, %r10030, %r10033; + mad.lo.s32 %r10035, %r114, %r10028, %r10034; + mad.lo.s32 %r10036, %r115, %r10026, %r10035; + ld.const.v4.u8 {%rs6530, %rs6531, %rs6532, %rs6533}, [matrix+3264]; + cvt.u32.u16 %r10037, %rs6533; + cvt.s32.s8 %r10038, %r10037; + cvt.u32.u16 %r10039, %rs6532; + cvt.s32.s8 %r10040, %r10039; + cvt.u32.u16 %r10041, %rs6530; + cvt.s32.s8 %r10042, %r10041; + cvt.u32.u16 %r10043, %rs6531; + cvt.s32.s8 %r10044, %r10043; + mul.lo.s32 %r10045, %r34, %r10044; + mad.lo.s32 %r10046, %r124, %r10042, %r10045; + mad.lo.s32 %r10047, %r35, %r10040, %r10046; + mad.lo.s32 %r10048, %r36, %r10038, %r10047; + ld.const.v4.u8 {%rs6538, %rs6539, %rs6540, %rs6541}, [matrix+3268]; + cvt.u32.u16 %r10049, %rs6541; + cvt.s32.s8 %r10050, %r10049; + cvt.u32.u16 %r10051, %rs6540; + cvt.s32.s8 %r10052, %r10051; + cvt.u32.u16 %r10053, %rs6539; + cvt.s32.s8 %r10054, %r10053; + cvt.u32.u16 %r10055, %rs6538; + cvt.s32.s8 %r10056, %r10055; + mad.lo.s32 %r10057, %r37, %r10056, %r10048; + mad.lo.s32 %r10058, %r38, %r10054, %r10057; + mad.lo.s32 %r10059, %r39, %r10052, %r10058; + mad.lo.s32 %r10060, %r40, %r10050, %r10059; + ld.const.v4.u8 {%rs6546, %rs6547, %rs6548, %rs6549}, [matrix+3272]; + cvt.u32.u16 %r10061, %rs6549; + cvt.s32.s8 %r10062, %r10061; + cvt.u32.u16 %r10063, %rs6548; + cvt.s32.s8 %r10064, %r10063; + cvt.u32.u16 %r10065, %rs6547; + cvt.s32.s8 %r10066, %r10065; + cvt.u32.u16 %r10067, %rs6546; + cvt.s32.s8 %r10068, %r10067; + mad.lo.s32 %r10069, %r42, %r10068, %r10060; + mad.lo.s32 %r10070, %r43, %r10066, %r10069; + mad.lo.s32 %r10071, %r45, %r10064, %r10070; + mad.lo.s32 %r10072, %r46, %r10062, %r10071; + ld.const.v4.u8 {%rs6554, %rs6555, %rs6556, %rs6557}, [matrix+3276]; + cvt.u32.u16 %r10073, %rs6557; + cvt.s32.s8 %r10074, %r10073; + cvt.u32.u16 %r10075, %rs6556; + cvt.s32.s8 %r10076, %r10075; + cvt.u32.u16 %r10077, %rs6555; + cvt.s32.s8 %r10078, %r10077; + cvt.u32.u16 %r10079, %rs6554; + cvt.s32.s8 %r10080, %r10079; + mad.lo.s32 %r10081, %r48, %r10080, %r10072; + mad.lo.s32 %r10082, %r49, %r10078, %r10081; + mad.lo.s32 %r10083, %r50, %r10076, %r10082; + mad.lo.s32 %r10084, %r51, %r10074, %r10083; + ld.const.v4.u8 {%rs6562, %rs6563, %rs6564, %rs6565}, [matrix+3280]; + cvt.u32.u16 %r10085, %rs6565; + cvt.s32.s8 %r10086, %r10085; + cvt.u32.u16 %r10087, %rs6564; + 
cvt.s32.s8 %r10088, %r10087; + cvt.u32.u16 %r10089, %rs6563; + cvt.s32.s8 %r10090, %r10089; + cvt.u32.u16 %r10091, %rs6562; + cvt.s32.s8 %r10092, %r10091; + mad.lo.s32 %r10093, %r173, %r10092, %r10084; + mad.lo.s32 %r10094, %r53, %r10090, %r10093; + mad.lo.s32 %r10095, %r54, %r10088, %r10094; + mad.lo.s32 %r10096, %r55, %r10086, %r10095; + ld.const.v4.u8 {%rs6570, %rs6571, %rs6572, %rs6573}, [matrix+3284]; + cvt.u32.u16 %r10097, %rs6573; + cvt.s32.s8 %r10098, %r10097; + cvt.u32.u16 %r10099, %rs6572; + cvt.s32.s8 %r10100, %r10099; + cvt.u32.u16 %r10101, %rs6571; + cvt.s32.s8 %r10102, %r10101; + cvt.u32.u16 %r10103, %rs6570; + cvt.s32.s8 %r10104, %r10103; + mad.lo.s32 %r10105, %r56, %r10104, %r10096; + mad.lo.s32 %r10106, %r57, %r10102, %r10105; + mad.lo.s32 %r10107, %r58, %r10100, %r10106; + mad.lo.s32 %r10108, %r59, %r10098, %r10107; + ld.const.v4.u8 {%rs6578, %rs6579, %rs6580, %rs6581}, [matrix+3288]; + cvt.u32.u16 %r10109, %rs6581; + cvt.s32.s8 %r10110, %r10109; + cvt.u32.u16 %r10111, %rs6580; + cvt.s32.s8 %r10112, %r10111; + cvt.u32.u16 %r10113, %rs6579; + cvt.s32.s8 %r10114, %r10113; + cvt.u32.u16 %r10115, %rs6578; + cvt.s32.s8 %r10116, %r10115; + mad.lo.s32 %r10117, %r61, %r10116, %r10108; + mad.lo.s32 %r10118, %r62, %r10114, %r10117; + mad.lo.s32 %r10119, %r64, %r10112, %r10118; + mad.lo.s32 %r10120, %r65, %r10110, %r10119; + ld.const.v4.u8 {%rs6586, %rs6587, %rs6588, %rs6589}, [matrix+3292]; + cvt.u32.u16 %r10121, %rs6589; + cvt.s32.s8 %r10122, %r10121; + cvt.u32.u16 %r10123, %rs6588; + cvt.s32.s8 %r10124, %r10123; + cvt.u32.u16 %r10125, %rs6587; + cvt.s32.s8 %r10126, %r10125; + cvt.u32.u16 %r10127, %rs6586; + cvt.s32.s8 %r10128, %r10127; + mad.lo.s32 %r10129, %r67, %r10128, %r10120; + mad.lo.s32 %r10130, %r68, %r10126, %r10129; + mad.lo.s32 %r10131, %r69, %r10124, %r10130; + mad.lo.s32 %r10132, %r70, %r10122, %r10131; + ld.const.v4.u8 {%rs6594, %rs6595, %rs6596, %rs6597}, [matrix+3296]; + cvt.u32.u16 %r10133, %rs6597; + cvt.s32.s8 %r10134, %r10133; + cvt.u32.u16 %r10135, %rs6596; + cvt.s32.s8 %r10136, %r10135; + cvt.u32.u16 %r10137, %rs6595; + cvt.s32.s8 %r10138, %r10137; + cvt.u32.u16 %r10139, %rs6594; + cvt.s32.s8 %r10140, %r10139; + mad.lo.s32 %r10141, %r222, %r10140, %r10132; + mad.lo.s32 %r10142, %r72, %r10138, %r10141; + mad.lo.s32 %r10143, %r73, %r10136, %r10142; + mad.lo.s32 %r10144, %r74, %r10134, %r10143; + ld.const.v4.u8 {%rs6602, %rs6603, %rs6604, %rs6605}, [matrix+3300]; + cvt.u32.u16 %r10145, %rs6605; + cvt.s32.s8 %r10146, %r10145; + cvt.u32.u16 %r10147, %rs6604; + cvt.s32.s8 %r10148, %r10147; + cvt.u32.u16 %r10149, %rs6603; + cvt.s32.s8 %r10150, %r10149; + cvt.u32.u16 %r10151, %rs6602; + cvt.s32.s8 %r10152, %r10151; + mad.lo.s32 %r10153, %r75, %r10152, %r10144; + mad.lo.s32 %r10154, %r76, %r10150, %r10153; + mad.lo.s32 %r10155, %r77, %r10148, %r10154; + mad.lo.s32 %r10156, %r78, %r10146, %r10155; + ld.const.v4.u8 {%rs6610, %rs6611, %rs6612, %rs6613}, [matrix+3304]; + cvt.u32.u16 %r10157, %rs6613; + cvt.s32.s8 %r10158, %r10157; + cvt.u32.u16 %r10159, %rs6612; + cvt.s32.s8 %r10160, %r10159; + cvt.u32.u16 %r10161, %rs6611; + cvt.s32.s8 %r10162, %r10161; + cvt.u32.u16 %r10163, %rs6610; + cvt.s32.s8 %r10164, %r10163; + mad.lo.s32 %r10165, %r80, %r10164, %r10156; + mad.lo.s32 %r10166, %r81, %r10162, %r10165; + mad.lo.s32 %r10167, %r83, %r10160, %r10166; + mad.lo.s32 %r10168, %r84, %r10158, %r10167; + ld.const.v4.u8 {%rs6618, %rs6619, %rs6620, %rs6621}, [matrix+3308]; + cvt.u32.u16 %r10169, %rs6621; + cvt.s32.s8 %r10170, %r10169; + cvt.u32.u16 %r10171, %rs6620; + cvt.s32.s8 
%r10172, %r10171; + cvt.u32.u16 %r10173, %rs6619; + cvt.s32.s8 %r10174, %r10173; + cvt.u32.u16 %r10175, %rs6618; + cvt.s32.s8 %r10176, %r10175; + mad.lo.s32 %r10177, %r86, %r10176, %r10168; + mad.lo.s32 %r10178, %r87, %r10174, %r10177; + mad.lo.s32 %r10179, %r88, %r10172, %r10178; + mad.lo.s32 %r10180, %r89, %r10170, %r10179; + ld.const.v4.u8 {%rs6626, %rs6627, %rs6628, %rs6629}, [matrix+3312]; + cvt.u32.u16 %r10181, %rs6629; + cvt.s32.s8 %r10182, %r10181; + cvt.u32.u16 %r10183, %rs6628; + cvt.s32.s8 %r10184, %r10183; + cvt.u32.u16 %r10185, %rs6627; + cvt.s32.s8 %r10186, %r10185; + cvt.u32.u16 %r10187, %rs6626; + cvt.s32.s8 %r10188, %r10187; + mad.lo.s32 %r10189, %r271, %r10188, %r10180; + mad.lo.s32 %r10190, %r91, %r10186, %r10189; + mad.lo.s32 %r10191, %r93, %r10184, %r10190; + mad.lo.s32 %r10192, %r94, %r10182, %r10191; + ld.const.v4.u8 {%rs6634, %rs6635, %rs6636, %rs6637}, [matrix+3316]; + cvt.u32.u16 %r10193, %rs6637; + cvt.s32.s8 %r10194, %r10193; + cvt.u32.u16 %r10195, %rs6636; + cvt.s32.s8 %r10196, %r10195; + cvt.u32.u16 %r10197, %rs6635; + cvt.s32.s8 %r10198, %r10197; + cvt.u32.u16 %r10199, %rs6634; + cvt.s32.s8 %r10200, %r10199; + mad.lo.s32 %r10201, %r96, %r10200, %r10192; + mad.lo.s32 %r10202, %r97, %r10198, %r10201; + mad.lo.s32 %r10203, %r99, %r10196, %r10202; + mad.lo.s32 %r10204, %r100, %r10194, %r10203; + ld.const.v4.u8 {%rs6642, %rs6643, %rs6644, %rs6645}, [matrix+3320]; + cvt.u32.u16 %r10205, %rs6645; + cvt.s32.s8 %r10206, %r10205; + cvt.u32.u16 %r10207, %rs6644; + cvt.s32.s8 %r10208, %r10207; + cvt.u32.u16 %r10209, %rs6643; + cvt.s32.s8 %r10210, %r10209; + cvt.u32.u16 %r10211, %rs6642; + cvt.s32.s8 %r10212, %r10211; + mad.lo.s32 %r10213, %r103, %r10212, %r10204; + mad.lo.s32 %r10214, %r104, %r10210, %r10213; + mad.lo.s32 %r10215, %r107, %r10208, %r10214; + mad.lo.s32 %r10216, %r108, %r10206, %r10215; + ld.const.v4.u8 {%rs6650, %rs6651, %rs6652, %rs6653}, [matrix+3324]; + cvt.u32.u16 %r10217, %rs6653; + cvt.s32.s8 %r10218, %r10217; + cvt.u32.u16 %r10219, %rs6652; + cvt.s32.s8 %r10220, %r10219; + cvt.u32.u16 %r10221, %rs6651; + cvt.s32.s8 %r10222, %r10221; + cvt.u32.u16 %r10223, %rs6650; + cvt.s32.s8 %r10224, %r10223; + mad.lo.s32 %r10225, %r111, %r10224, %r10216; + mad.lo.s32 %r10226, %r112, %r10222, %r10225; + mad.lo.s32 %r10227, %r114, %r10220, %r10226; + mad.lo.s32 %r10228, %r115, %r10218, %r10227; + shr.u32 %r10229, %r10036, 6; + and.b32 %r10230, %r10229, 240; + shr.u32 %r10231, %r10228, 10; + or.b32 %r10232, %r10231, %r10230; + xor.b32 %r10233, %r92, %r10232; + cvt.u64.u32 %rd399, %r10233; + ld.const.v4.u8 {%rs6658, %rs6659, %rs6660, %rs6661}, [matrix+3328]; + cvt.u32.u16 %r10234, %rs6661; + cvt.s32.s8 %r10235, %r10234; + cvt.u32.u16 %r10236, %rs6660; + cvt.s32.s8 %r10237, %r10236; + cvt.u32.u16 %r10238, %rs6658; + cvt.s32.s8 %r10239, %r10238; + cvt.u32.u16 %r10240, %rs6659; + cvt.s32.s8 %r10241, %r10240; + mul.lo.s32 %r10242, %r34, %r10241; + mad.lo.s32 %r10243, %r124, %r10239, %r10242; + mad.lo.s32 %r10244, %r35, %r10237, %r10243; + mad.lo.s32 %r10245, %r36, %r10235, %r10244; + ld.const.v4.u8 {%rs6666, %rs6667, %rs6668, %rs6669}, [matrix+3332]; + cvt.u32.u16 %r10246, %rs6669; + cvt.s32.s8 %r10247, %r10246; + cvt.u32.u16 %r10248, %rs6668; + cvt.s32.s8 %r10249, %r10248; + cvt.u32.u16 %r10250, %rs6667; + cvt.s32.s8 %r10251, %r10250; + cvt.u32.u16 %r10252, %rs6666; + cvt.s32.s8 %r10253, %r10252; + mad.lo.s32 %r10254, %r37, %r10253, %r10245; + mad.lo.s32 %r10255, %r38, %r10251, %r10254; + mad.lo.s32 %r10256, %r39, %r10249, %r10255; + mad.lo.s32 %r10257, %r40, %r10247, 
%r10256; + ld.const.v4.u8 {%rs6674, %rs6675, %rs6676, %rs6677}, [matrix+3336]; + cvt.u32.u16 %r10258, %rs6677; + cvt.s32.s8 %r10259, %r10258; + cvt.u32.u16 %r10260, %rs6676; + cvt.s32.s8 %r10261, %r10260; + cvt.u32.u16 %r10262, %rs6675; + cvt.s32.s8 %r10263, %r10262; + cvt.u32.u16 %r10264, %rs6674; + cvt.s32.s8 %r10265, %r10264; + mad.lo.s32 %r10266, %r42, %r10265, %r10257; + mad.lo.s32 %r10267, %r43, %r10263, %r10266; + mad.lo.s32 %r10268, %r45, %r10261, %r10267; + mad.lo.s32 %r10269, %r46, %r10259, %r10268; + ld.const.v4.u8 {%rs6682, %rs6683, %rs6684, %rs6685}, [matrix+3340]; + cvt.u32.u16 %r10270, %rs6685; + cvt.s32.s8 %r10271, %r10270; + cvt.u32.u16 %r10272, %rs6684; + cvt.s32.s8 %r10273, %r10272; + cvt.u32.u16 %r10274, %rs6683; + cvt.s32.s8 %r10275, %r10274; + cvt.u32.u16 %r10276, %rs6682; + cvt.s32.s8 %r10277, %r10276; + mad.lo.s32 %r10278, %r48, %r10277, %r10269; + mad.lo.s32 %r10279, %r49, %r10275, %r10278; + mad.lo.s32 %r10280, %r50, %r10273, %r10279; + mad.lo.s32 %r10281, %r51, %r10271, %r10280; + ld.const.v4.u8 {%rs6690, %rs6691, %rs6692, %rs6693}, [matrix+3344]; + cvt.u32.u16 %r10282, %rs6693; + cvt.s32.s8 %r10283, %r10282; + cvt.u32.u16 %r10284, %rs6692; + cvt.s32.s8 %r10285, %r10284; + cvt.u32.u16 %r10286, %rs6691; + cvt.s32.s8 %r10287, %r10286; + cvt.u32.u16 %r10288, %rs6690; + cvt.s32.s8 %r10289, %r10288; + mad.lo.s32 %r10290, %r173, %r10289, %r10281; + mad.lo.s32 %r10291, %r53, %r10287, %r10290; + mad.lo.s32 %r10292, %r54, %r10285, %r10291; + mad.lo.s32 %r10293, %r55, %r10283, %r10292; + ld.const.v4.u8 {%rs6698, %rs6699, %rs6700, %rs6701}, [matrix+3348]; + cvt.u32.u16 %r10294, %rs6701; + cvt.s32.s8 %r10295, %r10294; + cvt.u32.u16 %r10296, %rs6700; + cvt.s32.s8 %r10297, %r10296; + cvt.u32.u16 %r10298, %rs6699; + cvt.s32.s8 %r10299, %r10298; + cvt.u32.u16 %r10300, %rs6698; + cvt.s32.s8 %r10301, %r10300; + mad.lo.s32 %r10302, %r56, %r10301, %r10293; + mad.lo.s32 %r10303, %r57, %r10299, %r10302; + mad.lo.s32 %r10304, %r58, %r10297, %r10303; + mad.lo.s32 %r10305, %r59, %r10295, %r10304; + ld.const.v4.u8 {%rs6706, %rs6707, %rs6708, %rs6709}, [matrix+3352]; + cvt.u32.u16 %r10306, %rs6709; + cvt.s32.s8 %r10307, %r10306; + cvt.u32.u16 %r10308, %rs6708; + cvt.s32.s8 %r10309, %r10308; + cvt.u32.u16 %r10310, %rs6707; + cvt.s32.s8 %r10311, %r10310; + cvt.u32.u16 %r10312, %rs6706; + cvt.s32.s8 %r10313, %r10312; + mad.lo.s32 %r10314, %r61, %r10313, %r10305; + mad.lo.s32 %r10315, %r62, %r10311, %r10314; + mad.lo.s32 %r10316, %r64, %r10309, %r10315; + mad.lo.s32 %r10317, %r65, %r10307, %r10316; + ld.const.v4.u8 {%rs6714, %rs6715, %rs6716, %rs6717}, [matrix+3356]; + cvt.u32.u16 %r10318, %rs6717; + cvt.s32.s8 %r10319, %r10318; + cvt.u32.u16 %r10320, %rs6716; + cvt.s32.s8 %r10321, %r10320; + cvt.u32.u16 %r10322, %rs6715; + cvt.s32.s8 %r10323, %r10322; + cvt.u32.u16 %r10324, %rs6714; + cvt.s32.s8 %r10325, %r10324; + mad.lo.s32 %r10326, %r67, %r10325, %r10317; + mad.lo.s32 %r10327, %r68, %r10323, %r10326; + mad.lo.s32 %r10328, %r69, %r10321, %r10327; + mad.lo.s32 %r10329, %r70, %r10319, %r10328; + ld.const.v4.u8 {%rs6722, %rs6723, %rs6724, %rs6725}, [matrix+3360]; + cvt.u32.u16 %r10330, %rs6725; + cvt.s32.s8 %r10331, %r10330; + cvt.u32.u16 %r10332, %rs6724; + cvt.s32.s8 %r10333, %r10332; + cvt.u32.u16 %r10334, %rs6723; + cvt.s32.s8 %r10335, %r10334; + cvt.u32.u16 %r10336, %rs6722; + cvt.s32.s8 %r10337, %r10336; + mad.lo.s32 %r10338, %r222, %r10337, %r10329; + mad.lo.s32 %r10339, %r72, %r10335, %r10338; + mad.lo.s32 %r10340, %r73, %r10333, %r10339; + mad.lo.s32 %r10341, %r74, %r10331, %r10340; + 
ld.const.v4.u8 {%rs6730, %rs6731, %rs6732, %rs6733}, [matrix+3364]; + cvt.u32.u16 %r10342, %rs6733; + cvt.s32.s8 %r10343, %r10342; + cvt.u32.u16 %r10344, %rs6732; + cvt.s32.s8 %r10345, %r10344; + cvt.u32.u16 %r10346, %rs6731; + cvt.s32.s8 %r10347, %r10346; + cvt.u32.u16 %r10348, %rs6730; + cvt.s32.s8 %r10349, %r10348; + mad.lo.s32 %r10350, %r75, %r10349, %r10341; + mad.lo.s32 %r10351, %r76, %r10347, %r10350; + mad.lo.s32 %r10352, %r77, %r10345, %r10351; + mad.lo.s32 %r10353, %r78, %r10343, %r10352; + ld.const.v4.u8 {%rs6738, %rs6739, %rs6740, %rs6741}, [matrix+3368]; + cvt.u32.u16 %r10354, %rs6741; + cvt.s32.s8 %r10355, %r10354; + cvt.u32.u16 %r10356, %rs6740; + cvt.s32.s8 %r10357, %r10356; + cvt.u32.u16 %r10358, %rs6739; + cvt.s32.s8 %r10359, %r10358; + cvt.u32.u16 %r10360, %rs6738; + cvt.s32.s8 %r10361, %r10360; + mad.lo.s32 %r10362, %r80, %r10361, %r10353; + mad.lo.s32 %r10363, %r81, %r10359, %r10362; + mad.lo.s32 %r10364, %r83, %r10357, %r10363; + mad.lo.s32 %r10365, %r84, %r10355, %r10364; + ld.const.v4.u8 {%rs6746, %rs6747, %rs6748, %rs6749}, [matrix+3372]; + cvt.u32.u16 %r10366, %rs6749; + cvt.s32.s8 %r10367, %r10366; + cvt.u32.u16 %r10368, %rs6748; + cvt.s32.s8 %r10369, %r10368; + cvt.u32.u16 %r10370, %rs6747; + cvt.s32.s8 %r10371, %r10370; + cvt.u32.u16 %r10372, %rs6746; + cvt.s32.s8 %r10373, %r10372; + mad.lo.s32 %r10374, %r86, %r10373, %r10365; + mad.lo.s32 %r10375, %r87, %r10371, %r10374; + mad.lo.s32 %r10376, %r88, %r10369, %r10375; + mad.lo.s32 %r10377, %r89, %r10367, %r10376; + ld.const.v4.u8 {%rs6754, %rs6755, %rs6756, %rs6757}, [matrix+3376]; + cvt.u32.u16 %r10378, %rs6757; + cvt.s32.s8 %r10379, %r10378; + cvt.u32.u16 %r10380, %rs6756; + cvt.s32.s8 %r10381, %r10380; + cvt.u32.u16 %r10382, %rs6755; + cvt.s32.s8 %r10383, %r10382; + cvt.u32.u16 %r10384, %rs6754; + cvt.s32.s8 %r10385, %r10384; + mad.lo.s32 %r10386, %r271, %r10385, %r10377; + mad.lo.s32 %r10387, %r91, %r10383, %r10386; + mad.lo.s32 %r10388, %r93, %r10381, %r10387; + mad.lo.s32 %r10389, %r94, %r10379, %r10388; + ld.const.v4.u8 {%rs6762, %rs6763, %rs6764, %rs6765}, [matrix+3380]; + cvt.u32.u16 %r10390, %rs6765; + cvt.s32.s8 %r10391, %r10390; + cvt.u32.u16 %r10392, %rs6764; + cvt.s32.s8 %r10393, %r10392; + cvt.u32.u16 %r10394, %rs6763; + cvt.s32.s8 %r10395, %r10394; + cvt.u32.u16 %r10396, %rs6762; + cvt.s32.s8 %r10397, %r10396; + mad.lo.s32 %r10398, %r96, %r10397, %r10389; + mad.lo.s32 %r10399, %r97, %r10395, %r10398; + mad.lo.s32 %r10400, %r99, %r10393, %r10399; + mad.lo.s32 %r10401, %r100, %r10391, %r10400; + ld.const.v4.u8 {%rs6770, %rs6771, %rs6772, %rs6773}, [matrix+3384]; + cvt.u32.u16 %r10402, %rs6773; + cvt.s32.s8 %r10403, %r10402; + cvt.u32.u16 %r10404, %rs6772; + cvt.s32.s8 %r10405, %r10404; + cvt.u32.u16 %r10406, %rs6771; + cvt.s32.s8 %r10407, %r10406; + cvt.u32.u16 %r10408, %rs6770; + cvt.s32.s8 %r10409, %r10408; + mad.lo.s32 %r10410, %r103, %r10409, %r10401; + mad.lo.s32 %r10411, %r104, %r10407, %r10410; + mad.lo.s32 %r10412, %r107, %r10405, %r10411; + mad.lo.s32 %r10413, %r108, %r10403, %r10412; + ld.const.v4.u8 {%rs6778, %rs6779, %rs6780, %rs6781}, [matrix+3388]; + cvt.u32.u16 %r10414, %rs6781; + cvt.s32.s8 %r10415, %r10414; + cvt.u32.u16 %r10416, %rs6780; + cvt.s32.s8 %r10417, %r10416; + cvt.u32.u16 %r10418, %rs6779; + cvt.s32.s8 %r10419, %r10418; + cvt.u32.u16 %r10420, %rs6778; + cvt.s32.s8 %r10421, %r10420; + mad.lo.s32 %r10422, %r111, %r10421, %r10413; + mad.lo.s32 %r10423, %r112, %r10419, %r10422; + mad.lo.s32 %r10424, %r114, %r10417, %r10423; + mad.lo.s32 %r10425, %r115, %r10415, %r10424; + 
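+ // One 64-entry row of `matrix` ends here: sixteen ld.const.v4.u8 loads,
+ // each byte widened via cvt.u32.u16 and sign-extended via cvt.s32.s8, then
+ // folded into a running dot product with mad.lo.s32. The next row starts
+ // at [matrix+3392]; after every second row, a shr/and/or sequence packs the
+ // two products into one byte that xor.b32 folds into the input word.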
ld.const.v4.u8 {%rs6786, %rs6787, %rs6788, %rs6789}, [matrix+3392]; + cvt.u32.u16 %r10426, %rs6789; + cvt.s32.s8 %r10427, %r10426; + cvt.u32.u16 %r10428, %rs6788; + cvt.s32.s8 %r10429, %r10428; + cvt.u32.u16 %r10430, %rs6786; + cvt.s32.s8 %r10431, %r10430; + cvt.u32.u16 %r10432, %rs6787; + cvt.s32.s8 %r10433, %r10432; + mul.lo.s32 %r10434, %r34, %r10433; + mad.lo.s32 %r10435, %r124, %r10431, %r10434; + mad.lo.s32 %r10436, %r35, %r10429, %r10435; + mad.lo.s32 %r10437, %r36, %r10427, %r10436; + ld.const.v4.u8 {%rs6794, %rs6795, %rs6796, %rs6797}, [matrix+3396]; + cvt.u32.u16 %r10438, %rs6797; + cvt.s32.s8 %r10439, %r10438; + cvt.u32.u16 %r10440, %rs6796; + cvt.s32.s8 %r10441, %r10440; + cvt.u32.u16 %r10442, %rs6795; + cvt.s32.s8 %r10443, %r10442; + cvt.u32.u16 %r10444, %rs6794; + cvt.s32.s8 %r10445, %r10444; + mad.lo.s32 %r10446, %r37, %r10445, %r10437; + mad.lo.s32 %r10447, %r38, %r10443, %r10446; + mad.lo.s32 %r10448, %r39, %r10441, %r10447; + mad.lo.s32 %r10449, %r40, %r10439, %r10448; + ld.const.v4.u8 {%rs6802, %rs6803, %rs6804, %rs6805}, [matrix+3400]; + cvt.u32.u16 %r10450, %rs6805; + cvt.s32.s8 %r10451, %r10450; + cvt.u32.u16 %r10452, %rs6804; + cvt.s32.s8 %r10453, %r10452; + cvt.u32.u16 %r10454, %rs6803; + cvt.s32.s8 %r10455, %r10454; + cvt.u32.u16 %r10456, %rs6802; + cvt.s32.s8 %r10457, %r10456; + mad.lo.s32 %r10458, %r42, %r10457, %r10449; + mad.lo.s32 %r10459, %r43, %r10455, %r10458; + mad.lo.s32 %r10460, %r45, %r10453, %r10459; + mad.lo.s32 %r10461, %r46, %r10451, %r10460; + ld.const.v4.u8 {%rs6810, %rs6811, %rs6812, %rs6813}, [matrix+3404]; + cvt.u32.u16 %r10462, %rs6813; + cvt.s32.s8 %r10463, %r10462; + cvt.u32.u16 %r10464, %rs6812; + cvt.s32.s8 %r10465, %r10464; + cvt.u32.u16 %r10466, %rs6811; + cvt.s32.s8 %r10467, %r10466; + cvt.u32.u16 %r10468, %rs6810; + cvt.s32.s8 %r10469, %r10468; + mad.lo.s32 %r10470, %r48, %r10469, %r10461; + mad.lo.s32 %r10471, %r49, %r10467, %r10470; + mad.lo.s32 %r10472, %r50, %r10465, %r10471; + mad.lo.s32 %r10473, %r51, %r10463, %r10472; + ld.const.v4.u8 {%rs6818, %rs6819, %rs6820, %rs6821}, [matrix+3408]; + cvt.u32.u16 %r10474, %rs6821; + cvt.s32.s8 %r10475, %r10474; + cvt.u32.u16 %r10476, %rs6820; + cvt.s32.s8 %r10477, %r10476; + cvt.u32.u16 %r10478, %rs6819; + cvt.s32.s8 %r10479, %r10478; + cvt.u32.u16 %r10480, %rs6818; + cvt.s32.s8 %r10481, %r10480; + mad.lo.s32 %r10482, %r173, %r10481, %r10473; + mad.lo.s32 %r10483, %r53, %r10479, %r10482; + mad.lo.s32 %r10484, %r54, %r10477, %r10483; + mad.lo.s32 %r10485, %r55, %r10475, %r10484; + ld.const.v4.u8 {%rs6826, %rs6827, %rs6828, %rs6829}, [matrix+3412]; + cvt.u32.u16 %r10486, %rs6829; + cvt.s32.s8 %r10487, %r10486; + cvt.u32.u16 %r10488, %rs6828; + cvt.s32.s8 %r10489, %r10488; + cvt.u32.u16 %r10490, %rs6827; + cvt.s32.s8 %r10491, %r10490; + cvt.u32.u16 %r10492, %rs6826; + cvt.s32.s8 %r10493, %r10492; + mad.lo.s32 %r10494, %r56, %r10493, %r10485; + mad.lo.s32 %r10495, %r57, %r10491, %r10494; + mad.lo.s32 %r10496, %r58, %r10489, %r10495; + mad.lo.s32 %r10497, %r59, %r10487, %r10496; + ld.const.v4.u8 {%rs6834, %rs6835, %rs6836, %rs6837}, [matrix+3416]; + cvt.u32.u16 %r10498, %rs6837; + cvt.s32.s8 %r10499, %r10498; + cvt.u32.u16 %r10500, %rs6836; + cvt.s32.s8 %r10501, %r10500; + cvt.u32.u16 %r10502, %rs6835; + cvt.s32.s8 %r10503, %r10502; + cvt.u32.u16 %r10504, %rs6834; + cvt.s32.s8 %r10505, %r10504; + mad.lo.s32 %r10506, %r61, %r10505, %r10497; + mad.lo.s32 %r10507, %r62, %r10503, %r10506; + mad.lo.s32 %r10508, %r64, %r10501, %r10507; + mad.lo.s32 %r10509, %r65, %r10499, %r10508; + ld.const.v4.u8 
{%rs6842, %rs6843, %rs6844, %rs6845}, [matrix+3420]; + cvt.u32.u16 %r10510, %rs6845; + cvt.s32.s8 %r10511, %r10510; + cvt.u32.u16 %r10512, %rs6844; + cvt.s32.s8 %r10513, %r10512; + cvt.u32.u16 %r10514, %rs6843; + cvt.s32.s8 %r10515, %r10514; + cvt.u32.u16 %r10516, %rs6842; + cvt.s32.s8 %r10517, %r10516; + mad.lo.s32 %r10518, %r67, %r10517, %r10509; + mad.lo.s32 %r10519, %r68, %r10515, %r10518; + mad.lo.s32 %r10520, %r69, %r10513, %r10519; + mad.lo.s32 %r10521, %r70, %r10511, %r10520; + ld.const.v4.u8 {%rs6850, %rs6851, %rs6852, %rs6853}, [matrix+3424]; + cvt.u32.u16 %r10522, %rs6853; + cvt.s32.s8 %r10523, %r10522; + cvt.u32.u16 %r10524, %rs6852; + cvt.s32.s8 %r10525, %r10524; + cvt.u32.u16 %r10526, %rs6851; + cvt.s32.s8 %r10527, %r10526; + cvt.u32.u16 %r10528, %rs6850; + cvt.s32.s8 %r10529, %r10528; + mad.lo.s32 %r10530, %r222, %r10529, %r10521; + mad.lo.s32 %r10531, %r72, %r10527, %r10530; + mad.lo.s32 %r10532, %r73, %r10525, %r10531; + mad.lo.s32 %r10533, %r74, %r10523, %r10532; + ld.const.v4.u8 {%rs6858, %rs6859, %rs6860, %rs6861}, [matrix+3428]; + cvt.u32.u16 %r10534, %rs6861; + cvt.s32.s8 %r10535, %r10534; + cvt.u32.u16 %r10536, %rs6860; + cvt.s32.s8 %r10537, %r10536; + cvt.u32.u16 %r10538, %rs6859; + cvt.s32.s8 %r10539, %r10538; + cvt.u32.u16 %r10540, %rs6858; + cvt.s32.s8 %r10541, %r10540; + mad.lo.s32 %r10542, %r75, %r10541, %r10533; + mad.lo.s32 %r10543, %r76, %r10539, %r10542; + mad.lo.s32 %r10544, %r77, %r10537, %r10543; + mad.lo.s32 %r10545, %r78, %r10535, %r10544; + ld.const.v4.u8 {%rs6866, %rs6867, %rs6868, %rs6869}, [matrix+3432]; + cvt.u32.u16 %r10546, %rs6869; + cvt.s32.s8 %r10547, %r10546; + cvt.u32.u16 %r10548, %rs6868; + cvt.s32.s8 %r10549, %r10548; + cvt.u32.u16 %r10550, %rs6867; + cvt.s32.s8 %r10551, %r10550; + cvt.u32.u16 %r10552, %rs6866; + cvt.s32.s8 %r10553, %r10552; + mad.lo.s32 %r10554, %r80, %r10553, %r10545; + mad.lo.s32 %r10555, %r81, %r10551, %r10554; + mad.lo.s32 %r10556, %r83, %r10549, %r10555; + mad.lo.s32 %r10557, %r84, %r10547, %r10556; + ld.const.v4.u8 {%rs6874, %rs6875, %rs6876, %rs6877}, [matrix+3436]; + cvt.u32.u16 %r10558, %rs6877; + cvt.s32.s8 %r10559, %r10558; + cvt.u32.u16 %r10560, %rs6876; + cvt.s32.s8 %r10561, %r10560; + cvt.u32.u16 %r10562, %rs6875; + cvt.s32.s8 %r10563, %r10562; + cvt.u32.u16 %r10564, %rs6874; + cvt.s32.s8 %r10565, %r10564; + mad.lo.s32 %r10566, %r86, %r10565, %r10557; + mad.lo.s32 %r10567, %r87, %r10563, %r10566; + mad.lo.s32 %r10568, %r88, %r10561, %r10567; + mad.lo.s32 %r10569, %r89, %r10559, %r10568; + ld.const.v4.u8 {%rs6882, %rs6883, %rs6884, %rs6885}, [matrix+3440]; + cvt.u32.u16 %r10570, %rs6885; + cvt.s32.s8 %r10571, %r10570; + cvt.u32.u16 %r10572, %rs6884; + cvt.s32.s8 %r10573, %r10572; + cvt.u32.u16 %r10574, %rs6883; + cvt.s32.s8 %r10575, %r10574; + cvt.u32.u16 %r10576, %rs6882; + cvt.s32.s8 %r10577, %r10576; + mad.lo.s32 %r10578, %r271, %r10577, %r10569; + mad.lo.s32 %r10579, %r91, %r10575, %r10578; + mad.lo.s32 %r10580, %r93, %r10573, %r10579; + mad.lo.s32 %r10581, %r94, %r10571, %r10580; + ld.const.v4.u8 {%rs6890, %rs6891, %rs6892, %rs6893}, [matrix+3444]; + cvt.u32.u16 %r10582, %rs6893; + cvt.s32.s8 %r10583, %r10582; + cvt.u32.u16 %r10584, %rs6892; + cvt.s32.s8 %r10585, %r10584; + cvt.u32.u16 %r10586, %rs6891; + cvt.s32.s8 %r10587, %r10586; + cvt.u32.u16 %r10588, %rs6890; + cvt.s32.s8 %r10589, %r10588; + mad.lo.s32 %r10590, %r96, %r10589, %r10581; + mad.lo.s32 %r10591, %r97, %r10587, %r10590; + mad.lo.s32 %r10592, %r99, %r10585, %r10591; + mad.lo.s32 %r10593, %r100, %r10583, %r10592; + ld.const.v4.u8 {%rs6898, 
%rs6899, %rs6900, %rs6901}, [matrix+3448]; + cvt.u32.u16 %r10594, %rs6901; + cvt.s32.s8 %r10595, %r10594; + cvt.u32.u16 %r10596, %rs6900; + cvt.s32.s8 %r10597, %r10596; + cvt.u32.u16 %r10598, %rs6899; + cvt.s32.s8 %r10599, %r10598; + cvt.u32.u16 %r10600, %rs6898; + cvt.s32.s8 %r10601, %r10600; + mad.lo.s32 %r10602, %r103, %r10601, %r10593; + mad.lo.s32 %r10603, %r104, %r10599, %r10602; + mad.lo.s32 %r10604, %r107, %r10597, %r10603; + mad.lo.s32 %r10605, %r108, %r10595, %r10604; + ld.const.v4.u8 {%rs6906, %rs6907, %rs6908, %rs6909}, [matrix+3452]; + cvt.u32.u16 %r10606, %rs6909; + cvt.s32.s8 %r10607, %r10606; + cvt.u32.u16 %r10608, %rs6908; + cvt.s32.s8 %r10609, %r10608; + cvt.u32.u16 %r10610, %rs6907; + cvt.s32.s8 %r10611, %r10610; + cvt.u32.u16 %r10612, %rs6906; + cvt.s32.s8 %r10613, %r10612; + mad.lo.s32 %r10614, %r111, %r10613, %r10605; + mad.lo.s32 %r10615, %r112, %r10611, %r10614; + mad.lo.s32 %r10616, %r114, %r10609, %r10615; + mad.lo.s32 %r10617, %r115, %r10607, %r10616; + shr.u32 %r10618, %r10425, 6; + and.b32 %r10619, %r10618, 240; + shr.u32 %r10620, %r10617, 10; + or.b32 %r10621, %r10620, %r10619; + xor.b32 %r10622, %r95, %r10621; + cvt.u64.u32 %rd400, %r10622; + ld.const.v4.u8 {%rs6914, %rs6915, %rs6916, %rs6917}, [matrix+3456]; + cvt.u32.u16 %r10623, %rs6917; + cvt.s32.s8 %r10624, %r10623; + cvt.u32.u16 %r10625, %rs6916; + cvt.s32.s8 %r10626, %r10625; + cvt.u32.u16 %r10627, %rs6914; + cvt.s32.s8 %r10628, %r10627; + cvt.u32.u16 %r10629, %rs6915; + cvt.s32.s8 %r10630, %r10629; + mul.lo.s32 %r10631, %r34, %r10630; + mad.lo.s32 %r10632, %r124, %r10628, %r10631; + mad.lo.s32 %r10633, %r35, %r10626, %r10632; + mad.lo.s32 %r10634, %r36, %r10624, %r10633; + ld.const.v4.u8 {%rs6922, %rs6923, %rs6924, %rs6925}, [matrix+3460]; + cvt.u32.u16 %r10635, %rs6925; + cvt.s32.s8 %r10636, %r10635; + cvt.u32.u16 %r10637, %rs6924; + cvt.s32.s8 %r10638, %r10637; + cvt.u32.u16 %r10639, %rs6923; + cvt.s32.s8 %r10640, %r10639; + cvt.u32.u16 %r10641, %rs6922; + cvt.s32.s8 %r10642, %r10641; + mad.lo.s32 %r10643, %r37, %r10642, %r10634; + mad.lo.s32 %r10644, %r38, %r10640, %r10643; + mad.lo.s32 %r10645, %r39, %r10638, %r10644; + mad.lo.s32 %r10646, %r40, %r10636, %r10645; + ld.const.v4.u8 {%rs6930, %rs6931, %rs6932, %rs6933}, [matrix+3464]; + cvt.u32.u16 %r10647, %rs6933; + cvt.s32.s8 %r10648, %r10647; + cvt.u32.u16 %r10649, %rs6932; + cvt.s32.s8 %r10650, %r10649; + cvt.u32.u16 %r10651, %rs6931; + cvt.s32.s8 %r10652, %r10651; + cvt.u32.u16 %r10653, %rs6930; + cvt.s32.s8 %r10654, %r10653; + mad.lo.s32 %r10655, %r42, %r10654, %r10646; + mad.lo.s32 %r10656, %r43, %r10652, %r10655; + mad.lo.s32 %r10657, %r45, %r10650, %r10656; + mad.lo.s32 %r10658, %r46, %r10648, %r10657; + ld.const.v4.u8 {%rs6938, %rs6939, %rs6940, %rs6941}, [matrix+3468]; + cvt.u32.u16 %r10659, %rs6941; + cvt.s32.s8 %r10660, %r10659; + cvt.u32.u16 %r10661, %rs6940; + cvt.s32.s8 %r10662, %r10661; + cvt.u32.u16 %r10663, %rs6939; + cvt.s32.s8 %r10664, %r10663; + cvt.u32.u16 %r10665, %rs6938; + cvt.s32.s8 %r10666, %r10665; + mad.lo.s32 %r10667, %r48, %r10666, %r10658; + mad.lo.s32 %r10668, %r49, %r10664, %r10667; + mad.lo.s32 %r10669, %r50, %r10662, %r10668; + mad.lo.s32 %r10670, %r51, %r10660, %r10669; + ld.const.v4.u8 {%rs6946, %rs6947, %rs6948, %rs6949}, [matrix+3472]; + cvt.u32.u16 %r10671, %rs6949; + cvt.s32.s8 %r10672, %r10671; + cvt.u32.u16 %r10673, %rs6948; + cvt.s32.s8 %r10674, %r10673; + cvt.u32.u16 %r10675, %rs6947; + cvt.s32.s8 %r10676, %r10675; + cvt.u32.u16 %r10677, %rs6946; + cvt.s32.s8 %r10678, %r10677; + mad.lo.s32 %r10679, 
%r173, %r10678, %r10670; + mad.lo.s32 %r10680, %r53, %r10676, %r10679; + mad.lo.s32 %r10681, %r54, %r10674, %r10680; + mad.lo.s32 %r10682, %r55, %r10672, %r10681; + ld.const.v4.u8 {%rs6954, %rs6955, %rs6956, %rs6957}, [matrix+3476]; + cvt.u32.u16 %r10683, %rs6957; + cvt.s32.s8 %r10684, %r10683; + cvt.u32.u16 %r10685, %rs6956; + cvt.s32.s8 %r10686, %r10685; + cvt.u32.u16 %r10687, %rs6955; + cvt.s32.s8 %r10688, %r10687; + cvt.u32.u16 %r10689, %rs6954; + cvt.s32.s8 %r10690, %r10689; + mad.lo.s32 %r10691, %r56, %r10690, %r10682; + mad.lo.s32 %r10692, %r57, %r10688, %r10691; + mad.lo.s32 %r10693, %r58, %r10686, %r10692; + mad.lo.s32 %r10694, %r59, %r10684, %r10693; + ld.const.v4.u8 {%rs6962, %rs6963, %rs6964, %rs6965}, [matrix+3480]; + cvt.u32.u16 %r10695, %rs6965; + cvt.s32.s8 %r10696, %r10695; + cvt.u32.u16 %r10697, %rs6964; + cvt.s32.s8 %r10698, %r10697; + cvt.u32.u16 %r10699, %rs6963; + cvt.s32.s8 %r10700, %r10699; + cvt.u32.u16 %r10701, %rs6962; + cvt.s32.s8 %r10702, %r10701; + mad.lo.s32 %r10703, %r61, %r10702, %r10694; + mad.lo.s32 %r10704, %r62, %r10700, %r10703; + mad.lo.s32 %r10705, %r64, %r10698, %r10704; + mad.lo.s32 %r10706, %r65, %r10696, %r10705; + ld.const.v4.u8 {%rs6970, %rs6971, %rs6972, %rs6973}, [matrix+3484]; + cvt.u32.u16 %r10707, %rs6973; + cvt.s32.s8 %r10708, %r10707; + cvt.u32.u16 %r10709, %rs6972; + cvt.s32.s8 %r10710, %r10709; + cvt.u32.u16 %r10711, %rs6971; + cvt.s32.s8 %r10712, %r10711; + cvt.u32.u16 %r10713, %rs6970; + cvt.s32.s8 %r10714, %r10713; + mad.lo.s32 %r10715, %r67, %r10714, %r10706; + mad.lo.s32 %r10716, %r68, %r10712, %r10715; + mad.lo.s32 %r10717, %r69, %r10710, %r10716; + mad.lo.s32 %r10718, %r70, %r10708, %r10717; + ld.const.v4.u8 {%rs6978, %rs6979, %rs6980, %rs6981}, [matrix+3488]; + cvt.u32.u16 %r10719, %rs6981; + cvt.s32.s8 %r10720, %r10719; + cvt.u32.u16 %r10721, %rs6980; + cvt.s32.s8 %r10722, %r10721; + cvt.u32.u16 %r10723, %rs6979; + cvt.s32.s8 %r10724, %r10723; + cvt.u32.u16 %r10725, %rs6978; + cvt.s32.s8 %r10726, %r10725; + mad.lo.s32 %r10727, %r222, %r10726, %r10718; + mad.lo.s32 %r10728, %r72, %r10724, %r10727; + mad.lo.s32 %r10729, %r73, %r10722, %r10728; + mad.lo.s32 %r10730, %r74, %r10720, %r10729; + ld.const.v4.u8 {%rs6986, %rs6987, %rs6988, %rs6989}, [matrix+3492]; + cvt.u32.u16 %r10731, %rs6989; + cvt.s32.s8 %r10732, %r10731; + cvt.u32.u16 %r10733, %rs6988; + cvt.s32.s8 %r10734, %r10733; + cvt.u32.u16 %r10735, %rs6987; + cvt.s32.s8 %r10736, %r10735; + cvt.u32.u16 %r10737, %rs6986; + cvt.s32.s8 %r10738, %r10737; + mad.lo.s32 %r10739, %r75, %r10738, %r10730; + mad.lo.s32 %r10740, %r76, %r10736, %r10739; + mad.lo.s32 %r10741, %r77, %r10734, %r10740; + mad.lo.s32 %r10742, %r78, %r10732, %r10741; + ld.const.v4.u8 {%rs6994, %rs6995, %rs6996, %rs6997}, [matrix+3496]; + cvt.u32.u16 %r10743, %rs6997; + cvt.s32.s8 %r10744, %r10743; + cvt.u32.u16 %r10745, %rs6996; + cvt.s32.s8 %r10746, %r10745; + cvt.u32.u16 %r10747, %rs6995; + cvt.s32.s8 %r10748, %r10747; + cvt.u32.u16 %r10749, %rs6994; + cvt.s32.s8 %r10750, %r10749; + mad.lo.s32 %r10751, %r80, %r10750, %r10742; + mad.lo.s32 %r10752, %r81, %r10748, %r10751; + mad.lo.s32 %r10753, %r83, %r10746, %r10752; + mad.lo.s32 %r10754, %r84, %r10744, %r10753; + ld.const.v4.u8 {%rs7002, %rs7003, %rs7004, %rs7005}, [matrix+3500]; + cvt.u32.u16 %r10755, %rs7005; + cvt.s32.s8 %r10756, %r10755; + cvt.u32.u16 %r10757, %rs7004; + cvt.s32.s8 %r10758, %r10757; + cvt.u32.u16 %r10759, %rs7003; + cvt.s32.s8 %r10760, %r10759; + cvt.u32.u16 %r10761, %rs7002; + cvt.s32.s8 %r10762, %r10761; + mad.lo.s32 %r10763, %r86, 
%r10762, %r10754; + mad.lo.s32 %r10764, %r87, %r10760, %r10763; + mad.lo.s32 %r10765, %r88, %r10758, %r10764; + mad.lo.s32 %r10766, %r89, %r10756, %r10765; + ld.const.v4.u8 {%rs7010, %rs7011, %rs7012, %rs7013}, [matrix+3504]; + cvt.u32.u16 %r10767, %rs7013; + cvt.s32.s8 %r10768, %r10767; + cvt.u32.u16 %r10769, %rs7012; + cvt.s32.s8 %r10770, %r10769; + cvt.u32.u16 %r10771, %rs7011; + cvt.s32.s8 %r10772, %r10771; + cvt.u32.u16 %r10773, %rs7010; + cvt.s32.s8 %r10774, %r10773; + mad.lo.s32 %r10775, %r271, %r10774, %r10766; + mad.lo.s32 %r10776, %r91, %r10772, %r10775; + mad.lo.s32 %r10777, %r93, %r10770, %r10776; + mad.lo.s32 %r10778, %r94, %r10768, %r10777; + ld.const.v4.u8 {%rs7018, %rs7019, %rs7020, %rs7021}, [matrix+3508]; + cvt.u32.u16 %r10779, %rs7021; + cvt.s32.s8 %r10780, %r10779; + cvt.u32.u16 %r10781, %rs7020; + cvt.s32.s8 %r10782, %r10781; + cvt.u32.u16 %r10783, %rs7019; + cvt.s32.s8 %r10784, %r10783; + cvt.u32.u16 %r10785, %rs7018; + cvt.s32.s8 %r10786, %r10785; + mad.lo.s32 %r10787, %r96, %r10786, %r10778; + mad.lo.s32 %r10788, %r97, %r10784, %r10787; + mad.lo.s32 %r10789, %r99, %r10782, %r10788; + mad.lo.s32 %r10790, %r100, %r10780, %r10789; + ld.const.v4.u8 {%rs7026, %rs7027, %rs7028, %rs7029}, [matrix+3512]; + cvt.u32.u16 %r10791, %rs7029; + cvt.s32.s8 %r10792, %r10791; + cvt.u32.u16 %r10793, %rs7028; + cvt.s32.s8 %r10794, %r10793; + cvt.u32.u16 %r10795, %rs7027; + cvt.s32.s8 %r10796, %r10795; + cvt.u32.u16 %r10797, %rs7026; + cvt.s32.s8 %r10798, %r10797; + mad.lo.s32 %r10799, %r103, %r10798, %r10790; + mad.lo.s32 %r10800, %r104, %r10796, %r10799; + mad.lo.s32 %r10801, %r107, %r10794, %r10800; + mad.lo.s32 %r10802, %r108, %r10792, %r10801; + ld.const.v4.u8 {%rs7034, %rs7035, %rs7036, %rs7037}, [matrix+3516]; + cvt.u32.u16 %r10803, %rs7037; + cvt.s32.s8 %r10804, %r10803; + cvt.u32.u16 %r10805, %rs7036; + cvt.s32.s8 %r10806, %r10805; + cvt.u32.u16 %r10807, %rs7035; + cvt.s32.s8 %r10808, %r10807; + cvt.u32.u16 %r10809, %rs7034; + cvt.s32.s8 %r10810, %r10809; + mad.lo.s32 %r10811, %r111, %r10810, %r10802; + mad.lo.s32 %r10812, %r112, %r10808, %r10811; + mad.lo.s32 %r10813, %r114, %r10806, %r10812; + mad.lo.s32 %r10814, %r115, %r10804, %r10813; + ld.const.v4.u8 {%rs7042, %rs7043, %rs7044, %rs7045}, [matrix+3520]; + cvt.u32.u16 %r10815, %rs7045; + cvt.s32.s8 %r10816, %r10815; + cvt.u32.u16 %r10817, %rs7044; + cvt.s32.s8 %r10818, %r10817; + cvt.u32.u16 %r10819, %rs7042; + cvt.s32.s8 %r10820, %r10819; + cvt.u32.u16 %r10821, %rs7043; + cvt.s32.s8 %r10822, %r10821; + mul.lo.s32 %r10823, %r34, %r10822; + mad.lo.s32 %r10824, %r124, %r10820, %r10823; + mad.lo.s32 %r10825, %r35, %r10818, %r10824; + mad.lo.s32 %r10826, %r36, %r10816, %r10825; + ld.const.v4.u8 {%rs7050, %rs7051, %rs7052, %rs7053}, [matrix+3524]; + cvt.u32.u16 %r10827, %rs7053; + cvt.s32.s8 %r10828, %r10827; + cvt.u32.u16 %r10829, %rs7052; + cvt.s32.s8 %r10830, %r10829; + cvt.u32.u16 %r10831, %rs7051; + cvt.s32.s8 %r10832, %r10831; + cvt.u32.u16 %r10833, %rs7050; + cvt.s32.s8 %r10834, %r10833; + mad.lo.s32 %r10835, %r37, %r10834, %r10826; + mad.lo.s32 %r10836, %r38, %r10832, %r10835; + mad.lo.s32 %r10837, %r39, %r10830, %r10836; + mad.lo.s32 %r10838, %r40, %r10828, %r10837; + ld.const.v4.u8 {%rs7058, %rs7059, %rs7060, %rs7061}, [matrix+3528]; + cvt.u32.u16 %r10839, %rs7061; + cvt.s32.s8 %r10840, %r10839; + cvt.u32.u16 %r10841, %rs7060; + cvt.s32.s8 %r10842, %r10841; + cvt.u32.u16 %r10843, %rs7059; + cvt.s32.s8 %r10844, %r10843; + cvt.u32.u16 %r10845, %rs7058; + cvt.s32.s8 %r10846, %r10845; + mad.lo.s32 %r10847, %r42, %r10846, 
%r10838; + mad.lo.s32 %r10848, %r43, %r10844, %r10847; + mad.lo.s32 %r10849, %r45, %r10842, %r10848; + mad.lo.s32 %r10850, %r46, %r10840, %r10849; + ld.const.v4.u8 {%rs7066, %rs7067, %rs7068, %rs7069}, [matrix+3532]; + cvt.u32.u16 %r10851, %rs7069; + cvt.s32.s8 %r10852, %r10851; + cvt.u32.u16 %r10853, %rs7068; + cvt.s32.s8 %r10854, %r10853; + cvt.u32.u16 %r10855, %rs7067; + cvt.s32.s8 %r10856, %r10855; + cvt.u32.u16 %r10857, %rs7066; + cvt.s32.s8 %r10858, %r10857; + mad.lo.s32 %r10859, %r48, %r10858, %r10850; + mad.lo.s32 %r10860, %r49, %r10856, %r10859; + mad.lo.s32 %r10861, %r50, %r10854, %r10860; + mad.lo.s32 %r10862, %r51, %r10852, %r10861; + ld.const.v4.u8 {%rs7074, %rs7075, %rs7076, %rs7077}, [matrix+3536]; + cvt.u32.u16 %r10863, %rs7077; + cvt.s32.s8 %r10864, %r10863; + cvt.u32.u16 %r10865, %rs7076; + cvt.s32.s8 %r10866, %r10865; + cvt.u32.u16 %r10867, %rs7075; + cvt.s32.s8 %r10868, %r10867; + cvt.u32.u16 %r10869, %rs7074; + cvt.s32.s8 %r10870, %r10869; + mad.lo.s32 %r10871, %r173, %r10870, %r10862; + mad.lo.s32 %r10872, %r53, %r10868, %r10871; + mad.lo.s32 %r10873, %r54, %r10866, %r10872; + mad.lo.s32 %r10874, %r55, %r10864, %r10873; + ld.const.v4.u8 {%rs7082, %rs7083, %rs7084, %rs7085}, [matrix+3540]; + cvt.u32.u16 %r10875, %rs7085; + cvt.s32.s8 %r10876, %r10875; + cvt.u32.u16 %r10877, %rs7084; + cvt.s32.s8 %r10878, %r10877; + cvt.u32.u16 %r10879, %rs7083; + cvt.s32.s8 %r10880, %r10879; + cvt.u32.u16 %r10881, %rs7082; + cvt.s32.s8 %r10882, %r10881; + mad.lo.s32 %r10883, %r56, %r10882, %r10874; + mad.lo.s32 %r10884, %r57, %r10880, %r10883; + mad.lo.s32 %r10885, %r58, %r10878, %r10884; + mad.lo.s32 %r10886, %r59, %r10876, %r10885; + ld.const.v4.u8 {%rs7090, %rs7091, %rs7092, %rs7093}, [matrix+3544]; + cvt.u32.u16 %r10887, %rs7093; + cvt.s32.s8 %r10888, %r10887; + cvt.u32.u16 %r10889, %rs7092; + cvt.s32.s8 %r10890, %r10889; + cvt.u32.u16 %r10891, %rs7091; + cvt.s32.s8 %r10892, %r10891; + cvt.u32.u16 %r10893, %rs7090; + cvt.s32.s8 %r10894, %r10893; + mad.lo.s32 %r10895, %r61, %r10894, %r10886; + mad.lo.s32 %r10896, %r62, %r10892, %r10895; + mad.lo.s32 %r10897, %r64, %r10890, %r10896; + mad.lo.s32 %r10898, %r65, %r10888, %r10897; + ld.const.v4.u8 {%rs7098, %rs7099, %rs7100, %rs7101}, [matrix+3548]; + cvt.u32.u16 %r10899, %rs7101; + cvt.s32.s8 %r10900, %r10899; + cvt.u32.u16 %r10901, %rs7100; + cvt.s32.s8 %r10902, %r10901; + cvt.u32.u16 %r10903, %rs7099; + cvt.s32.s8 %r10904, %r10903; + cvt.u32.u16 %r10905, %rs7098; + cvt.s32.s8 %r10906, %r10905; + mad.lo.s32 %r10907, %r67, %r10906, %r10898; + mad.lo.s32 %r10908, %r68, %r10904, %r10907; + mad.lo.s32 %r10909, %r69, %r10902, %r10908; + mad.lo.s32 %r10910, %r70, %r10900, %r10909; + ld.const.v4.u8 {%rs7106, %rs7107, %rs7108, %rs7109}, [matrix+3552]; + cvt.u32.u16 %r10911, %rs7109; + cvt.s32.s8 %r10912, %r10911; + cvt.u32.u16 %r10913, %rs7108; + cvt.s32.s8 %r10914, %r10913; + cvt.u32.u16 %r10915, %rs7107; + cvt.s32.s8 %r10916, %r10915; + cvt.u32.u16 %r10917, %rs7106; + cvt.s32.s8 %r10918, %r10917; + mad.lo.s32 %r10919, %r222, %r10918, %r10910; + mad.lo.s32 %r10920, %r72, %r10916, %r10919; + mad.lo.s32 %r10921, %r73, %r10914, %r10920; + mad.lo.s32 %r10922, %r74, %r10912, %r10921; + ld.const.v4.u8 {%rs7114, %rs7115, %rs7116, %rs7117}, [matrix+3556]; + cvt.u32.u16 %r10923, %rs7117; + cvt.s32.s8 %r10924, %r10923; + cvt.u32.u16 %r10925, %rs7116; + cvt.s32.s8 %r10926, %r10925; + cvt.u32.u16 %r10927, %rs7115; + cvt.s32.s8 %r10928, %r10927; + cvt.u32.u16 %r10929, %rs7114; + cvt.s32.s8 %r10930, %r10929; + mad.lo.s32 %r10931, %r75, %r10930, %r10922; + 
mad.lo.s32 %r10932, %r76, %r10928, %r10931; + mad.lo.s32 %r10933, %r77, %r10926, %r10932; + mad.lo.s32 %r10934, %r78, %r10924, %r10933; + ld.const.v4.u8 {%rs7122, %rs7123, %rs7124, %rs7125}, [matrix+3560]; + cvt.u32.u16 %r10935, %rs7125; + cvt.s32.s8 %r10936, %r10935; + cvt.u32.u16 %r10937, %rs7124; + cvt.s32.s8 %r10938, %r10937; + cvt.u32.u16 %r10939, %rs7123; + cvt.s32.s8 %r10940, %r10939; + cvt.u32.u16 %r10941, %rs7122; + cvt.s32.s8 %r10942, %r10941; + mad.lo.s32 %r10943, %r80, %r10942, %r10934; + mad.lo.s32 %r10944, %r81, %r10940, %r10943; + mad.lo.s32 %r10945, %r83, %r10938, %r10944; + mad.lo.s32 %r10946, %r84, %r10936, %r10945; + ld.const.v4.u8 {%rs7130, %rs7131, %rs7132, %rs7133}, [matrix+3564]; + cvt.u32.u16 %r10947, %rs7133; + cvt.s32.s8 %r10948, %r10947; + cvt.u32.u16 %r10949, %rs7132; + cvt.s32.s8 %r10950, %r10949; + cvt.u32.u16 %r10951, %rs7131; + cvt.s32.s8 %r10952, %r10951; + cvt.u32.u16 %r10953, %rs7130; + cvt.s32.s8 %r10954, %r10953; + mad.lo.s32 %r10955, %r86, %r10954, %r10946; + mad.lo.s32 %r10956, %r87, %r10952, %r10955; + mad.lo.s32 %r10957, %r88, %r10950, %r10956; + mad.lo.s32 %r10958, %r89, %r10948, %r10957; + ld.const.v4.u8 {%rs7138, %rs7139, %rs7140, %rs7141}, [matrix+3568]; + cvt.u32.u16 %r10959, %rs7141; + cvt.s32.s8 %r10960, %r10959; + cvt.u32.u16 %r10961, %rs7140; + cvt.s32.s8 %r10962, %r10961; + cvt.u32.u16 %r10963, %rs7139; + cvt.s32.s8 %r10964, %r10963; + cvt.u32.u16 %r10965, %rs7138; + cvt.s32.s8 %r10966, %r10965; + mad.lo.s32 %r10967, %r271, %r10966, %r10958; + mad.lo.s32 %r10968, %r91, %r10964, %r10967; + mad.lo.s32 %r10969, %r93, %r10962, %r10968; + mad.lo.s32 %r10970, %r94, %r10960, %r10969; + ld.const.v4.u8 {%rs7146, %rs7147, %rs7148, %rs7149}, [matrix+3572]; + cvt.u32.u16 %r10971, %rs7149; + cvt.s32.s8 %r10972, %r10971; + cvt.u32.u16 %r10973, %rs7148; + cvt.s32.s8 %r10974, %r10973; + cvt.u32.u16 %r10975, %rs7147; + cvt.s32.s8 %r10976, %r10975; + cvt.u32.u16 %r10977, %rs7146; + cvt.s32.s8 %r10978, %r10977; + mad.lo.s32 %r10979, %r96, %r10978, %r10970; + mad.lo.s32 %r10980, %r97, %r10976, %r10979; + mad.lo.s32 %r10981, %r99, %r10974, %r10980; + mad.lo.s32 %r10982, %r100, %r10972, %r10981; + ld.const.v4.u8 {%rs7154, %rs7155, %rs7156, %rs7157}, [matrix+3576]; + cvt.u32.u16 %r10983, %rs7157; + cvt.s32.s8 %r10984, %r10983; + cvt.u32.u16 %r10985, %rs7156; + cvt.s32.s8 %r10986, %r10985; + cvt.u32.u16 %r10987, %rs7155; + cvt.s32.s8 %r10988, %r10987; + cvt.u32.u16 %r10989, %rs7154; + cvt.s32.s8 %r10990, %r10989; + mad.lo.s32 %r10991, %r103, %r10990, %r10982; + mad.lo.s32 %r10992, %r104, %r10988, %r10991; + mad.lo.s32 %r10993, %r107, %r10986, %r10992; + mad.lo.s32 %r10994, %r108, %r10984, %r10993; + ld.const.v4.u8 {%rs7162, %rs7163, %rs7164, %rs7165}, [matrix+3580]; + cvt.u32.u16 %r10995, %rs7165; + cvt.s32.s8 %r10996, %r10995; + cvt.u32.u16 %r10997, %rs7164; + cvt.s32.s8 %r10998, %r10997; + cvt.u32.u16 %r10999, %rs7163; + cvt.s32.s8 %r11000, %r10999; + cvt.u32.u16 %r11001, %rs7162; + cvt.s32.s8 %r11002, %r11001; + mad.lo.s32 %r11003, %r111, %r11002, %r10994; + mad.lo.s32 %r11004, %r112, %r11000, %r11003; + mad.lo.s32 %r11005, %r114, %r10998, %r11004; + mad.lo.s32 %r11006, %r115, %r10996, %r11005; + shr.u32 %r11007, %r10814, 6; + and.b32 %r11008, %r11007, 240; + shr.u32 %r11009, %r11006, 10; + or.b32 %r11010, %r11009, %r11008; + xor.b32 %r11011, %r98, %r11010; + cvt.u64.u32 %rd401, %r11011; + ld.const.v4.u8 {%rs7170, %rs7171, %rs7172, %rs7173}, [matrix+3584]; + cvt.u32.u16 %r11012, %rs7173; + cvt.s32.s8 %r11013, %r11012; + cvt.u32.u16 %r11014, %rs7172; + 
cvt.s32.s8 %r11015, %r11014; + cvt.u32.u16 %r11016, %rs7170; + cvt.s32.s8 %r11017, %r11016; + cvt.u32.u16 %r11018, %rs7171; + cvt.s32.s8 %r11019, %r11018; + mul.lo.s32 %r11020, %r34, %r11019; + mad.lo.s32 %r11021, %r124, %r11017, %r11020; + mad.lo.s32 %r11022, %r35, %r11015, %r11021; + mad.lo.s32 %r11023, %r36, %r11013, %r11022; + ld.const.v4.u8 {%rs7178, %rs7179, %rs7180, %rs7181}, [matrix+3588]; + cvt.u32.u16 %r11024, %rs7181; + cvt.s32.s8 %r11025, %r11024; + cvt.u32.u16 %r11026, %rs7180; + cvt.s32.s8 %r11027, %r11026; + cvt.u32.u16 %r11028, %rs7179; + cvt.s32.s8 %r11029, %r11028; + cvt.u32.u16 %r11030, %rs7178; + cvt.s32.s8 %r11031, %r11030; + mad.lo.s32 %r11032, %r37, %r11031, %r11023; + mad.lo.s32 %r11033, %r38, %r11029, %r11032; + mad.lo.s32 %r11034, %r39, %r11027, %r11033; + mad.lo.s32 %r11035, %r40, %r11025, %r11034; + ld.const.v4.u8 {%rs7186, %rs7187, %rs7188, %rs7189}, [matrix+3592]; + cvt.u32.u16 %r11036, %rs7189; + cvt.s32.s8 %r11037, %r11036; + cvt.u32.u16 %r11038, %rs7188; + cvt.s32.s8 %r11039, %r11038; + cvt.u32.u16 %r11040, %rs7187; + cvt.s32.s8 %r11041, %r11040; + cvt.u32.u16 %r11042, %rs7186; + cvt.s32.s8 %r11043, %r11042; + mad.lo.s32 %r11044, %r42, %r11043, %r11035; + mad.lo.s32 %r11045, %r43, %r11041, %r11044; + mad.lo.s32 %r11046, %r45, %r11039, %r11045; + mad.lo.s32 %r11047, %r46, %r11037, %r11046; + ld.const.v4.u8 {%rs7194, %rs7195, %rs7196, %rs7197}, [matrix+3596]; + cvt.u32.u16 %r11048, %rs7197; + cvt.s32.s8 %r11049, %r11048; + cvt.u32.u16 %r11050, %rs7196; + cvt.s32.s8 %r11051, %r11050; + cvt.u32.u16 %r11052, %rs7195; + cvt.s32.s8 %r11053, %r11052; + cvt.u32.u16 %r11054, %rs7194; + cvt.s32.s8 %r11055, %r11054; + mad.lo.s32 %r11056, %r48, %r11055, %r11047; + mad.lo.s32 %r11057, %r49, %r11053, %r11056; + mad.lo.s32 %r11058, %r50, %r11051, %r11057; + mad.lo.s32 %r11059, %r51, %r11049, %r11058; + ld.const.v4.u8 {%rs7202, %rs7203, %rs7204, %rs7205}, [matrix+3600]; + cvt.u32.u16 %r11060, %rs7205; + cvt.s32.s8 %r11061, %r11060; + cvt.u32.u16 %r11062, %rs7204; + cvt.s32.s8 %r11063, %r11062; + cvt.u32.u16 %r11064, %rs7203; + cvt.s32.s8 %r11065, %r11064; + cvt.u32.u16 %r11066, %rs7202; + cvt.s32.s8 %r11067, %r11066; + mad.lo.s32 %r11068, %r173, %r11067, %r11059; + mad.lo.s32 %r11069, %r53, %r11065, %r11068; + mad.lo.s32 %r11070, %r54, %r11063, %r11069; + mad.lo.s32 %r11071, %r55, %r11061, %r11070; + ld.const.v4.u8 {%rs7210, %rs7211, %rs7212, %rs7213}, [matrix+3604]; + cvt.u32.u16 %r11072, %rs7213; + cvt.s32.s8 %r11073, %r11072; + cvt.u32.u16 %r11074, %rs7212; + cvt.s32.s8 %r11075, %r11074; + cvt.u32.u16 %r11076, %rs7211; + cvt.s32.s8 %r11077, %r11076; + cvt.u32.u16 %r11078, %rs7210; + cvt.s32.s8 %r11079, %r11078; + mad.lo.s32 %r11080, %r56, %r11079, %r11071; + mad.lo.s32 %r11081, %r57, %r11077, %r11080; + mad.lo.s32 %r11082, %r58, %r11075, %r11081; + mad.lo.s32 %r11083, %r59, %r11073, %r11082; + ld.const.v4.u8 {%rs7218, %rs7219, %rs7220, %rs7221}, [matrix+3608]; + cvt.u32.u16 %r11084, %rs7221; + cvt.s32.s8 %r11085, %r11084; + cvt.u32.u16 %r11086, %rs7220; + cvt.s32.s8 %r11087, %r11086; + cvt.u32.u16 %r11088, %rs7219; + cvt.s32.s8 %r11089, %r11088; + cvt.u32.u16 %r11090, %rs7218; + cvt.s32.s8 %r11091, %r11090; + mad.lo.s32 %r11092, %r61, %r11091, %r11083; + mad.lo.s32 %r11093, %r62, %r11089, %r11092; + mad.lo.s32 %r11094, %r64, %r11087, %r11093; + mad.lo.s32 %r11095, %r65, %r11085, %r11094; + ld.const.v4.u8 {%rs7226, %rs7227, %rs7228, %rs7229}, [matrix+3612]; + cvt.u32.u16 %r11096, %rs7229; + cvt.s32.s8 %r11097, %r11096; + cvt.u32.u16 %r11098, %rs7228; + cvt.s32.s8 %r11099, 
%r11098; + cvt.u32.u16 %r11100, %rs7227; + cvt.s32.s8 %r11101, %r11100; + cvt.u32.u16 %r11102, %rs7226; + cvt.s32.s8 %r11103, %r11102; + mad.lo.s32 %r11104, %r67, %r11103, %r11095; + mad.lo.s32 %r11105, %r68, %r11101, %r11104; + mad.lo.s32 %r11106, %r69, %r11099, %r11105; + mad.lo.s32 %r11107, %r70, %r11097, %r11106; + ld.const.v4.u8 {%rs7234, %rs7235, %rs7236, %rs7237}, [matrix+3616]; + cvt.u32.u16 %r11108, %rs7237; + cvt.s32.s8 %r11109, %r11108; + cvt.u32.u16 %r11110, %rs7236; + cvt.s32.s8 %r11111, %r11110; + cvt.u32.u16 %r11112, %rs7235; + cvt.s32.s8 %r11113, %r11112; + cvt.u32.u16 %r11114, %rs7234; + cvt.s32.s8 %r11115, %r11114; + mad.lo.s32 %r11116, %r222, %r11115, %r11107; + mad.lo.s32 %r11117, %r72, %r11113, %r11116; + mad.lo.s32 %r11118, %r73, %r11111, %r11117; + mad.lo.s32 %r11119, %r74, %r11109, %r11118; + ld.const.v4.u8 {%rs7242, %rs7243, %rs7244, %rs7245}, [matrix+3620]; + cvt.u32.u16 %r11120, %rs7245; + cvt.s32.s8 %r11121, %r11120; + cvt.u32.u16 %r11122, %rs7244; + cvt.s32.s8 %r11123, %r11122; + cvt.u32.u16 %r11124, %rs7243; + cvt.s32.s8 %r11125, %r11124; + cvt.u32.u16 %r11126, %rs7242; + cvt.s32.s8 %r11127, %r11126; + mad.lo.s32 %r11128, %r75, %r11127, %r11119; + mad.lo.s32 %r11129, %r76, %r11125, %r11128; + mad.lo.s32 %r11130, %r77, %r11123, %r11129; + mad.lo.s32 %r11131, %r78, %r11121, %r11130; + ld.const.v4.u8 {%rs7250, %rs7251, %rs7252, %rs7253}, [matrix+3624]; + cvt.u32.u16 %r11132, %rs7253; + cvt.s32.s8 %r11133, %r11132; + cvt.u32.u16 %r11134, %rs7252; + cvt.s32.s8 %r11135, %r11134; + cvt.u32.u16 %r11136, %rs7251; + cvt.s32.s8 %r11137, %r11136; + cvt.u32.u16 %r11138, %rs7250; + cvt.s32.s8 %r11139, %r11138; + mad.lo.s32 %r11140, %r80, %r11139, %r11131; + mad.lo.s32 %r11141, %r81, %r11137, %r11140; + mad.lo.s32 %r11142, %r83, %r11135, %r11141; + mad.lo.s32 %r11143, %r84, %r11133, %r11142; + ld.const.v4.u8 {%rs7258, %rs7259, %rs7260, %rs7261}, [matrix+3628]; + cvt.u32.u16 %r11144, %rs7261; + cvt.s32.s8 %r11145, %r11144; + cvt.u32.u16 %r11146, %rs7260; + cvt.s32.s8 %r11147, %r11146; + cvt.u32.u16 %r11148, %rs7259; + cvt.s32.s8 %r11149, %r11148; + cvt.u32.u16 %r11150, %rs7258; + cvt.s32.s8 %r11151, %r11150; + mad.lo.s32 %r11152, %r86, %r11151, %r11143; + mad.lo.s32 %r11153, %r87, %r11149, %r11152; + mad.lo.s32 %r11154, %r88, %r11147, %r11153; + mad.lo.s32 %r11155, %r89, %r11145, %r11154; + ld.const.v4.u8 {%rs7266, %rs7267, %rs7268, %rs7269}, [matrix+3632]; + cvt.u32.u16 %r11156, %rs7269; + cvt.s32.s8 %r11157, %r11156; + cvt.u32.u16 %r11158, %rs7268; + cvt.s32.s8 %r11159, %r11158; + cvt.u32.u16 %r11160, %rs7267; + cvt.s32.s8 %r11161, %r11160; + cvt.u32.u16 %r11162, %rs7266; + cvt.s32.s8 %r11163, %r11162; + mad.lo.s32 %r11164, %r271, %r11163, %r11155; + mad.lo.s32 %r11165, %r91, %r11161, %r11164; + mad.lo.s32 %r11166, %r93, %r11159, %r11165; + mad.lo.s32 %r11167, %r94, %r11157, %r11166; + ld.const.v4.u8 {%rs7274, %rs7275, %rs7276, %rs7277}, [matrix+3636]; + cvt.u32.u16 %r11168, %rs7277; + cvt.s32.s8 %r11169, %r11168; + cvt.u32.u16 %r11170, %rs7276; + cvt.s32.s8 %r11171, %r11170; + cvt.u32.u16 %r11172, %rs7275; + cvt.s32.s8 %r11173, %r11172; + cvt.u32.u16 %r11174, %rs7274; + cvt.s32.s8 %r11175, %r11174; + mad.lo.s32 %r11176, %r96, %r11175, %r11167; + mad.lo.s32 %r11177, %r97, %r11173, %r11176; + mad.lo.s32 %r11178, %r99, %r11171, %r11177; + mad.lo.s32 %r11179, %r100, %r11169, %r11178; + ld.const.v4.u8 {%rs7282, %rs7283, %rs7284, %rs7285}, [matrix+3640]; + cvt.u32.u16 %r11180, %rs7285; + cvt.s32.s8 %r11181, %r11180; + cvt.u32.u16 %r11182, %rs7284; + cvt.s32.s8 %r11183, %r11182; 
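+ // Annotation (descriptive comment, not compiler output; register roles inferred
+ // from this hunk): the surrounding unrolled sequence repeats one pattern. Each
+ // ld.const.v4.u8 fetches four signed int8 matrix coefficients from constant
+ // memory; the cvt.u32.u16 / cvt.s32.s8 pairs sign-extend them; mad.lo.s32 folds
+ // them into a running dot product against the cached operand registers
+ // (%r34-%r124, %r173, %r222, %r271). Sixteen such loads form one 64-coefficient
+ // sum; two consecutive sums are then packed into a byte (shr 6 + and 240 gives
+ // the high nibble, shr 10 is or-ed in as the low), xor-ed with a precomputed
+ // word (%r95, %r98, %r101, ...), and widened via cvt.u64.u32 into %rd400-%rd405.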
+ cvt.u32.u16 %r11184, %rs7283; + cvt.s32.s8 %r11185, %r11184; + cvt.u32.u16 %r11186, %rs7282; + cvt.s32.s8 %r11187, %r11186; + mad.lo.s32 %r11188, %r103, %r11187, %r11179; + mad.lo.s32 %r11189, %r104, %r11185, %r11188; + mad.lo.s32 %r11190, %r107, %r11183, %r11189; + mad.lo.s32 %r11191, %r108, %r11181, %r11190; + ld.const.v4.u8 {%rs7290, %rs7291, %rs7292, %rs7293}, [matrix+3644]; + cvt.u32.u16 %r11192, %rs7293; + cvt.s32.s8 %r11193, %r11192; + cvt.u32.u16 %r11194, %rs7292; + cvt.s32.s8 %r11195, %r11194; + cvt.u32.u16 %r11196, %rs7291; + cvt.s32.s8 %r11197, %r11196; + cvt.u32.u16 %r11198, %rs7290; + cvt.s32.s8 %r11199, %r11198; + mad.lo.s32 %r11200, %r111, %r11199, %r11191; + mad.lo.s32 %r11201, %r112, %r11197, %r11200; + mad.lo.s32 %r11202, %r114, %r11195, %r11201; + mad.lo.s32 %r11203, %r115, %r11193, %r11202; + ld.const.v4.u8 {%rs7298, %rs7299, %rs7300, %rs7301}, [matrix+3648]; + cvt.u32.u16 %r11204, %rs7301; + cvt.s32.s8 %r11205, %r11204; + cvt.u32.u16 %r11206, %rs7300; + cvt.s32.s8 %r11207, %r11206; + cvt.u32.u16 %r11208, %rs7298; + cvt.s32.s8 %r11209, %r11208; + cvt.u32.u16 %r11210, %rs7299; + cvt.s32.s8 %r11211, %r11210; + mul.lo.s32 %r11212, %r34, %r11211; + mad.lo.s32 %r11213, %r124, %r11209, %r11212; + mad.lo.s32 %r11214, %r35, %r11207, %r11213; + mad.lo.s32 %r11215, %r36, %r11205, %r11214; + ld.const.v4.u8 {%rs7306, %rs7307, %rs7308, %rs7309}, [matrix+3652]; + cvt.u32.u16 %r11216, %rs7309; + cvt.s32.s8 %r11217, %r11216; + cvt.u32.u16 %r11218, %rs7308; + cvt.s32.s8 %r11219, %r11218; + cvt.u32.u16 %r11220, %rs7307; + cvt.s32.s8 %r11221, %r11220; + cvt.u32.u16 %r11222, %rs7306; + cvt.s32.s8 %r11223, %r11222; + mad.lo.s32 %r11224, %r37, %r11223, %r11215; + mad.lo.s32 %r11225, %r38, %r11221, %r11224; + mad.lo.s32 %r11226, %r39, %r11219, %r11225; + mad.lo.s32 %r11227, %r40, %r11217, %r11226; + ld.const.v4.u8 {%rs7314, %rs7315, %rs7316, %rs7317}, [matrix+3656]; + cvt.u32.u16 %r11228, %rs7317; + cvt.s32.s8 %r11229, %r11228; + cvt.u32.u16 %r11230, %rs7316; + cvt.s32.s8 %r11231, %r11230; + cvt.u32.u16 %r11232, %rs7315; + cvt.s32.s8 %r11233, %r11232; + cvt.u32.u16 %r11234, %rs7314; + cvt.s32.s8 %r11235, %r11234; + mad.lo.s32 %r11236, %r42, %r11235, %r11227; + mad.lo.s32 %r11237, %r43, %r11233, %r11236; + mad.lo.s32 %r11238, %r45, %r11231, %r11237; + mad.lo.s32 %r11239, %r46, %r11229, %r11238; + ld.const.v4.u8 {%rs7322, %rs7323, %rs7324, %rs7325}, [matrix+3660]; + cvt.u32.u16 %r11240, %rs7325; + cvt.s32.s8 %r11241, %r11240; + cvt.u32.u16 %r11242, %rs7324; + cvt.s32.s8 %r11243, %r11242; + cvt.u32.u16 %r11244, %rs7323; + cvt.s32.s8 %r11245, %r11244; + cvt.u32.u16 %r11246, %rs7322; + cvt.s32.s8 %r11247, %r11246; + mad.lo.s32 %r11248, %r48, %r11247, %r11239; + mad.lo.s32 %r11249, %r49, %r11245, %r11248; + mad.lo.s32 %r11250, %r50, %r11243, %r11249; + mad.lo.s32 %r11251, %r51, %r11241, %r11250; + ld.const.v4.u8 {%rs7330, %rs7331, %rs7332, %rs7333}, [matrix+3664]; + cvt.u32.u16 %r11252, %rs7333; + cvt.s32.s8 %r11253, %r11252; + cvt.u32.u16 %r11254, %rs7332; + cvt.s32.s8 %r11255, %r11254; + cvt.u32.u16 %r11256, %rs7331; + cvt.s32.s8 %r11257, %r11256; + cvt.u32.u16 %r11258, %rs7330; + cvt.s32.s8 %r11259, %r11258; + mad.lo.s32 %r11260, %r173, %r11259, %r11251; + mad.lo.s32 %r11261, %r53, %r11257, %r11260; + mad.lo.s32 %r11262, %r54, %r11255, %r11261; + mad.lo.s32 %r11263, %r55, %r11253, %r11262; + ld.const.v4.u8 {%rs7338, %rs7339, %rs7340, %rs7341}, [matrix+3668]; + cvt.u32.u16 %r11264, %rs7341; + cvt.s32.s8 %r11265, %r11264; + cvt.u32.u16 %r11266, %rs7340; + cvt.s32.s8 %r11267, %r11266; + 
cvt.u32.u16 %r11268, %rs7339; + cvt.s32.s8 %r11269, %r11268; + cvt.u32.u16 %r11270, %rs7338; + cvt.s32.s8 %r11271, %r11270; + mad.lo.s32 %r11272, %r56, %r11271, %r11263; + mad.lo.s32 %r11273, %r57, %r11269, %r11272; + mad.lo.s32 %r11274, %r58, %r11267, %r11273; + mad.lo.s32 %r11275, %r59, %r11265, %r11274; + ld.const.v4.u8 {%rs7346, %rs7347, %rs7348, %rs7349}, [matrix+3672]; + cvt.u32.u16 %r11276, %rs7349; + cvt.s32.s8 %r11277, %r11276; + cvt.u32.u16 %r11278, %rs7348; + cvt.s32.s8 %r11279, %r11278; + cvt.u32.u16 %r11280, %rs7347; + cvt.s32.s8 %r11281, %r11280; + cvt.u32.u16 %r11282, %rs7346; + cvt.s32.s8 %r11283, %r11282; + mad.lo.s32 %r11284, %r61, %r11283, %r11275; + mad.lo.s32 %r11285, %r62, %r11281, %r11284; + mad.lo.s32 %r11286, %r64, %r11279, %r11285; + mad.lo.s32 %r11287, %r65, %r11277, %r11286; + ld.const.v4.u8 {%rs7354, %rs7355, %rs7356, %rs7357}, [matrix+3676]; + cvt.u32.u16 %r11288, %rs7357; + cvt.s32.s8 %r11289, %r11288; + cvt.u32.u16 %r11290, %rs7356; + cvt.s32.s8 %r11291, %r11290; + cvt.u32.u16 %r11292, %rs7355; + cvt.s32.s8 %r11293, %r11292; + cvt.u32.u16 %r11294, %rs7354; + cvt.s32.s8 %r11295, %r11294; + mad.lo.s32 %r11296, %r67, %r11295, %r11287; + mad.lo.s32 %r11297, %r68, %r11293, %r11296; + mad.lo.s32 %r11298, %r69, %r11291, %r11297; + mad.lo.s32 %r11299, %r70, %r11289, %r11298; + ld.const.v4.u8 {%rs7362, %rs7363, %rs7364, %rs7365}, [matrix+3680]; + cvt.u32.u16 %r11300, %rs7365; + cvt.s32.s8 %r11301, %r11300; + cvt.u32.u16 %r11302, %rs7364; + cvt.s32.s8 %r11303, %r11302; + cvt.u32.u16 %r11304, %rs7363; + cvt.s32.s8 %r11305, %r11304; + cvt.u32.u16 %r11306, %rs7362; + cvt.s32.s8 %r11307, %r11306; + mad.lo.s32 %r11308, %r222, %r11307, %r11299; + mad.lo.s32 %r11309, %r72, %r11305, %r11308; + mad.lo.s32 %r11310, %r73, %r11303, %r11309; + mad.lo.s32 %r11311, %r74, %r11301, %r11310; + ld.const.v4.u8 {%rs7370, %rs7371, %rs7372, %rs7373}, [matrix+3684]; + cvt.u32.u16 %r11312, %rs7373; + cvt.s32.s8 %r11313, %r11312; + cvt.u32.u16 %r11314, %rs7372; + cvt.s32.s8 %r11315, %r11314; + cvt.u32.u16 %r11316, %rs7371; + cvt.s32.s8 %r11317, %r11316; + cvt.u32.u16 %r11318, %rs7370; + cvt.s32.s8 %r11319, %r11318; + mad.lo.s32 %r11320, %r75, %r11319, %r11311; + mad.lo.s32 %r11321, %r76, %r11317, %r11320; + mad.lo.s32 %r11322, %r77, %r11315, %r11321; + mad.lo.s32 %r11323, %r78, %r11313, %r11322; + ld.const.v4.u8 {%rs7378, %rs7379, %rs7380, %rs7381}, [matrix+3688]; + cvt.u32.u16 %r11324, %rs7381; + cvt.s32.s8 %r11325, %r11324; + cvt.u32.u16 %r11326, %rs7380; + cvt.s32.s8 %r11327, %r11326; + cvt.u32.u16 %r11328, %rs7379; + cvt.s32.s8 %r11329, %r11328; + cvt.u32.u16 %r11330, %rs7378; + cvt.s32.s8 %r11331, %r11330; + mad.lo.s32 %r11332, %r80, %r11331, %r11323; + mad.lo.s32 %r11333, %r81, %r11329, %r11332; + mad.lo.s32 %r11334, %r83, %r11327, %r11333; + mad.lo.s32 %r11335, %r84, %r11325, %r11334; + ld.const.v4.u8 {%rs7386, %rs7387, %rs7388, %rs7389}, [matrix+3692]; + cvt.u32.u16 %r11336, %rs7389; + cvt.s32.s8 %r11337, %r11336; + cvt.u32.u16 %r11338, %rs7388; + cvt.s32.s8 %r11339, %r11338; + cvt.u32.u16 %r11340, %rs7387; + cvt.s32.s8 %r11341, %r11340; + cvt.u32.u16 %r11342, %rs7386; + cvt.s32.s8 %r11343, %r11342; + mad.lo.s32 %r11344, %r86, %r11343, %r11335; + mad.lo.s32 %r11345, %r87, %r11341, %r11344; + mad.lo.s32 %r11346, %r88, %r11339, %r11345; + mad.lo.s32 %r11347, %r89, %r11337, %r11346; + ld.const.v4.u8 {%rs7394, %rs7395, %rs7396, %rs7397}, [matrix+3696]; + cvt.u32.u16 %r11348, %rs7397; + cvt.s32.s8 %r11349, %r11348; + cvt.u32.u16 %r11350, %rs7396; + cvt.s32.s8 %r11351, %r11350; + cvt.u32.u16 
%r11352, %rs7395; + cvt.s32.s8 %r11353, %r11352; + cvt.u32.u16 %r11354, %rs7394; + cvt.s32.s8 %r11355, %r11354; + mad.lo.s32 %r11356, %r271, %r11355, %r11347; + mad.lo.s32 %r11357, %r91, %r11353, %r11356; + mad.lo.s32 %r11358, %r93, %r11351, %r11357; + mad.lo.s32 %r11359, %r94, %r11349, %r11358; + ld.const.v4.u8 {%rs7402, %rs7403, %rs7404, %rs7405}, [matrix+3700]; + cvt.u32.u16 %r11360, %rs7405; + cvt.s32.s8 %r11361, %r11360; + cvt.u32.u16 %r11362, %rs7404; + cvt.s32.s8 %r11363, %r11362; + cvt.u32.u16 %r11364, %rs7403; + cvt.s32.s8 %r11365, %r11364; + cvt.u32.u16 %r11366, %rs7402; + cvt.s32.s8 %r11367, %r11366; + mad.lo.s32 %r11368, %r96, %r11367, %r11359; + mad.lo.s32 %r11369, %r97, %r11365, %r11368; + mad.lo.s32 %r11370, %r99, %r11363, %r11369; + mad.lo.s32 %r11371, %r100, %r11361, %r11370; + ld.const.v4.u8 {%rs7410, %rs7411, %rs7412, %rs7413}, [matrix+3704]; + cvt.u32.u16 %r11372, %rs7413; + cvt.s32.s8 %r11373, %r11372; + cvt.u32.u16 %r11374, %rs7412; + cvt.s32.s8 %r11375, %r11374; + cvt.u32.u16 %r11376, %rs7411; + cvt.s32.s8 %r11377, %r11376; + cvt.u32.u16 %r11378, %rs7410; + cvt.s32.s8 %r11379, %r11378; + mad.lo.s32 %r11380, %r103, %r11379, %r11371; + mad.lo.s32 %r11381, %r104, %r11377, %r11380; + mad.lo.s32 %r11382, %r107, %r11375, %r11381; + mad.lo.s32 %r11383, %r108, %r11373, %r11382; + ld.const.v4.u8 {%rs7418, %rs7419, %rs7420, %rs7421}, [matrix+3708]; + cvt.u32.u16 %r11384, %rs7421; + cvt.s32.s8 %r11385, %r11384; + cvt.u32.u16 %r11386, %rs7420; + cvt.s32.s8 %r11387, %r11386; + cvt.u32.u16 %r11388, %rs7419; + cvt.s32.s8 %r11389, %r11388; + cvt.u32.u16 %r11390, %rs7418; + cvt.s32.s8 %r11391, %r11390; + mad.lo.s32 %r11392, %r111, %r11391, %r11383; + mad.lo.s32 %r11393, %r112, %r11389, %r11392; + mad.lo.s32 %r11394, %r114, %r11387, %r11393; + mad.lo.s32 %r11395, %r115, %r11385, %r11394; + shr.u32 %r11396, %r11203, 6; + and.b32 %r11397, %r11396, 240; + shr.u32 %r11398, %r11395, 10; + or.b32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r101, %r11399; + cvt.u64.u32 %rd402, %r11400; + and.b64 %rd403, %rd402, 255; + ld.const.v4.u8 {%rs7426, %rs7427, %rs7428, %rs7429}, [matrix+3712]; + cvt.u32.u16 %r11401, %rs7429; + cvt.s32.s8 %r11402, %r11401; + cvt.u32.u16 %r11403, %rs7428; + cvt.s32.s8 %r11404, %r11403; + cvt.u32.u16 %r11405, %rs7426; + cvt.s32.s8 %r11406, %r11405; + cvt.u32.u16 %r11407, %rs7427; + cvt.s32.s8 %r11408, %r11407; + mul.lo.s32 %r11409, %r34, %r11408; + mad.lo.s32 %r11410, %r124, %r11406, %r11409; + mad.lo.s32 %r11411, %r35, %r11404, %r11410; + mad.lo.s32 %r11412, %r36, %r11402, %r11411; + ld.const.v4.u8 {%rs7434, %rs7435, %rs7436, %rs7437}, [matrix+3716]; + cvt.u32.u16 %r11413, %rs7437; + cvt.s32.s8 %r11414, %r11413; + cvt.u32.u16 %r11415, %rs7436; + cvt.s32.s8 %r11416, %r11415; + cvt.u32.u16 %r11417, %rs7435; + cvt.s32.s8 %r11418, %r11417; + cvt.u32.u16 %r11419, %rs7434; + cvt.s32.s8 %r11420, %r11419; + mad.lo.s32 %r11421, %r37, %r11420, %r11412; + mad.lo.s32 %r11422, %r38, %r11418, %r11421; + mad.lo.s32 %r11423, %r39, %r11416, %r11422; + mad.lo.s32 %r11424, %r40, %r11414, %r11423; + ld.const.v4.u8 {%rs7442, %rs7443, %rs7444, %rs7445}, [matrix+3720]; + cvt.u32.u16 %r11425, %rs7445; + cvt.s32.s8 %r11426, %r11425; + cvt.u32.u16 %r11427, %rs7444; + cvt.s32.s8 %r11428, %r11427; + cvt.u32.u16 %r11429, %rs7443; + cvt.s32.s8 %r11430, %r11429; + cvt.u32.u16 %r11431, %rs7442; + cvt.s32.s8 %r11432, %r11431; + mad.lo.s32 %r11433, %r42, %r11432, %r11424; + mad.lo.s32 %r11434, %r43, %r11430, %r11433; + mad.lo.s32 %r11435, %r45, %r11428, %r11434; + mad.lo.s32 %r11436, %r46, %r11426, 
%r11435; + ld.const.v4.u8 {%rs7450, %rs7451, %rs7452, %rs7453}, [matrix+3724]; + cvt.u32.u16 %r11437, %rs7453; + cvt.s32.s8 %r11438, %r11437; + cvt.u32.u16 %r11439, %rs7452; + cvt.s32.s8 %r11440, %r11439; + cvt.u32.u16 %r11441, %rs7451; + cvt.s32.s8 %r11442, %r11441; + cvt.u32.u16 %r11443, %rs7450; + cvt.s32.s8 %r11444, %r11443; + mad.lo.s32 %r11445, %r48, %r11444, %r11436; + mad.lo.s32 %r11446, %r49, %r11442, %r11445; + mad.lo.s32 %r11447, %r50, %r11440, %r11446; + mad.lo.s32 %r11448, %r51, %r11438, %r11447; + ld.const.v4.u8 {%rs7458, %rs7459, %rs7460, %rs7461}, [matrix+3728]; + cvt.u32.u16 %r11449, %rs7461; + cvt.s32.s8 %r11450, %r11449; + cvt.u32.u16 %r11451, %rs7460; + cvt.s32.s8 %r11452, %r11451; + cvt.u32.u16 %r11453, %rs7459; + cvt.s32.s8 %r11454, %r11453; + cvt.u32.u16 %r11455, %rs7458; + cvt.s32.s8 %r11456, %r11455; + mad.lo.s32 %r11457, %r173, %r11456, %r11448; + mad.lo.s32 %r11458, %r53, %r11454, %r11457; + mad.lo.s32 %r11459, %r54, %r11452, %r11458; + mad.lo.s32 %r11460, %r55, %r11450, %r11459; + ld.const.v4.u8 {%rs7466, %rs7467, %rs7468, %rs7469}, [matrix+3732]; + cvt.u32.u16 %r11461, %rs7469; + cvt.s32.s8 %r11462, %r11461; + cvt.u32.u16 %r11463, %rs7468; + cvt.s32.s8 %r11464, %r11463; + cvt.u32.u16 %r11465, %rs7467; + cvt.s32.s8 %r11466, %r11465; + cvt.u32.u16 %r11467, %rs7466; + cvt.s32.s8 %r11468, %r11467; + mad.lo.s32 %r11469, %r56, %r11468, %r11460; + mad.lo.s32 %r11470, %r57, %r11466, %r11469; + mad.lo.s32 %r11471, %r58, %r11464, %r11470; + mad.lo.s32 %r11472, %r59, %r11462, %r11471; + ld.const.v4.u8 {%rs7474, %rs7475, %rs7476, %rs7477}, [matrix+3736]; + cvt.u32.u16 %r11473, %rs7477; + cvt.s32.s8 %r11474, %r11473; + cvt.u32.u16 %r11475, %rs7476; + cvt.s32.s8 %r11476, %r11475; + cvt.u32.u16 %r11477, %rs7475; + cvt.s32.s8 %r11478, %r11477; + cvt.u32.u16 %r11479, %rs7474; + cvt.s32.s8 %r11480, %r11479; + mad.lo.s32 %r11481, %r61, %r11480, %r11472; + mad.lo.s32 %r11482, %r62, %r11478, %r11481; + mad.lo.s32 %r11483, %r64, %r11476, %r11482; + mad.lo.s32 %r11484, %r65, %r11474, %r11483; + ld.const.v4.u8 {%rs7482, %rs7483, %rs7484, %rs7485}, [matrix+3740]; + cvt.u32.u16 %r11485, %rs7485; + cvt.s32.s8 %r11486, %r11485; + cvt.u32.u16 %r11487, %rs7484; + cvt.s32.s8 %r11488, %r11487; + cvt.u32.u16 %r11489, %rs7483; + cvt.s32.s8 %r11490, %r11489; + cvt.u32.u16 %r11491, %rs7482; + cvt.s32.s8 %r11492, %r11491; + mad.lo.s32 %r11493, %r67, %r11492, %r11484; + mad.lo.s32 %r11494, %r68, %r11490, %r11493; + mad.lo.s32 %r11495, %r69, %r11488, %r11494; + mad.lo.s32 %r11496, %r70, %r11486, %r11495; + ld.const.v4.u8 {%rs7490, %rs7491, %rs7492, %rs7493}, [matrix+3744]; + cvt.u32.u16 %r11497, %rs7493; + cvt.s32.s8 %r11498, %r11497; + cvt.u32.u16 %r11499, %rs7492; + cvt.s32.s8 %r11500, %r11499; + cvt.u32.u16 %r11501, %rs7491; + cvt.s32.s8 %r11502, %r11501; + cvt.u32.u16 %r11503, %rs7490; + cvt.s32.s8 %r11504, %r11503; + mad.lo.s32 %r11505, %r222, %r11504, %r11496; + mad.lo.s32 %r11506, %r72, %r11502, %r11505; + mad.lo.s32 %r11507, %r73, %r11500, %r11506; + mad.lo.s32 %r11508, %r74, %r11498, %r11507; + ld.const.v4.u8 {%rs7498, %rs7499, %rs7500, %rs7501}, [matrix+3748]; + cvt.u32.u16 %r11509, %rs7501; + cvt.s32.s8 %r11510, %r11509; + cvt.u32.u16 %r11511, %rs7500; + cvt.s32.s8 %r11512, %r11511; + cvt.u32.u16 %r11513, %rs7499; + cvt.s32.s8 %r11514, %r11513; + cvt.u32.u16 %r11515, %rs7498; + cvt.s32.s8 %r11516, %r11515; + mad.lo.s32 %r11517, %r75, %r11516, %r11508; + mad.lo.s32 %r11518, %r76, %r11514, %r11517; + mad.lo.s32 %r11519, %r77, %r11512, %r11518; + mad.lo.s32 %r11520, %r78, %r11510, %r11519; + 
ld.const.v4.u8 {%rs7506, %rs7507, %rs7508, %rs7509}, [matrix+3752]; + cvt.u32.u16 %r11521, %rs7509; + cvt.s32.s8 %r11522, %r11521; + cvt.u32.u16 %r11523, %rs7508; + cvt.s32.s8 %r11524, %r11523; + cvt.u32.u16 %r11525, %rs7507; + cvt.s32.s8 %r11526, %r11525; + cvt.u32.u16 %r11527, %rs7506; + cvt.s32.s8 %r11528, %r11527; + mad.lo.s32 %r11529, %r80, %r11528, %r11520; + mad.lo.s32 %r11530, %r81, %r11526, %r11529; + mad.lo.s32 %r11531, %r83, %r11524, %r11530; + mad.lo.s32 %r11532, %r84, %r11522, %r11531; + ld.const.v4.u8 {%rs7514, %rs7515, %rs7516, %rs7517}, [matrix+3756]; + cvt.u32.u16 %r11533, %rs7517; + cvt.s32.s8 %r11534, %r11533; + cvt.u32.u16 %r11535, %rs7516; + cvt.s32.s8 %r11536, %r11535; + cvt.u32.u16 %r11537, %rs7515; + cvt.s32.s8 %r11538, %r11537; + cvt.u32.u16 %r11539, %rs7514; + cvt.s32.s8 %r11540, %r11539; + mad.lo.s32 %r11541, %r86, %r11540, %r11532; + mad.lo.s32 %r11542, %r87, %r11538, %r11541; + mad.lo.s32 %r11543, %r88, %r11536, %r11542; + mad.lo.s32 %r11544, %r89, %r11534, %r11543; + ld.const.v4.u8 {%rs7522, %rs7523, %rs7524, %rs7525}, [matrix+3760]; + cvt.u32.u16 %r11545, %rs7525; + cvt.s32.s8 %r11546, %r11545; + cvt.u32.u16 %r11547, %rs7524; + cvt.s32.s8 %r11548, %r11547; + cvt.u32.u16 %r11549, %rs7523; + cvt.s32.s8 %r11550, %r11549; + cvt.u32.u16 %r11551, %rs7522; + cvt.s32.s8 %r11552, %r11551; + mad.lo.s32 %r11553, %r271, %r11552, %r11544; + mad.lo.s32 %r11554, %r91, %r11550, %r11553; + mad.lo.s32 %r11555, %r93, %r11548, %r11554; + mad.lo.s32 %r11556, %r94, %r11546, %r11555; + ld.const.v4.u8 {%rs7530, %rs7531, %rs7532, %rs7533}, [matrix+3764]; + cvt.u32.u16 %r11557, %rs7533; + cvt.s32.s8 %r11558, %r11557; + cvt.u32.u16 %r11559, %rs7532; + cvt.s32.s8 %r11560, %r11559; + cvt.u32.u16 %r11561, %rs7531; + cvt.s32.s8 %r11562, %r11561; + cvt.u32.u16 %r11563, %rs7530; + cvt.s32.s8 %r11564, %r11563; + mad.lo.s32 %r11565, %r96, %r11564, %r11556; + mad.lo.s32 %r11566, %r97, %r11562, %r11565; + mad.lo.s32 %r11567, %r99, %r11560, %r11566; + mad.lo.s32 %r11568, %r100, %r11558, %r11567; + ld.const.v4.u8 {%rs7538, %rs7539, %rs7540, %rs7541}, [matrix+3768]; + cvt.u32.u16 %r11569, %rs7541; + cvt.s32.s8 %r11570, %r11569; + cvt.u32.u16 %r11571, %rs7540; + cvt.s32.s8 %r11572, %r11571; + cvt.u32.u16 %r11573, %rs7539; + cvt.s32.s8 %r11574, %r11573; + cvt.u32.u16 %r11575, %rs7538; + cvt.s32.s8 %r11576, %r11575; + mad.lo.s32 %r11577, %r103, %r11576, %r11568; + mad.lo.s32 %r11578, %r104, %r11574, %r11577; + mad.lo.s32 %r11579, %r107, %r11572, %r11578; + mad.lo.s32 %r11580, %r108, %r11570, %r11579; + ld.const.v4.u8 {%rs7546, %rs7547, %rs7548, %rs7549}, [matrix+3772]; + cvt.u32.u16 %r11581, %rs7549; + cvt.s32.s8 %r11582, %r11581; + cvt.u32.u16 %r11583, %rs7548; + cvt.s32.s8 %r11584, %r11583; + cvt.u32.u16 %r11585, %rs7547; + cvt.s32.s8 %r11586, %r11585; + cvt.u32.u16 %r11587, %rs7546; + cvt.s32.s8 %r11588, %r11587; + mad.lo.s32 %r11589, %r111, %r11588, %r11580; + mad.lo.s32 %r11590, %r112, %r11586, %r11589; + mad.lo.s32 %r11591, %r114, %r11584, %r11590; + mad.lo.s32 %r11592, %r115, %r11582, %r11591; + ld.const.v4.u8 {%rs7554, %rs7555, %rs7556, %rs7557}, [matrix+3776]; + cvt.u32.u16 %r11593, %rs7557; + cvt.s32.s8 %r11594, %r11593; + cvt.u32.u16 %r11595, %rs7556; + cvt.s32.s8 %r11596, %r11595; + cvt.u32.u16 %r11597, %rs7554; + cvt.s32.s8 %r11598, %r11597; + cvt.u32.u16 %r11599, %rs7555; + cvt.s32.s8 %r11600, %r11599; + mul.lo.s32 %r11601, %r34, %r11600; + mad.lo.s32 %r11602, %r124, %r11598, %r11601; + mad.lo.s32 %r11603, %r35, %r11596, %r11602; + mad.lo.s32 %r11604, %r36, %r11594, %r11603; + 
ld.const.v4.u8 {%rs7562, %rs7563, %rs7564, %rs7565}, [matrix+3780]; + cvt.u32.u16 %r11605, %rs7565; + cvt.s32.s8 %r11606, %r11605; + cvt.u32.u16 %r11607, %rs7564; + cvt.s32.s8 %r11608, %r11607; + cvt.u32.u16 %r11609, %rs7563; + cvt.s32.s8 %r11610, %r11609; + cvt.u32.u16 %r11611, %rs7562; + cvt.s32.s8 %r11612, %r11611; + mad.lo.s32 %r11613, %r37, %r11612, %r11604; + mad.lo.s32 %r11614, %r38, %r11610, %r11613; + mad.lo.s32 %r11615, %r39, %r11608, %r11614; + mad.lo.s32 %r11616, %r40, %r11606, %r11615; + ld.const.v4.u8 {%rs7570, %rs7571, %rs7572, %rs7573}, [matrix+3784]; + cvt.u32.u16 %r11617, %rs7573; + cvt.s32.s8 %r11618, %r11617; + cvt.u32.u16 %r11619, %rs7572; + cvt.s32.s8 %r11620, %r11619; + cvt.u32.u16 %r11621, %rs7571; + cvt.s32.s8 %r11622, %r11621; + cvt.u32.u16 %r11623, %rs7570; + cvt.s32.s8 %r11624, %r11623; + mad.lo.s32 %r11625, %r42, %r11624, %r11616; + mad.lo.s32 %r11626, %r43, %r11622, %r11625; + mad.lo.s32 %r11627, %r45, %r11620, %r11626; + mad.lo.s32 %r11628, %r46, %r11618, %r11627; + ld.const.v4.u8 {%rs7578, %rs7579, %rs7580, %rs7581}, [matrix+3788]; + cvt.u32.u16 %r11629, %rs7581; + cvt.s32.s8 %r11630, %r11629; + cvt.u32.u16 %r11631, %rs7580; + cvt.s32.s8 %r11632, %r11631; + cvt.u32.u16 %r11633, %rs7579; + cvt.s32.s8 %r11634, %r11633; + cvt.u32.u16 %r11635, %rs7578; + cvt.s32.s8 %r11636, %r11635; + mad.lo.s32 %r11637, %r48, %r11636, %r11628; + mad.lo.s32 %r11638, %r49, %r11634, %r11637; + mad.lo.s32 %r11639, %r50, %r11632, %r11638; + mad.lo.s32 %r11640, %r51, %r11630, %r11639; + ld.const.v4.u8 {%rs7586, %rs7587, %rs7588, %rs7589}, [matrix+3792]; + cvt.u32.u16 %r11641, %rs7589; + cvt.s32.s8 %r11642, %r11641; + cvt.u32.u16 %r11643, %rs7588; + cvt.s32.s8 %r11644, %r11643; + cvt.u32.u16 %r11645, %rs7587; + cvt.s32.s8 %r11646, %r11645; + cvt.u32.u16 %r11647, %rs7586; + cvt.s32.s8 %r11648, %r11647; + mad.lo.s32 %r11649, %r173, %r11648, %r11640; + mad.lo.s32 %r11650, %r53, %r11646, %r11649; + mad.lo.s32 %r11651, %r54, %r11644, %r11650; + mad.lo.s32 %r11652, %r55, %r11642, %r11651; + ld.const.v4.u8 {%rs7594, %rs7595, %rs7596, %rs7597}, [matrix+3796]; + cvt.u32.u16 %r11653, %rs7597; + cvt.s32.s8 %r11654, %r11653; + cvt.u32.u16 %r11655, %rs7596; + cvt.s32.s8 %r11656, %r11655; + cvt.u32.u16 %r11657, %rs7595; + cvt.s32.s8 %r11658, %r11657; + cvt.u32.u16 %r11659, %rs7594; + cvt.s32.s8 %r11660, %r11659; + mad.lo.s32 %r11661, %r56, %r11660, %r11652; + mad.lo.s32 %r11662, %r57, %r11658, %r11661; + mad.lo.s32 %r11663, %r58, %r11656, %r11662; + mad.lo.s32 %r11664, %r59, %r11654, %r11663; + ld.const.v4.u8 {%rs7602, %rs7603, %rs7604, %rs7605}, [matrix+3800]; + cvt.u32.u16 %r11665, %rs7605; + cvt.s32.s8 %r11666, %r11665; + cvt.u32.u16 %r11667, %rs7604; + cvt.s32.s8 %r11668, %r11667; + cvt.u32.u16 %r11669, %rs7603; + cvt.s32.s8 %r11670, %r11669; + cvt.u32.u16 %r11671, %rs7602; + cvt.s32.s8 %r11672, %r11671; + mad.lo.s32 %r11673, %r61, %r11672, %r11664; + mad.lo.s32 %r11674, %r62, %r11670, %r11673; + mad.lo.s32 %r11675, %r64, %r11668, %r11674; + mad.lo.s32 %r11676, %r65, %r11666, %r11675; + ld.const.v4.u8 {%rs7610, %rs7611, %rs7612, %rs7613}, [matrix+3804]; + cvt.u32.u16 %r11677, %rs7613; + cvt.s32.s8 %r11678, %r11677; + cvt.u32.u16 %r11679, %rs7612; + cvt.s32.s8 %r11680, %r11679; + cvt.u32.u16 %r11681, %rs7611; + cvt.s32.s8 %r11682, %r11681; + cvt.u32.u16 %r11683, %rs7610; + cvt.s32.s8 %r11684, %r11683; + mad.lo.s32 %r11685, %r67, %r11684, %r11676; + mad.lo.s32 %r11686, %r68, %r11682, %r11685; + mad.lo.s32 %r11687, %r69, %r11680, %r11686; + mad.lo.s32 %r11688, %r70, %r11678, %r11687; + 
ld.const.v4.u8 {%rs7618, %rs7619, %rs7620, %rs7621}, [matrix+3808]; + cvt.u32.u16 %r11689, %rs7621; + cvt.s32.s8 %r11690, %r11689; + cvt.u32.u16 %r11691, %rs7620; + cvt.s32.s8 %r11692, %r11691; + cvt.u32.u16 %r11693, %rs7619; + cvt.s32.s8 %r11694, %r11693; + cvt.u32.u16 %r11695, %rs7618; + cvt.s32.s8 %r11696, %r11695; + mad.lo.s32 %r11697, %r222, %r11696, %r11688; + mad.lo.s32 %r11698, %r72, %r11694, %r11697; + mad.lo.s32 %r11699, %r73, %r11692, %r11698; + mad.lo.s32 %r11700, %r74, %r11690, %r11699; + ld.const.v4.u8 {%rs7626, %rs7627, %rs7628, %rs7629}, [matrix+3812]; + cvt.u32.u16 %r11701, %rs7629; + cvt.s32.s8 %r11702, %r11701; + cvt.u32.u16 %r11703, %rs7628; + cvt.s32.s8 %r11704, %r11703; + cvt.u32.u16 %r11705, %rs7627; + cvt.s32.s8 %r11706, %r11705; + cvt.u32.u16 %r11707, %rs7626; + cvt.s32.s8 %r11708, %r11707; + mad.lo.s32 %r11709, %r75, %r11708, %r11700; + mad.lo.s32 %r11710, %r76, %r11706, %r11709; + mad.lo.s32 %r11711, %r77, %r11704, %r11710; + mad.lo.s32 %r11712, %r78, %r11702, %r11711; + ld.const.v4.u8 {%rs7634, %rs7635, %rs7636, %rs7637}, [matrix+3816]; + cvt.u32.u16 %r11713, %rs7637; + cvt.s32.s8 %r11714, %r11713; + cvt.u32.u16 %r11715, %rs7636; + cvt.s32.s8 %r11716, %r11715; + cvt.u32.u16 %r11717, %rs7635; + cvt.s32.s8 %r11718, %r11717; + cvt.u32.u16 %r11719, %rs7634; + cvt.s32.s8 %r11720, %r11719; + mad.lo.s32 %r11721, %r80, %r11720, %r11712; + mad.lo.s32 %r11722, %r81, %r11718, %r11721; + mad.lo.s32 %r11723, %r83, %r11716, %r11722; + mad.lo.s32 %r11724, %r84, %r11714, %r11723; + ld.const.v4.u8 {%rs7642, %rs7643, %rs7644, %rs7645}, [matrix+3820]; + cvt.u32.u16 %r11725, %rs7645; + cvt.s32.s8 %r11726, %r11725; + cvt.u32.u16 %r11727, %rs7644; + cvt.s32.s8 %r11728, %r11727; + cvt.u32.u16 %r11729, %rs7643; + cvt.s32.s8 %r11730, %r11729; + cvt.u32.u16 %r11731, %rs7642; + cvt.s32.s8 %r11732, %r11731; + mad.lo.s32 %r11733, %r86, %r11732, %r11724; + mad.lo.s32 %r11734, %r87, %r11730, %r11733; + mad.lo.s32 %r11735, %r88, %r11728, %r11734; + mad.lo.s32 %r11736, %r89, %r11726, %r11735; + ld.const.v4.u8 {%rs7650, %rs7651, %rs7652, %rs7653}, [matrix+3824]; + cvt.u32.u16 %r11737, %rs7653; + cvt.s32.s8 %r11738, %r11737; + cvt.u32.u16 %r11739, %rs7652; + cvt.s32.s8 %r11740, %r11739; + cvt.u32.u16 %r11741, %rs7651; + cvt.s32.s8 %r11742, %r11741; + cvt.u32.u16 %r11743, %rs7650; + cvt.s32.s8 %r11744, %r11743; + mad.lo.s32 %r11745, %r271, %r11744, %r11736; + mad.lo.s32 %r11746, %r91, %r11742, %r11745; + mad.lo.s32 %r11747, %r93, %r11740, %r11746; + mad.lo.s32 %r11748, %r94, %r11738, %r11747; + ld.const.v4.u8 {%rs7658, %rs7659, %rs7660, %rs7661}, [matrix+3828]; + cvt.u32.u16 %r11749, %rs7661; + cvt.s32.s8 %r11750, %r11749; + cvt.u32.u16 %r11751, %rs7660; + cvt.s32.s8 %r11752, %r11751; + cvt.u32.u16 %r11753, %rs7659; + cvt.s32.s8 %r11754, %r11753; + cvt.u32.u16 %r11755, %rs7658; + cvt.s32.s8 %r11756, %r11755; + mad.lo.s32 %r11757, %r96, %r11756, %r11748; + mad.lo.s32 %r11758, %r97, %r11754, %r11757; + mad.lo.s32 %r11759, %r99, %r11752, %r11758; + mad.lo.s32 %r11760, %r100, %r11750, %r11759; + ld.const.v4.u8 {%rs7666, %rs7667, %rs7668, %rs7669}, [matrix+3832]; + cvt.u32.u16 %r11761, %rs7669; + cvt.s32.s8 %r11762, %r11761; + cvt.u32.u16 %r11763, %rs7668; + cvt.s32.s8 %r11764, %r11763; + cvt.u32.u16 %r11765, %rs7667; + cvt.s32.s8 %r11766, %r11765; + cvt.u32.u16 %r11767, %rs7666; + cvt.s32.s8 %r11768, %r11767; + mad.lo.s32 %r11769, %r103, %r11768, %r11760; + mad.lo.s32 %r11770, %r104, %r11766, %r11769; + mad.lo.s32 %r11771, %r107, %r11764, %r11770; + mad.lo.s32 %r11772, %r108, %r11762, %r11771; + 
ld.const.v4.u8 {%rs7674, %rs7675, %rs7676, %rs7677}, [matrix+3836]; + cvt.u32.u16 %r11773, %rs7677; + cvt.s32.s8 %r11774, %r11773; + cvt.u32.u16 %r11775, %rs7676; + cvt.s32.s8 %r11776, %r11775; + cvt.u32.u16 %r11777, %rs7675; + cvt.s32.s8 %r11778, %r11777; + cvt.u32.u16 %r11779, %rs7674; + cvt.s32.s8 %r11780, %r11779; + mad.lo.s32 %r11781, %r111, %r11780, %r11772; + mad.lo.s32 %r11782, %r112, %r11778, %r11781; + mad.lo.s32 %r11783, %r114, %r11776, %r11782; + mad.lo.s32 %r11784, %r115, %r11774, %r11783; + shr.u32 %r11785, %r11592, 6; + and.b32 %r11786, %r11785, 240; + shr.u32 %r11787, %r11784, 10; + or.b32 %r11788, %r11787, %r11786; + xor.b32 %r11789, %r105, %r11788; + cvt.u64.u32 %rd404, %r11789; + ld.const.v4.u8 {%rs7682, %rs7683, %rs7684, %rs7685}, [matrix+3840]; + cvt.u32.u16 %r11790, %rs7685; + cvt.s32.s8 %r11791, %r11790; + cvt.u32.u16 %r11792, %rs7684; + cvt.s32.s8 %r11793, %r11792; + cvt.u32.u16 %r11794, %rs7682; + cvt.s32.s8 %r11795, %r11794; + cvt.u32.u16 %r11796, %rs7683; + cvt.s32.s8 %r11797, %r11796; + mul.lo.s32 %r11798, %r34, %r11797; + mad.lo.s32 %r11799, %r124, %r11795, %r11798; + mad.lo.s32 %r11800, %r35, %r11793, %r11799; + mad.lo.s32 %r11801, %r36, %r11791, %r11800; + ld.const.v4.u8 {%rs7690, %rs7691, %rs7692, %rs7693}, [matrix+3844]; + cvt.u32.u16 %r11802, %rs7693; + cvt.s32.s8 %r11803, %r11802; + cvt.u32.u16 %r11804, %rs7692; + cvt.s32.s8 %r11805, %r11804; + cvt.u32.u16 %r11806, %rs7691; + cvt.s32.s8 %r11807, %r11806; + cvt.u32.u16 %r11808, %rs7690; + cvt.s32.s8 %r11809, %r11808; + mad.lo.s32 %r11810, %r37, %r11809, %r11801; + mad.lo.s32 %r11811, %r38, %r11807, %r11810; + mad.lo.s32 %r11812, %r39, %r11805, %r11811; + mad.lo.s32 %r11813, %r40, %r11803, %r11812; + ld.const.v4.u8 {%rs7698, %rs7699, %rs7700, %rs7701}, [matrix+3848]; + cvt.u32.u16 %r11814, %rs7701; + cvt.s32.s8 %r11815, %r11814; + cvt.u32.u16 %r11816, %rs7700; + cvt.s32.s8 %r11817, %r11816; + cvt.u32.u16 %r11818, %rs7699; + cvt.s32.s8 %r11819, %r11818; + cvt.u32.u16 %r11820, %rs7698; + cvt.s32.s8 %r11821, %r11820; + mad.lo.s32 %r11822, %r42, %r11821, %r11813; + mad.lo.s32 %r11823, %r43, %r11819, %r11822; + mad.lo.s32 %r11824, %r45, %r11817, %r11823; + mad.lo.s32 %r11825, %r46, %r11815, %r11824; + ld.const.v4.u8 {%rs7706, %rs7707, %rs7708, %rs7709}, [matrix+3852]; + cvt.u32.u16 %r11826, %rs7709; + cvt.s32.s8 %r11827, %r11826; + cvt.u32.u16 %r11828, %rs7708; + cvt.s32.s8 %r11829, %r11828; + cvt.u32.u16 %r11830, %rs7707; + cvt.s32.s8 %r11831, %r11830; + cvt.u32.u16 %r11832, %rs7706; + cvt.s32.s8 %r11833, %r11832; + mad.lo.s32 %r11834, %r48, %r11833, %r11825; + mad.lo.s32 %r11835, %r49, %r11831, %r11834; + mad.lo.s32 %r11836, %r50, %r11829, %r11835; + mad.lo.s32 %r11837, %r51, %r11827, %r11836; + ld.const.v4.u8 {%rs7714, %rs7715, %rs7716, %rs7717}, [matrix+3856]; + cvt.u32.u16 %r11838, %rs7717; + cvt.s32.s8 %r11839, %r11838; + cvt.u32.u16 %r11840, %rs7716; + cvt.s32.s8 %r11841, %r11840; + cvt.u32.u16 %r11842, %rs7715; + cvt.s32.s8 %r11843, %r11842; + cvt.u32.u16 %r11844, %rs7714; + cvt.s32.s8 %r11845, %r11844; + mad.lo.s32 %r11846, %r173, %r11845, %r11837; + mad.lo.s32 %r11847, %r53, %r11843, %r11846; + mad.lo.s32 %r11848, %r54, %r11841, %r11847; + mad.lo.s32 %r11849, %r55, %r11839, %r11848; + ld.const.v4.u8 {%rs7722, %rs7723, %rs7724, %rs7725}, [matrix+3860]; + cvt.u32.u16 %r11850, %rs7725; + cvt.s32.s8 %r11851, %r11850; + cvt.u32.u16 %r11852, %rs7724; + cvt.s32.s8 %r11853, %r11852; + cvt.u32.u16 %r11854, %rs7723; + cvt.s32.s8 %r11855, %r11854; + cvt.u32.u16 %r11856, %rs7722; + cvt.s32.s8 %r11857, %r11856; + 
mad.lo.s32 %r11858, %r56, %r11857, %r11849; + mad.lo.s32 %r11859, %r57, %r11855, %r11858; + mad.lo.s32 %r11860, %r58, %r11853, %r11859; + mad.lo.s32 %r11861, %r59, %r11851, %r11860; + ld.const.v4.u8 {%rs7730, %rs7731, %rs7732, %rs7733}, [matrix+3864]; + cvt.u32.u16 %r11862, %rs7733; + cvt.s32.s8 %r11863, %r11862; + cvt.u32.u16 %r11864, %rs7732; + cvt.s32.s8 %r11865, %r11864; + cvt.u32.u16 %r11866, %rs7731; + cvt.s32.s8 %r11867, %r11866; + cvt.u32.u16 %r11868, %rs7730; + cvt.s32.s8 %r11869, %r11868; + mad.lo.s32 %r11870, %r61, %r11869, %r11861; + mad.lo.s32 %r11871, %r62, %r11867, %r11870; + mad.lo.s32 %r11872, %r64, %r11865, %r11871; + mad.lo.s32 %r11873, %r65, %r11863, %r11872; + ld.const.v4.u8 {%rs7738, %rs7739, %rs7740, %rs7741}, [matrix+3868]; + cvt.u32.u16 %r11874, %rs7741; + cvt.s32.s8 %r11875, %r11874; + cvt.u32.u16 %r11876, %rs7740; + cvt.s32.s8 %r11877, %r11876; + cvt.u32.u16 %r11878, %rs7739; + cvt.s32.s8 %r11879, %r11878; + cvt.u32.u16 %r11880, %rs7738; + cvt.s32.s8 %r11881, %r11880; + mad.lo.s32 %r11882, %r67, %r11881, %r11873; + mad.lo.s32 %r11883, %r68, %r11879, %r11882; + mad.lo.s32 %r11884, %r69, %r11877, %r11883; + mad.lo.s32 %r11885, %r70, %r11875, %r11884; + ld.const.v4.u8 {%rs7746, %rs7747, %rs7748, %rs7749}, [matrix+3872]; + cvt.u32.u16 %r11886, %rs7749; + cvt.s32.s8 %r11887, %r11886; + cvt.u32.u16 %r11888, %rs7748; + cvt.s32.s8 %r11889, %r11888; + cvt.u32.u16 %r11890, %rs7747; + cvt.s32.s8 %r11891, %r11890; + cvt.u32.u16 %r11892, %rs7746; + cvt.s32.s8 %r11893, %r11892; + mad.lo.s32 %r11894, %r222, %r11893, %r11885; + mad.lo.s32 %r11895, %r72, %r11891, %r11894; + mad.lo.s32 %r11896, %r73, %r11889, %r11895; + mad.lo.s32 %r11897, %r74, %r11887, %r11896; + ld.const.v4.u8 {%rs7754, %rs7755, %rs7756, %rs7757}, [matrix+3876]; + cvt.u32.u16 %r11898, %rs7757; + cvt.s32.s8 %r11899, %r11898; + cvt.u32.u16 %r11900, %rs7756; + cvt.s32.s8 %r11901, %r11900; + cvt.u32.u16 %r11902, %rs7755; + cvt.s32.s8 %r11903, %r11902; + cvt.u32.u16 %r11904, %rs7754; + cvt.s32.s8 %r11905, %r11904; + mad.lo.s32 %r11906, %r75, %r11905, %r11897; + mad.lo.s32 %r11907, %r76, %r11903, %r11906; + mad.lo.s32 %r11908, %r77, %r11901, %r11907; + mad.lo.s32 %r11909, %r78, %r11899, %r11908; + ld.const.v4.u8 {%rs7762, %rs7763, %rs7764, %rs7765}, [matrix+3880]; + cvt.u32.u16 %r11910, %rs7765; + cvt.s32.s8 %r11911, %r11910; + cvt.u32.u16 %r11912, %rs7764; + cvt.s32.s8 %r11913, %r11912; + cvt.u32.u16 %r11914, %rs7763; + cvt.s32.s8 %r11915, %r11914; + cvt.u32.u16 %r11916, %rs7762; + cvt.s32.s8 %r11917, %r11916; + mad.lo.s32 %r11918, %r80, %r11917, %r11909; + mad.lo.s32 %r11919, %r81, %r11915, %r11918; + mad.lo.s32 %r11920, %r83, %r11913, %r11919; + mad.lo.s32 %r11921, %r84, %r11911, %r11920; + ld.const.v4.u8 {%rs7770, %rs7771, %rs7772, %rs7773}, [matrix+3884]; + cvt.u32.u16 %r11922, %rs7773; + cvt.s32.s8 %r11923, %r11922; + cvt.u32.u16 %r11924, %rs7772; + cvt.s32.s8 %r11925, %r11924; + cvt.u32.u16 %r11926, %rs7771; + cvt.s32.s8 %r11927, %r11926; + cvt.u32.u16 %r11928, %rs7770; + cvt.s32.s8 %r11929, %r11928; + mad.lo.s32 %r11930, %r86, %r11929, %r11921; + mad.lo.s32 %r11931, %r87, %r11927, %r11930; + mad.lo.s32 %r11932, %r88, %r11925, %r11931; + mad.lo.s32 %r11933, %r89, %r11923, %r11932; + ld.const.v4.u8 {%rs7778, %rs7779, %rs7780, %rs7781}, [matrix+3888]; + cvt.u32.u16 %r11934, %rs7781; + cvt.s32.s8 %r11935, %r11934; + cvt.u32.u16 %r11936, %rs7780; + cvt.s32.s8 %r11937, %r11936; + cvt.u32.u16 %r11938, %rs7779; + cvt.s32.s8 %r11939, %r11938; + cvt.u32.u16 %r11940, %rs7778; + cvt.s32.s8 %r11941, %r11940; + mad.lo.s32 
%r11942, %r271, %r11941, %r11933; + mad.lo.s32 %r11943, %r91, %r11939, %r11942; + mad.lo.s32 %r11944, %r93, %r11937, %r11943; + mad.lo.s32 %r11945, %r94, %r11935, %r11944; + ld.const.v4.u8 {%rs7786, %rs7787, %rs7788, %rs7789}, [matrix+3892]; + cvt.u32.u16 %r11946, %rs7789; + cvt.s32.s8 %r11947, %r11946; + cvt.u32.u16 %r11948, %rs7788; + cvt.s32.s8 %r11949, %r11948; + cvt.u32.u16 %r11950, %rs7787; + cvt.s32.s8 %r11951, %r11950; + cvt.u32.u16 %r11952, %rs7786; + cvt.s32.s8 %r11953, %r11952; + mad.lo.s32 %r11954, %r96, %r11953, %r11945; + mad.lo.s32 %r11955, %r97, %r11951, %r11954; + mad.lo.s32 %r11956, %r99, %r11949, %r11955; + mad.lo.s32 %r11957, %r100, %r11947, %r11956; + ld.const.v4.u8 {%rs7794, %rs7795, %rs7796, %rs7797}, [matrix+3896]; + cvt.u32.u16 %r11958, %rs7797; + cvt.s32.s8 %r11959, %r11958; + cvt.u32.u16 %r11960, %rs7796; + cvt.s32.s8 %r11961, %r11960; + cvt.u32.u16 %r11962, %rs7795; + cvt.s32.s8 %r11963, %r11962; + cvt.u32.u16 %r11964, %rs7794; + cvt.s32.s8 %r11965, %r11964; + mad.lo.s32 %r11966, %r103, %r11965, %r11957; + mad.lo.s32 %r11967, %r104, %r11963, %r11966; + mad.lo.s32 %r11968, %r107, %r11961, %r11967; + mad.lo.s32 %r11969, %r108, %r11959, %r11968; + ld.const.v4.u8 {%rs7802, %rs7803, %rs7804, %rs7805}, [matrix+3900]; + cvt.u32.u16 %r11970, %rs7805; + cvt.s32.s8 %r11971, %r11970; + cvt.u32.u16 %r11972, %rs7804; + cvt.s32.s8 %r11973, %r11972; + cvt.u32.u16 %r11974, %rs7803; + cvt.s32.s8 %r11975, %r11974; + cvt.u32.u16 %r11976, %rs7802; + cvt.s32.s8 %r11977, %r11976; + mad.lo.s32 %r11978, %r111, %r11977, %r11969; + mad.lo.s32 %r11979, %r112, %r11975, %r11978; + mad.lo.s32 %r11980, %r114, %r11973, %r11979; + mad.lo.s32 %r11981, %r115, %r11971, %r11980; + ld.const.v4.u8 {%rs7810, %rs7811, %rs7812, %rs7813}, [matrix+3904]; + cvt.u32.u16 %r11982, %rs7813; + cvt.s32.s8 %r11983, %r11982; + cvt.u32.u16 %r11984, %rs7812; + cvt.s32.s8 %r11985, %r11984; + cvt.u32.u16 %r11986, %rs7810; + cvt.s32.s8 %r11987, %r11986; + cvt.u32.u16 %r11988, %rs7811; + cvt.s32.s8 %r11989, %r11988; + mul.lo.s32 %r11990, %r34, %r11989; + mad.lo.s32 %r11991, %r124, %r11987, %r11990; + mad.lo.s32 %r11992, %r35, %r11985, %r11991; + mad.lo.s32 %r11993, %r36, %r11983, %r11992; + ld.const.v4.u8 {%rs7818, %rs7819, %rs7820, %rs7821}, [matrix+3908]; + cvt.u32.u16 %r11994, %rs7821; + cvt.s32.s8 %r11995, %r11994; + cvt.u32.u16 %r11996, %rs7820; + cvt.s32.s8 %r11997, %r11996; + cvt.u32.u16 %r11998, %rs7819; + cvt.s32.s8 %r11999, %r11998; + cvt.u32.u16 %r12000, %rs7818; + cvt.s32.s8 %r12001, %r12000; + mad.lo.s32 %r12002, %r37, %r12001, %r11993; + mad.lo.s32 %r12003, %r38, %r11999, %r12002; + mad.lo.s32 %r12004, %r39, %r11997, %r12003; + mad.lo.s32 %r12005, %r40, %r11995, %r12004; + ld.const.v4.u8 {%rs7826, %rs7827, %rs7828, %rs7829}, [matrix+3912]; + cvt.u32.u16 %r12006, %rs7829; + cvt.s32.s8 %r12007, %r12006; + cvt.u32.u16 %r12008, %rs7828; + cvt.s32.s8 %r12009, %r12008; + cvt.u32.u16 %r12010, %rs7827; + cvt.s32.s8 %r12011, %r12010; + cvt.u32.u16 %r12012, %rs7826; + cvt.s32.s8 %r12013, %r12012; + mad.lo.s32 %r12014, %r42, %r12013, %r12005; + mad.lo.s32 %r12015, %r43, %r12011, %r12014; + mad.lo.s32 %r12016, %r45, %r12009, %r12015; + mad.lo.s32 %r12017, %r46, %r12007, %r12016; + ld.const.v4.u8 {%rs7834, %rs7835, %rs7836, %rs7837}, [matrix+3916]; + cvt.u32.u16 %r12018, %rs7837; + cvt.s32.s8 %r12019, %r12018; + cvt.u32.u16 %r12020, %rs7836; + cvt.s32.s8 %r12021, %r12020; + cvt.u32.u16 %r12022, %rs7835; + cvt.s32.s8 %r12023, %r12022; + cvt.u32.u16 %r12024, %rs7834; + cvt.s32.s8 %r12025, %r12024; + mad.lo.s32 %r12026, 
%r48, %r12025, %r12017; + mad.lo.s32 %r12027, %r49, %r12023, %r12026; + mad.lo.s32 %r12028, %r50, %r12021, %r12027; + mad.lo.s32 %r12029, %r51, %r12019, %r12028; + ld.const.v4.u8 {%rs7842, %rs7843, %rs7844, %rs7845}, [matrix+3920]; + cvt.u32.u16 %r12030, %rs7845; + cvt.s32.s8 %r12031, %r12030; + cvt.u32.u16 %r12032, %rs7844; + cvt.s32.s8 %r12033, %r12032; + cvt.u32.u16 %r12034, %rs7843; + cvt.s32.s8 %r12035, %r12034; + cvt.u32.u16 %r12036, %rs7842; + cvt.s32.s8 %r12037, %r12036; + mad.lo.s32 %r12038, %r173, %r12037, %r12029; + mad.lo.s32 %r12039, %r53, %r12035, %r12038; + mad.lo.s32 %r12040, %r54, %r12033, %r12039; + mad.lo.s32 %r12041, %r55, %r12031, %r12040; + ld.const.v4.u8 {%rs7850, %rs7851, %rs7852, %rs7853}, [matrix+3924]; + cvt.u32.u16 %r12042, %rs7853; + cvt.s32.s8 %r12043, %r12042; + cvt.u32.u16 %r12044, %rs7852; + cvt.s32.s8 %r12045, %r12044; + cvt.u32.u16 %r12046, %rs7851; + cvt.s32.s8 %r12047, %r12046; + cvt.u32.u16 %r12048, %rs7850; + cvt.s32.s8 %r12049, %r12048; + mad.lo.s32 %r12050, %r56, %r12049, %r12041; + mad.lo.s32 %r12051, %r57, %r12047, %r12050; + mad.lo.s32 %r12052, %r58, %r12045, %r12051; + mad.lo.s32 %r12053, %r59, %r12043, %r12052; + ld.const.v4.u8 {%rs7858, %rs7859, %rs7860, %rs7861}, [matrix+3928]; + cvt.u32.u16 %r12054, %rs7861; + cvt.s32.s8 %r12055, %r12054; + cvt.u32.u16 %r12056, %rs7860; + cvt.s32.s8 %r12057, %r12056; + cvt.u32.u16 %r12058, %rs7859; + cvt.s32.s8 %r12059, %r12058; + cvt.u32.u16 %r12060, %rs7858; + cvt.s32.s8 %r12061, %r12060; + mad.lo.s32 %r12062, %r61, %r12061, %r12053; + mad.lo.s32 %r12063, %r62, %r12059, %r12062; + mad.lo.s32 %r12064, %r64, %r12057, %r12063; + mad.lo.s32 %r12065, %r65, %r12055, %r12064; + ld.const.v4.u8 {%rs7866, %rs7867, %rs7868, %rs7869}, [matrix+3932]; + cvt.u32.u16 %r12066, %rs7869; + cvt.s32.s8 %r12067, %r12066; + cvt.u32.u16 %r12068, %rs7868; + cvt.s32.s8 %r12069, %r12068; + cvt.u32.u16 %r12070, %rs7867; + cvt.s32.s8 %r12071, %r12070; + cvt.u32.u16 %r12072, %rs7866; + cvt.s32.s8 %r12073, %r12072; + mad.lo.s32 %r12074, %r67, %r12073, %r12065; + mad.lo.s32 %r12075, %r68, %r12071, %r12074; + mad.lo.s32 %r12076, %r69, %r12069, %r12075; + mad.lo.s32 %r12077, %r70, %r12067, %r12076; + ld.const.v4.u8 {%rs7874, %rs7875, %rs7876, %rs7877}, [matrix+3936]; + cvt.u32.u16 %r12078, %rs7877; + cvt.s32.s8 %r12079, %r12078; + cvt.u32.u16 %r12080, %rs7876; + cvt.s32.s8 %r12081, %r12080; + cvt.u32.u16 %r12082, %rs7875; + cvt.s32.s8 %r12083, %r12082; + cvt.u32.u16 %r12084, %rs7874; + cvt.s32.s8 %r12085, %r12084; + mad.lo.s32 %r12086, %r222, %r12085, %r12077; + mad.lo.s32 %r12087, %r72, %r12083, %r12086; + mad.lo.s32 %r12088, %r73, %r12081, %r12087; + mad.lo.s32 %r12089, %r74, %r12079, %r12088; + ld.const.v4.u8 {%rs7882, %rs7883, %rs7884, %rs7885}, [matrix+3940]; + cvt.u32.u16 %r12090, %rs7885; + cvt.s32.s8 %r12091, %r12090; + cvt.u32.u16 %r12092, %rs7884; + cvt.s32.s8 %r12093, %r12092; + cvt.u32.u16 %r12094, %rs7883; + cvt.s32.s8 %r12095, %r12094; + cvt.u32.u16 %r12096, %rs7882; + cvt.s32.s8 %r12097, %r12096; + mad.lo.s32 %r12098, %r75, %r12097, %r12089; + mad.lo.s32 %r12099, %r76, %r12095, %r12098; + mad.lo.s32 %r12100, %r77, %r12093, %r12099; + mad.lo.s32 %r12101, %r78, %r12091, %r12100; + ld.const.v4.u8 {%rs7890, %rs7891, %rs7892, %rs7893}, [matrix+3944]; + cvt.u32.u16 %r12102, %rs7893; + cvt.s32.s8 %r12103, %r12102; + cvt.u32.u16 %r12104, %rs7892; + cvt.s32.s8 %r12105, %r12104; + cvt.u32.u16 %r12106, %rs7891; + cvt.s32.s8 %r12107, %r12106; + cvt.u32.u16 %r12108, %rs7890; + cvt.s32.s8 %r12109, %r12108; + mad.lo.s32 %r12110, %r80, 
%r12109, %r12101; + mad.lo.s32 %r12111, %r81, %r12107, %r12110; + mad.lo.s32 %r12112, %r83, %r12105, %r12111; + mad.lo.s32 %r12113, %r84, %r12103, %r12112; + ld.const.v4.u8 {%rs7898, %rs7899, %rs7900, %rs7901}, [matrix+3948]; + cvt.u32.u16 %r12114, %rs7901; + cvt.s32.s8 %r12115, %r12114; + cvt.u32.u16 %r12116, %rs7900; + cvt.s32.s8 %r12117, %r12116; + cvt.u32.u16 %r12118, %rs7899; + cvt.s32.s8 %r12119, %r12118; + cvt.u32.u16 %r12120, %rs7898; + cvt.s32.s8 %r12121, %r12120; + mad.lo.s32 %r12122, %r86, %r12121, %r12113; + mad.lo.s32 %r12123, %r87, %r12119, %r12122; + mad.lo.s32 %r12124, %r88, %r12117, %r12123; + mad.lo.s32 %r12125, %r89, %r12115, %r12124; + ld.const.v4.u8 {%rs7906, %rs7907, %rs7908, %rs7909}, [matrix+3952]; + cvt.u32.u16 %r12126, %rs7909; + cvt.s32.s8 %r12127, %r12126; + cvt.u32.u16 %r12128, %rs7908; + cvt.s32.s8 %r12129, %r12128; + cvt.u32.u16 %r12130, %rs7907; + cvt.s32.s8 %r12131, %r12130; + cvt.u32.u16 %r12132, %rs7906; + cvt.s32.s8 %r12133, %r12132; + mad.lo.s32 %r12134, %r271, %r12133, %r12125; + mad.lo.s32 %r12135, %r91, %r12131, %r12134; + mad.lo.s32 %r12136, %r93, %r12129, %r12135; + mad.lo.s32 %r12137, %r94, %r12127, %r12136; + ld.const.v4.u8 {%rs7914, %rs7915, %rs7916, %rs7917}, [matrix+3956]; + cvt.u32.u16 %r12138, %rs7917; + cvt.s32.s8 %r12139, %r12138; + cvt.u32.u16 %r12140, %rs7916; + cvt.s32.s8 %r12141, %r12140; + cvt.u32.u16 %r12142, %rs7915; + cvt.s32.s8 %r12143, %r12142; + cvt.u32.u16 %r12144, %rs7914; + cvt.s32.s8 %r12145, %r12144; + mad.lo.s32 %r12146, %r96, %r12145, %r12137; + mad.lo.s32 %r12147, %r97, %r12143, %r12146; + mad.lo.s32 %r12148, %r99, %r12141, %r12147; + mad.lo.s32 %r12149, %r100, %r12139, %r12148; + ld.const.v4.u8 {%rs7922, %rs7923, %rs7924, %rs7925}, [matrix+3960]; + cvt.u32.u16 %r12150, %rs7925; + cvt.s32.s8 %r12151, %r12150; + cvt.u32.u16 %r12152, %rs7924; + cvt.s32.s8 %r12153, %r12152; + cvt.u32.u16 %r12154, %rs7923; + cvt.s32.s8 %r12155, %r12154; + cvt.u32.u16 %r12156, %rs7922; + cvt.s32.s8 %r12157, %r12156; + mad.lo.s32 %r12158, %r103, %r12157, %r12149; + mad.lo.s32 %r12159, %r104, %r12155, %r12158; + mad.lo.s32 %r12160, %r107, %r12153, %r12159; + mad.lo.s32 %r12161, %r108, %r12151, %r12160; + ld.const.v4.u8 {%rs7930, %rs7931, %rs7932, %rs7933}, [matrix+3964]; + cvt.u32.u16 %r12162, %rs7933; + cvt.s32.s8 %r12163, %r12162; + cvt.u32.u16 %r12164, %rs7932; + cvt.s32.s8 %r12165, %r12164; + cvt.u32.u16 %r12166, %rs7931; + cvt.s32.s8 %r12167, %r12166; + cvt.u32.u16 %r12168, %rs7930; + cvt.s32.s8 %r12169, %r12168; + mad.lo.s32 %r12170, %r111, %r12169, %r12161; + mad.lo.s32 %r12171, %r112, %r12167, %r12170; + mad.lo.s32 %r12172, %r114, %r12165, %r12171; + mad.lo.s32 %r12173, %r115, %r12163, %r12172; + shr.u32 %r12174, %r11981, 6; + and.b32 %r12175, %r12174, 240; + shr.u32 %r12176, %r12173, 10; + or.b32 %r12177, %r12176, %r12175; + xor.b32 %r12178, %r109, %r12177; + cvt.u64.u32 %rd405, %r12178; + ld.const.v4.u8 {%rs7938, %rs7939, %rs7940, %rs7941}, [matrix+3968]; + cvt.u32.u16 %r12179, %rs7941; + cvt.s32.s8 %r12180, %r12179; + cvt.u32.u16 %r12181, %rs7940; + cvt.s32.s8 %r12182, %r12181; + cvt.u32.u16 %r12183, %rs7938; + cvt.s32.s8 %r12184, %r12183; + cvt.u32.u16 %r12185, %rs7939; + cvt.s32.s8 %r12186, %r12185; + mul.lo.s32 %r12187, %r34, %r12186; + mad.lo.s32 %r12188, %r124, %r12184, %r12187; + mad.lo.s32 %r12189, %r35, %r12182, %r12188; + mad.lo.s32 %r12190, %r36, %r12180, %r12189; + ld.const.v4.u8 {%rs7946, %rs7947, %rs7948, %rs7949}, [matrix+3972]; + cvt.u32.u16 %r12191, %rs7949; + cvt.s32.s8 %r12192, %r12191; + cvt.u32.u16 %r12193, 
%rs7948; + cvt.s32.s8 %r12194, %r12193; + cvt.u32.u16 %r12195, %rs7947; + cvt.s32.s8 %r12196, %r12195; + cvt.u32.u16 %r12197, %rs7946; + cvt.s32.s8 %r12198, %r12197; + mad.lo.s32 %r12199, %r37, %r12198, %r12190; + mad.lo.s32 %r12200, %r38, %r12196, %r12199; + mad.lo.s32 %r12201, %r39, %r12194, %r12200; + mad.lo.s32 %r12202, %r40, %r12192, %r12201; + ld.const.v4.u8 {%rs7954, %rs7955, %rs7956, %rs7957}, [matrix+3976]; + cvt.u32.u16 %r12203, %rs7957; + cvt.s32.s8 %r12204, %r12203; + cvt.u32.u16 %r12205, %rs7956; + cvt.s32.s8 %r12206, %r12205; + cvt.u32.u16 %r12207, %rs7955; + cvt.s32.s8 %r12208, %r12207; + cvt.u32.u16 %r12209, %rs7954; + cvt.s32.s8 %r12210, %r12209; + mad.lo.s32 %r12211, %r42, %r12210, %r12202; + mad.lo.s32 %r12212, %r43, %r12208, %r12211; + mad.lo.s32 %r12213, %r45, %r12206, %r12212; + mad.lo.s32 %r12214, %r46, %r12204, %r12213; + ld.const.v4.u8 {%rs7962, %rs7963, %rs7964, %rs7965}, [matrix+3980]; + cvt.u32.u16 %r12215, %rs7965; + cvt.s32.s8 %r12216, %r12215; + cvt.u32.u16 %r12217, %rs7964; + cvt.s32.s8 %r12218, %r12217; + cvt.u32.u16 %r12219, %rs7963; + cvt.s32.s8 %r12220, %r12219; + cvt.u32.u16 %r12221, %rs7962; + cvt.s32.s8 %r12222, %r12221; + mad.lo.s32 %r12223, %r48, %r12222, %r12214; + mad.lo.s32 %r12224, %r49, %r12220, %r12223; + mad.lo.s32 %r12225, %r50, %r12218, %r12224; + mad.lo.s32 %r12226, %r51, %r12216, %r12225; + ld.const.v4.u8 {%rs7970, %rs7971, %rs7972, %rs7973}, [matrix+3984]; + cvt.u32.u16 %r12227, %rs7973; + cvt.s32.s8 %r12228, %r12227; + cvt.u32.u16 %r12229, %rs7972; + cvt.s32.s8 %r12230, %r12229; + cvt.u32.u16 %r12231, %rs7971; + cvt.s32.s8 %r12232, %r12231; + cvt.u32.u16 %r12233, %rs7970; + cvt.s32.s8 %r12234, %r12233; + mad.lo.s32 %r12235, %r173, %r12234, %r12226; + mad.lo.s32 %r12236, %r53, %r12232, %r12235; + mad.lo.s32 %r12237, %r54, %r12230, %r12236; + mad.lo.s32 %r12238, %r55, %r12228, %r12237; + ld.const.v4.u8 {%rs7978, %rs7979, %rs7980, %rs7981}, [matrix+3988]; + cvt.u32.u16 %r12239, %rs7981; + cvt.s32.s8 %r12240, %r12239; + cvt.u32.u16 %r12241, %rs7980; + cvt.s32.s8 %r12242, %r12241; + cvt.u32.u16 %r12243, %rs7979; + cvt.s32.s8 %r12244, %r12243; + cvt.u32.u16 %r12245, %rs7978; + cvt.s32.s8 %r12246, %r12245; + mad.lo.s32 %r12247, %r56, %r12246, %r12238; + mad.lo.s32 %r12248, %r57, %r12244, %r12247; + mad.lo.s32 %r12249, %r58, %r12242, %r12248; + mad.lo.s32 %r12250, %r59, %r12240, %r12249; + ld.const.v4.u8 {%rs7986, %rs7987, %rs7988, %rs7989}, [matrix+3992]; + cvt.u32.u16 %r12251, %rs7989; + cvt.s32.s8 %r12252, %r12251; + cvt.u32.u16 %r12253, %rs7988; + cvt.s32.s8 %r12254, %r12253; + cvt.u32.u16 %r12255, %rs7987; + cvt.s32.s8 %r12256, %r12255; + cvt.u32.u16 %r12257, %rs7986; + cvt.s32.s8 %r12258, %r12257; + mad.lo.s32 %r12259, %r61, %r12258, %r12250; + mad.lo.s32 %r12260, %r62, %r12256, %r12259; + mad.lo.s32 %r12261, %r64, %r12254, %r12260; + mad.lo.s32 %r12262, %r65, %r12252, %r12261; + ld.const.v4.u8 {%rs7994, %rs7995, %rs7996, %rs7997}, [matrix+3996]; + cvt.u32.u16 %r12263, %rs7997; + cvt.s32.s8 %r12264, %r12263; + cvt.u32.u16 %r12265, %rs7996; + cvt.s32.s8 %r12266, %r12265; + cvt.u32.u16 %r12267, %rs7995; + cvt.s32.s8 %r12268, %r12267; + cvt.u32.u16 %r12269, %rs7994; + cvt.s32.s8 %r12270, %r12269; + mad.lo.s32 %r12271, %r67, %r12270, %r12262; + mad.lo.s32 %r12272, %r68, %r12268, %r12271; + mad.lo.s32 %r12273, %r69, %r12266, %r12272; + mad.lo.s32 %r12274, %r70, %r12264, %r12273; + ld.const.v4.u8 {%rs8002, %rs8003, %rs8004, %rs8005}, [matrix+4000]; + cvt.u32.u16 %r12275, %rs8005; + cvt.s32.s8 %r12276, %r12275; + cvt.u32.u16 %r12277, %rs8004; + 
cvt.s32.s8 %r12278, %r12277; + cvt.u32.u16 %r12279, %rs8003; + cvt.s32.s8 %r12280, %r12279; + cvt.u32.u16 %r12281, %rs8002; + cvt.s32.s8 %r12282, %r12281; + mad.lo.s32 %r12283, %r222, %r12282, %r12274; + mad.lo.s32 %r12284, %r72, %r12280, %r12283; + mad.lo.s32 %r12285, %r73, %r12278, %r12284; + mad.lo.s32 %r12286, %r74, %r12276, %r12285; + ld.const.v4.u8 {%rs8010, %rs8011, %rs8012, %rs8013}, [matrix+4004]; + cvt.u32.u16 %r12287, %rs8013; + cvt.s32.s8 %r12288, %r12287; + cvt.u32.u16 %r12289, %rs8012; + cvt.s32.s8 %r12290, %r12289; + cvt.u32.u16 %r12291, %rs8011; + cvt.s32.s8 %r12292, %r12291; + cvt.u32.u16 %r12293, %rs8010; + cvt.s32.s8 %r12294, %r12293; + mad.lo.s32 %r12295, %r75, %r12294, %r12286; + mad.lo.s32 %r12296, %r76, %r12292, %r12295; + mad.lo.s32 %r12297, %r77, %r12290, %r12296; + mad.lo.s32 %r12298, %r78, %r12288, %r12297; + ld.const.v4.u8 {%rs8018, %rs8019, %rs8020, %rs8021}, [matrix+4008]; + cvt.u32.u16 %r12299, %rs8021; + cvt.s32.s8 %r12300, %r12299; + cvt.u32.u16 %r12301, %rs8020; + cvt.s32.s8 %r12302, %r12301; + cvt.u32.u16 %r12303, %rs8019; + cvt.s32.s8 %r12304, %r12303; + cvt.u32.u16 %r12305, %rs8018; + cvt.s32.s8 %r12306, %r12305; + mad.lo.s32 %r12307, %r80, %r12306, %r12298; + mad.lo.s32 %r12308, %r81, %r12304, %r12307; + mad.lo.s32 %r12309, %r83, %r12302, %r12308; + mad.lo.s32 %r12310, %r84, %r12300, %r12309; + ld.const.v4.u8 {%rs8026, %rs8027, %rs8028, %rs8029}, [matrix+4012]; + cvt.u32.u16 %r12311, %rs8029; + cvt.s32.s8 %r12312, %r12311; + cvt.u32.u16 %r12313, %rs8028; + cvt.s32.s8 %r12314, %r12313; + cvt.u32.u16 %r12315, %rs8027; + cvt.s32.s8 %r12316, %r12315; + cvt.u32.u16 %r12317, %rs8026; + cvt.s32.s8 %r12318, %r12317; + mad.lo.s32 %r12319, %r86, %r12318, %r12310; + mad.lo.s32 %r12320, %r87, %r12316, %r12319; + mad.lo.s32 %r12321, %r88, %r12314, %r12320; + mad.lo.s32 %r12322, %r89, %r12312, %r12321; + ld.const.v4.u8 {%rs8034, %rs8035, %rs8036, %rs8037}, [matrix+4016]; + cvt.u32.u16 %r12323, %rs8037; + cvt.s32.s8 %r12324, %r12323; + cvt.u32.u16 %r12325, %rs8036; + cvt.s32.s8 %r12326, %r12325; + cvt.u32.u16 %r12327, %rs8035; + cvt.s32.s8 %r12328, %r12327; + cvt.u32.u16 %r12329, %rs8034; + cvt.s32.s8 %r12330, %r12329; + mad.lo.s32 %r12331, %r271, %r12330, %r12322; + mad.lo.s32 %r12332, %r91, %r12328, %r12331; + mad.lo.s32 %r12333, %r93, %r12326, %r12332; + mad.lo.s32 %r12334, %r94, %r12324, %r12333; + ld.const.v4.u8 {%rs8042, %rs8043, %rs8044, %rs8045}, [matrix+4020]; + cvt.u32.u16 %r12335, %rs8045; + cvt.s32.s8 %r12336, %r12335; + cvt.u32.u16 %r12337, %rs8044; + cvt.s32.s8 %r12338, %r12337; + cvt.u32.u16 %r12339, %rs8043; + cvt.s32.s8 %r12340, %r12339; + cvt.u32.u16 %r12341, %rs8042; + cvt.s32.s8 %r12342, %r12341; + mad.lo.s32 %r12343, %r96, %r12342, %r12334; + mad.lo.s32 %r12344, %r97, %r12340, %r12343; + mad.lo.s32 %r12345, %r99, %r12338, %r12344; + mad.lo.s32 %r12346, %r100, %r12336, %r12345; + ld.const.v4.u8 {%rs8050, %rs8051, %rs8052, %rs8053}, [matrix+4024]; + cvt.u32.u16 %r12347, %rs8053; + cvt.s32.s8 %r12348, %r12347; + cvt.u32.u16 %r12349, %rs8052; + cvt.s32.s8 %r12350, %r12349; + cvt.u32.u16 %r12351, %rs8051; + cvt.s32.s8 %r12352, %r12351; + cvt.u32.u16 %r12353, %rs8050; + cvt.s32.s8 %r12354, %r12353; + mad.lo.s32 %r12355, %r103, %r12354, %r12346; + mad.lo.s32 %r12356, %r104, %r12352, %r12355; + mad.lo.s32 %r12357, %r107, %r12350, %r12356; + mad.lo.s32 %r12358, %r108, %r12348, %r12357; + ld.const.v4.u8 {%rs8058, %rs8059, %rs8060, %rs8061}, [matrix+4028]; + cvt.u32.u16 %r12359, %rs8061; + cvt.s32.s8 %r12360, %r12359; + cvt.u32.u16 %r12361, %rs8060; + 
cvt.s32.s8 %r12362, %r12361; + cvt.u32.u16 %r12363, %rs8059; + cvt.s32.s8 %r12364, %r12363; + cvt.u32.u16 %r12365, %rs8058; + cvt.s32.s8 %r12366, %r12365; + mad.lo.s32 %r12367, %r111, %r12366, %r12358; + mad.lo.s32 %r12368, %r112, %r12364, %r12367; + mad.lo.s32 %r12369, %r114, %r12362, %r12368; + mad.lo.s32 %r12370, %r115, %r12360, %r12369; + ld.const.v4.u8 {%rs8066, %rs8067, %rs8068, %rs8069}, [matrix+4032]; + cvt.u32.u16 %r12371, %rs8069; + cvt.s32.s8 %r12372, %r12371; + cvt.u32.u16 %r12373, %rs8068; + cvt.s32.s8 %r12374, %r12373; + cvt.u32.u16 %r12375, %rs8066; + cvt.s32.s8 %r12376, %r12375; + cvt.u32.u16 %r12377, %rs8067; + cvt.s32.s8 %r12378, %r12377; + mul.lo.s32 %r12379, %r34, %r12378; + mad.lo.s32 %r12380, %r124, %r12376, %r12379; + mad.lo.s32 %r12381, %r35, %r12374, %r12380; + mad.lo.s32 %r12382, %r36, %r12372, %r12381; + ld.const.v4.u8 {%rs8074, %rs8075, %rs8076, %rs8077}, [matrix+4036]; + cvt.u32.u16 %r12383, %rs8077; + cvt.s32.s8 %r12384, %r12383; + cvt.u32.u16 %r12385, %rs8076; + cvt.s32.s8 %r12386, %r12385; + cvt.u32.u16 %r12387, %rs8075; + cvt.s32.s8 %r12388, %r12387; + cvt.u32.u16 %r12389, %rs8074; + cvt.s32.s8 %r12390, %r12389; + mad.lo.s32 %r12391, %r37, %r12390, %r12382; + mad.lo.s32 %r12392, %r38, %r12388, %r12391; + mad.lo.s32 %r12393, %r39, %r12386, %r12392; + mad.lo.s32 %r12394, %r40, %r12384, %r12393; + ld.const.v4.u8 {%rs8082, %rs8083, %rs8084, %rs8085}, [matrix+4040]; + cvt.u32.u16 %r12395, %rs8085; + cvt.s32.s8 %r12396, %r12395; + cvt.u32.u16 %r12397, %rs8084; + cvt.s32.s8 %r12398, %r12397; + cvt.u32.u16 %r12399, %rs8083; + cvt.s32.s8 %r12400, %r12399; + cvt.u32.u16 %r12401, %rs8082; + cvt.s32.s8 %r12402, %r12401; + mad.lo.s32 %r12403, %r42, %r12402, %r12394; + mad.lo.s32 %r12404, %r43, %r12400, %r12403; + mad.lo.s32 %r12405, %r45, %r12398, %r12404; + mad.lo.s32 %r12406, %r46, %r12396, %r12405; + ld.const.v4.u8 {%rs8090, %rs8091, %rs8092, %rs8093}, [matrix+4044]; + cvt.u32.u16 %r12407, %rs8093; + cvt.s32.s8 %r12408, %r12407; + cvt.u32.u16 %r12409, %rs8092; + cvt.s32.s8 %r12410, %r12409; + cvt.u32.u16 %r12411, %rs8091; + cvt.s32.s8 %r12412, %r12411; + cvt.u32.u16 %r12413, %rs8090; + cvt.s32.s8 %r12414, %r12413; + mad.lo.s32 %r12415, %r48, %r12414, %r12406; + mad.lo.s32 %r12416, %r49, %r12412, %r12415; + mad.lo.s32 %r12417, %r50, %r12410, %r12416; + mad.lo.s32 %r12418, %r51, %r12408, %r12417; + ld.const.v4.u8 {%rs8098, %rs8099, %rs8100, %rs8101}, [matrix+4048]; + cvt.u32.u16 %r12419, %rs8101; + cvt.s32.s8 %r12420, %r12419; + cvt.u32.u16 %r12421, %rs8100; + cvt.s32.s8 %r12422, %r12421; + cvt.u32.u16 %r12423, %rs8099; + cvt.s32.s8 %r12424, %r12423; + cvt.u32.u16 %r12425, %rs8098; + cvt.s32.s8 %r12426, %r12425; + mad.lo.s32 %r12427, %r173, %r12426, %r12418; + mad.lo.s32 %r12428, %r53, %r12424, %r12427; + mad.lo.s32 %r12429, %r54, %r12422, %r12428; + mad.lo.s32 %r12430, %r55, %r12420, %r12429; + ld.const.v4.u8 {%rs8106, %rs8107, %rs8108, %rs8109}, [matrix+4052]; + cvt.u32.u16 %r12431, %rs8109; + cvt.s32.s8 %r12432, %r12431; + cvt.u32.u16 %r12433, %rs8108; + cvt.s32.s8 %r12434, %r12433; + cvt.u32.u16 %r12435, %rs8107; + cvt.s32.s8 %r12436, %r12435; + cvt.u32.u16 %r12437, %rs8106; + cvt.s32.s8 %r12438, %r12437; + mad.lo.s32 %r12439, %r56, %r12438, %r12430; + mad.lo.s32 %r12440, %r57, %r12436, %r12439; + mad.lo.s32 %r12441, %r58, %r12434, %r12440; + mad.lo.s32 %r12442, %r59, %r12432, %r12441; + ld.const.v4.u8 {%rs8114, %rs8115, %rs8116, %rs8117}, [matrix+4056]; + cvt.u32.u16 %r12443, %rs8117; + cvt.s32.s8 %r12444, %r12443; + cvt.u32.u16 %r12445, %rs8116; + cvt.s32.s8 
%r12446, %r12445; + cvt.u32.u16 %r12447, %rs8115; + cvt.s32.s8 %r12448, %r12447; + cvt.u32.u16 %r12449, %rs8114; + cvt.s32.s8 %r12450, %r12449; + mad.lo.s32 %r12451, %r61, %r12450, %r12442; + mad.lo.s32 %r12452, %r62, %r12448, %r12451; + mad.lo.s32 %r12453, %r64, %r12446, %r12452; + mad.lo.s32 %r12454, %r65, %r12444, %r12453; + ld.const.v4.u8 {%rs8122, %rs8123, %rs8124, %rs8125}, [matrix+4060]; + cvt.u32.u16 %r12455, %rs8125; + cvt.s32.s8 %r12456, %r12455; + cvt.u32.u16 %r12457, %rs8124; + cvt.s32.s8 %r12458, %r12457; + cvt.u32.u16 %r12459, %rs8123; + cvt.s32.s8 %r12460, %r12459; + cvt.u32.u16 %r12461, %rs8122; + cvt.s32.s8 %r12462, %r12461; + mad.lo.s32 %r12463, %r67, %r12462, %r12454; + mad.lo.s32 %r12464, %r68, %r12460, %r12463; + mad.lo.s32 %r12465, %r69, %r12458, %r12464; + mad.lo.s32 %r12466, %r70, %r12456, %r12465; + ld.const.v4.u8 {%rs8130, %rs8131, %rs8132, %rs8133}, [matrix+4064]; + cvt.u32.u16 %r12467, %rs8133; + cvt.s32.s8 %r12468, %r12467; + cvt.u32.u16 %r12469, %rs8132; + cvt.s32.s8 %r12470, %r12469; + cvt.u32.u16 %r12471, %rs8131; + cvt.s32.s8 %r12472, %r12471; + cvt.u32.u16 %r12473, %rs8130; + cvt.s32.s8 %r12474, %r12473; + mad.lo.s32 %r12475, %r222, %r12474, %r12466; + mad.lo.s32 %r12476, %r72, %r12472, %r12475; + mad.lo.s32 %r12477, %r73, %r12470, %r12476; + mad.lo.s32 %r12478, %r74, %r12468, %r12477; + ld.const.v4.u8 {%rs8138, %rs8139, %rs8140, %rs8141}, [matrix+4068]; + cvt.u32.u16 %r12479, %rs8141; + cvt.s32.s8 %r12480, %r12479; + cvt.u32.u16 %r12481, %rs8140; + cvt.s32.s8 %r12482, %r12481; + cvt.u32.u16 %r12483, %rs8139; + cvt.s32.s8 %r12484, %r12483; + cvt.u32.u16 %r12485, %rs8138; + cvt.s32.s8 %r12486, %r12485; + mad.lo.s32 %r12487, %r75, %r12486, %r12478; + mad.lo.s32 %r12488, %r76, %r12484, %r12487; + mad.lo.s32 %r12489, %r77, %r12482, %r12488; + mad.lo.s32 %r12490, %r78, %r12480, %r12489; + ld.const.v4.u8 {%rs8146, %rs8147, %rs8148, %rs8149}, [matrix+4072]; + cvt.u32.u16 %r12491, %rs8149; + cvt.s32.s8 %r12492, %r12491; + cvt.u32.u16 %r12493, %rs8148; + cvt.s32.s8 %r12494, %r12493; + cvt.u32.u16 %r12495, %rs8147; + cvt.s32.s8 %r12496, %r12495; + cvt.u32.u16 %r12497, %rs8146; + cvt.s32.s8 %r12498, %r12497; + mad.lo.s32 %r12499, %r80, %r12498, %r12490; + mad.lo.s32 %r12500, %r81, %r12496, %r12499; + mad.lo.s32 %r12501, %r83, %r12494, %r12500; + mad.lo.s32 %r12502, %r84, %r12492, %r12501; + ld.const.v4.u8 {%rs8154, %rs8155, %rs8156, %rs8157}, [matrix+4076]; + cvt.u32.u16 %r12503, %rs8157; + cvt.s32.s8 %r12504, %r12503; + cvt.u32.u16 %r12505, %rs8156; + cvt.s32.s8 %r12506, %r12505; + cvt.u32.u16 %r12507, %rs8155; + cvt.s32.s8 %r12508, %r12507; + cvt.u32.u16 %r12509, %rs8154; + cvt.s32.s8 %r12510, %r12509; + mad.lo.s32 %r12511, %r86, %r12510, %r12502; + mad.lo.s32 %r12512, %r87, %r12508, %r12511; + mad.lo.s32 %r12513, %r88, %r12506, %r12512; + mad.lo.s32 %r12514, %r89, %r12504, %r12513; + ld.const.v4.u8 {%rs8162, %rs8163, %rs8164, %rs8165}, [matrix+4080]; + cvt.u32.u16 %r12515, %rs8165; + cvt.s32.s8 %r12516, %r12515; + cvt.u32.u16 %r12517, %rs8164; + cvt.s32.s8 %r12518, %r12517; + cvt.u32.u16 %r12519, %rs8163; + cvt.s32.s8 %r12520, %r12519; + cvt.u32.u16 %r12521, %rs8162; + cvt.s32.s8 %r12522, %r12521; + mad.lo.s32 %r12523, %r271, %r12522, %r12514; + mad.lo.s32 %r12524, %r91, %r12520, %r12523; + mad.lo.s32 %r12525, %r93, %r12518, %r12524; + mad.lo.s32 %r12526, %r94, %r12516, %r12525; + ld.const.v4.u8 {%rs8170, %rs8171, %rs8172, %rs8173}, [matrix+4084]; + cvt.u32.u16 %r12527, %rs8173; + cvt.s32.s8 %r12528, %r12527; + cvt.u32.u16 %r12529, %rs8172; + cvt.s32.s8 %r12530, 
%r12529; + cvt.u32.u16 %r12531, %rs8171; + cvt.s32.s8 %r12532, %r12531; + cvt.u32.u16 %r12533, %rs8170; + cvt.s32.s8 %r12534, %r12533; + mad.lo.s32 %r12535, %r96, %r12534, %r12526; + mad.lo.s32 %r12536, %r97, %r12532, %r12535; + mad.lo.s32 %r12537, %r99, %r12530, %r12536; + mad.lo.s32 %r12538, %r100, %r12528, %r12537; + ld.const.v4.u8 {%rs8178, %rs8179, %rs8180, %rs8181}, [matrix+4088]; + cvt.u32.u16 %r12539, %rs8181; + cvt.s32.s8 %r12540, %r12539; + cvt.u32.u16 %r12541, %rs8180; + cvt.s32.s8 %r12542, %r12541; + cvt.u32.u16 %r12543, %rs8179; + cvt.s32.s8 %r12544, %r12543; + cvt.u32.u16 %r12545, %rs8178; + cvt.s32.s8 %r12546, %r12545; + mad.lo.s32 %r12547, %r103, %r12546, %r12538; + mad.lo.s32 %r12548, %r104, %r12544, %r12547; + mad.lo.s32 %r12549, %r107, %r12542, %r12548; + mad.lo.s32 %r12550, %r108, %r12540, %r12549; + ld.const.v4.u8 {%rs8186, %rs8187, %rs8188, %rs8189}, [matrix+4092]; + cvt.u32.u16 %r12551, %rs8189; + cvt.s32.s8 %r12552, %r12551; + cvt.u32.u16 %r12553, %rs8188; + cvt.s32.s8 %r12554, %r12553; + cvt.u32.u16 %r12555, %rs8187; + cvt.s32.s8 %r12556, %r12555; + cvt.u32.u16 %r12557, %rs8186; + cvt.s32.s8 %r12558, %r12557; + mad.lo.s32 %r12559, %r111, %r12558, %r12550; + mad.lo.s32 %r12560, %r112, %r12556, %r12559; + mad.lo.s32 %r12561, %r114, %r12554, %r12560; + mad.lo.s32 %r12562, %r115, %r12552, %r12561; + shr.u32 %r12563, %r12370, 6; + and.b32 %r12564, %r12563, 240; + shr.u32 %r12565, %r12562, 10; + or.b32 %r12566, %r12565, %r12564; + xor.b32 %r12567, %r113, %r12566; + and.b64 %rd406, %rd394, 255; + and.b64 %rd407, %rd395, 255; + and.b64 %rd408, %rd388, 255; + and.b64 %rd409, %rd389, 255; + and.b64 %rd410, %rd382, 255; + and.b64 %rd411, %rd383, 255; + mul.wide.u32 %rd412, %r7121, 256; + shl.b64 %rd413, %rd406, 16; + shl.b64 %rd414, %rd407, 24; + and.b64 %rd415, %rd398, 255; + and.b64 %rd416, %rd397, 255; + and.b64 %rd417, %rd396, 255; + mul.wide.u32 %rd418, %r4009, 256; + shl.b64 %rd419, %rd408, 16; + shl.b64 %rd420, %rd409, 24; + and.b64 %rd421, %rd392, 255; + and.b64 %rd422, %rd391, 255; + and.b64 %rd423, %rd390, 255; + mul.wide.u32 %rd424, %r897, 256; + shl.b64 %rd425, %rd410, 16; + shl.b64 %rd426, %rd411, 24; + and.b64 %rd427, %rd386, 255; + and.b64 %rd428, %rd385, 255; + and.b64 %rd429, %rd384, 255; + cvt.u64.u32 %rd430, %r12567; + cvt.u64.u32 %rd431, %r9844; + cvt.u64.u32 %rd432, %r9455; + cvt.u64.u32 %rd433, %r6343; + cvt.u64.u32 %rd434, %r3231; + shl.b64 %rd435, %rd434, 56; + shl.b64 %rd436, %rd427, 48; + or.b64 %rd437, %rd435, %rd436; + shl.b64 %rd438, %rd428, 40; + or.b64 %rd439, %rd437, %rd438; + shl.b64 %rd440, %rd429, 32; + or.b64 %rd441, %rd439, %rd440; + or.b64 %rd442, %rd441, %rd426; + or.b64 %rd443, %rd442, %rd425; + and.b64 %rd444, %rd381, 255; + and.b64 %rd445, %rd424, 65280; + or.b64 %rd446, %rd443, %rd445; + or.b64 %rd447, %rd446, %rd444; + shl.b64 %rd448, %rd433, 56; + shl.b64 %rd449, %rd421, 48; + or.b64 %rd450, %rd448, %rd449; + shl.b64 %rd451, %rd422, 40; + or.b64 %rd452, %rd450, %rd451; + shl.b64 %rd453, %rd423, 32; + or.b64 %rd454, %rd452, %rd453; + or.b64 %rd455, %rd454, %rd420; + or.b64 %rd456, %rd455, %rd419; + and.b64 %rd457, %rd387, 255; + and.b64 %rd458, %rd418, 65280; + or.b64 %rd459, %rd456, %rd458; + or.b64 %rd460, %rd459, %rd457; + shl.b64 %rd461, %rd432, 56; + shl.b64 %rd462, %rd415, 48; + or.b64 %rd463, %rd461, %rd462; + shl.b64 %rd464, %rd416, 40; + or.b64 %rd465, %rd463, %rd464; + shl.b64 %rd466, %rd417, 32; + or.b64 %rd467, %rd465, %rd466; + or.b64 %rd468, %rd467, %rd414; + or.b64 %rd469, %rd468, %rd413; + and.b64 %rd470, %rd393, 
255; + and.b64 %rd471, %rd412, 65280; + or.b64 %rd472, %rd469, %rd471; + or.b64 %rd473, %rd472, %rd470; + shl.b64 %rd474, %rd430, 56; + and.b64 %rd475, %rd405, 255; + shl.b64 %rd476, %rd475, 48; + or.b64 %rd477, %rd474, %rd476; + and.b64 %rd478, %rd404, 255; + shl.b64 %rd479, %rd478, 40; + or.b64 %rd480, %rd477, %rd479; + shl.b64 %rd481, %rd403, 32; + or.b64 %rd482, %rd480, %rd481; + and.b64 %rd483, %rd401, 255; + shl.b64 %rd484, %rd483, 24; + or.b64 %rd485, %rd482, %rd484; + and.b64 %rd486, %rd400, 255; + shl.b64 %rd487, %rd486, 16; + and.b64 %rd488, %rd399, 255; + shl.b64 %rd489, %rd488, 8; + or.b64 %rd490, %rd485, %rd487; + and.b64 %rd491, %rd431, 255; + or.b64 %rd492, %rd490, %rd489; + or.b64 %rd493, %rd492, %rd491; + xor.b64 %rd667, %rd493, 1272090201925444760; + xor.b64 %rd672, %rd473, 8796936657246353646; + xor.b64 %rd677, %rd460, 8746723911537738262; + xor.b64 %rd682, %rd447, 4239941492252378377; + mov.u64 %rd681, 8270816933120786537; + mov.u64 %rd680, -850687345431043546; + mov.u64 %rd679, 8596393687355028144; + mov.u64 %rd678, -4073852189716399785; + mov.u64 %rd676, -4539347866060507718; + mov.u64 %rd675, -3233781605604422593; + mov.u64 %rd674, 570094237299545110; + mov.u64 %rd673, 5171152063242093102; + mov.u64 %rd671, 6782861118970774626; + mov.u64 %rd670, 7812475424661425213; + mov.u64 %rd669, 9119540418498120711; + mov.u64 %rd668, -7873636174015165430; + mov.u64 %rd666, -9207053471590684088; + mov.u64 %rd665, 3370482334374859748; + mov.u64 %rd664, -1544774801229058759; + mov.u64 %rd663, 6096431547456407061; + mov.u64 %rd662, -1792185402154627366; + mov.u64 %rd661, -6864424130110145268; + mov.u64 %rd660, 5690099369266491460; + mov.u64 %rd659, -5074726839974049192; + mov.u64 %rd658, 1592359455985097269; + mov.u64 %rd657, RC; + mov.u32 %r12569, -24; + +BB0_9: + xor.b64 %rd494, %rd681, %rd682; + xor.b64 %rd495, %rd494, %rd680; + xor.b64 %rd496, %rd495, %rd679; + xor.b64 %rd497, %rd496, %rd678; + xor.b64 %rd498, %rd676, %rd677; + xor.b64 %rd499, %rd498, %rd675; + xor.b64 %rd500, %rd499, %rd674; + xor.b64 %rd501, %rd500, %rd673; + xor.b64 %rd502, %rd671, %rd672; + xor.b64 %rd503, %rd502, %rd670; + xor.b64 %rd504, %rd503, %rd669; + xor.b64 %rd505, %rd504, %rd668; + xor.b64 %rd506, %rd666, %rd667; + xor.b64 %rd507, %rd506, %rd665; + xor.b64 %rd508, %rd507, %rd664; + xor.b64 %rd509, %rd508, %rd663; + xor.b64 %rd510, %rd661, %rd662; + xor.b64 %rd511, %rd510, %rd660; + xor.b64 %rd512, %rd511, %rd659; + xor.b64 %rd513, %rd512, %rd658; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd501, 1; + shr.b64 %rhs, %rd501, 63; + add.u64 %rd514, %lhs, %rhs; + } + xor.b64 %rd515, %rd513, %rd514; + xor.b64 %rd516, %rd682, %rd515; + xor.b64 %rd517, %rd681, %rd515; + xor.b64 %rd518, %rd680, %rd515; + xor.b64 %rd519, %rd679, %rd515; + xor.b64 %rd520, %rd678, %rd515; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd505, 1; + shr.b64 %rhs, %rd505, 63; + add.u64 %rd521, %lhs, %rhs; + } + xor.b64 %rd522, %rd521, %rd497; + xor.b64 %rd523, %rd677, %rd522; + xor.b64 %rd524, %rd676, %rd522; + xor.b64 %rd525, %rd675, %rd522; + xor.b64 %rd526, %rd674, %rd522; + xor.b64 %rd527, %rd673, %rd522; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd509, 1; + shr.b64 %rhs, %rd509, 63; + add.u64 %rd528, %lhs, %rhs; + } + xor.b64 %rd529, %rd528, %rd501; + xor.b64 %rd530, %rd672, %rd529; + xor.b64 %rd531, %rd671, %rd529; + xor.b64 %rd532, %rd670, %rd529; + xor.b64 %rd533, %rd669, %rd529; + xor.b64 %rd534, %rd668, %rd529; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd513, 1; + 
shr.b64 %rhs, %rd513, 63; + add.u64 %rd535, %lhs, %rhs; + } + xor.b64 %rd536, %rd535, %rd505; + xor.b64 %rd537, %rd667, %rd536; + xor.b64 %rd538, %rd666, %rd536; + xor.b64 %rd539, %rd665, %rd536; + xor.b64 %rd540, %rd664, %rd536; + xor.b64 %rd541, %rd663, %rd536; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd497, 1; + shr.b64 %rhs, %rd497, 63; + add.u64 %rd542, %lhs, %rhs; + } + xor.b64 %rd543, %rd542, %rd509; + xor.b64 %rd544, %rd662, %rd543; + xor.b64 %rd545, %rd661, %rd543; + xor.b64 %rd546, %rd660, %rd543; + xor.b64 %rd547, %rd659, %rd543; + xor.b64 %rd548, %rd658, %rd543; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd523, 1; + shr.b64 %rhs, %rd523, 63; + add.u64 %rd549, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd518, 3; + shr.b64 %rhs, %rd518, 61; + add.u64 %rd550, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd531, 6; + shr.b64 %rhs, %rd531, 58; + add.u64 %rd551, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd525, 10; + shr.b64 %rhs, %rd525, 54; + add.u64 %rd552, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd533, 15; + shr.b64 %rhs, %rd533, 49; + add.u64 %rd553, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd540, 21; + shr.b64 %rhs, %rd540, 43; + add.u64 %rd554, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd537, 28; + shr.b64 %rhs, %rd537, 36; + add.u64 %rd555, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd517, 36; + shr.b64 %rhs, %rd517, 28; + add.u64 %rd556, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd526, 45; + shr.b64 %rhs, %rd526, 19; + add.u64 %rd557, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd538, 55; + shr.b64 %rhs, %rd538, 9; + add.u64 %rd558, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd527, 2; + shr.b64 %rhs, %rd527, 62; + add.u64 %rd559, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd548, 14; + shr.b64 %rhs, %rd548, 50; + add.u64 %rd560, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd544, 27; + shr.b64 %rhs, %rd544, 37; + add.u64 %rd561, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd519, 41; + shr.b64 %rhs, %rd519, 23; + add.u64 %rd562, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd541, 56; + shr.b64 %rhs, %rd541, 8; + add.u64 %rd563, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd547, 8; + shr.b64 %rhs, %rd547, 56; + add.u64 %rd564, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd539, 25; + shr.b64 %rhs, %rd539, 39; + add.u64 %rd565, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd532, 43; + shr.b64 %rhs, %rd532, 21; + add.u64 %rd566, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd530, 62; + shr.b64 %rhs, %rd530, 2; + add.u64 %rd567, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd520, 18; + shr.b64 %rhs, %rd520, 46; + add.u64 %rd568, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd546, 39; + shr.b64 %rhs, %rd546, 25; + add.u64 %rd569, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd534, 61; + shr.b64 %rhs, %rd534, 3; + add.u64 %rd570, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd545, 20; + shr.b64 %rhs, %rd545, 44; + add.u64 
%rd571, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd524, 44; + shr.b64 %rhs, %rd524, 20; + add.u64 %rd572, %lhs, %rhs; + } + not.b64 %rd573, %rd572; + and.b64 %rd574, %rd566, %rd573; + xor.b64 %rd575, %rd574, %rd516; + not.b64 %rd576, %rd566; + and.b64 %rd577, %rd554, %rd576; + xor.b64 %rd677, %rd577, %rd572; + not.b64 %rd578, %rd554; + and.b64 %rd579, %rd560, %rd578; + xor.b64 %rd672, %rd579, %rd566; + not.b64 %rd580, %rd560; + and.b64 %rd581, %rd516, %rd580; + xor.b64 %rd667, %rd581, %rd554; + not.b64 %rd582, %rd516; + and.b64 %rd583, %rd572, %rd582; + xor.b64 %rd662, %rd560, %rd583; + not.b64 %rd584, %rd571; + and.b64 %rd585, %rd550, %rd584; + xor.b64 %rd681, %rd585, %rd555; + not.b64 %rd586, %rd550; + and.b64 %rd587, %rd557, %rd586; + xor.b64 %rd676, %rd587, %rd571; + not.b64 %rd588, %rd557; + and.b64 %rd589, %rd570, %rd588; + xor.b64 %rd671, %rd589, %rd550; + not.b64 %rd590, %rd570; + and.b64 %rd591, %rd555, %rd590; + xor.b64 %rd666, %rd591, %rd557; + not.b64 %rd592, %rd555; + and.b64 %rd593, %rd571, %rd592; + xor.b64 %rd661, %rd570, %rd593; + not.b64 %rd594, %rd551; + and.b64 %rd595, %rd565, %rd594; + xor.b64 %rd680, %rd595, %rd549; + not.b64 %rd596, %rd565; + and.b64 %rd597, %rd564, %rd596; + xor.b64 %rd675, %rd597, %rd551; + not.b64 %rd598, %rd564; + and.b64 %rd599, %rd568, %rd598; + xor.b64 %rd670, %rd599, %rd565; + not.b64 %rd600, %rd568; + and.b64 %rd601, %rd549, %rd600; + xor.b64 %rd665, %rd601, %rd564; + not.b64 %rd602, %rd549; + and.b64 %rd603, %rd551, %rd602; + xor.b64 %rd660, %rd568, %rd603; + not.b64 %rd604, %rd556; + and.b64 %rd605, %rd552, %rd604; + xor.b64 %rd679, %rd605, %rd561; + not.b64 %rd606, %rd552; + and.b64 %rd607, %rd553, %rd606; + xor.b64 %rd674, %rd607, %rd556; + not.b64 %rd608, %rd553; + and.b64 %rd609, %rd563, %rd608; + xor.b64 %rd669, %rd609, %rd552; + not.b64 %rd610, %rd563; + and.b64 %rd611, %rd561, %rd610; + xor.b64 %rd664, %rd611, %rd553; + not.b64 %rd612, %rd561; + and.b64 %rd613, %rd556, %rd612; + xor.b64 %rd659, %rd563, %rd613; + not.b64 %rd614, %rd558; + and.b64 %rd615, %rd569, %rd614; + xor.b64 %rd678, %rd615, %rd567; + not.b64 %rd616, %rd569; + and.b64 %rd617, %rd562, %rd616; + xor.b64 %rd673, %rd617, %rd558; + not.b64 %rd618, %rd562; + and.b64 %rd619, %rd559, %rd618; + xor.b64 %rd668, %rd619, %rd569; + not.b64 %rd620, %rd559; + and.b64 %rd621, %rd567, %rd620; + xor.b64 %rd663, %rd621, %rd562; + not.b64 %rd622, %rd567; + and.b64 %rd623, %rd558, %rd622; + xor.b64 %rd658, %rd559, %rd623; + ld.global.u64 %rd624, [%rd657]; + xor.b64 %rd682, %rd575, %rd624; + add.s64 %rd657, %rd657, 8; + add.s32 %r12569, %r12569, 1; + setp.ne.s32 %p10, %r12569, 0; + @%p10 bra BB0_9; + + ld.const.u64 %rd125, [target+24]; + setp.eq.s64 %p11, %rd667, %rd125; + @%p11 bra BB0_12; + bra.uni BB0_11; + +BB0_12: + ld.const.u64 %rd126, [target+16]; + setp.eq.s64 %p12, %rd672, %rd126; + @%p12 bra BB0_14; + bra.uni BB0_13; + +BB0_14: + ld.const.u64 %rd127, [target+8]; + setp.eq.s64 %p13, %rd677, %rd127; + @%p13 bra BB0_16; + bra.uni BB0_15; + +BB0_16: + ld.const.u64 %rd625, [target]; + setp.lt.u64 %p4, %rd682, %rd625; + @!%p4 bra BB0_18; + bra.uni BB0_17; + +BB0_11: + setp.lt.u64 %p1, %rd667, %rd125; + @!%p1 bra BB0_18; + bra.uni BB0_17; + +BB0_13: + setp.lt.u64 %p2, %rd672, %rd126; + @!%p2 bra BB0_18; + bra.uni BB0_17; + +BB0_15: + setp.lt.u64 %p3, %rd677, %rd127; + @!%p3 bra BB0_18; + bra.uni BB0_17; + +BB0_17: + ld.param.u64 %rd629, [heavy_hash_param_5]; + cvta.to.global.u64 %rd628, %rd629; + mov.u64 %rd626, 0; + atom.global.cas.b64 %rd627, 
[%rd628], %rd626, %rd6; + +BB0_18: + ret; +} + + diff --git a/plugins/cuda/resources/kaspa-cuda-sm30.ptx b/plugins/cuda/resources/karlsen-cuda-sm30.ptx similarity index 97% rename from plugins/cuda/resources/kaspa-cuda-sm30.ptx rename to plugins/cuda/resources/karlsen-cuda-sm30.ptx index e31f409..9334697 100644 --- a/plugins/cuda/resources/kaspa-cuda-sm30.ptx +++ b/plugins/cuda/resources/karlsen-cuda-sm30.ptx @@ -1,14630 +1,14630 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-24330188 -// Cuda compilation tools, release 9.2, V9.2.148 -// Based on LLVM 3.4svn -// - -.version 6.2 -.target sm_30 -.address_size 64 - - // .globl heavy_hash -.global .align 16 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; -.global .align 16 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; -.global .align 16 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; -.global .align 16 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; -.global .align 16 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 1 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; -.const .align 8 .b8 target[32]; -.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; -.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 
140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; - -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 -) -{ - .reg .pred %p<18>; - .reg .b16 %rs<8194>; - .reg .b32 %r<12573>; - .reg .b64 %rd<687>; - - - ld.param.u64 %rd128, [heavy_hash_param_0]; - ld.param.u64 %rd129, [heavy_hash_param_1]; - ld.param.u64 %rd131, [heavy_hash_param_2]; - ld.param.u64 %rd130, [heavy_hash_param_4]; - ld.param.u64 %rd132, [heavy_hash_param_5]; - ld.param.u8 %rs1, [heavy_hash_param_3]; - cvta.to.global.u64 %rd1, %rd132; - mov.u32 %r5, %ntid.x; - mov.u32 %r6, %ctaid.x; - mov.u32 %r7, %tid.x; - mad.lo.s32 %r8, %r5, %r6, %r7; - cvt.s64.s32 %rd2, %r8; - setp.ge.u64 %p6, %rd2, %rd131; - @%p6 bra BB0_18; - - cvt.u32.u64 %r9, %rd2; - setp.ne.s32 %p7, %r9, 0; - @%p7 bra BB0_3; - - mov.u64 %rd133, 0; - st.global.u64 [%rd1], %rd133; - -BB0_3: - setp.eq.s16 %p8, %rs1, 0; - @%p8 bra BB0_5; - - cvta.to.global.u64 %rd134, %rd130; - shl.b64 %rd135, %rd2, 5; - add.s64 %rd136, %rd134, %rd135; - ld.global.v2.u64 {%rd137, %rd138}, [%rd136]; - mul.lo.s64 %rd141, %rd138, 5; - mul.lo.s64 %rd142, %rd138, 640; - shr.u64 %rd143, %rd141, 57; - or.b64 %rd144, %rd143, %rd142; - mul.lo.s64 %rd634, %rd144, 9; - shl.b64 %rd145, %rd138, 17; - ld.global.v2.u64 {%rd146, %rd147}, [%rd136+16]; - xor.b64 %rd149, %rd146, %rd137; - xor.b64 %rd151, %rd147, %rd138; - xor.b64 %rd152, %rd138, %rd149; - xor.b64 %rd153, %rd137, %rd151; - st.global.v2.u64 [%rd136], {%rd153, %rd152}; - xor.b64 %rd154, %rd149, %rd145; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd151, 45; - shr.b64 %rhs, %rd151, 19; - add.u64 %rd155, %lhs, %rhs; - } - st.global.v2.u64 [%rd136+16], {%rd154, %rd155}; - bra.uni BB0_6; - -BB0_5: - cvta.to.global.u64 %rd156, %rd130; - ld.global.u64 %rd157, [%rd156]; - xor.b64 %rd634, %rd157, %rd2; - -BB0_6: - and.b64 %rd174, %rd634, %rd128; - or.b64 %rd6, %rd174, %rd129; - ld.const.u64 %rd175, [hash_header]; - xor.b64 %rd660, %rd175, 1242148031264380989; - ld.const.u64 %rd176, [hash_header+8]; - xor.b64 %rd655, %rd176, 3008272977830772284; - ld.const.u64 %rd177, [hash_header+16]; - xor.b64 %rd650, %rd177, 2188519011337848018; - ld.const.u64 %rd178, [hash_header+24]; - xor.b64 %rd645, %rd178, 1992179434288343456; - ld.const.u64 %rd179, [hash_header+32]; - xor.b64 %rd640, %rd179, 8876506674959887717; - ld.const.u64 %rd180, [hash_header+40]; - xor.b64 %rd659, %rd180, 5399642050693751366; - ld.const.u64 %rd181, [hash_header+48]; - xor.b64 %rd654, %rd181, 1745875063082670864; - ld.const.u64 %rd182, [hash_header+56]; - xor.b64 %rd649, %rd182, 8605242046444978844; - ld.const.u64 %rd183, [hash_header+64]; - xor.b64 %rd644, %rd183, -510048929142394560; - xor.b64 %rd639, %rd6, 3343109343542796272; - mov.u64 %rd658, 1123092876221303306; - mov.u64 %rd657, 3784524041015224902; - mov.u64 %rd656, -8517909413761200310; - mov.u64 %rd653, 4963925045340115282; - mov.u64 %rd652, 1082795874807940378; - mov.u64 %rd651, 
5237849264682708699; - mov.u64 %rd648, -1409360996057663723; - mov.u64 %rd647, -4494027153138273982; - mov.u64 %rd646, -5621391061570334094; - mov.u64 %rd643, -1817099578685924727; - mov.u64 %rd642, -5035616039755945756; - mov.u64 %rd641, 6706187291358897596; - mov.u64 %rd638, -5613068297060437469; - mov.u64 %rd637, -3386048033060200563; - mov.u64 %rd636, 196324915476054915; - mov.u64 %rd635, RC; - mov.u32 %r12571, -24; - -BB0_7: - xor.b64 %rd184, %rd659, %rd660; - xor.b64 %rd185, %rd184, %rd658; - xor.b64 %rd186, %rd185, %rd657; - xor.b64 %rd187, %rd186, %rd656; - xor.b64 %rd188, %rd654, %rd655; - xor.b64 %rd189, %rd188, %rd653; - xor.b64 %rd190, %rd189, %rd652; - xor.b64 %rd191, %rd190, %rd651; - xor.b64 %rd192, %rd649, %rd650; - xor.b64 %rd193, %rd192, %rd648; - xor.b64 %rd194, %rd193, %rd647; - xor.b64 %rd195, %rd194, %rd646; - xor.b64 %rd196, %rd644, %rd645; - xor.b64 %rd197, %rd196, %rd643; - xor.b64 %rd198, %rd197, %rd642; - xor.b64 %rd199, %rd198, %rd641; - xor.b64 %rd200, %rd639, %rd640; - xor.b64 %rd201, %rd200, %rd638; - xor.b64 %rd202, %rd201, %rd637; - xor.b64 %rd203, %rd202, %rd636; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd191, 1; - shr.b64 %rhs, %rd191, 63; - add.u64 %rd204, %lhs, %rhs; - } - xor.b64 %rd205, %rd203, %rd204; - xor.b64 %rd206, %rd660, %rd205; - xor.b64 %rd207, %rd659, %rd205; - xor.b64 %rd208, %rd658, %rd205; - xor.b64 %rd209, %rd657, %rd205; - xor.b64 %rd210, %rd656, %rd205; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd195, 1; - shr.b64 %rhs, %rd195, 63; - add.u64 %rd211, %lhs, %rhs; - } - xor.b64 %rd212, %rd211, %rd187; - xor.b64 %rd213, %rd655, %rd212; - xor.b64 %rd214, %rd654, %rd212; - xor.b64 %rd215, %rd653, %rd212; - xor.b64 %rd216, %rd652, %rd212; - xor.b64 %rd217, %rd651, %rd212; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd199, 1; - shr.b64 %rhs, %rd199, 63; - add.u64 %rd218, %lhs, %rhs; - } - xor.b64 %rd219, %rd218, %rd191; - xor.b64 %rd220, %rd650, %rd219; - xor.b64 %rd221, %rd649, %rd219; - xor.b64 %rd222, %rd648, %rd219; - xor.b64 %rd223, %rd647, %rd219; - xor.b64 %rd224, %rd646, %rd219; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd203, 1; - shr.b64 %rhs, %rd203, 63; - add.u64 %rd225, %lhs, %rhs; - } - xor.b64 %rd226, %rd225, %rd195; - xor.b64 %rd227, %rd645, %rd226; - xor.b64 %rd228, %rd644, %rd226; - xor.b64 %rd229, %rd643, %rd226; - xor.b64 %rd230, %rd642, %rd226; - xor.b64 %rd231, %rd641, %rd226; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd187, 1; - shr.b64 %rhs, %rd187, 63; - add.u64 %rd232, %lhs, %rhs; - } - xor.b64 %rd233, %rd232, %rd199; - xor.b64 %rd234, %rd640, %rd233; - xor.b64 %rd235, %rd639, %rd233; - xor.b64 %rd236, %rd638, %rd233; - xor.b64 %rd237, %rd637, %rd233; - xor.b64 %rd238, %rd636, %rd233; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd213, 1; - shr.b64 %rhs, %rd213, 63; - add.u64 %rd239, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd208, 3; - shr.b64 %rhs, %rd208, 61; - add.u64 %rd240, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd221, 6; - shr.b64 %rhs, %rd221, 58; - add.u64 %rd241, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd215, 10; - shr.b64 %rhs, %rd215, 54; - add.u64 %rd242, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd223, 15; - shr.b64 %rhs, %rd223, 49; - add.u64 %rd243, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd230, 21; - shr.b64 %rhs, %rd230, 43; - add.u64 %rd244, %lhs, %rhs; - 
} - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd227, 28; - shr.b64 %rhs, %rd227, 36; - add.u64 %rd245, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd207, 36; - shr.b64 %rhs, %rd207, 28; - add.u64 %rd246, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd216, 45; - shr.b64 %rhs, %rd216, 19; - add.u64 %rd247, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd228, 55; - shr.b64 %rhs, %rd228, 9; - add.u64 %rd248, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd217, 2; - shr.b64 %rhs, %rd217, 62; - add.u64 %rd249, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd238, 14; - shr.b64 %rhs, %rd238, 50; - add.u64 %rd250, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd234, 27; - shr.b64 %rhs, %rd234, 37; - add.u64 %rd251, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd209, 41; - shr.b64 %rhs, %rd209, 23; - add.u64 %rd252, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd231, 56; - shr.b64 %rhs, %rd231, 8; - add.u64 %rd253, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd237, 8; - shr.b64 %rhs, %rd237, 56; - add.u64 %rd254, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd229, 25; - shr.b64 %rhs, %rd229, 39; - add.u64 %rd255, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd222, 43; - shr.b64 %rhs, %rd222, 21; - add.u64 %rd256, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd220, 62; - shr.b64 %rhs, %rd220, 2; - add.u64 %rd257, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd210, 18; - shr.b64 %rhs, %rd210, 46; - add.u64 %rd258, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd236, 39; - shr.b64 %rhs, %rd236, 25; - add.u64 %rd259, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd224, 61; - shr.b64 %rhs, %rd224, 3; - add.u64 %rd260, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd235, 20; - shr.b64 %rhs, %rd235, 44; - add.u64 %rd261, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd214, 44; - shr.b64 %rhs, %rd214, 20; - add.u64 %rd262, %lhs, %rhs; - } - not.b64 %rd263, %rd262; - and.b64 %rd264, %rd256, %rd263; - xor.b64 %rd265, %rd264, %rd206; - not.b64 %rd266, %rd256; - and.b64 %rd267, %rd244, %rd266; - xor.b64 %rd655, %rd267, %rd262; - not.b64 %rd268, %rd244; - and.b64 %rd269, %rd250, %rd268; - xor.b64 %rd650, %rd269, %rd256; - not.b64 %rd270, %rd250; - and.b64 %rd271, %rd206, %rd270; - xor.b64 %rd645, %rd271, %rd244; - not.b64 %rd272, %rd206; - and.b64 %rd273, %rd262, %rd272; - xor.b64 %rd640, %rd250, %rd273; - not.b64 %rd274, %rd261; - and.b64 %rd275, %rd240, %rd274; - xor.b64 %rd659, %rd275, %rd245; - not.b64 %rd276, %rd240; - and.b64 %rd277, %rd247, %rd276; - xor.b64 %rd654, %rd277, %rd261; - not.b64 %rd278, %rd247; - and.b64 %rd279, %rd260, %rd278; - xor.b64 %rd649, %rd279, %rd240; - not.b64 %rd280, %rd260; - and.b64 %rd281, %rd245, %rd280; - xor.b64 %rd644, %rd281, %rd247; - not.b64 %rd282, %rd245; - and.b64 %rd283, %rd261, %rd282; - xor.b64 %rd639, %rd260, %rd283; - not.b64 %rd284, %rd241; - and.b64 %rd285, %rd255, %rd284; - xor.b64 %rd658, %rd285, %rd239; - not.b64 %rd286, %rd255; - and.b64 %rd287, %rd254, %rd286; - xor.b64 %rd653, %rd287, %rd241; - not.b64 %rd288, %rd254; - and.b64 %rd289, %rd258, %rd288; - xor.b64 %rd648, %rd289, 
%rd255; - not.b64 %rd290, %rd258; - and.b64 %rd291, %rd239, %rd290; - xor.b64 %rd643, %rd291, %rd254; - not.b64 %rd292, %rd239; - and.b64 %rd293, %rd241, %rd292; - xor.b64 %rd638, %rd258, %rd293; - not.b64 %rd294, %rd246; - and.b64 %rd295, %rd242, %rd294; - xor.b64 %rd657, %rd295, %rd251; - not.b64 %rd296, %rd242; - and.b64 %rd297, %rd243, %rd296; - xor.b64 %rd652, %rd297, %rd246; - not.b64 %rd298, %rd243; - and.b64 %rd299, %rd253, %rd298; - xor.b64 %rd647, %rd299, %rd242; - not.b64 %rd300, %rd253; - and.b64 %rd301, %rd251, %rd300; - xor.b64 %rd642, %rd301, %rd243; - not.b64 %rd302, %rd251; - and.b64 %rd303, %rd246, %rd302; - xor.b64 %rd637, %rd253, %rd303; - not.b64 %rd304, %rd248; - and.b64 %rd305, %rd259, %rd304; - xor.b64 %rd656, %rd305, %rd257; - not.b64 %rd306, %rd259; - and.b64 %rd307, %rd252, %rd306; - xor.b64 %rd651, %rd307, %rd248; - not.b64 %rd308, %rd252; - and.b64 %rd309, %rd249, %rd308; - xor.b64 %rd646, %rd309, %rd259; - not.b64 %rd310, %rd249; - and.b64 %rd311, %rd257, %rd310; - xor.b64 %rd641, %rd311, %rd252; - not.b64 %rd312, %rd257; - and.b64 %rd313, %rd248, %rd312; - xor.b64 %rd636, %rd249, %rd313; - ld.global.u64 %rd314, [%rd635]; - xor.b64 %rd660, %rd265, %rd314; - add.s64 %rd635, %rd635, 8; - add.s32 %r12571, %r12571, 1; - setp.ne.s32 %p9, %r12571, 0; - @%p9 bra BB0_7; - - cvt.u32.u64 %r12, %rd660; - shr.u64 %rd337, %rd660, 8; - cvt.u32.u64 %r13, %rd337; - shr.u64 %rd338, %rd660, 16; - cvt.u32.u64 %r14, %rd338; - shr.u64 %rd339, %rd660, 24; - cvt.u32.u64 %r15, %rd339; - shr.u64 %rd340, %rd660, 32; - cvt.u32.u64 %r16, %rd340; - shr.u64 %rd341, %rd660, 40; - cvt.u32.u64 %r17, %rd341; - shr.u64 %rd342, %rd660, 48; - cvt.u32.u64 %r18, %rd342; - shr.u64 %rd343, %rd660, 56; - cvt.u32.u64 %r19, %rd343; - shr.u64 %rd344, %rd655, 8; - cvt.u32.u64 %r20, %rd344; - shr.u64 %rd345, %rd655, 16; - cvt.u32.u64 %r21, %rd345; - shr.u64 %rd346, %rd655, 24; - cvt.u32.u64 %r22, %rd346; - shr.u64 %rd347, %rd655, 32; - cvt.u32.u64 %r23, %rd347; - shr.u64 %rd348, %rd655, 40; - cvt.u32.u64 %r24, %rd348; - shr.u64 %rd349, %rd655, 48; - cvt.u32.u64 %r25, %rd349; - shr.u64 %rd350, %rd655, 56; - cvt.u32.u64 %r26, %rd350; - shr.u64 %rd351, %rd650, 8; - cvt.u32.u64 %r27, %rd351; - shr.u64 %rd352, %rd650, 16; - cvt.u32.u64 %r28, %rd352; - shr.u64 %rd353, %rd650, 24; - cvt.u32.u64 %r29, %rd353; - shr.u64 %rd354, %rd650, 32; - cvt.u32.u64 %r30, %rd354; - shr.u64 %rd355, %rd650, 40; - cvt.u32.u64 %r31, %rd355; - shr.u64 %rd356, %rd650, 48; - cvt.u32.u64 %r32, %rd356; - shr.u64 %rd357, %rd650, 56; - cvt.u32.u64 %r33, %rd357; - and.b32 %r34, %r12, 15; - bfe.u32 %r35, %r12, 12, 4; - and.b32 %r36, %r13, 15; - bfe.u32 %r37, %r12, 20, 4; - and.b32 %r38, %r14, 15; - shr.u32 %r39, %r12, 28; - and.b32 %r40, %r15, 15; - shr.u64 %rd358, %rd660, 36; - cvt.u32.u64 %r41, %rd358; - and.b32 %r42, %r41, 15; - and.b32 %r43, %r16, 15; - shr.u64 %rd359, %rd660, 44; - cvt.u32.u64 %r44, %rd359; - and.b32 %r45, %r44, 15; - and.b32 %r46, %r17, 15; - shr.u64 %rd360, %rd660, 52; - cvt.u32.u64 %r47, %rd360; - and.b32 %r48, %r47, 15; - and.b32 %r49, %r18, 15; - shr.u64 %rd361, %rd660, 60; - cvt.u32.u64 %r50, %rd361; - and.b32 %r51, %r19, 15; - cvt.u32.u64 %r52, %rd655; - and.b32 %r53, %r52, 15; - bfe.u32 %r54, %r52, 12, 4; - and.b32 %r55, %r20, 15; - bfe.u32 %r56, %r52, 20, 4; - and.b32 %r57, %r21, 15; - shr.u32 %r58, %r52, 28; - and.b32 %r59, %r22, 15; - shr.u64 %rd362, %rd655, 36; - cvt.u32.u64 %r60, %rd362; - and.b32 %r61, %r60, 15; - and.b32 %r62, %r23, 15; - shr.u64 %rd363, %rd655, 44; - cvt.u32.u64 %r63, %rd363; - and.b32 
[… several thousand further deleted lines of the prebuilt CUDA PTX kernel elided. The span is a fully unrolled matrix-vector multiply: each step loads four signed 8-bit matrix entries from constant memory (ld.const.v4.u8 at [matrix+0], [matrix+4], … [matrix+620], …), sign-extends them (cvt.u32.u16 / cvt.s32.s8), and multiply-accumulates them (mul.lo.s32 / mad.lo.s32) against the 4-bit nibbles previously unpacked into %r34–%r115; after each pair of 64-entry rows, the two accumulators are reduced and packed (shr.u32 / and.b32 / or.b32) into one byte that is xor.b32-ed with the input words %r12–%r15 …]
%r2000; - cvt.u32.u16 %r2002, %rs1244; - cvt.s32.s8 %r2003, %r2002; - cvt.u32.u16 %r2004, %rs1243; - cvt.s32.s8 %r2005, %r2004; - cvt.u32.u16 %r2006, %rs1242; - cvt.s32.s8 %r2007, %r2006; - mad.lo.s32 %r2008, %r86, %r2007, %r1999; - mad.lo.s32 %r2009, %r87, %r2005, %r2008; - mad.lo.s32 %r2010, %r88, %r2003, %r2009; - mad.lo.s32 %r2011, %r89, %r2001, %r2010; - ld.const.v4.u8 {%rs1250, %rs1251, %rs1252, %rs1253}, [matrix+624]; - cvt.u32.u16 %r2012, %rs1253; - cvt.s32.s8 %r2013, %r2012; - cvt.u32.u16 %r2014, %rs1252; - cvt.s32.s8 %r2015, %r2014; - cvt.u32.u16 %r2016, %rs1251; - cvt.s32.s8 %r2017, %r2016; - cvt.u32.u16 %r2018, %rs1250; - cvt.s32.s8 %r2019, %r2018; - mad.lo.s32 %r2020, %r271, %r2019, %r2011; - mad.lo.s32 %r2021, %r91, %r2017, %r2020; - mad.lo.s32 %r2022, %r93, %r2015, %r2021; - mad.lo.s32 %r2023, %r94, %r2013, %r2022; - ld.const.v4.u8 {%rs1258, %rs1259, %rs1260, %rs1261}, [matrix+628]; - cvt.u32.u16 %r2024, %rs1261; - cvt.s32.s8 %r2025, %r2024; - cvt.u32.u16 %r2026, %rs1260; - cvt.s32.s8 %r2027, %r2026; - cvt.u32.u16 %r2028, %rs1259; - cvt.s32.s8 %r2029, %r2028; - cvt.u32.u16 %r2030, %rs1258; - cvt.s32.s8 %r2031, %r2030; - mad.lo.s32 %r2032, %r96, %r2031, %r2023; - mad.lo.s32 %r2033, %r97, %r2029, %r2032; - mad.lo.s32 %r2034, %r99, %r2027, %r2033; - mad.lo.s32 %r2035, %r100, %r2025, %r2034; - ld.const.v4.u8 {%rs1266, %rs1267, %rs1268, %rs1269}, [matrix+632]; - cvt.u32.u16 %r2036, %rs1269; - cvt.s32.s8 %r2037, %r2036; - cvt.u32.u16 %r2038, %rs1268; - cvt.s32.s8 %r2039, %r2038; - cvt.u32.u16 %r2040, %rs1267; - cvt.s32.s8 %r2041, %r2040; - cvt.u32.u16 %r2042, %rs1266; - cvt.s32.s8 %r2043, %r2042; - mad.lo.s32 %r2044, %r103, %r2043, %r2035; - mad.lo.s32 %r2045, %r104, %r2041, %r2044; - mad.lo.s32 %r2046, %r107, %r2039, %r2045; - mad.lo.s32 %r2047, %r108, %r2037, %r2046; - ld.const.v4.u8 {%rs1274, %rs1275, %rs1276, %rs1277}, [matrix+636]; - cvt.u32.u16 %r2048, %rs1277; - cvt.s32.s8 %r2049, %r2048; - cvt.u32.u16 %r2050, %rs1276; - cvt.s32.s8 %r2051, %r2050; - cvt.u32.u16 %r2052, %rs1275; - cvt.s32.s8 %r2053, %r2052; - cvt.u32.u16 %r2054, %rs1274; - cvt.s32.s8 %r2055, %r2054; - mad.lo.s32 %r2056, %r111, %r2055, %r2047; - mad.lo.s32 %r2057, %r112, %r2053, %r2056; - mad.lo.s32 %r2058, %r114, %r2051, %r2057; - mad.lo.s32 %r2059, %r115, %r2049, %r2058; - shr.u32 %r2060, %r1867, 6; - and.b32 %r2061, %r2060, 240; - shr.u32 %r2062, %r2059, 10; - or.b32 %r2063, %r2062, %r2061; - xor.b32 %r2064, %r16, %r2063; - cvt.u64.u32 %rd384, %r2064; - ld.const.v4.u8 {%rs1282, %rs1283, %rs1284, %rs1285}, [matrix+640]; - cvt.u32.u16 %r2065, %rs1285; - cvt.s32.s8 %r2066, %r2065; - cvt.u32.u16 %r2067, %rs1284; - cvt.s32.s8 %r2068, %r2067; - cvt.u32.u16 %r2069, %rs1282; - cvt.s32.s8 %r2070, %r2069; - cvt.u32.u16 %r2071, %rs1283; - cvt.s32.s8 %r2072, %r2071; - mul.lo.s32 %r2073, %r34, %r2072; - mad.lo.s32 %r2074, %r124, %r2070, %r2073; - mad.lo.s32 %r2075, %r35, %r2068, %r2074; - mad.lo.s32 %r2076, %r36, %r2066, %r2075; - ld.const.v4.u8 {%rs1290, %rs1291, %rs1292, %rs1293}, [matrix+644]; - cvt.u32.u16 %r2077, %rs1293; - cvt.s32.s8 %r2078, %r2077; - cvt.u32.u16 %r2079, %rs1292; - cvt.s32.s8 %r2080, %r2079; - cvt.u32.u16 %r2081, %rs1291; - cvt.s32.s8 %r2082, %r2081; - cvt.u32.u16 %r2083, %rs1290; - cvt.s32.s8 %r2084, %r2083; - mad.lo.s32 %r2085, %r37, %r2084, %r2076; - mad.lo.s32 %r2086, %r38, %r2082, %r2085; - mad.lo.s32 %r2087, %r39, %r2080, %r2086; - mad.lo.s32 %r2088, %r40, %r2078, %r2087; - ld.const.v4.u8 {%rs1298, %rs1299, %rs1300, %rs1301}, [matrix+648]; - cvt.u32.u16 %r2089, %rs1301; - cvt.s32.s8 %r2090, 
%r2089; - cvt.u32.u16 %r2091, %rs1300; - cvt.s32.s8 %r2092, %r2091; - cvt.u32.u16 %r2093, %rs1299; - cvt.s32.s8 %r2094, %r2093; - cvt.u32.u16 %r2095, %rs1298; - cvt.s32.s8 %r2096, %r2095; - mad.lo.s32 %r2097, %r42, %r2096, %r2088; - mad.lo.s32 %r2098, %r43, %r2094, %r2097; - mad.lo.s32 %r2099, %r45, %r2092, %r2098; - mad.lo.s32 %r2100, %r46, %r2090, %r2099; - ld.const.v4.u8 {%rs1306, %rs1307, %rs1308, %rs1309}, [matrix+652]; - cvt.u32.u16 %r2101, %rs1309; - cvt.s32.s8 %r2102, %r2101; - cvt.u32.u16 %r2103, %rs1308; - cvt.s32.s8 %r2104, %r2103; - cvt.u32.u16 %r2105, %rs1307; - cvt.s32.s8 %r2106, %r2105; - cvt.u32.u16 %r2107, %rs1306; - cvt.s32.s8 %r2108, %r2107; - mad.lo.s32 %r2109, %r48, %r2108, %r2100; - mad.lo.s32 %r2110, %r49, %r2106, %r2109; - mad.lo.s32 %r2111, %r50, %r2104, %r2110; - mad.lo.s32 %r2112, %r51, %r2102, %r2111; - ld.const.v4.u8 {%rs1314, %rs1315, %rs1316, %rs1317}, [matrix+656]; - cvt.u32.u16 %r2113, %rs1317; - cvt.s32.s8 %r2114, %r2113; - cvt.u32.u16 %r2115, %rs1316; - cvt.s32.s8 %r2116, %r2115; - cvt.u32.u16 %r2117, %rs1315; - cvt.s32.s8 %r2118, %r2117; - cvt.u32.u16 %r2119, %rs1314; - cvt.s32.s8 %r2120, %r2119; - mad.lo.s32 %r2121, %r173, %r2120, %r2112; - mad.lo.s32 %r2122, %r53, %r2118, %r2121; - mad.lo.s32 %r2123, %r54, %r2116, %r2122; - mad.lo.s32 %r2124, %r55, %r2114, %r2123; - ld.const.v4.u8 {%rs1322, %rs1323, %rs1324, %rs1325}, [matrix+660]; - cvt.u32.u16 %r2125, %rs1325; - cvt.s32.s8 %r2126, %r2125; - cvt.u32.u16 %r2127, %rs1324; - cvt.s32.s8 %r2128, %r2127; - cvt.u32.u16 %r2129, %rs1323; - cvt.s32.s8 %r2130, %r2129; - cvt.u32.u16 %r2131, %rs1322; - cvt.s32.s8 %r2132, %r2131; - mad.lo.s32 %r2133, %r56, %r2132, %r2124; - mad.lo.s32 %r2134, %r57, %r2130, %r2133; - mad.lo.s32 %r2135, %r58, %r2128, %r2134; - mad.lo.s32 %r2136, %r59, %r2126, %r2135; - ld.const.v4.u8 {%rs1330, %rs1331, %rs1332, %rs1333}, [matrix+664]; - cvt.u32.u16 %r2137, %rs1333; - cvt.s32.s8 %r2138, %r2137; - cvt.u32.u16 %r2139, %rs1332; - cvt.s32.s8 %r2140, %r2139; - cvt.u32.u16 %r2141, %rs1331; - cvt.s32.s8 %r2142, %r2141; - cvt.u32.u16 %r2143, %rs1330; - cvt.s32.s8 %r2144, %r2143; - mad.lo.s32 %r2145, %r61, %r2144, %r2136; - mad.lo.s32 %r2146, %r62, %r2142, %r2145; - mad.lo.s32 %r2147, %r64, %r2140, %r2146; - mad.lo.s32 %r2148, %r65, %r2138, %r2147; - ld.const.v4.u8 {%rs1338, %rs1339, %rs1340, %rs1341}, [matrix+668]; - cvt.u32.u16 %r2149, %rs1341; - cvt.s32.s8 %r2150, %r2149; - cvt.u32.u16 %r2151, %rs1340; - cvt.s32.s8 %r2152, %r2151; - cvt.u32.u16 %r2153, %rs1339; - cvt.s32.s8 %r2154, %r2153; - cvt.u32.u16 %r2155, %rs1338; - cvt.s32.s8 %r2156, %r2155; - mad.lo.s32 %r2157, %r67, %r2156, %r2148; - mad.lo.s32 %r2158, %r68, %r2154, %r2157; - mad.lo.s32 %r2159, %r69, %r2152, %r2158; - mad.lo.s32 %r2160, %r70, %r2150, %r2159; - ld.const.v4.u8 {%rs1346, %rs1347, %rs1348, %rs1349}, [matrix+672]; - cvt.u32.u16 %r2161, %rs1349; - cvt.s32.s8 %r2162, %r2161; - cvt.u32.u16 %r2163, %rs1348; - cvt.s32.s8 %r2164, %r2163; - cvt.u32.u16 %r2165, %rs1347; - cvt.s32.s8 %r2166, %r2165; - cvt.u32.u16 %r2167, %rs1346; - cvt.s32.s8 %r2168, %r2167; - mad.lo.s32 %r2169, %r222, %r2168, %r2160; - mad.lo.s32 %r2170, %r72, %r2166, %r2169; - mad.lo.s32 %r2171, %r73, %r2164, %r2170; - mad.lo.s32 %r2172, %r74, %r2162, %r2171; - ld.const.v4.u8 {%rs1354, %rs1355, %rs1356, %rs1357}, [matrix+676]; - cvt.u32.u16 %r2173, %rs1357; - cvt.s32.s8 %r2174, %r2173; - cvt.u32.u16 %r2175, %rs1356; - cvt.s32.s8 %r2176, %r2175; - cvt.u32.u16 %r2177, %rs1355; - cvt.s32.s8 %r2178, %r2177; - cvt.u32.u16 %r2179, %rs1354; - cvt.s32.s8 %r2180, 
%r2179; - mad.lo.s32 %r2181, %r75, %r2180, %r2172; - mad.lo.s32 %r2182, %r76, %r2178, %r2181; - mad.lo.s32 %r2183, %r77, %r2176, %r2182; - mad.lo.s32 %r2184, %r78, %r2174, %r2183; - ld.const.v4.u8 {%rs1362, %rs1363, %rs1364, %rs1365}, [matrix+680]; - cvt.u32.u16 %r2185, %rs1365; - cvt.s32.s8 %r2186, %r2185; - cvt.u32.u16 %r2187, %rs1364; - cvt.s32.s8 %r2188, %r2187; - cvt.u32.u16 %r2189, %rs1363; - cvt.s32.s8 %r2190, %r2189; - cvt.u32.u16 %r2191, %rs1362; - cvt.s32.s8 %r2192, %r2191; - mad.lo.s32 %r2193, %r80, %r2192, %r2184; - mad.lo.s32 %r2194, %r81, %r2190, %r2193; - mad.lo.s32 %r2195, %r83, %r2188, %r2194; - mad.lo.s32 %r2196, %r84, %r2186, %r2195; - ld.const.v4.u8 {%rs1370, %rs1371, %rs1372, %rs1373}, [matrix+684]; - cvt.u32.u16 %r2197, %rs1373; - cvt.s32.s8 %r2198, %r2197; - cvt.u32.u16 %r2199, %rs1372; - cvt.s32.s8 %r2200, %r2199; - cvt.u32.u16 %r2201, %rs1371; - cvt.s32.s8 %r2202, %r2201; - cvt.u32.u16 %r2203, %rs1370; - cvt.s32.s8 %r2204, %r2203; - mad.lo.s32 %r2205, %r86, %r2204, %r2196; - mad.lo.s32 %r2206, %r87, %r2202, %r2205; - mad.lo.s32 %r2207, %r88, %r2200, %r2206; - mad.lo.s32 %r2208, %r89, %r2198, %r2207; - ld.const.v4.u8 {%rs1378, %rs1379, %rs1380, %rs1381}, [matrix+688]; - cvt.u32.u16 %r2209, %rs1381; - cvt.s32.s8 %r2210, %r2209; - cvt.u32.u16 %r2211, %rs1380; - cvt.s32.s8 %r2212, %r2211; - cvt.u32.u16 %r2213, %rs1379; - cvt.s32.s8 %r2214, %r2213; - cvt.u32.u16 %r2215, %rs1378; - cvt.s32.s8 %r2216, %r2215; - mad.lo.s32 %r2217, %r271, %r2216, %r2208; - mad.lo.s32 %r2218, %r91, %r2214, %r2217; - mad.lo.s32 %r2219, %r93, %r2212, %r2218; - mad.lo.s32 %r2220, %r94, %r2210, %r2219; - ld.const.v4.u8 {%rs1386, %rs1387, %rs1388, %rs1389}, [matrix+692]; - cvt.u32.u16 %r2221, %rs1389; - cvt.s32.s8 %r2222, %r2221; - cvt.u32.u16 %r2223, %rs1388; - cvt.s32.s8 %r2224, %r2223; - cvt.u32.u16 %r2225, %rs1387; - cvt.s32.s8 %r2226, %r2225; - cvt.u32.u16 %r2227, %rs1386; - cvt.s32.s8 %r2228, %r2227; - mad.lo.s32 %r2229, %r96, %r2228, %r2220; - mad.lo.s32 %r2230, %r97, %r2226, %r2229; - mad.lo.s32 %r2231, %r99, %r2224, %r2230; - mad.lo.s32 %r2232, %r100, %r2222, %r2231; - ld.const.v4.u8 {%rs1394, %rs1395, %rs1396, %rs1397}, [matrix+696]; - cvt.u32.u16 %r2233, %rs1397; - cvt.s32.s8 %r2234, %r2233; - cvt.u32.u16 %r2235, %rs1396; - cvt.s32.s8 %r2236, %r2235; - cvt.u32.u16 %r2237, %rs1395; - cvt.s32.s8 %r2238, %r2237; - cvt.u32.u16 %r2239, %rs1394; - cvt.s32.s8 %r2240, %r2239; - mad.lo.s32 %r2241, %r103, %r2240, %r2232; - mad.lo.s32 %r2242, %r104, %r2238, %r2241; - mad.lo.s32 %r2243, %r107, %r2236, %r2242; - mad.lo.s32 %r2244, %r108, %r2234, %r2243; - ld.const.v4.u8 {%rs1402, %rs1403, %rs1404, %rs1405}, [matrix+700]; - cvt.u32.u16 %r2245, %rs1405; - cvt.s32.s8 %r2246, %r2245; - cvt.u32.u16 %r2247, %rs1404; - cvt.s32.s8 %r2248, %r2247; - cvt.u32.u16 %r2249, %rs1403; - cvt.s32.s8 %r2250, %r2249; - cvt.u32.u16 %r2251, %rs1402; - cvt.s32.s8 %r2252, %r2251; - mad.lo.s32 %r2253, %r111, %r2252, %r2244; - mad.lo.s32 %r2254, %r112, %r2250, %r2253; - mad.lo.s32 %r2255, %r114, %r2248, %r2254; - mad.lo.s32 %r2256, %r115, %r2246, %r2255; - ld.const.v4.u8 {%rs1410, %rs1411, %rs1412, %rs1413}, [matrix+704]; - cvt.u32.u16 %r2257, %rs1413; - cvt.s32.s8 %r2258, %r2257; - cvt.u32.u16 %r2259, %rs1412; - cvt.s32.s8 %r2260, %r2259; - cvt.u32.u16 %r2261, %rs1410; - cvt.s32.s8 %r2262, %r2261; - cvt.u32.u16 %r2263, %rs1411; - cvt.s32.s8 %r2264, %r2263; - mul.lo.s32 %r2265, %r34, %r2264; - mad.lo.s32 %r2266, %r124, %r2262, %r2265; - mad.lo.s32 %r2267, %r35, %r2260, %r2266; - mad.lo.s32 %r2268, %r36, %r2258, %r2267; - 
ld.const.v4.u8 {%rs1418, %rs1419, %rs1420, %rs1421}, [matrix+708]; - cvt.u32.u16 %r2269, %rs1421; - cvt.s32.s8 %r2270, %r2269; - cvt.u32.u16 %r2271, %rs1420; - cvt.s32.s8 %r2272, %r2271; - cvt.u32.u16 %r2273, %rs1419; - cvt.s32.s8 %r2274, %r2273; - cvt.u32.u16 %r2275, %rs1418; - cvt.s32.s8 %r2276, %r2275; - mad.lo.s32 %r2277, %r37, %r2276, %r2268; - mad.lo.s32 %r2278, %r38, %r2274, %r2277; - mad.lo.s32 %r2279, %r39, %r2272, %r2278; - mad.lo.s32 %r2280, %r40, %r2270, %r2279; - ld.const.v4.u8 {%rs1426, %rs1427, %rs1428, %rs1429}, [matrix+712]; - cvt.u32.u16 %r2281, %rs1429; - cvt.s32.s8 %r2282, %r2281; - cvt.u32.u16 %r2283, %rs1428; - cvt.s32.s8 %r2284, %r2283; - cvt.u32.u16 %r2285, %rs1427; - cvt.s32.s8 %r2286, %r2285; - cvt.u32.u16 %r2287, %rs1426; - cvt.s32.s8 %r2288, %r2287; - mad.lo.s32 %r2289, %r42, %r2288, %r2280; - mad.lo.s32 %r2290, %r43, %r2286, %r2289; - mad.lo.s32 %r2291, %r45, %r2284, %r2290; - mad.lo.s32 %r2292, %r46, %r2282, %r2291; - ld.const.v4.u8 {%rs1434, %rs1435, %rs1436, %rs1437}, [matrix+716]; - cvt.u32.u16 %r2293, %rs1437; - cvt.s32.s8 %r2294, %r2293; - cvt.u32.u16 %r2295, %rs1436; - cvt.s32.s8 %r2296, %r2295; - cvt.u32.u16 %r2297, %rs1435; - cvt.s32.s8 %r2298, %r2297; - cvt.u32.u16 %r2299, %rs1434; - cvt.s32.s8 %r2300, %r2299; - mad.lo.s32 %r2301, %r48, %r2300, %r2292; - mad.lo.s32 %r2302, %r49, %r2298, %r2301; - mad.lo.s32 %r2303, %r50, %r2296, %r2302; - mad.lo.s32 %r2304, %r51, %r2294, %r2303; - ld.const.v4.u8 {%rs1442, %rs1443, %rs1444, %rs1445}, [matrix+720]; - cvt.u32.u16 %r2305, %rs1445; - cvt.s32.s8 %r2306, %r2305; - cvt.u32.u16 %r2307, %rs1444; - cvt.s32.s8 %r2308, %r2307; - cvt.u32.u16 %r2309, %rs1443; - cvt.s32.s8 %r2310, %r2309; - cvt.u32.u16 %r2311, %rs1442; - cvt.s32.s8 %r2312, %r2311; - mad.lo.s32 %r2313, %r173, %r2312, %r2304; - mad.lo.s32 %r2314, %r53, %r2310, %r2313; - mad.lo.s32 %r2315, %r54, %r2308, %r2314; - mad.lo.s32 %r2316, %r55, %r2306, %r2315; - ld.const.v4.u8 {%rs1450, %rs1451, %rs1452, %rs1453}, [matrix+724]; - cvt.u32.u16 %r2317, %rs1453; - cvt.s32.s8 %r2318, %r2317; - cvt.u32.u16 %r2319, %rs1452; - cvt.s32.s8 %r2320, %r2319; - cvt.u32.u16 %r2321, %rs1451; - cvt.s32.s8 %r2322, %r2321; - cvt.u32.u16 %r2323, %rs1450; - cvt.s32.s8 %r2324, %r2323; - mad.lo.s32 %r2325, %r56, %r2324, %r2316; - mad.lo.s32 %r2326, %r57, %r2322, %r2325; - mad.lo.s32 %r2327, %r58, %r2320, %r2326; - mad.lo.s32 %r2328, %r59, %r2318, %r2327; - ld.const.v4.u8 {%rs1458, %rs1459, %rs1460, %rs1461}, [matrix+728]; - cvt.u32.u16 %r2329, %rs1461; - cvt.s32.s8 %r2330, %r2329; - cvt.u32.u16 %r2331, %rs1460; - cvt.s32.s8 %r2332, %r2331; - cvt.u32.u16 %r2333, %rs1459; - cvt.s32.s8 %r2334, %r2333; - cvt.u32.u16 %r2335, %rs1458; - cvt.s32.s8 %r2336, %r2335; - mad.lo.s32 %r2337, %r61, %r2336, %r2328; - mad.lo.s32 %r2338, %r62, %r2334, %r2337; - mad.lo.s32 %r2339, %r64, %r2332, %r2338; - mad.lo.s32 %r2340, %r65, %r2330, %r2339; - ld.const.v4.u8 {%rs1466, %rs1467, %rs1468, %rs1469}, [matrix+732]; - cvt.u32.u16 %r2341, %rs1469; - cvt.s32.s8 %r2342, %r2341; - cvt.u32.u16 %r2343, %rs1468; - cvt.s32.s8 %r2344, %r2343; - cvt.u32.u16 %r2345, %rs1467; - cvt.s32.s8 %r2346, %r2345; - cvt.u32.u16 %r2347, %rs1466; - cvt.s32.s8 %r2348, %r2347; - mad.lo.s32 %r2349, %r67, %r2348, %r2340; - mad.lo.s32 %r2350, %r68, %r2346, %r2349; - mad.lo.s32 %r2351, %r69, %r2344, %r2350; - mad.lo.s32 %r2352, %r70, %r2342, %r2351; - ld.const.v4.u8 {%rs1474, %rs1475, %rs1476, %rs1477}, [matrix+736]; - cvt.u32.u16 %r2353, %rs1477; - cvt.s32.s8 %r2354, %r2353; - cvt.u32.u16 %r2355, %rs1476; - cvt.s32.s8 %r2356, %r2355; 
- cvt.u32.u16 %r2357, %rs1475; - cvt.s32.s8 %r2358, %r2357; - cvt.u32.u16 %r2359, %rs1474; - cvt.s32.s8 %r2360, %r2359; - mad.lo.s32 %r2361, %r222, %r2360, %r2352; - mad.lo.s32 %r2362, %r72, %r2358, %r2361; - mad.lo.s32 %r2363, %r73, %r2356, %r2362; - mad.lo.s32 %r2364, %r74, %r2354, %r2363; - ld.const.v4.u8 {%rs1482, %rs1483, %rs1484, %rs1485}, [matrix+740]; - cvt.u32.u16 %r2365, %rs1485; - cvt.s32.s8 %r2366, %r2365; - cvt.u32.u16 %r2367, %rs1484; - cvt.s32.s8 %r2368, %r2367; - cvt.u32.u16 %r2369, %rs1483; - cvt.s32.s8 %r2370, %r2369; - cvt.u32.u16 %r2371, %rs1482; - cvt.s32.s8 %r2372, %r2371; - mad.lo.s32 %r2373, %r75, %r2372, %r2364; - mad.lo.s32 %r2374, %r76, %r2370, %r2373; - mad.lo.s32 %r2375, %r77, %r2368, %r2374; - mad.lo.s32 %r2376, %r78, %r2366, %r2375; - ld.const.v4.u8 {%rs1490, %rs1491, %rs1492, %rs1493}, [matrix+744]; - cvt.u32.u16 %r2377, %rs1493; - cvt.s32.s8 %r2378, %r2377; - cvt.u32.u16 %r2379, %rs1492; - cvt.s32.s8 %r2380, %r2379; - cvt.u32.u16 %r2381, %rs1491; - cvt.s32.s8 %r2382, %r2381; - cvt.u32.u16 %r2383, %rs1490; - cvt.s32.s8 %r2384, %r2383; - mad.lo.s32 %r2385, %r80, %r2384, %r2376; - mad.lo.s32 %r2386, %r81, %r2382, %r2385; - mad.lo.s32 %r2387, %r83, %r2380, %r2386; - mad.lo.s32 %r2388, %r84, %r2378, %r2387; - ld.const.v4.u8 {%rs1498, %rs1499, %rs1500, %rs1501}, [matrix+748]; - cvt.u32.u16 %r2389, %rs1501; - cvt.s32.s8 %r2390, %r2389; - cvt.u32.u16 %r2391, %rs1500; - cvt.s32.s8 %r2392, %r2391; - cvt.u32.u16 %r2393, %rs1499; - cvt.s32.s8 %r2394, %r2393; - cvt.u32.u16 %r2395, %rs1498; - cvt.s32.s8 %r2396, %r2395; - mad.lo.s32 %r2397, %r86, %r2396, %r2388; - mad.lo.s32 %r2398, %r87, %r2394, %r2397; - mad.lo.s32 %r2399, %r88, %r2392, %r2398; - mad.lo.s32 %r2400, %r89, %r2390, %r2399; - ld.const.v4.u8 {%rs1506, %rs1507, %rs1508, %rs1509}, [matrix+752]; - cvt.u32.u16 %r2401, %rs1509; - cvt.s32.s8 %r2402, %r2401; - cvt.u32.u16 %r2403, %rs1508; - cvt.s32.s8 %r2404, %r2403; - cvt.u32.u16 %r2405, %rs1507; - cvt.s32.s8 %r2406, %r2405; - cvt.u32.u16 %r2407, %rs1506; - cvt.s32.s8 %r2408, %r2407; - mad.lo.s32 %r2409, %r271, %r2408, %r2400; - mad.lo.s32 %r2410, %r91, %r2406, %r2409; - mad.lo.s32 %r2411, %r93, %r2404, %r2410; - mad.lo.s32 %r2412, %r94, %r2402, %r2411; - ld.const.v4.u8 {%rs1514, %rs1515, %rs1516, %rs1517}, [matrix+756]; - cvt.u32.u16 %r2413, %rs1517; - cvt.s32.s8 %r2414, %r2413; - cvt.u32.u16 %r2415, %rs1516; - cvt.s32.s8 %r2416, %r2415; - cvt.u32.u16 %r2417, %rs1515; - cvt.s32.s8 %r2418, %r2417; - cvt.u32.u16 %r2419, %rs1514; - cvt.s32.s8 %r2420, %r2419; - mad.lo.s32 %r2421, %r96, %r2420, %r2412; - mad.lo.s32 %r2422, %r97, %r2418, %r2421; - mad.lo.s32 %r2423, %r99, %r2416, %r2422; - mad.lo.s32 %r2424, %r100, %r2414, %r2423; - ld.const.v4.u8 {%rs1522, %rs1523, %rs1524, %rs1525}, [matrix+760]; - cvt.u32.u16 %r2425, %rs1525; - cvt.s32.s8 %r2426, %r2425; - cvt.u32.u16 %r2427, %rs1524; - cvt.s32.s8 %r2428, %r2427; - cvt.u32.u16 %r2429, %rs1523; - cvt.s32.s8 %r2430, %r2429; - cvt.u32.u16 %r2431, %rs1522; - cvt.s32.s8 %r2432, %r2431; - mad.lo.s32 %r2433, %r103, %r2432, %r2424; - mad.lo.s32 %r2434, %r104, %r2430, %r2433; - mad.lo.s32 %r2435, %r107, %r2428, %r2434; - mad.lo.s32 %r2436, %r108, %r2426, %r2435; - ld.const.v4.u8 {%rs1530, %rs1531, %rs1532, %rs1533}, [matrix+764]; - cvt.u32.u16 %r2437, %rs1533; - cvt.s32.s8 %r2438, %r2437; - cvt.u32.u16 %r2439, %rs1532; - cvt.s32.s8 %r2440, %r2439; - cvt.u32.u16 %r2441, %rs1531; - cvt.s32.s8 %r2442, %r2441; - cvt.u32.u16 %r2443, %rs1530; - cvt.s32.s8 %r2444, %r2443; - mad.lo.s32 %r2445, %r111, %r2444, %r2436; - mad.lo.s32 
%r2446, %r112, %r2442, %r2445; - mad.lo.s32 %r2447, %r114, %r2440, %r2446; - mad.lo.s32 %r2448, %r115, %r2438, %r2447; - shr.u32 %r2449, %r2256, 6; - and.b32 %r2450, %r2449, 240; - shr.u32 %r2451, %r2448, 10; - or.b32 %r2452, %r2451, %r2450; - xor.b32 %r2453, %r17, %r2452; - cvt.u64.u32 %rd385, %r2453; - ld.const.v4.u8 {%rs1538, %rs1539, %rs1540, %rs1541}, [matrix+768]; - cvt.u32.u16 %r2454, %rs1541; - cvt.s32.s8 %r2455, %r2454; - cvt.u32.u16 %r2456, %rs1540; - cvt.s32.s8 %r2457, %r2456; - cvt.u32.u16 %r2458, %rs1538; - cvt.s32.s8 %r2459, %r2458; - cvt.u32.u16 %r2460, %rs1539; - cvt.s32.s8 %r2461, %r2460; - mul.lo.s32 %r2462, %r34, %r2461; - mad.lo.s32 %r2463, %r124, %r2459, %r2462; - mad.lo.s32 %r2464, %r35, %r2457, %r2463; - mad.lo.s32 %r2465, %r36, %r2455, %r2464; - ld.const.v4.u8 {%rs1546, %rs1547, %rs1548, %rs1549}, [matrix+772]; - cvt.u32.u16 %r2466, %rs1549; - cvt.s32.s8 %r2467, %r2466; - cvt.u32.u16 %r2468, %rs1548; - cvt.s32.s8 %r2469, %r2468; - cvt.u32.u16 %r2470, %rs1547; - cvt.s32.s8 %r2471, %r2470; - cvt.u32.u16 %r2472, %rs1546; - cvt.s32.s8 %r2473, %r2472; - mad.lo.s32 %r2474, %r37, %r2473, %r2465; - mad.lo.s32 %r2475, %r38, %r2471, %r2474; - mad.lo.s32 %r2476, %r39, %r2469, %r2475; - mad.lo.s32 %r2477, %r40, %r2467, %r2476; - ld.const.v4.u8 {%rs1554, %rs1555, %rs1556, %rs1557}, [matrix+776]; - cvt.u32.u16 %r2478, %rs1557; - cvt.s32.s8 %r2479, %r2478; - cvt.u32.u16 %r2480, %rs1556; - cvt.s32.s8 %r2481, %r2480; - cvt.u32.u16 %r2482, %rs1555; - cvt.s32.s8 %r2483, %r2482; - cvt.u32.u16 %r2484, %rs1554; - cvt.s32.s8 %r2485, %r2484; - mad.lo.s32 %r2486, %r42, %r2485, %r2477; - mad.lo.s32 %r2487, %r43, %r2483, %r2486; - mad.lo.s32 %r2488, %r45, %r2481, %r2487; - mad.lo.s32 %r2489, %r46, %r2479, %r2488; - ld.const.v4.u8 {%rs1562, %rs1563, %rs1564, %rs1565}, [matrix+780]; - cvt.u32.u16 %r2490, %rs1565; - cvt.s32.s8 %r2491, %r2490; - cvt.u32.u16 %r2492, %rs1564; - cvt.s32.s8 %r2493, %r2492; - cvt.u32.u16 %r2494, %rs1563; - cvt.s32.s8 %r2495, %r2494; - cvt.u32.u16 %r2496, %rs1562; - cvt.s32.s8 %r2497, %r2496; - mad.lo.s32 %r2498, %r48, %r2497, %r2489; - mad.lo.s32 %r2499, %r49, %r2495, %r2498; - mad.lo.s32 %r2500, %r50, %r2493, %r2499; - mad.lo.s32 %r2501, %r51, %r2491, %r2500; - ld.const.v4.u8 {%rs1570, %rs1571, %rs1572, %rs1573}, [matrix+784]; - cvt.u32.u16 %r2502, %rs1573; - cvt.s32.s8 %r2503, %r2502; - cvt.u32.u16 %r2504, %rs1572; - cvt.s32.s8 %r2505, %r2504; - cvt.u32.u16 %r2506, %rs1571; - cvt.s32.s8 %r2507, %r2506; - cvt.u32.u16 %r2508, %rs1570; - cvt.s32.s8 %r2509, %r2508; - mad.lo.s32 %r2510, %r173, %r2509, %r2501; - mad.lo.s32 %r2511, %r53, %r2507, %r2510; - mad.lo.s32 %r2512, %r54, %r2505, %r2511; - mad.lo.s32 %r2513, %r55, %r2503, %r2512; - ld.const.v4.u8 {%rs1578, %rs1579, %rs1580, %rs1581}, [matrix+788]; - cvt.u32.u16 %r2514, %rs1581; - cvt.s32.s8 %r2515, %r2514; - cvt.u32.u16 %r2516, %rs1580; - cvt.s32.s8 %r2517, %r2516; - cvt.u32.u16 %r2518, %rs1579; - cvt.s32.s8 %r2519, %r2518; - cvt.u32.u16 %r2520, %rs1578; - cvt.s32.s8 %r2521, %r2520; - mad.lo.s32 %r2522, %r56, %r2521, %r2513; - mad.lo.s32 %r2523, %r57, %r2519, %r2522; - mad.lo.s32 %r2524, %r58, %r2517, %r2523; - mad.lo.s32 %r2525, %r59, %r2515, %r2524; - ld.const.v4.u8 {%rs1586, %rs1587, %rs1588, %rs1589}, [matrix+792]; - cvt.u32.u16 %r2526, %rs1589; - cvt.s32.s8 %r2527, %r2526; - cvt.u32.u16 %r2528, %rs1588; - cvt.s32.s8 %r2529, %r2528; - cvt.u32.u16 %r2530, %rs1587; - cvt.s32.s8 %r2531, %r2530; - cvt.u32.u16 %r2532, %rs1586; - cvt.s32.s8 %r2533, %r2532; - mad.lo.s32 %r2534, %r61, %r2533, %r2525; - mad.lo.s32 
%r2535, %r62, %r2531, %r2534; - mad.lo.s32 %r2536, %r64, %r2529, %r2535; - mad.lo.s32 %r2537, %r65, %r2527, %r2536; - ld.const.v4.u8 {%rs1594, %rs1595, %rs1596, %rs1597}, [matrix+796]; - cvt.u32.u16 %r2538, %rs1597; - cvt.s32.s8 %r2539, %r2538; - cvt.u32.u16 %r2540, %rs1596; - cvt.s32.s8 %r2541, %r2540; - cvt.u32.u16 %r2542, %rs1595; - cvt.s32.s8 %r2543, %r2542; - cvt.u32.u16 %r2544, %rs1594; - cvt.s32.s8 %r2545, %r2544; - mad.lo.s32 %r2546, %r67, %r2545, %r2537; - mad.lo.s32 %r2547, %r68, %r2543, %r2546; - mad.lo.s32 %r2548, %r69, %r2541, %r2547; - mad.lo.s32 %r2549, %r70, %r2539, %r2548; - ld.const.v4.u8 {%rs1602, %rs1603, %rs1604, %rs1605}, [matrix+800]; - cvt.u32.u16 %r2550, %rs1605; - cvt.s32.s8 %r2551, %r2550; - cvt.u32.u16 %r2552, %rs1604; - cvt.s32.s8 %r2553, %r2552; - cvt.u32.u16 %r2554, %rs1603; - cvt.s32.s8 %r2555, %r2554; - cvt.u32.u16 %r2556, %rs1602; - cvt.s32.s8 %r2557, %r2556; - mad.lo.s32 %r2558, %r222, %r2557, %r2549; - mad.lo.s32 %r2559, %r72, %r2555, %r2558; - mad.lo.s32 %r2560, %r73, %r2553, %r2559; - mad.lo.s32 %r2561, %r74, %r2551, %r2560; - ld.const.v4.u8 {%rs1610, %rs1611, %rs1612, %rs1613}, [matrix+804]; - cvt.u32.u16 %r2562, %rs1613; - cvt.s32.s8 %r2563, %r2562; - cvt.u32.u16 %r2564, %rs1612; - cvt.s32.s8 %r2565, %r2564; - cvt.u32.u16 %r2566, %rs1611; - cvt.s32.s8 %r2567, %r2566; - cvt.u32.u16 %r2568, %rs1610; - cvt.s32.s8 %r2569, %r2568; - mad.lo.s32 %r2570, %r75, %r2569, %r2561; - mad.lo.s32 %r2571, %r76, %r2567, %r2570; - mad.lo.s32 %r2572, %r77, %r2565, %r2571; - mad.lo.s32 %r2573, %r78, %r2563, %r2572; - ld.const.v4.u8 {%rs1618, %rs1619, %rs1620, %rs1621}, [matrix+808]; - cvt.u32.u16 %r2574, %rs1621; - cvt.s32.s8 %r2575, %r2574; - cvt.u32.u16 %r2576, %rs1620; - cvt.s32.s8 %r2577, %r2576; - cvt.u32.u16 %r2578, %rs1619; - cvt.s32.s8 %r2579, %r2578; - cvt.u32.u16 %r2580, %rs1618; - cvt.s32.s8 %r2581, %r2580; - mad.lo.s32 %r2582, %r80, %r2581, %r2573; - mad.lo.s32 %r2583, %r81, %r2579, %r2582; - mad.lo.s32 %r2584, %r83, %r2577, %r2583; - mad.lo.s32 %r2585, %r84, %r2575, %r2584; - ld.const.v4.u8 {%rs1626, %rs1627, %rs1628, %rs1629}, [matrix+812]; - cvt.u32.u16 %r2586, %rs1629; - cvt.s32.s8 %r2587, %r2586; - cvt.u32.u16 %r2588, %rs1628; - cvt.s32.s8 %r2589, %r2588; - cvt.u32.u16 %r2590, %rs1627; - cvt.s32.s8 %r2591, %r2590; - cvt.u32.u16 %r2592, %rs1626; - cvt.s32.s8 %r2593, %r2592; - mad.lo.s32 %r2594, %r86, %r2593, %r2585; - mad.lo.s32 %r2595, %r87, %r2591, %r2594; - mad.lo.s32 %r2596, %r88, %r2589, %r2595; - mad.lo.s32 %r2597, %r89, %r2587, %r2596; - ld.const.v4.u8 {%rs1634, %rs1635, %rs1636, %rs1637}, [matrix+816]; - cvt.u32.u16 %r2598, %rs1637; - cvt.s32.s8 %r2599, %r2598; - cvt.u32.u16 %r2600, %rs1636; - cvt.s32.s8 %r2601, %r2600; - cvt.u32.u16 %r2602, %rs1635; - cvt.s32.s8 %r2603, %r2602; - cvt.u32.u16 %r2604, %rs1634; - cvt.s32.s8 %r2605, %r2604; - mad.lo.s32 %r2606, %r271, %r2605, %r2597; - mad.lo.s32 %r2607, %r91, %r2603, %r2606; - mad.lo.s32 %r2608, %r93, %r2601, %r2607; - mad.lo.s32 %r2609, %r94, %r2599, %r2608; - ld.const.v4.u8 {%rs1642, %rs1643, %rs1644, %rs1645}, [matrix+820]; - cvt.u32.u16 %r2610, %rs1645; - cvt.s32.s8 %r2611, %r2610; - cvt.u32.u16 %r2612, %rs1644; - cvt.s32.s8 %r2613, %r2612; - cvt.u32.u16 %r2614, %rs1643; - cvt.s32.s8 %r2615, %r2614; - cvt.u32.u16 %r2616, %rs1642; - cvt.s32.s8 %r2617, %r2616; - mad.lo.s32 %r2618, %r96, %r2617, %r2609; - mad.lo.s32 %r2619, %r97, %r2615, %r2618; - mad.lo.s32 %r2620, %r99, %r2613, %r2619; - mad.lo.s32 %r2621, %r100, %r2611, %r2620; - ld.const.v4.u8 {%rs1650, %rs1651, %rs1652, %rs1653}, [matrix+824]; 
- cvt.u32.u16 %r2622, %rs1653; - cvt.s32.s8 %r2623, %r2622; - cvt.u32.u16 %r2624, %rs1652; - cvt.s32.s8 %r2625, %r2624; - cvt.u32.u16 %r2626, %rs1651; - cvt.s32.s8 %r2627, %r2626; - cvt.u32.u16 %r2628, %rs1650; - cvt.s32.s8 %r2629, %r2628; - mad.lo.s32 %r2630, %r103, %r2629, %r2621; - mad.lo.s32 %r2631, %r104, %r2627, %r2630; - mad.lo.s32 %r2632, %r107, %r2625, %r2631; - mad.lo.s32 %r2633, %r108, %r2623, %r2632; - ld.const.v4.u8 {%rs1658, %rs1659, %rs1660, %rs1661}, [matrix+828]; - cvt.u32.u16 %r2634, %rs1661; - cvt.s32.s8 %r2635, %r2634; - cvt.u32.u16 %r2636, %rs1660; - cvt.s32.s8 %r2637, %r2636; - cvt.u32.u16 %r2638, %rs1659; - cvt.s32.s8 %r2639, %r2638; - cvt.u32.u16 %r2640, %rs1658; - cvt.s32.s8 %r2641, %r2640; - mad.lo.s32 %r2642, %r111, %r2641, %r2633; - mad.lo.s32 %r2643, %r112, %r2639, %r2642; - mad.lo.s32 %r2644, %r114, %r2637, %r2643; - mad.lo.s32 %r2645, %r115, %r2635, %r2644; - ld.const.v4.u8 {%rs1666, %rs1667, %rs1668, %rs1669}, [matrix+832]; - cvt.u32.u16 %r2646, %rs1669; - cvt.s32.s8 %r2647, %r2646; - cvt.u32.u16 %r2648, %rs1668; - cvt.s32.s8 %r2649, %r2648; - cvt.u32.u16 %r2650, %rs1666; - cvt.s32.s8 %r2651, %r2650; - cvt.u32.u16 %r2652, %rs1667; - cvt.s32.s8 %r2653, %r2652; - mul.lo.s32 %r2654, %r34, %r2653; - mad.lo.s32 %r2655, %r124, %r2651, %r2654; - mad.lo.s32 %r2656, %r35, %r2649, %r2655; - mad.lo.s32 %r2657, %r36, %r2647, %r2656; - ld.const.v4.u8 {%rs1674, %rs1675, %rs1676, %rs1677}, [matrix+836]; - cvt.u32.u16 %r2658, %rs1677; - cvt.s32.s8 %r2659, %r2658; - cvt.u32.u16 %r2660, %rs1676; - cvt.s32.s8 %r2661, %r2660; - cvt.u32.u16 %r2662, %rs1675; - cvt.s32.s8 %r2663, %r2662; - cvt.u32.u16 %r2664, %rs1674; - cvt.s32.s8 %r2665, %r2664; - mad.lo.s32 %r2666, %r37, %r2665, %r2657; - mad.lo.s32 %r2667, %r38, %r2663, %r2666; - mad.lo.s32 %r2668, %r39, %r2661, %r2667; - mad.lo.s32 %r2669, %r40, %r2659, %r2668; - ld.const.v4.u8 {%rs1682, %rs1683, %rs1684, %rs1685}, [matrix+840]; - cvt.u32.u16 %r2670, %rs1685; - cvt.s32.s8 %r2671, %r2670; - cvt.u32.u16 %r2672, %rs1684; - cvt.s32.s8 %r2673, %r2672; - cvt.u32.u16 %r2674, %rs1683; - cvt.s32.s8 %r2675, %r2674; - cvt.u32.u16 %r2676, %rs1682; - cvt.s32.s8 %r2677, %r2676; - mad.lo.s32 %r2678, %r42, %r2677, %r2669; - mad.lo.s32 %r2679, %r43, %r2675, %r2678; - mad.lo.s32 %r2680, %r45, %r2673, %r2679; - mad.lo.s32 %r2681, %r46, %r2671, %r2680; - ld.const.v4.u8 {%rs1690, %rs1691, %rs1692, %rs1693}, [matrix+844]; - cvt.u32.u16 %r2682, %rs1693; - cvt.s32.s8 %r2683, %r2682; - cvt.u32.u16 %r2684, %rs1692; - cvt.s32.s8 %r2685, %r2684; - cvt.u32.u16 %r2686, %rs1691; - cvt.s32.s8 %r2687, %r2686; - cvt.u32.u16 %r2688, %rs1690; - cvt.s32.s8 %r2689, %r2688; - mad.lo.s32 %r2690, %r48, %r2689, %r2681; - mad.lo.s32 %r2691, %r49, %r2687, %r2690; - mad.lo.s32 %r2692, %r50, %r2685, %r2691; - mad.lo.s32 %r2693, %r51, %r2683, %r2692; - ld.const.v4.u8 {%rs1698, %rs1699, %rs1700, %rs1701}, [matrix+848]; - cvt.u32.u16 %r2694, %rs1701; - cvt.s32.s8 %r2695, %r2694; - cvt.u32.u16 %r2696, %rs1700; - cvt.s32.s8 %r2697, %r2696; - cvt.u32.u16 %r2698, %rs1699; - cvt.s32.s8 %r2699, %r2698; - cvt.u32.u16 %r2700, %rs1698; - cvt.s32.s8 %r2701, %r2700; - mad.lo.s32 %r2702, %r173, %r2701, %r2693; - mad.lo.s32 %r2703, %r53, %r2699, %r2702; - mad.lo.s32 %r2704, %r54, %r2697, %r2703; - mad.lo.s32 %r2705, %r55, %r2695, %r2704; - ld.const.v4.u8 {%rs1706, %rs1707, %rs1708, %rs1709}, [matrix+852]; - cvt.u32.u16 %r2706, %rs1709; - cvt.s32.s8 %r2707, %r2706; - cvt.u32.u16 %r2708, %rs1708; - cvt.s32.s8 %r2709, %r2708; - cvt.u32.u16 %r2710, %rs1707; - cvt.s32.s8 %r2711, %r2710; - 
cvt.u32.u16 %r2712, %rs1706; - cvt.s32.s8 %r2713, %r2712; - mad.lo.s32 %r2714, %r56, %r2713, %r2705; - mad.lo.s32 %r2715, %r57, %r2711, %r2714; - mad.lo.s32 %r2716, %r58, %r2709, %r2715; - mad.lo.s32 %r2717, %r59, %r2707, %r2716; - ld.const.v4.u8 {%rs1714, %rs1715, %rs1716, %rs1717}, [matrix+856]; - cvt.u32.u16 %r2718, %rs1717; - cvt.s32.s8 %r2719, %r2718; - cvt.u32.u16 %r2720, %rs1716; - cvt.s32.s8 %r2721, %r2720; - cvt.u32.u16 %r2722, %rs1715; - cvt.s32.s8 %r2723, %r2722; - cvt.u32.u16 %r2724, %rs1714; - cvt.s32.s8 %r2725, %r2724; - mad.lo.s32 %r2726, %r61, %r2725, %r2717; - mad.lo.s32 %r2727, %r62, %r2723, %r2726; - mad.lo.s32 %r2728, %r64, %r2721, %r2727; - mad.lo.s32 %r2729, %r65, %r2719, %r2728; - ld.const.v4.u8 {%rs1722, %rs1723, %rs1724, %rs1725}, [matrix+860]; - cvt.u32.u16 %r2730, %rs1725; - cvt.s32.s8 %r2731, %r2730; - cvt.u32.u16 %r2732, %rs1724; - cvt.s32.s8 %r2733, %r2732; - cvt.u32.u16 %r2734, %rs1723; - cvt.s32.s8 %r2735, %r2734; - cvt.u32.u16 %r2736, %rs1722; - cvt.s32.s8 %r2737, %r2736; - mad.lo.s32 %r2738, %r67, %r2737, %r2729; - mad.lo.s32 %r2739, %r68, %r2735, %r2738; - mad.lo.s32 %r2740, %r69, %r2733, %r2739; - mad.lo.s32 %r2741, %r70, %r2731, %r2740; - ld.const.v4.u8 {%rs1730, %rs1731, %rs1732, %rs1733}, [matrix+864]; - cvt.u32.u16 %r2742, %rs1733; - cvt.s32.s8 %r2743, %r2742; - cvt.u32.u16 %r2744, %rs1732; - cvt.s32.s8 %r2745, %r2744; - cvt.u32.u16 %r2746, %rs1731; - cvt.s32.s8 %r2747, %r2746; - cvt.u32.u16 %r2748, %rs1730; - cvt.s32.s8 %r2749, %r2748; - mad.lo.s32 %r2750, %r222, %r2749, %r2741; - mad.lo.s32 %r2751, %r72, %r2747, %r2750; - mad.lo.s32 %r2752, %r73, %r2745, %r2751; - mad.lo.s32 %r2753, %r74, %r2743, %r2752; - ld.const.v4.u8 {%rs1738, %rs1739, %rs1740, %rs1741}, [matrix+868]; - cvt.u32.u16 %r2754, %rs1741; - cvt.s32.s8 %r2755, %r2754; - cvt.u32.u16 %r2756, %rs1740; - cvt.s32.s8 %r2757, %r2756; - cvt.u32.u16 %r2758, %rs1739; - cvt.s32.s8 %r2759, %r2758; - cvt.u32.u16 %r2760, %rs1738; - cvt.s32.s8 %r2761, %r2760; - mad.lo.s32 %r2762, %r75, %r2761, %r2753; - mad.lo.s32 %r2763, %r76, %r2759, %r2762; - mad.lo.s32 %r2764, %r77, %r2757, %r2763; - mad.lo.s32 %r2765, %r78, %r2755, %r2764; - ld.const.v4.u8 {%rs1746, %rs1747, %rs1748, %rs1749}, [matrix+872]; - cvt.u32.u16 %r2766, %rs1749; - cvt.s32.s8 %r2767, %r2766; - cvt.u32.u16 %r2768, %rs1748; - cvt.s32.s8 %r2769, %r2768; - cvt.u32.u16 %r2770, %rs1747; - cvt.s32.s8 %r2771, %r2770; - cvt.u32.u16 %r2772, %rs1746; - cvt.s32.s8 %r2773, %r2772; - mad.lo.s32 %r2774, %r80, %r2773, %r2765; - mad.lo.s32 %r2775, %r81, %r2771, %r2774; - mad.lo.s32 %r2776, %r83, %r2769, %r2775; - mad.lo.s32 %r2777, %r84, %r2767, %r2776; - ld.const.v4.u8 {%rs1754, %rs1755, %rs1756, %rs1757}, [matrix+876]; - cvt.u32.u16 %r2778, %rs1757; - cvt.s32.s8 %r2779, %r2778; - cvt.u32.u16 %r2780, %rs1756; - cvt.s32.s8 %r2781, %r2780; - cvt.u32.u16 %r2782, %rs1755; - cvt.s32.s8 %r2783, %r2782; - cvt.u32.u16 %r2784, %rs1754; - cvt.s32.s8 %r2785, %r2784; - mad.lo.s32 %r2786, %r86, %r2785, %r2777; - mad.lo.s32 %r2787, %r87, %r2783, %r2786; - mad.lo.s32 %r2788, %r88, %r2781, %r2787; - mad.lo.s32 %r2789, %r89, %r2779, %r2788; - ld.const.v4.u8 {%rs1762, %rs1763, %rs1764, %rs1765}, [matrix+880]; - cvt.u32.u16 %r2790, %rs1765; - cvt.s32.s8 %r2791, %r2790; - cvt.u32.u16 %r2792, %rs1764; - cvt.s32.s8 %r2793, %r2792; - cvt.u32.u16 %r2794, %rs1763; - cvt.s32.s8 %r2795, %r2794; - cvt.u32.u16 %r2796, %rs1762; - cvt.s32.s8 %r2797, %r2796; - mad.lo.s32 %r2798, %r271, %r2797, %r2789; - mad.lo.s32 %r2799, %r91, %r2795, %r2798; - mad.lo.s32 %r2800, %r93, %r2793, 
%r2799; - mad.lo.s32 %r2801, %r94, %r2791, %r2800; - ld.const.v4.u8 {%rs1770, %rs1771, %rs1772, %rs1773}, [matrix+884]; - cvt.u32.u16 %r2802, %rs1773; - cvt.s32.s8 %r2803, %r2802; - cvt.u32.u16 %r2804, %rs1772; - cvt.s32.s8 %r2805, %r2804; - cvt.u32.u16 %r2806, %rs1771; - cvt.s32.s8 %r2807, %r2806; - cvt.u32.u16 %r2808, %rs1770; - cvt.s32.s8 %r2809, %r2808; - mad.lo.s32 %r2810, %r96, %r2809, %r2801; - mad.lo.s32 %r2811, %r97, %r2807, %r2810; - mad.lo.s32 %r2812, %r99, %r2805, %r2811; - mad.lo.s32 %r2813, %r100, %r2803, %r2812; - ld.const.v4.u8 {%rs1778, %rs1779, %rs1780, %rs1781}, [matrix+888]; - cvt.u32.u16 %r2814, %rs1781; - cvt.s32.s8 %r2815, %r2814; - cvt.u32.u16 %r2816, %rs1780; - cvt.s32.s8 %r2817, %r2816; - cvt.u32.u16 %r2818, %rs1779; - cvt.s32.s8 %r2819, %r2818; - cvt.u32.u16 %r2820, %rs1778; - cvt.s32.s8 %r2821, %r2820; - mad.lo.s32 %r2822, %r103, %r2821, %r2813; - mad.lo.s32 %r2823, %r104, %r2819, %r2822; - mad.lo.s32 %r2824, %r107, %r2817, %r2823; - mad.lo.s32 %r2825, %r108, %r2815, %r2824; - ld.const.v4.u8 {%rs1786, %rs1787, %rs1788, %rs1789}, [matrix+892]; - cvt.u32.u16 %r2826, %rs1789; - cvt.s32.s8 %r2827, %r2826; - cvt.u32.u16 %r2828, %rs1788; - cvt.s32.s8 %r2829, %r2828; - cvt.u32.u16 %r2830, %rs1787; - cvt.s32.s8 %r2831, %r2830; - cvt.u32.u16 %r2832, %rs1786; - cvt.s32.s8 %r2833, %r2832; - mad.lo.s32 %r2834, %r111, %r2833, %r2825; - mad.lo.s32 %r2835, %r112, %r2831, %r2834; - mad.lo.s32 %r2836, %r114, %r2829, %r2835; - mad.lo.s32 %r2837, %r115, %r2827, %r2836; - shr.u32 %r2838, %r2645, 6; - and.b32 %r2839, %r2838, 240; - shr.u32 %r2840, %r2837, 10; - or.b32 %r2841, %r2840, %r2839; - xor.b32 %r2842, %r18, %r2841; - cvt.u64.u32 %rd386, %r2842; - ld.const.v4.u8 {%rs1794, %rs1795, %rs1796, %rs1797}, [matrix+896]; - cvt.u32.u16 %r2843, %rs1797; - cvt.s32.s8 %r2844, %r2843; - cvt.u32.u16 %r2845, %rs1796; - cvt.s32.s8 %r2846, %r2845; - cvt.u32.u16 %r2847, %rs1794; - cvt.s32.s8 %r2848, %r2847; - cvt.u32.u16 %r2849, %rs1795; - cvt.s32.s8 %r2850, %r2849; - mul.lo.s32 %r2851, %r34, %r2850; - mad.lo.s32 %r2852, %r124, %r2848, %r2851; - mad.lo.s32 %r2853, %r35, %r2846, %r2852; - mad.lo.s32 %r2854, %r36, %r2844, %r2853; - ld.const.v4.u8 {%rs1802, %rs1803, %rs1804, %rs1805}, [matrix+900]; - cvt.u32.u16 %r2855, %rs1805; - cvt.s32.s8 %r2856, %r2855; - cvt.u32.u16 %r2857, %rs1804; - cvt.s32.s8 %r2858, %r2857; - cvt.u32.u16 %r2859, %rs1803; - cvt.s32.s8 %r2860, %r2859; - cvt.u32.u16 %r2861, %rs1802; - cvt.s32.s8 %r2862, %r2861; - mad.lo.s32 %r2863, %r37, %r2862, %r2854; - mad.lo.s32 %r2864, %r38, %r2860, %r2863; - mad.lo.s32 %r2865, %r39, %r2858, %r2864; - mad.lo.s32 %r2866, %r40, %r2856, %r2865; - ld.const.v4.u8 {%rs1810, %rs1811, %rs1812, %rs1813}, [matrix+904]; - cvt.u32.u16 %r2867, %rs1813; - cvt.s32.s8 %r2868, %r2867; - cvt.u32.u16 %r2869, %rs1812; - cvt.s32.s8 %r2870, %r2869; - cvt.u32.u16 %r2871, %rs1811; - cvt.s32.s8 %r2872, %r2871; - cvt.u32.u16 %r2873, %rs1810; - cvt.s32.s8 %r2874, %r2873; - mad.lo.s32 %r2875, %r42, %r2874, %r2866; - mad.lo.s32 %r2876, %r43, %r2872, %r2875; - mad.lo.s32 %r2877, %r45, %r2870, %r2876; - mad.lo.s32 %r2878, %r46, %r2868, %r2877; - ld.const.v4.u8 {%rs1818, %rs1819, %rs1820, %rs1821}, [matrix+908]; - cvt.u32.u16 %r2879, %rs1821; - cvt.s32.s8 %r2880, %r2879; - cvt.u32.u16 %r2881, %rs1820; - cvt.s32.s8 %r2882, %r2881; - cvt.u32.u16 %r2883, %rs1819; - cvt.s32.s8 %r2884, %r2883; - cvt.u32.u16 %r2885, %rs1818; - cvt.s32.s8 %r2886, %r2885; - mad.lo.s32 %r2887, %r48, %r2886, %r2878; - mad.lo.s32 %r2888, %r49, %r2884, %r2887; - mad.lo.s32 %r2889, %r50, %r2882, 
%r2888; - mad.lo.s32 %r2890, %r51, %r2880, %r2889; - ld.const.v4.u8 {%rs1826, %rs1827, %rs1828, %rs1829}, [matrix+912]; - cvt.u32.u16 %r2891, %rs1829; - cvt.s32.s8 %r2892, %r2891; - cvt.u32.u16 %r2893, %rs1828; - cvt.s32.s8 %r2894, %r2893; - cvt.u32.u16 %r2895, %rs1827; - cvt.s32.s8 %r2896, %r2895; - cvt.u32.u16 %r2897, %rs1826; - cvt.s32.s8 %r2898, %r2897; - mad.lo.s32 %r2899, %r173, %r2898, %r2890; - mad.lo.s32 %r2900, %r53, %r2896, %r2899; - mad.lo.s32 %r2901, %r54, %r2894, %r2900; - mad.lo.s32 %r2902, %r55, %r2892, %r2901; - ld.const.v4.u8 {%rs1834, %rs1835, %rs1836, %rs1837}, [matrix+916]; - cvt.u32.u16 %r2903, %rs1837; - cvt.s32.s8 %r2904, %r2903; - cvt.u32.u16 %r2905, %rs1836; - cvt.s32.s8 %r2906, %r2905; - cvt.u32.u16 %r2907, %rs1835; - cvt.s32.s8 %r2908, %r2907; - cvt.u32.u16 %r2909, %rs1834; - cvt.s32.s8 %r2910, %r2909; - mad.lo.s32 %r2911, %r56, %r2910, %r2902; - mad.lo.s32 %r2912, %r57, %r2908, %r2911; - mad.lo.s32 %r2913, %r58, %r2906, %r2912; - mad.lo.s32 %r2914, %r59, %r2904, %r2913; - ld.const.v4.u8 {%rs1842, %rs1843, %rs1844, %rs1845}, [matrix+920]; - cvt.u32.u16 %r2915, %rs1845; - cvt.s32.s8 %r2916, %r2915; - cvt.u32.u16 %r2917, %rs1844; - cvt.s32.s8 %r2918, %r2917; - cvt.u32.u16 %r2919, %rs1843; - cvt.s32.s8 %r2920, %r2919; - cvt.u32.u16 %r2921, %rs1842; - cvt.s32.s8 %r2922, %r2921; - mad.lo.s32 %r2923, %r61, %r2922, %r2914; - mad.lo.s32 %r2924, %r62, %r2920, %r2923; - mad.lo.s32 %r2925, %r64, %r2918, %r2924; - mad.lo.s32 %r2926, %r65, %r2916, %r2925; - ld.const.v4.u8 {%rs1850, %rs1851, %rs1852, %rs1853}, [matrix+924]; - cvt.u32.u16 %r2927, %rs1853; - cvt.s32.s8 %r2928, %r2927; - cvt.u32.u16 %r2929, %rs1852; - cvt.s32.s8 %r2930, %r2929; - cvt.u32.u16 %r2931, %rs1851; - cvt.s32.s8 %r2932, %r2931; - cvt.u32.u16 %r2933, %rs1850; - cvt.s32.s8 %r2934, %r2933; - mad.lo.s32 %r2935, %r67, %r2934, %r2926; - mad.lo.s32 %r2936, %r68, %r2932, %r2935; - mad.lo.s32 %r2937, %r69, %r2930, %r2936; - mad.lo.s32 %r2938, %r70, %r2928, %r2937; - ld.const.v4.u8 {%rs1858, %rs1859, %rs1860, %rs1861}, [matrix+928]; - cvt.u32.u16 %r2939, %rs1861; - cvt.s32.s8 %r2940, %r2939; - cvt.u32.u16 %r2941, %rs1860; - cvt.s32.s8 %r2942, %r2941; - cvt.u32.u16 %r2943, %rs1859; - cvt.s32.s8 %r2944, %r2943; - cvt.u32.u16 %r2945, %rs1858; - cvt.s32.s8 %r2946, %r2945; - mad.lo.s32 %r2947, %r222, %r2946, %r2938; - mad.lo.s32 %r2948, %r72, %r2944, %r2947; - mad.lo.s32 %r2949, %r73, %r2942, %r2948; - mad.lo.s32 %r2950, %r74, %r2940, %r2949; - ld.const.v4.u8 {%rs1866, %rs1867, %rs1868, %rs1869}, [matrix+932]; - cvt.u32.u16 %r2951, %rs1869; - cvt.s32.s8 %r2952, %r2951; - cvt.u32.u16 %r2953, %rs1868; - cvt.s32.s8 %r2954, %r2953; - cvt.u32.u16 %r2955, %rs1867; - cvt.s32.s8 %r2956, %r2955; - cvt.u32.u16 %r2957, %rs1866; - cvt.s32.s8 %r2958, %r2957; - mad.lo.s32 %r2959, %r75, %r2958, %r2950; - mad.lo.s32 %r2960, %r76, %r2956, %r2959; - mad.lo.s32 %r2961, %r77, %r2954, %r2960; - mad.lo.s32 %r2962, %r78, %r2952, %r2961; - ld.const.v4.u8 {%rs1874, %rs1875, %rs1876, %rs1877}, [matrix+936]; - cvt.u32.u16 %r2963, %rs1877; - cvt.s32.s8 %r2964, %r2963; - cvt.u32.u16 %r2965, %rs1876; - cvt.s32.s8 %r2966, %r2965; - cvt.u32.u16 %r2967, %rs1875; - cvt.s32.s8 %r2968, %r2967; - cvt.u32.u16 %r2969, %rs1874; - cvt.s32.s8 %r2970, %r2969; - mad.lo.s32 %r2971, %r80, %r2970, %r2962; - mad.lo.s32 %r2972, %r81, %r2968, %r2971; - mad.lo.s32 %r2973, %r83, %r2966, %r2972; - mad.lo.s32 %r2974, %r84, %r2964, %r2973; - ld.const.v4.u8 {%rs1882, %rs1883, %rs1884, %rs1885}, [matrix+940]; - cvt.u32.u16 %r2975, %rs1885; - cvt.s32.s8 %r2976, %r2975; - 
cvt.u32.u16 %r2977, %rs1884; - cvt.s32.s8 %r2978, %r2977; - cvt.u32.u16 %r2979, %rs1883; - cvt.s32.s8 %r2980, %r2979; - cvt.u32.u16 %r2981, %rs1882; - cvt.s32.s8 %r2982, %r2981; - mad.lo.s32 %r2983, %r86, %r2982, %r2974; - mad.lo.s32 %r2984, %r87, %r2980, %r2983; - mad.lo.s32 %r2985, %r88, %r2978, %r2984; - mad.lo.s32 %r2986, %r89, %r2976, %r2985; - ld.const.v4.u8 {%rs1890, %rs1891, %rs1892, %rs1893}, [matrix+944]; - cvt.u32.u16 %r2987, %rs1893; - cvt.s32.s8 %r2988, %r2987; - cvt.u32.u16 %r2989, %rs1892; - cvt.s32.s8 %r2990, %r2989; - cvt.u32.u16 %r2991, %rs1891; - cvt.s32.s8 %r2992, %r2991; - cvt.u32.u16 %r2993, %rs1890; - cvt.s32.s8 %r2994, %r2993; - mad.lo.s32 %r2995, %r271, %r2994, %r2986; - mad.lo.s32 %r2996, %r91, %r2992, %r2995; - mad.lo.s32 %r2997, %r93, %r2990, %r2996; - mad.lo.s32 %r2998, %r94, %r2988, %r2997; - ld.const.v4.u8 {%rs1898, %rs1899, %rs1900, %rs1901}, [matrix+948]; - cvt.u32.u16 %r2999, %rs1901; - cvt.s32.s8 %r3000, %r2999; - cvt.u32.u16 %r3001, %rs1900; - cvt.s32.s8 %r3002, %r3001; - cvt.u32.u16 %r3003, %rs1899; - cvt.s32.s8 %r3004, %r3003; - cvt.u32.u16 %r3005, %rs1898; - cvt.s32.s8 %r3006, %r3005; - mad.lo.s32 %r3007, %r96, %r3006, %r2998; - mad.lo.s32 %r3008, %r97, %r3004, %r3007; - mad.lo.s32 %r3009, %r99, %r3002, %r3008; - mad.lo.s32 %r3010, %r100, %r3000, %r3009; - ld.const.v4.u8 {%rs1906, %rs1907, %rs1908, %rs1909}, [matrix+952]; - cvt.u32.u16 %r3011, %rs1909; - cvt.s32.s8 %r3012, %r3011; - cvt.u32.u16 %r3013, %rs1908; - cvt.s32.s8 %r3014, %r3013; - cvt.u32.u16 %r3015, %rs1907; - cvt.s32.s8 %r3016, %r3015; - cvt.u32.u16 %r3017, %rs1906; - cvt.s32.s8 %r3018, %r3017; - mad.lo.s32 %r3019, %r103, %r3018, %r3010; - mad.lo.s32 %r3020, %r104, %r3016, %r3019; - mad.lo.s32 %r3021, %r107, %r3014, %r3020; - mad.lo.s32 %r3022, %r108, %r3012, %r3021; - ld.const.v4.u8 {%rs1914, %rs1915, %rs1916, %rs1917}, [matrix+956]; - cvt.u32.u16 %r3023, %rs1917; - cvt.s32.s8 %r3024, %r3023; - cvt.u32.u16 %r3025, %rs1916; - cvt.s32.s8 %r3026, %r3025; - cvt.u32.u16 %r3027, %rs1915; - cvt.s32.s8 %r3028, %r3027; - cvt.u32.u16 %r3029, %rs1914; - cvt.s32.s8 %r3030, %r3029; - mad.lo.s32 %r3031, %r111, %r3030, %r3022; - mad.lo.s32 %r3032, %r112, %r3028, %r3031; - mad.lo.s32 %r3033, %r114, %r3026, %r3032; - mad.lo.s32 %r3034, %r115, %r3024, %r3033; - ld.const.v4.u8 {%rs1922, %rs1923, %rs1924, %rs1925}, [matrix+960]; - cvt.u32.u16 %r3035, %rs1925; - cvt.s32.s8 %r3036, %r3035; - cvt.u32.u16 %r3037, %rs1924; - cvt.s32.s8 %r3038, %r3037; - cvt.u32.u16 %r3039, %rs1922; - cvt.s32.s8 %r3040, %r3039; - cvt.u32.u16 %r3041, %rs1923; - cvt.s32.s8 %r3042, %r3041; - mul.lo.s32 %r3043, %r34, %r3042; - mad.lo.s32 %r3044, %r124, %r3040, %r3043; - mad.lo.s32 %r3045, %r35, %r3038, %r3044; - mad.lo.s32 %r3046, %r36, %r3036, %r3045; - ld.const.v4.u8 {%rs1930, %rs1931, %rs1932, %rs1933}, [matrix+964]; - cvt.u32.u16 %r3047, %rs1933; - cvt.s32.s8 %r3048, %r3047; - cvt.u32.u16 %r3049, %rs1932; - cvt.s32.s8 %r3050, %r3049; - cvt.u32.u16 %r3051, %rs1931; - cvt.s32.s8 %r3052, %r3051; - cvt.u32.u16 %r3053, %rs1930; - cvt.s32.s8 %r3054, %r3053; - mad.lo.s32 %r3055, %r37, %r3054, %r3046; - mad.lo.s32 %r3056, %r38, %r3052, %r3055; - mad.lo.s32 %r3057, %r39, %r3050, %r3056; - mad.lo.s32 %r3058, %r40, %r3048, %r3057; - ld.const.v4.u8 {%rs1938, %rs1939, %rs1940, %rs1941}, [matrix+968]; - cvt.u32.u16 %r3059, %rs1941; - cvt.s32.s8 %r3060, %r3059; - cvt.u32.u16 %r3061, %rs1940; - cvt.s32.s8 %r3062, %r3061; - cvt.u32.u16 %r3063, %rs1939; - cvt.s32.s8 %r3064, %r3063; - cvt.u32.u16 %r3065, %rs1938; - cvt.s32.s8 %r3066, %r3065; - 
mad.lo.s32 %r3067, %r42, %r3066, %r3058; - mad.lo.s32 %r3068, %r43, %r3064, %r3067; - mad.lo.s32 %r3069, %r45, %r3062, %r3068; - mad.lo.s32 %r3070, %r46, %r3060, %r3069; - ld.const.v4.u8 {%rs1946, %rs1947, %rs1948, %rs1949}, [matrix+972]; - cvt.u32.u16 %r3071, %rs1949; - cvt.s32.s8 %r3072, %r3071; - cvt.u32.u16 %r3073, %rs1948; - cvt.s32.s8 %r3074, %r3073; - cvt.u32.u16 %r3075, %rs1947; - cvt.s32.s8 %r3076, %r3075; - cvt.u32.u16 %r3077, %rs1946; - cvt.s32.s8 %r3078, %r3077; - mad.lo.s32 %r3079, %r48, %r3078, %r3070; - mad.lo.s32 %r3080, %r49, %r3076, %r3079; - mad.lo.s32 %r3081, %r50, %r3074, %r3080; - mad.lo.s32 %r3082, %r51, %r3072, %r3081; - ld.const.v4.u8 {%rs1954, %rs1955, %rs1956, %rs1957}, [matrix+976]; - cvt.u32.u16 %r3083, %rs1957; - cvt.s32.s8 %r3084, %r3083; - cvt.u32.u16 %r3085, %rs1956; - cvt.s32.s8 %r3086, %r3085; - cvt.u32.u16 %r3087, %rs1955; - cvt.s32.s8 %r3088, %r3087; - cvt.u32.u16 %r3089, %rs1954; - cvt.s32.s8 %r3090, %r3089; - mad.lo.s32 %r3091, %r173, %r3090, %r3082; - mad.lo.s32 %r3092, %r53, %r3088, %r3091; - mad.lo.s32 %r3093, %r54, %r3086, %r3092; - mad.lo.s32 %r3094, %r55, %r3084, %r3093; - ld.const.v4.u8 {%rs1962, %rs1963, %rs1964, %rs1965}, [matrix+980]; - cvt.u32.u16 %r3095, %rs1965; - cvt.s32.s8 %r3096, %r3095; - cvt.u32.u16 %r3097, %rs1964; - cvt.s32.s8 %r3098, %r3097; - cvt.u32.u16 %r3099, %rs1963; - cvt.s32.s8 %r3100, %r3099; - cvt.u32.u16 %r3101, %rs1962; - cvt.s32.s8 %r3102, %r3101; - mad.lo.s32 %r3103, %r56, %r3102, %r3094; - mad.lo.s32 %r3104, %r57, %r3100, %r3103; - mad.lo.s32 %r3105, %r58, %r3098, %r3104; - mad.lo.s32 %r3106, %r59, %r3096, %r3105; - ld.const.v4.u8 {%rs1970, %rs1971, %rs1972, %rs1973}, [matrix+984]; - cvt.u32.u16 %r3107, %rs1973; - cvt.s32.s8 %r3108, %r3107; - cvt.u32.u16 %r3109, %rs1972; - cvt.s32.s8 %r3110, %r3109; - cvt.u32.u16 %r3111, %rs1971; - cvt.s32.s8 %r3112, %r3111; - cvt.u32.u16 %r3113, %rs1970; - cvt.s32.s8 %r3114, %r3113; - mad.lo.s32 %r3115, %r61, %r3114, %r3106; - mad.lo.s32 %r3116, %r62, %r3112, %r3115; - mad.lo.s32 %r3117, %r64, %r3110, %r3116; - mad.lo.s32 %r3118, %r65, %r3108, %r3117; - ld.const.v4.u8 {%rs1978, %rs1979, %rs1980, %rs1981}, [matrix+988]; - cvt.u32.u16 %r3119, %rs1981; - cvt.s32.s8 %r3120, %r3119; - cvt.u32.u16 %r3121, %rs1980; - cvt.s32.s8 %r3122, %r3121; - cvt.u32.u16 %r3123, %rs1979; - cvt.s32.s8 %r3124, %r3123; - cvt.u32.u16 %r3125, %rs1978; - cvt.s32.s8 %r3126, %r3125; - mad.lo.s32 %r3127, %r67, %r3126, %r3118; - mad.lo.s32 %r3128, %r68, %r3124, %r3127; - mad.lo.s32 %r3129, %r69, %r3122, %r3128; - mad.lo.s32 %r3130, %r70, %r3120, %r3129; - ld.const.v4.u8 {%rs1986, %rs1987, %rs1988, %rs1989}, [matrix+992]; - cvt.u32.u16 %r3131, %rs1989; - cvt.s32.s8 %r3132, %r3131; - cvt.u32.u16 %r3133, %rs1988; - cvt.s32.s8 %r3134, %r3133; - cvt.u32.u16 %r3135, %rs1987; - cvt.s32.s8 %r3136, %r3135; - cvt.u32.u16 %r3137, %rs1986; - cvt.s32.s8 %r3138, %r3137; - mad.lo.s32 %r3139, %r222, %r3138, %r3130; - mad.lo.s32 %r3140, %r72, %r3136, %r3139; - mad.lo.s32 %r3141, %r73, %r3134, %r3140; - mad.lo.s32 %r3142, %r74, %r3132, %r3141; - ld.const.v4.u8 {%rs1994, %rs1995, %rs1996, %rs1997}, [matrix+996]; - cvt.u32.u16 %r3143, %rs1997; - cvt.s32.s8 %r3144, %r3143; - cvt.u32.u16 %r3145, %rs1996; - cvt.s32.s8 %r3146, %r3145; - cvt.u32.u16 %r3147, %rs1995; - cvt.s32.s8 %r3148, %r3147; - cvt.u32.u16 %r3149, %rs1994; - cvt.s32.s8 %r3150, %r3149; - mad.lo.s32 %r3151, %r75, %r3150, %r3142; - mad.lo.s32 %r3152, %r76, %r3148, %r3151; - mad.lo.s32 %r3153, %r77, %r3146, %r3152; - mad.lo.s32 %r3154, %r78, %r3144, %r3153; - 
-	ld.const.v4.u8 	{%rs2002, %rs2003, %rs2004, %rs2005}, [matrix+1000];
-	cvt.u32.u16 	%r3155, %rs2005;
-	cvt.s32.s8 	%r3156, %r3155;
-	cvt.u32.u16 	%r3157, %rs2004;
-	cvt.s32.s8 	%r3158, %r3157;
-	cvt.u32.u16 	%r3159, %rs2003;
-	cvt.s32.s8 	%r3160, %r3159;
-	cvt.u32.u16 	%r3161, %rs2002;
-	cvt.s32.s8 	%r3162, %r3161;
-	mad.lo.s32 	%r3163, %r80, %r3162, %r3154;
-	mad.lo.s32 	%r3164, %r81, %r3160, %r3163;
-	mad.lo.s32 	%r3165, %r83, %r3158, %r3164;
-	mad.lo.s32 	%r3166, %r84, %r3156, %r3165;
[…the deleted hunk continues in this fully unrolled pattern for constant offsets matrix+1004 through matrix+1608: each ld.const.v4.u8 loads four signed 8-bit matrix entries, eight cvt.u32.u16/cvt.s32.s8 pairs sign-extend them, and four mad.lo.s32 accumulate them against the input values held in %r34–%r115 (plus renamed copies %r124, %r173, %r222, %r271). A fresh accumulator chain starts with a mul.lo.s32 against %r34 every 64 terms, and each pair of finished 64-term dot products is packed and mixed with an input word as:]
-	shr.u32 	%r3227, %r3034, 6;
-	and.b32  	%r3228, %r3227, 240;
-	shr.u32 	%r3229, %r3226, 10;
-	or.b32  	%r3230, %r3229, %r3228;
-	xor.b32  	%r3231, %r19, %r3230;
[…the XOR is taken against %r19, %r52, %r20, %r21, %r22 over this stretch, with cvt.u64.u32 widening three of the packed results into %rd387–%rd389. Register numbering runs mechanically from %rs2002/%r3155 at matrix+1000 to %rs3221/%r5013 at matrix+1608.]
%r5012; - mad.lo.s32 %r5014, %r45, %r5007, %r5013; - mad.lo.s32 %r5015, %r46, %r5005, %r5014; - ld.const.v4.u8 {%rs3226, %rs3227, %rs3228, %rs3229}, [matrix+1612]; - cvt.u32.u16 %r5016, %rs3229; - cvt.s32.s8 %r5017, %r5016; - cvt.u32.u16 %r5018, %rs3228; - cvt.s32.s8 %r5019, %r5018; - cvt.u32.u16 %r5020, %rs3227; - cvt.s32.s8 %r5021, %r5020; - cvt.u32.u16 %r5022, %rs3226; - cvt.s32.s8 %r5023, %r5022; - mad.lo.s32 %r5024, %r48, %r5023, %r5015; - mad.lo.s32 %r5025, %r49, %r5021, %r5024; - mad.lo.s32 %r5026, %r50, %r5019, %r5025; - mad.lo.s32 %r5027, %r51, %r5017, %r5026; - ld.const.v4.u8 {%rs3234, %rs3235, %rs3236, %rs3237}, [matrix+1616]; - cvt.u32.u16 %r5028, %rs3237; - cvt.s32.s8 %r5029, %r5028; - cvt.u32.u16 %r5030, %rs3236; - cvt.s32.s8 %r5031, %r5030; - cvt.u32.u16 %r5032, %rs3235; - cvt.s32.s8 %r5033, %r5032; - cvt.u32.u16 %r5034, %rs3234; - cvt.s32.s8 %r5035, %r5034; - mad.lo.s32 %r5036, %r173, %r5035, %r5027; - mad.lo.s32 %r5037, %r53, %r5033, %r5036; - mad.lo.s32 %r5038, %r54, %r5031, %r5037; - mad.lo.s32 %r5039, %r55, %r5029, %r5038; - ld.const.v4.u8 {%rs3242, %rs3243, %rs3244, %rs3245}, [matrix+1620]; - cvt.u32.u16 %r5040, %rs3245; - cvt.s32.s8 %r5041, %r5040; - cvt.u32.u16 %r5042, %rs3244; - cvt.s32.s8 %r5043, %r5042; - cvt.u32.u16 %r5044, %rs3243; - cvt.s32.s8 %r5045, %r5044; - cvt.u32.u16 %r5046, %rs3242; - cvt.s32.s8 %r5047, %r5046; - mad.lo.s32 %r5048, %r56, %r5047, %r5039; - mad.lo.s32 %r5049, %r57, %r5045, %r5048; - mad.lo.s32 %r5050, %r58, %r5043, %r5049; - mad.lo.s32 %r5051, %r59, %r5041, %r5050; - ld.const.v4.u8 {%rs3250, %rs3251, %rs3252, %rs3253}, [matrix+1624]; - cvt.u32.u16 %r5052, %rs3253; - cvt.s32.s8 %r5053, %r5052; - cvt.u32.u16 %r5054, %rs3252; - cvt.s32.s8 %r5055, %r5054; - cvt.u32.u16 %r5056, %rs3251; - cvt.s32.s8 %r5057, %r5056; - cvt.u32.u16 %r5058, %rs3250; - cvt.s32.s8 %r5059, %r5058; - mad.lo.s32 %r5060, %r61, %r5059, %r5051; - mad.lo.s32 %r5061, %r62, %r5057, %r5060; - mad.lo.s32 %r5062, %r64, %r5055, %r5061; - mad.lo.s32 %r5063, %r65, %r5053, %r5062; - ld.const.v4.u8 {%rs3258, %rs3259, %rs3260, %rs3261}, [matrix+1628]; - cvt.u32.u16 %r5064, %rs3261; - cvt.s32.s8 %r5065, %r5064; - cvt.u32.u16 %r5066, %rs3260; - cvt.s32.s8 %r5067, %r5066; - cvt.u32.u16 %r5068, %rs3259; - cvt.s32.s8 %r5069, %r5068; - cvt.u32.u16 %r5070, %rs3258; - cvt.s32.s8 %r5071, %r5070; - mad.lo.s32 %r5072, %r67, %r5071, %r5063; - mad.lo.s32 %r5073, %r68, %r5069, %r5072; - mad.lo.s32 %r5074, %r69, %r5067, %r5073; - mad.lo.s32 %r5075, %r70, %r5065, %r5074; - ld.const.v4.u8 {%rs3266, %rs3267, %rs3268, %rs3269}, [matrix+1632]; - cvt.u32.u16 %r5076, %rs3269; - cvt.s32.s8 %r5077, %r5076; - cvt.u32.u16 %r5078, %rs3268; - cvt.s32.s8 %r5079, %r5078; - cvt.u32.u16 %r5080, %rs3267; - cvt.s32.s8 %r5081, %r5080; - cvt.u32.u16 %r5082, %rs3266; - cvt.s32.s8 %r5083, %r5082; - mad.lo.s32 %r5084, %r222, %r5083, %r5075; - mad.lo.s32 %r5085, %r72, %r5081, %r5084; - mad.lo.s32 %r5086, %r73, %r5079, %r5085; - mad.lo.s32 %r5087, %r74, %r5077, %r5086; - ld.const.v4.u8 {%rs3274, %rs3275, %rs3276, %rs3277}, [matrix+1636]; - cvt.u32.u16 %r5088, %rs3277; - cvt.s32.s8 %r5089, %r5088; - cvt.u32.u16 %r5090, %rs3276; - cvt.s32.s8 %r5091, %r5090; - cvt.u32.u16 %r5092, %rs3275; - cvt.s32.s8 %r5093, %r5092; - cvt.u32.u16 %r5094, %rs3274; - cvt.s32.s8 %r5095, %r5094; - mad.lo.s32 %r5096, %r75, %r5095, %r5087; - mad.lo.s32 %r5097, %r76, %r5093, %r5096; - mad.lo.s32 %r5098, %r77, %r5091, %r5097; - mad.lo.s32 %r5099, %r78, %r5089, %r5098; - ld.const.v4.u8 {%rs3282, %rs3283, %rs3284, %rs3285}, [matrix+1640]; - cvt.u32.u16 
%r5100, %rs3285; - cvt.s32.s8 %r5101, %r5100; - cvt.u32.u16 %r5102, %rs3284; - cvt.s32.s8 %r5103, %r5102; - cvt.u32.u16 %r5104, %rs3283; - cvt.s32.s8 %r5105, %r5104; - cvt.u32.u16 %r5106, %rs3282; - cvt.s32.s8 %r5107, %r5106; - mad.lo.s32 %r5108, %r80, %r5107, %r5099; - mad.lo.s32 %r5109, %r81, %r5105, %r5108; - mad.lo.s32 %r5110, %r83, %r5103, %r5109; - mad.lo.s32 %r5111, %r84, %r5101, %r5110; - ld.const.v4.u8 {%rs3290, %rs3291, %rs3292, %rs3293}, [matrix+1644]; - cvt.u32.u16 %r5112, %rs3293; - cvt.s32.s8 %r5113, %r5112; - cvt.u32.u16 %r5114, %rs3292; - cvt.s32.s8 %r5115, %r5114; - cvt.u32.u16 %r5116, %rs3291; - cvt.s32.s8 %r5117, %r5116; - cvt.u32.u16 %r5118, %rs3290; - cvt.s32.s8 %r5119, %r5118; - mad.lo.s32 %r5120, %r86, %r5119, %r5111; - mad.lo.s32 %r5121, %r87, %r5117, %r5120; - mad.lo.s32 %r5122, %r88, %r5115, %r5121; - mad.lo.s32 %r5123, %r89, %r5113, %r5122; - ld.const.v4.u8 {%rs3298, %rs3299, %rs3300, %rs3301}, [matrix+1648]; - cvt.u32.u16 %r5124, %rs3301; - cvt.s32.s8 %r5125, %r5124; - cvt.u32.u16 %r5126, %rs3300; - cvt.s32.s8 %r5127, %r5126; - cvt.u32.u16 %r5128, %rs3299; - cvt.s32.s8 %r5129, %r5128; - cvt.u32.u16 %r5130, %rs3298; - cvt.s32.s8 %r5131, %r5130; - mad.lo.s32 %r5132, %r271, %r5131, %r5123; - mad.lo.s32 %r5133, %r91, %r5129, %r5132; - mad.lo.s32 %r5134, %r93, %r5127, %r5133; - mad.lo.s32 %r5135, %r94, %r5125, %r5134; - ld.const.v4.u8 {%rs3306, %rs3307, %rs3308, %rs3309}, [matrix+1652]; - cvt.u32.u16 %r5136, %rs3309; - cvt.s32.s8 %r5137, %r5136; - cvt.u32.u16 %r5138, %rs3308; - cvt.s32.s8 %r5139, %r5138; - cvt.u32.u16 %r5140, %rs3307; - cvt.s32.s8 %r5141, %r5140; - cvt.u32.u16 %r5142, %rs3306; - cvt.s32.s8 %r5143, %r5142; - mad.lo.s32 %r5144, %r96, %r5143, %r5135; - mad.lo.s32 %r5145, %r97, %r5141, %r5144; - mad.lo.s32 %r5146, %r99, %r5139, %r5145; - mad.lo.s32 %r5147, %r100, %r5137, %r5146; - ld.const.v4.u8 {%rs3314, %rs3315, %rs3316, %rs3317}, [matrix+1656]; - cvt.u32.u16 %r5148, %rs3317; - cvt.s32.s8 %r5149, %r5148; - cvt.u32.u16 %r5150, %rs3316; - cvt.s32.s8 %r5151, %r5150; - cvt.u32.u16 %r5152, %rs3315; - cvt.s32.s8 %r5153, %r5152; - cvt.u32.u16 %r5154, %rs3314; - cvt.s32.s8 %r5155, %r5154; - mad.lo.s32 %r5156, %r103, %r5155, %r5147; - mad.lo.s32 %r5157, %r104, %r5153, %r5156; - mad.lo.s32 %r5158, %r107, %r5151, %r5157; - mad.lo.s32 %r5159, %r108, %r5149, %r5158; - ld.const.v4.u8 {%rs3322, %rs3323, %rs3324, %rs3325}, [matrix+1660]; - cvt.u32.u16 %r5160, %rs3325; - cvt.s32.s8 %r5161, %r5160; - cvt.u32.u16 %r5162, %rs3324; - cvt.s32.s8 %r5163, %r5162; - cvt.u32.u16 %r5164, %rs3323; - cvt.s32.s8 %r5165, %r5164; - cvt.u32.u16 %r5166, %rs3322; - cvt.s32.s8 %r5167, %r5166; - mad.lo.s32 %r5168, %r111, %r5167, %r5159; - mad.lo.s32 %r5169, %r112, %r5165, %r5168; - mad.lo.s32 %r5170, %r114, %r5163, %r5169; - mad.lo.s32 %r5171, %r115, %r5161, %r5170; - shr.u32 %r5172, %r4979, 6; - and.b32 %r5173, %r5172, 240; - shr.u32 %r5174, %r5171, 10; - or.b32 %r5175, %r5174, %r5173; - xor.b32 %r5176, %r23, %r5175; - cvt.u64.u32 %rd390, %r5176; - ld.const.v4.u8 {%rs3330, %rs3331, %rs3332, %rs3333}, [matrix+1664]; - cvt.u32.u16 %r5177, %rs3333; - cvt.s32.s8 %r5178, %r5177; - cvt.u32.u16 %r5179, %rs3332; - cvt.s32.s8 %r5180, %r5179; - cvt.u32.u16 %r5181, %rs3330; - cvt.s32.s8 %r5182, %r5181; - cvt.u32.u16 %r5183, %rs3331; - cvt.s32.s8 %r5184, %r5183; - mul.lo.s32 %r5185, %r34, %r5184; - mad.lo.s32 %r5186, %r124, %r5182, %r5185; - mad.lo.s32 %r5187, %r35, %r5180, %r5186; - mad.lo.s32 %r5188, %r36, %r5178, %r5187; - ld.const.v4.u8 {%rs3338, %rs3339, %rs3340, %rs3341}, [matrix+1668]; - 
cvt.u32.u16 %r5189, %rs3341; - cvt.s32.s8 %r5190, %r5189; - cvt.u32.u16 %r5191, %rs3340; - cvt.s32.s8 %r5192, %r5191; - cvt.u32.u16 %r5193, %rs3339; - cvt.s32.s8 %r5194, %r5193; - cvt.u32.u16 %r5195, %rs3338; - cvt.s32.s8 %r5196, %r5195; - mad.lo.s32 %r5197, %r37, %r5196, %r5188; - mad.lo.s32 %r5198, %r38, %r5194, %r5197; - mad.lo.s32 %r5199, %r39, %r5192, %r5198; - mad.lo.s32 %r5200, %r40, %r5190, %r5199; - ld.const.v4.u8 {%rs3346, %rs3347, %rs3348, %rs3349}, [matrix+1672]; - cvt.u32.u16 %r5201, %rs3349; - cvt.s32.s8 %r5202, %r5201; - cvt.u32.u16 %r5203, %rs3348; - cvt.s32.s8 %r5204, %r5203; - cvt.u32.u16 %r5205, %rs3347; - cvt.s32.s8 %r5206, %r5205; - cvt.u32.u16 %r5207, %rs3346; - cvt.s32.s8 %r5208, %r5207; - mad.lo.s32 %r5209, %r42, %r5208, %r5200; - mad.lo.s32 %r5210, %r43, %r5206, %r5209; - mad.lo.s32 %r5211, %r45, %r5204, %r5210; - mad.lo.s32 %r5212, %r46, %r5202, %r5211; - ld.const.v4.u8 {%rs3354, %rs3355, %rs3356, %rs3357}, [matrix+1676]; - cvt.u32.u16 %r5213, %rs3357; - cvt.s32.s8 %r5214, %r5213; - cvt.u32.u16 %r5215, %rs3356; - cvt.s32.s8 %r5216, %r5215; - cvt.u32.u16 %r5217, %rs3355; - cvt.s32.s8 %r5218, %r5217; - cvt.u32.u16 %r5219, %rs3354; - cvt.s32.s8 %r5220, %r5219; - mad.lo.s32 %r5221, %r48, %r5220, %r5212; - mad.lo.s32 %r5222, %r49, %r5218, %r5221; - mad.lo.s32 %r5223, %r50, %r5216, %r5222; - mad.lo.s32 %r5224, %r51, %r5214, %r5223; - ld.const.v4.u8 {%rs3362, %rs3363, %rs3364, %rs3365}, [matrix+1680]; - cvt.u32.u16 %r5225, %rs3365; - cvt.s32.s8 %r5226, %r5225; - cvt.u32.u16 %r5227, %rs3364; - cvt.s32.s8 %r5228, %r5227; - cvt.u32.u16 %r5229, %rs3363; - cvt.s32.s8 %r5230, %r5229; - cvt.u32.u16 %r5231, %rs3362; - cvt.s32.s8 %r5232, %r5231; - mad.lo.s32 %r5233, %r173, %r5232, %r5224; - mad.lo.s32 %r5234, %r53, %r5230, %r5233; - mad.lo.s32 %r5235, %r54, %r5228, %r5234; - mad.lo.s32 %r5236, %r55, %r5226, %r5235; - ld.const.v4.u8 {%rs3370, %rs3371, %rs3372, %rs3373}, [matrix+1684]; - cvt.u32.u16 %r5237, %rs3373; - cvt.s32.s8 %r5238, %r5237; - cvt.u32.u16 %r5239, %rs3372; - cvt.s32.s8 %r5240, %r5239; - cvt.u32.u16 %r5241, %rs3371; - cvt.s32.s8 %r5242, %r5241; - cvt.u32.u16 %r5243, %rs3370; - cvt.s32.s8 %r5244, %r5243; - mad.lo.s32 %r5245, %r56, %r5244, %r5236; - mad.lo.s32 %r5246, %r57, %r5242, %r5245; - mad.lo.s32 %r5247, %r58, %r5240, %r5246; - mad.lo.s32 %r5248, %r59, %r5238, %r5247; - ld.const.v4.u8 {%rs3378, %rs3379, %rs3380, %rs3381}, [matrix+1688]; - cvt.u32.u16 %r5249, %rs3381; - cvt.s32.s8 %r5250, %r5249; - cvt.u32.u16 %r5251, %rs3380; - cvt.s32.s8 %r5252, %r5251; - cvt.u32.u16 %r5253, %rs3379; - cvt.s32.s8 %r5254, %r5253; - cvt.u32.u16 %r5255, %rs3378; - cvt.s32.s8 %r5256, %r5255; - mad.lo.s32 %r5257, %r61, %r5256, %r5248; - mad.lo.s32 %r5258, %r62, %r5254, %r5257; - mad.lo.s32 %r5259, %r64, %r5252, %r5258; - mad.lo.s32 %r5260, %r65, %r5250, %r5259; - ld.const.v4.u8 {%rs3386, %rs3387, %rs3388, %rs3389}, [matrix+1692]; - cvt.u32.u16 %r5261, %rs3389; - cvt.s32.s8 %r5262, %r5261; - cvt.u32.u16 %r5263, %rs3388; - cvt.s32.s8 %r5264, %r5263; - cvt.u32.u16 %r5265, %rs3387; - cvt.s32.s8 %r5266, %r5265; - cvt.u32.u16 %r5267, %rs3386; - cvt.s32.s8 %r5268, %r5267; - mad.lo.s32 %r5269, %r67, %r5268, %r5260; - mad.lo.s32 %r5270, %r68, %r5266, %r5269; - mad.lo.s32 %r5271, %r69, %r5264, %r5270; - mad.lo.s32 %r5272, %r70, %r5262, %r5271; - ld.const.v4.u8 {%rs3394, %rs3395, %rs3396, %rs3397}, [matrix+1696]; - cvt.u32.u16 %r5273, %rs3397; - cvt.s32.s8 %r5274, %r5273; - cvt.u32.u16 %r5275, %rs3396; - cvt.s32.s8 %r5276, %r5275; - cvt.u32.u16 %r5277, %rs3395; - cvt.s32.s8 %r5278, %r5277; - 
cvt.u32.u16 %r5279, %rs3394; - cvt.s32.s8 %r5280, %r5279; - mad.lo.s32 %r5281, %r222, %r5280, %r5272; - mad.lo.s32 %r5282, %r72, %r5278, %r5281; - mad.lo.s32 %r5283, %r73, %r5276, %r5282; - mad.lo.s32 %r5284, %r74, %r5274, %r5283; - ld.const.v4.u8 {%rs3402, %rs3403, %rs3404, %rs3405}, [matrix+1700]; - cvt.u32.u16 %r5285, %rs3405; - cvt.s32.s8 %r5286, %r5285; - cvt.u32.u16 %r5287, %rs3404; - cvt.s32.s8 %r5288, %r5287; - cvt.u32.u16 %r5289, %rs3403; - cvt.s32.s8 %r5290, %r5289; - cvt.u32.u16 %r5291, %rs3402; - cvt.s32.s8 %r5292, %r5291; - mad.lo.s32 %r5293, %r75, %r5292, %r5284; - mad.lo.s32 %r5294, %r76, %r5290, %r5293; - mad.lo.s32 %r5295, %r77, %r5288, %r5294; - mad.lo.s32 %r5296, %r78, %r5286, %r5295; - ld.const.v4.u8 {%rs3410, %rs3411, %rs3412, %rs3413}, [matrix+1704]; - cvt.u32.u16 %r5297, %rs3413; - cvt.s32.s8 %r5298, %r5297; - cvt.u32.u16 %r5299, %rs3412; - cvt.s32.s8 %r5300, %r5299; - cvt.u32.u16 %r5301, %rs3411; - cvt.s32.s8 %r5302, %r5301; - cvt.u32.u16 %r5303, %rs3410; - cvt.s32.s8 %r5304, %r5303; - mad.lo.s32 %r5305, %r80, %r5304, %r5296; - mad.lo.s32 %r5306, %r81, %r5302, %r5305; - mad.lo.s32 %r5307, %r83, %r5300, %r5306; - mad.lo.s32 %r5308, %r84, %r5298, %r5307; - ld.const.v4.u8 {%rs3418, %rs3419, %rs3420, %rs3421}, [matrix+1708]; - cvt.u32.u16 %r5309, %rs3421; - cvt.s32.s8 %r5310, %r5309; - cvt.u32.u16 %r5311, %rs3420; - cvt.s32.s8 %r5312, %r5311; - cvt.u32.u16 %r5313, %rs3419; - cvt.s32.s8 %r5314, %r5313; - cvt.u32.u16 %r5315, %rs3418; - cvt.s32.s8 %r5316, %r5315; - mad.lo.s32 %r5317, %r86, %r5316, %r5308; - mad.lo.s32 %r5318, %r87, %r5314, %r5317; - mad.lo.s32 %r5319, %r88, %r5312, %r5318; - mad.lo.s32 %r5320, %r89, %r5310, %r5319; - ld.const.v4.u8 {%rs3426, %rs3427, %rs3428, %rs3429}, [matrix+1712]; - cvt.u32.u16 %r5321, %rs3429; - cvt.s32.s8 %r5322, %r5321; - cvt.u32.u16 %r5323, %rs3428; - cvt.s32.s8 %r5324, %r5323; - cvt.u32.u16 %r5325, %rs3427; - cvt.s32.s8 %r5326, %r5325; - cvt.u32.u16 %r5327, %rs3426; - cvt.s32.s8 %r5328, %r5327; - mad.lo.s32 %r5329, %r271, %r5328, %r5320; - mad.lo.s32 %r5330, %r91, %r5326, %r5329; - mad.lo.s32 %r5331, %r93, %r5324, %r5330; - mad.lo.s32 %r5332, %r94, %r5322, %r5331; - ld.const.v4.u8 {%rs3434, %rs3435, %rs3436, %rs3437}, [matrix+1716]; - cvt.u32.u16 %r5333, %rs3437; - cvt.s32.s8 %r5334, %r5333; - cvt.u32.u16 %r5335, %rs3436; - cvt.s32.s8 %r5336, %r5335; - cvt.u32.u16 %r5337, %rs3435; - cvt.s32.s8 %r5338, %r5337; - cvt.u32.u16 %r5339, %rs3434; - cvt.s32.s8 %r5340, %r5339; - mad.lo.s32 %r5341, %r96, %r5340, %r5332; - mad.lo.s32 %r5342, %r97, %r5338, %r5341; - mad.lo.s32 %r5343, %r99, %r5336, %r5342; - mad.lo.s32 %r5344, %r100, %r5334, %r5343; - ld.const.v4.u8 {%rs3442, %rs3443, %rs3444, %rs3445}, [matrix+1720]; - cvt.u32.u16 %r5345, %rs3445; - cvt.s32.s8 %r5346, %r5345; - cvt.u32.u16 %r5347, %rs3444; - cvt.s32.s8 %r5348, %r5347; - cvt.u32.u16 %r5349, %rs3443; - cvt.s32.s8 %r5350, %r5349; - cvt.u32.u16 %r5351, %rs3442; - cvt.s32.s8 %r5352, %r5351; - mad.lo.s32 %r5353, %r103, %r5352, %r5344; - mad.lo.s32 %r5354, %r104, %r5350, %r5353; - mad.lo.s32 %r5355, %r107, %r5348, %r5354; - mad.lo.s32 %r5356, %r108, %r5346, %r5355; - ld.const.v4.u8 {%rs3450, %rs3451, %rs3452, %rs3453}, [matrix+1724]; - cvt.u32.u16 %r5357, %rs3453; - cvt.s32.s8 %r5358, %r5357; - cvt.u32.u16 %r5359, %rs3452; - cvt.s32.s8 %r5360, %r5359; - cvt.u32.u16 %r5361, %rs3451; - cvt.s32.s8 %r5362, %r5361; - cvt.u32.u16 %r5363, %rs3450; - cvt.s32.s8 %r5364, %r5363; - mad.lo.s32 %r5365, %r111, %r5364, %r5356; - mad.lo.s32 %r5366, %r112, %r5362, %r5365; - mad.lo.s32 %r5367, %r114, 
%r5360, %r5366; - mad.lo.s32 %r5368, %r115, %r5358, %r5367; - ld.const.v4.u8 {%rs3458, %rs3459, %rs3460, %rs3461}, [matrix+1728]; - cvt.u32.u16 %r5369, %rs3461; - cvt.s32.s8 %r5370, %r5369; - cvt.u32.u16 %r5371, %rs3460; - cvt.s32.s8 %r5372, %r5371; - cvt.u32.u16 %r5373, %rs3458; - cvt.s32.s8 %r5374, %r5373; - cvt.u32.u16 %r5375, %rs3459; - cvt.s32.s8 %r5376, %r5375; - mul.lo.s32 %r5377, %r34, %r5376; - mad.lo.s32 %r5378, %r124, %r5374, %r5377; - mad.lo.s32 %r5379, %r35, %r5372, %r5378; - mad.lo.s32 %r5380, %r36, %r5370, %r5379; - ld.const.v4.u8 {%rs3466, %rs3467, %rs3468, %rs3469}, [matrix+1732]; - cvt.u32.u16 %r5381, %rs3469; - cvt.s32.s8 %r5382, %r5381; - cvt.u32.u16 %r5383, %rs3468; - cvt.s32.s8 %r5384, %r5383; - cvt.u32.u16 %r5385, %rs3467; - cvt.s32.s8 %r5386, %r5385; - cvt.u32.u16 %r5387, %rs3466; - cvt.s32.s8 %r5388, %r5387; - mad.lo.s32 %r5389, %r37, %r5388, %r5380; - mad.lo.s32 %r5390, %r38, %r5386, %r5389; - mad.lo.s32 %r5391, %r39, %r5384, %r5390; - mad.lo.s32 %r5392, %r40, %r5382, %r5391; - ld.const.v4.u8 {%rs3474, %rs3475, %rs3476, %rs3477}, [matrix+1736]; - cvt.u32.u16 %r5393, %rs3477; - cvt.s32.s8 %r5394, %r5393; - cvt.u32.u16 %r5395, %rs3476; - cvt.s32.s8 %r5396, %r5395; - cvt.u32.u16 %r5397, %rs3475; - cvt.s32.s8 %r5398, %r5397; - cvt.u32.u16 %r5399, %rs3474; - cvt.s32.s8 %r5400, %r5399; - mad.lo.s32 %r5401, %r42, %r5400, %r5392; - mad.lo.s32 %r5402, %r43, %r5398, %r5401; - mad.lo.s32 %r5403, %r45, %r5396, %r5402; - mad.lo.s32 %r5404, %r46, %r5394, %r5403; - ld.const.v4.u8 {%rs3482, %rs3483, %rs3484, %rs3485}, [matrix+1740]; - cvt.u32.u16 %r5405, %rs3485; - cvt.s32.s8 %r5406, %r5405; - cvt.u32.u16 %r5407, %rs3484; - cvt.s32.s8 %r5408, %r5407; - cvt.u32.u16 %r5409, %rs3483; - cvt.s32.s8 %r5410, %r5409; - cvt.u32.u16 %r5411, %rs3482; - cvt.s32.s8 %r5412, %r5411; - mad.lo.s32 %r5413, %r48, %r5412, %r5404; - mad.lo.s32 %r5414, %r49, %r5410, %r5413; - mad.lo.s32 %r5415, %r50, %r5408, %r5414; - mad.lo.s32 %r5416, %r51, %r5406, %r5415; - ld.const.v4.u8 {%rs3490, %rs3491, %rs3492, %rs3493}, [matrix+1744]; - cvt.u32.u16 %r5417, %rs3493; - cvt.s32.s8 %r5418, %r5417; - cvt.u32.u16 %r5419, %rs3492; - cvt.s32.s8 %r5420, %r5419; - cvt.u32.u16 %r5421, %rs3491; - cvt.s32.s8 %r5422, %r5421; - cvt.u32.u16 %r5423, %rs3490; - cvt.s32.s8 %r5424, %r5423; - mad.lo.s32 %r5425, %r173, %r5424, %r5416; - mad.lo.s32 %r5426, %r53, %r5422, %r5425; - mad.lo.s32 %r5427, %r54, %r5420, %r5426; - mad.lo.s32 %r5428, %r55, %r5418, %r5427; - ld.const.v4.u8 {%rs3498, %rs3499, %rs3500, %rs3501}, [matrix+1748]; - cvt.u32.u16 %r5429, %rs3501; - cvt.s32.s8 %r5430, %r5429; - cvt.u32.u16 %r5431, %rs3500; - cvt.s32.s8 %r5432, %r5431; - cvt.u32.u16 %r5433, %rs3499; - cvt.s32.s8 %r5434, %r5433; - cvt.u32.u16 %r5435, %rs3498; - cvt.s32.s8 %r5436, %r5435; - mad.lo.s32 %r5437, %r56, %r5436, %r5428; - mad.lo.s32 %r5438, %r57, %r5434, %r5437; - mad.lo.s32 %r5439, %r58, %r5432, %r5438; - mad.lo.s32 %r5440, %r59, %r5430, %r5439; - ld.const.v4.u8 {%rs3506, %rs3507, %rs3508, %rs3509}, [matrix+1752]; - cvt.u32.u16 %r5441, %rs3509; - cvt.s32.s8 %r5442, %r5441; - cvt.u32.u16 %r5443, %rs3508; - cvt.s32.s8 %r5444, %r5443; - cvt.u32.u16 %r5445, %rs3507; - cvt.s32.s8 %r5446, %r5445; - cvt.u32.u16 %r5447, %rs3506; - cvt.s32.s8 %r5448, %r5447; - mad.lo.s32 %r5449, %r61, %r5448, %r5440; - mad.lo.s32 %r5450, %r62, %r5446, %r5449; - mad.lo.s32 %r5451, %r64, %r5444, %r5450; - mad.lo.s32 %r5452, %r65, %r5442, %r5451; - ld.const.v4.u8 {%rs3514, %rs3515, %rs3516, %rs3517}, [matrix+1756]; - cvt.u32.u16 %r5453, %rs3517; - cvt.s32.s8 %r5454, 
%r5453; - cvt.u32.u16 %r5455, %rs3516; - cvt.s32.s8 %r5456, %r5455; - cvt.u32.u16 %r5457, %rs3515; - cvt.s32.s8 %r5458, %r5457; - cvt.u32.u16 %r5459, %rs3514; - cvt.s32.s8 %r5460, %r5459; - mad.lo.s32 %r5461, %r67, %r5460, %r5452; - mad.lo.s32 %r5462, %r68, %r5458, %r5461; - mad.lo.s32 %r5463, %r69, %r5456, %r5462; - mad.lo.s32 %r5464, %r70, %r5454, %r5463; - ld.const.v4.u8 {%rs3522, %rs3523, %rs3524, %rs3525}, [matrix+1760]; - cvt.u32.u16 %r5465, %rs3525; - cvt.s32.s8 %r5466, %r5465; - cvt.u32.u16 %r5467, %rs3524; - cvt.s32.s8 %r5468, %r5467; - cvt.u32.u16 %r5469, %rs3523; - cvt.s32.s8 %r5470, %r5469; - cvt.u32.u16 %r5471, %rs3522; - cvt.s32.s8 %r5472, %r5471; - mad.lo.s32 %r5473, %r222, %r5472, %r5464; - mad.lo.s32 %r5474, %r72, %r5470, %r5473; - mad.lo.s32 %r5475, %r73, %r5468, %r5474; - mad.lo.s32 %r5476, %r74, %r5466, %r5475; - ld.const.v4.u8 {%rs3530, %rs3531, %rs3532, %rs3533}, [matrix+1764]; - cvt.u32.u16 %r5477, %rs3533; - cvt.s32.s8 %r5478, %r5477; - cvt.u32.u16 %r5479, %rs3532; - cvt.s32.s8 %r5480, %r5479; - cvt.u32.u16 %r5481, %rs3531; - cvt.s32.s8 %r5482, %r5481; - cvt.u32.u16 %r5483, %rs3530; - cvt.s32.s8 %r5484, %r5483; - mad.lo.s32 %r5485, %r75, %r5484, %r5476; - mad.lo.s32 %r5486, %r76, %r5482, %r5485; - mad.lo.s32 %r5487, %r77, %r5480, %r5486; - mad.lo.s32 %r5488, %r78, %r5478, %r5487; - ld.const.v4.u8 {%rs3538, %rs3539, %rs3540, %rs3541}, [matrix+1768]; - cvt.u32.u16 %r5489, %rs3541; - cvt.s32.s8 %r5490, %r5489; - cvt.u32.u16 %r5491, %rs3540; - cvt.s32.s8 %r5492, %r5491; - cvt.u32.u16 %r5493, %rs3539; - cvt.s32.s8 %r5494, %r5493; - cvt.u32.u16 %r5495, %rs3538; - cvt.s32.s8 %r5496, %r5495; - mad.lo.s32 %r5497, %r80, %r5496, %r5488; - mad.lo.s32 %r5498, %r81, %r5494, %r5497; - mad.lo.s32 %r5499, %r83, %r5492, %r5498; - mad.lo.s32 %r5500, %r84, %r5490, %r5499; - ld.const.v4.u8 {%rs3546, %rs3547, %rs3548, %rs3549}, [matrix+1772]; - cvt.u32.u16 %r5501, %rs3549; - cvt.s32.s8 %r5502, %r5501; - cvt.u32.u16 %r5503, %rs3548; - cvt.s32.s8 %r5504, %r5503; - cvt.u32.u16 %r5505, %rs3547; - cvt.s32.s8 %r5506, %r5505; - cvt.u32.u16 %r5507, %rs3546; - cvt.s32.s8 %r5508, %r5507; - mad.lo.s32 %r5509, %r86, %r5508, %r5500; - mad.lo.s32 %r5510, %r87, %r5506, %r5509; - mad.lo.s32 %r5511, %r88, %r5504, %r5510; - mad.lo.s32 %r5512, %r89, %r5502, %r5511; - ld.const.v4.u8 {%rs3554, %rs3555, %rs3556, %rs3557}, [matrix+1776]; - cvt.u32.u16 %r5513, %rs3557; - cvt.s32.s8 %r5514, %r5513; - cvt.u32.u16 %r5515, %rs3556; - cvt.s32.s8 %r5516, %r5515; - cvt.u32.u16 %r5517, %rs3555; - cvt.s32.s8 %r5518, %r5517; - cvt.u32.u16 %r5519, %rs3554; - cvt.s32.s8 %r5520, %r5519; - mad.lo.s32 %r5521, %r271, %r5520, %r5512; - mad.lo.s32 %r5522, %r91, %r5518, %r5521; - mad.lo.s32 %r5523, %r93, %r5516, %r5522; - mad.lo.s32 %r5524, %r94, %r5514, %r5523; - ld.const.v4.u8 {%rs3562, %rs3563, %rs3564, %rs3565}, [matrix+1780]; - cvt.u32.u16 %r5525, %rs3565; - cvt.s32.s8 %r5526, %r5525; - cvt.u32.u16 %r5527, %rs3564; - cvt.s32.s8 %r5528, %r5527; - cvt.u32.u16 %r5529, %rs3563; - cvt.s32.s8 %r5530, %r5529; - cvt.u32.u16 %r5531, %rs3562; - cvt.s32.s8 %r5532, %r5531; - mad.lo.s32 %r5533, %r96, %r5532, %r5524; - mad.lo.s32 %r5534, %r97, %r5530, %r5533; - mad.lo.s32 %r5535, %r99, %r5528, %r5534; - mad.lo.s32 %r5536, %r100, %r5526, %r5535; - ld.const.v4.u8 {%rs3570, %rs3571, %rs3572, %rs3573}, [matrix+1784]; - cvt.u32.u16 %r5537, %rs3573; - cvt.s32.s8 %r5538, %r5537; - cvt.u32.u16 %r5539, %rs3572; - cvt.s32.s8 %r5540, %r5539; - cvt.u32.u16 %r5541, %rs3571; - cvt.s32.s8 %r5542, %r5541; - cvt.u32.u16 %r5543, %rs3570; - cvt.s32.s8 
%r5544, %r5543; - mad.lo.s32 %r5545, %r103, %r5544, %r5536; - mad.lo.s32 %r5546, %r104, %r5542, %r5545; - mad.lo.s32 %r5547, %r107, %r5540, %r5546; - mad.lo.s32 %r5548, %r108, %r5538, %r5547; - ld.const.v4.u8 {%rs3578, %rs3579, %rs3580, %rs3581}, [matrix+1788]; - cvt.u32.u16 %r5549, %rs3581; - cvt.s32.s8 %r5550, %r5549; - cvt.u32.u16 %r5551, %rs3580; - cvt.s32.s8 %r5552, %r5551; - cvt.u32.u16 %r5553, %rs3579; - cvt.s32.s8 %r5554, %r5553; - cvt.u32.u16 %r5555, %rs3578; - cvt.s32.s8 %r5556, %r5555; - mad.lo.s32 %r5557, %r111, %r5556, %r5548; - mad.lo.s32 %r5558, %r112, %r5554, %r5557; - mad.lo.s32 %r5559, %r114, %r5552, %r5558; - mad.lo.s32 %r5560, %r115, %r5550, %r5559; - shr.u32 %r5561, %r5368, 6; - and.b32 %r5562, %r5561, 240; - shr.u32 %r5563, %r5560, 10; - or.b32 %r5564, %r5563, %r5562; - xor.b32 %r5565, %r24, %r5564; - cvt.u64.u32 %rd391, %r5565; - ld.const.v4.u8 {%rs3586, %rs3587, %rs3588, %rs3589}, [matrix+1792]; - cvt.u32.u16 %r5566, %rs3589; - cvt.s32.s8 %r5567, %r5566; - cvt.u32.u16 %r5568, %rs3588; - cvt.s32.s8 %r5569, %r5568; - cvt.u32.u16 %r5570, %rs3586; - cvt.s32.s8 %r5571, %r5570; - cvt.u32.u16 %r5572, %rs3587; - cvt.s32.s8 %r5573, %r5572; - mul.lo.s32 %r5574, %r34, %r5573; - mad.lo.s32 %r5575, %r124, %r5571, %r5574; - mad.lo.s32 %r5576, %r35, %r5569, %r5575; - mad.lo.s32 %r5577, %r36, %r5567, %r5576; - ld.const.v4.u8 {%rs3594, %rs3595, %rs3596, %rs3597}, [matrix+1796]; - cvt.u32.u16 %r5578, %rs3597; - cvt.s32.s8 %r5579, %r5578; - cvt.u32.u16 %r5580, %rs3596; - cvt.s32.s8 %r5581, %r5580; - cvt.u32.u16 %r5582, %rs3595; - cvt.s32.s8 %r5583, %r5582; - cvt.u32.u16 %r5584, %rs3594; - cvt.s32.s8 %r5585, %r5584; - mad.lo.s32 %r5586, %r37, %r5585, %r5577; - mad.lo.s32 %r5587, %r38, %r5583, %r5586; - mad.lo.s32 %r5588, %r39, %r5581, %r5587; - mad.lo.s32 %r5589, %r40, %r5579, %r5588; - ld.const.v4.u8 {%rs3602, %rs3603, %rs3604, %rs3605}, [matrix+1800]; - cvt.u32.u16 %r5590, %rs3605; - cvt.s32.s8 %r5591, %r5590; - cvt.u32.u16 %r5592, %rs3604; - cvt.s32.s8 %r5593, %r5592; - cvt.u32.u16 %r5594, %rs3603; - cvt.s32.s8 %r5595, %r5594; - cvt.u32.u16 %r5596, %rs3602; - cvt.s32.s8 %r5597, %r5596; - mad.lo.s32 %r5598, %r42, %r5597, %r5589; - mad.lo.s32 %r5599, %r43, %r5595, %r5598; - mad.lo.s32 %r5600, %r45, %r5593, %r5599; - mad.lo.s32 %r5601, %r46, %r5591, %r5600; - ld.const.v4.u8 {%rs3610, %rs3611, %rs3612, %rs3613}, [matrix+1804]; - cvt.u32.u16 %r5602, %rs3613; - cvt.s32.s8 %r5603, %r5602; - cvt.u32.u16 %r5604, %rs3612; - cvt.s32.s8 %r5605, %r5604; - cvt.u32.u16 %r5606, %rs3611; - cvt.s32.s8 %r5607, %r5606; - cvt.u32.u16 %r5608, %rs3610; - cvt.s32.s8 %r5609, %r5608; - mad.lo.s32 %r5610, %r48, %r5609, %r5601; - mad.lo.s32 %r5611, %r49, %r5607, %r5610; - mad.lo.s32 %r5612, %r50, %r5605, %r5611; - mad.lo.s32 %r5613, %r51, %r5603, %r5612; - ld.const.v4.u8 {%rs3618, %rs3619, %rs3620, %rs3621}, [matrix+1808]; - cvt.u32.u16 %r5614, %rs3621; - cvt.s32.s8 %r5615, %r5614; - cvt.u32.u16 %r5616, %rs3620; - cvt.s32.s8 %r5617, %r5616; - cvt.u32.u16 %r5618, %rs3619; - cvt.s32.s8 %r5619, %r5618; - cvt.u32.u16 %r5620, %rs3618; - cvt.s32.s8 %r5621, %r5620; - mad.lo.s32 %r5622, %r173, %r5621, %r5613; - mad.lo.s32 %r5623, %r53, %r5619, %r5622; - mad.lo.s32 %r5624, %r54, %r5617, %r5623; - mad.lo.s32 %r5625, %r55, %r5615, %r5624; - ld.const.v4.u8 {%rs3626, %rs3627, %rs3628, %rs3629}, [matrix+1812]; - cvt.u32.u16 %r5626, %rs3629; - cvt.s32.s8 %r5627, %r5626; - cvt.u32.u16 %r5628, %rs3628; - cvt.s32.s8 %r5629, %r5628; - cvt.u32.u16 %r5630, %rs3627; - cvt.s32.s8 %r5631, %r5630; - cvt.u32.u16 %r5632, %rs3626; - 
cvt.s32.s8 %r5633, %r5632; - mad.lo.s32 %r5634, %r56, %r5633, %r5625; - mad.lo.s32 %r5635, %r57, %r5631, %r5634; - mad.lo.s32 %r5636, %r58, %r5629, %r5635; - mad.lo.s32 %r5637, %r59, %r5627, %r5636; - ld.const.v4.u8 {%rs3634, %rs3635, %rs3636, %rs3637}, [matrix+1816]; - cvt.u32.u16 %r5638, %rs3637; - cvt.s32.s8 %r5639, %r5638; - cvt.u32.u16 %r5640, %rs3636; - cvt.s32.s8 %r5641, %r5640; - cvt.u32.u16 %r5642, %rs3635; - cvt.s32.s8 %r5643, %r5642; - cvt.u32.u16 %r5644, %rs3634; - cvt.s32.s8 %r5645, %r5644; - mad.lo.s32 %r5646, %r61, %r5645, %r5637; - mad.lo.s32 %r5647, %r62, %r5643, %r5646; - mad.lo.s32 %r5648, %r64, %r5641, %r5647; - mad.lo.s32 %r5649, %r65, %r5639, %r5648; - ld.const.v4.u8 {%rs3642, %rs3643, %rs3644, %rs3645}, [matrix+1820]; - cvt.u32.u16 %r5650, %rs3645; - cvt.s32.s8 %r5651, %r5650; - cvt.u32.u16 %r5652, %rs3644; - cvt.s32.s8 %r5653, %r5652; - cvt.u32.u16 %r5654, %rs3643; - cvt.s32.s8 %r5655, %r5654; - cvt.u32.u16 %r5656, %rs3642; - cvt.s32.s8 %r5657, %r5656; - mad.lo.s32 %r5658, %r67, %r5657, %r5649; - mad.lo.s32 %r5659, %r68, %r5655, %r5658; - mad.lo.s32 %r5660, %r69, %r5653, %r5659; - mad.lo.s32 %r5661, %r70, %r5651, %r5660; - ld.const.v4.u8 {%rs3650, %rs3651, %rs3652, %rs3653}, [matrix+1824]; - cvt.u32.u16 %r5662, %rs3653; - cvt.s32.s8 %r5663, %r5662; - cvt.u32.u16 %r5664, %rs3652; - cvt.s32.s8 %r5665, %r5664; - cvt.u32.u16 %r5666, %rs3651; - cvt.s32.s8 %r5667, %r5666; - cvt.u32.u16 %r5668, %rs3650; - cvt.s32.s8 %r5669, %r5668; - mad.lo.s32 %r5670, %r222, %r5669, %r5661; - mad.lo.s32 %r5671, %r72, %r5667, %r5670; - mad.lo.s32 %r5672, %r73, %r5665, %r5671; - mad.lo.s32 %r5673, %r74, %r5663, %r5672; - ld.const.v4.u8 {%rs3658, %rs3659, %rs3660, %rs3661}, [matrix+1828]; - cvt.u32.u16 %r5674, %rs3661; - cvt.s32.s8 %r5675, %r5674; - cvt.u32.u16 %r5676, %rs3660; - cvt.s32.s8 %r5677, %r5676; - cvt.u32.u16 %r5678, %rs3659; - cvt.s32.s8 %r5679, %r5678; - cvt.u32.u16 %r5680, %rs3658; - cvt.s32.s8 %r5681, %r5680; - mad.lo.s32 %r5682, %r75, %r5681, %r5673; - mad.lo.s32 %r5683, %r76, %r5679, %r5682; - mad.lo.s32 %r5684, %r77, %r5677, %r5683; - mad.lo.s32 %r5685, %r78, %r5675, %r5684; - ld.const.v4.u8 {%rs3666, %rs3667, %rs3668, %rs3669}, [matrix+1832]; - cvt.u32.u16 %r5686, %rs3669; - cvt.s32.s8 %r5687, %r5686; - cvt.u32.u16 %r5688, %rs3668; - cvt.s32.s8 %r5689, %r5688; - cvt.u32.u16 %r5690, %rs3667; - cvt.s32.s8 %r5691, %r5690; - cvt.u32.u16 %r5692, %rs3666; - cvt.s32.s8 %r5693, %r5692; - mad.lo.s32 %r5694, %r80, %r5693, %r5685; - mad.lo.s32 %r5695, %r81, %r5691, %r5694; - mad.lo.s32 %r5696, %r83, %r5689, %r5695; - mad.lo.s32 %r5697, %r84, %r5687, %r5696; - ld.const.v4.u8 {%rs3674, %rs3675, %rs3676, %rs3677}, [matrix+1836]; - cvt.u32.u16 %r5698, %rs3677; - cvt.s32.s8 %r5699, %r5698; - cvt.u32.u16 %r5700, %rs3676; - cvt.s32.s8 %r5701, %r5700; - cvt.u32.u16 %r5702, %rs3675; - cvt.s32.s8 %r5703, %r5702; - cvt.u32.u16 %r5704, %rs3674; - cvt.s32.s8 %r5705, %r5704; - mad.lo.s32 %r5706, %r86, %r5705, %r5697; - mad.lo.s32 %r5707, %r87, %r5703, %r5706; - mad.lo.s32 %r5708, %r88, %r5701, %r5707; - mad.lo.s32 %r5709, %r89, %r5699, %r5708; - ld.const.v4.u8 {%rs3682, %rs3683, %rs3684, %rs3685}, [matrix+1840]; - cvt.u32.u16 %r5710, %rs3685; - cvt.s32.s8 %r5711, %r5710; - cvt.u32.u16 %r5712, %rs3684; - cvt.s32.s8 %r5713, %r5712; - cvt.u32.u16 %r5714, %rs3683; - cvt.s32.s8 %r5715, %r5714; - cvt.u32.u16 %r5716, %rs3682; - cvt.s32.s8 %r5717, %r5716; - mad.lo.s32 %r5718, %r271, %r5717, %r5709; - mad.lo.s32 %r5719, %r91, %r5715, %r5718; - mad.lo.s32 %r5720, %r93, %r5713, %r5719; - mad.lo.s32 %r5721, 
%r94, %r5711, %r5720; - ld.const.v4.u8 {%rs3690, %rs3691, %rs3692, %rs3693}, [matrix+1844]; - cvt.u32.u16 %r5722, %rs3693; - cvt.s32.s8 %r5723, %r5722; - cvt.u32.u16 %r5724, %rs3692; - cvt.s32.s8 %r5725, %r5724; - cvt.u32.u16 %r5726, %rs3691; - cvt.s32.s8 %r5727, %r5726; - cvt.u32.u16 %r5728, %rs3690; - cvt.s32.s8 %r5729, %r5728; - mad.lo.s32 %r5730, %r96, %r5729, %r5721; - mad.lo.s32 %r5731, %r97, %r5727, %r5730; - mad.lo.s32 %r5732, %r99, %r5725, %r5731; - mad.lo.s32 %r5733, %r100, %r5723, %r5732; - ld.const.v4.u8 {%rs3698, %rs3699, %rs3700, %rs3701}, [matrix+1848]; - cvt.u32.u16 %r5734, %rs3701; - cvt.s32.s8 %r5735, %r5734; - cvt.u32.u16 %r5736, %rs3700; - cvt.s32.s8 %r5737, %r5736; - cvt.u32.u16 %r5738, %rs3699; - cvt.s32.s8 %r5739, %r5738; - cvt.u32.u16 %r5740, %rs3698; - cvt.s32.s8 %r5741, %r5740; - mad.lo.s32 %r5742, %r103, %r5741, %r5733; - mad.lo.s32 %r5743, %r104, %r5739, %r5742; - mad.lo.s32 %r5744, %r107, %r5737, %r5743; - mad.lo.s32 %r5745, %r108, %r5735, %r5744; - ld.const.v4.u8 {%rs3706, %rs3707, %rs3708, %rs3709}, [matrix+1852]; - cvt.u32.u16 %r5746, %rs3709; - cvt.s32.s8 %r5747, %r5746; - cvt.u32.u16 %r5748, %rs3708; - cvt.s32.s8 %r5749, %r5748; - cvt.u32.u16 %r5750, %rs3707; - cvt.s32.s8 %r5751, %r5750; - cvt.u32.u16 %r5752, %rs3706; - cvt.s32.s8 %r5753, %r5752; - mad.lo.s32 %r5754, %r111, %r5753, %r5745; - mad.lo.s32 %r5755, %r112, %r5751, %r5754; - mad.lo.s32 %r5756, %r114, %r5749, %r5755; - mad.lo.s32 %r5757, %r115, %r5747, %r5756; - ld.const.v4.u8 {%rs3714, %rs3715, %rs3716, %rs3717}, [matrix+1856]; - cvt.u32.u16 %r5758, %rs3717; - cvt.s32.s8 %r5759, %r5758; - cvt.u32.u16 %r5760, %rs3716; - cvt.s32.s8 %r5761, %r5760; - cvt.u32.u16 %r5762, %rs3714; - cvt.s32.s8 %r5763, %r5762; - cvt.u32.u16 %r5764, %rs3715; - cvt.s32.s8 %r5765, %r5764; - mul.lo.s32 %r5766, %r34, %r5765; - mad.lo.s32 %r5767, %r124, %r5763, %r5766; - mad.lo.s32 %r5768, %r35, %r5761, %r5767; - mad.lo.s32 %r5769, %r36, %r5759, %r5768; - ld.const.v4.u8 {%rs3722, %rs3723, %rs3724, %rs3725}, [matrix+1860]; - cvt.u32.u16 %r5770, %rs3725; - cvt.s32.s8 %r5771, %r5770; - cvt.u32.u16 %r5772, %rs3724; - cvt.s32.s8 %r5773, %r5772; - cvt.u32.u16 %r5774, %rs3723; - cvt.s32.s8 %r5775, %r5774; - cvt.u32.u16 %r5776, %rs3722; - cvt.s32.s8 %r5777, %r5776; - mad.lo.s32 %r5778, %r37, %r5777, %r5769; - mad.lo.s32 %r5779, %r38, %r5775, %r5778; - mad.lo.s32 %r5780, %r39, %r5773, %r5779; - mad.lo.s32 %r5781, %r40, %r5771, %r5780; - ld.const.v4.u8 {%rs3730, %rs3731, %rs3732, %rs3733}, [matrix+1864]; - cvt.u32.u16 %r5782, %rs3733; - cvt.s32.s8 %r5783, %r5782; - cvt.u32.u16 %r5784, %rs3732; - cvt.s32.s8 %r5785, %r5784; - cvt.u32.u16 %r5786, %rs3731; - cvt.s32.s8 %r5787, %r5786; - cvt.u32.u16 %r5788, %rs3730; - cvt.s32.s8 %r5789, %r5788; - mad.lo.s32 %r5790, %r42, %r5789, %r5781; - mad.lo.s32 %r5791, %r43, %r5787, %r5790; - mad.lo.s32 %r5792, %r45, %r5785, %r5791; - mad.lo.s32 %r5793, %r46, %r5783, %r5792; - ld.const.v4.u8 {%rs3738, %rs3739, %rs3740, %rs3741}, [matrix+1868]; - cvt.u32.u16 %r5794, %rs3741; - cvt.s32.s8 %r5795, %r5794; - cvt.u32.u16 %r5796, %rs3740; - cvt.s32.s8 %r5797, %r5796; - cvt.u32.u16 %r5798, %rs3739; - cvt.s32.s8 %r5799, %r5798; - cvt.u32.u16 %r5800, %rs3738; - cvt.s32.s8 %r5801, %r5800; - mad.lo.s32 %r5802, %r48, %r5801, %r5793; - mad.lo.s32 %r5803, %r49, %r5799, %r5802; - mad.lo.s32 %r5804, %r50, %r5797, %r5803; - mad.lo.s32 %r5805, %r51, %r5795, %r5804; - ld.const.v4.u8 {%rs3746, %rs3747, %rs3748, %rs3749}, [matrix+1872]; - cvt.u32.u16 %r5806, %rs3749; - cvt.s32.s8 %r5807, %r5806; - cvt.u32.u16 %r5808, 
%rs3748; - cvt.s32.s8 %r5809, %r5808; - cvt.u32.u16 %r5810, %rs3747; - cvt.s32.s8 %r5811, %r5810; - cvt.u32.u16 %r5812, %rs3746; - cvt.s32.s8 %r5813, %r5812; - mad.lo.s32 %r5814, %r173, %r5813, %r5805; - mad.lo.s32 %r5815, %r53, %r5811, %r5814; - mad.lo.s32 %r5816, %r54, %r5809, %r5815; - mad.lo.s32 %r5817, %r55, %r5807, %r5816; - ld.const.v4.u8 {%rs3754, %rs3755, %rs3756, %rs3757}, [matrix+1876]; - cvt.u32.u16 %r5818, %rs3757; - cvt.s32.s8 %r5819, %r5818; - cvt.u32.u16 %r5820, %rs3756; - cvt.s32.s8 %r5821, %r5820; - cvt.u32.u16 %r5822, %rs3755; - cvt.s32.s8 %r5823, %r5822; - cvt.u32.u16 %r5824, %rs3754; - cvt.s32.s8 %r5825, %r5824; - mad.lo.s32 %r5826, %r56, %r5825, %r5817; - mad.lo.s32 %r5827, %r57, %r5823, %r5826; - mad.lo.s32 %r5828, %r58, %r5821, %r5827; - mad.lo.s32 %r5829, %r59, %r5819, %r5828; - ld.const.v4.u8 {%rs3762, %rs3763, %rs3764, %rs3765}, [matrix+1880]; - cvt.u32.u16 %r5830, %rs3765; - cvt.s32.s8 %r5831, %r5830; - cvt.u32.u16 %r5832, %rs3764; - cvt.s32.s8 %r5833, %r5832; - cvt.u32.u16 %r5834, %rs3763; - cvt.s32.s8 %r5835, %r5834; - cvt.u32.u16 %r5836, %rs3762; - cvt.s32.s8 %r5837, %r5836; - mad.lo.s32 %r5838, %r61, %r5837, %r5829; - mad.lo.s32 %r5839, %r62, %r5835, %r5838; - mad.lo.s32 %r5840, %r64, %r5833, %r5839; - mad.lo.s32 %r5841, %r65, %r5831, %r5840; - ld.const.v4.u8 {%rs3770, %rs3771, %rs3772, %rs3773}, [matrix+1884]; - cvt.u32.u16 %r5842, %rs3773; - cvt.s32.s8 %r5843, %r5842; - cvt.u32.u16 %r5844, %rs3772; - cvt.s32.s8 %r5845, %r5844; - cvt.u32.u16 %r5846, %rs3771; - cvt.s32.s8 %r5847, %r5846; - cvt.u32.u16 %r5848, %rs3770; - cvt.s32.s8 %r5849, %r5848; - mad.lo.s32 %r5850, %r67, %r5849, %r5841; - mad.lo.s32 %r5851, %r68, %r5847, %r5850; - mad.lo.s32 %r5852, %r69, %r5845, %r5851; - mad.lo.s32 %r5853, %r70, %r5843, %r5852; - ld.const.v4.u8 {%rs3778, %rs3779, %rs3780, %rs3781}, [matrix+1888]; - cvt.u32.u16 %r5854, %rs3781; - cvt.s32.s8 %r5855, %r5854; - cvt.u32.u16 %r5856, %rs3780; - cvt.s32.s8 %r5857, %r5856; - cvt.u32.u16 %r5858, %rs3779; - cvt.s32.s8 %r5859, %r5858; - cvt.u32.u16 %r5860, %rs3778; - cvt.s32.s8 %r5861, %r5860; - mad.lo.s32 %r5862, %r222, %r5861, %r5853; - mad.lo.s32 %r5863, %r72, %r5859, %r5862; - mad.lo.s32 %r5864, %r73, %r5857, %r5863; - mad.lo.s32 %r5865, %r74, %r5855, %r5864; - ld.const.v4.u8 {%rs3786, %rs3787, %rs3788, %rs3789}, [matrix+1892]; - cvt.u32.u16 %r5866, %rs3789; - cvt.s32.s8 %r5867, %r5866; - cvt.u32.u16 %r5868, %rs3788; - cvt.s32.s8 %r5869, %r5868; - cvt.u32.u16 %r5870, %rs3787; - cvt.s32.s8 %r5871, %r5870; - cvt.u32.u16 %r5872, %rs3786; - cvt.s32.s8 %r5873, %r5872; - mad.lo.s32 %r5874, %r75, %r5873, %r5865; - mad.lo.s32 %r5875, %r76, %r5871, %r5874; - mad.lo.s32 %r5876, %r77, %r5869, %r5875; - mad.lo.s32 %r5877, %r78, %r5867, %r5876; - ld.const.v4.u8 {%rs3794, %rs3795, %rs3796, %rs3797}, [matrix+1896]; - cvt.u32.u16 %r5878, %rs3797; - cvt.s32.s8 %r5879, %r5878; - cvt.u32.u16 %r5880, %rs3796; - cvt.s32.s8 %r5881, %r5880; - cvt.u32.u16 %r5882, %rs3795; - cvt.s32.s8 %r5883, %r5882; - cvt.u32.u16 %r5884, %rs3794; - cvt.s32.s8 %r5885, %r5884; - mad.lo.s32 %r5886, %r80, %r5885, %r5877; - mad.lo.s32 %r5887, %r81, %r5883, %r5886; - mad.lo.s32 %r5888, %r83, %r5881, %r5887; - mad.lo.s32 %r5889, %r84, %r5879, %r5888; - ld.const.v4.u8 {%rs3802, %rs3803, %rs3804, %rs3805}, [matrix+1900]; - cvt.u32.u16 %r5890, %rs3805; - cvt.s32.s8 %r5891, %r5890; - cvt.u32.u16 %r5892, %rs3804; - cvt.s32.s8 %r5893, %r5892; - cvt.u32.u16 %r5894, %rs3803; - cvt.s32.s8 %r5895, %r5894; - cvt.u32.u16 %r5896, %rs3802; - cvt.s32.s8 %r5897, %r5896; - mad.lo.s32 %r5898, 
%r86, %r5897, %r5889; - mad.lo.s32 %r5899, %r87, %r5895, %r5898; - mad.lo.s32 %r5900, %r88, %r5893, %r5899; - mad.lo.s32 %r5901, %r89, %r5891, %r5900; - ld.const.v4.u8 {%rs3810, %rs3811, %rs3812, %rs3813}, [matrix+1904]; - cvt.u32.u16 %r5902, %rs3813; - cvt.s32.s8 %r5903, %r5902; - cvt.u32.u16 %r5904, %rs3812; - cvt.s32.s8 %r5905, %r5904; - cvt.u32.u16 %r5906, %rs3811; - cvt.s32.s8 %r5907, %r5906; - cvt.u32.u16 %r5908, %rs3810; - cvt.s32.s8 %r5909, %r5908; - mad.lo.s32 %r5910, %r271, %r5909, %r5901; - mad.lo.s32 %r5911, %r91, %r5907, %r5910; - mad.lo.s32 %r5912, %r93, %r5905, %r5911; - mad.lo.s32 %r5913, %r94, %r5903, %r5912; - ld.const.v4.u8 {%rs3818, %rs3819, %rs3820, %rs3821}, [matrix+1908]; - cvt.u32.u16 %r5914, %rs3821; - cvt.s32.s8 %r5915, %r5914; - cvt.u32.u16 %r5916, %rs3820; - cvt.s32.s8 %r5917, %r5916; - cvt.u32.u16 %r5918, %rs3819; - cvt.s32.s8 %r5919, %r5918; - cvt.u32.u16 %r5920, %rs3818; - cvt.s32.s8 %r5921, %r5920; - mad.lo.s32 %r5922, %r96, %r5921, %r5913; - mad.lo.s32 %r5923, %r97, %r5919, %r5922; - mad.lo.s32 %r5924, %r99, %r5917, %r5923; - mad.lo.s32 %r5925, %r100, %r5915, %r5924; - ld.const.v4.u8 {%rs3826, %rs3827, %rs3828, %rs3829}, [matrix+1912]; - cvt.u32.u16 %r5926, %rs3829; - cvt.s32.s8 %r5927, %r5926; - cvt.u32.u16 %r5928, %rs3828; - cvt.s32.s8 %r5929, %r5928; - cvt.u32.u16 %r5930, %rs3827; - cvt.s32.s8 %r5931, %r5930; - cvt.u32.u16 %r5932, %rs3826; - cvt.s32.s8 %r5933, %r5932; - mad.lo.s32 %r5934, %r103, %r5933, %r5925; - mad.lo.s32 %r5935, %r104, %r5931, %r5934; - mad.lo.s32 %r5936, %r107, %r5929, %r5935; - mad.lo.s32 %r5937, %r108, %r5927, %r5936; - ld.const.v4.u8 {%rs3834, %rs3835, %rs3836, %rs3837}, [matrix+1916]; - cvt.u32.u16 %r5938, %rs3837; - cvt.s32.s8 %r5939, %r5938; - cvt.u32.u16 %r5940, %rs3836; - cvt.s32.s8 %r5941, %r5940; - cvt.u32.u16 %r5942, %rs3835; - cvt.s32.s8 %r5943, %r5942; - cvt.u32.u16 %r5944, %rs3834; - cvt.s32.s8 %r5945, %r5944; - mad.lo.s32 %r5946, %r111, %r5945, %r5937; - mad.lo.s32 %r5947, %r112, %r5943, %r5946; - mad.lo.s32 %r5948, %r114, %r5941, %r5947; - mad.lo.s32 %r5949, %r115, %r5939, %r5948; - shr.u32 %r5950, %r5757, 6; - and.b32 %r5951, %r5950, 240; - shr.u32 %r5952, %r5949, 10; - or.b32 %r5953, %r5952, %r5951; - xor.b32 %r5954, %r25, %r5953; - cvt.u64.u32 %rd392, %r5954; - ld.const.v4.u8 {%rs3842, %rs3843, %rs3844, %rs3845}, [matrix+1920]; - cvt.u32.u16 %r5955, %rs3845; - cvt.s32.s8 %r5956, %r5955; - cvt.u32.u16 %r5957, %rs3844; - cvt.s32.s8 %r5958, %r5957; - cvt.u32.u16 %r5959, %rs3842; - cvt.s32.s8 %r5960, %r5959; - cvt.u32.u16 %r5961, %rs3843; - cvt.s32.s8 %r5962, %r5961; - mul.lo.s32 %r5963, %r34, %r5962; - mad.lo.s32 %r5964, %r124, %r5960, %r5963; - mad.lo.s32 %r5965, %r35, %r5958, %r5964; - mad.lo.s32 %r5966, %r36, %r5956, %r5965; - ld.const.v4.u8 {%rs3850, %rs3851, %rs3852, %rs3853}, [matrix+1924]; - cvt.u32.u16 %r5967, %rs3853; - cvt.s32.s8 %r5968, %r5967; - cvt.u32.u16 %r5969, %rs3852; - cvt.s32.s8 %r5970, %r5969; - cvt.u32.u16 %r5971, %rs3851; - cvt.s32.s8 %r5972, %r5971; - cvt.u32.u16 %r5973, %rs3850; - cvt.s32.s8 %r5974, %r5973; - mad.lo.s32 %r5975, %r37, %r5974, %r5966; - mad.lo.s32 %r5976, %r38, %r5972, %r5975; - mad.lo.s32 %r5977, %r39, %r5970, %r5976; - mad.lo.s32 %r5978, %r40, %r5968, %r5977; - ld.const.v4.u8 {%rs3858, %rs3859, %rs3860, %rs3861}, [matrix+1928]; - cvt.u32.u16 %r5979, %rs3861; - cvt.s32.s8 %r5980, %r5979; - cvt.u32.u16 %r5981, %rs3860; - cvt.s32.s8 %r5982, %r5981; - cvt.u32.u16 %r5983, %rs3859; - cvt.s32.s8 %r5984, %r5983; - cvt.u32.u16 %r5985, %rs3858; - cvt.s32.s8 %r5986, %r5985; - mad.lo.s32 
%r5987, %r42, %r5986, %r5978; - mad.lo.s32 %r5988, %r43, %r5984, %r5987; - mad.lo.s32 %r5989, %r45, %r5982, %r5988; - mad.lo.s32 %r5990, %r46, %r5980, %r5989; - ld.const.v4.u8 {%rs3866, %rs3867, %rs3868, %rs3869}, [matrix+1932]; - cvt.u32.u16 %r5991, %rs3869; - cvt.s32.s8 %r5992, %r5991; - cvt.u32.u16 %r5993, %rs3868; - cvt.s32.s8 %r5994, %r5993; - cvt.u32.u16 %r5995, %rs3867; - cvt.s32.s8 %r5996, %r5995; - cvt.u32.u16 %r5997, %rs3866; - cvt.s32.s8 %r5998, %r5997; - mad.lo.s32 %r5999, %r48, %r5998, %r5990; - mad.lo.s32 %r6000, %r49, %r5996, %r5999; - mad.lo.s32 %r6001, %r50, %r5994, %r6000; - mad.lo.s32 %r6002, %r51, %r5992, %r6001; - ld.const.v4.u8 {%rs3874, %rs3875, %rs3876, %rs3877}, [matrix+1936]; - cvt.u32.u16 %r6003, %rs3877; - cvt.s32.s8 %r6004, %r6003; - cvt.u32.u16 %r6005, %rs3876; - cvt.s32.s8 %r6006, %r6005; - cvt.u32.u16 %r6007, %rs3875; - cvt.s32.s8 %r6008, %r6007; - cvt.u32.u16 %r6009, %rs3874; - cvt.s32.s8 %r6010, %r6009; - mad.lo.s32 %r6011, %r173, %r6010, %r6002; - mad.lo.s32 %r6012, %r53, %r6008, %r6011; - mad.lo.s32 %r6013, %r54, %r6006, %r6012; - mad.lo.s32 %r6014, %r55, %r6004, %r6013; - ld.const.v4.u8 {%rs3882, %rs3883, %rs3884, %rs3885}, [matrix+1940]; - cvt.u32.u16 %r6015, %rs3885; - cvt.s32.s8 %r6016, %r6015; - cvt.u32.u16 %r6017, %rs3884; - cvt.s32.s8 %r6018, %r6017; - cvt.u32.u16 %r6019, %rs3883; - cvt.s32.s8 %r6020, %r6019; - cvt.u32.u16 %r6021, %rs3882; - cvt.s32.s8 %r6022, %r6021; - mad.lo.s32 %r6023, %r56, %r6022, %r6014; - mad.lo.s32 %r6024, %r57, %r6020, %r6023; - mad.lo.s32 %r6025, %r58, %r6018, %r6024; - mad.lo.s32 %r6026, %r59, %r6016, %r6025; - ld.const.v4.u8 {%rs3890, %rs3891, %rs3892, %rs3893}, [matrix+1944]; - cvt.u32.u16 %r6027, %rs3893; - cvt.s32.s8 %r6028, %r6027; - cvt.u32.u16 %r6029, %rs3892; - cvt.s32.s8 %r6030, %r6029; - cvt.u32.u16 %r6031, %rs3891; - cvt.s32.s8 %r6032, %r6031; - cvt.u32.u16 %r6033, %rs3890; - cvt.s32.s8 %r6034, %r6033; - mad.lo.s32 %r6035, %r61, %r6034, %r6026; - mad.lo.s32 %r6036, %r62, %r6032, %r6035; - mad.lo.s32 %r6037, %r64, %r6030, %r6036; - mad.lo.s32 %r6038, %r65, %r6028, %r6037; - ld.const.v4.u8 {%rs3898, %rs3899, %rs3900, %rs3901}, [matrix+1948]; - cvt.u32.u16 %r6039, %rs3901; - cvt.s32.s8 %r6040, %r6039; - cvt.u32.u16 %r6041, %rs3900; - cvt.s32.s8 %r6042, %r6041; - cvt.u32.u16 %r6043, %rs3899; - cvt.s32.s8 %r6044, %r6043; - cvt.u32.u16 %r6045, %rs3898; - cvt.s32.s8 %r6046, %r6045; - mad.lo.s32 %r6047, %r67, %r6046, %r6038; - mad.lo.s32 %r6048, %r68, %r6044, %r6047; - mad.lo.s32 %r6049, %r69, %r6042, %r6048; - mad.lo.s32 %r6050, %r70, %r6040, %r6049; - ld.const.v4.u8 {%rs3906, %rs3907, %rs3908, %rs3909}, [matrix+1952]; - cvt.u32.u16 %r6051, %rs3909; - cvt.s32.s8 %r6052, %r6051; - cvt.u32.u16 %r6053, %rs3908; - cvt.s32.s8 %r6054, %r6053; - cvt.u32.u16 %r6055, %rs3907; - cvt.s32.s8 %r6056, %r6055; - cvt.u32.u16 %r6057, %rs3906; - cvt.s32.s8 %r6058, %r6057; - mad.lo.s32 %r6059, %r222, %r6058, %r6050; - mad.lo.s32 %r6060, %r72, %r6056, %r6059; - mad.lo.s32 %r6061, %r73, %r6054, %r6060; - mad.lo.s32 %r6062, %r74, %r6052, %r6061; - ld.const.v4.u8 {%rs3914, %rs3915, %rs3916, %rs3917}, [matrix+1956]; - cvt.u32.u16 %r6063, %rs3917; - cvt.s32.s8 %r6064, %r6063; - cvt.u32.u16 %r6065, %rs3916; - cvt.s32.s8 %r6066, %r6065; - cvt.u32.u16 %r6067, %rs3915; - cvt.s32.s8 %r6068, %r6067; - cvt.u32.u16 %r6069, %rs3914; - cvt.s32.s8 %r6070, %r6069; - mad.lo.s32 %r6071, %r75, %r6070, %r6062; - mad.lo.s32 %r6072, %r76, %r6068, %r6071; - mad.lo.s32 %r6073, %r77, %r6066, %r6072; - mad.lo.s32 %r6074, %r78, %r6064, %r6073; - ld.const.v4.u8 
{%rs3922, %rs3923, %rs3924, %rs3925}, [matrix+1960]; - cvt.u32.u16 %r6075, %rs3925; - cvt.s32.s8 %r6076, %r6075; - cvt.u32.u16 %r6077, %rs3924; - cvt.s32.s8 %r6078, %r6077; - cvt.u32.u16 %r6079, %rs3923; - cvt.s32.s8 %r6080, %r6079; - cvt.u32.u16 %r6081, %rs3922; - cvt.s32.s8 %r6082, %r6081; - mad.lo.s32 %r6083, %r80, %r6082, %r6074; - mad.lo.s32 %r6084, %r81, %r6080, %r6083; - mad.lo.s32 %r6085, %r83, %r6078, %r6084; - mad.lo.s32 %r6086, %r84, %r6076, %r6085; - ld.const.v4.u8 {%rs3930, %rs3931, %rs3932, %rs3933}, [matrix+1964]; - cvt.u32.u16 %r6087, %rs3933; - cvt.s32.s8 %r6088, %r6087; - cvt.u32.u16 %r6089, %rs3932; - cvt.s32.s8 %r6090, %r6089; - cvt.u32.u16 %r6091, %rs3931; - cvt.s32.s8 %r6092, %r6091; - cvt.u32.u16 %r6093, %rs3930; - cvt.s32.s8 %r6094, %r6093; - mad.lo.s32 %r6095, %r86, %r6094, %r6086; - mad.lo.s32 %r6096, %r87, %r6092, %r6095; - mad.lo.s32 %r6097, %r88, %r6090, %r6096; - mad.lo.s32 %r6098, %r89, %r6088, %r6097; - ld.const.v4.u8 {%rs3938, %rs3939, %rs3940, %rs3941}, [matrix+1968]; - cvt.u32.u16 %r6099, %rs3941; - cvt.s32.s8 %r6100, %r6099; - cvt.u32.u16 %r6101, %rs3940; - cvt.s32.s8 %r6102, %r6101; - cvt.u32.u16 %r6103, %rs3939; - cvt.s32.s8 %r6104, %r6103; - cvt.u32.u16 %r6105, %rs3938; - cvt.s32.s8 %r6106, %r6105; - mad.lo.s32 %r6107, %r271, %r6106, %r6098; - mad.lo.s32 %r6108, %r91, %r6104, %r6107; - mad.lo.s32 %r6109, %r93, %r6102, %r6108; - mad.lo.s32 %r6110, %r94, %r6100, %r6109; - ld.const.v4.u8 {%rs3946, %rs3947, %rs3948, %rs3949}, [matrix+1972]; - cvt.u32.u16 %r6111, %rs3949; - cvt.s32.s8 %r6112, %r6111; - cvt.u32.u16 %r6113, %rs3948; - cvt.s32.s8 %r6114, %r6113; - cvt.u32.u16 %r6115, %rs3947; - cvt.s32.s8 %r6116, %r6115; - cvt.u32.u16 %r6117, %rs3946; - cvt.s32.s8 %r6118, %r6117; - mad.lo.s32 %r6119, %r96, %r6118, %r6110; - mad.lo.s32 %r6120, %r97, %r6116, %r6119; - mad.lo.s32 %r6121, %r99, %r6114, %r6120; - mad.lo.s32 %r6122, %r100, %r6112, %r6121; - ld.const.v4.u8 {%rs3954, %rs3955, %rs3956, %rs3957}, [matrix+1976]; - cvt.u32.u16 %r6123, %rs3957; - cvt.s32.s8 %r6124, %r6123; - cvt.u32.u16 %r6125, %rs3956; - cvt.s32.s8 %r6126, %r6125; - cvt.u32.u16 %r6127, %rs3955; - cvt.s32.s8 %r6128, %r6127; - cvt.u32.u16 %r6129, %rs3954; - cvt.s32.s8 %r6130, %r6129; - mad.lo.s32 %r6131, %r103, %r6130, %r6122; - mad.lo.s32 %r6132, %r104, %r6128, %r6131; - mad.lo.s32 %r6133, %r107, %r6126, %r6132; - mad.lo.s32 %r6134, %r108, %r6124, %r6133; - ld.const.v4.u8 {%rs3962, %rs3963, %rs3964, %rs3965}, [matrix+1980]; - cvt.u32.u16 %r6135, %rs3965; - cvt.s32.s8 %r6136, %r6135; - cvt.u32.u16 %r6137, %rs3964; - cvt.s32.s8 %r6138, %r6137; - cvt.u32.u16 %r6139, %rs3963; - cvt.s32.s8 %r6140, %r6139; - cvt.u32.u16 %r6141, %rs3962; - cvt.s32.s8 %r6142, %r6141; - mad.lo.s32 %r6143, %r111, %r6142, %r6134; - mad.lo.s32 %r6144, %r112, %r6140, %r6143; - mad.lo.s32 %r6145, %r114, %r6138, %r6144; - mad.lo.s32 %r6146, %r115, %r6136, %r6145; - ld.const.v4.u8 {%rs3970, %rs3971, %rs3972, %rs3973}, [matrix+1984]; - cvt.u32.u16 %r6147, %rs3973; - cvt.s32.s8 %r6148, %r6147; - cvt.u32.u16 %r6149, %rs3972; - cvt.s32.s8 %r6150, %r6149; - cvt.u32.u16 %r6151, %rs3970; - cvt.s32.s8 %r6152, %r6151; - cvt.u32.u16 %r6153, %rs3971; - cvt.s32.s8 %r6154, %r6153; - mul.lo.s32 %r6155, %r34, %r6154; - mad.lo.s32 %r6156, %r124, %r6152, %r6155; - mad.lo.s32 %r6157, %r35, %r6150, %r6156; - mad.lo.s32 %r6158, %r36, %r6148, %r6157; - ld.const.v4.u8 {%rs3978, %rs3979, %rs3980, %rs3981}, [matrix+1988]; - cvt.u32.u16 %r6159, %rs3981; - cvt.s32.s8 %r6160, %r6159; - cvt.u32.u16 %r6161, %rs3980; - cvt.s32.s8 %r6162, %r6161; - 
cvt.u32.u16 %r6163, %rs3979; - cvt.s32.s8 %r6164, %r6163; - cvt.u32.u16 %r6165, %rs3978; - cvt.s32.s8 %r6166, %r6165; - mad.lo.s32 %r6167, %r37, %r6166, %r6158; - mad.lo.s32 %r6168, %r38, %r6164, %r6167; - mad.lo.s32 %r6169, %r39, %r6162, %r6168; - mad.lo.s32 %r6170, %r40, %r6160, %r6169; - ld.const.v4.u8 {%rs3986, %rs3987, %rs3988, %rs3989}, [matrix+1992]; - cvt.u32.u16 %r6171, %rs3989; - cvt.s32.s8 %r6172, %r6171; - cvt.u32.u16 %r6173, %rs3988; - cvt.s32.s8 %r6174, %r6173; - cvt.u32.u16 %r6175, %rs3987; - cvt.s32.s8 %r6176, %r6175; - cvt.u32.u16 %r6177, %rs3986; - cvt.s32.s8 %r6178, %r6177; - mad.lo.s32 %r6179, %r42, %r6178, %r6170; - mad.lo.s32 %r6180, %r43, %r6176, %r6179; - mad.lo.s32 %r6181, %r45, %r6174, %r6180; - mad.lo.s32 %r6182, %r46, %r6172, %r6181; - ld.const.v4.u8 {%rs3994, %rs3995, %rs3996, %rs3997}, [matrix+1996]; - cvt.u32.u16 %r6183, %rs3997; - cvt.s32.s8 %r6184, %r6183; - cvt.u32.u16 %r6185, %rs3996; - cvt.s32.s8 %r6186, %r6185; - cvt.u32.u16 %r6187, %rs3995; - cvt.s32.s8 %r6188, %r6187; - cvt.u32.u16 %r6189, %rs3994; - cvt.s32.s8 %r6190, %r6189; - mad.lo.s32 %r6191, %r48, %r6190, %r6182; - mad.lo.s32 %r6192, %r49, %r6188, %r6191; - mad.lo.s32 %r6193, %r50, %r6186, %r6192; - mad.lo.s32 %r6194, %r51, %r6184, %r6193; - ld.const.v4.u8 {%rs4002, %rs4003, %rs4004, %rs4005}, [matrix+2000]; - cvt.u32.u16 %r6195, %rs4005; - cvt.s32.s8 %r6196, %r6195; - cvt.u32.u16 %r6197, %rs4004; - cvt.s32.s8 %r6198, %r6197; - cvt.u32.u16 %r6199, %rs4003; - cvt.s32.s8 %r6200, %r6199; - cvt.u32.u16 %r6201, %rs4002; - cvt.s32.s8 %r6202, %r6201; - mad.lo.s32 %r6203, %r173, %r6202, %r6194; - mad.lo.s32 %r6204, %r53, %r6200, %r6203; - mad.lo.s32 %r6205, %r54, %r6198, %r6204; - mad.lo.s32 %r6206, %r55, %r6196, %r6205; - ld.const.v4.u8 {%rs4010, %rs4011, %rs4012, %rs4013}, [matrix+2004]; - cvt.u32.u16 %r6207, %rs4013; - cvt.s32.s8 %r6208, %r6207; - cvt.u32.u16 %r6209, %rs4012; - cvt.s32.s8 %r6210, %r6209; - cvt.u32.u16 %r6211, %rs4011; - cvt.s32.s8 %r6212, %r6211; - cvt.u32.u16 %r6213, %rs4010; - cvt.s32.s8 %r6214, %r6213; - mad.lo.s32 %r6215, %r56, %r6214, %r6206; - mad.lo.s32 %r6216, %r57, %r6212, %r6215; - mad.lo.s32 %r6217, %r58, %r6210, %r6216; - mad.lo.s32 %r6218, %r59, %r6208, %r6217; - ld.const.v4.u8 {%rs4018, %rs4019, %rs4020, %rs4021}, [matrix+2008]; - cvt.u32.u16 %r6219, %rs4021; - cvt.s32.s8 %r6220, %r6219; - cvt.u32.u16 %r6221, %rs4020; - cvt.s32.s8 %r6222, %r6221; - cvt.u32.u16 %r6223, %rs4019; - cvt.s32.s8 %r6224, %r6223; - cvt.u32.u16 %r6225, %rs4018; - cvt.s32.s8 %r6226, %r6225; - mad.lo.s32 %r6227, %r61, %r6226, %r6218; - mad.lo.s32 %r6228, %r62, %r6224, %r6227; - mad.lo.s32 %r6229, %r64, %r6222, %r6228; - mad.lo.s32 %r6230, %r65, %r6220, %r6229; - ld.const.v4.u8 {%rs4026, %rs4027, %rs4028, %rs4029}, [matrix+2012]; - cvt.u32.u16 %r6231, %rs4029; - cvt.s32.s8 %r6232, %r6231; - cvt.u32.u16 %r6233, %rs4028; - cvt.s32.s8 %r6234, %r6233; - cvt.u32.u16 %r6235, %rs4027; - cvt.s32.s8 %r6236, %r6235; - cvt.u32.u16 %r6237, %rs4026; - cvt.s32.s8 %r6238, %r6237; - mad.lo.s32 %r6239, %r67, %r6238, %r6230; - mad.lo.s32 %r6240, %r68, %r6236, %r6239; - mad.lo.s32 %r6241, %r69, %r6234, %r6240; - mad.lo.s32 %r6242, %r70, %r6232, %r6241; - ld.const.v4.u8 {%rs4034, %rs4035, %rs4036, %rs4037}, [matrix+2016]; - cvt.u32.u16 %r6243, %rs4037; - cvt.s32.s8 %r6244, %r6243; - cvt.u32.u16 %r6245, %rs4036; - cvt.s32.s8 %r6246, %r6245; - cvt.u32.u16 %r6247, %rs4035; - cvt.s32.s8 %r6248, %r6247; - cvt.u32.u16 %r6249, %rs4034; - cvt.s32.s8 %r6250, %r6249; - mad.lo.s32 %r6251, %r222, %r6250, %r6242; - mad.lo.s32 
%r6252, %r72, %r6248, %r6251;
-	mad.lo.s32 	%r6253, %r73, %r6246, %r6252;
-	mad.lo.s32 	%r6254, %r74, %r6244, %r6253;
-	ld.const.v4.u8 	{%rs4042, %rs4043, %rs4044, %rs4045}, [matrix+2020];
-	cvt.u32.u16 	%r6255, %rs4045;
-	cvt.s32.s8 	%r6256, %r6255;
[~1,900 further removed lines of compiler-generated PTX elided: the identical pattern repeats for every 4-byte column group of the constant matrix from [matrix+2020] through [matrix+2568] — an ld.const.v4.u8 load, four cvt.u32.u16/cvt.s32.s8 sign-extensions, and a chain of mad.lo.s32 multiply-accumulates over the same operand registers (%r34–%r124) — and after every second 64-entry row the two partial sums are folded into one output value via shr.u32/and.b32/or.b32 and xor.b32 against a hash word (%r26–%r29, %r71), with cvt.u64.u32 widening the results into %rd393–%rd395]
-	mad.lo.s32 	%r7932, %r42, %r7931, %r7923;
-	mad.lo.s32 	%r7933, %r43, %r7929, %r7932;
-	mad.lo.s32 	%r7934, %r45,
%r7927, %r7933; - mad.lo.s32 %r7935, %r46, %r7925, %r7934; - ld.const.v4.u8 {%rs5146, %rs5147, %rs5148, %rs5149}, [matrix+2572]; - cvt.u32.u16 %r7936, %rs5149; - cvt.s32.s8 %r7937, %r7936; - cvt.u32.u16 %r7938, %rs5148; - cvt.s32.s8 %r7939, %r7938; - cvt.u32.u16 %r7940, %rs5147; - cvt.s32.s8 %r7941, %r7940; - cvt.u32.u16 %r7942, %rs5146; - cvt.s32.s8 %r7943, %r7942; - mad.lo.s32 %r7944, %r48, %r7943, %r7935; - mad.lo.s32 %r7945, %r49, %r7941, %r7944; - mad.lo.s32 %r7946, %r50, %r7939, %r7945; - mad.lo.s32 %r7947, %r51, %r7937, %r7946; - ld.const.v4.u8 {%rs5154, %rs5155, %rs5156, %rs5157}, [matrix+2576]; - cvt.u32.u16 %r7948, %rs5157; - cvt.s32.s8 %r7949, %r7948; - cvt.u32.u16 %r7950, %rs5156; - cvt.s32.s8 %r7951, %r7950; - cvt.u32.u16 %r7952, %rs5155; - cvt.s32.s8 %r7953, %r7952; - cvt.u32.u16 %r7954, %rs5154; - cvt.s32.s8 %r7955, %r7954; - mad.lo.s32 %r7956, %r173, %r7955, %r7947; - mad.lo.s32 %r7957, %r53, %r7953, %r7956; - mad.lo.s32 %r7958, %r54, %r7951, %r7957; - mad.lo.s32 %r7959, %r55, %r7949, %r7958; - ld.const.v4.u8 {%rs5162, %rs5163, %rs5164, %rs5165}, [matrix+2580]; - cvt.u32.u16 %r7960, %rs5165; - cvt.s32.s8 %r7961, %r7960; - cvt.u32.u16 %r7962, %rs5164; - cvt.s32.s8 %r7963, %r7962; - cvt.u32.u16 %r7964, %rs5163; - cvt.s32.s8 %r7965, %r7964; - cvt.u32.u16 %r7966, %rs5162; - cvt.s32.s8 %r7967, %r7966; - mad.lo.s32 %r7968, %r56, %r7967, %r7959; - mad.lo.s32 %r7969, %r57, %r7965, %r7968; - mad.lo.s32 %r7970, %r58, %r7963, %r7969; - mad.lo.s32 %r7971, %r59, %r7961, %r7970; - ld.const.v4.u8 {%rs5170, %rs5171, %rs5172, %rs5173}, [matrix+2584]; - cvt.u32.u16 %r7972, %rs5173; - cvt.s32.s8 %r7973, %r7972; - cvt.u32.u16 %r7974, %rs5172; - cvt.s32.s8 %r7975, %r7974; - cvt.u32.u16 %r7976, %rs5171; - cvt.s32.s8 %r7977, %r7976; - cvt.u32.u16 %r7978, %rs5170; - cvt.s32.s8 %r7979, %r7978; - mad.lo.s32 %r7980, %r61, %r7979, %r7971; - mad.lo.s32 %r7981, %r62, %r7977, %r7980; - mad.lo.s32 %r7982, %r64, %r7975, %r7981; - mad.lo.s32 %r7983, %r65, %r7973, %r7982; - ld.const.v4.u8 {%rs5178, %rs5179, %rs5180, %rs5181}, [matrix+2588]; - cvt.u32.u16 %r7984, %rs5181; - cvt.s32.s8 %r7985, %r7984; - cvt.u32.u16 %r7986, %rs5180; - cvt.s32.s8 %r7987, %r7986; - cvt.u32.u16 %r7988, %rs5179; - cvt.s32.s8 %r7989, %r7988; - cvt.u32.u16 %r7990, %rs5178; - cvt.s32.s8 %r7991, %r7990; - mad.lo.s32 %r7992, %r67, %r7991, %r7983; - mad.lo.s32 %r7993, %r68, %r7989, %r7992; - mad.lo.s32 %r7994, %r69, %r7987, %r7993; - mad.lo.s32 %r7995, %r70, %r7985, %r7994; - ld.const.v4.u8 {%rs5186, %rs5187, %rs5188, %rs5189}, [matrix+2592]; - cvt.u32.u16 %r7996, %rs5189; - cvt.s32.s8 %r7997, %r7996; - cvt.u32.u16 %r7998, %rs5188; - cvt.s32.s8 %r7999, %r7998; - cvt.u32.u16 %r8000, %rs5187; - cvt.s32.s8 %r8001, %r8000; - cvt.u32.u16 %r8002, %rs5186; - cvt.s32.s8 %r8003, %r8002; - mad.lo.s32 %r8004, %r222, %r8003, %r7995; - mad.lo.s32 %r8005, %r72, %r8001, %r8004; - mad.lo.s32 %r8006, %r73, %r7999, %r8005; - mad.lo.s32 %r8007, %r74, %r7997, %r8006; - ld.const.v4.u8 {%rs5194, %rs5195, %rs5196, %rs5197}, [matrix+2596]; - cvt.u32.u16 %r8008, %rs5197; - cvt.s32.s8 %r8009, %r8008; - cvt.u32.u16 %r8010, %rs5196; - cvt.s32.s8 %r8011, %r8010; - cvt.u32.u16 %r8012, %rs5195; - cvt.s32.s8 %r8013, %r8012; - cvt.u32.u16 %r8014, %rs5194; - cvt.s32.s8 %r8015, %r8014; - mad.lo.s32 %r8016, %r75, %r8015, %r8007; - mad.lo.s32 %r8017, %r76, %r8013, %r8016; - mad.lo.s32 %r8018, %r77, %r8011, %r8017; - mad.lo.s32 %r8019, %r78, %r8009, %r8018; - ld.const.v4.u8 {%rs5202, %rs5203, %rs5204, %rs5205}, [matrix+2600]; - cvt.u32.u16 %r8020, %rs5205; - cvt.s32.s8 
%r8021, %r8020; - cvt.u32.u16 %r8022, %rs5204; - cvt.s32.s8 %r8023, %r8022; - cvt.u32.u16 %r8024, %rs5203; - cvt.s32.s8 %r8025, %r8024; - cvt.u32.u16 %r8026, %rs5202; - cvt.s32.s8 %r8027, %r8026; - mad.lo.s32 %r8028, %r80, %r8027, %r8019; - mad.lo.s32 %r8029, %r81, %r8025, %r8028; - mad.lo.s32 %r8030, %r83, %r8023, %r8029; - mad.lo.s32 %r8031, %r84, %r8021, %r8030; - ld.const.v4.u8 {%rs5210, %rs5211, %rs5212, %rs5213}, [matrix+2604]; - cvt.u32.u16 %r8032, %rs5213; - cvt.s32.s8 %r8033, %r8032; - cvt.u32.u16 %r8034, %rs5212; - cvt.s32.s8 %r8035, %r8034; - cvt.u32.u16 %r8036, %rs5211; - cvt.s32.s8 %r8037, %r8036; - cvt.u32.u16 %r8038, %rs5210; - cvt.s32.s8 %r8039, %r8038; - mad.lo.s32 %r8040, %r86, %r8039, %r8031; - mad.lo.s32 %r8041, %r87, %r8037, %r8040; - mad.lo.s32 %r8042, %r88, %r8035, %r8041; - mad.lo.s32 %r8043, %r89, %r8033, %r8042; - ld.const.v4.u8 {%rs5218, %rs5219, %rs5220, %rs5221}, [matrix+2608]; - cvt.u32.u16 %r8044, %rs5221; - cvt.s32.s8 %r8045, %r8044; - cvt.u32.u16 %r8046, %rs5220; - cvt.s32.s8 %r8047, %r8046; - cvt.u32.u16 %r8048, %rs5219; - cvt.s32.s8 %r8049, %r8048; - cvt.u32.u16 %r8050, %rs5218; - cvt.s32.s8 %r8051, %r8050; - mad.lo.s32 %r8052, %r271, %r8051, %r8043; - mad.lo.s32 %r8053, %r91, %r8049, %r8052; - mad.lo.s32 %r8054, %r93, %r8047, %r8053; - mad.lo.s32 %r8055, %r94, %r8045, %r8054; - ld.const.v4.u8 {%rs5226, %rs5227, %rs5228, %rs5229}, [matrix+2612]; - cvt.u32.u16 %r8056, %rs5229; - cvt.s32.s8 %r8057, %r8056; - cvt.u32.u16 %r8058, %rs5228; - cvt.s32.s8 %r8059, %r8058; - cvt.u32.u16 %r8060, %rs5227; - cvt.s32.s8 %r8061, %r8060; - cvt.u32.u16 %r8062, %rs5226; - cvt.s32.s8 %r8063, %r8062; - mad.lo.s32 %r8064, %r96, %r8063, %r8055; - mad.lo.s32 %r8065, %r97, %r8061, %r8064; - mad.lo.s32 %r8066, %r99, %r8059, %r8065; - mad.lo.s32 %r8067, %r100, %r8057, %r8066; - ld.const.v4.u8 {%rs5234, %rs5235, %rs5236, %rs5237}, [matrix+2616]; - cvt.u32.u16 %r8068, %rs5237; - cvt.s32.s8 %r8069, %r8068; - cvt.u32.u16 %r8070, %rs5236; - cvt.s32.s8 %r8071, %r8070; - cvt.u32.u16 %r8072, %rs5235; - cvt.s32.s8 %r8073, %r8072; - cvt.u32.u16 %r8074, %rs5234; - cvt.s32.s8 %r8075, %r8074; - mad.lo.s32 %r8076, %r103, %r8075, %r8067; - mad.lo.s32 %r8077, %r104, %r8073, %r8076; - mad.lo.s32 %r8078, %r107, %r8071, %r8077; - mad.lo.s32 %r8079, %r108, %r8069, %r8078; - ld.const.v4.u8 {%rs5242, %rs5243, %rs5244, %rs5245}, [matrix+2620]; - cvt.u32.u16 %r8080, %rs5245; - cvt.s32.s8 %r8081, %r8080; - cvt.u32.u16 %r8082, %rs5244; - cvt.s32.s8 %r8083, %r8082; - cvt.u32.u16 %r8084, %rs5243; - cvt.s32.s8 %r8085, %r8084; - cvt.u32.u16 %r8086, %rs5242; - cvt.s32.s8 %r8087, %r8086; - mad.lo.s32 %r8088, %r111, %r8087, %r8079; - mad.lo.s32 %r8089, %r112, %r8085, %r8088; - mad.lo.s32 %r8090, %r114, %r8083, %r8089; - mad.lo.s32 %r8091, %r115, %r8081, %r8090; - ld.const.v4.u8 {%rs5250, %rs5251, %rs5252, %rs5253}, [matrix+2624]; - cvt.u32.u16 %r8092, %rs5253; - cvt.s32.s8 %r8093, %r8092; - cvt.u32.u16 %r8094, %rs5252; - cvt.s32.s8 %r8095, %r8094; - cvt.u32.u16 %r8096, %rs5250; - cvt.s32.s8 %r8097, %r8096; - cvt.u32.u16 %r8098, %rs5251; - cvt.s32.s8 %r8099, %r8098; - mul.lo.s32 %r8100, %r34, %r8099; - mad.lo.s32 %r8101, %r124, %r8097, %r8100; - mad.lo.s32 %r8102, %r35, %r8095, %r8101; - mad.lo.s32 %r8103, %r36, %r8093, %r8102; - ld.const.v4.u8 {%rs5258, %rs5259, %rs5260, %rs5261}, [matrix+2628]; - cvt.u32.u16 %r8104, %rs5261; - cvt.s32.s8 %r8105, %r8104; - cvt.u32.u16 %r8106, %rs5260; - cvt.s32.s8 %r8107, %r8106; - cvt.u32.u16 %r8108, %rs5259; - cvt.s32.s8 %r8109, %r8108; - cvt.u32.u16 %r8110, %rs5258; - 
cvt.s32.s8 %r8111, %r8110; - mad.lo.s32 %r8112, %r37, %r8111, %r8103; - mad.lo.s32 %r8113, %r38, %r8109, %r8112; - mad.lo.s32 %r8114, %r39, %r8107, %r8113; - mad.lo.s32 %r8115, %r40, %r8105, %r8114; - ld.const.v4.u8 {%rs5266, %rs5267, %rs5268, %rs5269}, [matrix+2632]; - cvt.u32.u16 %r8116, %rs5269; - cvt.s32.s8 %r8117, %r8116; - cvt.u32.u16 %r8118, %rs5268; - cvt.s32.s8 %r8119, %r8118; - cvt.u32.u16 %r8120, %rs5267; - cvt.s32.s8 %r8121, %r8120; - cvt.u32.u16 %r8122, %rs5266; - cvt.s32.s8 %r8123, %r8122; - mad.lo.s32 %r8124, %r42, %r8123, %r8115; - mad.lo.s32 %r8125, %r43, %r8121, %r8124; - mad.lo.s32 %r8126, %r45, %r8119, %r8125; - mad.lo.s32 %r8127, %r46, %r8117, %r8126; - ld.const.v4.u8 {%rs5274, %rs5275, %rs5276, %rs5277}, [matrix+2636]; - cvt.u32.u16 %r8128, %rs5277; - cvt.s32.s8 %r8129, %r8128; - cvt.u32.u16 %r8130, %rs5276; - cvt.s32.s8 %r8131, %r8130; - cvt.u32.u16 %r8132, %rs5275; - cvt.s32.s8 %r8133, %r8132; - cvt.u32.u16 %r8134, %rs5274; - cvt.s32.s8 %r8135, %r8134; - mad.lo.s32 %r8136, %r48, %r8135, %r8127; - mad.lo.s32 %r8137, %r49, %r8133, %r8136; - mad.lo.s32 %r8138, %r50, %r8131, %r8137; - mad.lo.s32 %r8139, %r51, %r8129, %r8138; - ld.const.v4.u8 {%rs5282, %rs5283, %rs5284, %rs5285}, [matrix+2640]; - cvt.u32.u16 %r8140, %rs5285; - cvt.s32.s8 %r8141, %r8140; - cvt.u32.u16 %r8142, %rs5284; - cvt.s32.s8 %r8143, %r8142; - cvt.u32.u16 %r8144, %rs5283; - cvt.s32.s8 %r8145, %r8144; - cvt.u32.u16 %r8146, %rs5282; - cvt.s32.s8 %r8147, %r8146; - mad.lo.s32 %r8148, %r173, %r8147, %r8139; - mad.lo.s32 %r8149, %r53, %r8145, %r8148; - mad.lo.s32 %r8150, %r54, %r8143, %r8149; - mad.lo.s32 %r8151, %r55, %r8141, %r8150; - ld.const.v4.u8 {%rs5290, %rs5291, %rs5292, %rs5293}, [matrix+2644]; - cvt.u32.u16 %r8152, %rs5293; - cvt.s32.s8 %r8153, %r8152; - cvt.u32.u16 %r8154, %rs5292; - cvt.s32.s8 %r8155, %r8154; - cvt.u32.u16 %r8156, %rs5291; - cvt.s32.s8 %r8157, %r8156; - cvt.u32.u16 %r8158, %rs5290; - cvt.s32.s8 %r8159, %r8158; - mad.lo.s32 %r8160, %r56, %r8159, %r8151; - mad.lo.s32 %r8161, %r57, %r8157, %r8160; - mad.lo.s32 %r8162, %r58, %r8155, %r8161; - mad.lo.s32 %r8163, %r59, %r8153, %r8162; - ld.const.v4.u8 {%rs5298, %rs5299, %rs5300, %rs5301}, [matrix+2648]; - cvt.u32.u16 %r8164, %rs5301; - cvt.s32.s8 %r8165, %r8164; - cvt.u32.u16 %r8166, %rs5300; - cvt.s32.s8 %r8167, %r8166; - cvt.u32.u16 %r8168, %rs5299; - cvt.s32.s8 %r8169, %r8168; - cvt.u32.u16 %r8170, %rs5298; - cvt.s32.s8 %r8171, %r8170; - mad.lo.s32 %r8172, %r61, %r8171, %r8163; - mad.lo.s32 %r8173, %r62, %r8169, %r8172; - mad.lo.s32 %r8174, %r64, %r8167, %r8173; - mad.lo.s32 %r8175, %r65, %r8165, %r8174; - ld.const.v4.u8 {%rs5306, %rs5307, %rs5308, %rs5309}, [matrix+2652]; - cvt.u32.u16 %r8176, %rs5309; - cvt.s32.s8 %r8177, %r8176; - cvt.u32.u16 %r8178, %rs5308; - cvt.s32.s8 %r8179, %r8178; - cvt.u32.u16 %r8180, %rs5307; - cvt.s32.s8 %r8181, %r8180; - cvt.u32.u16 %r8182, %rs5306; - cvt.s32.s8 %r8183, %r8182; - mad.lo.s32 %r8184, %r67, %r8183, %r8175; - mad.lo.s32 %r8185, %r68, %r8181, %r8184; - mad.lo.s32 %r8186, %r69, %r8179, %r8185; - mad.lo.s32 %r8187, %r70, %r8177, %r8186; - ld.const.v4.u8 {%rs5314, %rs5315, %rs5316, %rs5317}, [matrix+2656]; - cvt.u32.u16 %r8188, %rs5317; - cvt.s32.s8 %r8189, %r8188; - cvt.u32.u16 %r8190, %rs5316; - cvt.s32.s8 %r8191, %r8190; - cvt.u32.u16 %r8192, %rs5315; - cvt.s32.s8 %r8193, %r8192; - cvt.u32.u16 %r8194, %rs5314; - cvt.s32.s8 %r8195, %r8194; - mad.lo.s32 %r8196, %r222, %r8195, %r8187; - mad.lo.s32 %r8197, %r72, %r8193, %r8196; - mad.lo.s32 %r8198, %r73, %r8191, %r8197; - mad.lo.s32 %r8199, 
%r74, %r8189, %r8198; - ld.const.v4.u8 {%rs5322, %rs5323, %rs5324, %rs5325}, [matrix+2660]; - cvt.u32.u16 %r8200, %rs5325; - cvt.s32.s8 %r8201, %r8200; - cvt.u32.u16 %r8202, %rs5324; - cvt.s32.s8 %r8203, %r8202; - cvt.u32.u16 %r8204, %rs5323; - cvt.s32.s8 %r8205, %r8204; - cvt.u32.u16 %r8206, %rs5322; - cvt.s32.s8 %r8207, %r8206; - mad.lo.s32 %r8208, %r75, %r8207, %r8199; - mad.lo.s32 %r8209, %r76, %r8205, %r8208; - mad.lo.s32 %r8210, %r77, %r8203, %r8209; - mad.lo.s32 %r8211, %r78, %r8201, %r8210; - ld.const.v4.u8 {%rs5330, %rs5331, %rs5332, %rs5333}, [matrix+2664]; - cvt.u32.u16 %r8212, %rs5333; - cvt.s32.s8 %r8213, %r8212; - cvt.u32.u16 %r8214, %rs5332; - cvt.s32.s8 %r8215, %r8214; - cvt.u32.u16 %r8216, %rs5331; - cvt.s32.s8 %r8217, %r8216; - cvt.u32.u16 %r8218, %rs5330; - cvt.s32.s8 %r8219, %r8218; - mad.lo.s32 %r8220, %r80, %r8219, %r8211; - mad.lo.s32 %r8221, %r81, %r8217, %r8220; - mad.lo.s32 %r8222, %r83, %r8215, %r8221; - mad.lo.s32 %r8223, %r84, %r8213, %r8222; - ld.const.v4.u8 {%rs5338, %rs5339, %rs5340, %rs5341}, [matrix+2668]; - cvt.u32.u16 %r8224, %rs5341; - cvt.s32.s8 %r8225, %r8224; - cvt.u32.u16 %r8226, %rs5340; - cvt.s32.s8 %r8227, %r8226; - cvt.u32.u16 %r8228, %rs5339; - cvt.s32.s8 %r8229, %r8228; - cvt.u32.u16 %r8230, %rs5338; - cvt.s32.s8 %r8231, %r8230; - mad.lo.s32 %r8232, %r86, %r8231, %r8223; - mad.lo.s32 %r8233, %r87, %r8229, %r8232; - mad.lo.s32 %r8234, %r88, %r8227, %r8233; - mad.lo.s32 %r8235, %r89, %r8225, %r8234; - ld.const.v4.u8 {%rs5346, %rs5347, %rs5348, %rs5349}, [matrix+2672]; - cvt.u32.u16 %r8236, %rs5349; - cvt.s32.s8 %r8237, %r8236; - cvt.u32.u16 %r8238, %rs5348; - cvt.s32.s8 %r8239, %r8238; - cvt.u32.u16 %r8240, %rs5347; - cvt.s32.s8 %r8241, %r8240; - cvt.u32.u16 %r8242, %rs5346; - cvt.s32.s8 %r8243, %r8242; - mad.lo.s32 %r8244, %r271, %r8243, %r8235; - mad.lo.s32 %r8245, %r91, %r8241, %r8244; - mad.lo.s32 %r8246, %r93, %r8239, %r8245; - mad.lo.s32 %r8247, %r94, %r8237, %r8246; - ld.const.v4.u8 {%rs5354, %rs5355, %rs5356, %rs5357}, [matrix+2676]; - cvt.u32.u16 %r8248, %rs5357; - cvt.s32.s8 %r8249, %r8248; - cvt.u32.u16 %r8250, %rs5356; - cvt.s32.s8 %r8251, %r8250; - cvt.u32.u16 %r8252, %rs5355; - cvt.s32.s8 %r8253, %r8252; - cvt.u32.u16 %r8254, %rs5354; - cvt.s32.s8 %r8255, %r8254; - mad.lo.s32 %r8256, %r96, %r8255, %r8247; - mad.lo.s32 %r8257, %r97, %r8253, %r8256; - mad.lo.s32 %r8258, %r99, %r8251, %r8257; - mad.lo.s32 %r8259, %r100, %r8249, %r8258; - ld.const.v4.u8 {%rs5362, %rs5363, %rs5364, %rs5365}, [matrix+2680]; - cvt.u32.u16 %r8260, %rs5365; - cvt.s32.s8 %r8261, %r8260; - cvt.u32.u16 %r8262, %rs5364; - cvt.s32.s8 %r8263, %r8262; - cvt.u32.u16 %r8264, %rs5363; - cvt.s32.s8 %r8265, %r8264; - cvt.u32.u16 %r8266, %rs5362; - cvt.s32.s8 %r8267, %r8266; - mad.lo.s32 %r8268, %r103, %r8267, %r8259; - mad.lo.s32 %r8269, %r104, %r8265, %r8268; - mad.lo.s32 %r8270, %r107, %r8263, %r8269; - mad.lo.s32 %r8271, %r108, %r8261, %r8270; - ld.const.v4.u8 {%rs5370, %rs5371, %rs5372, %rs5373}, [matrix+2684]; - cvt.u32.u16 %r8272, %rs5373; - cvt.s32.s8 %r8273, %r8272; - cvt.u32.u16 %r8274, %rs5372; - cvt.s32.s8 %r8275, %r8274; - cvt.u32.u16 %r8276, %rs5371; - cvt.s32.s8 %r8277, %r8276; - cvt.u32.u16 %r8278, %rs5370; - cvt.s32.s8 %r8279, %r8278; - mad.lo.s32 %r8280, %r111, %r8279, %r8271; - mad.lo.s32 %r8281, %r112, %r8277, %r8280; - mad.lo.s32 %r8282, %r114, %r8275, %r8281; - mad.lo.s32 %r8283, %r115, %r8273, %r8282; - shr.u32 %r8284, %r8091, 6; - and.b32 %r8285, %r8284, 240; - shr.u32 %r8286, %r8283, 10; - or.b32 %r8287, %r8286, %r8285; - xor.b32 %r8288, %r30, 
%r8287; - cvt.u64.u32 %rd396, %r8288; - ld.const.v4.u8 {%rs5378, %rs5379, %rs5380, %rs5381}, [matrix+2688]; - cvt.u32.u16 %r8289, %rs5381; - cvt.s32.s8 %r8290, %r8289; - cvt.u32.u16 %r8291, %rs5380; - cvt.s32.s8 %r8292, %r8291; - cvt.u32.u16 %r8293, %rs5378; - cvt.s32.s8 %r8294, %r8293; - cvt.u32.u16 %r8295, %rs5379; - cvt.s32.s8 %r8296, %r8295; - mul.lo.s32 %r8297, %r34, %r8296; - mad.lo.s32 %r8298, %r124, %r8294, %r8297; - mad.lo.s32 %r8299, %r35, %r8292, %r8298; - mad.lo.s32 %r8300, %r36, %r8290, %r8299; - ld.const.v4.u8 {%rs5386, %rs5387, %rs5388, %rs5389}, [matrix+2692]; - cvt.u32.u16 %r8301, %rs5389; - cvt.s32.s8 %r8302, %r8301; - cvt.u32.u16 %r8303, %rs5388; - cvt.s32.s8 %r8304, %r8303; - cvt.u32.u16 %r8305, %rs5387; - cvt.s32.s8 %r8306, %r8305; - cvt.u32.u16 %r8307, %rs5386; - cvt.s32.s8 %r8308, %r8307; - mad.lo.s32 %r8309, %r37, %r8308, %r8300; - mad.lo.s32 %r8310, %r38, %r8306, %r8309; - mad.lo.s32 %r8311, %r39, %r8304, %r8310; - mad.lo.s32 %r8312, %r40, %r8302, %r8311; - ld.const.v4.u8 {%rs5394, %rs5395, %rs5396, %rs5397}, [matrix+2696]; - cvt.u32.u16 %r8313, %rs5397; - cvt.s32.s8 %r8314, %r8313; - cvt.u32.u16 %r8315, %rs5396; - cvt.s32.s8 %r8316, %r8315; - cvt.u32.u16 %r8317, %rs5395; - cvt.s32.s8 %r8318, %r8317; - cvt.u32.u16 %r8319, %rs5394; - cvt.s32.s8 %r8320, %r8319; - mad.lo.s32 %r8321, %r42, %r8320, %r8312; - mad.lo.s32 %r8322, %r43, %r8318, %r8321; - mad.lo.s32 %r8323, %r45, %r8316, %r8322; - mad.lo.s32 %r8324, %r46, %r8314, %r8323; - ld.const.v4.u8 {%rs5402, %rs5403, %rs5404, %rs5405}, [matrix+2700]; - cvt.u32.u16 %r8325, %rs5405; - cvt.s32.s8 %r8326, %r8325; - cvt.u32.u16 %r8327, %rs5404; - cvt.s32.s8 %r8328, %r8327; - cvt.u32.u16 %r8329, %rs5403; - cvt.s32.s8 %r8330, %r8329; - cvt.u32.u16 %r8331, %rs5402; - cvt.s32.s8 %r8332, %r8331; - mad.lo.s32 %r8333, %r48, %r8332, %r8324; - mad.lo.s32 %r8334, %r49, %r8330, %r8333; - mad.lo.s32 %r8335, %r50, %r8328, %r8334; - mad.lo.s32 %r8336, %r51, %r8326, %r8335; - ld.const.v4.u8 {%rs5410, %rs5411, %rs5412, %rs5413}, [matrix+2704]; - cvt.u32.u16 %r8337, %rs5413; - cvt.s32.s8 %r8338, %r8337; - cvt.u32.u16 %r8339, %rs5412; - cvt.s32.s8 %r8340, %r8339; - cvt.u32.u16 %r8341, %rs5411; - cvt.s32.s8 %r8342, %r8341; - cvt.u32.u16 %r8343, %rs5410; - cvt.s32.s8 %r8344, %r8343; - mad.lo.s32 %r8345, %r173, %r8344, %r8336; - mad.lo.s32 %r8346, %r53, %r8342, %r8345; - mad.lo.s32 %r8347, %r54, %r8340, %r8346; - mad.lo.s32 %r8348, %r55, %r8338, %r8347; - ld.const.v4.u8 {%rs5418, %rs5419, %rs5420, %rs5421}, [matrix+2708]; - cvt.u32.u16 %r8349, %rs5421; - cvt.s32.s8 %r8350, %r8349; - cvt.u32.u16 %r8351, %rs5420; - cvt.s32.s8 %r8352, %r8351; - cvt.u32.u16 %r8353, %rs5419; - cvt.s32.s8 %r8354, %r8353; - cvt.u32.u16 %r8355, %rs5418; - cvt.s32.s8 %r8356, %r8355; - mad.lo.s32 %r8357, %r56, %r8356, %r8348; - mad.lo.s32 %r8358, %r57, %r8354, %r8357; - mad.lo.s32 %r8359, %r58, %r8352, %r8358; - mad.lo.s32 %r8360, %r59, %r8350, %r8359; - ld.const.v4.u8 {%rs5426, %rs5427, %rs5428, %rs5429}, [matrix+2712]; - cvt.u32.u16 %r8361, %rs5429; - cvt.s32.s8 %r8362, %r8361; - cvt.u32.u16 %r8363, %rs5428; - cvt.s32.s8 %r8364, %r8363; - cvt.u32.u16 %r8365, %rs5427; - cvt.s32.s8 %r8366, %r8365; - cvt.u32.u16 %r8367, %rs5426; - cvt.s32.s8 %r8368, %r8367; - mad.lo.s32 %r8369, %r61, %r8368, %r8360; - mad.lo.s32 %r8370, %r62, %r8366, %r8369; - mad.lo.s32 %r8371, %r64, %r8364, %r8370; - mad.lo.s32 %r8372, %r65, %r8362, %r8371; - ld.const.v4.u8 {%rs5434, %rs5435, %rs5436, %rs5437}, [matrix+2716]; - cvt.u32.u16 %r8373, %rs5437; - cvt.s32.s8 %r8374, %r8373; - cvt.u32.u16 
%r8375, %rs5436; - cvt.s32.s8 %r8376, %r8375; - cvt.u32.u16 %r8377, %rs5435; - cvt.s32.s8 %r8378, %r8377; - cvt.u32.u16 %r8379, %rs5434; - cvt.s32.s8 %r8380, %r8379; - mad.lo.s32 %r8381, %r67, %r8380, %r8372; - mad.lo.s32 %r8382, %r68, %r8378, %r8381; - mad.lo.s32 %r8383, %r69, %r8376, %r8382; - mad.lo.s32 %r8384, %r70, %r8374, %r8383; - ld.const.v4.u8 {%rs5442, %rs5443, %rs5444, %rs5445}, [matrix+2720]; - cvt.u32.u16 %r8385, %rs5445; - cvt.s32.s8 %r8386, %r8385; - cvt.u32.u16 %r8387, %rs5444; - cvt.s32.s8 %r8388, %r8387; - cvt.u32.u16 %r8389, %rs5443; - cvt.s32.s8 %r8390, %r8389; - cvt.u32.u16 %r8391, %rs5442; - cvt.s32.s8 %r8392, %r8391; - mad.lo.s32 %r8393, %r222, %r8392, %r8384; - mad.lo.s32 %r8394, %r72, %r8390, %r8393; - mad.lo.s32 %r8395, %r73, %r8388, %r8394; - mad.lo.s32 %r8396, %r74, %r8386, %r8395; - ld.const.v4.u8 {%rs5450, %rs5451, %rs5452, %rs5453}, [matrix+2724]; - cvt.u32.u16 %r8397, %rs5453; - cvt.s32.s8 %r8398, %r8397; - cvt.u32.u16 %r8399, %rs5452; - cvt.s32.s8 %r8400, %r8399; - cvt.u32.u16 %r8401, %rs5451; - cvt.s32.s8 %r8402, %r8401; - cvt.u32.u16 %r8403, %rs5450; - cvt.s32.s8 %r8404, %r8403; - mad.lo.s32 %r8405, %r75, %r8404, %r8396; - mad.lo.s32 %r8406, %r76, %r8402, %r8405; - mad.lo.s32 %r8407, %r77, %r8400, %r8406; - mad.lo.s32 %r8408, %r78, %r8398, %r8407; - ld.const.v4.u8 {%rs5458, %rs5459, %rs5460, %rs5461}, [matrix+2728]; - cvt.u32.u16 %r8409, %rs5461; - cvt.s32.s8 %r8410, %r8409; - cvt.u32.u16 %r8411, %rs5460; - cvt.s32.s8 %r8412, %r8411; - cvt.u32.u16 %r8413, %rs5459; - cvt.s32.s8 %r8414, %r8413; - cvt.u32.u16 %r8415, %rs5458; - cvt.s32.s8 %r8416, %r8415; - mad.lo.s32 %r8417, %r80, %r8416, %r8408; - mad.lo.s32 %r8418, %r81, %r8414, %r8417; - mad.lo.s32 %r8419, %r83, %r8412, %r8418; - mad.lo.s32 %r8420, %r84, %r8410, %r8419; - ld.const.v4.u8 {%rs5466, %rs5467, %rs5468, %rs5469}, [matrix+2732]; - cvt.u32.u16 %r8421, %rs5469; - cvt.s32.s8 %r8422, %r8421; - cvt.u32.u16 %r8423, %rs5468; - cvt.s32.s8 %r8424, %r8423; - cvt.u32.u16 %r8425, %rs5467; - cvt.s32.s8 %r8426, %r8425; - cvt.u32.u16 %r8427, %rs5466; - cvt.s32.s8 %r8428, %r8427; - mad.lo.s32 %r8429, %r86, %r8428, %r8420; - mad.lo.s32 %r8430, %r87, %r8426, %r8429; - mad.lo.s32 %r8431, %r88, %r8424, %r8430; - mad.lo.s32 %r8432, %r89, %r8422, %r8431; - ld.const.v4.u8 {%rs5474, %rs5475, %rs5476, %rs5477}, [matrix+2736]; - cvt.u32.u16 %r8433, %rs5477; - cvt.s32.s8 %r8434, %r8433; - cvt.u32.u16 %r8435, %rs5476; - cvt.s32.s8 %r8436, %r8435; - cvt.u32.u16 %r8437, %rs5475; - cvt.s32.s8 %r8438, %r8437; - cvt.u32.u16 %r8439, %rs5474; - cvt.s32.s8 %r8440, %r8439; - mad.lo.s32 %r8441, %r271, %r8440, %r8432; - mad.lo.s32 %r8442, %r91, %r8438, %r8441; - mad.lo.s32 %r8443, %r93, %r8436, %r8442; - mad.lo.s32 %r8444, %r94, %r8434, %r8443; - ld.const.v4.u8 {%rs5482, %rs5483, %rs5484, %rs5485}, [matrix+2740]; - cvt.u32.u16 %r8445, %rs5485; - cvt.s32.s8 %r8446, %r8445; - cvt.u32.u16 %r8447, %rs5484; - cvt.s32.s8 %r8448, %r8447; - cvt.u32.u16 %r8449, %rs5483; - cvt.s32.s8 %r8450, %r8449; - cvt.u32.u16 %r8451, %rs5482; - cvt.s32.s8 %r8452, %r8451; - mad.lo.s32 %r8453, %r96, %r8452, %r8444; - mad.lo.s32 %r8454, %r97, %r8450, %r8453; - mad.lo.s32 %r8455, %r99, %r8448, %r8454; - mad.lo.s32 %r8456, %r100, %r8446, %r8455; - ld.const.v4.u8 {%rs5490, %rs5491, %rs5492, %rs5493}, [matrix+2744]; - cvt.u32.u16 %r8457, %rs5493; - cvt.s32.s8 %r8458, %r8457; - cvt.u32.u16 %r8459, %rs5492; - cvt.s32.s8 %r8460, %r8459; - cvt.u32.u16 %r8461, %rs5491; - cvt.s32.s8 %r8462, %r8461; - cvt.u32.u16 %r8463, %rs5490; - cvt.s32.s8 %r8464, %r8463; - 
mad.lo.s32 %r8465, %r103, %r8464, %r8456; - mad.lo.s32 %r8466, %r104, %r8462, %r8465; - mad.lo.s32 %r8467, %r107, %r8460, %r8466; - mad.lo.s32 %r8468, %r108, %r8458, %r8467; - ld.const.v4.u8 {%rs5498, %rs5499, %rs5500, %rs5501}, [matrix+2748]; - cvt.u32.u16 %r8469, %rs5501; - cvt.s32.s8 %r8470, %r8469; - cvt.u32.u16 %r8471, %rs5500; - cvt.s32.s8 %r8472, %r8471; - cvt.u32.u16 %r8473, %rs5499; - cvt.s32.s8 %r8474, %r8473; - cvt.u32.u16 %r8475, %rs5498; - cvt.s32.s8 %r8476, %r8475; - mad.lo.s32 %r8477, %r111, %r8476, %r8468; - mad.lo.s32 %r8478, %r112, %r8474, %r8477; - mad.lo.s32 %r8479, %r114, %r8472, %r8478; - mad.lo.s32 %r8480, %r115, %r8470, %r8479; - ld.const.v4.u8 {%rs5506, %rs5507, %rs5508, %rs5509}, [matrix+2752]; - cvt.u32.u16 %r8481, %rs5509; - cvt.s32.s8 %r8482, %r8481; - cvt.u32.u16 %r8483, %rs5508; - cvt.s32.s8 %r8484, %r8483; - cvt.u32.u16 %r8485, %rs5506; - cvt.s32.s8 %r8486, %r8485; - cvt.u32.u16 %r8487, %rs5507; - cvt.s32.s8 %r8488, %r8487; - mul.lo.s32 %r8489, %r34, %r8488; - mad.lo.s32 %r8490, %r124, %r8486, %r8489; - mad.lo.s32 %r8491, %r35, %r8484, %r8490; - mad.lo.s32 %r8492, %r36, %r8482, %r8491; - ld.const.v4.u8 {%rs5514, %rs5515, %rs5516, %rs5517}, [matrix+2756]; - cvt.u32.u16 %r8493, %rs5517; - cvt.s32.s8 %r8494, %r8493; - cvt.u32.u16 %r8495, %rs5516; - cvt.s32.s8 %r8496, %r8495; - cvt.u32.u16 %r8497, %rs5515; - cvt.s32.s8 %r8498, %r8497; - cvt.u32.u16 %r8499, %rs5514; - cvt.s32.s8 %r8500, %r8499; - mad.lo.s32 %r8501, %r37, %r8500, %r8492; - mad.lo.s32 %r8502, %r38, %r8498, %r8501; - mad.lo.s32 %r8503, %r39, %r8496, %r8502; - mad.lo.s32 %r8504, %r40, %r8494, %r8503; - ld.const.v4.u8 {%rs5522, %rs5523, %rs5524, %rs5525}, [matrix+2760]; - cvt.u32.u16 %r8505, %rs5525; - cvt.s32.s8 %r8506, %r8505; - cvt.u32.u16 %r8507, %rs5524; - cvt.s32.s8 %r8508, %r8507; - cvt.u32.u16 %r8509, %rs5523; - cvt.s32.s8 %r8510, %r8509; - cvt.u32.u16 %r8511, %rs5522; - cvt.s32.s8 %r8512, %r8511; - mad.lo.s32 %r8513, %r42, %r8512, %r8504; - mad.lo.s32 %r8514, %r43, %r8510, %r8513; - mad.lo.s32 %r8515, %r45, %r8508, %r8514; - mad.lo.s32 %r8516, %r46, %r8506, %r8515; - ld.const.v4.u8 {%rs5530, %rs5531, %rs5532, %rs5533}, [matrix+2764]; - cvt.u32.u16 %r8517, %rs5533; - cvt.s32.s8 %r8518, %r8517; - cvt.u32.u16 %r8519, %rs5532; - cvt.s32.s8 %r8520, %r8519; - cvt.u32.u16 %r8521, %rs5531; - cvt.s32.s8 %r8522, %r8521; - cvt.u32.u16 %r8523, %rs5530; - cvt.s32.s8 %r8524, %r8523; - mad.lo.s32 %r8525, %r48, %r8524, %r8516; - mad.lo.s32 %r8526, %r49, %r8522, %r8525; - mad.lo.s32 %r8527, %r50, %r8520, %r8526; - mad.lo.s32 %r8528, %r51, %r8518, %r8527; - ld.const.v4.u8 {%rs5538, %rs5539, %rs5540, %rs5541}, [matrix+2768]; - cvt.u32.u16 %r8529, %rs5541; - cvt.s32.s8 %r8530, %r8529; - cvt.u32.u16 %r8531, %rs5540; - cvt.s32.s8 %r8532, %r8531; - cvt.u32.u16 %r8533, %rs5539; - cvt.s32.s8 %r8534, %r8533; - cvt.u32.u16 %r8535, %rs5538; - cvt.s32.s8 %r8536, %r8535; - mad.lo.s32 %r8537, %r173, %r8536, %r8528; - mad.lo.s32 %r8538, %r53, %r8534, %r8537; - mad.lo.s32 %r8539, %r54, %r8532, %r8538; - mad.lo.s32 %r8540, %r55, %r8530, %r8539; - ld.const.v4.u8 {%rs5546, %rs5547, %rs5548, %rs5549}, [matrix+2772]; - cvt.u32.u16 %r8541, %rs5549; - cvt.s32.s8 %r8542, %r8541; - cvt.u32.u16 %r8543, %rs5548; - cvt.s32.s8 %r8544, %r8543; - cvt.u32.u16 %r8545, %rs5547; - cvt.s32.s8 %r8546, %r8545; - cvt.u32.u16 %r8547, %rs5546; - cvt.s32.s8 %r8548, %r8547; - mad.lo.s32 %r8549, %r56, %r8548, %r8540; - mad.lo.s32 %r8550, %r57, %r8546, %r8549; - mad.lo.s32 %r8551, %r58, %r8544, %r8550; - mad.lo.s32 %r8552, %r59, %r8542, %r8551; - 
ld.const.v4.u8 {%rs5554, %rs5555, %rs5556, %rs5557}, [matrix+2776]; - cvt.u32.u16 %r8553, %rs5557; - cvt.s32.s8 %r8554, %r8553; - cvt.u32.u16 %r8555, %rs5556; - cvt.s32.s8 %r8556, %r8555; - cvt.u32.u16 %r8557, %rs5555; - cvt.s32.s8 %r8558, %r8557; - cvt.u32.u16 %r8559, %rs5554; - cvt.s32.s8 %r8560, %r8559; - mad.lo.s32 %r8561, %r61, %r8560, %r8552; - mad.lo.s32 %r8562, %r62, %r8558, %r8561; - mad.lo.s32 %r8563, %r64, %r8556, %r8562; - mad.lo.s32 %r8564, %r65, %r8554, %r8563; - ld.const.v4.u8 {%rs5562, %rs5563, %rs5564, %rs5565}, [matrix+2780]; - cvt.u32.u16 %r8565, %rs5565; - cvt.s32.s8 %r8566, %r8565; - cvt.u32.u16 %r8567, %rs5564; - cvt.s32.s8 %r8568, %r8567; - cvt.u32.u16 %r8569, %rs5563; - cvt.s32.s8 %r8570, %r8569; - cvt.u32.u16 %r8571, %rs5562; - cvt.s32.s8 %r8572, %r8571; - mad.lo.s32 %r8573, %r67, %r8572, %r8564; - mad.lo.s32 %r8574, %r68, %r8570, %r8573; - mad.lo.s32 %r8575, %r69, %r8568, %r8574; - mad.lo.s32 %r8576, %r70, %r8566, %r8575; - ld.const.v4.u8 {%rs5570, %rs5571, %rs5572, %rs5573}, [matrix+2784]; - cvt.u32.u16 %r8577, %rs5573; - cvt.s32.s8 %r8578, %r8577; - cvt.u32.u16 %r8579, %rs5572; - cvt.s32.s8 %r8580, %r8579; - cvt.u32.u16 %r8581, %rs5571; - cvt.s32.s8 %r8582, %r8581; - cvt.u32.u16 %r8583, %rs5570; - cvt.s32.s8 %r8584, %r8583; - mad.lo.s32 %r8585, %r222, %r8584, %r8576; - mad.lo.s32 %r8586, %r72, %r8582, %r8585; - mad.lo.s32 %r8587, %r73, %r8580, %r8586; - mad.lo.s32 %r8588, %r74, %r8578, %r8587; - ld.const.v4.u8 {%rs5578, %rs5579, %rs5580, %rs5581}, [matrix+2788]; - cvt.u32.u16 %r8589, %rs5581; - cvt.s32.s8 %r8590, %r8589; - cvt.u32.u16 %r8591, %rs5580; - cvt.s32.s8 %r8592, %r8591; - cvt.u32.u16 %r8593, %rs5579; - cvt.s32.s8 %r8594, %r8593; - cvt.u32.u16 %r8595, %rs5578; - cvt.s32.s8 %r8596, %r8595; - mad.lo.s32 %r8597, %r75, %r8596, %r8588; - mad.lo.s32 %r8598, %r76, %r8594, %r8597; - mad.lo.s32 %r8599, %r77, %r8592, %r8598; - mad.lo.s32 %r8600, %r78, %r8590, %r8599; - ld.const.v4.u8 {%rs5586, %rs5587, %rs5588, %rs5589}, [matrix+2792]; - cvt.u32.u16 %r8601, %rs5589; - cvt.s32.s8 %r8602, %r8601; - cvt.u32.u16 %r8603, %rs5588; - cvt.s32.s8 %r8604, %r8603; - cvt.u32.u16 %r8605, %rs5587; - cvt.s32.s8 %r8606, %r8605; - cvt.u32.u16 %r8607, %rs5586; - cvt.s32.s8 %r8608, %r8607; - mad.lo.s32 %r8609, %r80, %r8608, %r8600; - mad.lo.s32 %r8610, %r81, %r8606, %r8609; - mad.lo.s32 %r8611, %r83, %r8604, %r8610; - mad.lo.s32 %r8612, %r84, %r8602, %r8611; - ld.const.v4.u8 {%rs5594, %rs5595, %rs5596, %rs5597}, [matrix+2796]; - cvt.u32.u16 %r8613, %rs5597; - cvt.s32.s8 %r8614, %r8613; - cvt.u32.u16 %r8615, %rs5596; - cvt.s32.s8 %r8616, %r8615; - cvt.u32.u16 %r8617, %rs5595; - cvt.s32.s8 %r8618, %r8617; - cvt.u32.u16 %r8619, %rs5594; - cvt.s32.s8 %r8620, %r8619; - mad.lo.s32 %r8621, %r86, %r8620, %r8612; - mad.lo.s32 %r8622, %r87, %r8618, %r8621; - mad.lo.s32 %r8623, %r88, %r8616, %r8622; - mad.lo.s32 %r8624, %r89, %r8614, %r8623; - ld.const.v4.u8 {%rs5602, %rs5603, %rs5604, %rs5605}, [matrix+2800]; - cvt.u32.u16 %r8625, %rs5605; - cvt.s32.s8 %r8626, %r8625; - cvt.u32.u16 %r8627, %rs5604; - cvt.s32.s8 %r8628, %r8627; - cvt.u32.u16 %r8629, %rs5603; - cvt.s32.s8 %r8630, %r8629; - cvt.u32.u16 %r8631, %rs5602; - cvt.s32.s8 %r8632, %r8631; - mad.lo.s32 %r8633, %r271, %r8632, %r8624; - mad.lo.s32 %r8634, %r91, %r8630, %r8633; - mad.lo.s32 %r8635, %r93, %r8628, %r8634; - mad.lo.s32 %r8636, %r94, %r8626, %r8635; - ld.const.v4.u8 {%rs5610, %rs5611, %rs5612, %rs5613}, [matrix+2804]; - cvt.u32.u16 %r8637, %rs5613; - cvt.s32.s8 %r8638, %r8637; - cvt.u32.u16 %r8639, %rs5612; - cvt.s32.s8 
%r8640, %r8639; - cvt.u32.u16 %r8641, %rs5611; - cvt.s32.s8 %r8642, %r8641; - cvt.u32.u16 %r8643, %rs5610; - cvt.s32.s8 %r8644, %r8643; - mad.lo.s32 %r8645, %r96, %r8644, %r8636; - mad.lo.s32 %r8646, %r97, %r8642, %r8645; - mad.lo.s32 %r8647, %r99, %r8640, %r8646; - mad.lo.s32 %r8648, %r100, %r8638, %r8647; - ld.const.v4.u8 {%rs5618, %rs5619, %rs5620, %rs5621}, [matrix+2808]; - cvt.u32.u16 %r8649, %rs5621; - cvt.s32.s8 %r8650, %r8649; - cvt.u32.u16 %r8651, %rs5620; - cvt.s32.s8 %r8652, %r8651; - cvt.u32.u16 %r8653, %rs5619; - cvt.s32.s8 %r8654, %r8653; - cvt.u32.u16 %r8655, %rs5618; - cvt.s32.s8 %r8656, %r8655; - mad.lo.s32 %r8657, %r103, %r8656, %r8648; - mad.lo.s32 %r8658, %r104, %r8654, %r8657; - mad.lo.s32 %r8659, %r107, %r8652, %r8658; - mad.lo.s32 %r8660, %r108, %r8650, %r8659; - ld.const.v4.u8 {%rs5626, %rs5627, %rs5628, %rs5629}, [matrix+2812]; - cvt.u32.u16 %r8661, %rs5629; - cvt.s32.s8 %r8662, %r8661; - cvt.u32.u16 %r8663, %rs5628; - cvt.s32.s8 %r8664, %r8663; - cvt.u32.u16 %r8665, %rs5627; - cvt.s32.s8 %r8666, %r8665; - cvt.u32.u16 %r8667, %rs5626; - cvt.s32.s8 %r8668, %r8667; - mad.lo.s32 %r8669, %r111, %r8668, %r8660; - mad.lo.s32 %r8670, %r112, %r8666, %r8669; - mad.lo.s32 %r8671, %r114, %r8664, %r8670; - mad.lo.s32 %r8672, %r115, %r8662, %r8671; - shr.u32 %r8673, %r8480, 6; - and.b32 %r8674, %r8673, 240; - shr.u32 %r8675, %r8672, 10; - or.b32 %r8676, %r8675, %r8674; - xor.b32 %r8677, %r31, %r8676; - cvt.u64.u32 %rd397, %r8677; - ld.const.v4.u8 {%rs5634, %rs5635, %rs5636, %rs5637}, [matrix+2816]; - cvt.u32.u16 %r8678, %rs5637; - cvt.s32.s8 %r8679, %r8678; - cvt.u32.u16 %r8680, %rs5636; - cvt.s32.s8 %r8681, %r8680; - cvt.u32.u16 %r8682, %rs5634; - cvt.s32.s8 %r8683, %r8682; - cvt.u32.u16 %r8684, %rs5635; - cvt.s32.s8 %r8685, %r8684; - mul.lo.s32 %r8686, %r34, %r8685; - mad.lo.s32 %r8687, %r124, %r8683, %r8686; - mad.lo.s32 %r8688, %r35, %r8681, %r8687; - mad.lo.s32 %r8689, %r36, %r8679, %r8688; - ld.const.v4.u8 {%rs5642, %rs5643, %rs5644, %rs5645}, [matrix+2820]; - cvt.u32.u16 %r8690, %rs5645; - cvt.s32.s8 %r8691, %r8690; - cvt.u32.u16 %r8692, %rs5644; - cvt.s32.s8 %r8693, %r8692; - cvt.u32.u16 %r8694, %rs5643; - cvt.s32.s8 %r8695, %r8694; - cvt.u32.u16 %r8696, %rs5642; - cvt.s32.s8 %r8697, %r8696; - mad.lo.s32 %r8698, %r37, %r8697, %r8689; - mad.lo.s32 %r8699, %r38, %r8695, %r8698; - mad.lo.s32 %r8700, %r39, %r8693, %r8699; - mad.lo.s32 %r8701, %r40, %r8691, %r8700; - ld.const.v4.u8 {%rs5650, %rs5651, %rs5652, %rs5653}, [matrix+2824]; - cvt.u32.u16 %r8702, %rs5653; - cvt.s32.s8 %r8703, %r8702; - cvt.u32.u16 %r8704, %rs5652; - cvt.s32.s8 %r8705, %r8704; - cvt.u32.u16 %r8706, %rs5651; - cvt.s32.s8 %r8707, %r8706; - cvt.u32.u16 %r8708, %rs5650; - cvt.s32.s8 %r8709, %r8708; - mad.lo.s32 %r8710, %r42, %r8709, %r8701; - mad.lo.s32 %r8711, %r43, %r8707, %r8710; - mad.lo.s32 %r8712, %r45, %r8705, %r8711; - mad.lo.s32 %r8713, %r46, %r8703, %r8712; - ld.const.v4.u8 {%rs5658, %rs5659, %rs5660, %rs5661}, [matrix+2828]; - cvt.u32.u16 %r8714, %rs5661; - cvt.s32.s8 %r8715, %r8714; - cvt.u32.u16 %r8716, %rs5660; - cvt.s32.s8 %r8717, %r8716; - cvt.u32.u16 %r8718, %rs5659; - cvt.s32.s8 %r8719, %r8718; - cvt.u32.u16 %r8720, %rs5658; - cvt.s32.s8 %r8721, %r8720; - mad.lo.s32 %r8722, %r48, %r8721, %r8713; - mad.lo.s32 %r8723, %r49, %r8719, %r8722; - mad.lo.s32 %r8724, %r50, %r8717, %r8723; - mad.lo.s32 %r8725, %r51, %r8715, %r8724; - ld.const.v4.u8 {%rs5666, %rs5667, %rs5668, %rs5669}, [matrix+2832]; - cvt.u32.u16 %r8726, %rs5669; - cvt.s32.s8 %r8727, %r8726; - cvt.u32.u16 %r8728, %rs5668; - 
cvt.s32.s8 %r8729, %r8728; - cvt.u32.u16 %r8730, %rs5667; - cvt.s32.s8 %r8731, %r8730; - cvt.u32.u16 %r8732, %rs5666; - cvt.s32.s8 %r8733, %r8732; - mad.lo.s32 %r8734, %r173, %r8733, %r8725; - mad.lo.s32 %r8735, %r53, %r8731, %r8734; - mad.lo.s32 %r8736, %r54, %r8729, %r8735; - mad.lo.s32 %r8737, %r55, %r8727, %r8736; - ld.const.v4.u8 {%rs5674, %rs5675, %rs5676, %rs5677}, [matrix+2836]; - cvt.u32.u16 %r8738, %rs5677; - cvt.s32.s8 %r8739, %r8738; - cvt.u32.u16 %r8740, %rs5676; - cvt.s32.s8 %r8741, %r8740; - cvt.u32.u16 %r8742, %rs5675; - cvt.s32.s8 %r8743, %r8742; - cvt.u32.u16 %r8744, %rs5674; - cvt.s32.s8 %r8745, %r8744; - mad.lo.s32 %r8746, %r56, %r8745, %r8737; - mad.lo.s32 %r8747, %r57, %r8743, %r8746; - mad.lo.s32 %r8748, %r58, %r8741, %r8747; - mad.lo.s32 %r8749, %r59, %r8739, %r8748; - ld.const.v4.u8 {%rs5682, %rs5683, %rs5684, %rs5685}, [matrix+2840]; - cvt.u32.u16 %r8750, %rs5685; - cvt.s32.s8 %r8751, %r8750; - cvt.u32.u16 %r8752, %rs5684; - cvt.s32.s8 %r8753, %r8752; - cvt.u32.u16 %r8754, %rs5683; - cvt.s32.s8 %r8755, %r8754; - cvt.u32.u16 %r8756, %rs5682; - cvt.s32.s8 %r8757, %r8756; - mad.lo.s32 %r8758, %r61, %r8757, %r8749; - mad.lo.s32 %r8759, %r62, %r8755, %r8758; - mad.lo.s32 %r8760, %r64, %r8753, %r8759; - mad.lo.s32 %r8761, %r65, %r8751, %r8760; - ld.const.v4.u8 {%rs5690, %rs5691, %rs5692, %rs5693}, [matrix+2844]; - cvt.u32.u16 %r8762, %rs5693; - cvt.s32.s8 %r8763, %r8762; - cvt.u32.u16 %r8764, %rs5692; - cvt.s32.s8 %r8765, %r8764; - cvt.u32.u16 %r8766, %rs5691; - cvt.s32.s8 %r8767, %r8766; - cvt.u32.u16 %r8768, %rs5690; - cvt.s32.s8 %r8769, %r8768; - mad.lo.s32 %r8770, %r67, %r8769, %r8761; - mad.lo.s32 %r8771, %r68, %r8767, %r8770; - mad.lo.s32 %r8772, %r69, %r8765, %r8771; - mad.lo.s32 %r8773, %r70, %r8763, %r8772; - ld.const.v4.u8 {%rs5698, %rs5699, %rs5700, %rs5701}, [matrix+2848]; - cvt.u32.u16 %r8774, %rs5701; - cvt.s32.s8 %r8775, %r8774; - cvt.u32.u16 %r8776, %rs5700; - cvt.s32.s8 %r8777, %r8776; - cvt.u32.u16 %r8778, %rs5699; - cvt.s32.s8 %r8779, %r8778; - cvt.u32.u16 %r8780, %rs5698; - cvt.s32.s8 %r8781, %r8780; - mad.lo.s32 %r8782, %r222, %r8781, %r8773; - mad.lo.s32 %r8783, %r72, %r8779, %r8782; - mad.lo.s32 %r8784, %r73, %r8777, %r8783; - mad.lo.s32 %r8785, %r74, %r8775, %r8784; - ld.const.v4.u8 {%rs5706, %rs5707, %rs5708, %rs5709}, [matrix+2852]; - cvt.u32.u16 %r8786, %rs5709; - cvt.s32.s8 %r8787, %r8786; - cvt.u32.u16 %r8788, %rs5708; - cvt.s32.s8 %r8789, %r8788; - cvt.u32.u16 %r8790, %rs5707; - cvt.s32.s8 %r8791, %r8790; - cvt.u32.u16 %r8792, %rs5706; - cvt.s32.s8 %r8793, %r8792; - mad.lo.s32 %r8794, %r75, %r8793, %r8785; - mad.lo.s32 %r8795, %r76, %r8791, %r8794; - mad.lo.s32 %r8796, %r77, %r8789, %r8795; - mad.lo.s32 %r8797, %r78, %r8787, %r8796; - ld.const.v4.u8 {%rs5714, %rs5715, %rs5716, %rs5717}, [matrix+2856]; - cvt.u32.u16 %r8798, %rs5717; - cvt.s32.s8 %r8799, %r8798; - cvt.u32.u16 %r8800, %rs5716; - cvt.s32.s8 %r8801, %r8800; - cvt.u32.u16 %r8802, %rs5715; - cvt.s32.s8 %r8803, %r8802; - cvt.u32.u16 %r8804, %rs5714; - cvt.s32.s8 %r8805, %r8804; - mad.lo.s32 %r8806, %r80, %r8805, %r8797; - mad.lo.s32 %r8807, %r81, %r8803, %r8806; - mad.lo.s32 %r8808, %r83, %r8801, %r8807; - mad.lo.s32 %r8809, %r84, %r8799, %r8808; - ld.const.v4.u8 {%rs5722, %rs5723, %rs5724, %rs5725}, [matrix+2860]; - cvt.u32.u16 %r8810, %rs5725; - cvt.s32.s8 %r8811, %r8810; - cvt.u32.u16 %r8812, %rs5724; - cvt.s32.s8 %r8813, %r8812; - cvt.u32.u16 %r8814, %rs5723; - cvt.s32.s8 %r8815, %r8814; - cvt.u32.u16 %r8816, %rs5722; - cvt.s32.s8 %r8817, %r8816; - mad.lo.s32 %r8818, %r86, 
%r8817, %r8809; - mad.lo.s32 %r8819, %r87, %r8815, %r8818; - mad.lo.s32 %r8820, %r88, %r8813, %r8819; - mad.lo.s32 %r8821, %r89, %r8811, %r8820; - ld.const.v4.u8 {%rs5730, %rs5731, %rs5732, %rs5733}, [matrix+2864]; - cvt.u32.u16 %r8822, %rs5733; - cvt.s32.s8 %r8823, %r8822; - cvt.u32.u16 %r8824, %rs5732; - cvt.s32.s8 %r8825, %r8824; - cvt.u32.u16 %r8826, %rs5731; - cvt.s32.s8 %r8827, %r8826; - cvt.u32.u16 %r8828, %rs5730; - cvt.s32.s8 %r8829, %r8828; - mad.lo.s32 %r8830, %r271, %r8829, %r8821; - mad.lo.s32 %r8831, %r91, %r8827, %r8830; - mad.lo.s32 %r8832, %r93, %r8825, %r8831; - mad.lo.s32 %r8833, %r94, %r8823, %r8832; - ld.const.v4.u8 {%rs5738, %rs5739, %rs5740, %rs5741}, [matrix+2868]; - cvt.u32.u16 %r8834, %rs5741; - cvt.s32.s8 %r8835, %r8834; - cvt.u32.u16 %r8836, %rs5740; - cvt.s32.s8 %r8837, %r8836; - cvt.u32.u16 %r8838, %rs5739; - cvt.s32.s8 %r8839, %r8838; - cvt.u32.u16 %r8840, %rs5738; - cvt.s32.s8 %r8841, %r8840; - mad.lo.s32 %r8842, %r96, %r8841, %r8833; - mad.lo.s32 %r8843, %r97, %r8839, %r8842; - mad.lo.s32 %r8844, %r99, %r8837, %r8843; - mad.lo.s32 %r8845, %r100, %r8835, %r8844; - ld.const.v4.u8 {%rs5746, %rs5747, %rs5748, %rs5749}, [matrix+2872]; - cvt.u32.u16 %r8846, %rs5749; - cvt.s32.s8 %r8847, %r8846; - cvt.u32.u16 %r8848, %rs5748; - cvt.s32.s8 %r8849, %r8848; - cvt.u32.u16 %r8850, %rs5747; - cvt.s32.s8 %r8851, %r8850; - cvt.u32.u16 %r8852, %rs5746; - cvt.s32.s8 %r8853, %r8852; - mad.lo.s32 %r8854, %r103, %r8853, %r8845; - mad.lo.s32 %r8855, %r104, %r8851, %r8854; - mad.lo.s32 %r8856, %r107, %r8849, %r8855; - mad.lo.s32 %r8857, %r108, %r8847, %r8856; - ld.const.v4.u8 {%rs5754, %rs5755, %rs5756, %rs5757}, [matrix+2876]; - cvt.u32.u16 %r8858, %rs5757; - cvt.s32.s8 %r8859, %r8858; - cvt.u32.u16 %r8860, %rs5756; - cvt.s32.s8 %r8861, %r8860; - cvt.u32.u16 %r8862, %rs5755; - cvt.s32.s8 %r8863, %r8862; - cvt.u32.u16 %r8864, %rs5754; - cvt.s32.s8 %r8865, %r8864; - mad.lo.s32 %r8866, %r111, %r8865, %r8857; - mad.lo.s32 %r8867, %r112, %r8863, %r8866; - mad.lo.s32 %r8868, %r114, %r8861, %r8867; - mad.lo.s32 %r8869, %r115, %r8859, %r8868; - ld.const.v4.u8 {%rs5762, %rs5763, %rs5764, %rs5765}, [matrix+2880]; - cvt.u32.u16 %r8870, %rs5765; - cvt.s32.s8 %r8871, %r8870; - cvt.u32.u16 %r8872, %rs5764; - cvt.s32.s8 %r8873, %r8872; - cvt.u32.u16 %r8874, %rs5762; - cvt.s32.s8 %r8875, %r8874; - cvt.u32.u16 %r8876, %rs5763; - cvt.s32.s8 %r8877, %r8876; - mul.lo.s32 %r8878, %r34, %r8877; - mad.lo.s32 %r8879, %r124, %r8875, %r8878; - mad.lo.s32 %r8880, %r35, %r8873, %r8879; - mad.lo.s32 %r8881, %r36, %r8871, %r8880; - ld.const.v4.u8 {%rs5770, %rs5771, %rs5772, %rs5773}, [matrix+2884]; - cvt.u32.u16 %r8882, %rs5773; - cvt.s32.s8 %r8883, %r8882; - cvt.u32.u16 %r8884, %rs5772; - cvt.s32.s8 %r8885, %r8884; - cvt.u32.u16 %r8886, %rs5771; - cvt.s32.s8 %r8887, %r8886; - cvt.u32.u16 %r8888, %rs5770; - cvt.s32.s8 %r8889, %r8888; - mad.lo.s32 %r8890, %r37, %r8889, %r8881; - mad.lo.s32 %r8891, %r38, %r8887, %r8890; - mad.lo.s32 %r8892, %r39, %r8885, %r8891; - mad.lo.s32 %r8893, %r40, %r8883, %r8892; - ld.const.v4.u8 {%rs5778, %rs5779, %rs5780, %rs5781}, [matrix+2888]; - cvt.u32.u16 %r8894, %rs5781; - cvt.s32.s8 %r8895, %r8894; - cvt.u32.u16 %r8896, %rs5780; - cvt.s32.s8 %r8897, %r8896; - cvt.u32.u16 %r8898, %rs5779; - cvt.s32.s8 %r8899, %r8898; - cvt.u32.u16 %r8900, %rs5778; - cvt.s32.s8 %r8901, %r8900; - mad.lo.s32 %r8902, %r42, %r8901, %r8893; - mad.lo.s32 %r8903, %r43, %r8899, %r8902; - mad.lo.s32 %r8904, %r45, %r8897, %r8903; - mad.lo.s32 %r8905, %r46, %r8895, %r8904; - ld.const.v4.u8 {%rs5786, 
%rs5787, %rs5788, %rs5789}, [matrix+2892]; - cvt.u32.u16 %r8906, %rs5789; - cvt.s32.s8 %r8907, %r8906; - cvt.u32.u16 %r8908, %rs5788; - cvt.s32.s8 %r8909, %r8908; - cvt.u32.u16 %r8910, %rs5787; - cvt.s32.s8 %r8911, %r8910; - cvt.u32.u16 %r8912, %rs5786; - cvt.s32.s8 %r8913, %r8912; - mad.lo.s32 %r8914, %r48, %r8913, %r8905; - mad.lo.s32 %r8915, %r49, %r8911, %r8914; - mad.lo.s32 %r8916, %r50, %r8909, %r8915; - mad.lo.s32 %r8917, %r51, %r8907, %r8916; - ld.const.v4.u8 {%rs5794, %rs5795, %rs5796, %rs5797}, [matrix+2896]; - cvt.u32.u16 %r8918, %rs5797; - cvt.s32.s8 %r8919, %r8918; - cvt.u32.u16 %r8920, %rs5796; - cvt.s32.s8 %r8921, %r8920; - cvt.u32.u16 %r8922, %rs5795; - cvt.s32.s8 %r8923, %r8922; - cvt.u32.u16 %r8924, %rs5794; - cvt.s32.s8 %r8925, %r8924; - mad.lo.s32 %r8926, %r173, %r8925, %r8917; - mad.lo.s32 %r8927, %r53, %r8923, %r8926; - mad.lo.s32 %r8928, %r54, %r8921, %r8927; - mad.lo.s32 %r8929, %r55, %r8919, %r8928; - ld.const.v4.u8 {%rs5802, %rs5803, %rs5804, %rs5805}, [matrix+2900]; - cvt.u32.u16 %r8930, %rs5805; - cvt.s32.s8 %r8931, %r8930; - cvt.u32.u16 %r8932, %rs5804; - cvt.s32.s8 %r8933, %r8932; - cvt.u32.u16 %r8934, %rs5803; - cvt.s32.s8 %r8935, %r8934; - cvt.u32.u16 %r8936, %rs5802; - cvt.s32.s8 %r8937, %r8936; - mad.lo.s32 %r8938, %r56, %r8937, %r8929; - mad.lo.s32 %r8939, %r57, %r8935, %r8938; - mad.lo.s32 %r8940, %r58, %r8933, %r8939; - mad.lo.s32 %r8941, %r59, %r8931, %r8940; - ld.const.v4.u8 {%rs5810, %rs5811, %rs5812, %rs5813}, [matrix+2904]; - cvt.u32.u16 %r8942, %rs5813; - cvt.s32.s8 %r8943, %r8942; - cvt.u32.u16 %r8944, %rs5812; - cvt.s32.s8 %r8945, %r8944; - cvt.u32.u16 %r8946, %rs5811; - cvt.s32.s8 %r8947, %r8946; - cvt.u32.u16 %r8948, %rs5810; - cvt.s32.s8 %r8949, %r8948; - mad.lo.s32 %r8950, %r61, %r8949, %r8941; - mad.lo.s32 %r8951, %r62, %r8947, %r8950; - mad.lo.s32 %r8952, %r64, %r8945, %r8951; - mad.lo.s32 %r8953, %r65, %r8943, %r8952; - ld.const.v4.u8 {%rs5818, %rs5819, %rs5820, %rs5821}, [matrix+2908]; - cvt.u32.u16 %r8954, %rs5821; - cvt.s32.s8 %r8955, %r8954; - cvt.u32.u16 %r8956, %rs5820; - cvt.s32.s8 %r8957, %r8956; - cvt.u32.u16 %r8958, %rs5819; - cvt.s32.s8 %r8959, %r8958; - cvt.u32.u16 %r8960, %rs5818; - cvt.s32.s8 %r8961, %r8960; - mad.lo.s32 %r8962, %r67, %r8961, %r8953; - mad.lo.s32 %r8963, %r68, %r8959, %r8962; - mad.lo.s32 %r8964, %r69, %r8957, %r8963; - mad.lo.s32 %r8965, %r70, %r8955, %r8964; - ld.const.v4.u8 {%rs5826, %rs5827, %rs5828, %rs5829}, [matrix+2912]; - cvt.u32.u16 %r8966, %rs5829; - cvt.s32.s8 %r8967, %r8966; - cvt.u32.u16 %r8968, %rs5828; - cvt.s32.s8 %r8969, %r8968; - cvt.u32.u16 %r8970, %rs5827; - cvt.s32.s8 %r8971, %r8970; - cvt.u32.u16 %r8972, %rs5826; - cvt.s32.s8 %r8973, %r8972; - mad.lo.s32 %r8974, %r222, %r8973, %r8965; - mad.lo.s32 %r8975, %r72, %r8971, %r8974; - mad.lo.s32 %r8976, %r73, %r8969, %r8975; - mad.lo.s32 %r8977, %r74, %r8967, %r8976; - ld.const.v4.u8 {%rs5834, %rs5835, %rs5836, %rs5837}, [matrix+2916]; - cvt.u32.u16 %r8978, %rs5837; - cvt.s32.s8 %r8979, %r8978; - cvt.u32.u16 %r8980, %rs5836; - cvt.s32.s8 %r8981, %r8980; - cvt.u32.u16 %r8982, %rs5835; - cvt.s32.s8 %r8983, %r8982; - cvt.u32.u16 %r8984, %rs5834; - cvt.s32.s8 %r8985, %r8984; - mad.lo.s32 %r8986, %r75, %r8985, %r8977; - mad.lo.s32 %r8987, %r76, %r8983, %r8986; - mad.lo.s32 %r8988, %r77, %r8981, %r8987; - mad.lo.s32 %r8989, %r78, %r8979, %r8988; - ld.const.v4.u8 {%rs5842, %rs5843, %rs5844, %rs5845}, [matrix+2920]; - cvt.u32.u16 %r8990, %rs5845; - cvt.s32.s8 %r8991, %r8990; - cvt.u32.u16 %r8992, %rs5844; - cvt.s32.s8 %r8993, %r8992; - cvt.u32.u16 
%r8994, %rs5843; - cvt.s32.s8 %r8995, %r8994; - cvt.u32.u16 %r8996, %rs5842; - cvt.s32.s8 %r8997, %r8996; - mad.lo.s32 %r8998, %r80, %r8997, %r8989; - mad.lo.s32 %r8999, %r81, %r8995, %r8998; - mad.lo.s32 %r9000, %r83, %r8993, %r8999; - mad.lo.s32 %r9001, %r84, %r8991, %r9000; - ld.const.v4.u8 {%rs5850, %rs5851, %rs5852, %rs5853}, [matrix+2924]; - cvt.u32.u16 %r9002, %rs5853; - cvt.s32.s8 %r9003, %r9002; - cvt.u32.u16 %r9004, %rs5852; - cvt.s32.s8 %r9005, %r9004; - cvt.u32.u16 %r9006, %rs5851; - cvt.s32.s8 %r9007, %r9006; - cvt.u32.u16 %r9008, %rs5850; - cvt.s32.s8 %r9009, %r9008; - mad.lo.s32 %r9010, %r86, %r9009, %r9001; - mad.lo.s32 %r9011, %r87, %r9007, %r9010; - mad.lo.s32 %r9012, %r88, %r9005, %r9011; - mad.lo.s32 %r9013, %r89, %r9003, %r9012; - ld.const.v4.u8 {%rs5858, %rs5859, %rs5860, %rs5861}, [matrix+2928]; - cvt.u32.u16 %r9014, %rs5861; - cvt.s32.s8 %r9015, %r9014; - cvt.u32.u16 %r9016, %rs5860; - cvt.s32.s8 %r9017, %r9016; - cvt.u32.u16 %r9018, %rs5859; - cvt.s32.s8 %r9019, %r9018; - cvt.u32.u16 %r9020, %rs5858; - cvt.s32.s8 %r9021, %r9020; - mad.lo.s32 %r9022, %r271, %r9021, %r9013; - mad.lo.s32 %r9023, %r91, %r9019, %r9022; - mad.lo.s32 %r9024, %r93, %r9017, %r9023; - mad.lo.s32 %r9025, %r94, %r9015, %r9024; - ld.const.v4.u8 {%rs5866, %rs5867, %rs5868, %rs5869}, [matrix+2932]; - cvt.u32.u16 %r9026, %rs5869; - cvt.s32.s8 %r9027, %r9026; - cvt.u32.u16 %r9028, %rs5868; - cvt.s32.s8 %r9029, %r9028; - cvt.u32.u16 %r9030, %rs5867; - cvt.s32.s8 %r9031, %r9030; - cvt.u32.u16 %r9032, %rs5866; - cvt.s32.s8 %r9033, %r9032; - mad.lo.s32 %r9034, %r96, %r9033, %r9025; - mad.lo.s32 %r9035, %r97, %r9031, %r9034; - mad.lo.s32 %r9036, %r99, %r9029, %r9035; - mad.lo.s32 %r9037, %r100, %r9027, %r9036; - ld.const.v4.u8 {%rs5874, %rs5875, %rs5876, %rs5877}, [matrix+2936]; - cvt.u32.u16 %r9038, %rs5877; - cvt.s32.s8 %r9039, %r9038; - cvt.u32.u16 %r9040, %rs5876; - cvt.s32.s8 %r9041, %r9040; - cvt.u32.u16 %r9042, %rs5875; - cvt.s32.s8 %r9043, %r9042; - cvt.u32.u16 %r9044, %rs5874; - cvt.s32.s8 %r9045, %r9044; - mad.lo.s32 %r9046, %r103, %r9045, %r9037; - mad.lo.s32 %r9047, %r104, %r9043, %r9046; - mad.lo.s32 %r9048, %r107, %r9041, %r9047; - mad.lo.s32 %r9049, %r108, %r9039, %r9048; - ld.const.v4.u8 {%rs5882, %rs5883, %rs5884, %rs5885}, [matrix+2940]; - cvt.u32.u16 %r9050, %rs5885; - cvt.s32.s8 %r9051, %r9050; - cvt.u32.u16 %r9052, %rs5884; - cvt.s32.s8 %r9053, %r9052; - cvt.u32.u16 %r9054, %rs5883; - cvt.s32.s8 %r9055, %r9054; - cvt.u32.u16 %r9056, %rs5882; - cvt.s32.s8 %r9057, %r9056; - mad.lo.s32 %r9058, %r111, %r9057, %r9049; - mad.lo.s32 %r9059, %r112, %r9055, %r9058; - mad.lo.s32 %r9060, %r114, %r9053, %r9059; - mad.lo.s32 %r9061, %r115, %r9051, %r9060; - shr.u32 %r9062, %r8869, 6; - and.b32 %r9063, %r9062, 240; - shr.u32 %r9064, %r9061, 10; - or.b32 %r9065, %r9064, %r9063; - xor.b32 %r9066, %r32, %r9065; - cvt.u64.u32 %rd398, %r9066; - ld.const.v4.u8 {%rs5890, %rs5891, %rs5892, %rs5893}, [matrix+2944]; - cvt.u32.u16 %r9067, %rs5893; - cvt.s32.s8 %r9068, %r9067; - cvt.u32.u16 %r9069, %rs5892; - cvt.s32.s8 %r9070, %r9069; - cvt.u32.u16 %r9071, %rs5890; - cvt.s32.s8 %r9072, %r9071; - cvt.u32.u16 %r9073, %rs5891; - cvt.s32.s8 %r9074, %r9073; - mul.lo.s32 %r9075, %r34, %r9074; - mad.lo.s32 %r9076, %r124, %r9072, %r9075; - mad.lo.s32 %r9077, %r35, %r9070, %r9076; - mad.lo.s32 %r9078, %r36, %r9068, %r9077; - ld.const.v4.u8 {%rs5898, %rs5899, %rs5900, %rs5901}, [matrix+2948]; - cvt.u32.u16 %r9079, %rs5901; - cvt.s32.s8 %r9080, %r9079; - cvt.u32.u16 %r9081, %rs5900; - cvt.s32.s8 %r9082, %r9081; - 
cvt.u32.u16 %r9083, %rs5899; - cvt.s32.s8 %r9084, %r9083; - cvt.u32.u16 %r9085, %rs5898; - cvt.s32.s8 %r9086, %r9085; - mad.lo.s32 %r9087, %r37, %r9086, %r9078; - mad.lo.s32 %r9088, %r38, %r9084, %r9087; - mad.lo.s32 %r9089, %r39, %r9082, %r9088; - mad.lo.s32 %r9090, %r40, %r9080, %r9089; - ld.const.v4.u8 {%rs5906, %rs5907, %rs5908, %rs5909}, [matrix+2952]; - cvt.u32.u16 %r9091, %rs5909; - cvt.s32.s8 %r9092, %r9091; - cvt.u32.u16 %r9093, %rs5908; - cvt.s32.s8 %r9094, %r9093; - cvt.u32.u16 %r9095, %rs5907; - cvt.s32.s8 %r9096, %r9095; - cvt.u32.u16 %r9097, %rs5906; - cvt.s32.s8 %r9098, %r9097; - mad.lo.s32 %r9099, %r42, %r9098, %r9090; - mad.lo.s32 %r9100, %r43, %r9096, %r9099; - mad.lo.s32 %r9101, %r45, %r9094, %r9100; - mad.lo.s32 %r9102, %r46, %r9092, %r9101; - ld.const.v4.u8 {%rs5914, %rs5915, %rs5916, %rs5917}, [matrix+2956]; - cvt.u32.u16 %r9103, %rs5917; - cvt.s32.s8 %r9104, %r9103; - cvt.u32.u16 %r9105, %rs5916; - cvt.s32.s8 %r9106, %r9105; - cvt.u32.u16 %r9107, %rs5915; - cvt.s32.s8 %r9108, %r9107; - cvt.u32.u16 %r9109, %rs5914; - cvt.s32.s8 %r9110, %r9109; - mad.lo.s32 %r9111, %r48, %r9110, %r9102; - mad.lo.s32 %r9112, %r49, %r9108, %r9111; - mad.lo.s32 %r9113, %r50, %r9106, %r9112; - mad.lo.s32 %r9114, %r51, %r9104, %r9113; - ld.const.v4.u8 {%rs5922, %rs5923, %rs5924, %rs5925}, [matrix+2960]; - cvt.u32.u16 %r9115, %rs5925; - cvt.s32.s8 %r9116, %r9115; - cvt.u32.u16 %r9117, %rs5924; - cvt.s32.s8 %r9118, %r9117; - cvt.u32.u16 %r9119, %rs5923; - cvt.s32.s8 %r9120, %r9119; - cvt.u32.u16 %r9121, %rs5922; - cvt.s32.s8 %r9122, %r9121; - mad.lo.s32 %r9123, %r173, %r9122, %r9114; - mad.lo.s32 %r9124, %r53, %r9120, %r9123; - mad.lo.s32 %r9125, %r54, %r9118, %r9124; - mad.lo.s32 %r9126, %r55, %r9116, %r9125; - ld.const.v4.u8 {%rs5930, %rs5931, %rs5932, %rs5933}, [matrix+2964]; - cvt.u32.u16 %r9127, %rs5933; - cvt.s32.s8 %r9128, %r9127; - cvt.u32.u16 %r9129, %rs5932; - cvt.s32.s8 %r9130, %r9129; - cvt.u32.u16 %r9131, %rs5931; - cvt.s32.s8 %r9132, %r9131; - cvt.u32.u16 %r9133, %rs5930; - cvt.s32.s8 %r9134, %r9133; - mad.lo.s32 %r9135, %r56, %r9134, %r9126; - mad.lo.s32 %r9136, %r57, %r9132, %r9135; - mad.lo.s32 %r9137, %r58, %r9130, %r9136; - mad.lo.s32 %r9138, %r59, %r9128, %r9137; - ld.const.v4.u8 {%rs5938, %rs5939, %rs5940, %rs5941}, [matrix+2968]; - cvt.u32.u16 %r9139, %rs5941; - cvt.s32.s8 %r9140, %r9139; - cvt.u32.u16 %r9141, %rs5940; - cvt.s32.s8 %r9142, %r9141; - cvt.u32.u16 %r9143, %rs5939; - cvt.s32.s8 %r9144, %r9143; - cvt.u32.u16 %r9145, %rs5938; - cvt.s32.s8 %r9146, %r9145; - mad.lo.s32 %r9147, %r61, %r9146, %r9138; - mad.lo.s32 %r9148, %r62, %r9144, %r9147; - mad.lo.s32 %r9149, %r64, %r9142, %r9148; - mad.lo.s32 %r9150, %r65, %r9140, %r9149; - ld.const.v4.u8 {%rs5946, %rs5947, %rs5948, %rs5949}, [matrix+2972]; - cvt.u32.u16 %r9151, %rs5949; - cvt.s32.s8 %r9152, %r9151; - cvt.u32.u16 %r9153, %rs5948; - cvt.s32.s8 %r9154, %r9153; - cvt.u32.u16 %r9155, %rs5947; - cvt.s32.s8 %r9156, %r9155; - cvt.u32.u16 %r9157, %rs5946; - cvt.s32.s8 %r9158, %r9157; - mad.lo.s32 %r9159, %r67, %r9158, %r9150; - mad.lo.s32 %r9160, %r68, %r9156, %r9159; - mad.lo.s32 %r9161, %r69, %r9154, %r9160; - mad.lo.s32 %r9162, %r70, %r9152, %r9161; - ld.const.v4.u8 {%rs5954, %rs5955, %rs5956, %rs5957}, [matrix+2976]; - cvt.u32.u16 %r9163, %rs5957; - cvt.s32.s8 %r9164, %r9163; - cvt.u32.u16 %r9165, %rs5956; - cvt.s32.s8 %r9166, %r9165; - cvt.u32.u16 %r9167, %rs5955; - cvt.s32.s8 %r9168, %r9167; - cvt.u32.u16 %r9169, %rs5954; - cvt.s32.s8 %r9170, %r9169; - mad.lo.s32 %r9171, %r222, %r9170, %r9162; - mad.lo.s32 
%r9172, %r72, %r9168, %r9171; - mad.lo.s32 %r9173, %r73, %r9166, %r9172; - mad.lo.s32 %r9174, %r74, %r9164, %r9173; - ld.const.v4.u8 {%rs5962, %rs5963, %rs5964, %rs5965}, [matrix+2980]; - cvt.u32.u16 %r9175, %rs5965; - cvt.s32.s8 %r9176, %r9175; - cvt.u32.u16 %r9177, %rs5964; - cvt.s32.s8 %r9178, %r9177; - cvt.u32.u16 %r9179, %rs5963; - cvt.s32.s8 %r9180, %r9179; - cvt.u32.u16 %r9181, %rs5962; - cvt.s32.s8 %r9182, %r9181; - mad.lo.s32 %r9183, %r75, %r9182, %r9174; - mad.lo.s32 %r9184, %r76, %r9180, %r9183; - mad.lo.s32 %r9185, %r77, %r9178, %r9184; - mad.lo.s32 %r9186, %r78, %r9176, %r9185; - ld.const.v4.u8 {%rs5970, %rs5971, %rs5972, %rs5973}, [matrix+2984]; - cvt.u32.u16 %r9187, %rs5973; - cvt.s32.s8 %r9188, %r9187; - cvt.u32.u16 %r9189, %rs5972; - cvt.s32.s8 %r9190, %r9189; - cvt.u32.u16 %r9191, %rs5971; - cvt.s32.s8 %r9192, %r9191; - cvt.u32.u16 %r9193, %rs5970; - cvt.s32.s8 %r9194, %r9193; - mad.lo.s32 %r9195, %r80, %r9194, %r9186; - mad.lo.s32 %r9196, %r81, %r9192, %r9195; - mad.lo.s32 %r9197, %r83, %r9190, %r9196; - mad.lo.s32 %r9198, %r84, %r9188, %r9197; - ld.const.v4.u8 {%rs5978, %rs5979, %rs5980, %rs5981}, [matrix+2988]; - cvt.u32.u16 %r9199, %rs5981; - cvt.s32.s8 %r9200, %r9199; - cvt.u32.u16 %r9201, %rs5980; - cvt.s32.s8 %r9202, %r9201; - cvt.u32.u16 %r9203, %rs5979; - cvt.s32.s8 %r9204, %r9203; - cvt.u32.u16 %r9205, %rs5978; - cvt.s32.s8 %r9206, %r9205; - mad.lo.s32 %r9207, %r86, %r9206, %r9198; - mad.lo.s32 %r9208, %r87, %r9204, %r9207; - mad.lo.s32 %r9209, %r88, %r9202, %r9208; - mad.lo.s32 %r9210, %r89, %r9200, %r9209; - ld.const.v4.u8 {%rs5986, %rs5987, %rs5988, %rs5989}, [matrix+2992]; - cvt.u32.u16 %r9211, %rs5989; - cvt.s32.s8 %r9212, %r9211; - cvt.u32.u16 %r9213, %rs5988; - cvt.s32.s8 %r9214, %r9213; - cvt.u32.u16 %r9215, %rs5987; - cvt.s32.s8 %r9216, %r9215; - cvt.u32.u16 %r9217, %rs5986; - cvt.s32.s8 %r9218, %r9217; - mad.lo.s32 %r9219, %r271, %r9218, %r9210; - mad.lo.s32 %r9220, %r91, %r9216, %r9219; - mad.lo.s32 %r9221, %r93, %r9214, %r9220; - mad.lo.s32 %r9222, %r94, %r9212, %r9221; - ld.const.v4.u8 {%rs5994, %rs5995, %rs5996, %rs5997}, [matrix+2996]; - cvt.u32.u16 %r9223, %rs5997; - cvt.s32.s8 %r9224, %r9223; - cvt.u32.u16 %r9225, %rs5996; - cvt.s32.s8 %r9226, %r9225; - cvt.u32.u16 %r9227, %rs5995; - cvt.s32.s8 %r9228, %r9227; - cvt.u32.u16 %r9229, %rs5994; - cvt.s32.s8 %r9230, %r9229; - mad.lo.s32 %r9231, %r96, %r9230, %r9222; - mad.lo.s32 %r9232, %r97, %r9228, %r9231; - mad.lo.s32 %r9233, %r99, %r9226, %r9232; - mad.lo.s32 %r9234, %r100, %r9224, %r9233; - ld.const.v4.u8 {%rs6002, %rs6003, %rs6004, %rs6005}, [matrix+3000]; - cvt.u32.u16 %r9235, %rs6005; - cvt.s32.s8 %r9236, %r9235; - cvt.u32.u16 %r9237, %rs6004; - cvt.s32.s8 %r9238, %r9237; - cvt.u32.u16 %r9239, %rs6003; - cvt.s32.s8 %r9240, %r9239; - cvt.u32.u16 %r9241, %rs6002; - cvt.s32.s8 %r9242, %r9241; - mad.lo.s32 %r9243, %r103, %r9242, %r9234; - mad.lo.s32 %r9244, %r104, %r9240, %r9243; - mad.lo.s32 %r9245, %r107, %r9238, %r9244; - mad.lo.s32 %r9246, %r108, %r9236, %r9245; - ld.const.v4.u8 {%rs6010, %rs6011, %rs6012, %rs6013}, [matrix+3004]; - cvt.u32.u16 %r9247, %rs6013; - cvt.s32.s8 %r9248, %r9247; - cvt.u32.u16 %r9249, %rs6012; - cvt.s32.s8 %r9250, %r9249; - cvt.u32.u16 %r9251, %rs6011; - cvt.s32.s8 %r9252, %r9251; - cvt.u32.u16 %r9253, %rs6010; - cvt.s32.s8 %r9254, %r9253; - mad.lo.s32 %r9255, %r111, %r9254, %r9246; - mad.lo.s32 %r9256, %r112, %r9252, %r9255; - mad.lo.s32 %r9257, %r114, %r9250, %r9256; - mad.lo.s32 %r9258, %r115, %r9248, %r9257; - ld.const.v4.u8 {%rs6018, %rs6019, %rs6020, %rs6021}, 
- [Remainder of the deleted machine-generated PTX elided; extraction flattened the remaining `-` lines of this hunk into run-on text. They repeat a single pattern for each 64-byte row of the constant matrix (offsets matrix+3008 through matrix+3572 and onward): ld.const.v4.u8 loads four matrix bytes at a time, cvt.u32.u16 / cvt.s32.s8 sign-extends each byte, and one mul.lo.s32 followed by 63 mad.lo.s32 instructions accumulates that row's dot product against the input-vector registers. After every second row, the two products are packed and folded into the hash state: shr.u32 by 6 plus and.b32 with 240 places the first product's >>10 nibble in the high half of a byte, shr.u32 by 10 supplies the second product's nibble in the low half, or.b32 combines them, and xor.b32 merges the byte into the state (producing %r9455, %r9844, %r10233→%rd399, %r10622→%rd400, ...).]
%r10982, %r100, %r10972, %r10981; - ld.const.v4.u8 {%rs7154, %rs7155, %rs7156, %rs7157}, [matrix+3576]; - cvt.u32.u16 %r10983, %rs7157; - cvt.s32.s8 %r10984, %r10983; - cvt.u32.u16 %r10985, %rs7156; - cvt.s32.s8 %r10986, %r10985; - cvt.u32.u16 %r10987, %rs7155; - cvt.s32.s8 %r10988, %r10987; - cvt.u32.u16 %r10989, %rs7154; - cvt.s32.s8 %r10990, %r10989; - mad.lo.s32 %r10991, %r103, %r10990, %r10982; - mad.lo.s32 %r10992, %r104, %r10988, %r10991; - mad.lo.s32 %r10993, %r107, %r10986, %r10992; - mad.lo.s32 %r10994, %r108, %r10984, %r10993; - ld.const.v4.u8 {%rs7162, %rs7163, %rs7164, %rs7165}, [matrix+3580]; - cvt.u32.u16 %r10995, %rs7165; - cvt.s32.s8 %r10996, %r10995; - cvt.u32.u16 %r10997, %rs7164; - cvt.s32.s8 %r10998, %r10997; - cvt.u32.u16 %r10999, %rs7163; - cvt.s32.s8 %r11000, %r10999; - cvt.u32.u16 %r11001, %rs7162; - cvt.s32.s8 %r11002, %r11001; - mad.lo.s32 %r11003, %r111, %r11002, %r10994; - mad.lo.s32 %r11004, %r112, %r11000, %r11003; - mad.lo.s32 %r11005, %r114, %r10998, %r11004; - mad.lo.s32 %r11006, %r115, %r10996, %r11005; - shr.u32 %r11007, %r10814, 6; - and.b32 %r11008, %r11007, 240; - shr.u32 %r11009, %r11006, 10; - or.b32 %r11010, %r11009, %r11008; - xor.b32 %r11011, %r98, %r11010; - cvt.u64.u32 %rd401, %r11011; - ld.const.v4.u8 {%rs7170, %rs7171, %rs7172, %rs7173}, [matrix+3584]; - cvt.u32.u16 %r11012, %rs7173; - cvt.s32.s8 %r11013, %r11012; - cvt.u32.u16 %r11014, %rs7172; - cvt.s32.s8 %r11015, %r11014; - cvt.u32.u16 %r11016, %rs7170; - cvt.s32.s8 %r11017, %r11016; - cvt.u32.u16 %r11018, %rs7171; - cvt.s32.s8 %r11019, %r11018; - mul.lo.s32 %r11020, %r34, %r11019; - mad.lo.s32 %r11021, %r124, %r11017, %r11020; - mad.lo.s32 %r11022, %r35, %r11015, %r11021; - mad.lo.s32 %r11023, %r36, %r11013, %r11022; - ld.const.v4.u8 {%rs7178, %rs7179, %rs7180, %rs7181}, [matrix+3588]; - cvt.u32.u16 %r11024, %rs7181; - cvt.s32.s8 %r11025, %r11024; - cvt.u32.u16 %r11026, %rs7180; - cvt.s32.s8 %r11027, %r11026; - cvt.u32.u16 %r11028, %rs7179; - cvt.s32.s8 %r11029, %r11028; - cvt.u32.u16 %r11030, %rs7178; - cvt.s32.s8 %r11031, %r11030; - mad.lo.s32 %r11032, %r37, %r11031, %r11023; - mad.lo.s32 %r11033, %r38, %r11029, %r11032; - mad.lo.s32 %r11034, %r39, %r11027, %r11033; - mad.lo.s32 %r11035, %r40, %r11025, %r11034; - ld.const.v4.u8 {%rs7186, %rs7187, %rs7188, %rs7189}, [matrix+3592]; - cvt.u32.u16 %r11036, %rs7189; - cvt.s32.s8 %r11037, %r11036; - cvt.u32.u16 %r11038, %rs7188; - cvt.s32.s8 %r11039, %r11038; - cvt.u32.u16 %r11040, %rs7187; - cvt.s32.s8 %r11041, %r11040; - cvt.u32.u16 %r11042, %rs7186; - cvt.s32.s8 %r11043, %r11042; - mad.lo.s32 %r11044, %r42, %r11043, %r11035; - mad.lo.s32 %r11045, %r43, %r11041, %r11044; - mad.lo.s32 %r11046, %r45, %r11039, %r11045; - mad.lo.s32 %r11047, %r46, %r11037, %r11046; - ld.const.v4.u8 {%rs7194, %rs7195, %rs7196, %rs7197}, [matrix+3596]; - cvt.u32.u16 %r11048, %rs7197; - cvt.s32.s8 %r11049, %r11048; - cvt.u32.u16 %r11050, %rs7196; - cvt.s32.s8 %r11051, %r11050; - cvt.u32.u16 %r11052, %rs7195; - cvt.s32.s8 %r11053, %r11052; - cvt.u32.u16 %r11054, %rs7194; - cvt.s32.s8 %r11055, %r11054; - mad.lo.s32 %r11056, %r48, %r11055, %r11047; - mad.lo.s32 %r11057, %r49, %r11053, %r11056; - mad.lo.s32 %r11058, %r50, %r11051, %r11057; - mad.lo.s32 %r11059, %r51, %r11049, %r11058; - ld.const.v4.u8 {%rs7202, %rs7203, %rs7204, %rs7205}, [matrix+3600]; - cvt.u32.u16 %r11060, %rs7205; - cvt.s32.s8 %r11061, %r11060; - cvt.u32.u16 %r11062, %rs7204; - cvt.s32.s8 %r11063, %r11062; - cvt.u32.u16 %r11064, %rs7203; - cvt.s32.s8 %r11065, %r11064; - cvt.u32.u16 %r11066, 
%rs7202; - cvt.s32.s8 %r11067, %r11066; - mad.lo.s32 %r11068, %r173, %r11067, %r11059; - mad.lo.s32 %r11069, %r53, %r11065, %r11068; - mad.lo.s32 %r11070, %r54, %r11063, %r11069; - mad.lo.s32 %r11071, %r55, %r11061, %r11070; - ld.const.v4.u8 {%rs7210, %rs7211, %rs7212, %rs7213}, [matrix+3604]; - cvt.u32.u16 %r11072, %rs7213; - cvt.s32.s8 %r11073, %r11072; - cvt.u32.u16 %r11074, %rs7212; - cvt.s32.s8 %r11075, %r11074; - cvt.u32.u16 %r11076, %rs7211; - cvt.s32.s8 %r11077, %r11076; - cvt.u32.u16 %r11078, %rs7210; - cvt.s32.s8 %r11079, %r11078; - mad.lo.s32 %r11080, %r56, %r11079, %r11071; - mad.lo.s32 %r11081, %r57, %r11077, %r11080; - mad.lo.s32 %r11082, %r58, %r11075, %r11081; - mad.lo.s32 %r11083, %r59, %r11073, %r11082; - ld.const.v4.u8 {%rs7218, %rs7219, %rs7220, %rs7221}, [matrix+3608]; - cvt.u32.u16 %r11084, %rs7221; - cvt.s32.s8 %r11085, %r11084; - cvt.u32.u16 %r11086, %rs7220; - cvt.s32.s8 %r11087, %r11086; - cvt.u32.u16 %r11088, %rs7219; - cvt.s32.s8 %r11089, %r11088; - cvt.u32.u16 %r11090, %rs7218; - cvt.s32.s8 %r11091, %r11090; - mad.lo.s32 %r11092, %r61, %r11091, %r11083; - mad.lo.s32 %r11093, %r62, %r11089, %r11092; - mad.lo.s32 %r11094, %r64, %r11087, %r11093; - mad.lo.s32 %r11095, %r65, %r11085, %r11094; - ld.const.v4.u8 {%rs7226, %rs7227, %rs7228, %rs7229}, [matrix+3612]; - cvt.u32.u16 %r11096, %rs7229; - cvt.s32.s8 %r11097, %r11096; - cvt.u32.u16 %r11098, %rs7228; - cvt.s32.s8 %r11099, %r11098; - cvt.u32.u16 %r11100, %rs7227; - cvt.s32.s8 %r11101, %r11100; - cvt.u32.u16 %r11102, %rs7226; - cvt.s32.s8 %r11103, %r11102; - mad.lo.s32 %r11104, %r67, %r11103, %r11095; - mad.lo.s32 %r11105, %r68, %r11101, %r11104; - mad.lo.s32 %r11106, %r69, %r11099, %r11105; - mad.lo.s32 %r11107, %r70, %r11097, %r11106; - ld.const.v4.u8 {%rs7234, %rs7235, %rs7236, %rs7237}, [matrix+3616]; - cvt.u32.u16 %r11108, %rs7237; - cvt.s32.s8 %r11109, %r11108; - cvt.u32.u16 %r11110, %rs7236; - cvt.s32.s8 %r11111, %r11110; - cvt.u32.u16 %r11112, %rs7235; - cvt.s32.s8 %r11113, %r11112; - cvt.u32.u16 %r11114, %rs7234; - cvt.s32.s8 %r11115, %r11114; - mad.lo.s32 %r11116, %r222, %r11115, %r11107; - mad.lo.s32 %r11117, %r72, %r11113, %r11116; - mad.lo.s32 %r11118, %r73, %r11111, %r11117; - mad.lo.s32 %r11119, %r74, %r11109, %r11118; - ld.const.v4.u8 {%rs7242, %rs7243, %rs7244, %rs7245}, [matrix+3620]; - cvt.u32.u16 %r11120, %rs7245; - cvt.s32.s8 %r11121, %r11120; - cvt.u32.u16 %r11122, %rs7244; - cvt.s32.s8 %r11123, %r11122; - cvt.u32.u16 %r11124, %rs7243; - cvt.s32.s8 %r11125, %r11124; - cvt.u32.u16 %r11126, %rs7242; - cvt.s32.s8 %r11127, %r11126; - mad.lo.s32 %r11128, %r75, %r11127, %r11119; - mad.lo.s32 %r11129, %r76, %r11125, %r11128; - mad.lo.s32 %r11130, %r77, %r11123, %r11129; - mad.lo.s32 %r11131, %r78, %r11121, %r11130; - ld.const.v4.u8 {%rs7250, %rs7251, %rs7252, %rs7253}, [matrix+3624]; - cvt.u32.u16 %r11132, %rs7253; - cvt.s32.s8 %r11133, %r11132; - cvt.u32.u16 %r11134, %rs7252; - cvt.s32.s8 %r11135, %r11134; - cvt.u32.u16 %r11136, %rs7251; - cvt.s32.s8 %r11137, %r11136; - cvt.u32.u16 %r11138, %rs7250; - cvt.s32.s8 %r11139, %r11138; - mad.lo.s32 %r11140, %r80, %r11139, %r11131; - mad.lo.s32 %r11141, %r81, %r11137, %r11140; - mad.lo.s32 %r11142, %r83, %r11135, %r11141; - mad.lo.s32 %r11143, %r84, %r11133, %r11142; - ld.const.v4.u8 {%rs7258, %rs7259, %rs7260, %rs7261}, [matrix+3628]; - cvt.u32.u16 %r11144, %rs7261; - cvt.s32.s8 %r11145, %r11144; - cvt.u32.u16 %r11146, %rs7260; - cvt.s32.s8 %r11147, %r11146; - cvt.u32.u16 %r11148, %rs7259; - cvt.s32.s8 %r11149, %r11148; - cvt.u32.u16 %r11150, %rs7258; - 
cvt.s32.s8 %r11151, %r11150; - mad.lo.s32 %r11152, %r86, %r11151, %r11143; - mad.lo.s32 %r11153, %r87, %r11149, %r11152; - mad.lo.s32 %r11154, %r88, %r11147, %r11153; - mad.lo.s32 %r11155, %r89, %r11145, %r11154; - ld.const.v4.u8 {%rs7266, %rs7267, %rs7268, %rs7269}, [matrix+3632]; - cvt.u32.u16 %r11156, %rs7269; - cvt.s32.s8 %r11157, %r11156; - cvt.u32.u16 %r11158, %rs7268; - cvt.s32.s8 %r11159, %r11158; - cvt.u32.u16 %r11160, %rs7267; - cvt.s32.s8 %r11161, %r11160; - cvt.u32.u16 %r11162, %rs7266; - cvt.s32.s8 %r11163, %r11162; - mad.lo.s32 %r11164, %r271, %r11163, %r11155; - mad.lo.s32 %r11165, %r91, %r11161, %r11164; - mad.lo.s32 %r11166, %r93, %r11159, %r11165; - mad.lo.s32 %r11167, %r94, %r11157, %r11166; - ld.const.v4.u8 {%rs7274, %rs7275, %rs7276, %rs7277}, [matrix+3636]; - cvt.u32.u16 %r11168, %rs7277; - cvt.s32.s8 %r11169, %r11168; - cvt.u32.u16 %r11170, %rs7276; - cvt.s32.s8 %r11171, %r11170; - cvt.u32.u16 %r11172, %rs7275; - cvt.s32.s8 %r11173, %r11172; - cvt.u32.u16 %r11174, %rs7274; - cvt.s32.s8 %r11175, %r11174; - mad.lo.s32 %r11176, %r96, %r11175, %r11167; - mad.lo.s32 %r11177, %r97, %r11173, %r11176; - mad.lo.s32 %r11178, %r99, %r11171, %r11177; - mad.lo.s32 %r11179, %r100, %r11169, %r11178; - ld.const.v4.u8 {%rs7282, %rs7283, %rs7284, %rs7285}, [matrix+3640]; - cvt.u32.u16 %r11180, %rs7285; - cvt.s32.s8 %r11181, %r11180; - cvt.u32.u16 %r11182, %rs7284; - cvt.s32.s8 %r11183, %r11182; - cvt.u32.u16 %r11184, %rs7283; - cvt.s32.s8 %r11185, %r11184; - cvt.u32.u16 %r11186, %rs7282; - cvt.s32.s8 %r11187, %r11186; - mad.lo.s32 %r11188, %r103, %r11187, %r11179; - mad.lo.s32 %r11189, %r104, %r11185, %r11188; - mad.lo.s32 %r11190, %r107, %r11183, %r11189; - mad.lo.s32 %r11191, %r108, %r11181, %r11190; - ld.const.v4.u8 {%rs7290, %rs7291, %rs7292, %rs7293}, [matrix+3644]; - cvt.u32.u16 %r11192, %rs7293; - cvt.s32.s8 %r11193, %r11192; - cvt.u32.u16 %r11194, %rs7292; - cvt.s32.s8 %r11195, %r11194; - cvt.u32.u16 %r11196, %rs7291; - cvt.s32.s8 %r11197, %r11196; - cvt.u32.u16 %r11198, %rs7290; - cvt.s32.s8 %r11199, %r11198; - mad.lo.s32 %r11200, %r111, %r11199, %r11191; - mad.lo.s32 %r11201, %r112, %r11197, %r11200; - mad.lo.s32 %r11202, %r114, %r11195, %r11201; - mad.lo.s32 %r11203, %r115, %r11193, %r11202; - ld.const.v4.u8 {%rs7298, %rs7299, %rs7300, %rs7301}, [matrix+3648]; - cvt.u32.u16 %r11204, %rs7301; - cvt.s32.s8 %r11205, %r11204; - cvt.u32.u16 %r11206, %rs7300; - cvt.s32.s8 %r11207, %r11206; - cvt.u32.u16 %r11208, %rs7298; - cvt.s32.s8 %r11209, %r11208; - cvt.u32.u16 %r11210, %rs7299; - cvt.s32.s8 %r11211, %r11210; - mul.lo.s32 %r11212, %r34, %r11211; - mad.lo.s32 %r11213, %r124, %r11209, %r11212; - mad.lo.s32 %r11214, %r35, %r11207, %r11213; - mad.lo.s32 %r11215, %r36, %r11205, %r11214; - ld.const.v4.u8 {%rs7306, %rs7307, %rs7308, %rs7309}, [matrix+3652]; - cvt.u32.u16 %r11216, %rs7309; - cvt.s32.s8 %r11217, %r11216; - cvt.u32.u16 %r11218, %rs7308; - cvt.s32.s8 %r11219, %r11218; - cvt.u32.u16 %r11220, %rs7307; - cvt.s32.s8 %r11221, %r11220; - cvt.u32.u16 %r11222, %rs7306; - cvt.s32.s8 %r11223, %r11222; - mad.lo.s32 %r11224, %r37, %r11223, %r11215; - mad.lo.s32 %r11225, %r38, %r11221, %r11224; - mad.lo.s32 %r11226, %r39, %r11219, %r11225; - mad.lo.s32 %r11227, %r40, %r11217, %r11226; - ld.const.v4.u8 {%rs7314, %rs7315, %rs7316, %rs7317}, [matrix+3656]; - cvt.u32.u16 %r11228, %rs7317; - cvt.s32.s8 %r11229, %r11228; - cvt.u32.u16 %r11230, %rs7316; - cvt.s32.s8 %r11231, %r11230; - cvt.u32.u16 %r11232, %rs7315; - cvt.s32.s8 %r11233, %r11232; - cvt.u32.u16 %r11234, %rs7314; - cvt.s32.s8 
%r11235, %r11234; - mad.lo.s32 %r11236, %r42, %r11235, %r11227; - mad.lo.s32 %r11237, %r43, %r11233, %r11236; - mad.lo.s32 %r11238, %r45, %r11231, %r11237; - mad.lo.s32 %r11239, %r46, %r11229, %r11238; - ld.const.v4.u8 {%rs7322, %rs7323, %rs7324, %rs7325}, [matrix+3660]; - cvt.u32.u16 %r11240, %rs7325; - cvt.s32.s8 %r11241, %r11240; - cvt.u32.u16 %r11242, %rs7324; - cvt.s32.s8 %r11243, %r11242; - cvt.u32.u16 %r11244, %rs7323; - cvt.s32.s8 %r11245, %r11244; - cvt.u32.u16 %r11246, %rs7322; - cvt.s32.s8 %r11247, %r11246; - mad.lo.s32 %r11248, %r48, %r11247, %r11239; - mad.lo.s32 %r11249, %r49, %r11245, %r11248; - mad.lo.s32 %r11250, %r50, %r11243, %r11249; - mad.lo.s32 %r11251, %r51, %r11241, %r11250; - ld.const.v4.u8 {%rs7330, %rs7331, %rs7332, %rs7333}, [matrix+3664]; - cvt.u32.u16 %r11252, %rs7333; - cvt.s32.s8 %r11253, %r11252; - cvt.u32.u16 %r11254, %rs7332; - cvt.s32.s8 %r11255, %r11254; - cvt.u32.u16 %r11256, %rs7331; - cvt.s32.s8 %r11257, %r11256; - cvt.u32.u16 %r11258, %rs7330; - cvt.s32.s8 %r11259, %r11258; - mad.lo.s32 %r11260, %r173, %r11259, %r11251; - mad.lo.s32 %r11261, %r53, %r11257, %r11260; - mad.lo.s32 %r11262, %r54, %r11255, %r11261; - mad.lo.s32 %r11263, %r55, %r11253, %r11262; - ld.const.v4.u8 {%rs7338, %rs7339, %rs7340, %rs7341}, [matrix+3668]; - cvt.u32.u16 %r11264, %rs7341; - cvt.s32.s8 %r11265, %r11264; - cvt.u32.u16 %r11266, %rs7340; - cvt.s32.s8 %r11267, %r11266; - cvt.u32.u16 %r11268, %rs7339; - cvt.s32.s8 %r11269, %r11268; - cvt.u32.u16 %r11270, %rs7338; - cvt.s32.s8 %r11271, %r11270; - mad.lo.s32 %r11272, %r56, %r11271, %r11263; - mad.lo.s32 %r11273, %r57, %r11269, %r11272; - mad.lo.s32 %r11274, %r58, %r11267, %r11273; - mad.lo.s32 %r11275, %r59, %r11265, %r11274; - ld.const.v4.u8 {%rs7346, %rs7347, %rs7348, %rs7349}, [matrix+3672]; - cvt.u32.u16 %r11276, %rs7349; - cvt.s32.s8 %r11277, %r11276; - cvt.u32.u16 %r11278, %rs7348; - cvt.s32.s8 %r11279, %r11278; - cvt.u32.u16 %r11280, %rs7347; - cvt.s32.s8 %r11281, %r11280; - cvt.u32.u16 %r11282, %rs7346; - cvt.s32.s8 %r11283, %r11282; - mad.lo.s32 %r11284, %r61, %r11283, %r11275; - mad.lo.s32 %r11285, %r62, %r11281, %r11284; - mad.lo.s32 %r11286, %r64, %r11279, %r11285; - mad.lo.s32 %r11287, %r65, %r11277, %r11286; - ld.const.v4.u8 {%rs7354, %rs7355, %rs7356, %rs7357}, [matrix+3676]; - cvt.u32.u16 %r11288, %rs7357; - cvt.s32.s8 %r11289, %r11288; - cvt.u32.u16 %r11290, %rs7356; - cvt.s32.s8 %r11291, %r11290; - cvt.u32.u16 %r11292, %rs7355; - cvt.s32.s8 %r11293, %r11292; - cvt.u32.u16 %r11294, %rs7354; - cvt.s32.s8 %r11295, %r11294; - mad.lo.s32 %r11296, %r67, %r11295, %r11287; - mad.lo.s32 %r11297, %r68, %r11293, %r11296; - mad.lo.s32 %r11298, %r69, %r11291, %r11297; - mad.lo.s32 %r11299, %r70, %r11289, %r11298; - ld.const.v4.u8 {%rs7362, %rs7363, %rs7364, %rs7365}, [matrix+3680]; - cvt.u32.u16 %r11300, %rs7365; - cvt.s32.s8 %r11301, %r11300; - cvt.u32.u16 %r11302, %rs7364; - cvt.s32.s8 %r11303, %r11302; - cvt.u32.u16 %r11304, %rs7363; - cvt.s32.s8 %r11305, %r11304; - cvt.u32.u16 %r11306, %rs7362; - cvt.s32.s8 %r11307, %r11306; - mad.lo.s32 %r11308, %r222, %r11307, %r11299; - mad.lo.s32 %r11309, %r72, %r11305, %r11308; - mad.lo.s32 %r11310, %r73, %r11303, %r11309; - mad.lo.s32 %r11311, %r74, %r11301, %r11310; - ld.const.v4.u8 {%rs7370, %rs7371, %rs7372, %rs7373}, [matrix+3684]; - cvt.u32.u16 %r11312, %rs7373; - cvt.s32.s8 %r11313, %r11312; - cvt.u32.u16 %r11314, %rs7372; - cvt.s32.s8 %r11315, %r11314; - cvt.u32.u16 %r11316, %rs7371; - cvt.s32.s8 %r11317, %r11316; - cvt.u32.u16 %r11318, %rs7370; - cvt.s32.s8 %r11319, 
%r11318; - mad.lo.s32 %r11320, %r75, %r11319, %r11311; - mad.lo.s32 %r11321, %r76, %r11317, %r11320; - mad.lo.s32 %r11322, %r77, %r11315, %r11321; - mad.lo.s32 %r11323, %r78, %r11313, %r11322; - ld.const.v4.u8 {%rs7378, %rs7379, %rs7380, %rs7381}, [matrix+3688]; - cvt.u32.u16 %r11324, %rs7381; - cvt.s32.s8 %r11325, %r11324; - cvt.u32.u16 %r11326, %rs7380; - cvt.s32.s8 %r11327, %r11326; - cvt.u32.u16 %r11328, %rs7379; - cvt.s32.s8 %r11329, %r11328; - cvt.u32.u16 %r11330, %rs7378; - cvt.s32.s8 %r11331, %r11330; - mad.lo.s32 %r11332, %r80, %r11331, %r11323; - mad.lo.s32 %r11333, %r81, %r11329, %r11332; - mad.lo.s32 %r11334, %r83, %r11327, %r11333; - mad.lo.s32 %r11335, %r84, %r11325, %r11334; - ld.const.v4.u8 {%rs7386, %rs7387, %rs7388, %rs7389}, [matrix+3692]; - cvt.u32.u16 %r11336, %rs7389; - cvt.s32.s8 %r11337, %r11336; - cvt.u32.u16 %r11338, %rs7388; - cvt.s32.s8 %r11339, %r11338; - cvt.u32.u16 %r11340, %rs7387; - cvt.s32.s8 %r11341, %r11340; - cvt.u32.u16 %r11342, %rs7386; - cvt.s32.s8 %r11343, %r11342; - mad.lo.s32 %r11344, %r86, %r11343, %r11335; - mad.lo.s32 %r11345, %r87, %r11341, %r11344; - mad.lo.s32 %r11346, %r88, %r11339, %r11345; - mad.lo.s32 %r11347, %r89, %r11337, %r11346; - ld.const.v4.u8 {%rs7394, %rs7395, %rs7396, %rs7397}, [matrix+3696]; - cvt.u32.u16 %r11348, %rs7397; - cvt.s32.s8 %r11349, %r11348; - cvt.u32.u16 %r11350, %rs7396; - cvt.s32.s8 %r11351, %r11350; - cvt.u32.u16 %r11352, %rs7395; - cvt.s32.s8 %r11353, %r11352; - cvt.u32.u16 %r11354, %rs7394; - cvt.s32.s8 %r11355, %r11354; - mad.lo.s32 %r11356, %r271, %r11355, %r11347; - mad.lo.s32 %r11357, %r91, %r11353, %r11356; - mad.lo.s32 %r11358, %r93, %r11351, %r11357; - mad.lo.s32 %r11359, %r94, %r11349, %r11358; - ld.const.v4.u8 {%rs7402, %rs7403, %rs7404, %rs7405}, [matrix+3700]; - cvt.u32.u16 %r11360, %rs7405; - cvt.s32.s8 %r11361, %r11360; - cvt.u32.u16 %r11362, %rs7404; - cvt.s32.s8 %r11363, %r11362; - cvt.u32.u16 %r11364, %rs7403; - cvt.s32.s8 %r11365, %r11364; - cvt.u32.u16 %r11366, %rs7402; - cvt.s32.s8 %r11367, %r11366; - mad.lo.s32 %r11368, %r96, %r11367, %r11359; - mad.lo.s32 %r11369, %r97, %r11365, %r11368; - mad.lo.s32 %r11370, %r99, %r11363, %r11369; - mad.lo.s32 %r11371, %r100, %r11361, %r11370; - ld.const.v4.u8 {%rs7410, %rs7411, %rs7412, %rs7413}, [matrix+3704]; - cvt.u32.u16 %r11372, %rs7413; - cvt.s32.s8 %r11373, %r11372; - cvt.u32.u16 %r11374, %rs7412; - cvt.s32.s8 %r11375, %r11374; - cvt.u32.u16 %r11376, %rs7411; - cvt.s32.s8 %r11377, %r11376; - cvt.u32.u16 %r11378, %rs7410; - cvt.s32.s8 %r11379, %r11378; - mad.lo.s32 %r11380, %r103, %r11379, %r11371; - mad.lo.s32 %r11381, %r104, %r11377, %r11380; - mad.lo.s32 %r11382, %r107, %r11375, %r11381; - mad.lo.s32 %r11383, %r108, %r11373, %r11382; - ld.const.v4.u8 {%rs7418, %rs7419, %rs7420, %rs7421}, [matrix+3708]; - cvt.u32.u16 %r11384, %rs7421; - cvt.s32.s8 %r11385, %r11384; - cvt.u32.u16 %r11386, %rs7420; - cvt.s32.s8 %r11387, %r11386; - cvt.u32.u16 %r11388, %rs7419; - cvt.s32.s8 %r11389, %r11388; - cvt.u32.u16 %r11390, %rs7418; - cvt.s32.s8 %r11391, %r11390; - mad.lo.s32 %r11392, %r111, %r11391, %r11383; - mad.lo.s32 %r11393, %r112, %r11389, %r11392; - mad.lo.s32 %r11394, %r114, %r11387, %r11393; - mad.lo.s32 %r11395, %r115, %r11385, %r11394; - shr.u32 %r11396, %r11203, 6; - and.b32 %r11397, %r11396, 240; - shr.u32 %r11398, %r11395, 10; - or.b32 %r11399, %r11398, %r11397; - xor.b32 %r11400, %r101, %r11399; - cvt.u64.u32 %rd402, %r11400; - and.b64 %rd403, %rd402, 255; - ld.const.v4.u8 {%rs7426, %rs7427, %rs7428, %rs7429}, [matrix+3712]; - cvt.u32.u16 
%r11401, %rs7429; - cvt.s32.s8 %r11402, %r11401; - cvt.u32.u16 %r11403, %rs7428; - cvt.s32.s8 %r11404, %r11403; - cvt.u32.u16 %r11405, %rs7426; - cvt.s32.s8 %r11406, %r11405; - cvt.u32.u16 %r11407, %rs7427; - cvt.s32.s8 %r11408, %r11407; - mul.lo.s32 %r11409, %r34, %r11408; - mad.lo.s32 %r11410, %r124, %r11406, %r11409; - mad.lo.s32 %r11411, %r35, %r11404, %r11410; - mad.lo.s32 %r11412, %r36, %r11402, %r11411; - ld.const.v4.u8 {%rs7434, %rs7435, %rs7436, %rs7437}, [matrix+3716]; - cvt.u32.u16 %r11413, %rs7437; - cvt.s32.s8 %r11414, %r11413; - cvt.u32.u16 %r11415, %rs7436; - cvt.s32.s8 %r11416, %r11415; - cvt.u32.u16 %r11417, %rs7435; - cvt.s32.s8 %r11418, %r11417; - cvt.u32.u16 %r11419, %rs7434; - cvt.s32.s8 %r11420, %r11419; - mad.lo.s32 %r11421, %r37, %r11420, %r11412; - mad.lo.s32 %r11422, %r38, %r11418, %r11421; - mad.lo.s32 %r11423, %r39, %r11416, %r11422; - mad.lo.s32 %r11424, %r40, %r11414, %r11423; - ld.const.v4.u8 {%rs7442, %rs7443, %rs7444, %rs7445}, [matrix+3720]; - cvt.u32.u16 %r11425, %rs7445; - cvt.s32.s8 %r11426, %r11425; - cvt.u32.u16 %r11427, %rs7444; - cvt.s32.s8 %r11428, %r11427; - cvt.u32.u16 %r11429, %rs7443; - cvt.s32.s8 %r11430, %r11429; - cvt.u32.u16 %r11431, %rs7442; - cvt.s32.s8 %r11432, %r11431; - mad.lo.s32 %r11433, %r42, %r11432, %r11424; - mad.lo.s32 %r11434, %r43, %r11430, %r11433; - mad.lo.s32 %r11435, %r45, %r11428, %r11434; - mad.lo.s32 %r11436, %r46, %r11426, %r11435; - ld.const.v4.u8 {%rs7450, %rs7451, %rs7452, %rs7453}, [matrix+3724]; - cvt.u32.u16 %r11437, %rs7453; - cvt.s32.s8 %r11438, %r11437; - cvt.u32.u16 %r11439, %rs7452; - cvt.s32.s8 %r11440, %r11439; - cvt.u32.u16 %r11441, %rs7451; - cvt.s32.s8 %r11442, %r11441; - cvt.u32.u16 %r11443, %rs7450; - cvt.s32.s8 %r11444, %r11443; - mad.lo.s32 %r11445, %r48, %r11444, %r11436; - mad.lo.s32 %r11446, %r49, %r11442, %r11445; - mad.lo.s32 %r11447, %r50, %r11440, %r11446; - mad.lo.s32 %r11448, %r51, %r11438, %r11447; - ld.const.v4.u8 {%rs7458, %rs7459, %rs7460, %rs7461}, [matrix+3728]; - cvt.u32.u16 %r11449, %rs7461; - cvt.s32.s8 %r11450, %r11449; - cvt.u32.u16 %r11451, %rs7460; - cvt.s32.s8 %r11452, %r11451; - cvt.u32.u16 %r11453, %rs7459; - cvt.s32.s8 %r11454, %r11453; - cvt.u32.u16 %r11455, %rs7458; - cvt.s32.s8 %r11456, %r11455; - mad.lo.s32 %r11457, %r173, %r11456, %r11448; - mad.lo.s32 %r11458, %r53, %r11454, %r11457; - mad.lo.s32 %r11459, %r54, %r11452, %r11458; - mad.lo.s32 %r11460, %r55, %r11450, %r11459; - ld.const.v4.u8 {%rs7466, %rs7467, %rs7468, %rs7469}, [matrix+3732]; - cvt.u32.u16 %r11461, %rs7469; - cvt.s32.s8 %r11462, %r11461; - cvt.u32.u16 %r11463, %rs7468; - cvt.s32.s8 %r11464, %r11463; - cvt.u32.u16 %r11465, %rs7467; - cvt.s32.s8 %r11466, %r11465; - cvt.u32.u16 %r11467, %rs7466; - cvt.s32.s8 %r11468, %r11467; - mad.lo.s32 %r11469, %r56, %r11468, %r11460; - mad.lo.s32 %r11470, %r57, %r11466, %r11469; - mad.lo.s32 %r11471, %r58, %r11464, %r11470; - mad.lo.s32 %r11472, %r59, %r11462, %r11471; - ld.const.v4.u8 {%rs7474, %rs7475, %rs7476, %rs7477}, [matrix+3736]; - cvt.u32.u16 %r11473, %rs7477; - cvt.s32.s8 %r11474, %r11473; - cvt.u32.u16 %r11475, %rs7476; - cvt.s32.s8 %r11476, %r11475; - cvt.u32.u16 %r11477, %rs7475; - cvt.s32.s8 %r11478, %r11477; - cvt.u32.u16 %r11479, %rs7474; - cvt.s32.s8 %r11480, %r11479; - mad.lo.s32 %r11481, %r61, %r11480, %r11472; - mad.lo.s32 %r11482, %r62, %r11478, %r11481; - mad.lo.s32 %r11483, %r64, %r11476, %r11482; - mad.lo.s32 %r11484, %r65, %r11474, %r11483; - ld.const.v4.u8 {%rs7482, %rs7483, %rs7484, %rs7485}, [matrix+3740]; - cvt.u32.u16 %r11485, %rs7485; - 
cvt.s32.s8 %r11486, %r11485; - cvt.u32.u16 %r11487, %rs7484; - cvt.s32.s8 %r11488, %r11487; - cvt.u32.u16 %r11489, %rs7483; - cvt.s32.s8 %r11490, %r11489; - cvt.u32.u16 %r11491, %rs7482; - cvt.s32.s8 %r11492, %r11491; - mad.lo.s32 %r11493, %r67, %r11492, %r11484; - mad.lo.s32 %r11494, %r68, %r11490, %r11493; - mad.lo.s32 %r11495, %r69, %r11488, %r11494; - mad.lo.s32 %r11496, %r70, %r11486, %r11495; - ld.const.v4.u8 {%rs7490, %rs7491, %rs7492, %rs7493}, [matrix+3744]; - cvt.u32.u16 %r11497, %rs7493; - cvt.s32.s8 %r11498, %r11497; - cvt.u32.u16 %r11499, %rs7492; - cvt.s32.s8 %r11500, %r11499; - cvt.u32.u16 %r11501, %rs7491; - cvt.s32.s8 %r11502, %r11501; - cvt.u32.u16 %r11503, %rs7490; - cvt.s32.s8 %r11504, %r11503; - mad.lo.s32 %r11505, %r222, %r11504, %r11496; - mad.lo.s32 %r11506, %r72, %r11502, %r11505; - mad.lo.s32 %r11507, %r73, %r11500, %r11506; - mad.lo.s32 %r11508, %r74, %r11498, %r11507; - ld.const.v4.u8 {%rs7498, %rs7499, %rs7500, %rs7501}, [matrix+3748]; - cvt.u32.u16 %r11509, %rs7501; - cvt.s32.s8 %r11510, %r11509; - cvt.u32.u16 %r11511, %rs7500; - cvt.s32.s8 %r11512, %r11511; - cvt.u32.u16 %r11513, %rs7499; - cvt.s32.s8 %r11514, %r11513; - cvt.u32.u16 %r11515, %rs7498; - cvt.s32.s8 %r11516, %r11515; - mad.lo.s32 %r11517, %r75, %r11516, %r11508; - mad.lo.s32 %r11518, %r76, %r11514, %r11517; - mad.lo.s32 %r11519, %r77, %r11512, %r11518; - mad.lo.s32 %r11520, %r78, %r11510, %r11519; - ld.const.v4.u8 {%rs7506, %rs7507, %rs7508, %rs7509}, [matrix+3752]; - cvt.u32.u16 %r11521, %rs7509; - cvt.s32.s8 %r11522, %r11521; - cvt.u32.u16 %r11523, %rs7508; - cvt.s32.s8 %r11524, %r11523; - cvt.u32.u16 %r11525, %rs7507; - cvt.s32.s8 %r11526, %r11525; - cvt.u32.u16 %r11527, %rs7506; - cvt.s32.s8 %r11528, %r11527; - mad.lo.s32 %r11529, %r80, %r11528, %r11520; - mad.lo.s32 %r11530, %r81, %r11526, %r11529; - mad.lo.s32 %r11531, %r83, %r11524, %r11530; - mad.lo.s32 %r11532, %r84, %r11522, %r11531; - ld.const.v4.u8 {%rs7514, %rs7515, %rs7516, %rs7517}, [matrix+3756]; - cvt.u32.u16 %r11533, %rs7517; - cvt.s32.s8 %r11534, %r11533; - cvt.u32.u16 %r11535, %rs7516; - cvt.s32.s8 %r11536, %r11535; - cvt.u32.u16 %r11537, %rs7515; - cvt.s32.s8 %r11538, %r11537; - cvt.u32.u16 %r11539, %rs7514; - cvt.s32.s8 %r11540, %r11539; - mad.lo.s32 %r11541, %r86, %r11540, %r11532; - mad.lo.s32 %r11542, %r87, %r11538, %r11541; - mad.lo.s32 %r11543, %r88, %r11536, %r11542; - mad.lo.s32 %r11544, %r89, %r11534, %r11543; - ld.const.v4.u8 {%rs7522, %rs7523, %rs7524, %rs7525}, [matrix+3760]; - cvt.u32.u16 %r11545, %rs7525; - cvt.s32.s8 %r11546, %r11545; - cvt.u32.u16 %r11547, %rs7524; - cvt.s32.s8 %r11548, %r11547; - cvt.u32.u16 %r11549, %rs7523; - cvt.s32.s8 %r11550, %r11549; - cvt.u32.u16 %r11551, %rs7522; - cvt.s32.s8 %r11552, %r11551; - mad.lo.s32 %r11553, %r271, %r11552, %r11544; - mad.lo.s32 %r11554, %r91, %r11550, %r11553; - mad.lo.s32 %r11555, %r93, %r11548, %r11554; - mad.lo.s32 %r11556, %r94, %r11546, %r11555; - ld.const.v4.u8 {%rs7530, %rs7531, %rs7532, %rs7533}, [matrix+3764]; - cvt.u32.u16 %r11557, %rs7533; - cvt.s32.s8 %r11558, %r11557; - cvt.u32.u16 %r11559, %rs7532; - cvt.s32.s8 %r11560, %r11559; - cvt.u32.u16 %r11561, %rs7531; - cvt.s32.s8 %r11562, %r11561; - cvt.u32.u16 %r11563, %rs7530; - cvt.s32.s8 %r11564, %r11563; - mad.lo.s32 %r11565, %r96, %r11564, %r11556; - mad.lo.s32 %r11566, %r97, %r11562, %r11565; - mad.lo.s32 %r11567, %r99, %r11560, %r11566; - mad.lo.s32 %r11568, %r100, %r11558, %r11567; - ld.const.v4.u8 {%rs7538, %rs7539, %rs7540, %rs7541}, [matrix+3768]; - cvt.u32.u16 %r11569, %rs7541; - 
cvt.s32.s8 %r11570, %r11569; - cvt.u32.u16 %r11571, %rs7540; - cvt.s32.s8 %r11572, %r11571; - cvt.u32.u16 %r11573, %rs7539; - cvt.s32.s8 %r11574, %r11573; - cvt.u32.u16 %r11575, %rs7538; - cvt.s32.s8 %r11576, %r11575; - mad.lo.s32 %r11577, %r103, %r11576, %r11568; - mad.lo.s32 %r11578, %r104, %r11574, %r11577; - mad.lo.s32 %r11579, %r107, %r11572, %r11578; - mad.lo.s32 %r11580, %r108, %r11570, %r11579; - ld.const.v4.u8 {%rs7546, %rs7547, %rs7548, %rs7549}, [matrix+3772]; - cvt.u32.u16 %r11581, %rs7549; - cvt.s32.s8 %r11582, %r11581; - cvt.u32.u16 %r11583, %rs7548; - cvt.s32.s8 %r11584, %r11583; - cvt.u32.u16 %r11585, %rs7547; - cvt.s32.s8 %r11586, %r11585; - cvt.u32.u16 %r11587, %rs7546; - cvt.s32.s8 %r11588, %r11587; - mad.lo.s32 %r11589, %r111, %r11588, %r11580; - mad.lo.s32 %r11590, %r112, %r11586, %r11589; - mad.lo.s32 %r11591, %r114, %r11584, %r11590; - mad.lo.s32 %r11592, %r115, %r11582, %r11591; - ld.const.v4.u8 {%rs7554, %rs7555, %rs7556, %rs7557}, [matrix+3776]; - cvt.u32.u16 %r11593, %rs7557; - cvt.s32.s8 %r11594, %r11593; - cvt.u32.u16 %r11595, %rs7556; - cvt.s32.s8 %r11596, %r11595; - cvt.u32.u16 %r11597, %rs7554; - cvt.s32.s8 %r11598, %r11597; - cvt.u32.u16 %r11599, %rs7555; - cvt.s32.s8 %r11600, %r11599; - mul.lo.s32 %r11601, %r34, %r11600; - mad.lo.s32 %r11602, %r124, %r11598, %r11601; - mad.lo.s32 %r11603, %r35, %r11596, %r11602; - mad.lo.s32 %r11604, %r36, %r11594, %r11603; - ld.const.v4.u8 {%rs7562, %rs7563, %rs7564, %rs7565}, [matrix+3780]; - cvt.u32.u16 %r11605, %rs7565; - cvt.s32.s8 %r11606, %r11605; - cvt.u32.u16 %r11607, %rs7564; - cvt.s32.s8 %r11608, %r11607; - cvt.u32.u16 %r11609, %rs7563; - cvt.s32.s8 %r11610, %r11609; - cvt.u32.u16 %r11611, %rs7562; - cvt.s32.s8 %r11612, %r11611; - mad.lo.s32 %r11613, %r37, %r11612, %r11604; - mad.lo.s32 %r11614, %r38, %r11610, %r11613; - mad.lo.s32 %r11615, %r39, %r11608, %r11614; - mad.lo.s32 %r11616, %r40, %r11606, %r11615; - ld.const.v4.u8 {%rs7570, %rs7571, %rs7572, %rs7573}, [matrix+3784]; - cvt.u32.u16 %r11617, %rs7573; - cvt.s32.s8 %r11618, %r11617; - cvt.u32.u16 %r11619, %rs7572; - cvt.s32.s8 %r11620, %r11619; - cvt.u32.u16 %r11621, %rs7571; - cvt.s32.s8 %r11622, %r11621; - cvt.u32.u16 %r11623, %rs7570; - cvt.s32.s8 %r11624, %r11623; - mad.lo.s32 %r11625, %r42, %r11624, %r11616; - mad.lo.s32 %r11626, %r43, %r11622, %r11625; - mad.lo.s32 %r11627, %r45, %r11620, %r11626; - mad.lo.s32 %r11628, %r46, %r11618, %r11627; - ld.const.v4.u8 {%rs7578, %rs7579, %rs7580, %rs7581}, [matrix+3788]; - cvt.u32.u16 %r11629, %rs7581; - cvt.s32.s8 %r11630, %r11629; - cvt.u32.u16 %r11631, %rs7580; - cvt.s32.s8 %r11632, %r11631; - cvt.u32.u16 %r11633, %rs7579; - cvt.s32.s8 %r11634, %r11633; - cvt.u32.u16 %r11635, %rs7578; - cvt.s32.s8 %r11636, %r11635; - mad.lo.s32 %r11637, %r48, %r11636, %r11628; - mad.lo.s32 %r11638, %r49, %r11634, %r11637; - mad.lo.s32 %r11639, %r50, %r11632, %r11638; - mad.lo.s32 %r11640, %r51, %r11630, %r11639; - ld.const.v4.u8 {%rs7586, %rs7587, %rs7588, %rs7589}, [matrix+3792]; - cvt.u32.u16 %r11641, %rs7589; - cvt.s32.s8 %r11642, %r11641; - cvt.u32.u16 %r11643, %rs7588; - cvt.s32.s8 %r11644, %r11643; - cvt.u32.u16 %r11645, %rs7587; - cvt.s32.s8 %r11646, %r11645; - cvt.u32.u16 %r11647, %rs7586; - cvt.s32.s8 %r11648, %r11647; - mad.lo.s32 %r11649, %r173, %r11648, %r11640; - mad.lo.s32 %r11650, %r53, %r11646, %r11649; - mad.lo.s32 %r11651, %r54, %r11644, %r11650; - mad.lo.s32 %r11652, %r55, %r11642, %r11651; - ld.const.v4.u8 {%rs7594, %rs7595, %rs7596, %rs7597}, [matrix+3796]; - cvt.u32.u16 %r11653, %rs7597; - cvt.s32.s8 
%r11654, %r11653; - cvt.u32.u16 %r11655, %rs7596; - cvt.s32.s8 %r11656, %r11655; - cvt.u32.u16 %r11657, %rs7595; - cvt.s32.s8 %r11658, %r11657; - cvt.u32.u16 %r11659, %rs7594; - cvt.s32.s8 %r11660, %r11659; - mad.lo.s32 %r11661, %r56, %r11660, %r11652; - mad.lo.s32 %r11662, %r57, %r11658, %r11661; - mad.lo.s32 %r11663, %r58, %r11656, %r11662; - mad.lo.s32 %r11664, %r59, %r11654, %r11663; - ld.const.v4.u8 {%rs7602, %rs7603, %rs7604, %rs7605}, [matrix+3800]; - cvt.u32.u16 %r11665, %rs7605; - cvt.s32.s8 %r11666, %r11665; - cvt.u32.u16 %r11667, %rs7604; - cvt.s32.s8 %r11668, %r11667; - cvt.u32.u16 %r11669, %rs7603; - cvt.s32.s8 %r11670, %r11669; - cvt.u32.u16 %r11671, %rs7602; - cvt.s32.s8 %r11672, %r11671; - mad.lo.s32 %r11673, %r61, %r11672, %r11664; - mad.lo.s32 %r11674, %r62, %r11670, %r11673; - mad.lo.s32 %r11675, %r64, %r11668, %r11674; - mad.lo.s32 %r11676, %r65, %r11666, %r11675; - ld.const.v4.u8 {%rs7610, %rs7611, %rs7612, %rs7613}, [matrix+3804]; - cvt.u32.u16 %r11677, %rs7613; - cvt.s32.s8 %r11678, %r11677; - cvt.u32.u16 %r11679, %rs7612; - cvt.s32.s8 %r11680, %r11679; - cvt.u32.u16 %r11681, %rs7611; - cvt.s32.s8 %r11682, %r11681; - cvt.u32.u16 %r11683, %rs7610; - cvt.s32.s8 %r11684, %r11683; - mad.lo.s32 %r11685, %r67, %r11684, %r11676; - mad.lo.s32 %r11686, %r68, %r11682, %r11685; - mad.lo.s32 %r11687, %r69, %r11680, %r11686; - mad.lo.s32 %r11688, %r70, %r11678, %r11687; - ld.const.v4.u8 {%rs7618, %rs7619, %rs7620, %rs7621}, [matrix+3808]; - cvt.u32.u16 %r11689, %rs7621; - cvt.s32.s8 %r11690, %r11689; - cvt.u32.u16 %r11691, %rs7620; - cvt.s32.s8 %r11692, %r11691; - cvt.u32.u16 %r11693, %rs7619; - cvt.s32.s8 %r11694, %r11693; - cvt.u32.u16 %r11695, %rs7618; - cvt.s32.s8 %r11696, %r11695; - mad.lo.s32 %r11697, %r222, %r11696, %r11688; - mad.lo.s32 %r11698, %r72, %r11694, %r11697; - mad.lo.s32 %r11699, %r73, %r11692, %r11698; - mad.lo.s32 %r11700, %r74, %r11690, %r11699; - ld.const.v4.u8 {%rs7626, %rs7627, %rs7628, %rs7629}, [matrix+3812]; - cvt.u32.u16 %r11701, %rs7629; - cvt.s32.s8 %r11702, %r11701; - cvt.u32.u16 %r11703, %rs7628; - cvt.s32.s8 %r11704, %r11703; - cvt.u32.u16 %r11705, %rs7627; - cvt.s32.s8 %r11706, %r11705; - cvt.u32.u16 %r11707, %rs7626; - cvt.s32.s8 %r11708, %r11707; - mad.lo.s32 %r11709, %r75, %r11708, %r11700; - mad.lo.s32 %r11710, %r76, %r11706, %r11709; - mad.lo.s32 %r11711, %r77, %r11704, %r11710; - mad.lo.s32 %r11712, %r78, %r11702, %r11711; - ld.const.v4.u8 {%rs7634, %rs7635, %rs7636, %rs7637}, [matrix+3816]; - cvt.u32.u16 %r11713, %rs7637; - cvt.s32.s8 %r11714, %r11713; - cvt.u32.u16 %r11715, %rs7636; - cvt.s32.s8 %r11716, %r11715; - cvt.u32.u16 %r11717, %rs7635; - cvt.s32.s8 %r11718, %r11717; - cvt.u32.u16 %r11719, %rs7634; - cvt.s32.s8 %r11720, %r11719; - mad.lo.s32 %r11721, %r80, %r11720, %r11712; - mad.lo.s32 %r11722, %r81, %r11718, %r11721; - mad.lo.s32 %r11723, %r83, %r11716, %r11722; - mad.lo.s32 %r11724, %r84, %r11714, %r11723; - ld.const.v4.u8 {%rs7642, %rs7643, %rs7644, %rs7645}, [matrix+3820]; - cvt.u32.u16 %r11725, %rs7645; - cvt.s32.s8 %r11726, %r11725; - cvt.u32.u16 %r11727, %rs7644; - cvt.s32.s8 %r11728, %r11727; - cvt.u32.u16 %r11729, %rs7643; - cvt.s32.s8 %r11730, %r11729; - cvt.u32.u16 %r11731, %rs7642; - cvt.s32.s8 %r11732, %r11731; - mad.lo.s32 %r11733, %r86, %r11732, %r11724; - mad.lo.s32 %r11734, %r87, %r11730, %r11733; - mad.lo.s32 %r11735, %r88, %r11728, %r11734; - mad.lo.s32 %r11736, %r89, %r11726, %r11735; - ld.const.v4.u8 {%rs7650, %rs7651, %rs7652, %rs7653}, [matrix+3824]; - cvt.u32.u16 %r11737, %rs7653; - cvt.s32.s8 %r11738, 
%r11737; - cvt.u32.u16 %r11739, %rs7652; - cvt.s32.s8 %r11740, %r11739; - cvt.u32.u16 %r11741, %rs7651; - cvt.s32.s8 %r11742, %r11741; - cvt.u32.u16 %r11743, %rs7650; - cvt.s32.s8 %r11744, %r11743; - mad.lo.s32 %r11745, %r271, %r11744, %r11736; - mad.lo.s32 %r11746, %r91, %r11742, %r11745; - mad.lo.s32 %r11747, %r93, %r11740, %r11746; - mad.lo.s32 %r11748, %r94, %r11738, %r11747; - ld.const.v4.u8 {%rs7658, %rs7659, %rs7660, %rs7661}, [matrix+3828]; - cvt.u32.u16 %r11749, %rs7661; - cvt.s32.s8 %r11750, %r11749; - cvt.u32.u16 %r11751, %rs7660; - cvt.s32.s8 %r11752, %r11751; - cvt.u32.u16 %r11753, %rs7659; - cvt.s32.s8 %r11754, %r11753; - cvt.u32.u16 %r11755, %rs7658; - cvt.s32.s8 %r11756, %r11755; - mad.lo.s32 %r11757, %r96, %r11756, %r11748; - mad.lo.s32 %r11758, %r97, %r11754, %r11757; - mad.lo.s32 %r11759, %r99, %r11752, %r11758; - mad.lo.s32 %r11760, %r100, %r11750, %r11759; - ld.const.v4.u8 {%rs7666, %rs7667, %rs7668, %rs7669}, [matrix+3832]; - cvt.u32.u16 %r11761, %rs7669; - cvt.s32.s8 %r11762, %r11761; - cvt.u32.u16 %r11763, %rs7668; - cvt.s32.s8 %r11764, %r11763; - cvt.u32.u16 %r11765, %rs7667; - cvt.s32.s8 %r11766, %r11765; - cvt.u32.u16 %r11767, %rs7666; - cvt.s32.s8 %r11768, %r11767; - mad.lo.s32 %r11769, %r103, %r11768, %r11760; - mad.lo.s32 %r11770, %r104, %r11766, %r11769; - mad.lo.s32 %r11771, %r107, %r11764, %r11770; - mad.lo.s32 %r11772, %r108, %r11762, %r11771; - ld.const.v4.u8 {%rs7674, %rs7675, %rs7676, %rs7677}, [matrix+3836]; - cvt.u32.u16 %r11773, %rs7677; - cvt.s32.s8 %r11774, %r11773; - cvt.u32.u16 %r11775, %rs7676; - cvt.s32.s8 %r11776, %r11775; - cvt.u32.u16 %r11777, %rs7675; - cvt.s32.s8 %r11778, %r11777; - cvt.u32.u16 %r11779, %rs7674; - cvt.s32.s8 %r11780, %r11779; - mad.lo.s32 %r11781, %r111, %r11780, %r11772; - mad.lo.s32 %r11782, %r112, %r11778, %r11781; - mad.lo.s32 %r11783, %r114, %r11776, %r11782; - mad.lo.s32 %r11784, %r115, %r11774, %r11783; - shr.u32 %r11785, %r11592, 6; - and.b32 %r11786, %r11785, 240; - shr.u32 %r11787, %r11784, 10; - or.b32 %r11788, %r11787, %r11786; - xor.b32 %r11789, %r105, %r11788; - cvt.u64.u32 %rd404, %r11789; - ld.const.v4.u8 {%rs7682, %rs7683, %rs7684, %rs7685}, [matrix+3840]; - cvt.u32.u16 %r11790, %rs7685; - cvt.s32.s8 %r11791, %r11790; - cvt.u32.u16 %r11792, %rs7684; - cvt.s32.s8 %r11793, %r11792; - cvt.u32.u16 %r11794, %rs7682; - cvt.s32.s8 %r11795, %r11794; - cvt.u32.u16 %r11796, %rs7683; - cvt.s32.s8 %r11797, %r11796; - mul.lo.s32 %r11798, %r34, %r11797; - mad.lo.s32 %r11799, %r124, %r11795, %r11798; - mad.lo.s32 %r11800, %r35, %r11793, %r11799; - mad.lo.s32 %r11801, %r36, %r11791, %r11800; - ld.const.v4.u8 {%rs7690, %rs7691, %rs7692, %rs7693}, [matrix+3844]; - cvt.u32.u16 %r11802, %rs7693; - cvt.s32.s8 %r11803, %r11802; - cvt.u32.u16 %r11804, %rs7692; - cvt.s32.s8 %r11805, %r11804; - cvt.u32.u16 %r11806, %rs7691; - cvt.s32.s8 %r11807, %r11806; - cvt.u32.u16 %r11808, %rs7690; - cvt.s32.s8 %r11809, %r11808; - mad.lo.s32 %r11810, %r37, %r11809, %r11801; - mad.lo.s32 %r11811, %r38, %r11807, %r11810; - mad.lo.s32 %r11812, %r39, %r11805, %r11811; - mad.lo.s32 %r11813, %r40, %r11803, %r11812; - ld.const.v4.u8 {%rs7698, %rs7699, %rs7700, %rs7701}, [matrix+3848]; - cvt.u32.u16 %r11814, %rs7701; - cvt.s32.s8 %r11815, %r11814; - cvt.u32.u16 %r11816, %rs7700; - cvt.s32.s8 %r11817, %r11816; - cvt.u32.u16 %r11818, %rs7699; - cvt.s32.s8 %r11819, %r11818; - cvt.u32.u16 %r11820, %rs7698; - cvt.s32.s8 %r11821, %r11820; - mad.lo.s32 %r11822, %r42, %r11821, %r11813; - mad.lo.s32 %r11823, %r43, %r11819, %r11822; - mad.lo.s32 %r11824, %r45, 
%r11817, %r11823; - mad.lo.s32 %r11825, %r46, %r11815, %r11824; - ld.const.v4.u8 {%rs7706, %rs7707, %rs7708, %rs7709}, [matrix+3852]; - cvt.u32.u16 %r11826, %rs7709; - cvt.s32.s8 %r11827, %r11826; - cvt.u32.u16 %r11828, %rs7708; - cvt.s32.s8 %r11829, %r11828; - cvt.u32.u16 %r11830, %rs7707; - cvt.s32.s8 %r11831, %r11830; - cvt.u32.u16 %r11832, %rs7706; - cvt.s32.s8 %r11833, %r11832; - mad.lo.s32 %r11834, %r48, %r11833, %r11825; - mad.lo.s32 %r11835, %r49, %r11831, %r11834; - mad.lo.s32 %r11836, %r50, %r11829, %r11835; - mad.lo.s32 %r11837, %r51, %r11827, %r11836; - ld.const.v4.u8 {%rs7714, %rs7715, %rs7716, %rs7717}, [matrix+3856]; - cvt.u32.u16 %r11838, %rs7717; - cvt.s32.s8 %r11839, %r11838; - cvt.u32.u16 %r11840, %rs7716; - cvt.s32.s8 %r11841, %r11840; - cvt.u32.u16 %r11842, %rs7715; - cvt.s32.s8 %r11843, %r11842; - cvt.u32.u16 %r11844, %rs7714; - cvt.s32.s8 %r11845, %r11844; - mad.lo.s32 %r11846, %r173, %r11845, %r11837; - mad.lo.s32 %r11847, %r53, %r11843, %r11846; - mad.lo.s32 %r11848, %r54, %r11841, %r11847; - mad.lo.s32 %r11849, %r55, %r11839, %r11848; - ld.const.v4.u8 {%rs7722, %rs7723, %rs7724, %rs7725}, [matrix+3860]; - cvt.u32.u16 %r11850, %rs7725; - cvt.s32.s8 %r11851, %r11850; - cvt.u32.u16 %r11852, %rs7724; - cvt.s32.s8 %r11853, %r11852; - cvt.u32.u16 %r11854, %rs7723; - cvt.s32.s8 %r11855, %r11854; - cvt.u32.u16 %r11856, %rs7722; - cvt.s32.s8 %r11857, %r11856; - mad.lo.s32 %r11858, %r56, %r11857, %r11849; - mad.lo.s32 %r11859, %r57, %r11855, %r11858; - mad.lo.s32 %r11860, %r58, %r11853, %r11859; - mad.lo.s32 %r11861, %r59, %r11851, %r11860; - ld.const.v4.u8 {%rs7730, %rs7731, %rs7732, %rs7733}, [matrix+3864]; - cvt.u32.u16 %r11862, %rs7733; - cvt.s32.s8 %r11863, %r11862; - cvt.u32.u16 %r11864, %rs7732; - cvt.s32.s8 %r11865, %r11864; - cvt.u32.u16 %r11866, %rs7731; - cvt.s32.s8 %r11867, %r11866; - cvt.u32.u16 %r11868, %rs7730; - cvt.s32.s8 %r11869, %r11868; - mad.lo.s32 %r11870, %r61, %r11869, %r11861; - mad.lo.s32 %r11871, %r62, %r11867, %r11870; - mad.lo.s32 %r11872, %r64, %r11865, %r11871; - mad.lo.s32 %r11873, %r65, %r11863, %r11872; - ld.const.v4.u8 {%rs7738, %rs7739, %rs7740, %rs7741}, [matrix+3868]; - cvt.u32.u16 %r11874, %rs7741; - cvt.s32.s8 %r11875, %r11874; - cvt.u32.u16 %r11876, %rs7740; - cvt.s32.s8 %r11877, %r11876; - cvt.u32.u16 %r11878, %rs7739; - cvt.s32.s8 %r11879, %r11878; - cvt.u32.u16 %r11880, %rs7738; - cvt.s32.s8 %r11881, %r11880; - mad.lo.s32 %r11882, %r67, %r11881, %r11873; - mad.lo.s32 %r11883, %r68, %r11879, %r11882; - mad.lo.s32 %r11884, %r69, %r11877, %r11883; - mad.lo.s32 %r11885, %r70, %r11875, %r11884; - ld.const.v4.u8 {%rs7746, %rs7747, %rs7748, %rs7749}, [matrix+3872]; - cvt.u32.u16 %r11886, %rs7749; - cvt.s32.s8 %r11887, %r11886; - cvt.u32.u16 %r11888, %rs7748; - cvt.s32.s8 %r11889, %r11888; - cvt.u32.u16 %r11890, %rs7747; - cvt.s32.s8 %r11891, %r11890; - cvt.u32.u16 %r11892, %rs7746; - cvt.s32.s8 %r11893, %r11892; - mad.lo.s32 %r11894, %r222, %r11893, %r11885; - mad.lo.s32 %r11895, %r72, %r11891, %r11894; - mad.lo.s32 %r11896, %r73, %r11889, %r11895; - mad.lo.s32 %r11897, %r74, %r11887, %r11896; - ld.const.v4.u8 {%rs7754, %rs7755, %rs7756, %rs7757}, [matrix+3876]; - cvt.u32.u16 %r11898, %rs7757; - cvt.s32.s8 %r11899, %r11898; - cvt.u32.u16 %r11900, %rs7756; - cvt.s32.s8 %r11901, %r11900; - cvt.u32.u16 %r11902, %rs7755; - cvt.s32.s8 %r11903, %r11902; - cvt.u32.u16 %r11904, %rs7754; - cvt.s32.s8 %r11905, %r11904; - mad.lo.s32 %r11906, %r75, %r11905, %r11897; - mad.lo.s32 %r11907, %r76, %r11903, %r11906; - mad.lo.s32 %r11908, %r77, %r11901, 
%r11907; - mad.lo.s32 %r11909, %r78, %r11899, %r11908; - ld.const.v4.u8 {%rs7762, %rs7763, %rs7764, %rs7765}, [matrix+3880]; - cvt.u32.u16 %r11910, %rs7765; - cvt.s32.s8 %r11911, %r11910; - cvt.u32.u16 %r11912, %rs7764; - cvt.s32.s8 %r11913, %r11912; - cvt.u32.u16 %r11914, %rs7763; - cvt.s32.s8 %r11915, %r11914; - cvt.u32.u16 %r11916, %rs7762; - cvt.s32.s8 %r11917, %r11916; - mad.lo.s32 %r11918, %r80, %r11917, %r11909; - mad.lo.s32 %r11919, %r81, %r11915, %r11918; - mad.lo.s32 %r11920, %r83, %r11913, %r11919; - mad.lo.s32 %r11921, %r84, %r11911, %r11920; - ld.const.v4.u8 {%rs7770, %rs7771, %rs7772, %rs7773}, [matrix+3884]; - cvt.u32.u16 %r11922, %rs7773; - cvt.s32.s8 %r11923, %r11922; - cvt.u32.u16 %r11924, %rs7772; - cvt.s32.s8 %r11925, %r11924; - cvt.u32.u16 %r11926, %rs7771; - cvt.s32.s8 %r11927, %r11926; - cvt.u32.u16 %r11928, %rs7770; - cvt.s32.s8 %r11929, %r11928; - mad.lo.s32 %r11930, %r86, %r11929, %r11921; - mad.lo.s32 %r11931, %r87, %r11927, %r11930; - mad.lo.s32 %r11932, %r88, %r11925, %r11931; - mad.lo.s32 %r11933, %r89, %r11923, %r11932; - ld.const.v4.u8 {%rs7778, %rs7779, %rs7780, %rs7781}, [matrix+3888]; - cvt.u32.u16 %r11934, %rs7781; - cvt.s32.s8 %r11935, %r11934; - cvt.u32.u16 %r11936, %rs7780; - cvt.s32.s8 %r11937, %r11936; - cvt.u32.u16 %r11938, %rs7779; - cvt.s32.s8 %r11939, %r11938; - cvt.u32.u16 %r11940, %rs7778; - cvt.s32.s8 %r11941, %r11940; - mad.lo.s32 %r11942, %r271, %r11941, %r11933; - mad.lo.s32 %r11943, %r91, %r11939, %r11942; - mad.lo.s32 %r11944, %r93, %r11937, %r11943; - mad.lo.s32 %r11945, %r94, %r11935, %r11944; - ld.const.v4.u8 {%rs7786, %rs7787, %rs7788, %rs7789}, [matrix+3892]; - cvt.u32.u16 %r11946, %rs7789; - cvt.s32.s8 %r11947, %r11946; - cvt.u32.u16 %r11948, %rs7788; - cvt.s32.s8 %r11949, %r11948; - cvt.u32.u16 %r11950, %rs7787; - cvt.s32.s8 %r11951, %r11950; - cvt.u32.u16 %r11952, %rs7786; - cvt.s32.s8 %r11953, %r11952; - mad.lo.s32 %r11954, %r96, %r11953, %r11945; - mad.lo.s32 %r11955, %r97, %r11951, %r11954; - mad.lo.s32 %r11956, %r99, %r11949, %r11955; - mad.lo.s32 %r11957, %r100, %r11947, %r11956; - ld.const.v4.u8 {%rs7794, %rs7795, %rs7796, %rs7797}, [matrix+3896]; - cvt.u32.u16 %r11958, %rs7797; - cvt.s32.s8 %r11959, %r11958; - cvt.u32.u16 %r11960, %rs7796; - cvt.s32.s8 %r11961, %r11960; - cvt.u32.u16 %r11962, %rs7795; - cvt.s32.s8 %r11963, %r11962; - cvt.u32.u16 %r11964, %rs7794; - cvt.s32.s8 %r11965, %r11964; - mad.lo.s32 %r11966, %r103, %r11965, %r11957; - mad.lo.s32 %r11967, %r104, %r11963, %r11966; - mad.lo.s32 %r11968, %r107, %r11961, %r11967; - mad.lo.s32 %r11969, %r108, %r11959, %r11968; - ld.const.v4.u8 {%rs7802, %rs7803, %rs7804, %rs7805}, [matrix+3900]; - cvt.u32.u16 %r11970, %rs7805; - cvt.s32.s8 %r11971, %r11970; - cvt.u32.u16 %r11972, %rs7804; - cvt.s32.s8 %r11973, %r11972; - cvt.u32.u16 %r11974, %rs7803; - cvt.s32.s8 %r11975, %r11974; - cvt.u32.u16 %r11976, %rs7802; - cvt.s32.s8 %r11977, %r11976; - mad.lo.s32 %r11978, %r111, %r11977, %r11969; - mad.lo.s32 %r11979, %r112, %r11975, %r11978; - mad.lo.s32 %r11980, %r114, %r11973, %r11979; - mad.lo.s32 %r11981, %r115, %r11971, %r11980; - ld.const.v4.u8 {%rs7810, %rs7811, %rs7812, %rs7813}, [matrix+3904]; - cvt.u32.u16 %r11982, %rs7813; - cvt.s32.s8 %r11983, %r11982; - cvt.u32.u16 %r11984, %rs7812; - cvt.s32.s8 %r11985, %r11984; - cvt.u32.u16 %r11986, %rs7810; - cvt.s32.s8 %r11987, %r11986; - cvt.u32.u16 %r11988, %rs7811; - cvt.s32.s8 %r11989, %r11988; - mul.lo.s32 %r11990, %r34, %r11989; - mad.lo.s32 %r11991, %r124, %r11987, %r11990; - mad.lo.s32 %r11992, %r35, %r11985, %r11991; - 
mad.lo.s32 %r11993, %r36, %r11983, %r11992; - ld.const.v4.u8 {%rs7818, %rs7819, %rs7820, %rs7821}, [matrix+3908]; - cvt.u32.u16 %r11994, %rs7821; - cvt.s32.s8 %r11995, %r11994; - cvt.u32.u16 %r11996, %rs7820; - cvt.s32.s8 %r11997, %r11996; - cvt.u32.u16 %r11998, %rs7819; - cvt.s32.s8 %r11999, %r11998; - cvt.u32.u16 %r12000, %rs7818; - cvt.s32.s8 %r12001, %r12000; - mad.lo.s32 %r12002, %r37, %r12001, %r11993; - mad.lo.s32 %r12003, %r38, %r11999, %r12002; - mad.lo.s32 %r12004, %r39, %r11997, %r12003; - mad.lo.s32 %r12005, %r40, %r11995, %r12004; - ld.const.v4.u8 {%rs7826, %rs7827, %rs7828, %rs7829}, [matrix+3912]; - cvt.u32.u16 %r12006, %rs7829; - cvt.s32.s8 %r12007, %r12006; - cvt.u32.u16 %r12008, %rs7828; - cvt.s32.s8 %r12009, %r12008; - cvt.u32.u16 %r12010, %rs7827; - cvt.s32.s8 %r12011, %r12010; - cvt.u32.u16 %r12012, %rs7826; - cvt.s32.s8 %r12013, %r12012; - mad.lo.s32 %r12014, %r42, %r12013, %r12005; - mad.lo.s32 %r12015, %r43, %r12011, %r12014; - mad.lo.s32 %r12016, %r45, %r12009, %r12015; - mad.lo.s32 %r12017, %r46, %r12007, %r12016; - ld.const.v4.u8 {%rs7834, %rs7835, %rs7836, %rs7837}, [matrix+3916]; - cvt.u32.u16 %r12018, %rs7837; - cvt.s32.s8 %r12019, %r12018; - cvt.u32.u16 %r12020, %rs7836; - cvt.s32.s8 %r12021, %r12020; - cvt.u32.u16 %r12022, %rs7835; - cvt.s32.s8 %r12023, %r12022; - cvt.u32.u16 %r12024, %rs7834; - cvt.s32.s8 %r12025, %r12024; - mad.lo.s32 %r12026, %r48, %r12025, %r12017; - mad.lo.s32 %r12027, %r49, %r12023, %r12026; - mad.lo.s32 %r12028, %r50, %r12021, %r12027; - mad.lo.s32 %r12029, %r51, %r12019, %r12028; - ld.const.v4.u8 {%rs7842, %rs7843, %rs7844, %rs7845}, [matrix+3920]; - cvt.u32.u16 %r12030, %rs7845; - cvt.s32.s8 %r12031, %r12030; - cvt.u32.u16 %r12032, %rs7844; - cvt.s32.s8 %r12033, %r12032; - cvt.u32.u16 %r12034, %rs7843; - cvt.s32.s8 %r12035, %r12034; - cvt.u32.u16 %r12036, %rs7842; - cvt.s32.s8 %r12037, %r12036; - mad.lo.s32 %r12038, %r173, %r12037, %r12029; - mad.lo.s32 %r12039, %r53, %r12035, %r12038; - mad.lo.s32 %r12040, %r54, %r12033, %r12039; - mad.lo.s32 %r12041, %r55, %r12031, %r12040; - ld.const.v4.u8 {%rs7850, %rs7851, %rs7852, %rs7853}, [matrix+3924]; - cvt.u32.u16 %r12042, %rs7853; - cvt.s32.s8 %r12043, %r12042; - cvt.u32.u16 %r12044, %rs7852; - cvt.s32.s8 %r12045, %r12044; - cvt.u32.u16 %r12046, %rs7851; - cvt.s32.s8 %r12047, %r12046; - cvt.u32.u16 %r12048, %rs7850; - cvt.s32.s8 %r12049, %r12048; - mad.lo.s32 %r12050, %r56, %r12049, %r12041; - mad.lo.s32 %r12051, %r57, %r12047, %r12050; - mad.lo.s32 %r12052, %r58, %r12045, %r12051; - mad.lo.s32 %r12053, %r59, %r12043, %r12052; - ld.const.v4.u8 {%rs7858, %rs7859, %rs7860, %rs7861}, [matrix+3928]; - cvt.u32.u16 %r12054, %rs7861; - cvt.s32.s8 %r12055, %r12054; - cvt.u32.u16 %r12056, %rs7860; - cvt.s32.s8 %r12057, %r12056; - cvt.u32.u16 %r12058, %rs7859; - cvt.s32.s8 %r12059, %r12058; - cvt.u32.u16 %r12060, %rs7858; - cvt.s32.s8 %r12061, %r12060; - mad.lo.s32 %r12062, %r61, %r12061, %r12053; - mad.lo.s32 %r12063, %r62, %r12059, %r12062; - mad.lo.s32 %r12064, %r64, %r12057, %r12063; - mad.lo.s32 %r12065, %r65, %r12055, %r12064; - ld.const.v4.u8 {%rs7866, %rs7867, %rs7868, %rs7869}, [matrix+3932]; - cvt.u32.u16 %r12066, %rs7869; - cvt.s32.s8 %r12067, %r12066; - cvt.u32.u16 %r12068, %rs7868; - cvt.s32.s8 %r12069, %r12068; - cvt.u32.u16 %r12070, %rs7867; - cvt.s32.s8 %r12071, %r12070; - cvt.u32.u16 %r12072, %rs7866; - cvt.s32.s8 %r12073, %r12072; - mad.lo.s32 %r12074, %r67, %r12073, %r12065; - mad.lo.s32 %r12075, %r68, %r12071, %r12074; - mad.lo.s32 %r12076, %r69, %r12069, %r12075; - mad.lo.s32 
%r12077, %r70, %r12067, %r12076; - ld.const.v4.u8 {%rs7874, %rs7875, %rs7876, %rs7877}, [matrix+3936]; - cvt.u32.u16 %r12078, %rs7877; - cvt.s32.s8 %r12079, %r12078; - cvt.u32.u16 %r12080, %rs7876; - cvt.s32.s8 %r12081, %r12080; - cvt.u32.u16 %r12082, %rs7875; - cvt.s32.s8 %r12083, %r12082; - cvt.u32.u16 %r12084, %rs7874; - cvt.s32.s8 %r12085, %r12084; - mad.lo.s32 %r12086, %r222, %r12085, %r12077; - mad.lo.s32 %r12087, %r72, %r12083, %r12086; - mad.lo.s32 %r12088, %r73, %r12081, %r12087; - mad.lo.s32 %r12089, %r74, %r12079, %r12088; - ld.const.v4.u8 {%rs7882, %rs7883, %rs7884, %rs7885}, [matrix+3940]; - cvt.u32.u16 %r12090, %rs7885; - cvt.s32.s8 %r12091, %r12090; - cvt.u32.u16 %r12092, %rs7884; - cvt.s32.s8 %r12093, %r12092; - cvt.u32.u16 %r12094, %rs7883; - cvt.s32.s8 %r12095, %r12094; - cvt.u32.u16 %r12096, %rs7882; - cvt.s32.s8 %r12097, %r12096; - mad.lo.s32 %r12098, %r75, %r12097, %r12089; - mad.lo.s32 %r12099, %r76, %r12095, %r12098; - mad.lo.s32 %r12100, %r77, %r12093, %r12099; - mad.lo.s32 %r12101, %r78, %r12091, %r12100; - ld.const.v4.u8 {%rs7890, %rs7891, %rs7892, %rs7893}, [matrix+3944]; - cvt.u32.u16 %r12102, %rs7893; - cvt.s32.s8 %r12103, %r12102; - cvt.u32.u16 %r12104, %rs7892; - cvt.s32.s8 %r12105, %r12104; - cvt.u32.u16 %r12106, %rs7891; - cvt.s32.s8 %r12107, %r12106; - cvt.u32.u16 %r12108, %rs7890; - cvt.s32.s8 %r12109, %r12108; - mad.lo.s32 %r12110, %r80, %r12109, %r12101; - mad.lo.s32 %r12111, %r81, %r12107, %r12110; - mad.lo.s32 %r12112, %r83, %r12105, %r12111; - mad.lo.s32 %r12113, %r84, %r12103, %r12112; - ld.const.v4.u8 {%rs7898, %rs7899, %rs7900, %rs7901}, [matrix+3948]; - cvt.u32.u16 %r12114, %rs7901; - cvt.s32.s8 %r12115, %r12114; - cvt.u32.u16 %r12116, %rs7900; - cvt.s32.s8 %r12117, %r12116; - cvt.u32.u16 %r12118, %rs7899; - cvt.s32.s8 %r12119, %r12118; - cvt.u32.u16 %r12120, %rs7898; - cvt.s32.s8 %r12121, %r12120; - mad.lo.s32 %r12122, %r86, %r12121, %r12113; - mad.lo.s32 %r12123, %r87, %r12119, %r12122; - mad.lo.s32 %r12124, %r88, %r12117, %r12123; - mad.lo.s32 %r12125, %r89, %r12115, %r12124; - ld.const.v4.u8 {%rs7906, %rs7907, %rs7908, %rs7909}, [matrix+3952]; - cvt.u32.u16 %r12126, %rs7909; - cvt.s32.s8 %r12127, %r12126; - cvt.u32.u16 %r12128, %rs7908; - cvt.s32.s8 %r12129, %r12128; - cvt.u32.u16 %r12130, %rs7907; - cvt.s32.s8 %r12131, %r12130; - cvt.u32.u16 %r12132, %rs7906; - cvt.s32.s8 %r12133, %r12132; - mad.lo.s32 %r12134, %r271, %r12133, %r12125; - mad.lo.s32 %r12135, %r91, %r12131, %r12134; - mad.lo.s32 %r12136, %r93, %r12129, %r12135; - mad.lo.s32 %r12137, %r94, %r12127, %r12136; - ld.const.v4.u8 {%rs7914, %rs7915, %rs7916, %rs7917}, [matrix+3956]; - cvt.u32.u16 %r12138, %rs7917; - cvt.s32.s8 %r12139, %r12138; - cvt.u32.u16 %r12140, %rs7916; - cvt.s32.s8 %r12141, %r12140; - cvt.u32.u16 %r12142, %rs7915; - cvt.s32.s8 %r12143, %r12142; - cvt.u32.u16 %r12144, %rs7914; - cvt.s32.s8 %r12145, %r12144; - mad.lo.s32 %r12146, %r96, %r12145, %r12137; - mad.lo.s32 %r12147, %r97, %r12143, %r12146; - mad.lo.s32 %r12148, %r99, %r12141, %r12147; - mad.lo.s32 %r12149, %r100, %r12139, %r12148; - ld.const.v4.u8 {%rs7922, %rs7923, %rs7924, %rs7925}, [matrix+3960]; - cvt.u32.u16 %r12150, %rs7925; - cvt.s32.s8 %r12151, %r12150; - cvt.u32.u16 %r12152, %rs7924; - cvt.s32.s8 %r12153, %r12152; - cvt.u32.u16 %r12154, %rs7923; - cvt.s32.s8 %r12155, %r12154; - cvt.u32.u16 %r12156, %rs7922; - cvt.s32.s8 %r12157, %r12156; - mad.lo.s32 %r12158, %r103, %r12157, %r12149; - mad.lo.s32 %r12159, %r104, %r12155, %r12158; - mad.lo.s32 %r12160, %r107, %r12153, %r12159; - mad.lo.s32 
%r12161, %r108, %r12151, %r12160; - ld.const.v4.u8 {%rs7930, %rs7931, %rs7932, %rs7933}, [matrix+3964]; - cvt.u32.u16 %r12162, %rs7933; - cvt.s32.s8 %r12163, %r12162; - cvt.u32.u16 %r12164, %rs7932; - cvt.s32.s8 %r12165, %r12164; - cvt.u32.u16 %r12166, %rs7931; - cvt.s32.s8 %r12167, %r12166; - cvt.u32.u16 %r12168, %rs7930; - cvt.s32.s8 %r12169, %r12168; - mad.lo.s32 %r12170, %r111, %r12169, %r12161; - mad.lo.s32 %r12171, %r112, %r12167, %r12170; - mad.lo.s32 %r12172, %r114, %r12165, %r12171; - mad.lo.s32 %r12173, %r115, %r12163, %r12172; - shr.u32 %r12174, %r11981, 6; - and.b32 %r12175, %r12174, 240; - shr.u32 %r12176, %r12173, 10; - or.b32 %r12177, %r12176, %r12175; - xor.b32 %r12178, %r109, %r12177; - cvt.u64.u32 %rd405, %r12178; - ld.const.v4.u8 {%rs7938, %rs7939, %rs7940, %rs7941}, [matrix+3968]; - cvt.u32.u16 %r12179, %rs7941; - cvt.s32.s8 %r12180, %r12179; - cvt.u32.u16 %r12181, %rs7940; - cvt.s32.s8 %r12182, %r12181; - cvt.u32.u16 %r12183, %rs7938; - cvt.s32.s8 %r12184, %r12183; - cvt.u32.u16 %r12185, %rs7939; - cvt.s32.s8 %r12186, %r12185; - mul.lo.s32 %r12187, %r34, %r12186; - mad.lo.s32 %r12188, %r124, %r12184, %r12187; - mad.lo.s32 %r12189, %r35, %r12182, %r12188; - mad.lo.s32 %r12190, %r36, %r12180, %r12189; - ld.const.v4.u8 {%rs7946, %rs7947, %rs7948, %rs7949}, [matrix+3972]; - cvt.u32.u16 %r12191, %rs7949; - cvt.s32.s8 %r12192, %r12191; - cvt.u32.u16 %r12193, %rs7948; - cvt.s32.s8 %r12194, %r12193; - cvt.u32.u16 %r12195, %rs7947; - cvt.s32.s8 %r12196, %r12195; - cvt.u32.u16 %r12197, %rs7946; - cvt.s32.s8 %r12198, %r12197; - mad.lo.s32 %r12199, %r37, %r12198, %r12190; - mad.lo.s32 %r12200, %r38, %r12196, %r12199; - mad.lo.s32 %r12201, %r39, %r12194, %r12200; - mad.lo.s32 %r12202, %r40, %r12192, %r12201; - ld.const.v4.u8 {%rs7954, %rs7955, %rs7956, %rs7957}, [matrix+3976]; - cvt.u32.u16 %r12203, %rs7957; - cvt.s32.s8 %r12204, %r12203; - cvt.u32.u16 %r12205, %rs7956; - cvt.s32.s8 %r12206, %r12205; - cvt.u32.u16 %r12207, %rs7955; - cvt.s32.s8 %r12208, %r12207; - cvt.u32.u16 %r12209, %rs7954; - cvt.s32.s8 %r12210, %r12209; - mad.lo.s32 %r12211, %r42, %r12210, %r12202; - mad.lo.s32 %r12212, %r43, %r12208, %r12211; - mad.lo.s32 %r12213, %r45, %r12206, %r12212; - mad.lo.s32 %r12214, %r46, %r12204, %r12213; - ld.const.v4.u8 {%rs7962, %rs7963, %rs7964, %rs7965}, [matrix+3980]; - cvt.u32.u16 %r12215, %rs7965; - cvt.s32.s8 %r12216, %r12215; - cvt.u32.u16 %r12217, %rs7964; - cvt.s32.s8 %r12218, %r12217; - cvt.u32.u16 %r12219, %rs7963; - cvt.s32.s8 %r12220, %r12219; - cvt.u32.u16 %r12221, %rs7962; - cvt.s32.s8 %r12222, %r12221; - mad.lo.s32 %r12223, %r48, %r12222, %r12214; - mad.lo.s32 %r12224, %r49, %r12220, %r12223; - mad.lo.s32 %r12225, %r50, %r12218, %r12224; - mad.lo.s32 %r12226, %r51, %r12216, %r12225; - ld.const.v4.u8 {%rs7970, %rs7971, %rs7972, %rs7973}, [matrix+3984]; - cvt.u32.u16 %r12227, %rs7973; - cvt.s32.s8 %r12228, %r12227; - cvt.u32.u16 %r12229, %rs7972; - cvt.s32.s8 %r12230, %r12229; - cvt.u32.u16 %r12231, %rs7971; - cvt.s32.s8 %r12232, %r12231; - cvt.u32.u16 %r12233, %rs7970; - cvt.s32.s8 %r12234, %r12233; - mad.lo.s32 %r12235, %r173, %r12234, %r12226; - mad.lo.s32 %r12236, %r53, %r12232, %r12235; - mad.lo.s32 %r12237, %r54, %r12230, %r12236; - mad.lo.s32 %r12238, %r55, %r12228, %r12237; - ld.const.v4.u8 {%rs7978, %rs7979, %rs7980, %rs7981}, [matrix+3988]; - cvt.u32.u16 %r12239, %rs7981; - cvt.s32.s8 %r12240, %r12239; - cvt.u32.u16 %r12241, %rs7980; - cvt.s32.s8 %r12242, %r12241; - cvt.u32.u16 %r12243, %rs7979; - cvt.s32.s8 %r12244, %r12243; - cvt.u32.u16 %r12245, 
%rs7978; - cvt.s32.s8 %r12246, %r12245; - mad.lo.s32 %r12247, %r56, %r12246, %r12238; - mad.lo.s32 %r12248, %r57, %r12244, %r12247; - mad.lo.s32 %r12249, %r58, %r12242, %r12248; - mad.lo.s32 %r12250, %r59, %r12240, %r12249; - ld.const.v4.u8 {%rs7986, %rs7987, %rs7988, %rs7989}, [matrix+3992]; - cvt.u32.u16 %r12251, %rs7989; - cvt.s32.s8 %r12252, %r12251; - cvt.u32.u16 %r12253, %rs7988; - cvt.s32.s8 %r12254, %r12253; - cvt.u32.u16 %r12255, %rs7987; - cvt.s32.s8 %r12256, %r12255; - cvt.u32.u16 %r12257, %rs7986; - cvt.s32.s8 %r12258, %r12257; - mad.lo.s32 %r12259, %r61, %r12258, %r12250; - mad.lo.s32 %r12260, %r62, %r12256, %r12259; - mad.lo.s32 %r12261, %r64, %r12254, %r12260; - mad.lo.s32 %r12262, %r65, %r12252, %r12261; - ld.const.v4.u8 {%rs7994, %rs7995, %rs7996, %rs7997}, [matrix+3996]; - cvt.u32.u16 %r12263, %rs7997; - cvt.s32.s8 %r12264, %r12263; - cvt.u32.u16 %r12265, %rs7996; - cvt.s32.s8 %r12266, %r12265; - cvt.u32.u16 %r12267, %rs7995; - cvt.s32.s8 %r12268, %r12267; - cvt.u32.u16 %r12269, %rs7994; - cvt.s32.s8 %r12270, %r12269; - mad.lo.s32 %r12271, %r67, %r12270, %r12262; - mad.lo.s32 %r12272, %r68, %r12268, %r12271; - mad.lo.s32 %r12273, %r69, %r12266, %r12272; - mad.lo.s32 %r12274, %r70, %r12264, %r12273; - ld.const.v4.u8 {%rs8002, %rs8003, %rs8004, %rs8005}, [matrix+4000]; - cvt.u32.u16 %r12275, %rs8005; - cvt.s32.s8 %r12276, %r12275; - cvt.u32.u16 %r12277, %rs8004; - cvt.s32.s8 %r12278, %r12277; - cvt.u32.u16 %r12279, %rs8003; - cvt.s32.s8 %r12280, %r12279; - cvt.u32.u16 %r12281, %rs8002; - cvt.s32.s8 %r12282, %r12281; - mad.lo.s32 %r12283, %r222, %r12282, %r12274; - mad.lo.s32 %r12284, %r72, %r12280, %r12283; - mad.lo.s32 %r12285, %r73, %r12278, %r12284; - mad.lo.s32 %r12286, %r74, %r12276, %r12285; - ld.const.v4.u8 {%rs8010, %rs8011, %rs8012, %rs8013}, [matrix+4004]; - cvt.u32.u16 %r12287, %rs8013; - cvt.s32.s8 %r12288, %r12287; - cvt.u32.u16 %r12289, %rs8012; - cvt.s32.s8 %r12290, %r12289; - cvt.u32.u16 %r12291, %rs8011; - cvt.s32.s8 %r12292, %r12291; - cvt.u32.u16 %r12293, %rs8010; - cvt.s32.s8 %r12294, %r12293; - mad.lo.s32 %r12295, %r75, %r12294, %r12286; - mad.lo.s32 %r12296, %r76, %r12292, %r12295; - mad.lo.s32 %r12297, %r77, %r12290, %r12296; - mad.lo.s32 %r12298, %r78, %r12288, %r12297; - ld.const.v4.u8 {%rs8018, %rs8019, %rs8020, %rs8021}, [matrix+4008]; - cvt.u32.u16 %r12299, %rs8021; - cvt.s32.s8 %r12300, %r12299; - cvt.u32.u16 %r12301, %rs8020; - cvt.s32.s8 %r12302, %r12301; - cvt.u32.u16 %r12303, %rs8019; - cvt.s32.s8 %r12304, %r12303; - cvt.u32.u16 %r12305, %rs8018; - cvt.s32.s8 %r12306, %r12305; - mad.lo.s32 %r12307, %r80, %r12306, %r12298; - mad.lo.s32 %r12308, %r81, %r12304, %r12307; - mad.lo.s32 %r12309, %r83, %r12302, %r12308; - mad.lo.s32 %r12310, %r84, %r12300, %r12309; - ld.const.v4.u8 {%rs8026, %rs8027, %rs8028, %rs8029}, [matrix+4012]; - cvt.u32.u16 %r12311, %rs8029; - cvt.s32.s8 %r12312, %r12311; - cvt.u32.u16 %r12313, %rs8028; - cvt.s32.s8 %r12314, %r12313; - cvt.u32.u16 %r12315, %rs8027; - cvt.s32.s8 %r12316, %r12315; - cvt.u32.u16 %r12317, %rs8026; - cvt.s32.s8 %r12318, %r12317; - mad.lo.s32 %r12319, %r86, %r12318, %r12310; - mad.lo.s32 %r12320, %r87, %r12316, %r12319; - mad.lo.s32 %r12321, %r88, %r12314, %r12320; - mad.lo.s32 %r12322, %r89, %r12312, %r12321; - ld.const.v4.u8 {%rs8034, %rs8035, %rs8036, %rs8037}, [matrix+4016]; - cvt.u32.u16 %r12323, %rs8037; - cvt.s32.s8 %r12324, %r12323; - cvt.u32.u16 %r12325, %rs8036; - cvt.s32.s8 %r12326, %r12325; - cvt.u32.u16 %r12327, %rs8035; - cvt.s32.s8 %r12328, %r12327; - cvt.u32.u16 %r12329, %rs8034; - 
cvt.s32.s8 %r12330, %r12329; - mad.lo.s32 %r12331, %r271, %r12330, %r12322; - mad.lo.s32 %r12332, %r91, %r12328, %r12331; - mad.lo.s32 %r12333, %r93, %r12326, %r12332; - mad.lo.s32 %r12334, %r94, %r12324, %r12333; - ld.const.v4.u8 {%rs8042, %rs8043, %rs8044, %rs8045}, [matrix+4020]; - cvt.u32.u16 %r12335, %rs8045; - cvt.s32.s8 %r12336, %r12335; - cvt.u32.u16 %r12337, %rs8044; - cvt.s32.s8 %r12338, %r12337; - cvt.u32.u16 %r12339, %rs8043; - cvt.s32.s8 %r12340, %r12339; - cvt.u32.u16 %r12341, %rs8042; - cvt.s32.s8 %r12342, %r12341; - mad.lo.s32 %r12343, %r96, %r12342, %r12334; - mad.lo.s32 %r12344, %r97, %r12340, %r12343; - mad.lo.s32 %r12345, %r99, %r12338, %r12344; - mad.lo.s32 %r12346, %r100, %r12336, %r12345; - ld.const.v4.u8 {%rs8050, %rs8051, %rs8052, %rs8053}, [matrix+4024]; - cvt.u32.u16 %r12347, %rs8053; - cvt.s32.s8 %r12348, %r12347; - cvt.u32.u16 %r12349, %rs8052; - cvt.s32.s8 %r12350, %r12349; - cvt.u32.u16 %r12351, %rs8051; - cvt.s32.s8 %r12352, %r12351; - cvt.u32.u16 %r12353, %rs8050; - cvt.s32.s8 %r12354, %r12353; - mad.lo.s32 %r12355, %r103, %r12354, %r12346; - mad.lo.s32 %r12356, %r104, %r12352, %r12355; - mad.lo.s32 %r12357, %r107, %r12350, %r12356; - mad.lo.s32 %r12358, %r108, %r12348, %r12357; - ld.const.v4.u8 {%rs8058, %rs8059, %rs8060, %rs8061}, [matrix+4028]; - cvt.u32.u16 %r12359, %rs8061; - cvt.s32.s8 %r12360, %r12359; - cvt.u32.u16 %r12361, %rs8060; - cvt.s32.s8 %r12362, %r12361; - cvt.u32.u16 %r12363, %rs8059; - cvt.s32.s8 %r12364, %r12363; - cvt.u32.u16 %r12365, %rs8058; - cvt.s32.s8 %r12366, %r12365; - mad.lo.s32 %r12367, %r111, %r12366, %r12358; - mad.lo.s32 %r12368, %r112, %r12364, %r12367; - mad.lo.s32 %r12369, %r114, %r12362, %r12368; - mad.lo.s32 %r12370, %r115, %r12360, %r12369; - ld.const.v4.u8 {%rs8066, %rs8067, %rs8068, %rs8069}, [matrix+4032]; - cvt.u32.u16 %r12371, %rs8069; - cvt.s32.s8 %r12372, %r12371; - cvt.u32.u16 %r12373, %rs8068; - cvt.s32.s8 %r12374, %r12373; - cvt.u32.u16 %r12375, %rs8066; - cvt.s32.s8 %r12376, %r12375; - cvt.u32.u16 %r12377, %rs8067; - cvt.s32.s8 %r12378, %r12377; - mul.lo.s32 %r12379, %r34, %r12378; - mad.lo.s32 %r12380, %r124, %r12376, %r12379; - mad.lo.s32 %r12381, %r35, %r12374, %r12380; - mad.lo.s32 %r12382, %r36, %r12372, %r12381; - ld.const.v4.u8 {%rs8074, %rs8075, %rs8076, %rs8077}, [matrix+4036]; - cvt.u32.u16 %r12383, %rs8077; - cvt.s32.s8 %r12384, %r12383; - cvt.u32.u16 %r12385, %rs8076; - cvt.s32.s8 %r12386, %r12385; - cvt.u32.u16 %r12387, %rs8075; - cvt.s32.s8 %r12388, %r12387; - cvt.u32.u16 %r12389, %rs8074; - cvt.s32.s8 %r12390, %r12389; - mad.lo.s32 %r12391, %r37, %r12390, %r12382; - mad.lo.s32 %r12392, %r38, %r12388, %r12391; - mad.lo.s32 %r12393, %r39, %r12386, %r12392; - mad.lo.s32 %r12394, %r40, %r12384, %r12393; - ld.const.v4.u8 {%rs8082, %rs8083, %rs8084, %rs8085}, [matrix+4040]; - cvt.u32.u16 %r12395, %rs8085; - cvt.s32.s8 %r12396, %r12395; - cvt.u32.u16 %r12397, %rs8084; - cvt.s32.s8 %r12398, %r12397; - cvt.u32.u16 %r12399, %rs8083; - cvt.s32.s8 %r12400, %r12399; - cvt.u32.u16 %r12401, %rs8082; - cvt.s32.s8 %r12402, %r12401; - mad.lo.s32 %r12403, %r42, %r12402, %r12394; - mad.lo.s32 %r12404, %r43, %r12400, %r12403; - mad.lo.s32 %r12405, %r45, %r12398, %r12404; - mad.lo.s32 %r12406, %r46, %r12396, %r12405; - ld.const.v4.u8 {%rs8090, %rs8091, %rs8092, %rs8093}, [matrix+4044]; - cvt.u32.u16 %r12407, %rs8093; - cvt.s32.s8 %r12408, %r12407; - cvt.u32.u16 %r12409, %rs8092; - cvt.s32.s8 %r12410, %r12409; - cvt.u32.u16 %r12411, %rs8091; - cvt.s32.s8 %r12412, %r12411; - cvt.u32.u16 %r12413, %rs8090; - cvt.s32.s8 
%r12414, %r12413; - mad.lo.s32 %r12415, %r48, %r12414, %r12406; - mad.lo.s32 %r12416, %r49, %r12412, %r12415; - mad.lo.s32 %r12417, %r50, %r12410, %r12416; - mad.lo.s32 %r12418, %r51, %r12408, %r12417; - ld.const.v4.u8 {%rs8098, %rs8099, %rs8100, %rs8101}, [matrix+4048]; - cvt.u32.u16 %r12419, %rs8101; - cvt.s32.s8 %r12420, %r12419; - cvt.u32.u16 %r12421, %rs8100; - cvt.s32.s8 %r12422, %r12421; - cvt.u32.u16 %r12423, %rs8099; - cvt.s32.s8 %r12424, %r12423; - cvt.u32.u16 %r12425, %rs8098; - cvt.s32.s8 %r12426, %r12425; - mad.lo.s32 %r12427, %r173, %r12426, %r12418; - mad.lo.s32 %r12428, %r53, %r12424, %r12427; - mad.lo.s32 %r12429, %r54, %r12422, %r12428; - mad.lo.s32 %r12430, %r55, %r12420, %r12429; - ld.const.v4.u8 {%rs8106, %rs8107, %rs8108, %rs8109}, [matrix+4052]; - cvt.u32.u16 %r12431, %rs8109; - cvt.s32.s8 %r12432, %r12431; - cvt.u32.u16 %r12433, %rs8108; - cvt.s32.s8 %r12434, %r12433; - cvt.u32.u16 %r12435, %rs8107; - cvt.s32.s8 %r12436, %r12435; - cvt.u32.u16 %r12437, %rs8106; - cvt.s32.s8 %r12438, %r12437; - mad.lo.s32 %r12439, %r56, %r12438, %r12430; - mad.lo.s32 %r12440, %r57, %r12436, %r12439; - mad.lo.s32 %r12441, %r58, %r12434, %r12440; - mad.lo.s32 %r12442, %r59, %r12432, %r12441; - ld.const.v4.u8 {%rs8114, %rs8115, %rs8116, %rs8117}, [matrix+4056]; - cvt.u32.u16 %r12443, %rs8117; - cvt.s32.s8 %r12444, %r12443; - cvt.u32.u16 %r12445, %rs8116; - cvt.s32.s8 %r12446, %r12445; - cvt.u32.u16 %r12447, %rs8115; - cvt.s32.s8 %r12448, %r12447; - cvt.u32.u16 %r12449, %rs8114; - cvt.s32.s8 %r12450, %r12449; - mad.lo.s32 %r12451, %r61, %r12450, %r12442; - mad.lo.s32 %r12452, %r62, %r12448, %r12451; - mad.lo.s32 %r12453, %r64, %r12446, %r12452; - mad.lo.s32 %r12454, %r65, %r12444, %r12453; - ld.const.v4.u8 {%rs8122, %rs8123, %rs8124, %rs8125}, [matrix+4060]; - cvt.u32.u16 %r12455, %rs8125; - cvt.s32.s8 %r12456, %r12455; - cvt.u32.u16 %r12457, %rs8124; - cvt.s32.s8 %r12458, %r12457; - cvt.u32.u16 %r12459, %rs8123; - cvt.s32.s8 %r12460, %r12459; - cvt.u32.u16 %r12461, %rs8122; - cvt.s32.s8 %r12462, %r12461; - mad.lo.s32 %r12463, %r67, %r12462, %r12454; - mad.lo.s32 %r12464, %r68, %r12460, %r12463; - mad.lo.s32 %r12465, %r69, %r12458, %r12464; - mad.lo.s32 %r12466, %r70, %r12456, %r12465; - ld.const.v4.u8 {%rs8130, %rs8131, %rs8132, %rs8133}, [matrix+4064]; - cvt.u32.u16 %r12467, %rs8133; - cvt.s32.s8 %r12468, %r12467; - cvt.u32.u16 %r12469, %rs8132; - cvt.s32.s8 %r12470, %r12469; - cvt.u32.u16 %r12471, %rs8131; - cvt.s32.s8 %r12472, %r12471; - cvt.u32.u16 %r12473, %rs8130; - cvt.s32.s8 %r12474, %r12473; - mad.lo.s32 %r12475, %r222, %r12474, %r12466; - mad.lo.s32 %r12476, %r72, %r12472, %r12475; - mad.lo.s32 %r12477, %r73, %r12470, %r12476; - mad.lo.s32 %r12478, %r74, %r12468, %r12477; - ld.const.v4.u8 {%rs8138, %rs8139, %rs8140, %rs8141}, [matrix+4068]; - cvt.u32.u16 %r12479, %rs8141; - cvt.s32.s8 %r12480, %r12479; - cvt.u32.u16 %r12481, %rs8140; - cvt.s32.s8 %r12482, %r12481; - cvt.u32.u16 %r12483, %rs8139; - cvt.s32.s8 %r12484, %r12483; - cvt.u32.u16 %r12485, %rs8138; - cvt.s32.s8 %r12486, %r12485; - mad.lo.s32 %r12487, %r75, %r12486, %r12478; - mad.lo.s32 %r12488, %r76, %r12484, %r12487; - mad.lo.s32 %r12489, %r77, %r12482, %r12488; - mad.lo.s32 %r12490, %r78, %r12480, %r12489; - ld.const.v4.u8 {%rs8146, %rs8147, %rs8148, %rs8149}, [matrix+4072]; - cvt.u32.u16 %r12491, %rs8149; - cvt.s32.s8 %r12492, %r12491; - cvt.u32.u16 %r12493, %rs8148; - cvt.s32.s8 %r12494, %r12493; - cvt.u32.u16 %r12495, %rs8147; - cvt.s32.s8 %r12496, %r12495; - cvt.u32.u16 %r12497, %rs8146; - cvt.s32.s8 %r12498, 
%r12497; - mad.lo.s32 %r12499, %r80, %r12498, %r12490; - mad.lo.s32 %r12500, %r81, %r12496, %r12499; - mad.lo.s32 %r12501, %r83, %r12494, %r12500; - mad.lo.s32 %r12502, %r84, %r12492, %r12501; - ld.const.v4.u8 {%rs8154, %rs8155, %rs8156, %rs8157}, [matrix+4076]; - cvt.u32.u16 %r12503, %rs8157; - cvt.s32.s8 %r12504, %r12503; - cvt.u32.u16 %r12505, %rs8156; - cvt.s32.s8 %r12506, %r12505; - cvt.u32.u16 %r12507, %rs8155; - cvt.s32.s8 %r12508, %r12507; - cvt.u32.u16 %r12509, %rs8154; - cvt.s32.s8 %r12510, %r12509; - mad.lo.s32 %r12511, %r86, %r12510, %r12502; - mad.lo.s32 %r12512, %r87, %r12508, %r12511; - mad.lo.s32 %r12513, %r88, %r12506, %r12512; - mad.lo.s32 %r12514, %r89, %r12504, %r12513; - ld.const.v4.u8 {%rs8162, %rs8163, %rs8164, %rs8165}, [matrix+4080]; - cvt.u32.u16 %r12515, %rs8165; - cvt.s32.s8 %r12516, %r12515; - cvt.u32.u16 %r12517, %rs8164; - cvt.s32.s8 %r12518, %r12517; - cvt.u32.u16 %r12519, %rs8163; - cvt.s32.s8 %r12520, %r12519; - cvt.u32.u16 %r12521, %rs8162; - cvt.s32.s8 %r12522, %r12521; - mad.lo.s32 %r12523, %r271, %r12522, %r12514; - mad.lo.s32 %r12524, %r91, %r12520, %r12523; - mad.lo.s32 %r12525, %r93, %r12518, %r12524; - mad.lo.s32 %r12526, %r94, %r12516, %r12525; - ld.const.v4.u8 {%rs8170, %rs8171, %rs8172, %rs8173}, [matrix+4084]; - cvt.u32.u16 %r12527, %rs8173; - cvt.s32.s8 %r12528, %r12527; - cvt.u32.u16 %r12529, %rs8172; - cvt.s32.s8 %r12530, %r12529; - cvt.u32.u16 %r12531, %rs8171; - cvt.s32.s8 %r12532, %r12531; - cvt.u32.u16 %r12533, %rs8170; - cvt.s32.s8 %r12534, %r12533; - mad.lo.s32 %r12535, %r96, %r12534, %r12526; - mad.lo.s32 %r12536, %r97, %r12532, %r12535; - mad.lo.s32 %r12537, %r99, %r12530, %r12536; - mad.lo.s32 %r12538, %r100, %r12528, %r12537; - ld.const.v4.u8 {%rs8178, %rs8179, %rs8180, %rs8181}, [matrix+4088]; - cvt.u32.u16 %r12539, %rs8181; - cvt.s32.s8 %r12540, %r12539; - cvt.u32.u16 %r12541, %rs8180; - cvt.s32.s8 %r12542, %r12541; - cvt.u32.u16 %r12543, %rs8179; - cvt.s32.s8 %r12544, %r12543; - cvt.u32.u16 %r12545, %rs8178; - cvt.s32.s8 %r12546, %r12545; - mad.lo.s32 %r12547, %r103, %r12546, %r12538; - mad.lo.s32 %r12548, %r104, %r12544, %r12547; - mad.lo.s32 %r12549, %r107, %r12542, %r12548; - mad.lo.s32 %r12550, %r108, %r12540, %r12549; - ld.const.v4.u8 {%rs8186, %rs8187, %rs8188, %rs8189}, [matrix+4092]; - cvt.u32.u16 %r12551, %rs8189; - cvt.s32.s8 %r12552, %r12551; - cvt.u32.u16 %r12553, %rs8188; - cvt.s32.s8 %r12554, %r12553; - cvt.u32.u16 %r12555, %rs8187; - cvt.s32.s8 %r12556, %r12555; - cvt.u32.u16 %r12557, %rs8186; - cvt.s32.s8 %r12558, %r12557; - mad.lo.s32 %r12559, %r111, %r12558, %r12550; - mad.lo.s32 %r12560, %r112, %r12556, %r12559; - mad.lo.s32 %r12561, %r114, %r12554, %r12560; - mad.lo.s32 %r12562, %r115, %r12552, %r12561; - shr.u32 %r12563, %r12370, 6; - and.b32 %r12564, %r12563, 240; - shr.u32 %r12565, %r12562, 10; - or.b32 %r12566, %r12565, %r12564; - xor.b32 %r12567, %r113, %r12566; - and.b64 %rd406, %rd386, 255; - and.b64 %rd407, %rd385, 255; - and.b64 %rd408, %rd384, 255; - and.b64 %rd409, %rd383, 255; - shl.b64 %rd410, %rd409, 24; - and.b64 %rd411, %rd382, 255; - shl.b64 %rd412, %rd411, 16; - shl.b32 %r12568, %r897, 8; - cvt.u64.u32 %rd413, %r12568; - cvt.u64.u32 %rd414, %r3231; - and.b64 %rd415, %rd392, 255; - and.b64 %rd416, %rd391, 255; - and.b64 %rd417, %rd390, 255; - and.b64 %rd418, %rd389, 255; - shl.b64 %rd419, %rd418, 24; - and.b64 %rd420, %rd388, 255; - shl.b64 %rd421, %rd420, 16; - shl.b32 %r12569, %r4009, 8; - cvt.u64.u32 %rd422, %r12569; - cvt.u64.u32 %rd423, %r6343; - and.b64 %rd424, %rd398, 255; - and.b64 
%rd425, %rd397, 255; - and.b64 %rd426, %rd396, 255; - and.b64 %rd427, %rd395, 255; - shl.b64 %rd428, %rd427, 24; - and.b64 %rd429, %rd394, 255; - shl.b64 %rd430, %rd429, 16; - shl.b32 %r12570, %r7121, 8; - cvt.u64.u32 %rd431, %r12570; - cvt.u64.u32 %rd432, %r9455; - cvt.u64.u32 %rd433, %r9844; - cvt.u64.u32 %rd434, %r12567; - shl.b64 %rd435, %rd414, 56; - shl.b64 %rd436, %rd406, 48; - or.b64 %rd437, %rd435, %rd436; - shl.b64 %rd438, %rd407, 40; - or.b64 %rd439, %rd437, %rd438; - shl.b64 %rd440, %rd408, 32; - or.b64 %rd441, %rd439, %rd440; - or.b64 %rd442, %rd441, %rd410; - or.b64 %rd443, %rd442, %rd412; - and.b64 %rd444, %rd381, 255; - and.b64 %rd445, %rd413, 65280; - or.b64 %rd446, %rd443, %rd445; - or.b64 %rd447, %rd446, %rd444; - xor.b64 %rd123, %rd447, 4239941492252378377; - shl.b64 %rd448, %rd423, 56; - shl.b64 %rd449, %rd415, 48; - or.b64 %rd450, %rd448, %rd449; - shl.b64 %rd451, %rd416, 40; - or.b64 %rd452, %rd450, %rd451; - shl.b64 %rd453, %rd417, 32; - or.b64 %rd454, %rd452, %rd453; - or.b64 %rd455, %rd454, %rd419; - or.b64 %rd456, %rd455, %rd421; - and.b64 %rd457, %rd387, 255; - and.b64 %rd458, %rd422, 65280; - or.b64 %rd459, %rd456, %rd458; - or.b64 %rd460, %rd459, %rd457; - xor.b64 %rd681, %rd460, 8746723911537738262; - shl.b64 %rd461, %rd432, 56; - shl.b64 %rd462, %rd424, 48; - or.b64 %rd463, %rd461, %rd462; - shl.b64 %rd464, %rd425, 40; - or.b64 %rd465, %rd463, %rd464; - shl.b64 %rd466, %rd426, 32; - or.b64 %rd467, %rd465, %rd466; - or.b64 %rd468, %rd467, %rd428; - or.b64 %rd469, %rd468, %rd430; - and.b64 %rd470, %rd393, 255; - and.b64 %rd471, %rd431, 65280; - or.b64 %rd472, %rd469, %rd471; - or.b64 %rd473, %rd472, %rd470; - xor.b64 %rd676, %rd473, 8796936657246353646; - shl.b64 %rd474, %rd434, 56; - and.b64 %rd475, %rd405, 255; - shl.b64 %rd476, %rd475, 48; - or.b64 %rd477, %rd474, %rd476; - and.b64 %rd478, %rd404, 255; - shl.b64 %rd479, %rd478, 40; - or.b64 %rd480, %rd477, %rd479; - shl.b64 %rd481, %rd403, 32; - or.b64 %rd482, %rd480, %rd481; - and.b64 %rd483, %rd401, 255; - shl.b64 %rd484, %rd483, 24; - or.b64 %rd485, %rd482, %rd484; - and.b64 %rd486, %rd400, 255; - shl.b64 %rd487, %rd486, 16; - and.b64 %rd488, %rd399, 255; - shl.b64 %rd489, %rd488, 8; - or.b64 %rd490, %rd485, %rd487; - and.b64 %rd491, %rd433, 255; - or.b64 %rd492, %rd490, %rd489; - or.b64 %rd493, %rd492, %rd491; - xor.b64 %rd671, %rd493, 1272090201925444760; - mov.u64 %rd685, 8270816933120786537; - mov.u64 %rd684, -850687345431043546; - mov.u64 %rd683, 8596393687355028144; - mov.u64 %rd682, -4073852189716399785; - mov.u64 %rd680, -4539347866060507718; - mov.u64 %rd679, -3233781605604422593; - mov.u64 %rd678, 570094237299545110; - mov.u64 %rd677, 5171152063242093102; - mov.u64 %rd675, 6782861118970774626; - mov.u64 %rd674, 7812475424661425213; - mov.u64 %rd673, 9119540418498120711; - mov.u64 %rd672, -7873636174015165430; - mov.u64 %rd670, -9207053471590684088; - mov.u64 %rd669, 3370482334374859748; - mov.u64 %rd668, -1544774801229058759; - mov.u64 %rd667, 6096431547456407061; - mov.u64 %rd666, -1792185402154627366; - mov.u64 %rd665, -6864424130110145268; - mov.u64 %rd664, 5690099369266491460; - mov.u64 %rd663, -5074726839974049192; - mov.u64 %rd662, 1592359455985097269; - mov.u64 %rd661, RC; - mov.u32 %r12572, -24; - -BB0_9: - xor.b64 %rd494, %rd685, %rd123; - xor.b64 %rd495, %rd494, %rd684; - xor.b64 %rd496, %rd495, %rd683; - xor.b64 %rd497, %rd496, %rd682; - xor.b64 %rd498, %rd680, %rd681; - xor.b64 %rd499, %rd498, %rd679; - xor.b64 %rd500, %rd499, %rd678; - xor.b64 %rd501, %rd500, %rd677; - xor.b64 
%rd502, %rd675, %rd676; - xor.b64 %rd503, %rd502, %rd674; - xor.b64 %rd504, %rd503, %rd673; - xor.b64 %rd505, %rd504, %rd672; - xor.b64 %rd506, %rd670, %rd671; - xor.b64 %rd507, %rd506, %rd669; - xor.b64 %rd508, %rd507, %rd668; - xor.b64 %rd509, %rd508, %rd667; - xor.b64 %rd510, %rd665, %rd666; - xor.b64 %rd511, %rd510, %rd664; - xor.b64 %rd512, %rd511, %rd663; - xor.b64 %rd513, %rd512, %rd662; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd501, 1; - shr.b64 %rhs, %rd501, 63; - add.u64 %rd514, %lhs, %rhs; - } - xor.b64 %rd515, %rd513, %rd514; - xor.b64 %rd516, %rd123, %rd515; - xor.b64 %rd517, %rd685, %rd515; - xor.b64 %rd518, %rd684, %rd515; - xor.b64 %rd519, %rd683, %rd515; - xor.b64 %rd520, %rd682, %rd515; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd505, 1; - shr.b64 %rhs, %rd505, 63; - add.u64 %rd521, %lhs, %rhs; - } - xor.b64 %rd522, %rd521, %rd497; - xor.b64 %rd523, %rd681, %rd522; - xor.b64 %rd524, %rd680, %rd522; - xor.b64 %rd525, %rd679, %rd522; - xor.b64 %rd526, %rd678, %rd522; - xor.b64 %rd527, %rd677, %rd522; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd509, 1; - shr.b64 %rhs, %rd509, 63; - add.u64 %rd528, %lhs, %rhs; - } - xor.b64 %rd529, %rd528, %rd501; - xor.b64 %rd530, %rd676, %rd529; - xor.b64 %rd531, %rd675, %rd529; - xor.b64 %rd532, %rd674, %rd529; - xor.b64 %rd533, %rd673, %rd529; - xor.b64 %rd534, %rd672, %rd529; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd513, 1; - shr.b64 %rhs, %rd513, 63; - add.u64 %rd535, %lhs, %rhs; - } - xor.b64 %rd536, %rd535, %rd505; - xor.b64 %rd537, %rd671, %rd536; - xor.b64 %rd538, %rd670, %rd536; - xor.b64 %rd539, %rd669, %rd536; - xor.b64 %rd540, %rd668, %rd536; - xor.b64 %rd541, %rd667, %rd536; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd497, 1; - shr.b64 %rhs, %rd497, 63; - add.u64 %rd542, %lhs, %rhs; - } - xor.b64 %rd543, %rd542, %rd509; - xor.b64 %rd544, %rd666, %rd543; - xor.b64 %rd545, %rd665, %rd543; - xor.b64 %rd546, %rd664, %rd543; - xor.b64 %rd547, %rd663, %rd543; - xor.b64 %rd548, %rd662, %rd543; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd523, 1; - shr.b64 %rhs, %rd523, 63; - add.u64 %rd549, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd518, 3; - shr.b64 %rhs, %rd518, 61; - add.u64 %rd550, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd531, 6; - shr.b64 %rhs, %rd531, 58; - add.u64 %rd551, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd525, 10; - shr.b64 %rhs, %rd525, 54; - add.u64 %rd552, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd533, 15; - shr.b64 %rhs, %rd533, 49; - add.u64 %rd553, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd540, 21; - shr.b64 %rhs, %rd540, 43; - add.u64 %rd554, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd537, 28; - shr.b64 %rhs, %rd537, 36; - add.u64 %rd555, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd517, 36; - shr.b64 %rhs, %rd517, 28; - add.u64 %rd556, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd526, 45; - shr.b64 %rhs, %rd526, 19; - add.u64 %rd557, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd538, 55; - shr.b64 %rhs, %rd538, 9; - add.u64 %rd558, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd527, 2; - shr.b64 %rhs, %rd527, 62; - add.u64 %rd559, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd548, 
14; - shr.b64 %rhs, %rd548, 50; - add.u64 %rd560, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd544, 27; - shr.b64 %rhs, %rd544, 37; - add.u64 %rd561, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd519, 41; - shr.b64 %rhs, %rd519, 23; - add.u64 %rd562, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd541, 56; - shr.b64 %rhs, %rd541, 8; - add.u64 %rd563, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd547, 8; - shr.b64 %rhs, %rd547, 56; - add.u64 %rd564, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd539, 25; - shr.b64 %rhs, %rd539, 39; - add.u64 %rd565, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd532, 43; - shr.b64 %rhs, %rd532, 21; - add.u64 %rd566, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd530, 62; - shr.b64 %rhs, %rd530, 2; - add.u64 %rd567, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd520, 18; - shr.b64 %rhs, %rd520, 46; - add.u64 %rd568, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd546, 39; - shr.b64 %rhs, %rd546, 25; - add.u64 %rd569, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd534, 61; - shr.b64 %rhs, %rd534, 3; - add.u64 %rd570, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd545, 20; - shr.b64 %rhs, %rd545, 44; - add.u64 %rd571, %lhs, %rhs; - } - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd524, 44; - shr.b64 %rhs, %rd524, 20; - add.u64 %rd572, %lhs, %rhs; - } - not.b64 %rd573, %rd572; - and.b64 %rd574, %rd566, %rd573; - xor.b64 %rd575, %rd574, %rd516; - not.b64 %rd576, %rd566; - and.b64 %rd577, %rd554, %rd576; - xor.b64 %rd681, %rd577, %rd572; - not.b64 %rd578, %rd554; - and.b64 %rd579, %rd560, %rd578; - xor.b64 %rd676, %rd579, %rd566; - not.b64 %rd580, %rd560; - and.b64 %rd581, %rd516, %rd580; - xor.b64 %rd671, %rd581, %rd554; - not.b64 %rd582, %rd516; - and.b64 %rd583, %rd572, %rd582; - xor.b64 %rd666, %rd560, %rd583; - not.b64 %rd584, %rd571; - and.b64 %rd585, %rd550, %rd584; - xor.b64 %rd685, %rd585, %rd555; - not.b64 %rd586, %rd550; - and.b64 %rd587, %rd557, %rd586; - xor.b64 %rd680, %rd587, %rd571; - not.b64 %rd588, %rd557; - and.b64 %rd589, %rd570, %rd588; - xor.b64 %rd675, %rd589, %rd550; - not.b64 %rd590, %rd570; - and.b64 %rd591, %rd555, %rd590; - xor.b64 %rd670, %rd591, %rd557; - not.b64 %rd592, %rd555; - and.b64 %rd593, %rd571, %rd592; - xor.b64 %rd665, %rd570, %rd593; - not.b64 %rd594, %rd551; - and.b64 %rd595, %rd565, %rd594; - xor.b64 %rd684, %rd595, %rd549; - not.b64 %rd596, %rd565; - and.b64 %rd597, %rd564, %rd596; - xor.b64 %rd679, %rd597, %rd551; - not.b64 %rd598, %rd564; - and.b64 %rd599, %rd568, %rd598; - xor.b64 %rd674, %rd599, %rd565; - not.b64 %rd600, %rd568; - and.b64 %rd601, %rd549, %rd600; - xor.b64 %rd669, %rd601, %rd564; - not.b64 %rd602, %rd549; - and.b64 %rd603, %rd551, %rd602; - xor.b64 %rd664, %rd568, %rd603; - not.b64 %rd604, %rd556; - and.b64 %rd605, %rd552, %rd604; - xor.b64 %rd683, %rd605, %rd561; - not.b64 %rd606, %rd552; - and.b64 %rd607, %rd553, %rd606; - xor.b64 %rd678, %rd607, %rd556; - not.b64 %rd608, %rd553; - and.b64 %rd609, %rd563, %rd608; - xor.b64 %rd673, %rd609, %rd552; - not.b64 %rd610, %rd563; - and.b64 %rd611, %rd561, %rd610; - xor.b64 %rd668, %rd611, %rd553; - not.b64 %rd612, %rd561; - and.b64 %rd613, %rd556, %rd612; - xor.b64 %rd663, %rd563, %rd613; - not.b64 %rd614, %rd558; - and.b64 %rd615, %rd569, 
%rd614;
-	xor.b64 %rd682, %rd615, %rd567;
-	not.b64 %rd616, %rd569;
-	and.b64 %rd617, %rd562, %rd616;
-	xor.b64 %rd677, %rd617, %rd558;
-	not.b64 %rd618, %rd562;
-	and.b64 %rd619, %rd559, %rd618;
-	xor.b64 %rd672, %rd619, %rd569;
-	not.b64 %rd620, %rd559;
-	and.b64 %rd621, %rd567, %rd620;
-	xor.b64 %rd667, %rd621, %rd562;
-	not.b64 %rd622, %rd567;
-	and.b64 %rd623, %rd558, %rd622;
-	xor.b64 %rd662, %rd559, %rd623;
-	ld.global.u64 %rd624, [%rd661];
-	xor.b64 %rd123, %rd575, %rd624;
-	add.s64 %rd661, %rd661, 8;
-	add.s32 %r12572, %r12572, 1;
-	setp.ne.s32 %p10, %r12572, 0;
-	@%p10 bra BB0_9;
-
-	ld.const.u64 %rd125, [target+24];
-	setp.eq.s64 %p11, %rd671, %rd125;
-	@%p11 bra BB0_12;
-	bra.uni BB0_11;
-
-BB0_12:
-	ld.const.u64 %rd126, [target+16];
-	setp.eq.s64 %p12, %rd676, %rd126;
-	@%p12 bra BB0_14;
-	bra.uni BB0_13;
-
-BB0_14:
-	ld.const.u64 %rd127, [target+8];
-	setp.eq.s64 %p13, %rd681, %rd127;
-	@%p13 bra BB0_16;
-	bra.uni BB0_15;
-
-BB0_16:
-	ld.const.u64 %rd625, [target];
-	setp.lt.u64 %p4, %rd123, %rd625;
-	@!%p4 bra BB0_18;
-	bra.uni BB0_17;
-
-BB0_11:
-	setp.lt.u64 %p1, %rd671, %rd125;
-	@!%p1 bra BB0_18;
-	bra.uni BB0_17;
-
-BB0_13:
-	setp.lt.u64 %p2, %rd676, %rd126;
-	@!%p2 bra BB0_18;
-	bra.uni BB0_17;
-
-BB0_15:
-	setp.lt.u64 %p3, %rd681, %rd127;
-	@!%p3 bra BB0_18;
-	bra.uni BB0_17;
-
-BB0_17:
-	ld.param.u64 %rd633, [heavy_hash_param_0];
-	ld.param.u64 %rd632, [heavy_hash_param_1];
-	and.b64 %rd631, %rd634, %rd633;
-	or.b64 %rd630, %rd631, %rd632;
-	ld.param.u64 %rd629, [heavy_hash_param_5];
-	cvta.to.global.u64 %rd628, %rd629;
-	mov.u64 %rd626, 0;
-	atom.global.cas.b64 %rd627, [%rd628], %rd626, %rd630;
-
-BB0_18:
-	ret;
-}
-
-
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24330188
+// Cuda compilation tools, release 9.2, V9.2.148
+// Based on LLVM 3.4svn
+//
+
+.version 6.2
+.target sm_30
+.address_size 64
+
+	// .globl	heavy_hash
+.global .align 16 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44};
+.global .align 16 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1};
+.global .align 16 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128};
+.global .align 16 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57};
+.global .align 16 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57};
+.const .align 1 .b8 matrix[4096];
+.const .align 8 .b8 hash_header[72];
+.const .align 8 .b8 target[32];
+.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2};
+.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22};
+
+.visible .entry heavy_hash(
+	.param .u64 heavy_hash_param_0,
+	.param .u64 heavy_hash_param_1,
+	.param .u64 heavy_hash_param_2,
+	.param .u8 heavy_hash_param_3,
+	.param .u64 heavy_hash_param_4,
+	.param .u64 heavy_hash_param_5
+)
+{
+	.reg .pred %p<18>;
+	.reg .b16 %rs<8194>;
+	.reg .b32 %r<12573>;
+	.reg .b64 %rd<687>;
+
+
+	ld.param.u64 %rd128, [heavy_hash_param_0];
+	ld.param.u64 %rd129, [heavy_hash_param_1];
+	ld.param.u64 %rd131, [heavy_hash_param_2];
+	ld.param.u64 %rd130, [heavy_hash_param_4];
+	ld.param.u64 %rd132, [heavy_hash_param_5];
+	ld.param.u8 %rs1, [heavy_hash_param_3];
+	cvta.to.global.u64 %rd1, %rd132;
+	mov.u32 %r5, %ntid.x;
+	mov.u32 %r6, %ctaid.x;
+	mov.u32 %r7, %tid.x;
+	mad.lo.s32 %r8, %r5, %r6, %r7;
+	cvt.s64.s32 %rd2, %r8;
+	setp.ge.u64 %p6, %rd2, %rd131;
+	@%p6 bra BB0_18;
+
+	cvt.u32.u64 %r9, %rd2;
+	setp.ne.s32 %p7, %r9, 0;
+	@%p7 bra BB0_3;
+
+	mov.u64 %rd133, 0;
+	st.global.u64 [%rd1], %rd133;
+
+BB0_3:
+	setp.eq.s16 %p8, %rs1, 0;
+	@%p8 bra BB0_5;
+
+	cvta.to.global.u64 %rd134, %rd130;
+	shl.b64 %rd135, %rd2, 5;
+	add.s64 %rd136, %rd134, %rd135;
+	ld.global.v2.u64 {%rd137, %rd138}, [%rd136];
+	mul.lo.s64 %rd141, %rd138, 5;
+	mul.lo.s64 %rd142, %rd138, 640;
+	shr.u64 %rd143, %rd141, 57;
+	or.b64 %rd144, %rd143, %rd142;
+	mul.lo.s64 %rd634, %rd144, 9;
+	shl.b64 %rd145, %rd138, 17;
+	ld.global.v2.u64 {%rd146, %rd147}, [%rd136+16];
+	xor.b64 %rd149, %rd146, %rd137;
+	xor.b64 %rd151, %rd147, %rd138;
+	xor.b64 %rd152, %rd138, %rd149;
+	xor.b64 %rd153, %rd137, %rd151;
+	st.global.v2.u64 [%rd136], {%rd153, %rd152};
+	xor.b64 %rd154, %rd149, %rd145;
+	{
+	.reg .b64 %lhs;
+	.reg .b64 %rhs;
+	shl.b64 %lhs, %rd151, 45;
+	shr.b64 %rhs, %rd151, 19;
+	add.u64 %rd155, %lhs, %rhs;
+	}
+	st.global.v2.u64 [%rd136+16], {%rd154, %rd155};
+	bra.uni BB0_6;
+
+BB0_5:
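+	// NOTE (hand-added annotation, not nvcc output): BB0_5 is the non-random
+	// nonce path - when heavy_hash_param_3 is zero the nonce seed is simply the
+	// first state word XORed with the global thread id. The BB0_3 block above is
+	// a full xoshiro256** step per thread: the output is rotl(s1*5, 7)*9 (the
+	// mul-by-5/640 and shr-by-57 sequence), followed by the standard state
+	// update with the rotl(s3, 45) rotation.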
+ cvta.to.global.u64 %rd156, %rd130; + ld.global.u64 %rd157, [%rd156]; + xor.b64 %rd634, %rd157, %rd2; + +BB0_6: + and.b64 %rd174, %rd634, %rd128; + or.b64 %rd6, %rd174, %rd129; + ld.const.u64 %rd175, [hash_header]; + xor.b64 %rd660, %rd175, 1242148031264380989; + ld.const.u64 %rd176, [hash_header+8]; + xor.b64 %rd655, %rd176, 3008272977830772284; + ld.const.u64 %rd177, [hash_header+16]; + xor.b64 %rd650, %rd177, 2188519011337848018; + ld.const.u64 %rd178, [hash_header+24]; + xor.b64 %rd645, %rd178, 1992179434288343456; + ld.const.u64 %rd179, [hash_header+32]; + xor.b64 %rd640, %rd179, 8876506674959887717; + ld.const.u64 %rd180, [hash_header+40]; + xor.b64 %rd659, %rd180, 5399642050693751366; + ld.const.u64 %rd181, [hash_header+48]; + xor.b64 %rd654, %rd181, 1745875063082670864; + ld.const.u64 %rd182, [hash_header+56]; + xor.b64 %rd649, %rd182, 8605242046444978844; + ld.const.u64 %rd183, [hash_header+64]; + xor.b64 %rd644, %rd183, -510048929142394560; + xor.b64 %rd639, %rd6, 3343109343542796272; + mov.u64 %rd658, 1123092876221303306; + mov.u64 %rd657, 3784524041015224902; + mov.u64 %rd656, -8517909413761200310; + mov.u64 %rd653, 4963925045340115282; + mov.u64 %rd652, 1082795874807940378; + mov.u64 %rd651, 5237849264682708699; + mov.u64 %rd648, -1409360996057663723; + mov.u64 %rd647, -4494027153138273982; + mov.u64 %rd646, -5621391061570334094; + mov.u64 %rd643, -1817099578685924727; + mov.u64 %rd642, -5035616039755945756; + mov.u64 %rd641, 6706187291358897596; + mov.u64 %rd638, -5613068297060437469; + mov.u64 %rd637, -3386048033060200563; + mov.u64 %rd636, 196324915476054915; + mov.u64 %rd635, RC; + mov.u32 %r12571, -24; + +BB0_7: + xor.b64 %rd184, %rd659, %rd660; + xor.b64 %rd185, %rd184, %rd658; + xor.b64 %rd186, %rd185, %rd657; + xor.b64 %rd187, %rd186, %rd656; + xor.b64 %rd188, %rd654, %rd655; + xor.b64 %rd189, %rd188, %rd653; + xor.b64 %rd190, %rd189, %rd652; + xor.b64 %rd191, %rd190, %rd651; + xor.b64 %rd192, %rd649, %rd650; + xor.b64 %rd193, %rd192, %rd648; + xor.b64 %rd194, %rd193, %rd647; + xor.b64 %rd195, %rd194, %rd646; + xor.b64 %rd196, %rd644, %rd645; + xor.b64 %rd197, %rd196, %rd643; + xor.b64 %rd198, %rd197, %rd642; + xor.b64 %rd199, %rd198, %rd641; + xor.b64 %rd200, %rd639, %rd640; + xor.b64 %rd201, %rd200, %rd638; + xor.b64 %rd202, %rd201, %rd637; + xor.b64 %rd203, %rd202, %rd636; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd191, 1; + shr.b64 %rhs, %rd191, 63; + add.u64 %rd204, %lhs, %rhs; + } + xor.b64 %rd205, %rd203, %rd204; + xor.b64 %rd206, %rd660, %rd205; + xor.b64 %rd207, %rd659, %rd205; + xor.b64 %rd208, %rd658, %rd205; + xor.b64 %rd209, %rd657, %rd205; + xor.b64 %rd210, %rd656, %rd205; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd195, 1; + shr.b64 %rhs, %rd195, 63; + add.u64 %rd211, %lhs, %rhs; + } + xor.b64 %rd212, %rd211, %rd187; + xor.b64 %rd213, %rd655, %rd212; + xor.b64 %rd214, %rd654, %rd212; + xor.b64 %rd215, %rd653, %rd212; + xor.b64 %rd216, %rd652, %rd212; + xor.b64 %rd217, %rd651, %rd212; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd199, 1; + shr.b64 %rhs, %rd199, 63; + add.u64 %rd218, %lhs, %rhs; + } + xor.b64 %rd219, %rd218, %rd191; + xor.b64 %rd220, %rd650, %rd219; + xor.b64 %rd221, %rd649, %rd219; + xor.b64 %rd222, %rd648, %rd219; + xor.b64 %rd223, %rd647, %rd219; + xor.b64 %rd224, %rd646, %rd219; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd203, 1; + shr.b64 %rhs, %rd203, 63; + add.u64 %rd225, %lhs, %rhs; + } + xor.b64 %rd226, %rd225, %rd195; + xor.b64 %rd227, %rd645, %rd226; + xor.b64 %rd228, 
%rd644, %rd226; + xor.b64 %rd229, %rd643, %rd226; + xor.b64 %rd230, %rd642, %rd226; + xor.b64 %rd231, %rd641, %rd226; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd187, 1; + shr.b64 %rhs, %rd187, 63; + add.u64 %rd232, %lhs, %rhs; + } + xor.b64 %rd233, %rd232, %rd199; + xor.b64 %rd234, %rd640, %rd233; + xor.b64 %rd235, %rd639, %rd233; + xor.b64 %rd236, %rd638, %rd233; + xor.b64 %rd237, %rd637, %rd233; + xor.b64 %rd238, %rd636, %rd233; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd213, 1; + shr.b64 %rhs, %rd213, 63; + add.u64 %rd239, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd208, 3; + shr.b64 %rhs, %rd208, 61; + add.u64 %rd240, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd221, 6; + shr.b64 %rhs, %rd221, 58; + add.u64 %rd241, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd215, 10; + shr.b64 %rhs, %rd215, 54; + add.u64 %rd242, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd223, 15; + shr.b64 %rhs, %rd223, 49; + add.u64 %rd243, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd230, 21; + shr.b64 %rhs, %rd230, 43; + add.u64 %rd244, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd227, 28; + shr.b64 %rhs, %rd227, 36; + add.u64 %rd245, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd207, 36; + shr.b64 %rhs, %rd207, 28; + add.u64 %rd246, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd216, 45; + shr.b64 %rhs, %rd216, 19; + add.u64 %rd247, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd228, 55; + shr.b64 %rhs, %rd228, 9; + add.u64 %rd248, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd217, 2; + shr.b64 %rhs, %rd217, 62; + add.u64 %rd249, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd238, 14; + shr.b64 %rhs, %rd238, 50; + add.u64 %rd250, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd234, 27; + shr.b64 %rhs, %rd234, 37; + add.u64 %rd251, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd209, 41; + shr.b64 %rhs, %rd209, 23; + add.u64 %rd252, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd231, 56; + shr.b64 %rhs, %rd231, 8; + add.u64 %rd253, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd237, 8; + shr.b64 %rhs, %rd237, 56; + add.u64 %rd254, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd229, 25; + shr.b64 %rhs, %rd229, 39; + add.u64 %rd255, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd222, 43; + shr.b64 %rhs, %rd222, 21; + add.u64 %rd256, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd220, 62; + shr.b64 %rhs, %rd220, 2; + add.u64 %rd257, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd210, 18; + shr.b64 %rhs, %rd210, 46; + add.u64 %rd258, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd236, 39; + shr.b64 %rhs, %rd236, 25; + add.u64 %rd259, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd224, 61; + shr.b64 %rhs, %rd224, 3; + add.u64 %rd260, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd235, 20; + shr.b64 %rhs, %rd235, 44; + add.u64 %rd261, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd214, 44; + shr.b64 %rhs, %rd214, 20; + add.u64 %rd262, %lhs, 
%rhs; + } + not.b64 %rd263, %rd262; + and.b64 %rd264, %rd256, %rd263; + xor.b64 %rd265, %rd264, %rd206; + not.b64 %rd266, %rd256; + and.b64 %rd267, %rd244, %rd266; + xor.b64 %rd655, %rd267, %rd262; + not.b64 %rd268, %rd244; + and.b64 %rd269, %rd250, %rd268; + xor.b64 %rd650, %rd269, %rd256; + not.b64 %rd270, %rd250; + and.b64 %rd271, %rd206, %rd270; + xor.b64 %rd645, %rd271, %rd244; + not.b64 %rd272, %rd206; + and.b64 %rd273, %rd262, %rd272; + xor.b64 %rd640, %rd250, %rd273; + not.b64 %rd274, %rd261; + and.b64 %rd275, %rd240, %rd274; + xor.b64 %rd659, %rd275, %rd245; + not.b64 %rd276, %rd240; + and.b64 %rd277, %rd247, %rd276; + xor.b64 %rd654, %rd277, %rd261; + not.b64 %rd278, %rd247; + and.b64 %rd279, %rd260, %rd278; + xor.b64 %rd649, %rd279, %rd240; + not.b64 %rd280, %rd260; + and.b64 %rd281, %rd245, %rd280; + xor.b64 %rd644, %rd281, %rd247; + not.b64 %rd282, %rd245; + and.b64 %rd283, %rd261, %rd282; + xor.b64 %rd639, %rd260, %rd283; + not.b64 %rd284, %rd241; + and.b64 %rd285, %rd255, %rd284; + xor.b64 %rd658, %rd285, %rd239; + not.b64 %rd286, %rd255; + and.b64 %rd287, %rd254, %rd286; + xor.b64 %rd653, %rd287, %rd241; + not.b64 %rd288, %rd254; + and.b64 %rd289, %rd258, %rd288; + xor.b64 %rd648, %rd289, %rd255; + not.b64 %rd290, %rd258; + and.b64 %rd291, %rd239, %rd290; + xor.b64 %rd643, %rd291, %rd254; + not.b64 %rd292, %rd239; + and.b64 %rd293, %rd241, %rd292; + xor.b64 %rd638, %rd258, %rd293; + not.b64 %rd294, %rd246; + and.b64 %rd295, %rd242, %rd294; + xor.b64 %rd657, %rd295, %rd251; + not.b64 %rd296, %rd242; + and.b64 %rd297, %rd243, %rd296; + xor.b64 %rd652, %rd297, %rd246; + not.b64 %rd298, %rd243; + and.b64 %rd299, %rd253, %rd298; + xor.b64 %rd647, %rd299, %rd242; + not.b64 %rd300, %rd253; + and.b64 %rd301, %rd251, %rd300; + xor.b64 %rd642, %rd301, %rd243; + not.b64 %rd302, %rd251; + and.b64 %rd303, %rd246, %rd302; + xor.b64 %rd637, %rd253, %rd303; + not.b64 %rd304, %rd248; + and.b64 %rd305, %rd259, %rd304; + xor.b64 %rd656, %rd305, %rd257; + not.b64 %rd306, %rd259; + and.b64 %rd307, %rd252, %rd306; + xor.b64 %rd651, %rd307, %rd248; + not.b64 %rd308, %rd252; + and.b64 %rd309, %rd249, %rd308; + xor.b64 %rd646, %rd309, %rd259; + not.b64 %rd310, %rd249; + and.b64 %rd311, %rd257, %rd310; + xor.b64 %rd641, %rd311, %rd252; + not.b64 %rd312, %rd257; + and.b64 %rd313, %rd248, %rd312; + xor.b64 %rd636, %rd249, %rd313; + ld.global.u64 %rd314, [%rd635]; + xor.b64 %rd660, %rd265, %rd314; + add.s64 %rd635, %rd635, 8; + add.s32 %r12571, %r12571, 1; + setp.ne.s32 %p9, %r12571, 0; + @%p9 bra BB0_7; + + cvt.u32.u64 %r12, %rd660; + shr.u64 %rd337, %rd660, 8; + cvt.u32.u64 %r13, %rd337; + shr.u64 %rd338, %rd660, 16; + cvt.u32.u64 %r14, %rd338; + shr.u64 %rd339, %rd660, 24; + cvt.u32.u64 %r15, %rd339; + shr.u64 %rd340, %rd660, 32; + cvt.u32.u64 %r16, %rd340; + shr.u64 %rd341, %rd660, 40; + cvt.u32.u64 %r17, %rd341; + shr.u64 %rd342, %rd660, 48; + cvt.u32.u64 %r18, %rd342; + shr.u64 %rd343, %rd660, 56; + cvt.u32.u64 %r19, %rd343; + shr.u64 %rd344, %rd655, 8; + cvt.u32.u64 %r20, %rd344; + shr.u64 %rd345, %rd655, 16; + cvt.u32.u64 %r21, %rd345; + shr.u64 %rd346, %rd655, 24; + cvt.u32.u64 %r22, %rd346; + shr.u64 %rd347, %rd655, 32; + cvt.u32.u64 %r23, %rd347; + shr.u64 %rd348, %rd655, 40; + cvt.u32.u64 %r24, %rd348; + shr.u64 %rd349, %rd655, 48; + cvt.u32.u64 %r25, %rd349; + shr.u64 %rd350, %rd655, 56; + cvt.u32.u64 %r26, %rd350; + shr.u64 %rd351, %rd650, 8; + cvt.u32.u64 %r27, %rd351; + shr.u64 %rd352, %rd650, 16; + cvt.u32.u64 %r28, %rd352; + shr.u64 %rd353, %rd650, 24; + cvt.u32.u64 %r29, %rd353; + 
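+	// NOTE (hand-added annotation, not nvcc output): the loop above (BB0_7) is
+	// the 24-round Keccak-f[1600] permutation, iota taken from RC[]. Below, the
+	// first four 64-bit state words are unpacked into 4-bit nibbles via the
+	// shr/cvt/and/bfe sequence; they form the 64-element input vector of the
+	// heavy-hash matrix product.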
shr.u64 %rd354, %rd650, 32; + cvt.u32.u64 %r30, %rd354; + shr.u64 %rd355, %rd650, 40; + cvt.u32.u64 %r31, %rd355; + shr.u64 %rd356, %rd650, 48; + cvt.u32.u64 %r32, %rd356; + shr.u64 %rd357, %rd650, 56; + cvt.u32.u64 %r33, %rd357; + and.b32 %r34, %r12, 15; + bfe.u32 %r35, %r12, 12, 4; + and.b32 %r36, %r13, 15; + bfe.u32 %r37, %r12, 20, 4; + and.b32 %r38, %r14, 15; + shr.u32 %r39, %r12, 28; + and.b32 %r40, %r15, 15; + shr.u64 %rd358, %rd660, 36; + cvt.u32.u64 %r41, %rd358; + and.b32 %r42, %r41, 15; + and.b32 %r43, %r16, 15; + shr.u64 %rd359, %rd660, 44; + cvt.u32.u64 %r44, %rd359; + and.b32 %r45, %r44, 15; + and.b32 %r46, %r17, 15; + shr.u64 %rd360, %rd660, 52; + cvt.u32.u64 %r47, %rd360; + and.b32 %r48, %r47, 15; + and.b32 %r49, %r18, 15; + shr.u64 %rd361, %rd660, 60; + cvt.u32.u64 %r50, %rd361; + and.b32 %r51, %r19, 15; + cvt.u32.u64 %r52, %rd655; + and.b32 %r53, %r52, 15; + bfe.u32 %r54, %r52, 12, 4; + and.b32 %r55, %r20, 15; + bfe.u32 %r56, %r52, 20, 4; + and.b32 %r57, %r21, 15; + shr.u32 %r58, %r52, 28; + and.b32 %r59, %r22, 15; + shr.u64 %rd362, %rd655, 36; + cvt.u32.u64 %r60, %rd362; + and.b32 %r61, %r60, 15; + and.b32 %r62, %r23, 15; + shr.u64 %rd363, %rd655, 44; + cvt.u32.u64 %r63, %rd363; + and.b32 %r64, %r63, 15; + and.b32 %r65, %r24, 15; + shr.u64 %rd364, %rd655, 52; + cvt.u32.u64 %r66, %rd364; + and.b32 %r67, %r66, 15; + and.b32 %r68, %r25, 15; + shr.u64 %rd365, %rd655, 60; + cvt.u32.u64 %r69, %rd365; + and.b32 %r70, %r26, 15; + cvt.u32.u64 %r71, %rd650; + and.b32 %r72, %r71, 15; + bfe.u32 %r73, %r71, 12, 4; + and.b32 %r74, %r27, 15; + bfe.u32 %r75, %r71, 20, 4; + and.b32 %r76, %r28, 15; + shr.u32 %r77, %r71, 28; + and.b32 %r78, %r29, 15; + shr.u64 %rd366, %rd650, 36; + cvt.u32.u64 %r79, %rd366; + and.b32 %r80, %r79, 15; + and.b32 %r81, %r30, 15; + shr.u64 %rd367, %rd650, 44; + cvt.u32.u64 %r82, %rd367; + and.b32 %r83, %r82, 15; + and.b32 %r84, %r31, 15; + shr.u64 %rd368, %rd650, 52; + cvt.u32.u64 %r85, %rd368; + and.b32 %r86, %r85, 15; + and.b32 %r87, %r32, 15; + shr.u64 %rd369, %rd650, 60; + cvt.u32.u64 %r88, %rd369; + and.b32 %r89, %r33, 15; + cvt.u32.u64 %r90, %rd645; + and.b32 %r91, %r90, 15; + shr.u64 %rd370, %rd645, 8; + cvt.u32.u64 %r92, %rd370; + bfe.u32 %r93, %r90, 12, 4; + and.b32 %r94, %r92, 15; + shr.u64 %rd371, %rd645, 16; + cvt.u32.u64 %r95, %rd371; + bfe.u32 %r96, %r90, 20, 4; + and.b32 %r97, %r95, 15; + shr.u64 %rd372, %rd645, 24; + cvt.u32.u64 %r98, %rd372; + shr.u32 %r99, %r90, 28; + and.b32 %r100, %r98, 15; + shr.u64 %rd373, %rd645, 32; + cvt.u32.u64 %r101, %rd373; + shr.u64 %rd374, %rd645, 36; + cvt.u32.u64 %r102, %rd374; + and.b32 %r103, %r102, 15; + and.b32 %r104, %r101, 15; + shr.u64 %rd375, %rd645, 40; + cvt.u32.u64 %r105, %rd375; + shr.u64 %rd376, %rd645, 44; + cvt.u32.u64 %r106, %rd376; + and.b32 %r107, %r106, 15; + and.b32 %r108, %r105, 15; + shr.u64 %rd377, %rd645, 48; + cvt.u32.u64 %r109, %rd377; + shr.u64 %rd378, %rd645, 52; + cvt.u32.u64 %r110, %rd378; + and.b32 %r111, %r110, 15; + and.b32 %r112, %r109, 15; + shr.u64 %rd379, %rd645, 56; + cvt.u32.u64 %r113, %rd379; + shr.u64 %rd380, %rd645, 60; + cvt.u32.u64 %r114, %rd380; + and.b32 %r115, %r113, 15; + ld.const.v4.u8 {%rs2, %rs3, %rs4, %rs5}, [matrix]; + cvt.u32.u16 %r116, %rs5; + cvt.s32.s8 %r117, %r116; + cvt.u32.u16 %r118, %rs4; + cvt.s32.s8 %r119, %r118; + cvt.u32.u16 %r120, %rs2; + cvt.s32.s8 %r121, %r120; + cvt.u32.u16 %r122, %rs3; + cvt.s32.s8 %r123, %r122; + bfe.u32 %r124, %r12, 4, 4; + mul.lo.s32 %r125, %r34, %r123; + mad.lo.s32 %r126, %r124, %r121, %r125; + mad.lo.s32 %r127, %r35, %r119, 
%r126; + mad.lo.s32 %r128, %r36, %r117, %r127; + ld.const.v4.u8 {%rs10, %rs11, %rs12, %rs13}, [matrix+4]; + cvt.u32.u16 %r129, %rs13; + cvt.s32.s8 %r130, %r129; + cvt.u32.u16 %r131, %rs12; + cvt.s32.s8 %r132, %r131; + cvt.u32.u16 %r133, %rs11; + cvt.s32.s8 %r134, %r133; + cvt.u32.u16 %r135, %rs10; + cvt.s32.s8 %r136, %r135; + mad.lo.s32 %r137, %r37, %r136, %r128; + mad.lo.s32 %r138, %r38, %r134, %r137; + mad.lo.s32 %r139, %r39, %r132, %r138; + mad.lo.s32 %r140, %r40, %r130, %r139; + ld.const.v4.u8 {%rs18, %rs19, %rs20, %rs21}, [matrix+8]; + cvt.u32.u16 %r141, %rs21; + cvt.s32.s8 %r142, %r141; + cvt.u32.u16 %r143, %rs20; + cvt.s32.s8 %r144, %r143; + cvt.u32.u16 %r145, %rs19; + cvt.s32.s8 %r146, %r145; + cvt.u32.u16 %r147, %rs18; + cvt.s32.s8 %r148, %r147; + mad.lo.s32 %r149, %r42, %r148, %r140; + mad.lo.s32 %r150, %r43, %r146, %r149; + mad.lo.s32 %r151, %r45, %r144, %r150; + mad.lo.s32 %r152, %r46, %r142, %r151; + ld.const.v4.u8 {%rs26, %rs27, %rs28, %rs29}, [matrix+12]; + cvt.u32.u16 %r153, %rs29; + cvt.s32.s8 %r154, %r153; + cvt.u32.u16 %r155, %rs28; + cvt.s32.s8 %r156, %r155; + cvt.u32.u16 %r157, %rs27; + cvt.s32.s8 %r158, %r157; + cvt.u32.u16 %r159, %rs26; + cvt.s32.s8 %r160, %r159; + mad.lo.s32 %r161, %r48, %r160, %r152; + mad.lo.s32 %r162, %r49, %r158, %r161; + mad.lo.s32 %r163, %r50, %r156, %r162; + mad.lo.s32 %r164, %r51, %r154, %r163; + ld.const.v4.u8 {%rs34, %rs35, %rs36, %rs37}, [matrix+16]; + cvt.u32.u16 %r165, %rs37; + cvt.s32.s8 %r166, %r165; + cvt.u32.u16 %r167, %rs36; + cvt.s32.s8 %r168, %r167; + cvt.u32.u16 %r169, %rs35; + cvt.s32.s8 %r170, %r169; + cvt.u32.u16 %r171, %rs34; + cvt.s32.s8 %r172, %r171; + bfe.u32 %r173, %r52, 4, 4; + mad.lo.s32 %r174, %r173, %r172, %r164; + mad.lo.s32 %r175, %r53, %r170, %r174; + mad.lo.s32 %r176, %r54, %r168, %r175; + mad.lo.s32 %r177, %r55, %r166, %r176; + ld.const.v4.u8 {%rs42, %rs43, %rs44, %rs45}, [matrix+20]; + cvt.u32.u16 %r178, %rs45; + cvt.s32.s8 %r179, %r178; + cvt.u32.u16 %r180, %rs44; + cvt.s32.s8 %r181, %r180; + cvt.u32.u16 %r182, %rs43; + cvt.s32.s8 %r183, %r182; + cvt.u32.u16 %r184, %rs42; + cvt.s32.s8 %r185, %r184; + mad.lo.s32 %r186, %r56, %r185, %r177; + mad.lo.s32 %r187, %r57, %r183, %r186; + mad.lo.s32 %r188, %r58, %r181, %r187; + mad.lo.s32 %r189, %r59, %r179, %r188; + ld.const.v4.u8 {%rs50, %rs51, %rs52, %rs53}, [matrix+24]; + cvt.u32.u16 %r190, %rs53; + cvt.s32.s8 %r191, %r190; + cvt.u32.u16 %r192, %rs52; + cvt.s32.s8 %r193, %r192; + cvt.u32.u16 %r194, %rs51; + cvt.s32.s8 %r195, %r194; + cvt.u32.u16 %r196, %rs50; + cvt.s32.s8 %r197, %r196; + mad.lo.s32 %r198, %r61, %r197, %r189; + mad.lo.s32 %r199, %r62, %r195, %r198; + mad.lo.s32 %r200, %r64, %r193, %r199; + mad.lo.s32 %r201, %r65, %r191, %r200; + ld.const.v4.u8 {%rs58, %rs59, %rs60, %rs61}, [matrix+28]; + cvt.u32.u16 %r202, %rs61; + cvt.s32.s8 %r203, %r202; + cvt.u32.u16 %r204, %rs60; + cvt.s32.s8 %r205, %r204; + cvt.u32.u16 %r206, %rs59; + cvt.s32.s8 %r207, %r206; + cvt.u32.u16 %r208, %rs58; + cvt.s32.s8 %r209, %r208; + mad.lo.s32 %r210, %r67, %r209, %r201; + mad.lo.s32 %r211, %r68, %r207, %r210; + mad.lo.s32 %r212, %r69, %r205, %r211; + mad.lo.s32 %r213, %r70, %r203, %r212; + ld.const.v4.u8 {%rs66, %rs67, %rs68, %rs69}, [matrix+32]; + cvt.u32.u16 %r214, %rs69; + cvt.s32.s8 %r215, %r214; + cvt.u32.u16 %r216, %rs68; + cvt.s32.s8 %r217, %r216; + cvt.u32.u16 %r218, %rs67; + cvt.s32.s8 %r219, %r218; + cvt.u32.u16 %r220, %rs66; + cvt.s32.s8 %r221, %r220; + bfe.u32 %r222, %r71, 4, 4; + mad.lo.s32 %r223, %r222, %r221, %r213; + mad.lo.s32 %r224, %r72, %r219, %r223; + 
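+	// NOTE (hand-added annotation, not nvcc output): each ld.const.v4.u8 loads
+	// four bytes of the 64x64 'matrix' constant, cvt.s32.s8 sign-extends them,
+	// and the mad.lo.s32 chains accumulate one matrix row's dot product with
+	// the 64 input nibbles; the whole 64x64 product is fully unrolled.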
mad.lo.s32 %r225, %r73, %r217, %r224; + mad.lo.s32 %r226, %r74, %r215, %r225; + ld.const.v4.u8 {%rs74, %rs75, %rs76, %rs77}, [matrix+36]; + cvt.u32.u16 %r227, %rs77; + cvt.s32.s8 %r228, %r227; + cvt.u32.u16 %r229, %rs76; + cvt.s32.s8 %r230, %r229; + cvt.u32.u16 %r231, %rs75; + cvt.s32.s8 %r232, %r231; + cvt.u32.u16 %r233, %rs74; + cvt.s32.s8 %r234, %r233; + mad.lo.s32 %r235, %r75, %r234, %r226; + mad.lo.s32 %r236, %r76, %r232, %r235; + mad.lo.s32 %r237, %r77, %r230, %r236; + mad.lo.s32 %r238, %r78, %r228, %r237; + ld.const.v4.u8 {%rs82, %rs83, %rs84, %rs85}, [matrix+40]; + cvt.u32.u16 %r239, %rs85; + cvt.s32.s8 %r240, %r239; + cvt.u32.u16 %r241, %rs84; + cvt.s32.s8 %r242, %r241; + cvt.u32.u16 %r243, %rs83; + cvt.s32.s8 %r244, %r243; + cvt.u32.u16 %r245, %rs82; + cvt.s32.s8 %r246, %r245; + mad.lo.s32 %r247, %r80, %r246, %r238; + mad.lo.s32 %r248, %r81, %r244, %r247; + mad.lo.s32 %r249, %r83, %r242, %r248; + mad.lo.s32 %r250, %r84, %r240, %r249; + ld.const.v4.u8 {%rs90, %rs91, %rs92, %rs93}, [matrix+44]; + cvt.u32.u16 %r251, %rs93; + cvt.s32.s8 %r252, %r251; + cvt.u32.u16 %r253, %rs92; + cvt.s32.s8 %r254, %r253; + cvt.u32.u16 %r255, %rs91; + cvt.s32.s8 %r256, %r255; + cvt.u32.u16 %r257, %rs90; + cvt.s32.s8 %r258, %r257; + mad.lo.s32 %r259, %r86, %r258, %r250; + mad.lo.s32 %r260, %r87, %r256, %r259; + mad.lo.s32 %r261, %r88, %r254, %r260; + mad.lo.s32 %r262, %r89, %r252, %r261; + ld.const.v4.u8 {%rs98, %rs99, %rs100, %rs101}, [matrix+48]; + cvt.u32.u16 %r263, %rs101; + cvt.s32.s8 %r264, %r263; + cvt.u32.u16 %r265, %rs100; + cvt.s32.s8 %r266, %r265; + cvt.u32.u16 %r267, %rs99; + cvt.s32.s8 %r268, %r267; + cvt.u32.u16 %r269, %rs98; + cvt.s32.s8 %r270, %r269; + bfe.u32 %r271, %r90, 4, 4; + mad.lo.s32 %r272, %r271, %r270, %r262; + mad.lo.s32 %r273, %r91, %r268, %r272; + mad.lo.s32 %r274, %r93, %r266, %r273; + mad.lo.s32 %r275, %r94, %r264, %r274; + ld.const.v4.u8 {%rs106, %rs107, %rs108, %rs109}, [matrix+52]; + cvt.u32.u16 %r276, %rs109; + cvt.s32.s8 %r277, %r276; + cvt.u32.u16 %r278, %rs108; + cvt.s32.s8 %r279, %r278; + cvt.u32.u16 %r280, %rs107; + cvt.s32.s8 %r281, %r280; + cvt.u32.u16 %r282, %rs106; + cvt.s32.s8 %r283, %r282; + mad.lo.s32 %r284, %r96, %r283, %r275; + mad.lo.s32 %r285, %r97, %r281, %r284; + mad.lo.s32 %r286, %r99, %r279, %r285; + mad.lo.s32 %r287, %r100, %r277, %r286; + ld.const.v4.u8 {%rs114, %rs115, %rs116, %rs117}, [matrix+56]; + cvt.u32.u16 %r288, %rs117; + cvt.s32.s8 %r289, %r288; + cvt.u32.u16 %r290, %rs116; + cvt.s32.s8 %r291, %r290; + cvt.u32.u16 %r292, %rs115; + cvt.s32.s8 %r293, %r292; + cvt.u32.u16 %r294, %rs114; + cvt.s32.s8 %r295, %r294; + mad.lo.s32 %r296, %r103, %r295, %r287; + mad.lo.s32 %r297, %r104, %r293, %r296; + mad.lo.s32 %r298, %r107, %r291, %r297; + mad.lo.s32 %r299, %r108, %r289, %r298; + ld.const.v4.u8 {%rs122, %rs123, %rs124, %rs125}, [matrix+60]; + cvt.u32.u16 %r300, %rs125; + cvt.s32.s8 %r301, %r300; + cvt.u32.u16 %r302, %rs124; + cvt.s32.s8 %r303, %r302; + cvt.u32.u16 %r304, %rs123; + cvt.s32.s8 %r305, %r304; + cvt.u32.u16 %r306, %rs122; + cvt.s32.s8 %r307, %r306; + mad.lo.s32 %r308, %r111, %r307, %r299; + mad.lo.s32 %r309, %r112, %r305, %r308; + mad.lo.s32 %r310, %r114, %r303, %r309; + mad.lo.s32 %r311, %r115, %r301, %r310; + ld.const.v4.u8 {%rs130, %rs131, %rs132, %rs133}, [matrix+64]; + cvt.u32.u16 %r312, %rs133; + cvt.s32.s8 %r313, %r312; + cvt.u32.u16 %r314, %rs132; + cvt.s32.s8 %r315, %r314; + cvt.u32.u16 %r316, %rs130; + cvt.s32.s8 %r317, %r316; + cvt.u32.u16 %r318, %rs131; + cvt.s32.s8 %r319, %r318; + mul.lo.s32 %r320, %r34, %r319; + 
mad.lo.s32 %r321, %r124, %r317, %r320; + mad.lo.s32 %r322, %r35, %r315, %r321; + mad.lo.s32 %r323, %r36, %r313, %r322; + ld.const.v4.u8 {%rs138, %rs139, %rs140, %rs141}, [matrix+68]; + cvt.u32.u16 %r324, %rs141; + cvt.s32.s8 %r325, %r324; + cvt.u32.u16 %r326, %rs140; + cvt.s32.s8 %r327, %r326; + cvt.u32.u16 %r328, %rs139; + cvt.s32.s8 %r329, %r328; + cvt.u32.u16 %r330, %rs138; + cvt.s32.s8 %r331, %r330; + mad.lo.s32 %r332, %r37, %r331, %r323; + mad.lo.s32 %r333, %r38, %r329, %r332; + mad.lo.s32 %r334, %r39, %r327, %r333; + mad.lo.s32 %r335, %r40, %r325, %r334; + ld.const.v4.u8 {%rs146, %rs147, %rs148, %rs149}, [matrix+72]; + cvt.u32.u16 %r336, %rs149; + cvt.s32.s8 %r337, %r336; + cvt.u32.u16 %r338, %rs148; + cvt.s32.s8 %r339, %r338; + cvt.u32.u16 %r340, %rs147; + cvt.s32.s8 %r341, %r340; + cvt.u32.u16 %r342, %rs146; + cvt.s32.s8 %r343, %r342; + mad.lo.s32 %r344, %r42, %r343, %r335; + mad.lo.s32 %r345, %r43, %r341, %r344; + mad.lo.s32 %r346, %r45, %r339, %r345; + mad.lo.s32 %r347, %r46, %r337, %r346; + ld.const.v4.u8 {%rs154, %rs155, %rs156, %rs157}, [matrix+76]; + cvt.u32.u16 %r348, %rs157; + cvt.s32.s8 %r349, %r348; + cvt.u32.u16 %r350, %rs156; + cvt.s32.s8 %r351, %r350; + cvt.u32.u16 %r352, %rs155; + cvt.s32.s8 %r353, %r352; + cvt.u32.u16 %r354, %rs154; + cvt.s32.s8 %r355, %r354; + mad.lo.s32 %r356, %r48, %r355, %r347; + mad.lo.s32 %r357, %r49, %r353, %r356; + mad.lo.s32 %r358, %r50, %r351, %r357; + mad.lo.s32 %r359, %r51, %r349, %r358; + ld.const.v4.u8 {%rs162, %rs163, %rs164, %rs165}, [matrix+80]; + cvt.u32.u16 %r360, %rs165; + cvt.s32.s8 %r361, %r360; + cvt.u32.u16 %r362, %rs164; + cvt.s32.s8 %r363, %r362; + cvt.u32.u16 %r364, %rs163; + cvt.s32.s8 %r365, %r364; + cvt.u32.u16 %r366, %rs162; + cvt.s32.s8 %r367, %r366; + mad.lo.s32 %r368, %r173, %r367, %r359; + mad.lo.s32 %r369, %r53, %r365, %r368; + mad.lo.s32 %r370, %r54, %r363, %r369; + mad.lo.s32 %r371, %r55, %r361, %r370; + ld.const.v4.u8 {%rs170, %rs171, %rs172, %rs173}, [matrix+84]; + cvt.u32.u16 %r372, %rs173; + cvt.s32.s8 %r373, %r372; + cvt.u32.u16 %r374, %rs172; + cvt.s32.s8 %r375, %r374; + cvt.u32.u16 %r376, %rs171; + cvt.s32.s8 %r377, %r376; + cvt.u32.u16 %r378, %rs170; + cvt.s32.s8 %r379, %r378; + mad.lo.s32 %r380, %r56, %r379, %r371; + mad.lo.s32 %r381, %r57, %r377, %r380; + mad.lo.s32 %r382, %r58, %r375, %r381; + mad.lo.s32 %r383, %r59, %r373, %r382; + ld.const.v4.u8 {%rs178, %rs179, %rs180, %rs181}, [matrix+88]; + cvt.u32.u16 %r384, %rs181; + cvt.s32.s8 %r385, %r384; + cvt.u32.u16 %r386, %rs180; + cvt.s32.s8 %r387, %r386; + cvt.u32.u16 %r388, %rs179; + cvt.s32.s8 %r389, %r388; + cvt.u32.u16 %r390, %rs178; + cvt.s32.s8 %r391, %r390; + mad.lo.s32 %r392, %r61, %r391, %r383; + mad.lo.s32 %r393, %r62, %r389, %r392; + mad.lo.s32 %r394, %r64, %r387, %r393; + mad.lo.s32 %r395, %r65, %r385, %r394; + ld.const.v4.u8 {%rs186, %rs187, %rs188, %rs189}, [matrix+92]; + cvt.u32.u16 %r396, %rs189; + cvt.s32.s8 %r397, %r396; + cvt.u32.u16 %r398, %rs188; + cvt.s32.s8 %r399, %r398; + cvt.u32.u16 %r400, %rs187; + cvt.s32.s8 %r401, %r400; + cvt.u32.u16 %r402, %rs186; + cvt.s32.s8 %r403, %r402; + mad.lo.s32 %r404, %r67, %r403, %r395; + mad.lo.s32 %r405, %r68, %r401, %r404; + mad.lo.s32 %r406, %r69, %r399, %r405; + mad.lo.s32 %r407, %r70, %r397, %r406; + ld.const.v4.u8 {%rs194, %rs195, %rs196, %rs197}, [matrix+96]; + cvt.u32.u16 %r408, %rs197; + cvt.s32.s8 %r409, %r408; + cvt.u32.u16 %r410, %rs196; + cvt.s32.s8 %r411, %r410; + cvt.u32.u16 %r412, %rs195; + cvt.s32.s8 %r413, %r412; + cvt.u32.u16 %r414, %rs194; + cvt.s32.s8 %r415, %r414; + 
mad.lo.s32 %r416, %r222, %r415, %r407; + mad.lo.s32 %r417, %r72, %r413, %r416; + mad.lo.s32 %r418, %r73, %r411, %r417; + mad.lo.s32 %r419, %r74, %r409, %r418; + ld.const.v4.u8 {%rs202, %rs203, %rs204, %rs205}, [matrix+100]; + cvt.u32.u16 %r420, %rs205; + cvt.s32.s8 %r421, %r420; + cvt.u32.u16 %r422, %rs204; + cvt.s32.s8 %r423, %r422; + cvt.u32.u16 %r424, %rs203; + cvt.s32.s8 %r425, %r424; + cvt.u32.u16 %r426, %rs202; + cvt.s32.s8 %r427, %r426; + mad.lo.s32 %r428, %r75, %r427, %r419; + mad.lo.s32 %r429, %r76, %r425, %r428; + mad.lo.s32 %r430, %r77, %r423, %r429; + mad.lo.s32 %r431, %r78, %r421, %r430; + ld.const.v4.u8 {%rs210, %rs211, %rs212, %rs213}, [matrix+104]; + cvt.u32.u16 %r432, %rs213; + cvt.s32.s8 %r433, %r432; + cvt.u32.u16 %r434, %rs212; + cvt.s32.s8 %r435, %r434; + cvt.u32.u16 %r436, %rs211; + cvt.s32.s8 %r437, %r436; + cvt.u32.u16 %r438, %rs210; + cvt.s32.s8 %r439, %r438; + mad.lo.s32 %r440, %r80, %r439, %r431; + mad.lo.s32 %r441, %r81, %r437, %r440; + mad.lo.s32 %r442, %r83, %r435, %r441; + mad.lo.s32 %r443, %r84, %r433, %r442; + ld.const.v4.u8 {%rs218, %rs219, %rs220, %rs221}, [matrix+108]; + cvt.u32.u16 %r444, %rs221; + cvt.s32.s8 %r445, %r444; + cvt.u32.u16 %r446, %rs220; + cvt.s32.s8 %r447, %r446; + cvt.u32.u16 %r448, %rs219; + cvt.s32.s8 %r449, %r448; + cvt.u32.u16 %r450, %rs218; + cvt.s32.s8 %r451, %r450; + mad.lo.s32 %r452, %r86, %r451, %r443; + mad.lo.s32 %r453, %r87, %r449, %r452; + mad.lo.s32 %r454, %r88, %r447, %r453; + mad.lo.s32 %r455, %r89, %r445, %r454; + ld.const.v4.u8 {%rs226, %rs227, %rs228, %rs229}, [matrix+112]; + cvt.u32.u16 %r456, %rs229; + cvt.s32.s8 %r457, %r456; + cvt.u32.u16 %r458, %rs228; + cvt.s32.s8 %r459, %r458; + cvt.u32.u16 %r460, %rs227; + cvt.s32.s8 %r461, %r460; + cvt.u32.u16 %r462, %rs226; + cvt.s32.s8 %r463, %r462; + mad.lo.s32 %r464, %r271, %r463, %r455; + mad.lo.s32 %r465, %r91, %r461, %r464; + mad.lo.s32 %r466, %r93, %r459, %r465; + mad.lo.s32 %r467, %r94, %r457, %r466; + ld.const.v4.u8 {%rs234, %rs235, %rs236, %rs237}, [matrix+116]; + cvt.u32.u16 %r468, %rs237; + cvt.s32.s8 %r469, %r468; + cvt.u32.u16 %r470, %rs236; + cvt.s32.s8 %r471, %r470; + cvt.u32.u16 %r472, %rs235; + cvt.s32.s8 %r473, %r472; + cvt.u32.u16 %r474, %rs234; + cvt.s32.s8 %r475, %r474; + mad.lo.s32 %r476, %r96, %r475, %r467; + mad.lo.s32 %r477, %r97, %r473, %r476; + mad.lo.s32 %r478, %r99, %r471, %r477; + mad.lo.s32 %r479, %r100, %r469, %r478; + ld.const.v4.u8 {%rs242, %rs243, %rs244, %rs245}, [matrix+120]; + cvt.u32.u16 %r480, %rs245; + cvt.s32.s8 %r481, %r480; + cvt.u32.u16 %r482, %rs244; + cvt.s32.s8 %r483, %r482; + cvt.u32.u16 %r484, %rs243; + cvt.s32.s8 %r485, %r484; + cvt.u32.u16 %r486, %rs242; + cvt.s32.s8 %r487, %r486; + mad.lo.s32 %r488, %r103, %r487, %r479; + mad.lo.s32 %r489, %r104, %r485, %r488; + mad.lo.s32 %r490, %r107, %r483, %r489; + mad.lo.s32 %r491, %r108, %r481, %r490; + ld.const.v4.u8 {%rs250, %rs251, %rs252, %rs253}, [matrix+124]; + cvt.u32.u16 %r492, %rs253; + cvt.s32.s8 %r493, %r492; + cvt.u32.u16 %r494, %rs252; + cvt.s32.s8 %r495, %r494; + cvt.u32.u16 %r496, %rs251; + cvt.s32.s8 %r497, %r496; + cvt.u32.u16 %r498, %rs250; + cvt.s32.s8 %r499, %r498; + mad.lo.s32 %r500, %r111, %r499, %r491; + mad.lo.s32 %r501, %r112, %r497, %r500; + mad.lo.s32 %r502, %r114, %r495, %r501; + mad.lo.s32 %r503, %r115, %r493, %r502; + shr.u32 %r504, %r311, 6; + and.b32 %r505, %r504, 240; + shr.u32 %r506, %r503, 10; + or.b32 %r507, %r506, %r505; + xor.b32 %r508, %r12, %r507; + cvt.u64.u32 %rd381, %r508; + ld.const.v4.u8 {%rs258, %rs259, %rs260, %rs261}, [matrix+128]; + 
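+	// NOTE (hand-added annotation, not nvcc output): two row sums are folded
+	// into one output byte - each sum fits in 14 bits (64 products of 4-bit
+	// values), so only its top nibble (>> 10) survives; the pair is packed
+	// high/low and XORed with the corresponding input byte (here %r12) before
+	// the second Keccak pass over the result.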
cvt.u32.u16 %r509, %rs261; + cvt.s32.s8 %r510, %r509; + cvt.u32.u16 %r511, %rs260; + cvt.s32.s8 %r512, %r511; + cvt.u32.u16 %r513, %rs258; + cvt.s32.s8 %r514, %r513; + cvt.u32.u16 %r515, %rs259; + cvt.s32.s8 %r516, %r515; + mul.lo.s32 %r517, %r34, %r516; + mad.lo.s32 %r518, %r124, %r514, %r517; + mad.lo.s32 %r519, %r35, %r512, %r518; + mad.lo.s32 %r520, %r36, %r510, %r519; + ld.const.v4.u8 {%rs266, %rs267, %rs268, %rs269}, [matrix+132]; + cvt.u32.u16 %r521, %rs269; + cvt.s32.s8 %r522, %r521; + cvt.u32.u16 %r523, %rs268; + cvt.s32.s8 %r524, %r523; + cvt.u32.u16 %r525, %rs267; + cvt.s32.s8 %r526, %r525; + cvt.u32.u16 %r527, %rs266; + cvt.s32.s8 %r528, %r527; + mad.lo.s32 %r529, %r37, %r528, %r520; + mad.lo.s32 %r530, %r38, %r526, %r529; + mad.lo.s32 %r531, %r39, %r524, %r530; + mad.lo.s32 %r532, %r40, %r522, %r531; + ld.const.v4.u8 {%rs274, %rs275, %rs276, %rs277}, [matrix+136]; + cvt.u32.u16 %r533, %rs277; + cvt.s32.s8 %r534, %r533; + cvt.u32.u16 %r535, %rs276; + cvt.s32.s8 %r536, %r535; + cvt.u32.u16 %r537, %rs275; + cvt.s32.s8 %r538, %r537; + cvt.u32.u16 %r539, %rs274; + cvt.s32.s8 %r540, %r539; + mad.lo.s32 %r541, %r42, %r540, %r532; + mad.lo.s32 %r542, %r43, %r538, %r541; + mad.lo.s32 %r543, %r45, %r536, %r542; + mad.lo.s32 %r544, %r46, %r534, %r543; + ld.const.v4.u8 {%rs282, %rs283, %rs284, %rs285}, [matrix+140]; + cvt.u32.u16 %r545, %rs285; + cvt.s32.s8 %r546, %r545; + cvt.u32.u16 %r547, %rs284; + cvt.s32.s8 %r548, %r547; + cvt.u32.u16 %r549, %rs283; + cvt.s32.s8 %r550, %r549; + cvt.u32.u16 %r551, %rs282; + cvt.s32.s8 %r552, %r551; + mad.lo.s32 %r553, %r48, %r552, %r544; + mad.lo.s32 %r554, %r49, %r550, %r553; + mad.lo.s32 %r555, %r50, %r548, %r554; + mad.lo.s32 %r556, %r51, %r546, %r555; + ld.const.v4.u8 {%rs290, %rs291, %rs292, %rs293}, [matrix+144]; + cvt.u32.u16 %r557, %rs293; + cvt.s32.s8 %r558, %r557; + cvt.u32.u16 %r559, %rs292; + cvt.s32.s8 %r560, %r559; + cvt.u32.u16 %r561, %rs291; + cvt.s32.s8 %r562, %r561; + cvt.u32.u16 %r563, %rs290; + cvt.s32.s8 %r564, %r563; + mad.lo.s32 %r565, %r173, %r564, %r556; + mad.lo.s32 %r566, %r53, %r562, %r565; + mad.lo.s32 %r567, %r54, %r560, %r566; + mad.lo.s32 %r568, %r55, %r558, %r567; + ld.const.v4.u8 {%rs298, %rs299, %rs300, %rs301}, [matrix+148]; + cvt.u32.u16 %r569, %rs301; + cvt.s32.s8 %r570, %r569; + cvt.u32.u16 %r571, %rs300; + cvt.s32.s8 %r572, %r571; + cvt.u32.u16 %r573, %rs299; + cvt.s32.s8 %r574, %r573; + cvt.u32.u16 %r575, %rs298; + cvt.s32.s8 %r576, %r575; + mad.lo.s32 %r577, %r56, %r576, %r568; + mad.lo.s32 %r578, %r57, %r574, %r577; + mad.lo.s32 %r579, %r58, %r572, %r578; + mad.lo.s32 %r580, %r59, %r570, %r579; + ld.const.v4.u8 {%rs306, %rs307, %rs308, %rs309}, [matrix+152]; + cvt.u32.u16 %r581, %rs309; + cvt.s32.s8 %r582, %r581; + cvt.u32.u16 %r583, %rs308; + cvt.s32.s8 %r584, %r583; + cvt.u32.u16 %r585, %rs307; + cvt.s32.s8 %r586, %r585; + cvt.u32.u16 %r587, %rs306; + cvt.s32.s8 %r588, %r587; + mad.lo.s32 %r589, %r61, %r588, %r580; + mad.lo.s32 %r590, %r62, %r586, %r589; + mad.lo.s32 %r591, %r64, %r584, %r590; + mad.lo.s32 %r592, %r65, %r582, %r591; + ld.const.v4.u8 {%rs314, %rs315, %rs316, %rs317}, [matrix+156]; + cvt.u32.u16 %r593, %rs317; + cvt.s32.s8 %r594, %r593; + cvt.u32.u16 %r595, %rs316; + cvt.s32.s8 %r596, %r595; + cvt.u32.u16 %r597, %rs315; + cvt.s32.s8 %r598, %r597; + cvt.u32.u16 %r599, %rs314; + cvt.s32.s8 %r600, %r599; + mad.lo.s32 %r601, %r67, %r600, %r592; + mad.lo.s32 %r602, %r68, %r598, %r601; + mad.lo.s32 %r603, %r69, %r596, %r602; + mad.lo.s32 %r604, %r70, %r594, %r603; + ld.const.v4.u8 {%rs322, %rs323, 
%rs324, %rs325}, [matrix+160]; + cvt.u32.u16 %r605, %rs325; + cvt.s32.s8 %r606, %r605; + cvt.u32.u16 %r607, %rs324; + cvt.s32.s8 %r608, %r607; + cvt.u32.u16 %r609, %rs323; + cvt.s32.s8 %r610, %r609; + cvt.u32.u16 %r611, %rs322; + cvt.s32.s8 %r612, %r611; + mad.lo.s32 %r613, %r222, %r612, %r604; + mad.lo.s32 %r614, %r72, %r610, %r613; + mad.lo.s32 %r615, %r73, %r608, %r614; + mad.lo.s32 %r616, %r74, %r606, %r615; + ld.const.v4.u8 {%rs330, %rs331, %rs332, %rs333}, [matrix+164]; + cvt.u32.u16 %r617, %rs333; + cvt.s32.s8 %r618, %r617; + cvt.u32.u16 %r619, %rs332; + cvt.s32.s8 %r620, %r619; + cvt.u32.u16 %r621, %rs331; + cvt.s32.s8 %r622, %r621; + cvt.u32.u16 %r623, %rs330; + cvt.s32.s8 %r624, %r623; + mad.lo.s32 %r625, %r75, %r624, %r616; + mad.lo.s32 %r626, %r76, %r622, %r625; + mad.lo.s32 %r627, %r77, %r620, %r626; + mad.lo.s32 %r628, %r78, %r618, %r627; + ld.const.v4.u8 {%rs338, %rs339, %rs340, %rs341}, [matrix+168]; + cvt.u32.u16 %r629, %rs341; + cvt.s32.s8 %r630, %r629; + cvt.u32.u16 %r631, %rs340; + cvt.s32.s8 %r632, %r631; + cvt.u32.u16 %r633, %rs339; + cvt.s32.s8 %r634, %r633; + cvt.u32.u16 %r635, %rs338; + cvt.s32.s8 %r636, %r635; + mad.lo.s32 %r637, %r80, %r636, %r628; + mad.lo.s32 %r638, %r81, %r634, %r637; + mad.lo.s32 %r639, %r83, %r632, %r638; + mad.lo.s32 %r640, %r84, %r630, %r639; + ld.const.v4.u8 {%rs346, %rs347, %rs348, %rs349}, [matrix+172]; + cvt.u32.u16 %r641, %rs349; + cvt.s32.s8 %r642, %r641; + cvt.u32.u16 %r643, %rs348; + cvt.s32.s8 %r644, %r643; + cvt.u32.u16 %r645, %rs347; + cvt.s32.s8 %r646, %r645; + cvt.u32.u16 %r647, %rs346; + cvt.s32.s8 %r648, %r647; + mad.lo.s32 %r649, %r86, %r648, %r640; + mad.lo.s32 %r650, %r87, %r646, %r649; + mad.lo.s32 %r651, %r88, %r644, %r650; + mad.lo.s32 %r652, %r89, %r642, %r651; + ld.const.v4.u8 {%rs354, %rs355, %rs356, %rs357}, [matrix+176]; + cvt.u32.u16 %r653, %rs357; + cvt.s32.s8 %r654, %r653; + cvt.u32.u16 %r655, %rs356; + cvt.s32.s8 %r656, %r655; + cvt.u32.u16 %r657, %rs355; + cvt.s32.s8 %r658, %r657; + cvt.u32.u16 %r659, %rs354; + cvt.s32.s8 %r660, %r659; + mad.lo.s32 %r661, %r271, %r660, %r652; + mad.lo.s32 %r662, %r91, %r658, %r661; + mad.lo.s32 %r663, %r93, %r656, %r662; + mad.lo.s32 %r664, %r94, %r654, %r663; + ld.const.v4.u8 {%rs362, %rs363, %rs364, %rs365}, [matrix+180]; + cvt.u32.u16 %r665, %rs365; + cvt.s32.s8 %r666, %r665; + cvt.u32.u16 %r667, %rs364; + cvt.s32.s8 %r668, %r667; + cvt.u32.u16 %r669, %rs363; + cvt.s32.s8 %r670, %r669; + cvt.u32.u16 %r671, %rs362; + cvt.s32.s8 %r672, %r671; + mad.lo.s32 %r673, %r96, %r672, %r664; + mad.lo.s32 %r674, %r97, %r670, %r673; + mad.lo.s32 %r675, %r99, %r668, %r674; + mad.lo.s32 %r676, %r100, %r666, %r675; + ld.const.v4.u8 {%rs370, %rs371, %rs372, %rs373}, [matrix+184]; + cvt.u32.u16 %r677, %rs373; + cvt.s32.s8 %r678, %r677; + cvt.u32.u16 %r679, %rs372; + cvt.s32.s8 %r680, %r679; + cvt.u32.u16 %r681, %rs371; + cvt.s32.s8 %r682, %r681; + cvt.u32.u16 %r683, %rs370; + cvt.s32.s8 %r684, %r683; + mad.lo.s32 %r685, %r103, %r684, %r676; + mad.lo.s32 %r686, %r104, %r682, %r685; + mad.lo.s32 %r687, %r107, %r680, %r686; + mad.lo.s32 %r688, %r108, %r678, %r687; + ld.const.v4.u8 {%rs378, %rs379, %rs380, %rs381}, [matrix+188]; + cvt.u32.u16 %r689, %rs381; + cvt.s32.s8 %r690, %r689; + cvt.u32.u16 %r691, %rs380; + cvt.s32.s8 %r692, %r691; + cvt.u32.u16 %r693, %rs379; + cvt.s32.s8 %r694, %r693; + cvt.u32.u16 %r695, %rs378; + cvt.s32.s8 %r696, %r695; + mad.lo.s32 %r697, %r111, %r696, %r688; + mad.lo.s32 %r698, %r112, %r694, %r697; + mad.lo.s32 %r699, %r114, %r692, %r698; + mad.lo.s32 %r700, %r115, 
%r690, %r699; + ld.const.v4.u8 {%rs386, %rs387, %rs388, %rs389}, [matrix+192]; + cvt.u32.u16 %r701, %rs389; + cvt.s32.s8 %r702, %r701; + cvt.u32.u16 %r703, %rs388; + cvt.s32.s8 %r704, %r703; + cvt.u32.u16 %r705, %rs386; + cvt.s32.s8 %r706, %r705; + cvt.u32.u16 %r707, %rs387; + cvt.s32.s8 %r708, %r707; + mul.lo.s32 %r709, %r34, %r708; + mad.lo.s32 %r710, %r124, %r706, %r709; + mad.lo.s32 %r711, %r35, %r704, %r710; + mad.lo.s32 %r712, %r36, %r702, %r711; + ld.const.v4.u8 {%rs394, %rs395, %rs396, %rs397}, [matrix+196]; + cvt.u32.u16 %r713, %rs397; + cvt.s32.s8 %r714, %r713; + cvt.u32.u16 %r715, %rs396; + cvt.s32.s8 %r716, %r715; + cvt.u32.u16 %r717, %rs395; + cvt.s32.s8 %r718, %r717; + cvt.u32.u16 %r719, %rs394; + cvt.s32.s8 %r720, %r719; + mad.lo.s32 %r721, %r37, %r720, %r712; + mad.lo.s32 %r722, %r38, %r718, %r721; + mad.lo.s32 %r723, %r39, %r716, %r722; + mad.lo.s32 %r724, %r40, %r714, %r723; + ld.const.v4.u8 {%rs402, %rs403, %rs404, %rs405}, [matrix+200]; + cvt.u32.u16 %r725, %rs405; + cvt.s32.s8 %r726, %r725; + cvt.u32.u16 %r727, %rs404; + cvt.s32.s8 %r728, %r727; + cvt.u32.u16 %r729, %rs403; + cvt.s32.s8 %r730, %r729; + cvt.u32.u16 %r731, %rs402; + cvt.s32.s8 %r732, %r731; + mad.lo.s32 %r733, %r42, %r732, %r724; + mad.lo.s32 %r734, %r43, %r730, %r733; + mad.lo.s32 %r735, %r45, %r728, %r734; + mad.lo.s32 %r736, %r46, %r726, %r735; + ld.const.v4.u8 {%rs410, %rs411, %rs412, %rs413}, [matrix+204]; + cvt.u32.u16 %r737, %rs413; + cvt.s32.s8 %r738, %r737; + cvt.u32.u16 %r739, %rs412; + cvt.s32.s8 %r740, %r739; + cvt.u32.u16 %r741, %rs411; + cvt.s32.s8 %r742, %r741; + cvt.u32.u16 %r743, %rs410; + cvt.s32.s8 %r744, %r743; + mad.lo.s32 %r745, %r48, %r744, %r736; + mad.lo.s32 %r746, %r49, %r742, %r745; + mad.lo.s32 %r747, %r50, %r740, %r746; + mad.lo.s32 %r748, %r51, %r738, %r747; + ld.const.v4.u8 {%rs418, %rs419, %rs420, %rs421}, [matrix+208]; + cvt.u32.u16 %r749, %rs421; + cvt.s32.s8 %r750, %r749; + cvt.u32.u16 %r751, %rs420; + cvt.s32.s8 %r752, %r751; + cvt.u32.u16 %r753, %rs419; + cvt.s32.s8 %r754, %r753; + cvt.u32.u16 %r755, %rs418; + cvt.s32.s8 %r756, %r755; + mad.lo.s32 %r757, %r173, %r756, %r748; + mad.lo.s32 %r758, %r53, %r754, %r757; + mad.lo.s32 %r759, %r54, %r752, %r758; + mad.lo.s32 %r760, %r55, %r750, %r759; + ld.const.v4.u8 {%rs426, %rs427, %rs428, %rs429}, [matrix+212]; + cvt.u32.u16 %r761, %rs429; + cvt.s32.s8 %r762, %r761; + cvt.u32.u16 %r763, %rs428; + cvt.s32.s8 %r764, %r763; + cvt.u32.u16 %r765, %rs427; + cvt.s32.s8 %r766, %r765; + cvt.u32.u16 %r767, %rs426; + cvt.s32.s8 %r768, %r767; + mad.lo.s32 %r769, %r56, %r768, %r760; + mad.lo.s32 %r770, %r57, %r766, %r769; + mad.lo.s32 %r771, %r58, %r764, %r770; + mad.lo.s32 %r772, %r59, %r762, %r771; + ld.const.v4.u8 {%rs434, %rs435, %rs436, %rs437}, [matrix+216]; + cvt.u32.u16 %r773, %rs437; + cvt.s32.s8 %r774, %r773; + cvt.u32.u16 %r775, %rs436; + cvt.s32.s8 %r776, %r775; + cvt.u32.u16 %r777, %rs435; + cvt.s32.s8 %r778, %r777; + cvt.u32.u16 %r779, %rs434; + cvt.s32.s8 %r780, %r779; + mad.lo.s32 %r781, %r61, %r780, %r772; + mad.lo.s32 %r782, %r62, %r778, %r781; + mad.lo.s32 %r783, %r64, %r776, %r782; + mad.lo.s32 %r784, %r65, %r774, %r783; + ld.const.v4.u8 {%rs442, %rs443, %rs444, %rs445}, [matrix+220]; + cvt.u32.u16 %r785, %rs445; + cvt.s32.s8 %r786, %r785; + cvt.u32.u16 %r787, %rs444; + cvt.s32.s8 %r788, %r787; + cvt.u32.u16 %r789, %rs443; + cvt.s32.s8 %r790, %r789; + cvt.u32.u16 %r791, %rs442; + cvt.s32.s8 %r792, %r791; + mad.lo.s32 %r793, %r67, %r792, %r784; + mad.lo.s32 %r794, %r68, %r790, %r793; + mad.lo.s32 %r795, %r69, %r788, 
%r794; + mad.lo.s32 %r796, %r70, %r786, %r795; + ld.const.v4.u8 {%rs450, %rs451, %rs452, %rs453}, [matrix+224]; + cvt.u32.u16 %r797, %rs453; + cvt.s32.s8 %r798, %r797; + cvt.u32.u16 %r799, %rs452; + cvt.s32.s8 %r800, %r799; + cvt.u32.u16 %r801, %rs451; + cvt.s32.s8 %r802, %r801; + cvt.u32.u16 %r803, %rs450; + cvt.s32.s8 %r804, %r803; + mad.lo.s32 %r805, %r222, %r804, %r796; + mad.lo.s32 %r806, %r72, %r802, %r805; + mad.lo.s32 %r807, %r73, %r800, %r806; + mad.lo.s32 %r808, %r74, %r798, %r807; + ld.const.v4.u8 {%rs458, %rs459, %rs460, %rs461}, [matrix+228]; + cvt.u32.u16 %r809, %rs461; + cvt.s32.s8 %r810, %r809; + cvt.u32.u16 %r811, %rs460; + cvt.s32.s8 %r812, %r811; + cvt.u32.u16 %r813, %rs459; + cvt.s32.s8 %r814, %r813; + cvt.u32.u16 %r815, %rs458; + cvt.s32.s8 %r816, %r815; + mad.lo.s32 %r817, %r75, %r816, %r808; + mad.lo.s32 %r818, %r76, %r814, %r817; + mad.lo.s32 %r819, %r77, %r812, %r818; + mad.lo.s32 %r820, %r78, %r810, %r819; + ld.const.v4.u8 {%rs466, %rs467, %rs468, %rs469}, [matrix+232]; + cvt.u32.u16 %r821, %rs469; + cvt.s32.s8 %r822, %r821; + cvt.u32.u16 %r823, %rs468; + cvt.s32.s8 %r824, %r823; + cvt.u32.u16 %r825, %rs467; + cvt.s32.s8 %r826, %r825; + cvt.u32.u16 %r827, %rs466; + cvt.s32.s8 %r828, %r827; + mad.lo.s32 %r829, %r80, %r828, %r820; + mad.lo.s32 %r830, %r81, %r826, %r829; + mad.lo.s32 %r831, %r83, %r824, %r830; + mad.lo.s32 %r832, %r84, %r822, %r831; + ld.const.v4.u8 {%rs474, %rs475, %rs476, %rs477}, [matrix+236]; + cvt.u32.u16 %r833, %rs477; + cvt.s32.s8 %r834, %r833; + cvt.u32.u16 %r835, %rs476; + cvt.s32.s8 %r836, %r835; + cvt.u32.u16 %r837, %rs475; + cvt.s32.s8 %r838, %r837; + cvt.u32.u16 %r839, %rs474; + cvt.s32.s8 %r840, %r839; + mad.lo.s32 %r841, %r86, %r840, %r832; + mad.lo.s32 %r842, %r87, %r838, %r841; + mad.lo.s32 %r843, %r88, %r836, %r842; + mad.lo.s32 %r844, %r89, %r834, %r843; + ld.const.v4.u8 {%rs482, %rs483, %rs484, %rs485}, [matrix+240]; + cvt.u32.u16 %r845, %rs485; + cvt.s32.s8 %r846, %r845; + cvt.u32.u16 %r847, %rs484; + cvt.s32.s8 %r848, %r847; + cvt.u32.u16 %r849, %rs483; + cvt.s32.s8 %r850, %r849; + cvt.u32.u16 %r851, %rs482; + cvt.s32.s8 %r852, %r851; + mad.lo.s32 %r853, %r271, %r852, %r844; + mad.lo.s32 %r854, %r91, %r850, %r853; + mad.lo.s32 %r855, %r93, %r848, %r854; + mad.lo.s32 %r856, %r94, %r846, %r855; + ld.const.v4.u8 {%rs490, %rs491, %rs492, %rs493}, [matrix+244]; + cvt.u32.u16 %r857, %rs493; + cvt.s32.s8 %r858, %r857; + cvt.u32.u16 %r859, %rs492; + cvt.s32.s8 %r860, %r859; + cvt.u32.u16 %r861, %rs491; + cvt.s32.s8 %r862, %r861; + cvt.u32.u16 %r863, %rs490; + cvt.s32.s8 %r864, %r863; + mad.lo.s32 %r865, %r96, %r864, %r856; + mad.lo.s32 %r866, %r97, %r862, %r865; + mad.lo.s32 %r867, %r99, %r860, %r866; + mad.lo.s32 %r868, %r100, %r858, %r867; + ld.const.v4.u8 {%rs498, %rs499, %rs500, %rs501}, [matrix+248]; + cvt.u32.u16 %r869, %rs501; + cvt.s32.s8 %r870, %r869; + cvt.u32.u16 %r871, %rs500; + cvt.s32.s8 %r872, %r871; + cvt.u32.u16 %r873, %rs499; + cvt.s32.s8 %r874, %r873; + cvt.u32.u16 %r875, %rs498; + cvt.s32.s8 %r876, %r875; + mad.lo.s32 %r877, %r103, %r876, %r868; + mad.lo.s32 %r878, %r104, %r874, %r877; + mad.lo.s32 %r879, %r107, %r872, %r878; + mad.lo.s32 %r880, %r108, %r870, %r879; + ld.const.v4.u8 {%rs506, %rs507, %rs508, %rs509}, [matrix+252]; + cvt.u32.u16 %r881, %rs509; + cvt.s32.s8 %r882, %r881; + cvt.u32.u16 %r883, %rs508; + cvt.s32.s8 %r884, %r883; + cvt.u32.u16 %r885, %rs507; + cvt.s32.s8 %r886, %r885; + cvt.u32.u16 %r887, %rs506; + cvt.s32.s8 %r888, %r887; + mad.lo.s32 %r889, %r111, %r888, %r880; + mad.lo.s32 %r890, %r112, 
%r886, %r889; + mad.lo.s32 %r891, %r114, %r884, %r890; + mad.lo.s32 %r892, %r115, %r882, %r891; + shr.u32 %r893, %r700, 6; + and.b32 %r894, %r893, 240; + shr.u32 %r895, %r892, 10; + or.b32 %r896, %r895, %r894; + xor.b32 %r897, %r13, %r896; + ld.const.v4.u8 {%rs514, %rs515, %rs516, %rs517}, [matrix+256]; + cvt.u32.u16 %r898, %rs517; + cvt.s32.s8 %r899, %r898; + cvt.u32.u16 %r900, %rs516; + cvt.s32.s8 %r901, %r900; + cvt.u32.u16 %r902, %rs514; + cvt.s32.s8 %r903, %r902; + cvt.u32.u16 %r904, %rs515; + cvt.s32.s8 %r905, %r904; + mul.lo.s32 %r906, %r34, %r905; + mad.lo.s32 %r907, %r124, %r903, %r906; + mad.lo.s32 %r908, %r35, %r901, %r907; + mad.lo.s32 %r909, %r36, %r899, %r908; + ld.const.v4.u8 {%rs522, %rs523, %rs524, %rs525}, [matrix+260]; + cvt.u32.u16 %r910, %rs525; + cvt.s32.s8 %r911, %r910; + cvt.u32.u16 %r912, %rs524; + cvt.s32.s8 %r913, %r912; + cvt.u32.u16 %r914, %rs523; + cvt.s32.s8 %r915, %r914; + cvt.u32.u16 %r916, %rs522; + cvt.s32.s8 %r917, %r916; + mad.lo.s32 %r918, %r37, %r917, %r909; + mad.lo.s32 %r919, %r38, %r915, %r918; + mad.lo.s32 %r920, %r39, %r913, %r919; + mad.lo.s32 %r921, %r40, %r911, %r920; + ld.const.v4.u8 {%rs530, %rs531, %rs532, %rs533}, [matrix+264]; + cvt.u32.u16 %r922, %rs533; + cvt.s32.s8 %r923, %r922; + cvt.u32.u16 %r924, %rs532; + cvt.s32.s8 %r925, %r924; + cvt.u32.u16 %r926, %rs531; + cvt.s32.s8 %r927, %r926; + cvt.u32.u16 %r928, %rs530; + cvt.s32.s8 %r929, %r928; + mad.lo.s32 %r930, %r42, %r929, %r921; + mad.lo.s32 %r931, %r43, %r927, %r930; + mad.lo.s32 %r932, %r45, %r925, %r931; + mad.lo.s32 %r933, %r46, %r923, %r932; + ld.const.v4.u8 {%rs538, %rs539, %rs540, %rs541}, [matrix+268]; + cvt.u32.u16 %r934, %rs541; + cvt.s32.s8 %r935, %r934; + cvt.u32.u16 %r936, %rs540; + cvt.s32.s8 %r937, %r936; + cvt.u32.u16 %r938, %rs539; + cvt.s32.s8 %r939, %r938; + cvt.u32.u16 %r940, %rs538; + cvt.s32.s8 %r941, %r940; + mad.lo.s32 %r942, %r48, %r941, %r933; + mad.lo.s32 %r943, %r49, %r939, %r942; + mad.lo.s32 %r944, %r50, %r937, %r943; + mad.lo.s32 %r945, %r51, %r935, %r944; + ld.const.v4.u8 {%rs546, %rs547, %rs548, %rs549}, [matrix+272]; + cvt.u32.u16 %r946, %rs549; + cvt.s32.s8 %r947, %r946; + cvt.u32.u16 %r948, %rs548; + cvt.s32.s8 %r949, %r948; + cvt.u32.u16 %r950, %rs547; + cvt.s32.s8 %r951, %r950; + cvt.u32.u16 %r952, %rs546; + cvt.s32.s8 %r953, %r952; + mad.lo.s32 %r954, %r173, %r953, %r945; + mad.lo.s32 %r955, %r53, %r951, %r954; + mad.lo.s32 %r956, %r54, %r949, %r955; + mad.lo.s32 %r957, %r55, %r947, %r956; + ld.const.v4.u8 {%rs554, %rs555, %rs556, %rs557}, [matrix+276]; + cvt.u32.u16 %r958, %rs557; + cvt.s32.s8 %r959, %r958; + cvt.u32.u16 %r960, %rs556; + cvt.s32.s8 %r961, %r960; + cvt.u32.u16 %r962, %rs555; + cvt.s32.s8 %r963, %r962; + cvt.u32.u16 %r964, %rs554; + cvt.s32.s8 %r965, %r964; + mad.lo.s32 %r966, %r56, %r965, %r957; + mad.lo.s32 %r967, %r57, %r963, %r966; + mad.lo.s32 %r968, %r58, %r961, %r967; + mad.lo.s32 %r969, %r59, %r959, %r968; + ld.const.v4.u8 {%rs562, %rs563, %rs564, %rs565}, [matrix+280]; + cvt.u32.u16 %r970, %rs565; + cvt.s32.s8 %r971, %r970; + cvt.u32.u16 %r972, %rs564; + cvt.s32.s8 %r973, %r972; + cvt.u32.u16 %r974, %rs563; + cvt.s32.s8 %r975, %r974; + cvt.u32.u16 %r976, %rs562; + cvt.s32.s8 %r977, %r976; + mad.lo.s32 %r978, %r61, %r977, %r969; + mad.lo.s32 %r979, %r62, %r975, %r978; + mad.lo.s32 %r980, %r64, %r973, %r979; + mad.lo.s32 %r981, %r65, %r971, %r980; + ld.const.v4.u8 {%rs570, %rs571, %rs572, %rs573}, [matrix+284]; + cvt.u32.u16 %r982, %rs573; + cvt.s32.s8 %r983, %r982; + cvt.u32.u16 %r984, %rs572; + cvt.s32.s8 %r985, %r984; 
+ cvt.u32.u16 %r986, %rs571; + cvt.s32.s8 %r987, %r986; + cvt.u32.u16 %r988, %rs570; + cvt.s32.s8 %r989, %r988; + mad.lo.s32 %r990, %r67, %r989, %r981; + mad.lo.s32 %r991, %r68, %r987, %r990; + mad.lo.s32 %r992, %r69, %r985, %r991; + mad.lo.s32 %r993, %r70, %r983, %r992; + ld.const.v4.u8 {%rs578, %rs579, %rs580, %rs581}, [matrix+288]; + cvt.u32.u16 %r994, %rs581; + cvt.s32.s8 %r995, %r994; + cvt.u32.u16 %r996, %rs580; + cvt.s32.s8 %r997, %r996; + cvt.u32.u16 %r998, %rs579; + cvt.s32.s8 %r999, %r998; + cvt.u32.u16 %r1000, %rs578; + cvt.s32.s8 %r1001, %r1000; + mad.lo.s32 %r1002, %r222, %r1001, %r993; + mad.lo.s32 %r1003, %r72, %r999, %r1002; + mad.lo.s32 %r1004, %r73, %r997, %r1003; + mad.lo.s32 %r1005, %r74, %r995, %r1004; + ld.const.v4.u8 {%rs586, %rs587, %rs588, %rs589}, [matrix+292]; + cvt.u32.u16 %r1006, %rs589; + cvt.s32.s8 %r1007, %r1006; + cvt.u32.u16 %r1008, %rs588; + cvt.s32.s8 %r1009, %r1008; + cvt.u32.u16 %r1010, %rs587; + cvt.s32.s8 %r1011, %r1010; + cvt.u32.u16 %r1012, %rs586; + cvt.s32.s8 %r1013, %r1012; + mad.lo.s32 %r1014, %r75, %r1013, %r1005; + mad.lo.s32 %r1015, %r76, %r1011, %r1014; + mad.lo.s32 %r1016, %r77, %r1009, %r1015; + mad.lo.s32 %r1017, %r78, %r1007, %r1016; + ld.const.v4.u8 {%rs594, %rs595, %rs596, %rs597}, [matrix+296]; + cvt.u32.u16 %r1018, %rs597; + cvt.s32.s8 %r1019, %r1018; + cvt.u32.u16 %r1020, %rs596; + cvt.s32.s8 %r1021, %r1020; + cvt.u32.u16 %r1022, %rs595; + cvt.s32.s8 %r1023, %r1022; + cvt.u32.u16 %r1024, %rs594; + cvt.s32.s8 %r1025, %r1024; + mad.lo.s32 %r1026, %r80, %r1025, %r1017; + mad.lo.s32 %r1027, %r81, %r1023, %r1026; + mad.lo.s32 %r1028, %r83, %r1021, %r1027; + mad.lo.s32 %r1029, %r84, %r1019, %r1028; + ld.const.v4.u8 {%rs602, %rs603, %rs604, %rs605}, [matrix+300]; + cvt.u32.u16 %r1030, %rs605; + cvt.s32.s8 %r1031, %r1030; + cvt.u32.u16 %r1032, %rs604; + cvt.s32.s8 %r1033, %r1032; + cvt.u32.u16 %r1034, %rs603; + cvt.s32.s8 %r1035, %r1034; + cvt.u32.u16 %r1036, %rs602; + cvt.s32.s8 %r1037, %r1036; + mad.lo.s32 %r1038, %r86, %r1037, %r1029; + mad.lo.s32 %r1039, %r87, %r1035, %r1038; + mad.lo.s32 %r1040, %r88, %r1033, %r1039; + mad.lo.s32 %r1041, %r89, %r1031, %r1040; + ld.const.v4.u8 {%rs610, %rs611, %rs612, %rs613}, [matrix+304]; + cvt.u32.u16 %r1042, %rs613; + cvt.s32.s8 %r1043, %r1042; + cvt.u32.u16 %r1044, %rs612; + cvt.s32.s8 %r1045, %r1044; + cvt.u32.u16 %r1046, %rs611; + cvt.s32.s8 %r1047, %r1046; + cvt.u32.u16 %r1048, %rs610; + cvt.s32.s8 %r1049, %r1048; + mad.lo.s32 %r1050, %r271, %r1049, %r1041; + mad.lo.s32 %r1051, %r91, %r1047, %r1050; + mad.lo.s32 %r1052, %r93, %r1045, %r1051; + mad.lo.s32 %r1053, %r94, %r1043, %r1052; + ld.const.v4.u8 {%rs618, %rs619, %rs620, %rs621}, [matrix+308]; + cvt.u32.u16 %r1054, %rs621; + cvt.s32.s8 %r1055, %r1054; + cvt.u32.u16 %r1056, %rs620; + cvt.s32.s8 %r1057, %r1056; + cvt.u32.u16 %r1058, %rs619; + cvt.s32.s8 %r1059, %r1058; + cvt.u32.u16 %r1060, %rs618; + cvt.s32.s8 %r1061, %r1060; + mad.lo.s32 %r1062, %r96, %r1061, %r1053; + mad.lo.s32 %r1063, %r97, %r1059, %r1062; + mad.lo.s32 %r1064, %r99, %r1057, %r1063; + mad.lo.s32 %r1065, %r100, %r1055, %r1064; + ld.const.v4.u8 {%rs626, %rs627, %rs628, %rs629}, [matrix+312]; + cvt.u32.u16 %r1066, %rs629; + cvt.s32.s8 %r1067, %r1066; + cvt.u32.u16 %r1068, %rs628; + cvt.s32.s8 %r1069, %r1068; + cvt.u32.u16 %r1070, %rs627; + cvt.s32.s8 %r1071, %r1070; + cvt.u32.u16 %r1072, %rs626; + cvt.s32.s8 %r1073, %r1072; + mad.lo.s32 %r1074, %r103, %r1073, %r1065; + mad.lo.s32 %r1075, %r104, %r1071, %r1074; + mad.lo.s32 %r1076, %r107, %r1069, %r1075; + mad.lo.s32 %r1077, 
%r108, %r1067, %r1076; + ld.const.v4.u8 {%rs634, %rs635, %rs636, %rs637}, [matrix+316]; + cvt.u32.u16 %r1078, %rs637; + cvt.s32.s8 %r1079, %r1078; + cvt.u32.u16 %r1080, %rs636; + cvt.s32.s8 %r1081, %r1080; + cvt.u32.u16 %r1082, %rs635; + cvt.s32.s8 %r1083, %r1082; + cvt.u32.u16 %r1084, %rs634; + cvt.s32.s8 %r1085, %r1084; + mad.lo.s32 %r1086, %r111, %r1085, %r1077; + mad.lo.s32 %r1087, %r112, %r1083, %r1086; + mad.lo.s32 %r1088, %r114, %r1081, %r1087; + mad.lo.s32 %r1089, %r115, %r1079, %r1088; + ld.const.v4.u8 {%rs642, %rs643, %rs644, %rs645}, [matrix+320]; + cvt.u32.u16 %r1090, %rs645; + cvt.s32.s8 %r1091, %r1090; + cvt.u32.u16 %r1092, %rs644; + cvt.s32.s8 %r1093, %r1092; + cvt.u32.u16 %r1094, %rs642; + cvt.s32.s8 %r1095, %r1094; + cvt.u32.u16 %r1096, %rs643; + cvt.s32.s8 %r1097, %r1096; + mul.lo.s32 %r1098, %r34, %r1097; + mad.lo.s32 %r1099, %r124, %r1095, %r1098; + mad.lo.s32 %r1100, %r35, %r1093, %r1099; + mad.lo.s32 %r1101, %r36, %r1091, %r1100; + ld.const.v4.u8 {%rs650, %rs651, %rs652, %rs653}, [matrix+324]; + cvt.u32.u16 %r1102, %rs653; + cvt.s32.s8 %r1103, %r1102; + cvt.u32.u16 %r1104, %rs652; + cvt.s32.s8 %r1105, %r1104; + cvt.u32.u16 %r1106, %rs651; + cvt.s32.s8 %r1107, %r1106; + cvt.u32.u16 %r1108, %rs650; + cvt.s32.s8 %r1109, %r1108; + mad.lo.s32 %r1110, %r37, %r1109, %r1101; + mad.lo.s32 %r1111, %r38, %r1107, %r1110; + mad.lo.s32 %r1112, %r39, %r1105, %r1111; + mad.lo.s32 %r1113, %r40, %r1103, %r1112; + ld.const.v4.u8 {%rs658, %rs659, %rs660, %rs661}, [matrix+328]; + cvt.u32.u16 %r1114, %rs661; + cvt.s32.s8 %r1115, %r1114; + cvt.u32.u16 %r1116, %rs660; + cvt.s32.s8 %r1117, %r1116; + cvt.u32.u16 %r1118, %rs659; + cvt.s32.s8 %r1119, %r1118; + cvt.u32.u16 %r1120, %rs658; + cvt.s32.s8 %r1121, %r1120; + mad.lo.s32 %r1122, %r42, %r1121, %r1113; + mad.lo.s32 %r1123, %r43, %r1119, %r1122; + mad.lo.s32 %r1124, %r45, %r1117, %r1123; + mad.lo.s32 %r1125, %r46, %r1115, %r1124; + ld.const.v4.u8 {%rs666, %rs667, %rs668, %rs669}, [matrix+332]; + cvt.u32.u16 %r1126, %rs669; + cvt.s32.s8 %r1127, %r1126; + cvt.u32.u16 %r1128, %rs668; + cvt.s32.s8 %r1129, %r1128; + cvt.u32.u16 %r1130, %rs667; + cvt.s32.s8 %r1131, %r1130; + cvt.u32.u16 %r1132, %rs666; + cvt.s32.s8 %r1133, %r1132; + mad.lo.s32 %r1134, %r48, %r1133, %r1125; + mad.lo.s32 %r1135, %r49, %r1131, %r1134; + mad.lo.s32 %r1136, %r50, %r1129, %r1135; + mad.lo.s32 %r1137, %r51, %r1127, %r1136; + ld.const.v4.u8 {%rs674, %rs675, %rs676, %rs677}, [matrix+336]; + cvt.u32.u16 %r1138, %rs677; + cvt.s32.s8 %r1139, %r1138; + cvt.u32.u16 %r1140, %rs676; + cvt.s32.s8 %r1141, %r1140; + cvt.u32.u16 %r1142, %rs675; + cvt.s32.s8 %r1143, %r1142; + cvt.u32.u16 %r1144, %rs674; + cvt.s32.s8 %r1145, %r1144; + mad.lo.s32 %r1146, %r173, %r1145, %r1137; + mad.lo.s32 %r1147, %r53, %r1143, %r1146; + mad.lo.s32 %r1148, %r54, %r1141, %r1147; + mad.lo.s32 %r1149, %r55, %r1139, %r1148; + ld.const.v4.u8 {%rs682, %rs683, %rs684, %rs685}, [matrix+340]; + cvt.u32.u16 %r1150, %rs685; + cvt.s32.s8 %r1151, %r1150; + cvt.u32.u16 %r1152, %rs684; + cvt.s32.s8 %r1153, %r1152; + cvt.u32.u16 %r1154, %rs683; + cvt.s32.s8 %r1155, %r1154; + cvt.u32.u16 %r1156, %rs682; + cvt.s32.s8 %r1157, %r1156; + mad.lo.s32 %r1158, %r56, %r1157, %r1149; + mad.lo.s32 %r1159, %r57, %r1155, %r1158; + mad.lo.s32 %r1160, %r58, %r1153, %r1159; + mad.lo.s32 %r1161, %r59, %r1151, %r1160; + ld.const.v4.u8 {%rs690, %rs691, %rs692, %rs693}, [matrix+344]; + cvt.u32.u16 %r1162, %rs693; + cvt.s32.s8 %r1163, %r1162; + cvt.u32.u16 %r1164, %rs692; + cvt.s32.s8 %r1165, %r1164; + cvt.u32.u16 %r1166, %rs691; + 
cvt.s32.s8 %r1167, %r1166; + cvt.u32.u16 %r1168, %rs690; + cvt.s32.s8 %r1169, %r1168; + mad.lo.s32 %r1170, %r61, %r1169, %r1161; + mad.lo.s32 %r1171, %r62, %r1167, %r1170; + mad.lo.s32 %r1172, %r64, %r1165, %r1171; + mad.lo.s32 %r1173, %r65, %r1163, %r1172; + ld.const.v4.u8 {%rs698, %rs699, %rs700, %rs701}, [matrix+348]; + cvt.u32.u16 %r1174, %rs701; + cvt.s32.s8 %r1175, %r1174; + cvt.u32.u16 %r1176, %rs700; + cvt.s32.s8 %r1177, %r1176; + cvt.u32.u16 %r1178, %rs699; + cvt.s32.s8 %r1179, %r1178; + cvt.u32.u16 %r1180, %rs698; + cvt.s32.s8 %r1181, %r1180; + mad.lo.s32 %r1182, %r67, %r1181, %r1173; + mad.lo.s32 %r1183, %r68, %r1179, %r1182; + mad.lo.s32 %r1184, %r69, %r1177, %r1183; + mad.lo.s32 %r1185, %r70, %r1175, %r1184; + ld.const.v4.u8 {%rs706, %rs707, %rs708, %rs709}, [matrix+352]; + cvt.u32.u16 %r1186, %rs709; + cvt.s32.s8 %r1187, %r1186; + cvt.u32.u16 %r1188, %rs708; + cvt.s32.s8 %r1189, %r1188; + cvt.u32.u16 %r1190, %rs707; + cvt.s32.s8 %r1191, %r1190; + cvt.u32.u16 %r1192, %rs706; + cvt.s32.s8 %r1193, %r1192; + mad.lo.s32 %r1194, %r222, %r1193, %r1185; + mad.lo.s32 %r1195, %r72, %r1191, %r1194; + mad.lo.s32 %r1196, %r73, %r1189, %r1195; + mad.lo.s32 %r1197, %r74, %r1187, %r1196; + ld.const.v4.u8 {%rs714, %rs715, %rs716, %rs717}, [matrix+356]; + cvt.u32.u16 %r1198, %rs717; + cvt.s32.s8 %r1199, %r1198; + cvt.u32.u16 %r1200, %rs716; + cvt.s32.s8 %r1201, %r1200; + cvt.u32.u16 %r1202, %rs715; + cvt.s32.s8 %r1203, %r1202; + cvt.u32.u16 %r1204, %rs714; + cvt.s32.s8 %r1205, %r1204; + mad.lo.s32 %r1206, %r75, %r1205, %r1197; + mad.lo.s32 %r1207, %r76, %r1203, %r1206; + mad.lo.s32 %r1208, %r77, %r1201, %r1207; + mad.lo.s32 %r1209, %r78, %r1199, %r1208; + ld.const.v4.u8 {%rs722, %rs723, %rs724, %rs725}, [matrix+360]; + cvt.u32.u16 %r1210, %rs725; + cvt.s32.s8 %r1211, %r1210; + cvt.u32.u16 %r1212, %rs724; + cvt.s32.s8 %r1213, %r1212; + cvt.u32.u16 %r1214, %rs723; + cvt.s32.s8 %r1215, %r1214; + cvt.u32.u16 %r1216, %rs722; + cvt.s32.s8 %r1217, %r1216; + mad.lo.s32 %r1218, %r80, %r1217, %r1209; + mad.lo.s32 %r1219, %r81, %r1215, %r1218; + mad.lo.s32 %r1220, %r83, %r1213, %r1219; + mad.lo.s32 %r1221, %r84, %r1211, %r1220; + ld.const.v4.u8 {%rs730, %rs731, %rs732, %rs733}, [matrix+364]; + cvt.u32.u16 %r1222, %rs733; + cvt.s32.s8 %r1223, %r1222; + cvt.u32.u16 %r1224, %rs732; + cvt.s32.s8 %r1225, %r1224; + cvt.u32.u16 %r1226, %rs731; + cvt.s32.s8 %r1227, %r1226; + cvt.u32.u16 %r1228, %rs730; + cvt.s32.s8 %r1229, %r1228; + mad.lo.s32 %r1230, %r86, %r1229, %r1221; + mad.lo.s32 %r1231, %r87, %r1227, %r1230; + mad.lo.s32 %r1232, %r88, %r1225, %r1231; + mad.lo.s32 %r1233, %r89, %r1223, %r1232; + ld.const.v4.u8 {%rs738, %rs739, %rs740, %rs741}, [matrix+368]; + cvt.u32.u16 %r1234, %rs741; + cvt.s32.s8 %r1235, %r1234; + cvt.u32.u16 %r1236, %rs740; + cvt.s32.s8 %r1237, %r1236; + cvt.u32.u16 %r1238, %rs739; + cvt.s32.s8 %r1239, %r1238; + cvt.u32.u16 %r1240, %rs738; + cvt.s32.s8 %r1241, %r1240; + mad.lo.s32 %r1242, %r271, %r1241, %r1233; + mad.lo.s32 %r1243, %r91, %r1239, %r1242; + mad.lo.s32 %r1244, %r93, %r1237, %r1243; + mad.lo.s32 %r1245, %r94, %r1235, %r1244; + ld.const.v4.u8 {%rs746, %rs747, %rs748, %rs749}, [matrix+372]; + cvt.u32.u16 %r1246, %rs749; + cvt.s32.s8 %r1247, %r1246; + cvt.u32.u16 %r1248, %rs748; + cvt.s32.s8 %r1249, %r1248; + cvt.u32.u16 %r1250, %rs747; + cvt.s32.s8 %r1251, %r1250; + cvt.u32.u16 %r1252, %rs746; + cvt.s32.s8 %r1253, %r1252; + mad.lo.s32 %r1254, %r96, %r1253, %r1245; + mad.lo.s32 %r1255, %r97, %r1251, %r1254; + mad.lo.s32 %r1256, %r99, %r1249, %r1255; + mad.lo.s32 %r1257, 
%r100, %r1247, %r1256; + ld.const.v4.u8 {%rs754, %rs755, %rs756, %rs757}, [matrix+376]; + cvt.u32.u16 %r1258, %rs757; + cvt.s32.s8 %r1259, %r1258; + cvt.u32.u16 %r1260, %rs756; + cvt.s32.s8 %r1261, %r1260; + cvt.u32.u16 %r1262, %rs755; + cvt.s32.s8 %r1263, %r1262; + cvt.u32.u16 %r1264, %rs754; + cvt.s32.s8 %r1265, %r1264; + mad.lo.s32 %r1266, %r103, %r1265, %r1257; + mad.lo.s32 %r1267, %r104, %r1263, %r1266; + mad.lo.s32 %r1268, %r107, %r1261, %r1267; + mad.lo.s32 %r1269, %r108, %r1259, %r1268; + ld.const.v4.u8 {%rs762, %rs763, %rs764, %rs765}, [matrix+380]; + cvt.u32.u16 %r1270, %rs765; + cvt.s32.s8 %r1271, %r1270; + cvt.u32.u16 %r1272, %rs764; + cvt.s32.s8 %r1273, %r1272; + cvt.u32.u16 %r1274, %rs763; + cvt.s32.s8 %r1275, %r1274; + cvt.u32.u16 %r1276, %rs762; + cvt.s32.s8 %r1277, %r1276; + mad.lo.s32 %r1278, %r111, %r1277, %r1269; + mad.lo.s32 %r1279, %r112, %r1275, %r1278; + mad.lo.s32 %r1280, %r114, %r1273, %r1279; + mad.lo.s32 %r1281, %r115, %r1271, %r1280; + shr.u32 %r1282, %r1089, 6; + and.b32 %r1283, %r1282, 240; + shr.u32 %r1284, %r1281, 10; + or.b32 %r1285, %r1284, %r1283; + xor.b32 %r1286, %r14, %r1285; + cvt.u64.u32 %rd382, %r1286; + ld.const.v4.u8 {%rs770, %rs771, %rs772, %rs773}, [matrix+384]; + cvt.u32.u16 %r1287, %rs773; + cvt.s32.s8 %r1288, %r1287; + cvt.u32.u16 %r1289, %rs772; + cvt.s32.s8 %r1290, %r1289; + cvt.u32.u16 %r1291, %rs770; + cvt.s32.s8 %r1292, %r1291; + cvt.u32.u16 %r1293, %rs771; + cvt.s32.s8 %r1294, %r1293; + mul.lo.s32 %r1295, %r34, %r1294; + mad.lo.s32 %r1296, %r124, %r1292, %r1295; + mad.lo.s32 %r1297, %r35, %r1290, %r1296; + mad.lo.s32 %r1298, %r36, %r1288, %r1297; + ld.const.v4.u8 {%rs778, %rs779, %rs780, %rs781}, [matrix+388]; + cvt.u32.u16 %r1299, %rs781; + cvt.s32.s8 %r1300, %r1299; + cvt.u32.u16 %r1301, %rs780; + cvt.s32.s8 %r1302, %r1301; + cvt.u32.u16 %r1303, %rs779; + cvt.s32.s8 %r1304, %r1303; + cvt.u32.u16 %r1305, %rs778; + cvt.s32.s8 %r1306, %r1305; + mad.lo.s32 %r1307, %r37, %r1306, %r1298; + mad.lo.s32 %r1308, %r38, %r1304, %r1307; + mad.lo.s32 %r1309, %r39, %r1302, %r1308; + mad.lo.s32 %r1310, %r40, %r1300, %r1309; + ld.const.v4.u8 {%rs786, %rs787, %rs788, %rs789}, [matrix+392]; + cvt.u32.u16 %r1311, %rs789; + cvt.s32.s8 %r1312, %r1311; + cvt.u32.u16 %r1313, %rs788; + cvt.s32.s8 %r1314, %r1313; + cvt.u32.u16 %r1315, %rs787; + cvt.s32.s8 %r1316, %r1315; + cvt.u32.u16 %r1317, %rs786; + cvt.s32.s8 %r1318, %r1317; + mad.lo.s32 %r1319, %r42, %r1318, %r1310; + mad.lo.s32 %r1320, %r43, %r1316, %r1319; + mad.lo.s32 %r1321, %r45, %r1314, %r1320; + mad.lo.s32 %r1322, %r46, %r1312, %r1321; + ld.const.v4.u8 {%rs794, %rs795, %rs796, %rs797}, [matrix+396]; + cvt.u32.u16 %r1323, %rs797; + cvt.s32.s8 %r1324, %r1323; + cvt.u32.u16 %r1325, %rs796; + cvt.s32.s8 %r1326, %r1325; + cvt.u32.u16 %r1327, %rs795; + cvt.s32.s8 %r1328, %r1327; + cvt.u32.u16 %r1329, %rs794; + cvt.s32.s8 %r1330, %r1329; + mad.lo.s32 %r1331, %r48, %r1330, %r1322; + mad.lo.s32 %r1332, %r49, %r1328, %r1331; + mad.lo.s32 %r1333, %r50, %r1326, %r1332; + mad.lo.s32 %r1334, %r51, %r1324, %r1333; + ld.const.v4.u8 {%rs802, %rs803, %rs804, %rs805}, [matrix+400]; + cvt.u32.u16 %r1335, %rs805; + cvt.s32.s8 %r1336, %r1335; + cvt.u32.u16 %r1337, %rs804; + cvt.s32.s8 %r1338, %r1337; + cvt.u32.u16 %r1339, %rs803; + cvt.s32.s8 %r1340, %r1339; + cvt.u32.u16 %r1341, %rs802; + cvt.s32.s8 %r1342, %r1341; + mad.lo.s32 %r1343, %r173, %r1342, %r1334; + mad.lo.s32 %r1344, %r53, %r1340, %r1343; + mad.lo.s32 %r1345, %r54, %r1338, %r1344; + mad.lo.s32 %r1346, %r55, %r1336, %r1345; + ld.const.v4.u8 {%rs810, %rs811, 
%rs812, %rs813}, [matrix+404]; + cvt.u32.u16 %r1347, %rs813; + cvt.s32.s8 %r1348, %r1347; + cvt.u32.u16 %r1349, %rs812; + cvt.s32.s8 %r1350, %r1349; + cvt.u32.u16 %r1351, %rs811; + cvt.s32.s8 %r1352, %r1351; + cvt.u32.u16 %r1353, %rs810; + cvt.s32.s8 %r1354, %r1353; + mad.lo.s32 %r1355, %r56, %r1354, %r1346; + mad.lo.s32 %r1356, %r57, %r1352, %r1355; + mad.lo.s32 %r1357, %r58, %r1350, %r1356; + mad.lo.s32 %r1358, %r59, %r1348, %r1357; + ld.const.v4.u8 {%rs818, %rs819, %rs820, %rs821}, [matrix+408]; + cvt.u32.u16 %r1359, %rs821; + cvt.s32.s8 %r1360, %r1359; + cvt.u32.u16 %r1361, %rs820; + cvt.s32.s8 %r1362, %r1361; + cvt.u32.u16 %r1363, %rs819; + cvt.s32.s8 %r1364, %r1363; + cvt.u32.u16 %r1365, %rs818; + cvt.s32.s8 %r1366, %r1365; + mad.lo.s32 %r1367, %r61, %r1366, %r1358; + mad.lo.s32 %r1368, %r62, %r1364, %r1367; + mad.lo.s32 %r1369, %r64, %r1362, %r1368; + mad.lo.s32 %r1370, %r65, %r1360, %r1369; + ld.const.v4.u8 {%rs826, %rs827, %rs828, %rs829}, [matrix+412]; + cvt.u32.u16 %r1371, %rs829; + cvt.s32.s8 %r1372, %r1371; + cvt.u32.u16 %r1373, %rs828; + cvt.s32.s8 %r1374, %r1373; + cvt.u32.u16 %r1375, %rs827; + cvt.s32.s8 %r1376, %r1375; + cvt.u32.u16 %r1377, %rs826; + cvt.s32.s8 %r1378, %r1377; + mad.lo.s32 %r1379, %r67, %r1378, %r1370; + mad.lo.s32 %r1380, %r68, %r1376, %r1379; + mad.lo.s32 %r1381, %r69, %r1374, %r1380; + mad.lo.s32 %r1382, %r70, %r1372, %r1381; + ld.const.v4.u8 {%rs834, %rs835, %rs836, %rs837}, [matrix+416]; + cvt.u32.u16 %r1383, %rs837; + cvt.s32.s8 %r1384, %r1383; + cvt.u32.u16 %r1385, %rs836; + cvt.s32.s8 %r1386, %r1385; + cvt.u32.u16 %r1387, %rs835; + cvt.s32.s8 %r1388, %r1387; + cvt.u32.u16 %r1389, %rs834; + cvt.s32.s8 %r1390, %r1389; + mad.lo.s32 %r1391, %r222, %r1390, %r1382; + mad.lo.s32 %r1392, %r72, %r1388, %r1391; + mad.lo.s32 %r1393, %r73, %r1386, %r1392; + mad.lo.s32 %r1394, %r74, %r1384, %r1393; + ld.const.v4.u8 {%rs842, %rs843, %rs844, %rs845}, [matrix+420]; + cvt.u32.u16 %r1395, %rs845; + cvt.s32.s8 %r1396, %r1395; + cvt.u32.u16 %r1397, %rs844; + cvt.s32.s8 %r1398, %r1397; + cvt.u32.u16 %r1399, %rs843; + cvt.s32.s8 %r1400, %r1399; + cvt.u32.u16 %r1401, %rs842; + cvt.s32.s8 %r1402, %r1401; + mad.lo.s32 %r1403, %r75, %r1402, %r1394; + mad.lo.s32 %r1404, %r76, %r1400, %r1403; + mad.lo.s32 %r1405, %r77, %r1398, %r1404; + mad.lo.s32 %r1406, %r78, %r1396, %r1405; + ld.const.v4.u8 {%rs850, %rs851, %rs852, %rs853}, [matrix+424]; + cvt.u32.u16 %r1407, %rs853; + cvt.s32.s8 %r1408, %r1407; + cvt.u32.u16 %r1409, %rs852; + cvt.s32.s8 %r1410, %r1409; + cvt.u32.u16 %r1411, %rs851; + cvt.s32.s8 %r1412, %r1411; + cvt.u32.u16 %r1413, %rs850; + cvt.s32.s8 %r1414, %r1413; + mad.lo.s32 %r1415, %r80, %r1414, %r1406; + mad.lo.s32 %r1416, %r81, %r1412, %r1415; + mad.lo.s32 %r1417, %r83, %r1410, %r1416; + mad.lo.s32 %r1418, %r84, %r1408, %r1417; + ld.const.v4.u8 {%rs858, %rs859, %rs860, %rs861}, [matrix+428]; + cvt.u32.u16 %r1419, %rs861; + cvt.s32.s8 %r1420, %r1419; + cvt.u32.u16 %r1421, %rs860; + cvt.s32.s8 %r1422, %r1421; + cvt.u32.u16 %r1423, %rs859; + cvt.s32.s8 %r1424, %r1423; + cvt.u32.u16 %r1425, %rs858; + cvt.s32.s8 %r1426, %r1425; + mad.lo.s32 %r1427, %r86, %r1426, %r1418; + mad.lo.s32 %r1428, %r87, %r1424, %r1427; + mad.lo.s32 %r1429, %r88, %r1422, %r1428; + mad.lo.s32 %r1430, %r89, %r1420, %r1429; + ld.const.v4.u8 {%rs866, %rs867, %rs868, %rs869}, [matrix+432]; + cvt.u32.u16 %r1431, %rs869; + cvt.s32.s8 %r1432, %r1431; + cvt.u32.u16 %r1433, %rs868; + cvt.s32.s8 %r1434, %r1433; + cvt.u32.u16 %r1435, %rs867; + cvt.s32.s8 %r1436, %r1435; + cvt.u32.u16 %r1437, %rs866; + 
cvt.s32.s8 %r1438, %r1437; + mad.lo.s32 %r1439, %r271, %r1438, %r1430; + mad.lo.s32 %r1440, %r91, %r1436, %r1439; + mad.lo.s32 %r1441, %r93, %r1434, %r1440; + mad.lo.s32 %r1442, %r94, %r1432, %r1441; + ld.const.v4.u8 {%rs874, %rs875, %rs876, %rs877}, [matrix+436]; + cvt.u32.u16 %r1443, %rs877; + cvt.s32.s8 %r1444, %r1443; + cvt.u32.u16 %r1445, %rs876; + cvt.s32.s8 %r1446, %r1445; + cvt.u32.u16 %r1447, %rs875; + cvt.s32.s8 %r1448, %r1447; + cvt.u32.u16 %r1449, %rs874; + cvt.s32.s8 %r1450, %r1449; + mad.lo.s32 %r1451, %r96, %r1450, %r1442; + mad.lo.s32 %r1452, %r97, %r1448, %r1451; + mad.lo.s32 %r1453, %r99, %r1446, %r1452; + mad.lo.s32 %r1454, %r100, %r1444, %r1453; + ld.const.v4.u8 {%rs882, %rs883, %rs884, %rs885}, [matrix+440]; + cvt.u32.u16 %r1455, %rs885; + cvt.s32.s8 %r1456, %r1455; + cvt.u32.u16 %r1457, %rs884; + cvt.s32.s8 %r1458, %r1457; + cvt.u32.u16 %r1459, %rs883; + cvt.s32.s8 %r1460, %r1459; + cvt.u32.u16 %r1461, %rs882; + cvt.s32.s8 %r1462, %r1461; + mad.lo.s32 %r1463, %r103, %r1462, %r1454; + mad.lo.s32 %r1464, %r104, %r1460, %r1463; + mad.lo.s32 %r1465, %r107, %r1458, %r1464; + mad.lo.s32 %r1466, %r108, %r1456, %r1465; + ld.const.v4.u8 {%rs890, %rs891, %rs892, %rs893}, [matrix+444]; + cvt.u32.u16 %r1467, %rs893; + cvt.s32.s8 %r1468, %r1467; + cvt.u32.u16 %r1469, %rs892; + cvt.s32.s8 %r1470, %r1469; + cvt.u32.u16 %r1471, %rs891; + cvt.s32.s8 %r1472, %r1471; + cvt.u32.u16 %r1473, %rs890; + cvt.s32.s8 %r1474, %r1473; + mad.lo.s32 %r1475, %r111, %r1474, %r1466; + mad.lo.s32 %r1476, %r112, %r1472, %r1475; + mad.lo.s32 %r1477, %r114, %r1470, %r1476; + mad.lo.s32 %r1478, %r115, %r1468, %r1477; + ld.const.v4.u8 {%rs898, %rs899, %rs900, %rs901}, [matrix+448]; + cvt.u32.u16 %r1479, %rs901; + cvt.s32.s8 %r1480, %r1479; + cvt.u32.u16 %r1481, %rs900; + cvt.s32.s8 %r1482, %r1481; + cvt.u32.u16 %r1483, %rs898; + cvt.s32.s8 %r1484, %r1483; + cvt.u32.u16 %r1485, %rs899; + cvt.s32.s8 %r1486, %r1485; + mul.lo.s32 %r1487, %r34, %r1486; + mad.lo.s32 %r1488, %r124, %r1484, %r1487; + mad.lo.s32 %r1489, %r35, %r1482, %r1488; + mad.lo.s32 %r1490, %r36, %r1480, %r1489; + ld.const.v4.u8 {%rs906, %rs907, %rs908, %rs909}, [matrix+452]; + cvt.u32.u16 %r1491, %rs909; + cvt.s32.s8 %r1492, %r1491; + cvt.u32.u16 %r1493, %rs908; + cvt.s32.s8 %r1494, %r1493; + cvt.u32.u16 %r1495, %rs907; + cvt.s32.s8 %r1496, %r1495; + cvt.u32.u16 %r1497, %rs906; + cvt.s32.s8 %r1498, %r1497; + mad.lo.s32 %r1499, %r37, %r1498, %r1490; + mad.lo.s32 %r1500, %r38, %r1496, %r1499; + mad.lo.s32 %r1501, %r39, %r1494, %r1500; + mad.lo.s32 %r1502, %r40, %r1492, %r1501; + ld.const.v4.u8 {%rs914, %rs915, %rs916, %rs917}, [matrix+456]; + cvt.u32.u16 %r1503, %rs917; + cvt.s32.s8 %r1504, %r1503; + cvt.u32.u16 %r1505, %rs916; + cvt.s32.s8 %r1506, %r1505; + cvt.u32.u16 %r1507, %rs915; + cvt.s32.s8 %r1508, %r1507; + cvt.u32.u16 %r1509, %rs914; + cvt.s32.s8 %r1510, %r1509; + mad.lo.s32 %r1511, %r42, %r1510, %r1502; + mad.lo.s32 %r1512, %r43, %r1508, %r1511; + mad.lo.s32 %r1513, %r45, %r1506, %r1512; + mad.lo.s32 %r1514, %r46, %r1504, %r1513; + ld.const.v4.u8 {%rs922, %rs923, %rs924, %rs925}, [matrix+460]; + cvt.u32.u16 %r1515, %rs925; + cvt.s32.s8 %r1516, %r1515; + cvt.u32.u16 %r1517, %rs924; + cvt.s32.s8 %r1518, %r1517; + cvt.u32.u16 %r1519, %rs923; + cvt.s32.s8 %r1520, %r1519; + cvt.u32.u16 %r1521, %rs922; + cvt.s32.s8 %r1522, %r1521; + mad.lo.s32 %r1523, %r48, %r1522, %r1514; + mad.lo.s32 %r1524, %r49, %r1520, %r1523; + mad.lo.s32 %r1525, %r50, %r1518, %r1524; + mad.lo.s32 %r1526, %r51, %r1516, %r1525; + ld.const.v4.u8 {%rs930, %rs931, %rs932, 
%rs933}, [matrix+464]; + cvt.u32.u16 %r1527, %rs933; + cvt.s32.s8 %r1528, %r1527; + cvt.u32.u16 %r1529, %rs932; + cvt.s32.s8 %r1530, %r1529; + cvt.u32.u16 %r1531, %rs931; + cvt.s32.s8 %r1532, %r1531; + cvt.u32.u16 %r1533, %rs930; + cvt.s32.s8 %r1534, %r1533; + mad.lo.s32 %r1535, %r173, %r1534, %r1526; + mad.lo.s32 %r1536, %r53, %r1532, %r1535; + mad.lo.s32 %r1537, %r54, %r1530, %r1536; + mad.lo.s32 %r1538, %r55, %r1528, %r1537; + ld.const.v4.u8 {%rs938, %rs939, %rs940, %rs941}, [matrix+468]; + cvt.u32.u16 %r1539, %rs941; + cvt.s32.s8 %r1540, %r1539; + cvt.u32.u16 %r1541, %rs940; + cvt.s32.s8 %r1542, %r1541; + cvt.u32.u16 %r1543, %rs939; + cvt.s32.s8 %r1544, %r1543; + cvt.u32.u16 %r1545, %rs938; + cvt.s32.s8 %r1546, %r1545; + mad.lo.s32 %r1547, %r56, %r1546, %r1538; + mad.lo.s32 %r1548, %r57, %r1544, %r1547; + mad.lo.s32 %r1549, %r58, %r1542, %r1548; + mad.lo.s32 %r1550, %r59, %r1540, %r1549; + ld.const.v4.u8 {%rs946, %rs947, %rs948, %rs949}, [matrix+472]; + cvt.u32.u16 %r1551, %rs949; + cvt.s32.s8 %r1552, %r1551; + cvt.u32.u16 %r1553, %rs948; + cvt.s32.s8 %r1554, %r1553; + cvt.u32.u16 %r1555, %rs947; + cvt.s32.s8 %r1556, %r1555; + cvt.u32.u16 %r1557, %rs946; + cvt.s32.s8 %r1558, %r1557; + mad.lo.s32 %r1559, %r61, %r1558, %r1550; + mad.lo.s32 %r1560, %r62, %r1556, %r1559; + mad.lo.s32 %r1561, %r64, %r1554, %r1560; + mad.lo.s32 %r1562, %r65, %r1552, %r1561; + ld.const.v4.u8 {%rs954, %rs955, %rs956, %rs957}, [matrix+476]; + cvt.u32.u16 %r1563, %rs957; + cvt.s32.s8 %r1564, %r1563; + cvt.u32.u16 %r1565, %rs956; + cvt.s32.s8 %r1566, %r1565; + cvt.u32.u16 %r1567, %rs955; + cvt.s32.s8 %r1568, %r1567; + cvt.u32.u16 %r1569, %rs954; + cvt.s32.s8 %r1570, %r1569; + mad.lo.s32 %r1571, %r67, %r1570, %r1562; + mad.lo.s32 %r1572, %r68, %r1568, %r1571; + mad.lo.s32 %r1573, %r69, %r1566, %r1572; + mad.lo.s32 %r1574, %r70, %r1564, %r1573; + ld.const.v4.u8 {%rs962, %rs963, %rs964, %rs965}, [matrix+480]; + cvt.u32.u16 %r1575, %rs965; + cvt.s32.s8 %r1576, %r1575; + cvt.u32.u16 %r1577, %rs964; + cvt.s32.s8 %r1578, %r1577; + cvt.u32.u16 %r1579, %rs963; + cvt.s32.s8 %r1580, %r1579; + cvt.u32.u16 %r1581, %rs962; + cvt.s32.s8 %r1582, %r1581; + mad.lo.s32 %r1583, %r222, %r1582, %r1574; + mad.lo.s32 %r1584, %r72, %r1580, %r1583; + mad.lo.s32 %r1585, %r73, %r1578, %r1584; + mad.lo.s32 %r1586, %r74, %r1576, %r1585; + ld.const.v4.u8 {%rs970, %rs971, %rs972, %rs973}, [matrix+484]; + cvt.u32.u16 %r1587, %rs973; + cvt.s32.s8 %r1588, %r1587; + cvt.u32.u16 %r1589, %rs972; + cvt.s32.s8 %r1590, %r1589; + cvt.u32.u16 %r1591, %rs971; + cvt.s32.s8 %r1592, %r1591; + cvt.u32.u16 %r1593, %rs970; + cvt.s32.s8 %r1594, %r1593; + mad.lo.s32 %r1595, %r75, %r1594, %r1586; + mad.lo.s32 %r1596, %r76, %r1592, %r1595; + mad.lo.s32 %r1597, %r77, %r1590, %r1596; + mad.lo.s32 %r1598, %r78, %r1588, %r1597; + ld.const.v4.u8 {%rs978, %rs979, %rs980, %rs981}, [matrix+488]; + cvt.u32.u16 %r1599, %rs981; + cvt.s32.s8 %r1600, %r1599; + cvt.u32.u16 %r1601, %rs980; + cvt.s32.s8 %r1602, %r1601; + cvt.u32.u16 %r1603, %rs979; + cvt.s32.s8 %r1604, %r1603; + cvt.u32.u16 %r1605, %rs978; + cvt.s32.s8 %r1606, %r1605; + mad.lo.s32 %r1607, %r80, %r1606, %r1598; + mad.lo.s32 %r1608, %r81, %r1604, %r1607; + mad.lo.s32 %r1609, %r83, %r1602, %r1608; + mad.lo.s32 %r1610, %r84, %r1600, %r1609; + ld.const.v4.u8 {%rs986, %rs987, %rs988, %rs989}, [matrix+492]; + cvt.u32.u16 %r1611, %rs989; + cvt.s32.s8 %r1612, %r1611; + cvt.u32.u16 %r1613, %rs988; + cvt.s32.s8 %r1614, %r1613; + cvt.u32.u16 %r1615, %rs987; + cvt.s32.s8 %r1616, %r1615; + cvt.u32.u16 %r1617, %rs986; + 
cvt.s32.s8 %r1618, %r1617; + mad.lo.s32 %r1619, %r86, %r1618, %r1610; + mad.lo.s32 %r1620, %r87, %r1616, %r1619; + mad.lo.s32 %r1621, %r88, %r1614, %r1620; + mad.lo.s32 %r1622, %r89, %r1612, %r1621; + ld.const.v4.u8 {%rs994, %rs995, %rs996, %rs997}, [matrix+496]; + cvt.u32.u16 %r1623, %rs997; + cvt.s32.s8 %r1624, %r1623; + cvt.u32.u16 %r1625, %rs996; + cvt.s32.s8 %r1626, %r1625; + cvt.u32.u16 %r1627, %rs995; + cvt.s32.s8 %r1628, %r1627; + cvt.u32.u16 %r1629, %rs994; + cvt.s32.s8 %r1630, %r1629; + mad.lo.s32 %r1631, %r271, %r1630, %r1622; + mad.lo.s32 %r1632, %r91, %r1628, %r1631; + mad.lo.s32 %r1633, %r93, %r1626, %r1632; + mad.lo.s32 %r1634, %r94, %r1624, %r1633; + ld.const.v4.u8 {%rs1002, %rs1003, %rs1004, %rs1005}, [matrix+500]; + cvt.u32.u16 %r1635, %rs1005; + cvt.s32.s8 %r1636, %r1635; + cvt.u32.u16 %r1637, %rs1004; + cvt.s32.s8 %r1638, %r1637; + cvt.u32.u16 %r1639, %rs1003; + cvt.s32.s8 %r1640, %r1639; + cvt.u32.u16 %r1641, %rs1002; + cvt.s32.s8 %r1642, %r1641; + mad.lo.s32 %r1643, %r96, %r1642, %r1634; + mad.lo.s32 %r1644, %r97, %r1640, %r1643; + mad.lo.s32 %r1645, %r99, %r1638, %r1644; + mad.lo.s32 %r1646, %r100, %r1636, %r1645; + ld.const.v4.u8 {%rs1010, %rs1011, %rs1012, %rs1013}, [matrix+504]; + cvt.u32.u16 %r1647, %rs1013; + cvt.s32.s8 %r1648, %r1647; + cvt.u32.u16 %r1649, %rs1012; + cvt.s32.s8 %r1650, %r1649; + cvt.u32.u16 %r1651, %rs1011; + cvt.s32.s8 %r1652, %r1651; + cvt.u32.u16 %r1653, %rs1010; + cvt.s32.s8 %r1654, %r1653; + mad.lo.s32 %r1655, %r103, %r1654, %r1646; + mad.lo.s32 %r1656, %r104, %r1652, %r1655; + mad.lo.s32 %r1657, %r107, %r1650, %r1656; + mad.lo.s32 %r1658, %r108, %r1648, %r1657; + ld.const.v4.u8 {%rs1018, %rs1019, %rs1020, %rs1021}, [matrix+508]; + cvt.u32.u16 %r1659, %rs1021; + cvt.s32.s8 %r1660, %r1659; + cvt.u32.u16 %r1661, %rs1020; + cvt.s32.s8 %r1662, %r1661; + cvt.u32.u16 %r1663, %rs1019; + cvt.s32.s8 %r1664, %r1663; + cvt.u32.u16 %r1665, %rs1018; + cvt.s32.s8 %r1666, %r1665; + mad.lo.s32 %r1667, %r111, %r1666, %r1658; + mad.lo.s32 %r1668, %r112, %r1664, %r1667; + mad.lo.s32 %r1669, %r114, %r1662, %r1668; + mad.lo.s32 %r1670, %r115, %r1660, %r1669; + shr.u32 %r1671, %r1478, 6; + and.b32 %r1672, %r1671, 240; + shr.u32 %r1673, %r1670, 10; + or.b32 %r1674, %r1673, %r1672; + xor.b32 %r1675, %r15, %r1674; + cvt.u64.u32 %rd383, %r1675; + ld.const.v4.u8 {%rs1026, %rs1027, %rs1028, %rs1029}, [matrix+512]; + cvt.u32.u16 %r1676, %rs1029; + cvt.s32.s8 %r1677, %r1676; + cvt.u32.u16 %r1678, %rs1028; + cvt.s32.s8 %r1679, %r1678; + cvt.u32.u16 %r1680, %rs1026; + cvt.s32.s8 %r1681, %r1680; + cvt.u32.u16 %r1682, %rs1027; + cvt.s32.s8 %r1683, %r1682; + mul.lo.s32 %r1684, %r34, %r1683; + mad.lo.s32 %r1685, %r124, %r1681, %r1684; + mad.lo.s32 %r1686, %r35, %r1679, %r1685; + mad.lo.s32 %r1687, %r36, %r1677, %r1686; + ld.const.v4.u8 {%rs1034, %rs1035, %rs1036, %rs1037}, [matrix+516]; + cvt.u32.u16 %r1688, %rs1037; + cvt.s32.s8 %r1689, %r1688; + cvt.u32.u16 %r1690, %rs1036; + cvt.s32.s8 %r1691, %r1690; + cvt.u32.u16 %r1692, %rs1035; + cvt.s32.s8 %r1693, %r1692; + cvt.u32.u16 %r1694, %rs1034; + cvt.s32.s8 %r1695, %r1694; + mad.lo.s32 %r1696, %r37, %r1695, %r1687; + mad.lo.s32 %r1697, %r38, %r1693, %r1696; + mad.lo.s32 %r1698, %r39, %r1691, %r1697; + mad.lo.s32 %r1699, %r40, %r1689, %r1698; + ld.const.v4.u8 {%rs1042, %rs1043, %rs1044, %rs1045}, [matrix+520]; + cvt.u32.u16 %r1700, %rs1045; + cvt.s32.s8 %r1701, %r1700; + cvt.u32.u16 %r1702, %rs1044; + cvt.s32.s8 %r1703, %r1702; + cvt.u32.u16 %r1704, %rs1043; + cvt.s32.s8 %r1705, %r1704; + cvt.u32.u16 %r1706, %rs1042; + 
cvt.s32.s8 %r1707, %r1706; + mad.lo.s32 %r1708, %r42, %r1707, %r1699; + mad.lo.s32 %r1709, %r43, %r1705, %r1708; + mad.lo.s32 %r1710, %r45, %r1703, %r1709; + mad.lo.s32 %r1711, %r46, %r1701, %r1710; + ld.const.v4.u8 {%rs1050, %rs1051, %rs1052, %rs1053}, [matrix+524]; + cvt.u32.u16 %r1712, %rs1053; + cvt.s32.s8 %r1713, %r1712; + cvt.u32.u16 %r1714, %rs1052; + cvt.s32.s8 %r1715, %r1714; + cvt.u32.u16 %r1716, %rs1051; + cvt.s32.s8 %r1717, %r1716; + cvt.u32.u16 %r1718, %rs1050; + cvt.s32.s8 %r1719, %r1718; + mad.lo.s32 %r1720, %r48, %r1719, %r1711; + mad.lo.s32 %r1721, %r49, %r1717, %r1720; + mad.lo.s32 %r1722, %r50, %r1715, %r1721; + mad.lo.s32 %r1723, %r51, %r1713, %r1722; + ld.const.v4.u8 {%rs1058, %rs1059, %rs1060, %rs1061}, [matrix+528]; + cvt.u32.u16 %r1724, %rs1061; + cvt.s32.s8 %r1725, %r1724; + cvt.u32.u16 %r1726, %rs1060; + cvt.s32.s8 %r1727, %r1726; + cvt.u32.u16 %r1728, %rs1059; + cvt.s32.s8 %r1729, %r1728; + cvt.u32.u16 %r1730, %rs1058; + cvt.s32.s8 %r1731, %r1730; + mad.lo.s32 %r1732, %r173, %r1731, %r1723; + mad.lo.s32 %r1733, %r53, %r1729, %r1732; + mad.lo.s32 %r1734, %r54, %r1727, %r1733; + mad.lo.s32 %r1735, %r55, %r1725, %r1734; + ld.const.v4.u8 {%rs1066, %rs1067, %rs1068, %rs1069}, [matrix+532]; + cvt.u32.u16 %r1736, %rs1069; + cvt.s32.s8 %r1737, %r1736; + cvt.u32.u16 %r1738, %rs1068; + cvt.s32.s8 %r1739, %r1738; + cvt.u32.u16 %r1740, %rs1067; + cvt.s32.s8 %r1741, %r1740; + cvt.u32.u16 %r1742, %rs1066; + cvt.s32.s8 %r1743, %r1742; + mad.lo.s32 %r1744, %r56, %r1743, %r1735; + mad.lo.s32 %r1745, %r57, %r1741, %r1744; + mad.lo.s32 %r1746, %r58, %r1739, %r1745; + mad.lo.s32 %r1747, %r59, %r1737, %r1746; + ld.const.v4.u8 {%rs1074, %rs1075, %rs1076, %rs1077}, [matrix+536]; + cvt.u32.u16 %r1748, %rs1077; + cvt.s32.s8 %r1749, %r1748; + cvt.u32.u16 %r1750, %rs1076; + cvt.s32.s8 %r1751, %r1750; + cvt.u32.u16 %r1752, %rs1075; + cvt.s32.s8 %r1753, %r1752; + cvt.u32.u16 %r1754, %rs1074; + cvt.s32.s8 %r1755, %r1754; + mad.lo.s32 %r1756, %r61, %r1755, %r1747; + mad.lo.s32 %r1757, %r62, %r1753, %r1756; + mad.lo.s32 %r1758, %r64, %r1751, %r1757; + mad.lo.s32 %r1759, %r65, %r1749, %r1758; + ld.const.v4.u8 {%rs1082, %rs1083, %rs1084, %rs1085}, [matrix+540]; + cvt.u32.u16 %r1760, %rs1085; + cvt.s32.s8 %r1761, %r1760; + cvt.u32.u16 %r1762, %rs1084; + cvt.s32.s8 %r1763, %r1762; + cvt.u32.u16 %r1764, %rs1083; + cvt.s32.s8 %r1765, %r1764; + cvt.u32.u16 %r1766, %rs1082; + cvt.s32.s8 %r1767, %r1766; + mad.lo.s32 %r1768, %r67, %r1767, %r1759; + mad.lo.s32 %r1769, %r68, %r1765, %r1768; + mad.lo.s32 %r1770, %r69, %r1763, %r1769; + mad.lo.s32 %r1771, %r70, %r1761, %r1770; + ld.const.v4.u8 {%rs1090, %rs1091, %rs1092, %rs1093}, [matrix+544]; + cvt.u32.u16 %r1772, %rs1093; + cvt.s32.s8 %r1773, %r1772; + cvt.u32.u16 %r1774, %rs1092; + cvt.s32.s8 %r1775, %r1774; + cvt.u32.u16 %r1776, %rs1091; + cvt.s32.s8 %r1777, %r1776; + cvt.u32.u16 %r1778, %rs1090; + cvt.s32.s8 %r1779, %r1778; + mad.lo.s32 %r1780, %r222, %r1779, %r1771; + mad.lo.s32 %r1781, %r72, %r1777, %r1780; + mad.lo.s32 %r1782, %r73, %r1775, %r1781; + mad.lo.s32 %r1783, %r74, %r1773, %r1782; + ld.const.v4.u8 {%rs1098, %rs1099, %rs1100, %rs1101}, [matrix+548]; + cvt.u32.u16 %r1784, %rs1101; + cvt.s32.s8 %r1785, %r1784; + cvt.u32.u16 %r1786, %rs1100; + cvt.s32.s8 %r1787, %r1786; + cvt.u32.u16 %r1788, %rs1099; + cvt.s32.s8 %r1789, %r1788; + cvt.u32.u16 %r1790, %rs1098; + cvt.s32.s8 %r1791, %r1790; + mad.lo.s32 %r1792, %r75, %r1791, %r1783; + mad.lo.s32 %r1793, %r76, %r1789, %r1792; + mad.lo.s32 %r1794, %r77, %r1787, %r1793; + mad.lo.s32 %r1795, %r78, 
%r1785, %r1794; + ld.const.v4.u8 {%rs1106, %rs1107, %rs1108, %rs1109}, [matrix+552]; + cvt.u32.u16 %r1796, %rs1109; + cvt.s32.s8 %r1797, %r1796; + cvt.u32.u16 %r1798, %rs1108; + cvt.s32.s8 %r1799, %r1798; + cvt.u32.u16 %r1800, %rs1107; + cvt.s32.s8 %r1801, %r1800; + cvt.u32.u16 %r1802, %rs1106; + cvt.s32.s8 %r1803, %r1802; + mad.lo.s32 %r1804, %r80, %r1803, %r1795; + mad.lo.s32 %r1805, %r81, %r1801, %r1804; + mad.lo.s32 %r1806, %r83, %r1799, %r1805; + mad.lo.s32 %r1807, %r84, %r1797, %r1806; + ld.const.v4.u8 {%rs1114, %rs1115, %rs1116, %rs1117}, [matrix+556]; + cvt.u32.u16 %r1808, %rs1117; + cvt.s32.s8 %r1809, %r1808; + cvt.u32.u16 %r1810, %rs1116; + cvt.s32.s8 %r1811, %r1810; + cvt.u32.u16 %r1812, %rs1115; + cvt.s32.s8 %r1813, %r1812; + cvt.u32.u16 %r1814, %rs1114; + cvt.s32.s8 %r1815, %r1814; + mad.lo.s32 %r1816, %r86, %r1815, %r1807; + mad.lo.s32 %r1817, %r87, %r1813, %r1816; + mad.lo.s32 %r1818, %r88, %r1811, %r1817; + mad.lo.s32 %r1819, %r89, %r1809, %r1818; + ld.const.v4.u8 {%rs1122, %rs1123, %rs1124, %rs1125}, [matrix+560]; + cvt.u32.u16 %r1820, %rs1125; + cvt.s32.s8 %r1821, %r1820; + cvt.u32.u16 %r1822, %rs1124; + cvt.s32.s8 %r1823, %r1822; + cvt.u32.u16 %r1824, %rs1123; + cvt.s32.s8 %r1825, %r1824; + cvt.u32.u16 %r1826, %rs1122; + cvt.s32.s8 %r1827, %r1826; + mad.lo.s32 %r1828, %r271, %r1827, %r1819; + mad.lo.s32 %r1829, %r91, %r1825, %r1828; + mad.lo.s32 %r1830, %r93, %r1823, %r1829; + mad.lo.s32 %r1831, %r94, %r1821, %r1830; + ld.const.v4.u8 {%rs1130, %rs1131, %rs1132, %rs1133}, [matrix+564]; + cvt.u32.u16 %r1832, %rs1133; + cvt.s32.s8 %r1833, %r1832; + cvt.u32.u16 %r1834, %rs1132; + cvt.s32.s8 %r1835, %r1834; + cvt.u32.u16 %r1836, %rs1131; + cvt.s32.s8 %r1837, %r1836; + cvt.u32.u16 %r1838, %rs1130; + cvt.s32.s8 %r1839, %r1838; + mad.lo.s32 %r1840, %r96, %r1839, %r1831; + mad.lo.s32 %r1841, %r97, %r1837, %r1840; + mad.lo.s32 %r1842, %r99, %r1835, %r1841; + mad.lo.s32 %r1843, %r100, %r1833, %r1842; + ld.const.v4.u8 {%rs1138, %rs1139, %rs1140, %rs1141}, [matrix+568]; + cvt.u32.u16 %r1844, %rs1141; + cvt.s32.s8 %r1845, %r1844; + cvt.u32.u16 %r1846, %rs1140; + cvt.s32.s8 %r1847, %r1846; + cvt.u32.u16 %r1848, %rs1139; + cvt.s32.s8 %r1849, %r1848; + cvt.u32.u16 %r1850, %rs1138; + cvt.s32.s8 %r1851, %r1850; + mad.lo.s32 %r1852, %r103, %r1851, %r1843; + mad.lo.s32 %r1853, %r104, %r1849, %r1852; + mad.lo.s32 %r1854, %r107, %r1847, %r1853; + mad.lo.s32 %r1855, %r108, %r1845, %r1854; + ld.const.v4.u8 {%rs1146, %rs1147, %rs1148, %rs1149}, [matrix+572]; + cvt.u32.u16 %r1856, %rs1149; + cvt.s32.s8 %r1857, %r1856; + cvt.u32.u16 %r1858, %rs1148; + cvt.s32.s8 %r1859, %r1858; + cvt.u32.u16 %r1860, %rs1147; + cvt.s32.s8 %r1861, %r1860; + cvt.u32.u16 %r1862, %rs1146; + cvt.s32.s8 %r1863, %r1862; + mad.lo.s32 %r1864, %r111, %r1863, %r1855; + mad.lo.s32 %r1865, %r112, %r1861, %r1864; + mad.lo.s32 %r1866, %r114, %r1859, %r1865; + mad.lo.s32 %r1867, %r115, %r1857, %r1866; + ld.const.v4.u8 {%rs1154, %rs1155, %rs1156, %rs1157}, [matrix+576]; + cvt.u32.u16 %r1868, %rs1157; + cvt.s32.s8 %r1869, %r1868; + cvt.u32.u16 %r1870, %rs1156; + cvt.s32.s8 %r1871, %r1870; + cvt.u32.u16 %r1872, %rs1154; + cvt.s32.s8 %r1873, %r1872; + cvt.u32.u16 %r1874, %rs1155; + cvt.s32.s8 %r1875, %r1874; + mul.lo.s32 %r1876, %r34, %r1875; + mad.lo.s32 %r1877, %r124, %r1873, %r1876; + mad.lo.s32 %r1878, %r35, %r1871, %r1877; + mad.lo.s32 %r1879, %r36, %r1869, %r1878; + ld.const.v4.u8 {%rs1162, %rs1163, %rs1164, %rs1165}, [matrix+580]; + cvt.u32.u16 %r1880, %rs1165; + cvt.s32.s8 %r1881, %r1880; + cvt.u32.u16 %r1882, %rs1164; + 
cvt.s32.s8 %r1883, %r1882; + cvt.u32.u16 %r1884, %rs1163; + cvt.s32.s8 %r1885, %r1884; + cvt.u32.u16 %r1886, %rs1162; + cvt.s32.s8 %r1887, %r1886; + mad.lo.s32 %r1888, %r37, %r1887, %r1879; + mad.lo.s32 %r1889, %r38, %r1885, %r1888; + mad.lo.s32 %r1890, %r39, %r1883, %r1889; + mad.lo.s32 %r1891, %r40, %r1881, %r1890; + ld.const.v4.u8 {%rs1170, %rs1171, %rs1172, %rs1173}, [matrix+584]; + cvt.u32.u16 %r1892, %rs1173; + cvt.s32.s8 %r1893, %r1892; + cvt.u32.u16 %r1894, %rs1172; + cvt.s32.s8 %r1895, %r1894; + cvt.u32.u16 %r1896, %rs1171; + cvt.s32.s8 %r1897, %r1896; + cvt.u32.u16 %r1898, %rs1170; + cvt.s32.s8 %r1899, %r1898; + mad.lo.s32 %r1900, %r42, %r1899, %r1891; + mad.lo.s32 %r1901, %r43, %r1897, %r1900; + mad.lo.s32 %r1902, %r45, %r1895, %r1901; + mad.lo.s32 %r1903, %r46, %r1893, %r1902; + ld.const.v4.u8 {%rs1178, %rs1179, %rs1180, %rs1181}, [matrix+588]; + cvt.u32.u16 %r1904, %rs1181; + cvt.s32.s8 %r1905, %r1904; + cvt.u32.u16 %r1906, %rs1180; + cvt.s32.s8 %r1907, %r1906; + cvt.u32.u16 %r1908, %rs1179; + cvt.s32.s8 %r1909, %r1908; + cvt.u32.u16 %r1910, %rs1178; + cvt.s32.s8 %r1911, %r1910; + mad.lo.s32 %r1912, %r48, %r1911, %r1903; + mad.lo.s32 %r1913, %r49, %r1909, %r1912; + mad.lo.s32 %r1914, %r50, %r1907, %r1913; + mad.lo.s32 %r1915, %r51, %r1905, %r1914; + ld.const.v4.u8 {%rs1186, %rs1187, %rs1188, %rs1189}, [matrix+592]; + cvt.u32.u16 %r1916, %rs1189; + cvt.s32.s8 %r1917, %r1916; + cvt.u32.u16 %r1918, %rs1188; + cvt.s32.s8 %r1919, %r1918; + cvt.u32.u16 %r1920, %rs1187; + cvt.s32.s8 %r1921, %r1920; + cvt.u32.u16 %r1922, %rs1186; + cvt.s32.s8 %r1923, %r1922; + mad.lo.s32 %r1924, %r173, %r1923, %r1915; + mad.lo.s32 %r1925, %r53, %r1921, %r1924; + mad.lo.s32 %r1926, %r54, %r1919, %r1925; + mad.lo.s32 %r1927, %r55, %r1917, %r1926; + ld.const.v4.u8 {%rs1194, %rs1195, %rs1196, %rs1197}, [matrix+596]; + cvt.u32.u16 %r1928, %rs1197; + cvt.s32.s8 %r1929, %r1928; + cvt.u32.u16 %r1930, %rs1196; + cvt.s32.s8 %r1931, %r1930; + cvt.u32.u16 %r1932, %rs1195; + cvt.s32.s8 %r1933, %r1932; + cvt.u32.u16 %r1934, %rs1194; + cvt.s32.s8 %r1935, %r1934; + mad.lo.s32 %r1936, %r56, %r1935, %r1927; + mad.lo.s32 %r1937, %r57, %r1933, %r1936; + mad.lo.s32 %r1938, %r58, %r1931, %r1937; + mad.lo.s32 %r1939, %r59, %r1929, %r1938; + ld.const.v4.u8 {%rs1202, %rs1203, %rs1204, %rs1205}, [matrix+600]; + cvt.u32.u16 %r1940, %rs1205; + cvt.s32.s8 %r1941, %r1940; + cvt.u32.u16 %r1942, %rs1204; + cvt.s32.s8 %r1943, %r1942; + cvt.u32.u16 %r1944, %rs1203; + cvt.s32.s8 %r1945, %r1944; + cvt.u32.u16 %r1946, %rs1202; + cvt.s32.s8 %r1947, %r1946; + mad.lo.s32 %r1948, %r61, %r1947, %r1939; + mad.lo.s32 %r1949, %r62, %r1945, %r1948; + mad.lo.s32 %r1950, %r64, %r1943, %r1949; + mad.lo.s32 %r1951, %r65, %r1941, %r1950; + ld.const.v4.u8 {%rs1210, %rs1211, %rs1212, %rs1213}, [matrix+604]; + cvt.u32.u16 %r1952, %rs1213; + cvt.s32.s8 %r1953, %r1952; + cvt.u32.u16 %r1954, %rs1212; + cvt.s32.s8 %r1955, %r1954; + cvt.u32.u16 %r1956, %rs1211; + cvt.s32.s8 %r1957, %r1956; + cvt.u32.u16 %r1958, %rs1210; + cvt.s32.s8 %r1959, %r1958; + mad.lo.s32 %r1960, %r67, %r1959, %r1951; + mad.lo.s32 %r1961, %r68, %r1957, %r1960; + mad.lo.s32 %r1962, %r69, %r1955, %r1961; + mad.lo.s32 %r1963, %r70, %r1953, %r1962; + ld.const.v4.u8 {%rs1218, %rs1219, %rs1220, %rs1221}, [matrix+608]; + cvt.u32.u16 %r1964, %rs1221; + cvt.s32.s8 %r1965, %r1964; + cvt.u32.u16 %r1966, %rs1220; + cvt.s32.s8 %r1967, %r1966; + cvt.u32.u16 %r1968, %rs1219; + cvt.s32.s8 %r1969, %r1968; + cvt.u32.u16 %r1970, %rs1218; + cvt.s32.s8 %r1971, %r1970; + mad.lo.s32 %r1972, %r222, %r1971, 
%r1963; + mad.lo.s32 %r1973, %r72, %r1969, %r1972; + mad.lo.s32 %r1974, %r73, %r1967, %r1973; + mad.lo.s32 %r1975, %r74, %r1965, %r1974; + ld.const.v4.u8 {%rs1226, %rs1227, %rs1228, %rs1229}, [matrix+612]; + cvt.u32.u16 %r1976, %rs1229; + cvt.s32.s8 %r1977, %r1976; + cvt.u32.u16 %r1978, %rs1228; + cvt.s32.s8 %r1979, %r1978; + cvt.u32.u16 %r1980, %rs1227; + cvt.s32.s8 %r1981, %r1980; + cvt.u32.u16 %r1982, %rs1226; + cvt.s32.s8 %r1983, %r1982; + mad.lo.s32 %r1984, %r75, %r1983, %r1975; + mad.lo.s32 %r1985, %r76, %r1981, %r1984; + mad.lo.s32 %r1986, %r77, %r1979, %r1985; + mad.lo.s32 %r1987, %r78, %r1977, %r1986; + ld.const.v4.u8 {%rs1234, %rs1235, %rs1236, %rs1237}, [matrix+616]; + cvt.u32.u16 %r1988, %rs1237; + cvt.s32.s8 %r1989, %r1988; + cvt.u32.u16 %r1990, %rs1236; + cvt.s32.s8 %r1991, %r1990; + cvt.u32.u16 %r1992, %rs1235; + cvt.s32.s8 %r1993, %r1992; + cvt.u32.u16 %r1994, %rs1234; + cvt.s32.s8 %r1995, %r1994; + mad.lo.s32 %r1996, %r80, %r1995, %r1987; + mad.lo.s32 %r1997, %r81, %r1993, %r1996; + mad.lo.s32 %r1998, %r83, %r1991, %r1997; + mad.lo.s32 %r1999, %r84, %r1989, %r1998; + ld.const.v4.u8 {%rs1242, %rs1243, %rs1244, %rs1245}, [matrix+620]; + cvt.u32.u16 %r2000, %rs1245; + cvt.s32.s8 %r2001, %r2000; + cvt.u32.u16 %r2002, %rs1244; + cvt.s32.s8 %r2003, %r2002; + cvt.u32.u16 %r2004, %rs1243; + cvt.s32.s8 %r2005, %r2004; + cvt.u32.u16 %r2006, %rs1242; + cvt.s32.s8 %r2007, %r2006; + mad.lo.s32 %r2008, %r86, %r2007, %r1999; + mad.lo.s32 %r2009, %r87, %r2005, %r2008; + mad.lo.s32 %r2010, %r88, %r2003, %r2009; + mad.lo.s32 %r2011, %r89, %r2001, %r2010; + ld.const.v4.u8 {%rs1250, %rs1251, %rs1252, %rs1253}, [matrix+624]; + cvt.u32.u16 %r2012, %rs1253; + cvt.s32.s8 %r2013, %r2012; + cvt.u32.u16 %r2014, %rs1252; + cvt.s32.s8 %r2015, %r2014; + cvt.u32.u16 %r2016, %rs1251; + cvt.s32.s8 %r2017, %r2016; + cvt.u32.u16 %r2018, %rs1250; + cvt.s32.s8 %r2019, %r2018; + mad.lo.s32 %r2020, %r271, %r2019, %r2011; + mad.lo.s32 %r2021, %r91, %r2017, %r2020; + mad.lo.s32 %r2022, %r93, %r2015, %r2021; + mad.lo.s32 %r2023, %r94, %r2013, %r2022; + ld.const.v4.u8 {%rs1258, %rs1259, %rs1260, %rs1261}, [matrix+628]; + cvt.u32.u16 %r2024, %rs1261; + cvt.s32.s8 %r2025, %r2024; + cvt.u32.u16 %r2026, %rs1260; + cvt.s32.s8 %r2027, %r2026; + cvt.u32.u16 %r2028, %rs1259; + cvt.s32.s8 %r2029, %r2028; + cvt.u32.u16 %r2030, %rs1258; + cvt.s32.s8 %r2031, %r2030; + mad.lo.s32 %r2032, %r96, %r2031, %r2023; + mad.lo.s32 %r2033, %r97, %r2029, %r2032; + mad.lo.s32 %r2034, %r99, %r2027, %r2033; + mad.lo.s32 %r2035, %r100, %r2025, %r2034; + ld.const.v4.u8 {%rs1266, %rs1267, %rs1268, %rs1269}, [matrix+632]; + cvt.u32.u16 %r2036, %rs1269; + cvt.s32.s8 %r2037, %r2036; + cvt.u32.u16 %r2038, %rs1268; + cvt.s32.s8 %r2039, %r2038; + cvt.u32.u16 %r2040, %rs1267; + cvt.s32.s8 %r2041, %r2040; + cvt.u32.u16 %r2042, %rs1266; + cvt.s32.s8 %r2043, %r2042; + mad.lo.s32 %r2044, %r103, %r2043, %r2035; + mad.lo.s32 %r2045, %r104, %r2041, %r2044; + mad.lo.s32 %r2046, %r107, %r2039, %r2045; + mad.lo.s32 %r2047, %r108, %r2037, %r2046; + ld.const.v4.u8 {%rs1274, %rs1275, %rs1276, %rs1277}, [matrix+636]; + cvt.u32.u16 %r2048, %rs1277; + cvt.s32.s8 %r2049, %r2048; + cvt.u32.u16 %r2050, %rs1276; + cvt.s32.s8 %r2051, %r2050; + cvt.u32.u16 %r2052, %rs1275; + cvt.s32.s8 %r2053, %r2052; + cvt.u32.u16 %r2054, %rs1274; + cvt.s32.s8 %r2055, %r2054; + mad.lo.s32 %r2056, %r111, %r2055, %r2047; + mad.lo.s32 %r2057, %r112, %r2053, %r2056; + mad.lo.s32 %r2058, %r114, %r2051, %r2057; + mad.lo.s32 %r2059, %r115, %r2049, %r2058; + shr.u32 %r2060, %r1867, 6; + and.b32 
%r2061, %r2060, 240; + shr.u32 %r2062, %r2059, 10; + or.b32 %r2063, %r2062, %r2061; + xor.b32 %r2064, %r16, %r2063; + cvt.u64.u32 %rd384, %r2064; + ld.const.v4.u8 {%rs1282, %rs1283, %rs1284, %rs1285}, [matrix+640]; + cvt.u32.u16 %r2065, %rs1285; + cvt.s32.s8 %r2066, %r2065; + cvt.u32.u16 %r2067, %rs1284; + cvt.s32.s8 %r2068, %r2067; + cvt.u32.u16 %r2069, %rs1282; + cvt.s32.s8 %r2070, %r2069; + cvt.u32.u16 %r2071, %rs1283; + cvt.s32.s8 %r2072, %r2071; + mul.lo.s32 %r2073, %r34, %r2072; + mad.lo.s32 %r2074, %r124, %r2070, %r2073; + mad.lo.s32 %r2075, %r35, %r2068, %r2074; + mad.lo.s32 %r2076, %r36, %r2066, %r2075; + ld.const.v4.u8 {%rs1290, %rs1291, %rs1292, %rs1293}, [matrix+644]; + cvt.u32.u16 %r2077, %rs1293; + cvt.s32.s8 %r2078, %r2077; + cvt.u32.u16 %r2079, %rs1292; + cvt.s32.s8 %r2080, %r2079; + cvt.u32.u16 %r2081, %rs1291; + cvt.s32.s8 %r2082, %r2081; + cvt.u32.u16 %r2083, %rs1290; + cvt.s32.s8 %r2084, %r2083; + mad.lo.s32 %r2085, %r37, %r2084, %r2076; + mad.lo.s32 %r2086, %r38, %r2082, %r2085; + mad.lo.s32 %r2087, %r39, %r2080, %r2086; + mad.lo.s32 %r2088, %r40, %r2078, %r2087; + ld.const.v4.u8 {%rs1298, %rs1299, %rs1300, %rs1301}, [matrix+648]; + cvt.u32.u16 %r2089, %rs1301; + cvt.s32.s8 %r2090, %r2089; + cvt.u32.u16 %r2091, %rs1300; + cvt.s32.s8 %r2092, %r2091; + cvt.u32.u16 %r2093, %rs1299; + cvt.s32.s8 %r2094, %r2093; + cvt.u32.u16 %r2095, %rs1298; + cvt.s32.s8 %r2096, %r2095; + mad.lo.s32 %r2097, %r42, %r2096, %r2088; + mad.lo.s32 %r2098, %r43, %r2094, %r2097; + mad.lo.s32 %r2099, %r45, %r2092, %r2098; + mad.lo.s32 %r2100, %r46, %r2090, %r2099; + ld.const.v4.u8 {%rs1306, %rs1307, %rs1308, %rs1309}, [matrix+652]; + cvt.u32.u16 %r2101, %rs1309; + cvt.s32.s8 %r2102, %r2101; + cvt.u32.u16 %r2103, %rs1308; + cvt.s32.s8 %r2104, %r2103; + cvt.u32.u16 %r2105, %rs1307; + cvt.s32.s8 %r2106, %r2105; + cvt.u32.u16 %r2107, %rs1306; + cvt.s32.s8 %r2108, %r2107; + mad.lo.s32 %r2109, %r48, %r2108, %r2100; + mad.lo.s32 %r2110, %r49, %r2106, %r2109; + mad.lo.s32 %r2111, %r50, %r2104, %r2110; + mad.lo.s32 %r2112, %r51, %r2102, %r2111; + ld.const.v4.u8 {%rs1314, %rs1315, %rs1316, %rs1317}, [matrix+656]; + cvt.u32.u16 %r2113, %rs1317; + cvt.s32.s8 %r2114, %r2113; + cvt.u32.u16 %r2115, %rs1316; + cvt.s32.s8 %r2116, %r2115; + cvt.u32.u16 %r2117, %rs1315; + cvt.s32.s8 %r2118, %r2117; + cvt.u32.u16 %r2119, %rs1314; + cvt.s32.s8 %r2120, %r2119; + mad.lo.s32 %r2121, %r173, %r2120, %r2112; + mad.lo.s32 %r2122, %r53, %r2118, %r2121; + mad.lo.s32 %r2123, %r54, %r2116, %r2122; + mad.lo.s32 %r2124, %r55, %r2114, %r2123; + ld.const.v4.u8 {%rs1322, %rs1323, %rs1324, %rs1325}, [matrix+660]; + cvt.u32.u16 %r2125, %rs1325; + cvt.s32.s8 %r2126, %r2125; + cvt.u32.u16 %r2127, %rs1324; + cvt.s32.s8 %r2128, %r2127; + cvt.u32.u16 %r2129, %rs1323; + cvt.s32.s8 %r2130, %r2129; + cvt.u32.u16 %r2131, %rs1322; + cvt.s32.s8 %r2132, %r2131; + mad.lo.s32 %r2133, %r56, %r2132, %r2124; + mad.lo.s32 %r2134, %r57, %r2130, %r2133; + mad.lo.s32 %r2135, %r58, %r2128, %r2134; + mad.lo.s32 %r2136, %r59, %r2126, %r2135; + ld.const.v4.u8 {%rs1330, %rs1331, %rs1332, %rs1333}, [matrix+664]; + cvt.u32.u16 %r2137, %rs1333; + cvt.s32.s8 %r2138, %r2137; + cvt.u32.u16 %r2139, %rs1332; + cvt.s32.s8 %r2140, %r2139; + cvt.u32.u16 %r2141, %rs1331; + cvt.s32.s8 %r2142, %r2141; + cvt.u32.u16 %r2143, %rs1330; + cvt.s32.s8 %r2144, %r2143; + mad.lo.s32 %r2145, %r61, %r2144, %r2136; + mad.lo.s32 %r2146, %r62, %r2142, %r2145; + mad.lo.s32 %r2147, %r64, %r2140, %r2146; + mad.lo.s32 %r2148, %r65, %r2138, %r2147; + ld.const.v4.u8 {%rs1338, %rs1339, %rs1340, 
%rs1341}, [matrix+668]; + cvt.u32.u16 %r2149, %rs1341; + cvt.s32.s8 %r2150, %r2149; + cvt.u32.u16 %r2151, %rs1340; + cvt.s32.s8 %r2152, %r2151; + cvt.u32.u16 %r2153, %rs1339; + cvt.s32.s8 %r2154, %r2153; + cvt.u32.u16 %r2155, %rs1338; + cvt.s32.s8 %r2156, %r2155; + mad.lo.s32 %r2157, %r67, %r2156, %r2148; + mad.lo.s32 %r2158, %r68, %r2154, %r2157; + mad.lo.s32 %r2159, %r69, %r2152, %r2158; + mad.lo.s32 %r2160, %r70, %r2150, %r2159; + ld.const.v4.u8 {%rs1346, %rs1347, %rs1348, %rs1349}, [matrix+672]; + cvt.u32.u16 %r2161, %rs1349; + cvt.s32.s8 %r2162, %r2161; + cvt.u32.u16 %r2163, %rs1348; + cvt.s32.s8 %r2164, %r2163; + cvt.u32.u16 %r2165, %rs1347; + cvt.s32.s8 %r2166, %r2165; + cvt.u32.u16 %r2167, %rs1346; + cvt.s32.s8 %r2168, %r2167; + mad.lo.s32 %r2169, %r222, %r2168, %r2160; + mad.lo.s32 %r2170, %r72, %r2166, %r2169; + mad.lo.s32 %r2171, %r73, %r2164, %r2170; + mad.lo.s32 %r2172, %r74, %r2162, %r2171; + ld.const.v4.u8 {%rs1354, %rs1355, %rs1356, %rs1357}, [matrix+676]; + cvt.u32.u16 %r2173, %rs1357; + cvt.s32.s8 %r2174, %r2173; + cvt.u32.u16 %r2175, %rs1356; + cvt.s32.s8 %r2176, %r2175; + cvt.u32.u16 %r2177, %rs1355; + cvt.s32.s8 %r2178, %r2177; + cvt.u32.u16 %r2179, %rs1354; + cvt.s32.s8 %r2180, %r2179; + mad.lo.s32 %r2181, %r75, %r2180, %r2172; + mad.lo.s32 %r2182, %r76, %r2178, %r2181; + mad.lo.s32 %r2183, %r77, %r2176, %r2182; + mad.lo.s32 %r2184, %r78, %r2174, %r2183; + ld.const.v4.u8 {%rs1362, %rs1363, %rs1364, %rs1365}, [matrix+680]; + cvt.u32.u16 %r2185, %rs1365; + cvt.s32.s8 %r2186, %r2185; + cvt.u32.u16 %r2187, %rs1364; + cvt.s32.s8 %r2188, %r2187; + cvt.u32.u16 %r2189, %rs1363; + cvt.s32.s8 %r2190, %r2189; + cvt.u32.u16 %r2191, %rs1362; + cvt.s32.s8 %r2192, %r2191; + mad.lo.s32 %r2193, %r80, %r2192, %r2184; + mad.lo.s32 %r2194, %r81, %r2190, %r2193; + mad.lo.s32 %r2195, %r83, %r2188, %r2194; + mad.lo.s32 %r2196, %r84, %r2186, %r2195; + ld.const.v4.u8 {%rs1370, %rs1371, %rs1372, %rs1373}, [matrix+684]; + cvt.u32.u16 %r2197, %rs1373; + cvt.s32.s8 %r2198, %r2197; + cvt.u32.u16 %r2199, %rs1372; + cvt.s32.s8 %r2200, %r2199; + cvt.u32.u16 %r2201, %rs1371; + cvt.s32.s8 %r2202, %r2201; + cvt.u32.u16 %r2203, %rs1370; + cvt.s32.s8 %r2204, %r2203; + mad.lo.s32 %r2205, %r86, %r2204, %r2196; + mad.lo.s32 %r2206, %r87, %r2202, %r2205; + mad.lo.s32 %r2207, %r88, %r2200, %r2206; + mad.lo.s32 %r2208, %r89, %r2198, %r2207; + ld.const.v4.u8 {%rs1378, %rs1379, %rs1380, %rs1381}, [matrix+688]; + cvt.u32.u16 %r2209, %rs1381; + cvt.s32.s8 %r2210, %r2209; + cvt.u32.u16 %r2211, %rs1380; + cvt.s32.s8 %r2212, %r2211; + cvt.u32.u16 %r2213, %rs1379; + cvt.s32.s8 %r2214, %r2213; + cvt.u32.u16 %r2215, %rs1378; + cvt.s32.s8 %r2216, %r2215; + mad.lo.s32 %r2217, %r271, %r2216, %r2208; + mad.lo.s32 %r2218, %r91, %r2214, %r2217; + mad.lo.s32 %r2219, %r93, %r2212, %r2218; + mad.lo.s32 %r2220, %r94, %r2210, %r2219; + ld.const.v4.u8 {%rs1386, %rs1387, %rs1388, %rs1389}, [matrix+692]; + cvt.u32.u16 %r2221, %rs1389; + cvt.s32.s8 %r2222, %r2221; + cvt.u32.u16 %r2223, %rs1388; + cvt.s32.s8 %r2224, %r2223; + cvt.u32.u16 %r2225, %rs1387; + cvt.s32.s8 %r2226, %r2225; + cvt.u32.u16 %r2227, %rs1386; + cvt.s32.s8 %r2228, %r2227; + mad.lo.s32 %r2229, %r96, %r2228, %r2220; + mad.lo.s32 %r2230, %r97, %r2226, %r2229; + mad.lo.s32 %r2231, %r99, %r2224, %r2230; + mad.lo.s32 %r2232, %r100, %r2222, %r2231; + ld.const.v4.u8 {%rs1394, %rs1395, %rs1396, %rs1397}, [matrix+696]; + cvt.u32.u16 %r2233, %rs1397; + cvt.s32.s8 %r2234, %r2233; + cvt.u32.u16 %r2235, %rs1396; + cvt.s32.s8 %r2236, %r2235; + cvt.u32.u16 %r2237, %rs1395; + 
cvt.s32.s8 %r2238, %r2237; + cvt.u32.u16 %r2239, %rs1394; + cvt.s32.s8 %r2240, %r2239; + mad.lo.s32 %r2241, %r103, %r2240, %r2232; + mad.lo.s32 %r2242, %r104, %r2238, %r2241; + mad.lo.s32 %r2243, %r107, %r2236, %r2242; + mad.lo.s32 %r2244, %r108, %r2234, %r2243; + ld.const.v4.u8 {%rs1402, %rs1403, %rs1404, %rs1405}, [matrix+700]; + cvt.u32.u16 %r2245, %rs1405; + cvt.s32.s8 %r2246, %r2245; + cvt.u32.u16 %r2247, %rs1404; + cvt.s32.s8 %r2248, %r2247; + cvt.u32.u16 %r2249, %rs1403; + cvt.s32.s8 %r2250, %r2249; + cvt.u32.u16 %r2251, %rs1402; + cvt.s32.s8 %r2252, %r2251; + mad.lo.s32 %r2253, %r111, %r2252, %r2244; + mad.lo.s32 %r2254, %r112, %r2250, %r2253; + mad.lo.s32 %r2255, %r114, %r2248, %r2254; + mad.lo.s32 %r2256, %r115, %r2246, %r2255; + ld.const.v4.u8 {%rs1410, %rs1411, %rs1412, %rs1413}, [matrix+704]; + cvt.u32.u16 %r2257, %rs1413; + cvt.s32.s8 %r2258, %r2257; + cvt.u32.u16 %r2259, %rs1412; + cvt.s32.s8 %r2260, %r2259; + cvt.u32.u16 %r2261, %rs1410; + cvt.s32.s8 %r2262, %r2261; + cvt.u32.u16 %r2263, %rs1411; + cvt.s32.s8 %r2264, %r2263; + mul.lo.s32 %r2265, %r34, %r2264; + mad.lo.s32 %r2266, %r124, %r2262, %r2265; + mad.lo.s32 %r2267, %r35, %r2260, %r2266; + mad.lo.s32 %r2268, %r36, %r2258, %r2267; + ld.const.v4.u8 {%rs1418, %rs1419, %rs1420, %rs1421}, [matrix+708]; + cvt.u32.u16 %r2269, %rs1421; + cvt.s32.s8 %r2270, %r2269; + cvt.u32.u16 %r2271, %rs1420; + cvt.s32.s8 %r2272, %r2271; + cvt.u32.u16 %r2273, %rs1419; + cvt.s32.s8 %r2274, %r2273; + cvt.u32.u16 %r2275, %rs1418; + cvt.s32.s8 %r2276, %r2275; + mad.lo.s32 %r2277, %r37, %r2276, %r2268; + mad.lo.s32 %r2278, %r38, %r2274, %r2277; + mad.lo.s32 %r2279, %r39, %r2272, %r2278; + mad.lo.s32 %r2280, %r40, %r2270, %r2279; + ld.const.v4.u8 {%rs1426, %rs1427, %rs1428, %rs1429}, [matrix+712]; + cvt.u32.u16 %r2281, %rs1429; + cvt.s32.s8 %r2282, %r2281; + cvt.u32.u16 %r2283, %rs1428; + cvt.s32.s8 %r2284, %r2283; + cvt.u32.u16 %r2285, %rs1427; + cvt.s32.s8 %r2286, %r2285; + cvt.u32.u16 %r2287, %rs1426; + cvt.s32.s8 %r2288, %r2287; + mad.lo.s32 %r2289, %r42, %r2288, %r2280; + mad.lo.s32 %r2290, %r43, %r2286, %r2289; + mad.lo.s32 %r2291, %r45, %r2284, %r2290; + mad.lo.s32 %r2292, %r46, %r2282, %r2291; + ld.const.v4.u8 {%rs1434, %rs1435, %rs1436, %rs1437}, [matrix+716]; + cvt.u32.u16 %r2293, %rs1437; + cvt.s32.s8 %r2294, %r2293; + cvt.u32.u16 %r2295, %rs1436; + cvt.s32.s8 %r2296, %r2295; + cvt.u32.u16 %r2297, %rs1435; + cvt.s32.s8 %r2298, %r2297; + cvt.u32.u16 %r2299, %rs1434; + cvt.s32.s8 %r2300, %r2299; + mad.lo.s32 %r2301, %r48, %r2300, %r2292; + mad.lo.s32 %r2302, %r49, %r2298, %r2301; + mad.lo.s32 %r2303, %r50, %r2296, %r2302; + mad.lo.s32 %r2304, %r51, %r2294, %r2303; + ld.const.v4.u8 {%rs1442, %rs1443, %rs1444, %rs1445}, [matrix+720]; + cvt.u32.u16 %r2305, %rs1445; + cvt.s32.s8 %r2306, %r2305; + cvt.u32.u16 %r2307, %rs1444; + cvt.s32.s8 %r2308, %r2307; + cvt.u32.u16 %r2309, %rs1443; + cvt.s32.s8 %r2310, %r2309; + cvt.u32.u16 %r2311, %rs1442; + cvt.s32.s8 %r2312, %r2311; + mad.lo.s32 %r2313, %r173, %r2312, %r2304; + mad.lo.s32 %r2314, %r53, %r2310, %r2313; + mad.lo.s32 %r2315, %r54, %r2308, %r2314; + mad.lo.s32 %r2316, %r55, %r2306, %r2315; + ld.const.v4.u8 {%rs1450, %rs1451, %rs1452, %rs1453}, [matrix+724]; + cvt.u32.u16 %r2317, %rs1453; + cvt.s32.s8 %r2318, %r2317; + cvt.u32.u16 %r2319, %rs1452; + cvt.s32.s8 %r2320, %r2319; + cvt.u32.u16 %r2321, %rs1451; + cvt.s32.s8 %r2322, %r2321; + cvt.u32.u16 %r2323, %rs1450; + cvt.s32.s8 %r2324, %r2323; + mad.lo.s32 %r2325, %r56, %r2324, %r2316; + mad.lo.s32 %r2326, %r57, %r2322, %r2325; + mad.lo.s32 
%r2327, %r58, %r2320, %r2326; + mad.lo.s32 %r2328, %r59, %r2318, %r2327; + ld.const.v4.u8 {%rs1458, %rs1459, %rs1460, %rs1461}, [matrix+728]; + cvt.u32.u16 %r2329, %rs1461; + cvt.s32.s8 %r2330, %r2329; + cvt.u32.u16 %r2331, %rs1460; + cvt.s32.s8 %r2332, %r2331; + cvt.u32.u16 %r2333, %rs1459; + cvt.s32.s8 %r2334, %r2333; + cvt.u32.u16 %r2335, %rs1458; + cvt.s32.s8 %r2336, %r2335; + mad.lo.s32 %r2337, %r61, %r2336, %r2328; + mad.lo.s32 %r2338, %r62, %r2334, %r2337; + mad.lo.s32 %r2339, %r64, %r2332, %r2338; + mad.lo.s32 %r2340, %r65, %r2330, %r2339; + ld.const.v4.u8 {%rs1466, %rs1467, %rs1468, %rs1469}, [matrix+732]; + cvt.u32.u16 %r2341, %rs1469; + cvt.s32.s8 %r2342, %r2341; + cvt.u32.u16 %r2343, %rs1468; + cvt.s32.s8 %r2344, %r2343; + cvt.u32.u16 %r2345, %rs1467; + cvt.s32.s8 %r2346, %r2345; + cvt.u32.u16 %r2347, %rs1466; + cvt.s32.s8 %r2348, %r2347; + mad.lo.s32 %r2349, %r67, %r2348, %r2340; + mad.lo.s32 %r2350, %r68, %r2346, %r2349; + mad.lo.s32 %r2351, %r69, %r2344, %r2350; + mad.lo.s32 %r2352, %r70, %r2342, %r2351; + ld.const.v4.u8 {%rs1474, %rs1475, %rs1476, %rs1477}, [matrix+736]; + cvt.u32.u16 %r2353, %rs1477; + cvt.s32.s8 %r2354, %r2353; + cvt.u32.u16 %r2355, %rs1476; + cvt.s32.s8 %r2356, %r2355; + cvt.u32.u16 %r2357, %rs1475; + cvt.s32.s8 %r2358, %r2357; + cvt.u32.u16 %r2359, %rs1474; + cvt.s32.s8 %r2360, %r2359; + mad.lo.s32 %r2361, %r222, %r2360, %r2352; + mad.lo.s32 %r2362, %r72, %r2358, %r2361; + mad.lo.s32 %r2363, %r73, %r2356, %r2362; + mad.lo.s32 %r2364, %r74, %r2354, %r2363; + ld.const.v4.u8 {%rs1482, %rs1483, %rs1484, %rs1485}, [matrix+740]; + cvt.u32.u16 %r2365, %rs1485; + cvt.s32.s8 %r2366, %r2365; + cvt.u32.u16 %r2367, %rs1484; + cvt.s32.s8 %r2368, %r2367; + cvt.u32.u16 %r2369, %rs1483; + cvt.s32.s8 %r2370, %r2369; + cvt.u32.u16 %r2371, %rs1482; + cvt.s32.s8 %r2372, %r2371; + mad.lo.s32 %r2373, %r75, %r2372, %r2364; + mad.lo.s32 %r2374, %r76, %r2370, %r2373; + mad.lo.s32 %r2375, %r77, %r2368, %r2374; + mad.lo.s32 %r2376, %r78, %r2366, %r2375; + ld.const.v4.u8 {%rs1490, %rs1491, %rs1492, %rs1493}, [matrix+744]; + cvt.u32.u16 %r2377, %rs1493; + cvt.s32.s8 %r2378, %r2377; + cvt.u32.u16 %r2379, %rs1492; + cvt.s32.s8 %r2380, %r2379; + cvt.u32.u16 %r2381, %rs1491; + cvt.s32.s8 %r2382, %r2381; + cvt.u32.u16 %r2383, %rs1490; + cvt.s32.s8 %r2384, %r2383; + mad.lo.s32 %r2385, %r80, %r2384, %r2376; + mad.lo.s32 %r2386, %r81, %r2382, %r2385; + mad.lo.s32 %r2387, %r83, %r2380, %r2386; + mad.lo.s32 %r2388, %r84, %r2378, %r2387; + ld.const.v4.u8 {%rs1498, %rs1499, %rs1500, %rs1501}, [matrix+748]; + cvt.u32.u16 %r2389, %rs1501; + cvt.s32.s8 %r2390, %r2389; + cvt.u32.u16 %r2391, %rs1500; + cvt.s32.s8 %r2392, %r2391; + cvt.u32.u16 %r2393, %rs1499; + cvt.s32.s8 %r2394, %r2393; + cvt.u32.u16 %r2395, %rs1498; + cvt.s32.s8 %r2396, %r2395; + mad.lo.s32 %r2397, %r86, %r2396, %r2388; + mad.lo.s32 %r2398, %r87, %r2394, %r2397; + mad.lo.s32 %r2399, %r88, %r2392, %r2398; + mad.lo.s32 %r2400, %r89, %r2390, %r2399; + ld.const.v4.u8 {%rs1506, %rs1507, %rs1508, %rs1509}, [matrix+752]; + cvt.u32.u16 %r2401, %rs1509; + cvt.s32.s8 %r2402, %r2401; + cvt.u32.u16 %r2403, %rs1508; + cvt.s32.s8 %r2404, %r2403; + cvt.u32.u16 %r2405, %rs1507; + cvt.s32.s8 %r2406, %r2405; + cvt.u32.u16 %r2407, %rs1506; + cvt.s32.s8 %r2408, %r2407; + mad.lo.s32 %r2409, %r271, %r2408, %r2400; + mad.lo.s32 %r2410, %r91, %r2406, %r2409; + mad.lo.s32 %r2411, %r93, %r2404, %r2410; + mad.lo.s32 %r2412, %r94, %r2402, %r2411; + ld.const.v4.u8 {%rs1514, %rs1515, %rs1516, %rs1517}, [matrix+756]; + cvt.u32.u16 %r2413, %rs1517; + cvt.s32.s8 
%r2414, %r2413; + cvt.u32.u16 %r2415, %rs1516; + cvt.s32.s8 %r2416, %r2415; + cvt.u32.u16 %r2417, %rs1515; + cvt.s32.s8 %r2418, %r2417; + cvt.u32.u16 %r2419, %rs1514; + cvt.s32.s8 %r2420, %r2419; + mad.lo.s32 %r2421, %r96, %r2420, %r2412; + mad.lo.s32 %r2422, %r97, %r2418, %r2421; + mad.lo.s32 %r2423, %r99, %r2416, %r2422; + mad.lo.s32 %r2424, %r100, %r2414, %r2423; + ld.const.v4.u8 {%rs1522, %rs1523, %rs1524, %rs1525}, [matrix+760]; + cvt.u32.u16 %r2425, %rs1525; + cvt.s32.s8 %r2426, %r2425; + cvt.u32.u16 %r2427, %rs1524; + cvt.s32.s8 %r2428, %r2427; + cvt.u32.u16 %r2429, %rs1523; + cvt.s32.s8 %r2430, %r2429; + cvt.u32.u16 %r2431, %rs1522; + cvt.s32.s8 %r2432, %r2431; + mad.lo.s32 %r2433, %r103, %r2432, %r2424; + mad.lo.s32 %r2434, %r104, %r2430, %r2433; + mad.lo.s32 %r2435, %r107, %r2428, %r2434; + mad.lo.s32 %r2436, %r108, %r2426, %r2435; + ld.const.v4.u8 {%rs1530, %rs1531, %rs1532, %rs1533}, [matrix+764]; + cvt.u32.u16 %r2437, %rs1533; + cvt.s32.s8 %r2438, %r2437; + cvt.u32.u16 %r2439, %rs1532; + cvt.s32.s8 %r2440, %r2439; + cvt.u32.u16 %r2441, %rs1531; + cvt.s32.s8 %r2442, %r2441; + cvt.u32.u16 %r2443, %rs1530; + cvt.s32.s8 %r2444, %r2443; + mad.lo.s32 %r2445, %r111, %r2444, %r2436; + mad.lo.s32 %r2446, %r112, %r2442, %r2445; + mad.lo.s32 %r2447, %r114, %r2440, %r2446; + mad.lo.s32 %r2448, %r115, %r2438, %r2447; + shr.u32 %r2449, %r2256, 6; + and.b32 %r2450, %r2449, 240; + shr.u32 %r2451, %r2448, 10; + or.b32 %r2452, %r2451, %r2450; + xor.b32 %r2453, %r17, %r2452; + cvt.u64.u32 %rd385, %r2453; + ld.const.v4.u8 {%rs1538, %rs1539, %rs1540, %rs1541}, [matrix+768]; + cvt.u32.u16 %r2454, %rs1541; + cvt.s32.s8 %r2455, %r2454; + cvt.u32.u16 %r2456, %rs1540; + cvt.s32.s8 %r2457, %r2456; + cvt.u32.u16 %r2458, %rs1538; + cvt.s32.s8 %r2459, %r2458; + cvt.u32.u16 %r2460, %rs1539; + cvt.s32.s8 %r2461, %r2460; + mul.lo.s32 %r2462, %r34, %r2461; + mad.lo.s32 %r2463, %r124, %r2459, %r2462; + mad.lo.s32 %r2464, %r35, %r2457, %r2463; + mad.lo.s32 %r2465, %r36, %r2455, %r2464; + ld.const.v4.u8 {%rs1546, %rs1547, %rs1548, %rs1549}, [matrix+772]; + cvt.u32.u16 %r2466, %rs1549; + cvt.s32.s8 %r2467, %r2466; + cvt.u32.u16 %r2468, %rs1548; + cvt.s32.s8 %r2469, %r2468; + cvt.u32.u16 %r2470, %rs1547; + cvt.s32.s8 %r2471, %r2470; + cvt.u32.u16 %r2472, %rs1546; + cvt.s32.s8 %r2473, %r2472; + mad.lo.s32 %r2474, %r37, %r2473, %r2465; + mad.lo.s32 %r2475, %r38, %r2471, %r2474; + mad.lo.s32 %r2476, %r39, %r2469, %r2475; + mad.lo.s32 %r2477, %r40, %r2467, %r2476; + ld.const.v4.u8 {%rs1554, %rs1555, %rs1556, %rs1557}, [matrix+776]; + cvt.u32.u16 %r2478, %rs1557; + cvt.s32.s8 %r2479, %r2478; + cvt.u32.u16 %r2480, %rs1556; + cvt.s32.s8 %r2481, %r2480; + cvt.u32.u16 %r2482, %rs1555; + cvt.s32.s8 %r2483, %r2482; + cvt.u32.u16 %r2484, %rs1554; + cvt.s32.s8 %r2485, %r2484; + mad.lo.s32 %r2486, %r42, %r2485, %r2477; + mad.lo.s32 %r2487, %r43, %r2483, %r2486; + mad.lo.s32 %r2488, %r45, %r2481, %r2487; + mad.lo.s32 %r2489, %r46, %r2479, %r2488; + ld.const.v4.u8 {%rs1562, %rs1563, %rs1564, %rs1565}, [matrix+780]; + cvt.u32.u16 %r2490, %rs1565; + cvt.s32.s8 %r2491, %r2490; + cvt.u32.u16 %r2492, %rs1564; + cvt.s32.s8 %r2493, %r2492; + cvt.u32.u16 %r2494, %rs1563; + cvt.s32.s8 %r2495, %r2494; + cvt.u32.u16 %r2496, %rs1562; + cvt.s32.s8 %r2497, %r2496; + mad.lo.s32 %r2498, %r48, %r2497, %r2489; + mad.lo.s32 %r2499, %r49, %r2495, %r2498; + mad.lo.s32 %r2500, %r50, %r2493, %r2499; + mad.lo.s32 %r2501, %r51, %r2491, %r2500; + ld.const.v4.u8 {%rs1570, %rs1571, %rs1572, %rs1573}, [matrix+784]; + cvt.u32.u16 %r2502, %rs1573; + cvt.s32.s8 
%r2503, %r2502; + cvt.u32.u16 %r2504, %rs1572; + cvt.s32.s8 %r2505, %r2504; + cvt.u32.u16 %r2506, %rs1571; + cvt.s32.s8 %r2507, %r2506; + cvt.u32.u16 %r2508, %rs1570; + cvt.s32.s8 %r2509, %r2508; + mad.lo.s32 %r2510, %r173, %r2509, %r2501; + mad.lo.s32 %r2511, %r53, %r2507, %r2510; + mad.lo.s32 %r2512, %r54, %r2505, %r2511; + mad.lo.s32 %r2513, %r55, %r2503, %r2512; + ld.const.v4.u8 {%rs1578, %rs1579, %rs1580, %rs1581}, [matrix+788]; + cvt.u32.u16 %r2514, %rs1581; + cvt.s32.s8 %r2515, %r2514; + cvt.u32.u16 %r2516, %rs1580; + cvt.s32.s8 %r2517, %r2516; + cvt.u32.u16 %r2518, %rs1579; + cvt.s32.s8 %r2519, %r2518; + cvt.u32.u16 %r2520, %rs1578; + cvt.s32.s8 %r2521, %r2520; + mad.lo.s32 %r2522, %r56, %r2521, %r2513; + mad.lo.s32 %r2523, %r57, %r2519, %r2522; + mad.lo.s32 %r2524, %r58, %r2517, %r2523; + mad.lo.s32 %r2525, %r59, %r2515, %r2524; + ld.const.v4.u8 {%rs1586, %rs1587, %rs1588, %rs1589}, [matrix+792]; + cvt.u32.u16 %r2526, %rs1589; + cvt.s32.s8 %r2527, %r2526; + cvt.u32.u16 %r2528, %rs1588; + cvt.s32.s8 %r2529, %r2528; + cvt.u32.u16 %r2530, %rs1587; + cvt.s32.s8 %r2531, %r2530; + cvt.u32.u16 %r2532, %rs1586; + cvt.s32.s8 %r2533, %r2532; + mad.lo.s32 %r2534, %r61, %r2533, %r2525; + mad.lo.s32 %r2535, %r62, %r2531, %r2534; + mad.lo.s32 %r2536, %r64, %r2529, %r2535; + mad.lo.s32 %r2537, %r65, %r2527, %r2536; + ld.const.v4.u8 {%rs1594, %rs1595, %rs1596, %rs1597}, [matrix+796]; + cvt.u32.u16 %r2538, %rs1597; + cvt.s32.s8 %r2539, %r2538; + cvt.u32.u16 %r2540, %rs1596; + cvt.s32.s8 %r2541, %r2540; + cvt.u32.u16 %r2542, %rs1595; + cvt.s32.s8 %r2543, %r2542; + cvt.u32.u16 %r2544, %rs1594; + cvt.s32.s8 %r2545, %r2544; + mad.lo.s32 %r2546, %r67, %r2545, %r2537; + mad.lo.s32 %r2547, %r68, %r2543, %r2546; + mad.lo.s32 %r2548, %r69, %r2541, %r2547; + mad.lo.s32 %r2549, %r70, %r2539, %r2548; + ld.const.v4.u8 {%rs1602, %rs1603, %rs1604, %rs1605}, [matrix+800]; + cvt.u32.u16 %r2550, %rs1605; + cvt.s32.s8 %r2551, %r2550; + cvt.u32.u16 %r2552, %rs1604; + cvt.s32.s8 %r2553, %r2552; + cvt.u32.u16 %r2554, %rs1603; + cvt.s32.s8 %r2555, %r2554; + cvt.u32.u16 %r2556, %rs1602; + cvt.s32.s8 %r2557, %r2556; + mad.lo.s32 %r2558, %r222, %r2557, %r2549; + mad.lo.s32 %r2559, %r72, %r2555, %r2558; + mad.lo.s32 %r2560, %r73, %r2553, %r2559; + mad.lo.s32 %r2561, %r74, %r2551, %r2560; + ld.const.v4.u8 {%rs1610, %rs1611, %rs1612, %rs1613}, [matrix+804]; + cvt.u32.u16 %r2562, %rs1613; + cvt.s32.s8 %r2563, %r2562; + cvt.u32.u16 %r2564, %rs1612; + cvt.s32.s8 %r2565, %r2564; + cvt.u32.u16 %r2566, %rs1611; + cvt.s32.s8 %r2567, %r2566; + cvt.u32.u16 %r2568, %rs1610; + cvt.s32.s8 %r2569, %r2568; + mad.lo.s32 %r2570, %r75, %r2569, %r2561; + mad.lo.s32 %r2571, %r76, %r2567, %r2570; + mad.lo.s32 %r2572, %r77, %r2565, %r2571; + mad.lo.s32 %r2573, %r78, %r2563, %r2572; + ld.const.v4.u8 {%rs1618, %rs1619, %rs1620, %rs1621}, [matrix+808]; + cvt.u32.u16 %r2574, %rs1621; + cvt.s32.s8 %r2575, %r2574; + cvt.u32.u16 %r2576, %rs1620; + cvt.s32.s8 %r2577, %r2576; + cvt.u32.u16 %r2578, %rs1619; + cvt.s32.s8 %r2579, %r2578; + cvt.u32.u16 %r2580, %rs1618; + cvt.s32.s8 %r2581, %r2580; + mad.lo.s32 %r2582, %r80, %r2581, %r2573; + mad.lo.s32 %r2583, %r81, %r2579, %r2582; + mad.lo.s32 %r2584, %r83, %r2577, %r2583; + mad.lo.s32 %r2585, %r84, %r2575, %r2584; + ld.const.v4.u8 {%rs1626, %rs1627, %rs1628, %rs1629}, [matrix+812]; + cvt.u32.u16 %r2586, %rs1629; + cvt.s32.s8 %r2587, %r2586; + cvt.u32.u16 %r2588, %rs1628; + cvt.s32.s8 %r2589, %r2588; + cvt.u32.u16 %r2590, %rs1627; + cvt.s32.s8 %r2591, %r2590; + cvt.u32.u16 %r2592, %rs1626; + cvt.s32.s8 
%r2593, %r2592; + mad.lo.s32 %r2594, %r86, %r2593, %r2585; + mad.lo.s32 %r2595, %r87, %r2591, %r2594; + mad.lo.s32 %r2596, %r88, %r2589, %r2595; + mad.lo.s32 %r2597, %r89, %r2587, %r2596; + ld.const.v4.u8 {%rs1634, %rs1635, %rs1636, %rs1637}, [matrix+816]; + cvt.u32.u16 %r2598, %rs1637; + cvt.s32.s8 %r2599, %r2598; + cvt.u32.u16 %r2600, %rs1636; + cvt.s32.s8 %r2601, %r2600; + cvt.u32.u16 %r2602, %rs1635; + cvt.s32.s8 %r2603, %r2602; + cvt.u32.u16 %r2604, %rs1634; + cvt.s32.s8 %r2605, %r2604; + mad.lo.s32 %r2606, %r271, %r2605, %r2597; + mad.lo.s32 %r2607, %r91, %r2603, %r2606; + mad.lo.s32 %r2608, %r93, %r2601, %r2607; + mad.lo.s32 %r2609, %r94, %r2599, %r2608; + ld.const.v4.u8 {%rs1642, %rs1643, %rs1644, %rs1645}, [matrix+820]; + cvt.u32.u16 %r2610, %rs1645; + cvt.s32.s8 %r2611, %r2610; + cvt.u32.u16 %r2612, %rs1644; + cvt.s32.s8 %r2613, %r2612; + cvt.u32.u16 %r2614, %rs1643; + cvt.s32.s8 %r2615, %r2614; + cvt.u32.u16 %r2616, %rs1642; + cvt.s32.s8 %r2617, %r2616; + mad.lo.s32 %r2618, %r96, %r2617, %r2609; + mad.lo.s32 %r2619, %r97, %r2615, %r2618; + mad.lo.s32 %r2620, %r99, %r2613, %r2619; + mad.lo.s32 %r2621, %r100, %r2611, %r2620; + ld.const.v4.u8 {%rs1650, %rs1651, %rs1652, %rs1653}, [matrix+824]; + cvt.u32.u16 %r2622, %rs1653; + cvt.s32.s8 %r2623, %r2622; + cvt.u32.u16 %r2624, %rs1652; + cvt.s32.s8 %r2625, %r2624; + cvt.u32.u16 %r2626, %rs1651; + cvt.s32.s8 %r2627, %r2626; + cvt.u32.u16 %r2628, %rs1650; + cvt.s32.s8 %r2629, %r2628; + mad.lo.s32 %r2630, %r103, %r2629, %r2621; + mad.lo.s32 %r2631, %r104, %r2627, %r2630; + mad.lo.s32 %r2632, %r107, %r2625, %r2631; + mad.lo.s32 %r2633, %r108, %r2623, %r2632; + ld.const.v4.u8 {%rs1658, %rs1659, %rs1660, %rs1661}, [matrix+828]; + cvt.u32.u16 %r2634, %rs1661; + cvt.s32.s8 %r2635, %r2634; + cvt.u32.u16 %r2636, %rs1660; + cvt.s32.s8 %r2637, %r2636; + cvt.u32.u16 %r2638, %rs1659; + cvt.s32.s8 %r2639, %r2638; + cvt.u32.u16 %r2640, %rs1658; + cvt.s32.s8 %r2641, %r2640; + mad.lo.s32 %r2642, %r111, %r2641, %r2633; + mad.lo.s32 %r2643, %r112, %r2639, %r2642; + mad.lo.s32 %r2644, %r114, %r2637, %r2643; + mad.lo.s32 %r2645, %r115, %r2635, %r2644; + ld.const.v4.u8 {%rs1666, %rs1667, %rs1668, %rs1669}, [matrix+832]; + cvt.u32.u16 %r2646, %rs1669; + cvt.s32.s8 %r2647, %r2646; + cvt.u32.u16 %r2648, %rs1668; + cvt.s32.s8 %r2649, %r2648; + cvt.u32.u16 %r2650, %rs1666; + cvt.s32.s8 %r2651, %r2650; + cvt.u32.u16 %r2652, %rs1667; + cvt.s32.s8 %r2653, %r2652; + mul.lo.s32 %r2654, %r34, %r2653; + mad.lo.s32 %r2655, %r124, %r2651, %r2654; + mad.lo.s32 %r2656, %r35, %r2649, %r2655; + mad.lo.s32 %r2657, %r36, %r2647, %r2656; + ld.const.v4.u8 {%rs1674, %rs1675, %rs1676, %rs1677}, [matrix+836]; + cvt.u32.u16 %r2658, %rs1677; + cvt.s32.s8 %r2659, %r2658; + cvt.u32.u16 %r2660, %rs1676; + cvt.s32.s8 %r2661, %r2660; + cvt.u32.u16 %r2662, %rs1675; + cvt.s32.s8 %r2663, %r2662; + cvt.u32.u16 %r2664, %rs1674; + cvt.s32.s8 %r2665, %r2664; + mad.lo.s32 %r2666, %r37, %r2665, %r2657; + mad.lo.s32 %r2667, %r38, %r2663, %r2666; + mad.lo.s32 %r2668, %r39, %r2661, %r2667; + mad.lo.s32 %r2669, %r40, %r2659, %r2668; + ld.const.v4.u8 {%rs1682, %rs1683, %rs1684, %rs1685}, [matrix+840]; + cvt.u32.u16 %r2670, %rs1685; + cvt.s32.s8 %r2671, %r2670; + cvt.u32.u16 %r2672, %rs1684; + cvt.s32.s8 %r2673, %r2672; + cvt.u32.u16 %r2674, %rs1683; + cvt.s32.s8 %r2675, %r2674; + cvt.u32.u16 %r2676, %rs1682; + cvt.s32.s8 %r2677, %r2676; + mad.lo.s32 %r2678, %r42, %r2677, %r2669; + mad.lo.s32 %r2679, %r43, %r2675, %r2678; + mad.lo.s32 %r2680, %r45, %r2673, %r2679; + mad.lo.s32 %r2681, %r46, %r2671, 
%r2680; + ld.const.v4.u8 {%rs1690, %rs1691, %rs1692, %rs1693}, [matrix+844]; + cvt.u32.u16 %r2682, %rs1693; + cvt.s32.s8 %r2683, %r2682; + cvt.u32.u16 %r2684, %rs1692; + cvt.s32.s8 %r2685, %r2684; + cvt.u32.u16 %r2686, %rs1691; + cvt.s32.s8 %r2687, %r2686; + cvt.u32.u16 %r2688, %rs1690; + cvt.s32.s8 %r2689, %r2688; + mad.lo.s32 %r2690, %r48, %r2689, %r2681; + mad.lo.s32 %r2691, %r49, %r2687, %r2690; + mad.lo.s32 %r2692, %r50, %r2685, %r2691; + mad.lo.s32 %r2693, %r51, %r2683, %r2692; + ld.const.v4.u8 {%rs1698, %rs1699, %rs1700, %rs1701}, [matrix+848]; + cvt.u32.u16 %r2694, %rs1701; + cvt.s32.s8 %r2695, %r2694; + cvt.u32.u16 %r2696, %rs1700; + cvt.s32.s8 %r2697, %r2696; + cvt.u32.u16 %r2698, %rs1699; + cvt.s32.s8 %r2699, %r2698; + cvt.u32.u16 %r2700, %rs1698; + cvt.s32.s8 %r2701, %r2700; + mad.lo.s32 %r2702, %r173, %r2701, %r2693; + mad.lo.s32 %r2703, %r53, %r2699, %r2702; + mad.lo.s32 %r2704, %r54, %r2697, %r2703; + mad.lo.s32 %r2705, %r55, %r2695, %r2704; + ld.const.v4.u8 {%rs1706, %rs1707, %rs1708, %rs1709}, [matrix+852]; + cvt.u32.u16 %r2706, %rs1709; + cvt.s32.s8 %r2707, %r2706; + cvt.u32.u16 %r2708, %rs1708; + cvt.s32.s8 %r2709, %r2708; + cvt.u32.u16 %r2710, %rs1707; + cvt.s32.s8 %r2711, %r2710; + cvt.u32.u16 %r2712, %rs1706; + cvt.s32.s8 %r2713, %r2712; + mad.lo.s32 %r2714, %r56, %r2713, %r2705; + mad.lo.s32 %r2715, %r57, %r2711, %r2714; + mad.lo.s32 %r2716, %r58, %r2709, %r2715; + mad.lo.s32 %r2717, %r59, %r2707, %r2716; + ld.const.v4.u8 {%rs1714, %rs1715, %rs1716, %rs1717}, [matrix+856]; + cvt.u32.u16 %r2718, %rs1717; + cvt.s32.s8 %r2719, %r2718; + cvt.u32.u16 %r2720, %rs1716; + cvt.s32.s8 %r2721, %r2720; + cvt.u32.u16 %r2722, %rs1715; + cvt.s32.s8 %r2723, %r2722; + cvt.u32.u16 %r2724, %rs1714; + cvt.s32.s8 %r2725, %r2724; + mad.lo.s32 %r2726, %r61, %r2725, %r2717; + mad.lo.s32 %r2727, %r62, %r2723, %r2726; + mad.lo.s32 %r2728, %r64, %r2721, %r2727; + mad.lo.s32 %r2729, %r65, %r2719, %r2728; + ld.const.v4.u8 {%rs1722, %rs1723, %rs1724, %rs1725}, [matrix+860]; + cvt.u32.u16 %r2730, %rs1725; + cvt.s32.s8 %r2731, %r2730; + cvt.u32.u16 %r2732, %rs1724; + cvt.s32.s8 %r2733, %r2732; + cvt.u32.u16 %r2734, %rs1723; + cvt.s32.s8 %r2735, %r2734; + cvt.u32.u16 %r2736, %rs1722; + cvt.s32.s8 %r2737, %r2736; + mad.lo.s32 %r2738, %r67, %r2737, %r2729; + mad.lo.s32 %r2739, %r68, %r2735, %r2738; + mad.lo.s32 %r2740, %r69, %r2733, %r2739; + mad.lo.s32 %r2741, %r70, %r2731, %r2740; + ld.const.v4.u8 {%rs1730, %rs1731, %rs1732, %rs1733}, [matrix+864]; + cvt.u32.u16 %r2742, %rs1733; + cvt.s32.s8 %r2743, %r2742; + cvt.u32.u16 %r2744, %rs1732; + cvt.s32.s8 %r2745, %r2744; + cvt.u32.u16 %r2746, %rs1731; + cvt.s32.s8 %r2747, %r2746; + cvt.u32.u16 %r2748, %rs1730; + cvt.s32.s8 %r2749, %r2748; + mad.lo.s32 %r2750, %r222, %r2749, %r2741; + mad.lo.s32 %r2751, %r72, %r2747, %r2750; + mad.lo.s32 %r2752, %r73, %r2745, %r2751; + mad.lo.s32 %r2753, %r74, %r2743, %r2752; + ld.const.v4.u8 {%rs1738, %rs1739, %rs1740, %rs1741}, [matrix+868]; + cvt.u32.u16 %r2754, %rs1741; + cvt.s32.s8 %r2755, %r2754; + cvt.u32.u16 %r2756, %rs1740; + cvt.s32.s8 %r2757, %r2756; + cvt.u32.u16 %r2758, %rs1739; + cvt.s32.s8 %r2759, %r2758; + cvt.u32.u16 %r2760, %rs1738; + cvt.s32.s8 %r2761, %r2760; + mad.lo.s32 %r2762, %r75, %r2761, %r2753; + mad.lo.s32 %r2763, %r76, %r2759, %r2762; + mad.lo.s32 %r2764, %r77, %r2757, %r2763; + mad.lo.s32 %r2765, %r78, %r2755, %r2764; + ld.const.v4.u8 {%rs1746, %rs1747, %rs1748, %rs1749}, [matrix+872]; + cvt.u32.u16 %r2766, %rs1749; + cvt.s32.s8 %r2767, %r2766; + cvt.u32.u16 %r2768, %rs1748; + cvt.s32.s8 
%r2769, %r2768; + cvt.u32.u16 %r2770, %rs1747; + cvt.s32.s8 %r2771, %r2770; + cvt.u32.u16 %r2772, %rs1746; + cvt.s32.s8 %r2773, %r2772; + mad.lo.s32 %r2774, %r80, %r2773, %r2765; + mad.lo.s32 %r2775, %r81, %r2771, %r2774; + mad.lo.s32 %r2776, %r83, %r2769, %r2775; + mad.lo.s32 %r2777, %r84, %r2767, %r2776; + ld.const.v4.u8 {%rs1754, %rs1755, %rs1756, %rs1757}, [matrix+876]; + cvt.u32.u16 %r2778, %rs1757; + cvt.s32.s8 %r2779, %r2778; + cvt.u32.u16 %r2780, %rs1756; + cvt.s32.s8 %r2781, %r2780; + cvt.u32.u16 %r2782, %rs1755; + cvt.s32.s8 %r2783, %r2782; + cvt.u32.u16 %r2784, %rs1754; + cvt.s32.s8 %r2785, %r2784; + mad.lo.s32 %r2786, %r86, %r2785, %r2777; + mad.lo.s32 %r2787, %r87, %r2783, %r2786; + mad.lo.s32 %r2788, %r88, %r2781, %r2787; + mad.lo.s32 %r2789, %r89, %r2779, %r2788; + ld.const.v4.u8 {%rs1762, %rs1763, %rs1764, %rs1765}, [matrix+880]; + cvt.u32.u16 %r2790, %rs1765; + cvt.s32.s8 %r2791, %r2790; + cvt.u32.u16 %r2792, %rs1764; + cvt.s32.s8 %r2793, %r2792; + cvt.u32.u16 %r2794, %rs1763; + cvt.s32.s8 %r2795, %r2794; + cvt.u32.u16 %r2796, %rs1762; + cvt.s32.s8 %r2797, %r2796; + mad.lo.s32 %r2798, %r271, %r2797, %r2789; + mad.lo.s32 %r2799, %r91, %r2795, %r2798; + mad.lo.s32 %r2800, %r93, %r2793, %r2799; + mad.lo.s32 %r2801, %r94, %r2791, %r2800; + ld.const.v4.u8 {%rs1770, %rs1771, %rs1772, %rs1773}, [matrix+884]; + cvt.u32.u16 %r2802, %rs1773; + cvt.s32.s8 %r2803, %r2802; + cvt.u32.u16 %r2804, %rs1772; + cvt.s32.s8 %r2805, %r2804; + cvt.u32.u16 %r2806, %rs1771; + cvt.s32.s8 %r2807, %r2806; + cvt.u32.u16 %r2808, %rs1770; + cvt.s32.s8 %r2809, %r2808; + mad.lo.s32 %r2810, %r96, %r2809, %r2801; + mad.lo.s32 %r2811, %r97, %r2807, %r2810; + mad.lo.s32 %r2812, %r99, %r2805, %r2811; + mad.lo.s32 %r2813, %r100, %r2803, %r2812; + ld.const.v4.u8 {%rs1778, %rs1779, %rs1780, %rs1781}, [matrix+888]; + cvt.u32.u16 %r2814, %rs1781; + cvt.s32.s8 %r2815, %r2814; + cvt.u32.u16 %r2816, %rs1780; + cvt.s32.s8 %r2817, %r2816; + cvt.u32.u16 %r2818, %rs1779; + cvt.s32.s8 %r2819, %r2818; + cvt.u32.u16 %r2820, %rs1778; + cvt.s32.s8 %r2821, %r2820; + mad.lo.s32 %r2822, %r103, %r2821, %r2813; + mad.lo.s32 %r2823, %r104, %r2819, %r2822; + mad.lo.s32 %r2824, %r107, %r2817, %r2823; + mad.lo.s32 %r2825, %r108, %r2815, %r2824; + ld.const.v4.u8 {%rs1786, %rs1787, %rs1788, %rs1789}, [matrix+892]; + cvt.u32.u16 %r2826, %rs1789; + cvt.s32.s8 %r2827, %r2826; + cvt.u32.u16 %r2828, %rs1788; + cvt.s32.s8 %r2829, %r2828; + cvt.u32.u16 %r2830, %rs1787; + cvt.s32.s8 %r2831, %r2830; + cvt.u32.u16 %r2832, %rs1786; + cvt.s32.s8 %r2833, %r2832; + mad.lo.s32 %r2834, %r111, %r2833, %r2825; + mad.lo.s32 %r2835, %r112, %r2831, %r2834; + mad.lo.s32 %r2836, %r114, %r2829, %r2835; + mad.lo.s32 %r2837, %r115, %r2827, %r2836; + shr.u32 %r2838, %r2645, 6; + and.b32 %r2839, %r2838, 240; + shr.u32 %r2840, %r2837, 10; + or.b32 %r2841, %r2840, %r2839; + xor.b32 %r2842, %r18, %r2841; + cvt.u64.u32 %rd386, %r2842; + ld.const.v4.u8 {%rs1794, %rs1795, %rs1796, %rs1797}, [matrix+896]; + cvt.u32.u16 %r2843, %rs1797; + cvt.s32.s8 %r2844, %r2843; + cvt.u32.u16 %r2845, %rs1796; + cvt.s32.s8 %r2846, %r2845; + cvt.u32.u16 %r2847, %rs1794; + cvt.s32.s8 %r2848, %r2847; + cvt.u32.u16 %r2849, %rs1795; + cvt.s32.s8 %r2850, %r2849; + mul.lo.s32 %r2851, %r34, %r2850; + mad.lo.s32 %r2852, %r124, %r2848, %r2851; + mad.lo.s32 %r2853, %r35, %r2846, %r2852; + mad.lo.s32 %r2854, %r36, %r2844, %r2853; + ld.const.v4.u8 {%rs1802, %rs1803, %rs1804, %rs1805}, [matrix+900]; + cvt.u32.u16 %r2855, %rs1805; + cvt.s32.s8 %r2856, %r2855; + cvt.u32.u16 %r2857, %rs1804; + cvt.s32.s8 
%r2858, %r2857; + cvt.u32.u16 %r2859, %rs1803; + cvt.s32.s8 %r2860, %r2859; + cvt.u32.u16 %r2861, %rs1802; + cvt.s32.s8 %r2862, %r2861; + mad.lo.s32 %r2863, %r37, %r2862, %r2854; + mad.lo.s32 %r2864, %r38, %r2860, %r2863; + mad.lo.s32 %r2865, %r39, %r2858, %r2864; + mad.lo.s32 %r2866, %r40, %r2856, %r2865; + ld.const.v4.u8 {%rs1810, %rs1811, %rs1812, %rs1813}, [matrix+904]; + cvt.u32.u16 %r2867, %rs1813; + cvt.s32.s8 %r2868, %r2867; + cvt.u32.u16 %r2869, %rs1812; + cvt.s32.s8 %r2870, %r2869; + cvt.u32.u16 %r2871, %rs1811; + cvt.s32.s8 %r2872, %r2871; + cvt.u32.u16 %r2873, %rs1810; + cvt.s32.s8 %r2874, %r2873; + mad.lo.s32 %r2875, %r42, %r2874, %r2866; + mad.lo.s32 %r2876, %r43, %r2872, %r2875; + mad.lo.s32 %r2877, %r45, %r2870, %r2876; + mad.lo.s32 %r2878, %r46, %r2868, %r2877; + ld.const.v4.u8 {%rs1818, %rs1819, %rs1820, %rs1821}, [matrix+908]; + cvt.u32.u16 %r2879, %rs1821; + cvt.s32.s8 %r2880, %r2879; + cvt.u32.u16 %r2881, %rs1820; + cvt.s32.s8 %r2882, %r2881; + cvt.u32.u16 %r2883, %rs1819; + cvt.s32.s8 %r2884, %r2883; + cvt.u32.u16 %r2885, %rs1818; + cvt.s32.s8 %r2886, %r2885; + mad.lo.s32 %r2887, %r48, %r2886, %r2878; + mad.lo.s32 %r2888, %r49, %r2884, %r2887; + mad.lo.s32 %r2889, %r50, %r2882, %r2888; + mad.lo.s32 %r2890, %r51, %r2880, %r2889; + ld.const.v4.u8 {%rs1826, %rs1827, %rs1828, %rs1829}, [matrix+912]; + cvt.u32.u16 %r2891, %rs1829; + cvt.s32.s8 %r2892, %r2891; + cvt.u32.u16 %r2893, %rs1828; + cvt.s32.s8 %r2894, %r2893; + cvt.u32.u16 %r2895, %rs1827; + cvt.s32.s8 %r2896, %r2895; + cvt.u32.u16 %r2897, %rs1826; + cvt.s32.s8 %r2898, %r2897; + mad.lo.s32 %r2899, %r173, %r2898, %r2890; + mad.lo.s32 %r2900, %r53, %r2896, %r2899; + mad.lo.s32 %r2901, %r54, %r2894, %r2900; + mad.lo.s32 %r2902, %r55, %r2892, %r2901; + ld.const.v4.u8 {%rs1834, %rs1835, %rs1836, %rs1837}, [matrix+916]; + cvt.u32.u16 %r2903, %rs1837; + cvt.s32.s8 %r2904, %r2903; + cvt.u32.u16 %r2905, %rs1836; + cvt.s32.s8 %r2906, %r2905; + cvt.u32.u16 %r2907, %rs1835; + cvt.s32.s8 %r2908, %r2907; + cvt.u32.u16 %r2909, %rs1834; + cvt.s32.s8 %r2910, %r2909; + mad.lo.s32 %r2911, %r56, %r2910, %r2902; + mad.lo.s32 %r2912, %r57, %r2908, %r2911; + mad.lo.s32 %r2913, %r58, %r2906, %r2912; + mad.lo.s32 %r2914, %r59, %r2904, %r2913; + ld.const.v4.u8 {%rs1842, %rs1843, %rs1844, %rs1845}, [matrix+920]; + cvt.u32.u16 %r2915, %rs1845; + cvt.s32.s8 %r2916, %r2915; + cvt.u32.u16 %r2917, %rs1844; + cvt.s32.s8 %r2918, %r2917; + cvt.u32.u16 %r2919, %rs1843; + cvt.s32.s8 %r2920, %r2919; + cvt.u32.u16 %r2921, %rs1842; + cvt.s32.s8 %r2922, %r2921; + mad.lo.s32 %r2923, %r61, %r2922, %r2914; + mad.lo.s32 %r2924, %r62, %r2920, %r2923; + mad.lo.s32 %r2925, %r64, %r2918, %r2924; + mad.lo.s32 %r2926, %r65, %r2916, %r2925; + ld.const.v4.u8 {%rs1850, %rs1851, %rs1852, %rs1853}, [matrix+924]; + cvt.u32.u16 %r2927, %rs1853; + cvt.s32.s8 %r2928, %r2927; + cvt.u32.u16 %r2929, %rs1852; + cvt.s32.s8 %r2930, %r2929; + cvt.u32.u16 %r2931, %rs1851; + cvt.s32.s8 %r2932, %r2931; + cvt.u32.u16 %r2933, %rs1850; + cvt.s32.s8 %r2934, %r2933; + mad.lo.s32 %r2935, %r67, %r2934, %r2926; + mad.lo.s32 %r2936, %r68, %r2932, %r2935; + mad.lo.s32 %r2937, %r69, %r2930, %r2936; + mad.lo.s32 %r2938, %r70, %r2928, %r2937; + ld.const.v4.u8 {%rs1858, %rs1859, %rs1860, %rs1861}, [matrix+928]; + cvt.u32.u16 %r2939, %rs1861; + cvt.s32.s8 %r2940, %r2939; + cvt.u32.u16 %r2941, %rs1860; + cvt.s32.s8 %r2942, %r2941; + cvt.u32.u16 %r2943, %rs1859; + cvt.s32.s8 %r2944, %r2943; + cvt.u32.u16 %r2945, %rs1858; + cvt.s32.s8 %r2946, %r2945; + mad.lo.s32 %r2947, %r222, %r2946, %r2938; + 
mad.lo.s32 %r2948, %r72, %r2944, %r2947; + mad.lo.s32 %r2949, %r73, %r2942, %r2948; + mad.lo.s32 %r2950, %r74, %r2940, %r2949; + ld.const.v4.u8 {%rs1866, %rs1867, %rs1868, %rs1869}, [matrix+932]; + cvt.u32.u16 %r2951, %rs1869; + cvt.s32.s8 %r2952, %r2951; + cvt.u32.u16 %r2953, %rs1868; + cvt.s32.s8 %r2954, %r2953; + cvt.u32.u16 %r2955, %rs1867; + cvt.s32.s8 %r2956, %r2955; + cvt.u32.u16 %r2957, %rs1866; + cvt.s32.s8 %r2958, %r2957; + mad.lo.s32 %r2959, %r75, %r2958, %r2950; + mad.lo.s32 %r2960, %r76, %r2956, %r2959; + mad.lo.s32 %r2961, %r77, %r2954, %r2960; + mad.lo.s32 %r2962, %r78, %r2952, %r2961; + ld.const.v4.u8 {%rs1874, %rs1875, %rs1876, %rs1877}, [matrix+936]; + cvt.u32.u16 %r2963, %rs1877; + cvt.s32.s8 %r2964, %r2963; + cvt.u32.u16 %r2965, %rs1876; + cvt.s32.s8 %r2966, %r2965; + cvt.u32.u16 %r2967, %rs1875; + cvt.s32.s8 %r2968, %r2967; + cvt.u32.u16 %r2969, %rs1874; + cvt.s32.s8 %r2970, %r2969; + mad.lo.s32 %r2971, %r80, %r2970, %r2962; + mad.lo.s32 %r2972, %r81, %r2968, %r2971; + mad.lo.s32 %r2973, %r83, %r2966, %r2972; + mad.lo.s32 %r2974, %r84, %r2964, %r2973; + ld.const.v4.u8 {%rs1882, %rs1883, %rs1884, %rs1885}, [matrix+940]; + cvt.u32.u16 %r2975, %rs1885; + cvt.s32.s8 %r2976, %r2975; + cvt.u32.u16 %r2977, %rs1884; + cvt.s32.s8 %r2978, %r2977; + cvt.u32.u16 %r2979, %rs1883; + cvt.s32.s8 %r2980, %r2979; + cvt.u32.u16 %r2981, %rs1882; + cvt.s32.s8 %r2982, %r2981; + mad.lo.s32 %r2983, %r86, %r2982, %r2974; + mad.lo.s32 %r2984, %r87, %r2980, %r2983; + mad.lo.s32 %r2985, %r88, %r2978, %r2984; + mad.lo.s32 %r2986, %r89, %r2976, %r2985; + ld.const.v4.u8 {%rs1890, %rs1891, %rs1892, %rs1893}, [matrix+944]; + cvt.u32.u16 %r2987, %rs1893; + cvt.s32.s8 %r2988, %r2987; + cvt.u32.u16 %r2989, %rs1892; + cvt.s32.s8 %r2990, %r2989; + cvt.u32.u16 %r2991, %rs1891; + cvt.s32.s8 %r2992, %r2991; + cvt.u32.u16 %r2993, %rs1890; + cvt.s32.s8 %r2994, %r2993; + mad.lo.s32 %r2995, %r271, %r2994, %r2986; + mad.lo.s32 %r2996, %r91, %r2992, %r2995; + mad.lo.s32 %r2997, %r93, %r2990, %r2996; + mad.lo.s32 %r2998, %r94, %r2988, %r2997; + ld.const.v4.u8 {%rs1898, %rs1899, %rs1900, %rs1901}, [matrix+948]; + cvt.u32.u16 %r2999, %rs1901; + cvt.s32.s8 %r3000, %r2999; + cvt.u32.u16 %r3001, %rs1900; + cvt.s32.s8 %r3002, %r3001; + cvt.u32.u16 %r3003, %rs1899; + cvt.s32.s8 %r3004, %r3003; + cvt.u32.u16 %r3005, %rs1898; + cvt.s32.s8 %r3006, %r3005; + mad.lo.s32 %r3007, %r96, %r3006, %r2998; + mad.lo.s32 %r3008, %r97, %r3004, %r3007; + mad.lo.s32 %r3009, %r99, %r3002, %r3008; + mad.lo.s32 %r3010, %r100, %r3000, %r3009; + ld.const.v4.u8 {%rs1906, %rs1907, %rs1908, %rs1909}, [matrix+952]; + cvt.u32.u16 %r3011, %rs1909; + cvt.s32.s8 %r3012, %r3011; + cvt.u32.u16 %r3013, %rs1908; + cvt.s32.s8 %r3014, %r3013; + cvt.u32.u16 %r3015, %rs1907; + cvt.s32.s8 %r3016, %r3015; + cvt.u32.u16 %r3017, %rs1906; + cvt.s32.s8 %r3018, %r3017; + mad.lo.s32 %r3019, %r103, %r3018, %r3010; + mad.lo.s32 %r3020, %r104, %r3016, %r3019; + mad.lo.s32 %r3021, %r107, %r3014, %r3020; + mad.lo.s32 %r3022, %r108, %r3012, %r3021; + ld.const.v4.u8 {%rs1914, %rs1915, %rs1916, %rs1917}, [matrix+956]; + cvt.u32.u16 %r3023, %rs1917; + cvt.s32.s8 %r3024, %r3023; + cvt.u32.u16 %r3025, %rs1916; + cvt.s32.s8 %r3026, %r3025; + cvt.u32.u16 %r3027, %rs1915; + cvt.s32.s8 %r3028, %r3027; + cvt.u32.u16 %r3029, %rs1914; + cvt.s32.s8 %r3030, %r3029; + mad.lo.s32 %r3031, %r111, %r3030, %r3022; + mad.lo.s32 %r3032, %r112, %r3028, %r3031; + mad.lo.s32 %r3033, %r114, %r3026, %r3032; + mad.lo.s32 %r3034, %r115, %r3024, %r3033; + ld.const.v4.u8 {%rs1922, %rs1923, %rs1924, 
%rs1925}, [matrix+960]; + cvt.u32.u16 %r3035, %rs1925; + cvt.s32.s8 %r3036, %r3035; + cvt.u32.u16 %r3037, %rs1924; + cvt.s32.s8 %r3038, %r3037; + cvt.u32.u16 %r3039, %rs1922; + cvt.s32.s8 %r3040, %r3039; + cvt.u32.u16 %r3041, %rs1923; + cvt.s32.s8 %r3042, %r3041; + mul.lo.s32 %r3043, %r34, %r3042; + mad.lo.s32 %r3044, %r124, %r3040, %r3043; + mad.lo.s32 %r3045, %r35, %r3038, %r3044; + mad.lo.s32 %r3046, %r36, %r3036, %r3045; + ld.const.v4.u8 {%rs1930, %rs1931, %rs1932, %rs1933}, [matrix+964]; + cvt.u32.u16 %r3047, %rs1933; + cvt.s32.s8 %r3048, %r3047; + cvt.u32.u16 %r3049, %rs1932; + cvt.s32.s8 %r3050, %r3049; + cvt.u32.u16 %r3051, %rs1931; + cvt.s32.s8 %r3052, %r3051; + cvt.u32.u16 %r3053, %rs1930; + cvt.s32.s8 %r3054, %r3053; + mad.lo.s32 %r3055, %r37, %r3054, %r3046; + mad.lo.s32 %r3056, %r38, %r3052, %r3055; + mad.lo.s32 %r3057, %r39, %r3050, %r3056; + mad.lo.s32 %r3058, %r40, %r3048, %r3057; + ld.const.v4.u8 {%rs1938, %rs1939, %rs1940, %rs1941}, [matrix+968]; + cvt.u32.u16 %r3059, %rs1941; + cvt.s32.s8 %r3060, %r3059; + cvt.u32.u16 %r3061, %rs1940; + cvt.s32.s8 %r3062, %r3061; + cvt.u32.u16 %r3063, %rs1939; + cvt.s32.s8 %r3064, %r3063; + cvt.u32.u16 %r3065, %rs1938; + cvt.s32.s8 %r3066, %r3065; + mad.lo.s32 %r3067, %r42, %r3066, %r3058; + mad.lo.s32 %r3068, %r43, %r3064, %r3067; + mad.lo.s32 %r3069, %r45, %r3062, %r3068; + mad.lo.s32 %r3070, %r46, %r3060, %r3069; + ld.const.v4.u8 {%rs1946, %rs1947, %rs1948, %rs1949}, [matrix+972]; + cvt.u32.u16 %r3071, %rs1949; + cvt.s32.s8 %r3072, %r3071; + cvt.u32.u16 %r3073, %rs1948; + cvt.s32.s8 %r3074, %r3073; + cvt.u32.u16 %r3075, %rs1947; + cvt.s32.s8 %r3076, %r3075; + cvt.u32.u16 %r3077, %rs1946; + cvt.s32.s8 %r3078, %r3077; + mad.lo.s32 %r3079, %r48, %r3078, %r3070; + mad.lo.s32 %r3080, %r49, %r3076, %r3079; + mad.lo.s32 %r3081, %r50, %r3074, %r3080; + mad.lo.s32 %r3082, %r51, %r3072, %r3081; + ld.const.v4.u8 {%rs1954, %rs1955, %rs1956, %rs1957}, [matrix+976]; + cvt.u32.u16 %r3083, %rs1957; + cvt.s32.s8 %r3084, %r3083; + cvt.u32.u16 %r3085, %rs1956; + cvt.s32.s8 %r3086, %r3085; + cvt.u32.u16 %r3087, %rs1955; + cvt.s32.s8 %r3088, %r3087; + cvt.u32.u16 %r3089, %rs1954; + cvt.s32.s8 %r3090, %r3089; + mad.lo.s32 %r3091, %r173, %r3090, %r3082; + mad.lo.s32 %r3092, %r53, %r3088, %r3091; + mad.lo.s32 %r3093, %r54, %r3086, %r3092; + mad.lo.s32 %r3094, %r55, %r3084, %r3093; + ld.const.v4.u8 {%rs1962, %rs1963, %rs1964, %rs1965}, [matrix+980]; + cvt.u32.u16 %r3095, %rs1965; + cvt.s32.s8 %r3096, %r3095; + cvt.u32.u16 %r3097, %rs1964; + cvt.s32.s8 %r3098, %r3097; + cvt.u32.u16 %r3099, %rs1963; + cvt.s32.s8 %r3100, %r3099; + cvt.u32.u16 %r3101, %rs1962; + cvt.s32.s8 %r3102, %r3101; + mad.lo.s32 %r3103, %r56, %r3102, %r3094; + mad.lo.s32 %r3104, %r57, %r3100, %r3103; + mad.lo.s32 %r3105, %r58, %r3098, %r3104; + mad.lo.s32 %r3106, %r59, %r3096, %r3105; + ld.const.v4.u8 {%rs1970, %rs1971, %rs1972, %rs1973}, [matrix+984]; + cvt.u32.u16 %r3107, %rs1973; + cvt.s32.s8 %r3108, %r3107; + cvt.u32.u16 %r3109, %rs1972; + cvt.s32.s8 %r3110, %r3109; + cvt.u32.u16 %r3111, %rs1971; + cvt.s32.s8 %r3112, %r3111; + cvt.u32.u16 %r3113, %rs1970; + cvt.s32.s8 %r3114, %r3113; + mad.lo.s32 %r3115, %r61, %r3114, %r3106; + mad.lo.s32 %r3116, %r62, %r3112, %r3115; + mad.lo.s32 %r3117, %r64, %r3110, %r3116; + mad.lo.s32 %r3118, %r65, %r3108, %r3117; + ld.const.v4.u8 {%rs1978, %rs1979, %rs1980, %rs1981}, [matrix+988]; + cvt.u32.u16 %r3119, %rs1981; + cvt.s32.s8 %r3120, %r3119; + cvt.u32.u16 %r3121, %rs1980; + cvt.s32.s8 %r3122, %r3121; + cvt.u32.u16 %r3123, %rs1979; + cvt.s32.s8 
%r3124, %r3123; + cvt.u32.u16 %r3125, %rs1978; + cvt.s32.s8 %r3126, %r3125; + mad.lo.s32 %r3127, %r67, %r3126, %r3118; + mad.lo.s32 %r3128, %r68, %r3124, %r3127; + mad.lo.s32 %r3129, %r69, %r3122, %r3128; + mad.lo.s32 %r3130, %r70, %r3120, %r3129; + ld.const.v4.u8 {%rs1986, %rs1987, %rs1988, %rs1989}, [matrix+992]; + cvt.u32.u16 %r3131, %rs1989; + cvt.s32.s8 %r3132, %r3131; + cvt.u32.u16 %r3133, %rs1988; + cvt.s32.s8 %r3134, %r3133; + cvt.u32.u16 %r3135, %rs1987; + cvt.s32.s8 %r3136, %r3135; + cvt.u32.u16 %r3137, %rs1986; + cvt.s32.s8 %r3138, %r3137; + mad.lo.s32 %r3139, %r222, %r3138, %r3130; + mad.lo.s32 %r3140, %r72, %r3136, %r3139; + mad.lo.s32 %r3141, %r73, %r3134, %r3140; + mad.lo.s32 %r3142, %r74, %r3132, %r3141; + ld.const.v4.u8 {%rs1994, %rs1995, %rs1996, %rs1997}, [matrix+996]; + cvt.u32.u16 %r3143, %rs1997; + cvt.s32.s8 %r3144, %r3143; + cvt.u32.u16 %r3145, %rs1996; + cvt.s32.s8 %r3146, %r3145; + cvt.u32.u16 %r3147, %rs1995; + cvt.s32.s8 %r3148, %r3147; + cvt.u32.u16 %r3149, %rs1994; + cvt.s32.s8 %r3150, %r3149; + mad.lo.s32 %r3151, %r75, %r3150, %r3142; + mad.lo.s32 %r3152, %r76, %r3148, %r3151; + mad.lo.s32 %r3153, %r77, %r3146, %r3152; + mad.lo.s32 %r3154, %r78, %r3144, %r3153; + ld.const.v4.u8 {%rs2002, %rs2003, %rs2004, %rs2005}, [matrix+1000]; + cvt.u32.u16 %r3155, %rs2005; + cvt.s32.s8 %r3156, %r3155; + cvt.u32.u16 %r3157, %rs2004; + cvt.s32.s8 %r3158, %r3157; + cvt.u32.u16 %r3159, %rs2003; + cvt.s32.s8 %r3160, %r3159; + cvt.u32.u16 %r3161, %rs2002; + cvt.s32.s8 %r3162, %r3161; + mad.lo.s32 %r3163, %r80, %r3162, %r3154; + mad.lo.s32 %r3164, %r81, %r3160, %r3163; + mad.lo.s32 %r3165, %r83, %r3158, %r3164; + mad.lo.s32 %r3166, %r84, %r3156, %r3165; + ld.const.v4.u8 {%rs2010, %rs2011, %rs2012, %rs2013}, [matrix+1004]; + cvt.u32.u16 %r3167, %rs2013; + cvt.s32.s8 %r3168, %r3167; + cvt.u32.u16 %r3169, %rs2012; + cvt.s32.s8 %r3170, %r3169; + cvt.u32.u16 %r3171, %rs2011; + cvt.s32.s8 %r3172, %r3171; + cvt.u32.u16 %r3173, %rs2010; + cvt.s32.s8 %r3174, %r3173; + mad.lo.s32 %r3175, %r86, %r3174, %r3166; + mad.lo.s32 %r3176, %r87, %r3172, %r3175; + mad.lo.s32 %r3177, %r88, %r3170, %r3176; + mad.lo.s32 %r3178, %r89, %r3168, %r3177; + ld.const.v4.u8 {%rs2018, %rs2019, %rs2020, %rs2021}, [matrix+1008]; + cvt.u32.u16 %r3179, %rs2021; + cvt.s32.s8 %r3180, %r3179; + cvt.u32.u16 %r3181, %rs2020; + cvt.s32.s8 %r3182, %r3181; + cvt.u32.u16 %r3183, %rs2019; + cvt.s32.s8 %r3184, %r3183; + cvt.u32.u16 %r3185, %rs2018; + cvt.s32.s8 %r3186, %r3185; + mad.lo.s32 %r3187, %r271, %r3186, %r3178; + mad.lo.s32 %r3188, %r91, %r3184, %r3187; + mad.lo.s32 %r3189, %r93, %r3182, %r3188; + mad.lo.s32 %r3190, %r94, %r3180, %r3189; + ld.const.v4.u8 {%rs2026, %rs2027, %rs2028, %rs2029}, [matrix+1012]; + cvt.u32.u16 %r3191, %rs2029; + cvt.s32.s8 %r3192, %r3191; + cvt.u32.u16 %r3193, %rs2028; + cvt.s32.s8 %r3194, %r3193; + cvt.u32.u16 %r3195, %rs2027; + cvt.s32.s8 %r3196, %r3195; + cvt.u32.u16 %r3197, %rs2026; + cvt.s32.s8 %r3198, %r3197; + mad.lo.s32 %r3199, %r96, %r3198, %r3190; + mad.lo.s32 %r3200, %r97, %r3196, %r3199; + mad.lo.s32 %r3201, %r99, %r3194, %r3200; + mad.lo.s32 %r3202, %r100, %r3192, %r3201; + ld.const.v4.u8 {%rs2034, %rs2035, %rs2036, %rs2037}, [matrix+1016]; + cvt.u32.u16 %r3203, %rs2037; + cvt.s32.s8 %r3204, %r3203; + cvt.u32.u16 %r3205, %rs2036; + cvt.s32.s8 %r3206, %r3205; + cvt.u32.u16 %r3207, %rs2035; + cvt.s32.s8 %r3208, %r3207; + cvt.u32.u16 %r3209, %rs2034; + cvt.s32.s8 %r3210, %r3209; + mad.lo.s32 %r3211, %r103, %r3210, %r3202; + mad.lo.s32 %r3212, %r104, %r3208, %r3211; + mad.lo.s32 
%r3213, %r107, %r3206, %r3212; + mad.lo.s32 %r3214, %r108, %r3204, %r3213; + ld.const.v4.u8 {%rs2042, %rs2043, %rs2044, %rs2045}, [matrix+1020]; + cvt.u32.u16 %r3215, %rs2045; + cvt.s32.s8 %r3216, %r3215; + cvt.u32.u16 %r3217, %rs2044; + cvt.s32.s8 %r3218, %r3217; + cvt.u32.u16 %r3219, %rs2043; + cvt.s32.s8 %r3220, %r3219; + cvt.u32.u16 %r3221, %rs2042; + cvt.s32.s8 %r3222, %r3221; + mad.lo.s32 %r3223, %r111, %r3222, %r3214; + mad.lo.s32 %r3224, %r112, %r3220, %r3223; + mad.lo.s32 %r3225, %r114, %r3218, %r3224; + mad.lo.s32 %r3226, %r115, %r3216, %r3225; + shr.u32 %r3227, %r3034, 6; + and.b32 %r3228, %r3227, 240; + shr.u32 %r3229, %r3226, 10; + or.b32 %r3230, %r3229, %r3228; + xor.b32 %r3231, %r19, %r3230; + ld.const.v4.u8 {%rs2050, %rs2051, %rs2052, %rs2053}, [matrix+1024]; + cvt.u32.u16 %r3232, %rs2053; + cvt.s32.s8 %r3233, %r3232; + cvt.u32.u16 %r3234, %rs2052; + cvt.s32.s8 %r3235, %r3234; + cvt.u32.u16 %r3236, %rs2050; + cvt.s32.s8 %r3237, %r3236; + cvt.u32.u16 %r3238, %rs2051; + cvt.s32.s8 %r3239, %r3238; + mul.lo.s32 %r3240, %r34, %r3239; + mad.lo.s32 %r3241, %r124, %r3237, %r3240; + mad.lo.s32 %r3242, %r35, %r3235, %r3241; + mad.lo.s32 %r3243, %r36, %r3233, %r3242; + ld.const.v4.u8 {%rs2058, %rs2059, %rs2060, %rs2061}, [matrix+1028]; + cvt.u32.u16 %r3244, %rs2061; + cvt.s32.s8 %r3245, %r3244; + cvt.u32.u16 %r3246, %rs2060; + cvt.s32.s8 %r3247, %r3246; + cvt.u32.u16 %r3248, %rs2059; + cvt.s32.s8 %r3249, %r3248; + cvt.u32.u16 %r3250, %rs2058; + cvt.s32.s8 %r3251, %r3250; + mad.lo.s32 %r3252, %r37, %r3251, %r3243; + mad.lo.s32 %r3253, %r38, %r3249, %r3252; + mad.lo.s32 %r3254, %r39, %r3247, %r3253; + mad.lo.s32 %r3255, %r40, %r3245, %r3254; + ld.const.v4.u8 {%rs2066, %rs2067, %rs2068, %rs2069}, [matrix+1032]; + cvt.u32.u16 %r3256, %rs2069; + cvt.s32.s8 %r3257, %r3256; + cvt.u32.u16 %r3258, %rs2068; + cvt.s32.s8 %r3259, %r3258; + cvt.u32.u16 %r3260, %rs2067; + cvt.s32.s8 %r3261, %r3260; + cvt.u32.u16 %r3262, %rs2066; + cvt.s32.s8 %r3263, %r3262; + mad.lo.s32 %r3264, %r42, %r3263, %r3255; + mad.lo.s32 %r3265, %r43, %r3261, %r3264; + mad.lo.s32 %r3266, %r45, %r3259, %r3265; + mad.lo.s32 %r3267, %r46, %r3257, %r3266; + ld.const.v4.u8 {%rs2074, %rs2075, %rs2076, %rs2077}, [matrix+1036]; + cvt.u32.u16 %r3268, %rs2077; + cvt.s32.s8 %r3269, %r3268; + cvt.u32.u16 %r3270, %rs2076; + cvt.s32.s8 %r3271, %r3270; + cvt.u32.u16 %r3272, %rs2075; + cvt.s32.s8 %r3273, %r3272; + cvt.u32.u16 %r3274, %rs2074; + cvt.s32.s8 %r3275, %r3274; + mad.lo.s32 %r3276, %r48, %r3275, %r3267; + mad.lo.s32 %r3277, %r49, %r3273, %r3276; + mad.lo.s32 %r3278, %r50, %r3271, %r3277; + mad.lo.s32 %r3279, %r51, %r3269, %r3278; + ld.const.v4.u8 {%rs2082, %rs2083, %rs2084, %rs2085}, [matrix+1040]; + cvt.u32.u16 %r3280, %rs2085; + cvt.s32.s8 %r3281, %r3280; + cvt.u32.u16 %r3282, %rs2084; + cvt.s32.s8 %r3283, %r3282; + cvt.u32.u16 %r3284, %rs2083; + cvt.s32.s8 %r3285, %r3284; + cvt.u32.u16 %r3286, %rs2082; + cvt.s32.s8 %r3287, %r3286; + mad.lo.s32 %r3288, %r173, %r3287, %r3279; + mad.lo.s32 %r3289, %r53, %r3285, %r3288; + mad.lo.s32 %r3290, %r54, %r3283, %r3289; + mad.lo.s32 %r3291, %r55, %r3281, %r3290; + ld.const.v4.u8 {%rs2090, %rs2091, %rs2092, %rs2093}, [matrix+1044]; + cvt.u32.u16 %r3292, %rs2093; + cvt.s32.s8 %r3293, %r3292; + cvt.u32.u16 %r3294, %rs2092; + cvt.s32.s8 %r3295, %r3294; + cvt.u32.u16 %r3296, %rs2091; + cvt.s32.s8 %r3297, %r3296; + cvt.u32.u16 %r3298, %rs2090; + cvt.s32.s8 %r3299, %r3298; + mad.lo.s32 %r3300, %r56, %r3299, %r3291; + mad.lo.s32 %r3301, %r57, %r3297, %r3300; + mad.lo.s32 %r3302, %r58, %r3295, 
%r3301; + mad.lo.s32 %r3303, %r59, %r3293, %r3302; + ld.const.v4.u8 {%rs2098, %rs2099, %rs2100, %rs2101}, [matrix+1048]; + cvt.u32.u16 %r3304, %rs2101; + cvt.s32.s8 %r3305, %r3304; + cvt.u32.u16 %r3306, %rs2100; + cvt.s32.s8 %r3307, %r3306; + cvt.u32.u16 %r3308, %rs2099; + cvt.s32.s8 %r3309, %r3308; + cvt.u32.u16 %r3310, %rs2098; + cvt.s32.s8 %r3311, %r3310; + mad.lo.s32 %r3312, %r61, %r3311, %r3303; + mad.lo.s32 %r3313, %r62, %r3309, %r3312; + mad.lo.s32 %r3314, %r64, %r3307, %r3313; + mad.lo.s32 %r3315, %r65, %r3305, %r3314; + ld.const.v4.u8 {%rs2106, %rs2107, %rs2108, %rs2109}, [matrix+1052]; + cvt.u32.u16 %r3316, %rs2109; + cvt.s32.s8 %r3317, %r3316; + cvt.u32.u16 %r3318, %rs2108; + cvt.s32.s8 %r3319, %r3318; + cvt.u32.u16 %r3320, %rs2107; + cvt.s32.s8 %r3321, %r3320; + cvt.u32.u16 %r3322, %rs2106; + cvt.s32.s8 %r3323, %r3322; + mad.lo.s32 %r3324, %r67, %r3323, %r3315; + mad.lo.s32 %r3325, %r68, %r3321, %r3324; + mad.lo.s32 %r3326, %r69, %r3319, %r3325; + mad.lo.s32 %r3327, %r70, %r3317, %r3326; + ld.const.v4.u8 {%rs2114, %rs2115, %rs2116, %rs2117}, [matrix+1056]; + cvt.u32.u16 %r3328, %rs2117; + cvt.s32.s8 %r3329, %r3328; + cvt.u32.u16 %r3330, %rs2116; + cvt.s32.s8 %r3331, %r3330; + cvt.u32.u16 %r3332, %rs2115; + cvt.s32.s8 %r3333, %r3332; + cvt.u32.u16 %r3334, %rs2114; + cvt.s32.s8 %r3335, %r3334; + mad.lo.s32 %r3336, %r222, %r3335, %r3327; + mad.lo.s32 %r3337, %r72, %r3333, %r3336; + mad.lo.s32 %r3338, %r73, %r3331, %r3337; + mad.lo.s32 %r3339, %r74, %r3329, %r3338; + ld.const.v4.u8 {%rs2122, %rs2123, %rs2124, %rs2125}, [matrix+1060]; + cvt.u32.u16 %r3340, %rs2125; + cvt.s32.s8 %r3341, %r3340; + cvt.u32.u16 %r3342, %rs2124; + cvt.s32.s8 %r3343, %r3342; + cvt.u32.u16 %r3344, %rs2123; + cvt.s32.s8 %r3345, %r3344; + cvt.u32.u16 %r3346, %rs2122; + cvt.s32.s8 %r3347, %r3346; + mad.lo.s32 %r3348, %r75, %r3347, %r3339; + mad.lo.s32 %r3349, %r76, %r3345, %r3348; + mad.lo.s32 %r3350, %r77, %r3343, %r3349; + mad.lo.s32 %r3351, %r78, %r3341, %r3350; + ld.const.v4.u8 {%rs2130, %rs2131, %rs2132, %rs2133}, [matrix+1064]; + cvt.u32.u16 %r3352, %rs2133; + cvt.s32.s8 %r3353, %r3352; + cvt.u32.u16 %r3354, %rs2132; + cvt.s32.s8 %r3355, %r3354; + cvt.u32.u16 %r3356, %rs2131; + cvt.s32.s8 %r3357, %r3356; + cvt.u32.u16 %r3358, %rs2130; + cvt.s32.s8 %r3359, %r3358; + mad.lo.s32 %r3360, %r80, %r3359, %r3351; + mad.lo.s32 %r3361, %r81, %r3357, %r3360; + mad.lo.s32 %r3362, %r83, %r3355, %r3361; + mad.lo.s32 %r3363, %r84, %r3353, %r3362; + ld.const.v4.u8 {%rs2138, %rs2139, %rs2140, %rs2141}, [matrix+1068]; + cvt.u32.u16 %r3364, %rs2141; + cvt.s32.s8 %r3365, %r3364; + cvt.u32.u16 %r3366, %rs2140; + cvt.s32.s8 %r3367, %r3366; + cvt.u32.u16 %r3368, %rs2139; + cvt.s32.s8 %r3369, %r3368; + cvt.u32.u16 %r3370, %rs2138; + cvt.s32.s8 %r3371, %r3370; + mad.lo.s32 %r3372, %r86, %r3371, %r3363; + mad.lo.s32 %r3373, %r87, %r3369, %r3372; + mad.lo.s32 %r3374, %r88, %r3367, %r3373; + mad.lo.s32 %r3375, %r89, %r3365, %r3374; + ld.const.v4.u8 {%rs2146, %rs2147, %rs2148, %rs2149}, [matrix+1072]; + cvt.u32.u16 %r3376, %rs2149; + cvt.s32.s8 %r3377, %r3376; + cvt.u32.u16 %r3378, %rs2148; + cvt.s32.s8 %r3379, %r3378; + cvt.u32.u16 %r3380, %rs2147; + cvt.s32.s8 %r3381, %r3380; + cvt.u32.u16 %r3382, %rs2146; + cvt.s32.s8 %r3383, %r3382; + mad.lo.s32 %r3384, %r271, %r3383, %r3375; + mad.lo.s32 %r3385, %r91, %r3381, %r3384; + mad.lo.s32 %r3386, %r93, %r3379, %r3385; + mad.lo.s32 %r3387, %r94, %r3377, %r3386; + ld.const.v4.u8 {%rs2154, %rs2155, %rs2156, %rs2157}, [matrix+1076]; + cvt.u32.u16 %r3388, %rs2157; + cvt.s32.s8 %r3389, 
%r3388; + cvt.u32.u16 %r3390, %rs2156; + cvt.s32.s8 %r3391, %r3390; + cvt.u32.u16 %r3392, %rs2155; + cvt.s32.s8 %r3393, %r3392; + cvt.u32.u16 %r3394, %rs2154; + cvt.s32.s8 %r3395, %r3394; + mad.lo.s32 %r3396, %r96, %r3395, %r3387; + mad.lo.s32 %r3397, %r97, %r3393, %r3396; + mad.lo.s32 %r3398, %r99, %r3391, %r3397; + mad.lo.s32 %r3399, %r100, %r3389, %r3398; + ld.const.v4.u8 {%rs2162, %rs2163, %rs2164, %rs2165}, [matrix+1080]; + cvt.u32.u16 %r3400, %rs2165; + cvt.s32.s8 %r3401, %r3400; + cvt.u32.u16 %r3402, %rs2164; + cvt.s32.s8 %r3403, %r3402; + cvt.u32.u16 %r3404, %rs2163; + cvt.s32.s8 %r3405, %r3404; + cvt.u32.u16 %r3406, %rs2162; + cvt.s32.s8 %r3407, %r3406; + mad.lo.s32 %r3408, %r103, %r3407, %r3399; + mad.lo.s32 %r3409, %r104, %r3405, %r3408; + mad.lo.s32 %r3410, %r107, %r3403, %r3409; + mad.lo.s32 %r3411, %r108, %r3401, %r3410; + ld.const.v4.u8 {%rs2170, %rs2171, %rs2172, %rs2173}, [matrix+1084]; + cvt.u32.u16 %r3412, %rs2173; + cvt.s32.s8 %r3413, %r3412; + cvt.u32.u16 %r3414, %rs2172; + cvt.s32.s8 %r3415, %r3414; + cvt.u32.u16 %r3416, %rs2171; + cvt.s32.s8 %r3417, %r3416; + cvt.u32.u16 %r3418, %rs2170; + cvt.s32.s8 %r3419, %r3418; + mad.lo.s32 %r3420, %r111, %r3419, %r3411; + mad.lo.s32 %r3421, %r112, %r3417, %r3420; + mad.lo.s32 %r3422, %r114, %r3415, %r3421; + mad.lo.s32 %r3423, %r115, %r3413, %r3422; + ld.const.v4.u8 {%rs2178, %rs2179, %rs2180, %rs2181}, [matrix+1088]; + cvt.u32.u16 %r3424, %rs2181; + cvt.s32.s8 %r3425, %r3424; + cvt.u32.u16 %r3426, %rs2180; + cvt.s32.s8 %r3427, %r3426; + cvt.u32.u16 %r3428, %rs2178; + cvt.s32.s8 %r3429, %r3428; + cvt.u32.u16 %r3430, %rs2179; + cvt.s32.s8 %r3431, %r3430; + mul.lo.s32 %r3432, %r34, %r3431; + mad.lo.s32 %r3433, %r124, %r3429, %r3432; + mad.lo.s32 %r3434, %r35, %r3427, %r3433; + mad.lo.s32 %r3435, %r36, %r3425, %r3434; + ld.const.v4.u8 {%rs2186, %rs2187, %rs2188, %rs2189}, [matrix+1092]; + cvt.u32.u16 %r3436, %rs2189; + cvt.s32.s8 %r3437, %r3436; + cvt.u32.u16 %r3438, %rs2188; + cvt.s32.s8 %r3439, %r3438; + cvt.u32.u16 %r3440, %rs2187; + cvt.s32.s8 %r3441, %r3440; + cvt.u32.u16 %r3442, %rs2186; + cvt.s32.s8 %r3443, %r3442; + mad.lo.s32 %r3444, %r37, %r3443, %r3435; + mad.lo.s32 %r3445, %r38, %r3441, %r3444; + mad.lo.s32 %r3446, %r39, %r3439, %r3445; + mad.lo.s32 %r3447, %r40, %r3437, %r3446; + ld.const.v4.u8 {%rs2194, %rs2195, %rs2196, %rs2197}, [matrix+1096]; + cvt.u32.u16 %r3448, %rs2197; + cvt.s32.s8 %r3449, %r3448; + cvt.u32.u16 %r3450, %rs2196; + cvt.s32.s8 %r3451, %r3450; + cvt.u32.u16 %r3452, %rs2195; + cvt.s32.s8 %r3453, %r3452; + cvt.u32.u16 %r3454, %rs2194; + cvt.s32.s8 %r3455, %r3454; + mad.lo.s32 %r3456, %r42, %r3455, %r3447; + mad.lo.s32 %r3457, %r43, %r3453, %r3456; + mad.lo.s32 %r3458, %r45, %r3451, %r3457; + mad.lo.s32 %r3459, %r46, %r3449, %r3458; + ld.const.v4.u8 {%rs2202, %rs2203, %rs2204, %rs2205}, [matrix+1100]; + cvt.u32.u16 %r3460, %rs2205; + cvt.s32.s8 %r3461, %r3460; + cvt.u32.u16 %r3462, %rs2204; + cvt.s32.s8 %r3463, %r3462; + cvt.u32.u16 %r3464, %rs2203; + cvt.s32.s8 %r3465, %r3464; + cvt.u32.u16 %r3466, %rs2202; + cvt.s32.s8 %r3467, %r3466; + mad.lo.s32 %r3468, %r48, %r3467, %r3459; + mad.lo.s32 %r3469, %r49, %r3465, %r3468; + mad.lo.s32 %r3470, %r50, %r3463, %r3469; + mad.lo.s32 %r3471, %r51, %r3461, %r3470; + ld.const.v4.u8 {%rs2210, %rs2211, %rs2212, %rs2213}, [matrix+1104]; + cvt.u32.u16 %r3472, %rs2213; + cvt.s32.s8 %r3473, %r3472; + cvt.u32.u16 %r3474, %rs2212; + cvt.s32.s8 %r3475, %r3474; + cvt.u32.u16 %r3476, %rs2211; + cvt.s32.s8 %r3477, %r3476; + cvt.u32.u16 %r3478, %rs2210; + cvt.s32.s8 
%r3479, %r3478; + mad.lo.s32 %r3480, %r173, %r3479, %r3471; + mad.lo.s32 %r3481, %r53, %r3477, %r3480; + mad.lo.s32 %r3482, %r54, %r3475, %r3481; + mad.lo.s32 %r3483, %r55, %r3473, %r3482; + ld.const.v4.u8 {%rs2218, %rs2219, %rs2220, %rs2221}, [matrix+1108]; + cvt.u32.u16 %r3484, %rs2221; + cvt.s32.s8 %r3485, %r3484; + cvt.u32.u16 %r3486, %rs2220; + cvt.s32.s8 %r3487, %r3486; + cvt.u32.u16 %r3488, %rs2219; + cvt.s32.s8 %r3489, %r3488; + cvt.u32.u16 %r3490, %rs2218; + cvt.s32.s8 %r3491, %r3490; + mad.lo.s32 %r3492, %r56, %r3491, %r3483; + mad.lo.s32 %r3493, %r57, %r3489, %r3492; + mad.lo.s32 %r3494, %r58, %r3487, %r3493; + mad.lo.s32 %r3495, %r59, %r3485, %r3494; + ld.const.v4.u8 {%rs2226, %rs2227, %rs2228, %rs2229}, [matrix+1112]; + cvt.u32.u16 %r3496, %rs2229; + cvt.s32.s8 %r3497, %r3496; + cvt.u32.u16 %r3498, %rs2228; + cvt.s32.s8 %r3499, %r3498; + cvt.u32.u16 %r3500, %rs2227; + cvt.s32.s8 %r3501, %r3500; + cvt.u32.u16 %r3502, %rs2226; + cvt.s32.s8 %r3503, %r3502; + mad.lo.s32 %r3504, %r61, %r3503, %r3495; + mad.lo.s32 %r3505, %r62, %r3501, %r3504; + mad.lo.s32 %r3506, %r64, %r3499, %r3505; + mad.lo.s32 %r3507, %r65, %r3497, %r3506; + ld.const.v4.u8 {%rs2234, %rs2235, %rs2236, %rs2237}, [matrix+1116]; + cvt.u32.u16 %r3508, %rs2237; + cvt.s32.s8 %r3509, %r3508; + cvt.u32.u16 %r3510, %rs2236; + cvt.s32.s8 %r3511, %r3510; + cvt.u32.u16 %r3512, %rs2235; + cvt.s32.s8 %r3513, %r3512; + cvt.u32.u16 %r3514, %rs2234; + cvt.s32.s8 %r3515, %r3514; + mad.lo.s32 %r3516, %r67, %r3515, %r3507; + mad.lo.s32 %r3517, %r68, %r3513, %r3516; + mad.lo.s32 %r3518, %r69, %r3511, %r3517; + mad.lo.s32 %r3519, %r70, %r3509, %r3518; + ld.const.v4.u8 {%rs2242, %rs2243, %rs2244, %rs2245}, [matrix+1120]; + cvt.u32.u16 %r3520, %rs2245; + cvt.s32.s8 %r3521, %r3520; + cvt.u32.u16 %r3522, %rs2244; + cvt.s32.s8 %r3523, %r3522; + cvt.u32.u16 %r3524, %rs2243; + cvt.s32.s8 %r3525, %r3524; + cvt.u32.u16 %r3526, %rs2242; + cvt.s32.s8 %r3527, %r3526; + mad.lo.s32 %r3528, %r222, %r3527, %r3519; + mad.lo.s32 %r3529, %r72, %r3525, %r3528; + mad.lo.s32 %r3530, %r73, %r3523, %r3529; + mad.lo.s32 %r3531, %r74, %r3521, %r3530; + ld.const.v4.u8 {%rs2250, %rs2251, %rs2252, %rs2253}, [matrix+1124]; + cvt.u32.u16 %r3532, %rs2253; + cvt.s32.s8 %r3533, %r3532; + cvt.u32.u16 %r3534, %rs2252; + cvt.s32.s8 %r3535, %r3534; + cvt.u32.u16 %r3536, %rs2251; + cvt.s32.s8 %r3537, %r3536; + cvt.u32.u16 %r3538, %rs2250; + cvt.s32.s8 %r3539, %r3538; + mad.lo.s32 %r3540, %r75, %r3539, %r3531; + mad.lo.s32 %r3541, %r76, %r3537, %r3540; + mad.lo.s32 %r3542, %r77, %r3535, %r3541; + mad.lo.s32 %r3543, %r78, %r3533, %r3542; + ld.const.v4.u8 {%rs2258, %rs2259, %rs2260, %rs2261}, [matrix+1128]; + cvt.u32.u16 %r3544, %rs2261; + cvt.s32.s8 %r3545, %r3544; + cvt.u32.u16 %r3546, %rs2260; + cvt.s32.s8 %r3547, %r3546; + cvt.u32.u16 %r3548, %rs2259; + cvt.s32.s8 %r3549, %r3548; + cvt.u32.u16 %r3550, %rs2258; + cvt.s32.s8 %r3551, %r3550; + mad.lo.s32 %r3552, %r80, %r3551, %r3543; + mad.lo.s32 %r3553, %r81, %r3549, %r3552; + mad.lo.s32 %r3554, %r83, %r3547, %r3553; + mad.lo.s32 %r3555, %r84, %r3545, %r3554; + ld.const.v4.u8 {%rs2266, %rs2267, %rs2268, %rs2269}, [matrix+1132]; + cvt.u32.u16 %r3556, %rs2269; + cvt.s32.s8 %r3557, %r3556; + cvt.u32.u16 %r3558, %rs2268; + cvt.s32.s8 %r3559, %r3558; + cvt.u32.u16 %r3560, %rs2267; + cvt.s32.s8 %r3561, %r3560; + cvt.u32.u16 %r3562, %rs2266; + cvt.s32.s8 %r3563, %r3562; + mad.lo.s32 %r3564, %r86, %r3563, %r3555; + mad.lo.s32 %r3565, %r87, %r3561, %r3564; + mad.lo.s32 %r3566, %r88, %r3559, %r3565; + mad.lo.s32 %r3567, %r89, 
%r3557, %r3566; + ld.const.v4.u8 {%rs2274, %rs2275, %rs2276, %rs2277}, [matrix+1136]; + cvt.u32.u16 %r3568, %rs2277; + cvt.s32.s8 %r3569, %r3568; + cvt.u32.u16 %r3570, %rs2276; + cvt.s32.s8 %r3571, %r3570; + cvt.u32.u16 %r3572, %rs2275; + cvt.s32.s8 %r3573, %r3572; + cvt.u32.u16 %r3574, %rs2274; + cvt.s32.s8 %r3575, %r3574; + mad.lo.s32 %r3576, %r271, %r3575, %r3567; + mad.lo.s32 %r3577, %r91, %r3573, %r3576; + mad.lo.s32 %r3578, %r93, %r3571, %r3577; + mad.lo.s32 %r3579, %r94, %r3569, %r3578; + ld.const.v4.u8 {%rs2282, %rs2283, %rs2284, %rs2285}, [matrix+1140]; + cvt.u32.u16 %r3580, %rs2285; + cvt.s32.s8 %r3581, %r3580; + cvt.u32.u16 %r3582, %rs2284; + cvt.s32.s8 %r3583, %r3582; + cvt.u32.u16 %r3584, %rs2283; + cvt.s32.s8 %r3585, %r3584; + cvt.u32.u16 %r3586, %rs2282; + cvt.s32.s8 %r3587, %r3586; + mad.lo.s32 %r3588, %r96, %r3587, %r3579; + mad.lo.s32 %r3589, %r97, %r3585, %r3588; + mad.lo.s32 %r3590, %r99, %r3583, %r3589; + mad.lo.s32 %r3591, %r100, %r3581, %r3590; + ld.const.v4.u8 {%rs2290, %rs2291, %rs2292, %rs2293}, [matrix+1144]; + cvt.u32.u16 %r3592, %rs2293; + cvt.s32.s8 %r3593, %r3592; + cvt.u32.u16 %r3594, %rs2292; + cvt.s32.s8 %r3595, %r3594; + cvt.u32.u16 %r3596, %rs2291; + cvt.s32.s8 %r3597, %r3596; + cvt.u32.u16 %r3598, %rs2290; + cvt.s32.s8 %r3599, %r3598; + mad.lo.s32 %r3600, %r103, %r3599, %r3591; + mad.lo.s32 %r3601, %r104, %r3597, %r3600; + mad.lo.s32 %r3602, %r107, %r3595, %r3601; + mad.lo.s32 %r3603, %r108, %r3593, %r3602; + ld.const.v4.u8 {%rs2298, %rs2299, %rs2300, %rs2301}, [matrix+1148]; + cvt.u32.u16 %r3604, %rs2301; + cvt.s32.s8 %r3605, %r3604; + cvt.u32.u16 %r3606, %rs2300; + cvt.s32.s8 %r3607, %r3606; + cvt.u32.u16 %r3608, %rs2299; + cvt.s32.s8 %r3609, %r3608; + cvt.u32.u16 %r3610, %rs2298; + cvt.s32.s8 %r3611, %r3610; + mad.lo.s32 %r3612, %r111, %r3611, %r3603; + mad.lo.s32 %r3613, %r112, %r3609, %r3612; + mad.lo.s32 %r3614, %r114, %r3607, %r3613; + mad.lo.s32 %r3615, %r115, %r3605, %r3614; + shr.u32 %r3616, %r3423, 6; + and.b32 %r3617, %r3616, 240; + shr.u32 %r3618, %r3615, 10; + or.b32 %r3619, %r3618, %r3617; + xor.b32 %r3620, %r52, %r3619; + cvt.u64.u32 %rd387, %r3620; + ld.const.v4.u8 {%rs2306, %rs2307, %rs2308, %rs2309}, [matrix+1152]; + cvt.u32.u16 %r3621, %rs2309; + cvt.s32.s8 %r3622, %r3621; + cvt.u32.u16 %r3623, %rs2308; + cvt.s32.s8 %r3624, %r3623; + cvt.u32.u16 %r3625, %rs2306; + cvt.s32.s8 %r3626, %r3625; + cvt.u32.u16 %r3627, %rs2307; + cvt.s32.s8 %r3628, %r3627; + mul.lo.s32 %r3629, %r34, %r3628; + mad.lo.s32 %r3630, %r124, %r3626, %r3629; + mad.lo.s32 %r3631, %r35, %r3624, %r3630; + mad.lo.s32 %r3632, %r36, %r3622, %r3631; + ld.const.v4.u8 {%rs2314, %rs2315, %rs2316, %rs2317}, [matrix+1156]; + cvt.u32.u16 %r3633, %rs2317; + cvt.s32.s8 %r3634, %r3633; + cvt.u32.u16 %r3635, %rs2316; + cvt.s32.s8 %r3636, %r3635; + cvt.u32.u16 %r3637, %rs2315; + cvt.s32.s8 %r3638, %r3637; + cvt.u32.u16 %r3639, %rs2314; + cvt.s32.s8 %r3640, %r3639; + mad.lo.s32 %r3641, %r37, %r3640, %r3632; + mad.lo.s32 %r3642, %r38, %r3638, %r3641; + mad.lo.s32 %r3643, %r39, %r3636, %r3642; + mad.lo.s32 %r3644, %r40, %r3634, %r3643; + ld.const.v4.u8 {%rs2322, %rs2323, %rs2324, %rs2325}, [matrix+1160]; + cvt.u32.u16 %r3645, %rs2325; + cvt.s32.s8 %r3646, %r3645; + cvt.u32.u16 %r3647, %rs2324; + cvt.s32.s8 %r3648, %r3647; + cvt.u32.u16 %r3649, %rs2323; + cvt.s32.s8 %r3650, %r3649; + cvt.u32.u16 %r3651, %rs2322; + cvt.s32.s8 %r3652, %r3651; + mad.lo.s32 %r3653, %r42, %r3652, %r3644; + mad.lo.s32 %r3654, %r43, %r3650, %r3653; + mad.lo.s32 %r3655, %r45, %r3648, %r3654; + mad.lo.s32 
%r3656, %r46, %r3646, %r3655; + ld.const.v4.u8 {%rs2330, %rs2331, %rs2332, %rs2333}, [matrix+1164]; + cvt.u32.u16 %r3657, %rs2333; + cvt.s32.s8 %r3658, %r3657; + cvt.u32.u16 %r3659, %rs2332; + cvt.s32.s8 %r3660, %r3659; + cvt.u32.u16 %r3661, %rs2331; + cvt.s32.s8 %r3662, %r3661; + cvt.u32.u16 %r3663, %rs2330; + cvt.s32.s8 %r3664, %r3663; + mad.lo.s32 %r3665, %r48, %r3664, %r3656; + mad.lo.s32 %r3666, %r49, %r3662, %r3665; + mad.lo.s32 %r3667, %r50, %r3660, %r3666; + mad.lo.s32 %r3668, %r51, %r3658, %r3667; + ld.const.v4.u8 {%rs2338, %rs2339, %rs2340, %rs2341}, [matrix+1168]; + cvt.u32.u16 %r3669, %rs2341; + cvt.s32.s8 %r3670, %r3669; + cvt.u32.u16 %r3671, %rs2340; + cvt.s32.s8 %r3672, %r3671; + cvt.u32.u16 %r3673, %rs2339; + cvt.s32.s8 %r3674, %r3673; + cvt.u32.u16 %r3675, %rs2338; + cvt.s32.s8 %r3676, %r3675; + mad.lo.s32 %r3677, %r173, %r3676, %r3668; + mad.lo.s32 %r3678, %r53, %r3674, %r3677; + mad.lo.s32 %r3679, %r54, %r3672, %r3678; + mad.lo.s32 %r3680, %r55, %r3670, %r3679; + ld.const.v4.u8 {%rs2346, %rs2347, %rs2348, %rs2349}, [matrix+1172]; + cvt.u32.u16 %r3681, %rs2349; + cvt.s32.s8 %r3682, %r3681; + cvt.u32.u16 %r3683, %rs2348; + cvt.s32.s8 %r3684, %r3683; + cvt.u32.u16 %r3685, %rs2347; + cvt.s32.s8 %r3686, %r3685; + cvt.u32.u16 %r3687, %rs2346; + cvt.s32.s8 %r3688, %r3687; + mad.lo.s32 %r3689, %r56, %r3688, %r3680; + mad.lo.s32 %r3690, %r57, %r3686, %r3689; + mad.lo.s32 %r3691, %r58, %r3684, %r3690; + mad.lo.s32 %r3692, %r59, %r3682, %r3691; + ld.const.v4.u8 {%rs2354, %rs2355, %rs2356, %rs2357}, [matrix+1176]; + cvt.u32.u16 %r3693, %rs2357; + cvt.s32.s8 %r3694, %r3693; + cvt.u32.u16 %r3695, %rs2356; + cvt.s32.s8 %r3696, %r3695; + cvt.u32.u16 %r3697, %rs2355; + cvt.s32.s8 %r3698, %r3697; + cvt.u32.u16 %r3699, %rs2354; + cvt.s32.s8 %r3700, %r3699; + mad.lo.s32 %r3701, %r61, %r3700, %r3692; + mad.lo.s32 %r3702, %r62, %r3698, %r3701; + mad.lo.s32 %r3703, %r64, %r3696, %r3702; + mad.lo.s32 %r3704, %r65, %r3694, %r3703; + ld.const.v4.u8 {%rs2362, %rs2363, %rs2364, %rs2365}, [matrix+1180]; + cvt.u32.u16 %r3705, %rs2365; + cvt.s32.s8 %r3706, %r3705; + cvt.u32.u16 %r3707, %rs2364; + cvt.s32.s8 %r3708, %r3707; + cvt.u32.u16 %r3709, %rs2363; + cvt.s32.s8 %r3710, %r3709; + cvt.u32.u16 %r3711, %rs2362; + cvt.s32.s8 %r3712, %r3711; + mad.lo.s32 %r3713, %r67, %r3712, %r3704; + mad.lo.s32 %r3714, %r68, %r3710, %r3713; + mad.lo.s32 %r3715, %r69, %r3708, %r3714; + mad.lo.s32 %r3716, %r70, %r3706, %r3715; + ld.const.v4.u8 {%rs2370, %rs2371, %rs2372, %rs2373}, [matrix+1184]; + cvt.u32.u16 %r3717, %rs2373; + cvt.s32.s8 %r3718, %r3717; + cvt.u32.u16 %r3719, %rs2372; + cvt.s32.s8 %r3720, %r3719; + cvt.u32.u16 %r3721, %rs2371; + cvt.s32.s8 %r3722, %r3721; + cvt.u32.u16 %r3723, %rs2370; + cvt.s32.s8 %r3724, %r3723; + mad.lo.s32 %r3725, %r222, %r3724, %r3716; + mad.lo.s32 %r3726, %r72, %r3722, %r3725; + mad.lo.s32 %r3727, %r73, %r3720, %r3726; + mad.lo.s32 %r3728, %r74, %r3718, %r3727; + ld.const.v4.u8 {%rs2378, %rs2379, %rs2380, %rs2381}, [matrix+1188]; + cvt.u32.u16 %r3729, %rs2381; + cvt.s32.s8 %r3730, %r3729; + cvt.u32.u16 %r3731, %rs2380; + cvt.s32.s8 %r3732, %r3731; + cvt.u32.u16 %r3733, %rs2379; + cvt.s32.s8 %r3734, %r3733; + cvt.u32.u16 %r3735, %rs2378; + cvt.s32.s8 %r3736, %r3735; + mad.lo.s32 %r3737, %r75, %r3736, %r3728; + mad.lo.s32 %r3738, %r76, %r3734, %r3737; + mad.lo.s32 %r3739, %r77, %r3732, %r3738; + mad.lo.s32 %r3740, %r78, %r3730, %r3739; + ld.const.v4.u8 {%rs2386, %rs2387, %rs2388, %rs2389}, [matrix+1192]; + cvt.u32.u16 %r3741, %rs2389; + cvt.s32.s8 %r3742, %r3741; + cvt.u32.u16 
%r3743, %rs2388; + cvt.s32.s8 %r3744, %r3743; + cvt.u32.u16 %r3745, %rs2387; + cvt.s32.s8 %r3746, %r3745; + cvt.u32.u16 %r3747, %rs2386; + cvt.s32.s8 %r3748, %r3747; + mad.lo.s32 %r3749, %r80, %r3748, %r3740; + mad.lo.s32 %r3750, %r81, %r3746, %r3749; + mad.lo.s32 %r3751, %r83, %r3744, %r3750; + mad.lo.s32 %r3752, %r84, %r3742, %r3751; + ld.const.v4.u8 {%rs2394, %rs2395, %rs2396, %rs2397}, [matrix+1196]; + cvt.u32.u16 %r3753, %rs2397; + cvt.s32.s8 %r3754, %r3753; + cvt.u32.u16 %r3755, %rs2396; + cvt.s32.s8 %r3756, %r3755; + cvt.u32.u16 %r3757, %rs2395; + cvt.s32.s8 %r3758, %r3757; + cvt.u32.u16 %r3759, %rs2394; + cvt.s32.s8 %r3760, %r3759; + mad.lo.s32 %r3761, %r86, %r3760, %r3752; + mad.lo.s32 %r3762, %r87, %r3758, %r3761; + mad.lo.s32 %r3763, %r88, %r3756, %r3762; + mad.lo.s32 %r3764, %r89, %r3754, %r3763; + ld.const.v4.u8 {%rs2402, %rs2403, %rs2404, %rs2405}, [matrix+1200]; + cvt.u32.u16 %r3765, %rs2405; + cvt.s32.s8 %r3766, %r3765; + cvt.u32.u16 %r3767, %rs2404; + cvt.s32.s8 %r3768, %r3767; + cvt.u32.u16 %r3769, %rs2403; + cvt.s32.s8 %r3770, %r3769; + cvt.u32.u16 %r3771, %rs2402; + cvt.s32.s8 %r3772, %r3771; + mad.lo.s32 %r3773, %r271, %r3772, %r3764; + mad.lo.s32 %r3774, %r91, %r3770, %r3773; + mad.lo.s32 %r3775, %r93, %r3768, %r3774; + mad.lo.s32 %r3776, %r94, %r3766, %r3775; + ld.const.v4.u8 {%rs2410, %rs2411, %rs2412, %rs2413}, [matrix+1204]; + cvt.u32.u16 %r3777, %rs2413; + cvt.s32.s8 %r3778, %r3777; + cvt.u32.u16 %r3779, %rs2412; + cvt.s32.s8 %r3780, %r3779; + cvt.u32.u16 %r3781, %rs2411; + cvt.s32.s8 %r3782, %r3781; + cvt.u32.u16 %r3783, %rs2410; + cvt.s32.s8 %r3784, %r3783; + mad.lo.s32 %r3785, %r96, %r3784, %r3776; + mad.lo.s32 %r3786, %r97, %r3782, %r3785; + mad.lo.s32 %r3787, %r99, %r3780, %r3786; + mad.lo.s32 %r3788, %r100, %r3778, %r3787; + ld.const.v4.u8 {%rs2418, %rs2419, %rs2420, %rs2421}, [matrix+1208]; + cvt.u32.u16 %r3789, %rs2421; + cvt.s32.s8 %r3790, %r3789; + cvt.u32.u16 %r3791, %rs2420; + cvt.s32.s8 %r3792, %r3791; + cvt.u32.u16 %r3793, %rs2419; + cvt.s32.s8 %r3794, %r3793; + cvt.u32.u16 %r3795, %rs2418; + cvt.s32.s8 %r3796, %r3795; + mad.lo.s32 %r3797, %r103, %r3796, %r3788; + mad.lo.s32 %r3798, %r104, %r3794, %r3797; + mad.lo.s32 %r3799, %r107, %r3792, %r3798; + mad.lo.s32 %r3800, %r108, %r3790, %r3799; + ld.const.v4.u8 {%rs2426, %rs2427, %rs2428, %rs2429}, [matrix+1212]; + cvt.u32.u16 %r3801, %rs2429; + cvt.s32.s8 %r3802, %r3801; + cvt.u32.u16 %r3803, %rs2428; + cvt.s32.s8 %r3804, %r3803; + cvt.u32.u16 %r3805, %rs2427; + cvt.s32.s8 %r3806, %r3805; + cvt.u32.u16 %r3807, %rs2426; + cvt.s32.s8 %r3808, %r3807; + mad.lo.s32 %r3809, %r111, %r3808, %r3800; + mad.lo.s32 %r3810, %r112, %r3806, %r3809; + mad.lo.s32 %r3811, %r114, %r3804, %r3810; + mad.lo.s32 %r3812, %r115, %r3802, %r3811; + ld.const.v4.u8 {%rs2434, %rs2435, %rs2436, %rs2437}, [matrix+1216]; + cvt.u32.u16 %r3813, %rs2437; + cvt.s32.s8 %r3814, %r3813; + cvt.u32.u16 %r3815, %rs2436; + cvt.s32.s8 %r3816, %r3815; + cvt.u32.u16 %r3817, %rs2434; + cvt.s32.s8 %r3818, %r3817; + cvt.u32.u16 %r3819, %rs2435; + cvt.s32.s8 %r3820, %r3819; + mul.lo.s32 %r3821, %r34, %r3820; + mad.lo.s32 %r3822, %r124, %r3818, %r3821; + mad.lo.s32 %r3823, %r35, %r3816, %r3822; + mad.lo.s32 %r3824, %r36, %r3814, %r3823; + ld.const.v4.u8 {%rs2442, %rs2443, %rs2444, %rs2445}, [matrix+1220]; + cvt.u32.u16 %r3825, %rs2445; + cvt.s32.s8 %r3826, %r3825; + cvt.u32.u16 %r3827, %rs2444; + cvt.s32.s8 %r3828, %r3827; + cvt.u32.u16 %r3829, %rs2443; + cvt.s32.s8 %r3830, %r3829; + cvt.u32.u16 %r3831, %rs2442; + cvt.s32.s8 %r3832, %r3831; + 
mad.lo.s32 %r3833, %r37, %r3832, %r3824; + mad.lo.s32 %r3834, %r38, %r3830, %r3833; + mad.lo.s32 %r3835, %r39, %r3828, %r3834; + mad.lo.s32 %r3836, %r40, %r3826, %r3835; + ld.const.v4.u8 {%rs2450, %rs2451, %rs2452, %rs2453}, [matrix+1224]; + cvt.u32.u16 %r3837, %rs2453; + cvt.s32.s8 %r3838, %r3837; + cvt.u32.u16 %r3839, %rs2452; + cvt.s32.s8 %r3840, %r3839; + cvt.u32.u16 %r3841, %rs2451; + cvt.s32.s8 %r3842, %r3841; + cvt.u32.u16 %r3843, %rs2450; + cvt.s32.s8 %r3844, %r3843; + mad.lo.s32 %r3845, %r42, %r3844, %r3836; + mad.lo.s32 %r3846, %r43, %r3842, %r3845; + mad.lo.s32 %r3847, %r45, %r3840, %r3846; + mad.lo.s32 %r3848, %r46, %r3838, %r3847; + ld.const.v4.u8 {%rs2458, %rs2459, %rs2460, %rs2461}, [matrix+1228]; + cvt.u32.u16 %r3849, %rs2461; + cvt.s32.s8 %r3850, %r3849; + cvt.u32.u16 %r3851, %rs2460; + cvt.s32.s8 %r3852, %r3851; + cvt.u32.u16 %r3853, %rs2459; + cvt.s32.s8 %r3854, %r3853; + cvt.u32.u16 %r3855, %rs2458; + cvt.s32.s8 %r3856, %r3855; + mad.lo.s32 %r3857, %r48, %r3856, %r3848; + mad.lo.s32 %r3858, %r49, %r3854, %r3857; + mad.lo.s32 %r3859, %r50, %r3852, %r3858; + mad.lo.s32 %r3860, %r51, %r3850, %r3859; + ld.const.v4.u8 {%rs2466, %rs2467, %rs2468, %rs2469}, [matrix+1232]; + cvt.u32.u16 %r3861, %rs2469; + cvt.s32.s8 %r3862, %r3861; + cvt.u32.u16 %r3863, %rs2468; + cvt.s32.s8 %r3864, %r3863; + cvt.u32.u16 %r3865, %rs2467; + cvt.s32.s8 %r3866, %r3865; + cvt.u32.u16 %r3867, %rs2466; + cvt.s32.s8 %r3868, %r3867; + mad.lo.s32 %r3869, %r173, %r3868, %r3860; + mad.lo.s32 %r3870, %r53, %r3866, %r3869; + mad.lo.s32 %r3871, %r54, %r3864, %r3870; + mad.lo.s32 %r3872, %r55, %r3862, %r3871; + ld.const.v4.u8 {%rs2474, %rs2475, %rs2476, %rs2477}, [matrix+1236]; + cvt.u32.u16 %r3873, %rs2477; + cvt.s32.s8 %r3874, %r3873; + cvt.u32.u16 %r3875, %rs2476; + cvt.s32.s8 %r3876, %r3875; + cvt.u32.u16 %r3877, %rs2475; + cvt.s32.s8 %r3878, %r3877; + cvt.u32.u16 %r3879, %rs2474; + cvt.s32.s8 %r3880, %r3879; + mad.lo.s32 %r3881, %r56, %r3880, %r3872; + mad.lo.s32 %r3882, %r57, %r3878, %r3881; + mad.lo.s32 %r3883, %r58, %r3876, %r3882; + mad.lo.s32 %r3884, %r59, %r3874, %r3883; + ld.const.v4.u8 {%rs2482, %rs2483, %rs2484, %rs2485}, [matrix+1240]; + cvt.u32.u16 %r3885, %rs2485; + cvt.s32.s8 %r3886, %r3885; + cvt.u32.u16 %r3887, %rs2484; + cvt.s32.s8 %r3888, %r3887; + cvt.u32.u16 %r3889, %rs2483; + cvt.s32.s8 %r3890, %r3889; + cvt.u32.u16 %r3891, %rs2482; + cvt.s32.s8 %r3892, %r3891; + mad.lo.s32 %r3893, %r61, %r3892, %r3884; + mad.lo.s32 %r3894, %r62, %r3890, %r3893; + mad.lo.s32 %r3895, %r64, %r3888, %r3894; + mad.lo.s32 %r3896, %r65, %r3886, %r3895; + ld.const.v4.u8 {%rs2490, %rs2491, %rs2492, %rs2493}, [matrix+1244]; + cvt.u32.u16 %r3897, %rs2493; + cvt.s32.s8 %r3898, %r3897; + cvt.u32.u16 %r3899, %rs2492; + cvt.s32.s8 %r3900, %r3899; + cvt.u32.u16 %r3901, %rs2491; + cvt.s32.s8 %r3902, %r3901; + cvt.u32.u16 %r3903, %rs2490; + cvt.s32.s8 %r3904, %r3903; + mad.lo.s32 %r3905, %r67, %r3904, %r3896; + mad.lo.s32 %r3906, %r68, %r3902, %r3905; + mad.lo.s32 %r3907, %r69, %r3900, %r3906; + mad.lo.s32 %r3908, %r70, %r3898, %r3907; + ld.const.v4.u8 {%rs2498, %rs2499, %rs2500, %rs2501}, [matrix+1248]; + cvt.u32.u16 %r3909, %rs2501; + cvt.s32.s8 %r3910, %r3909; + cvt.u32.u16 %r3911, %rs2500; + cvt.s32.s8 %r3912, %r3911; + cvt.u32.u16 %r3913, %rs2499; + cvt.s32.s8 %r3914, %r3913; + cvt.u32.u16 %r3915, %rs2498; + cvt.s32.s8 %r3916, %r3915; + mad.lo.s32 %r3917, %r222, %r3916, %r3908; + mad.lo.s32 %r3918, %r72, %r3914, %r3917; + mad.lo.s32 %r3919, %r73, %r3912, %r3918; + mad.lo.s32 %r3920, %r74, %r3910, %r3919; + 
ld.const.v4.u8 {%rs2506, %rs2507, %rs2508, %rs2509}, [matrix+1252]; + cvt.u32.u16 %r3921, %rs2509; + cvt.s32.s8 %r3922, %r3921; + cvt.u32.u16 %r3923, %rs2508; + cvt.s32.s8 %r3924, %r3923; + cvt.u32.u16 %r3925, %rs2507; + cvt.s32.s8 %r3926, %r3925; + cvt.u32.u16 %r3927, %rs2506; + cvt.s32.s8 %r3928, %r3927; + mad.lo.s32 %r3929, %r75, %r3928, %r3920; + mad.lo.s32 %r3930, %r76, %r3926, %r3929; + mad.lo.s32 %r3931, %r77, %r3924, %r3930; + mad.lo.s32 %r3932, %r78, %r3922, %r3931; + ld.const.v4.u8 {%rs2514, %rs2515, %rs2516, %rs2517}, [matrix+1256]; + cvt.u32.u16 %r3933, %rs2517; + cvt.s32.s8 %r3934, %r3933; + cvt.u32.u16 %r3935, %rs2516; + cvt.s32.s8 %r3936, %r3935; + cvt.u32.u16 %r3937, %rs2515; + cvt.s32.s8 %r3938, %r3937; + cvt.u32.u16 %r3939, %rs2514; + cvt.s32.s8 %r3940, %r3939; + mad.lo.s32 %r3941, %r80, %r3940, %r3932; + mad.lo.s32 %r3942, %r81, %r3938, %r3941; + mad.lo.s32 %r3943, %r83, %r3936, %r3942; + mad.lo.s32 %r3944, %r84, %r3934, %r3943; + ld.const.v4.u8 {%rs2522, %rs2523, %rs2524, %rs2525}, [matrix+1260]; + cvt.u32.u16 %r3945, %rs2525; + cvt.s32.s8 %r3946, %r3945; + cvt.u32.u16 %r3947, %rs2524; + cvt.s32.s8 %r3948, %r3947; + cvt.u32.u16 %r3949, %rs2523; + cvt.s32.s8 %r3950, %r3949; + cvt.u32.u16 %r3951, %rs2522; + cvt.s32.s8 %r3952, %r3951; + mad.lo.s32 %r3953, %r86, %r3952, %r3944; + mad.lo.s32 %r3954, %r87, %r3950, %r3953; + mad.lo.s32 %r3955, %r88, %r3948, %r3954; + mad.lo.s32 %r3956, %r89, %r3946, %r3955; + ld.const.v4.u8 {%rs2530, %rs2531, %rs2532, %rs2533}, [matrix+1264]; + cvt.u32.u16 %r3957, %rs2533; + cvt.s32.s8 %r3958, %r3957; + cvt.u32.u16 %r3959, %rs2532; + cvt.s32.s8 %r3960, %r3959; + cvt.u32.u16 %r3961, %rs2531; + cvt.s32.s8 %r3962, %r3961; + cvt.u32.u16 %r3963, %rs2530; + cvt.s32.s8 %r3964, %r3963; + mad.lo.s32 %r3965, %r271, %r3964, %r3956; + mad.lo.s32 %r3966, %r91, %r3962, %r3965; + mad.lo.s32 %r3967, %r93, %r3960, %r3966; + mad.lo.s32 %r3968, %r94, %r3958, %r3967; + ld.const.v4.u8 {%rs2538, %rs2539, %rs2540, %rs2541}, [matrix+1268]; + cvt.u32.u16 %r3969, %rs2541; + cvt.s32.s8 %r3970, %r3969; + cvt.u32.u16 %r3971, %rs2540; + cvt.s32.s8 %r3972, %r3971; + cvt.u32.u16 %r3973, %rs2539; + cvt.s32.s8 %r3974, %r3973; + cvt.u32.u16 %r3975, %rs2538; + cvt.s32.s8 %r3976, %r3975; + mad.lo.s32 %r3977, %r96, %r3976, %r3968; + mad.lo.s32 %r3978, %r97, %r3974, %r3977; + mad.lo.s32 %r3979, %r99, %r3972, %r3978; + mad.lo.s32 %r3980, %r100, %r3970, %r3979; + ld.const.v4.u8 {%rs2546, %rs2547, %rs2548, %rs2549}, [matrix+1272]; + cvt.u32.u16 %r3981, %rs2549; + cvt.s32.s8 %r3982, %r3981; + cvt.u32.u16 %r3983, %rs2548; + cvt.s32.s8 %r3984, %r3983; + cvt.u32.u16 %r3985, %rs2547; + cvt.s32.s8 %r3986, %r3985; + cvt.u32.u16 %r3987, %rs2546; + cvt.s32.s8 %r3988, %r3987; + mad.lo.s32 %r3989, %r103, %r3988, %r3980; + mad.lo.s32 %r3990, %r104, %r3986, %r3989; + mad.lo.s32 %r3991, %r107, %r3984, %r3990; + mad.lo.s32 %r3992, %r108, %r3982, %r3991; + ld.const.v4.u8 {%rs2554, %rs2555, %rs2556, %rs2557}, [matrix+1276]; + cvt.u32.u16 %r3993, %rs2557; + cvt.s32.s8 %r3994, %r3993; + cvt.u32.u16 %r3995, %rs2556; + cvt.s32.s8 %r3996, %r3995; + cvt.u32.u16 %r3997, %rs2555; + cvt.s32.s8 %r3998, %r3997; + cvt.u32.u16 %r3999, %rs2554; + cvt.s32.s8 %r4000, %r3999; + mad.lo.s32 %r4001, %r111, %r4000, %r3992; + mad.lo.s32 %r4002, %r112, %r3998, %r4001; + mad.lo.s32 %r4003, %r114, %r3996, %r4002; + mad.lo.s32 %r4004, %r115, %r3994, %r4003; + shr.u32 %r4005, %r3812, 6; + and.b32 %r4006, %r4005, 240; + shr.u32 %r4007, %r4004, 10; + or.b32 %r4008, %r4007, %r4006; + xor.b32 %r4009, %r20, %r4008; + ld.const.v4.u8 
{%rs2562, %rs2563, %rs2564, %rs2565}, [matrix+1280]; + cvt.u32.u16 %r4010, %rs2565; + cvt.s32.s8 %r4011, %r4010; + cvt.u32.u16 %r4012, %rs2564; + cvt.s32.s8 %r4013, %r4012; + cvt.u32.u16 %r4014, %rs2562; + cvt.s32.s8 %r4015, %r4014; + cvt.u32.u16 %r4016, %rs2563; + cvt.s32.s8 %r4017, %r4016; + mul.lo.s32 %r4018, %r34, %r4017; + mad.lo.s32 %r4019, %r124, %r4015, %r4018; + mad.lo.s32 %r4020, %r35, %r4013, %r4019; + mad.lo.s32 %r4021, %r36, %r4011, %r4020; + ld.const.v4.u8 {%rs2570, %rs2571, %rs2572, %rs2573}, [matrix+1284]; + cvt.u32.u16 %r4022, %rs2573; + cvt.s32.s8 %r4023, %r4022; + cvt.u32.u16 %r4024, %rs2572; + cvt.s32.s8 %r4025, %r4024; + cvt.u32.u16 %r4026, %rs2571; + cvt.s32.s8 %r4027, %r4026; + cvt.u32.u16 %r4028, %rs2570; + cvt.s32.s8 %r4029, %r4028; + mad.lo.s32 %r4030, %r37, %r4029, %r4021; + mad.lo.s32 %r4031, %r38, %r4027, %r4030; + mad.lo.s32 %r4032, %r39, %r4025, %r4031; + mad.lo.s32 %r4033, %r40, %r4023, %r4032; + ld.const.v4.u8 {%rs2578, %rs2579, %rs2580, %rs2581}, [matrix+1288]; + cvt.u32.u16 %r4034, %rs2581; + cvt.s32.s8 %r4035, %r4034; + cvt.u32.u16 %r4036, %rs2580; + cvt.s32.s8 %r4037, %r4036; + cvt.u32.u16 %r4038, %rs2579; + cvt.s32.s8 %r4039, %r4038; + cvt.u32.u16 %r4040, %rs2578; + cvt.s32.s8 %r4041, %r4040; + mad.lo.s32 %r4042, %r42, %r4041, %r4033; + mad.lo.s32 %r4043, %r43, %r4039, %r4042; + mad.lo.s32 %r4044, %r45, %r4037, %r4043; + mad.lo.s32 %r4045, %r46, %r4035, %r4044; + ld.const.v4.u8 {%rs2586, %rs2587, %rs2588, %rs2589}, [matrix+1292]; + cvt.u32.u16 %r4046, %rs2589; + cvt.s32.s8 %r4047, %r4046; + cvt.u32.u16 %r4048, %rs2588; + cvt.s32.s8 %r4049, %r4048; + cvt.u32.u16 %r4050, %rs2587; + cvt.s32.s8 %r4051, %r4050; + cvt.u32.u16 %r4052, %rs2586; + cvt.s32.s8 %r4053, %r4052; + mad.lo.s32 %r4054, %r48, %r4053, %r4045; + mad.lo.s32 %r4055, %r49, %r4051, %r4054; + mad.lo.s32 %r4056, %r50, %r4049, %r4055; + mad.lo.s32 %r4057, %r51, %r4047, %r4056; + ld.const.v4.u8 {%rs2594, %rs2595, %rs2596, %rs2597}, [matrix+1296]; + cvt.u32.u16 %r4058, %rs2597; + cvt.s32.s8 %r4059, %r4058; + cvt.u32.u16 %r4060, %rs2596; + cvt.s32.s8 %r4061, %r4060; + cvt.u32.u16 %r4062, %rs2595; + cvt.s32.s8 %r4063, %r4062; + cvt.u32.u16 %r4064, %rs2594; + cvt.s32.s8 %r4065, %r4064; + mad.lo.s32 %r4066, %r173, %r4065, %r4057; + mad.lo.s32 %r4067, %r53, %r4063, %r4066; + mad.lo.s32 %r4068, %r54, %r4061, %r4067; + mad.lo.s32 %r4069, %r55, %r4059, %r4068; + ld.const.v4.u8 {%rs2602, %rs2603, %rs2604, %rs2605}, [matrix+1300]; + cvt.u32.u16 %r4070, %rs2605; + cvt.s32.s8 %r4071, %r4070; + cvt.u32.u16 %r4072, %rs2604; + cvt.s32.s8 %r4073, %r4072; + cvt.u32.u16 %r4074, %rs2603; + cvt.s32.s8 %r4075, %r4074; + cvt.u32.u16 %r4076, %rs2602; + cvt.s32.s8 %r4077, %r4076; + mad.lo.s32 %r4078, %r56, %r4077, %r4069; + mad.lo.s32 %r4079, %r57, %r4075, %r4078; + mad.lo.s32 %r4080, %r58, %r4073, %r4079; + mad.lo.s32 %r4081, %r59, %r4071, %r4080; + ld.const.v4.u8 {%rs2610, %rs2611, %rs2612, %rs2613}, [matrix+1304]; + cvt.u32.u16 %r4082, %rs2613; + cvt.s32.s8 %r4083, %r4082; + cvt.u32.u16 %r4084, %rs2612; + cvt.s32.s8 %r4085, %r4084; + cvt.u32.u16 %r4086, %rs2611; + cvt.s32.s8 %r4087, %r4086; + cvt.u32.u16 %r4088, %rs2610; + cvt.s32.s8 %r4089, %r4088; + mad.lo.s32 %r4090, %r61, %r4089, %r4081; + mad.lo.s32 %r4091, %r62, %r4087, %r4090; + mad.lo.s32 %r4092, %r64, %r4085, %r4091; + mad.lo.s32 %r4093, %r65, %r4083, %r4092; + ld.const.v4.u8 {%rs2618, %rs2619, %rs2620, %rs2621}, [matrix+1308]; + cvt.u32.u16 %r4094, %rs2621; + cvt.s32.s8 %r4095, %r4094; + cvt.u32.u16 %r4096, %rs2620; + cvt.s32.s8 %r4097, %r4096; + cvt.u32.u16 
%r4098, %rs2619; + cvt.s32.s8 %r4099, %r4098; + cvt.u32.u16 %r4100, %rs2618; + cvt.s32.s8 %r4101, %r4100; + mad.lo.s32 %r4102, %r67, %r4101, %r4093; + mad.lo.s32 %r4103, %r68, %r4099, %r4102; + mad.lo.s32 %r4104, %r69, %r4097, %r4103; + mad.lo.s32 %r4105, %r70, %r4095, %r4104; + ld.const.v4.u8 {%rs2626, %rs2627, %rs2628, %rs2629}, [matrix+1312]; + cvt.u32.u16 %r4106, %rs2629; + cvt.s32.s8 %r4107, %r4106; + cvt.u32.u16 %r4108, %rs2628; + cvt.s32.s8 %r4109, %r4108; + cvt.u32.u16 %r4110, %rs2627; + cvt.s32.s8 %r4111, %r4110; + cvt.u32.u16 %r4112, %rs2626; + cvt.s32.s8 %r4113, %r4112; + mad.lo.s32 %r4114, %r222, %r4113, %r4105; + mad.lo.s32 %r4115, %r72, %r4111, %r4114; + mad.lo.s32 %r4116, %r73, %r4109, %r4115; + mad.lo.s32 %r4117, %r74, %r4107, %r4116; + ld.const.v4.u8 {%rs2634, %rs2635, %rs2636, %rs2637}, [matrix+1316]; + cvt.u32.u16 %r4118, %rs2637; + cvt.s32.s8 %r4119, %r4118; + cvt.u32.u16 %r4120, %rs2636; + cvt.s32.s8 %r4121, %r4120; + cvt.u32.u16 %r4122, %rs2635; + cvt.s32.s8 %r4123, %r4122; + cvt.u32.u16 %r4124, %rs2634; + cvt.s32.s8 %r4125, %r4124; + mad.lo.s32 %r4126, %r75, %r4125, %r4117; + mad.lo.s32 %r4127, %r76, %r4123, %r4126; + mad.lo.s32 %r4128, %r77, %r4121, %r4127; + mad.lo.s32 %r4129, %r78, %r4119, %r4128; + ld.const.v4.u8 {%rs2642, %rs2643, %rs2644, %rs2645}, [matrix+1320]; + cvt.u32.u16 %r4130, %rs2645; + cvt.s32.s8 %r4131, %r4130; + cvt.u32.u16 %r4132, %rs2644; + cvt.s32.s8 %r4133, %r4132; + cvt.u32.u16 %r4134, %rs2643; + cvt.s32.s8 %r4135, %r4134; + cvt.u32.u16 %r4136, %rs2642; + cvt.s32.s8 %r4137, %r4136; + mad.lo.s32 %r4138, %r80, %r4137, %r4129; + mad.lo.s32 %r4139, %r81, %r4135, %r4138; + mad.lo.s32 %r4140, %r83, %r4133, %r4139; + mad.lo.s32 %r4141, %r84, %r4131, %r4140; + ld.const.v4.u8 {%rs2650, %rs2651, %rs2652, %rs2653}, [matrix+1324]; + cvt.u32.u16 %r4142, %rs2653; + cvt.s32.s8 %r4143, %r4142; + cvt.u32.u16 %r4144, %rs2652; + cvt.s32.s8 %r4145, %r4144; + cvt.u32.u16 %r4146, %rs2651; + cvt.s32.s8 %r4147, %r4146; + cvt.u32.u16 %r4148, %rs2650; + cvt.s32.s8 %r4149, %r4148; + mad.lo.s32 %r4150, %r86, %r4149, %r4141; + mad.lo.s32 %r4151, %r87, %r4147, %r4150; + mad.lo.s32 %r4152, %r88, %r4145, %r4151; + mad.lo.s32 %r4153, %r89, %r4143, %r4152; + ld.const.v4.u8 {%rs2658, %rs2659, %rs2660, %rs2661}, [matrix+1328]; + cvt.u32.u16 %r4154, %rs2661; + cvt.s32.s8 %r4155, %r4154; + cvt.u32.u16 %r4156, %rs2660; + cvt.s32.s8 %r4157, %r4156; + cvt.u32.u16 %r4158, %rs2659; + cvt.s32.s8 %r4159, %r4158; + cvt.u32.u16 %r4160, %rs2658; + cvt.s32.s8 %r4161, %r4160; + mad.lo.s32 %r4162, %r271, %r4161, %r4153; + mad.lo.s32 %r4163, %r91, %r4159, %r4162; + mad.lo.s32 %r4164, %r93, %r4157, %r4163; + mad.lo.s32 %r4165, %r94, %r4155, %r4164; + ld.const.v4.u8 {%rs2666, %rs2667, %rs2668, %rs2669}, [matrix+1332]; + cvt.u32.u16 %r4166, %rs2669; + cvt.s32.s8 %r4167, %r4166; + cvt.u32.u16 %r4168, %rs2668; + cvt.s32.s8 %r4169, %r4168; + cvt.u32.u16 %r4170, %rs2667; + cvt.s32.s8 %r4171, %r4170; + cvt.u32.u16 %r4172, %rs2666; + cvt.s32.s8 %r4173, %r4172; + mad.lo.s32 %r4174, %r96, %r4173, %r4165; + mad.lo.s32 %r4175, %r97, %r4171, %r4174; + mad.lo.s32 %r4176, %r99, %r4169, %r4175; + mad.lo.s32 %r4177, %r100, %r4167, %r4176; + ld.const.v4.u8 {%rs2674, %rs2675, %rs2676, %rs2677}, [matrix+1336]; + cvt.u32.u16 %r4178, %rs2677; + cvt.s32.s8 %r4179, %r4178; + cvt.u32.u16 %r4180, %rs2676; + cvt.s32.s8 %r4181, %r4180; + cvt.u32.u16 %r4182, %rs2675; + cvt.s32.s8 %r4183, %r4182; + cvt.u32.u16 %r4184, %rs2674; + cvt.s32.s8 %r4185, %r4184; + mad.lo.s32 %r4186, %r103, %r4185, %r4177; + mad.lo.s32 %r4187, %r104, 
%r4183, %r4186; + mad.lo.s32 %r4188, %r107, %r4181, %r4187; + mad.lo.s32 %r4189, %r108, %r4179, %r4188; + ld.const.v4.u8 {%rs2682, %rs2683, %rs2684, %rs2685}, [matrix+1340]; + cvt.u32.u16 %r4190, %rs2685; + cvt.s32.s8 %r4191, %r4190; + cvt.u32.u16 %r4192, %rs2684; + cvt.s32.s8 %r4193, %r4192; + cvt.u32.u16 %r4194, %rs2683; + cvt.s32.s8 %r4195, %r4194; + cvt.u32.u16 %r4196, %rs2682; + cvt.s32.s8 %r4197, %r4196; + mad.lo.s32 %r4198, %r111, %r4197, %r4189; + mad.lo.s32 %r4199, %r112, %r4195, %r4198; + mad.lo.s32 %r4200, %r114, %r4193, %r4199; + mad.lo.s32 %r4201, %r115, %r4191, %r4200; + ld.const.v4.u8 {%rs2690, %rs2691, %rs2692, %rs2693}, [matrix+1344]; + cvt.u32.u16 %r4202, %rs2693; + cvt.s32.s8 %r4203, %r4202; + cvt.u32.u16 %r4204, %rs2692; + cvt.s32.s8 %r4205, %r4204; + cvt.u32.u16 %r4206, %rs2690; + cvt.s32.s8 %r4207, %r4206; + cvt.u32.u16 %r4208, %rs2691; + cvt.s32.s8 %r4209, %r4208; + mul.lo.s32 %r4210, %r34, %r4209; + mad.lo.s32 %r4211, %r124, %r4207, %r4210; + mad.lo.s32 %r4212, %r35, %r4205, %r4211; + mad.lo.s32 %r4213, %r36, %r4203, %r4212; + ld.const.v4.u8 {%rs2698, %rs2699, %rs2700, %rs2701}, [matrix+1348]; + cvt.u32.u16 %r4214, %rs2701; + cvt.s32.s8 %r4215, %r4214; + cvt.u32.u16 %r4216, %rs2700; + cvt.s32.s8 %r4217, %r4216; + cvt.u32.u16 %r4218, %rs2699; + cvt.s32.s8 %r4219, %r4218; + cvt.u32.u16 %r4220, %rs2698; + cvt.s32.s8 %r4221, %r4220; + mad.lo.s32 %r4222, %r37, %r4221, %r4213; + mad.lo.s32 %r4223, %r38, %r4219, %r4222; + mad.lo.s32 %r4224, %r39, %r4217, %r4223; + mad.lo.s32 %r4225, %r40, %r4215, %r4224; + ld.const.v4.u8 {%rs2706, %rs2707, %rs2708, %rs2709}, [matrix+1352]; + cvt.u32.u16 %r4226, %rs2709; + cvt.s32.s8 %r4227, %r4226; + cvt.u32.u16 %r4228, %rs2708; + cvt.s32.s8 %r4229, %r4228; + cvt.u32.u16 %r4230, %rs2707; + cvt.s32.s8 %r4231, %r4230; + cvt.u32.u16 %r4232, %rs2706; + cvt.s32.s8 %r4233, %r4232; + mad.lo.s32 %r4234, %r42, %r4233, %r4225; + mad.lo.s32 %r4235, %r43, %r4231, %r4234; + mad.lo.s32 %r4236, %r45, %r4229, %r4235; + mad.lo.s32 %r4237, %r46, %r4227, %r4236; + ld.const.v4.u8 {%rs2714, %rs2715, %rs2716, %rs2717}, [matrix+1356]; + cvt.u32.u16 %r4238, %rs2717; + cvt.s32.s8 %r4239, %r4238; + cvt.u32.u16 %r4240, %rs2716; + cvt.s32.s8 %r4241, %r4240; + cvt.u32.u16 %r4242, %rs2715; + cvt.s32.s8 %r4243, %r4242; + cvt.u32.u16 %r4244, %rs2714; + cvt.s32.s8 %r4245, %r4244; + mad.lo.s32 %r4246, %r48, %r4245, %r4237; + mad.lo.s32 %r4247, %r49, %r4243, %r4246; + mad.lo.s32 %r4248, %r50, %r4241, %r4247; + mad.lo.s32 %r4249, %r51, %r4239, %r4248; + ld.const.v4.u8 {%rs2722, %rs2723, %rs2724, %rs2725}, [matrix+1360]; + cvt.u32.u16 %r4250, %rs2725; + cvt.s32.s8 %r4251, %r4250; + cvt.u32.u16 %r4252, %rs2724; + cvt.s32.s8 %r4253, %r4252; + cvt.u32.u16 %r4254, %rs2723; + cvt.s32.s8 %r4255, %r4254; + cvt.u32.u16 %r4256, %rs2722; + cvt.s32.s8 %r4257, %r4256; + mad.lo.s32 %r4258, %r173, %r4257, %r4249; + mad.lo.s32 %r4259, %r53, %r4255, %r4258; + mad.lo.s32 %r4260, %r54, %r4253, %r4259; + mad.lo.s32 %r4261, %r55, %r4251, %r4260; + ld.const.v4.u8 {%rs2730, %rs2731, %rs2732, %rs2733}, [matrix+1364]; + cvt.u32.u16 %r4262, %rs2733; + cvt.s32.s8 %r4263, %r4262; + cvt.u32.u16 %r4264, %rs2732; + cvt.s32.s8 %r4265, %r4264; + cvt.u32.u16 %r4266, %rs2731; + cvt.s32.s8 %r4267, %r4266; + cvt.u32.u16 %r4268, %rs2730; + cvt.s32.s8 %r4269, %r4268; + mad.lo.s32 %r4270, %r56, %r4269, %r4261; + mad.lo.s32 %r4271, %r57, %r4267, %r4270; + mad.lo.s32 %r4272, %r58, %r4265, %r4271; + mad.lo.s32 %r4273, %r59, %r4263, %r4272; + ld.const.v4.u8 {%rs2738, %rs2739, %rs2740, %rs2741}, [matrix+1368]; + 
cvt.u32.u16 %r4274, %rs2741; + cvt.s32.s8 %r4275, %r4274; + cvt.u32.u16 %r4276, %rs2740; + cvt.s32.s8 %r4277, %r4276; + cvt.u32.u16 %r4278, %rs2739; + cvt.s32.s8 %r4279, %r4278; + cvt.u32.u16 %r4280, %rs2738; + cvt.s32.s8 %r4281, %r4280; + mad.lo.s32 %r4282, %r61, %r4281, %r4273; + mad.lo.s32 %r4283, %r62, %r4279, %r4282; + mad.lo.s32 %r4284, %r64, %r4277, %r4283; + mad.lo.s32 %r4285, %r65, %r4275, %r4284; + ld.const.v4.u8 {%rs2746, %rs2747, %rs2748, %rs2749}, [matrix+1372]; + cvt.u32.u16 %r4286, %rs2749; + cvt.s32.s8 %r4287, %r4286; + cvt.u32.u16 %r4288, %rs2748; + cvt.s32.s8 %r4289, %r4288; + cvt.u32.u16 %r4290, %rs2747; + cvt.s32.s8 %r4291, %r4290; + cvt.u32.u16 %r4292, %rs2746; + cvt.s32.s8 %r4293, %r4292; + mad.lo.s32 %r4294, %r67, %r4293, %r4285; + mad.lo.s32 %r4295, %r68, %r4291, %r4294; + mad.lo.s32 %r4296, %r69, %r4289, %r4295; + mad.lo.s32 %r4297, %r70, %r4287, %r4296; + ld.const.v4.u8 {%rs2754, %rs2755, %rs2756, %rs2757}, [matrix+1376]; + cvt.u32.u16 %r4298, %rs2757; + cvt.s32.s8 %r4299, %r4298; + cvt.u32.u16 %r4300, %rs2756; + cvt.s32.s8 %r4301, %r4300; + cvt.u32.u16 %r4302, %rs2755; + cvt.s32.s8 %r4303, %r4302; + cvt.u32.u16 %r4304, %rs2754; + cvt.s32.s8 %r4305, %r4304; + mad.lo.s32 %r4306, %r222, %r4305, %r4297; + mad.lo.s32 %r4307, %r72, %r4303, %r4306; + mad.lo.s32 %r4308, %r73, %r4301, %r4307; + mad.lo.s32 %r4309, %r74, %r4299, %r4308; + ld.const.v4.u8 {%rs2762, %rs2763, %rs2764, %rs2765}, [matrix+1380]; + cvt.u32.u16 %r4310, %rs2765; + cvt.s32.s8 %r4311, %r4310; + cvt.u32.u16 %r4312, %rs2764; + cvt.s32.s8 %r4313, %r4312; + cvt.u32.u16 %r4314, %rs2763; + cvt.s32.s8 %r4315, %r4314; + cvt.u32.u16 %r4316, %rs2762; + cvt.s32.s8 %r4317, %r4316; + mad.lo.s32 %r4318, %r75, %r4317, %r4309; + mad.lo.s32 %r4319, %r76, %r4315, %r4318; + mad.lo.s32 %r4320, %r77, %r4313, %r4319; + mad.lo.s32 %r4321, %r78, %r4311, %r4320; + ld.const.v4.u8 {%rs2770, %rs2771, %rs2772, %rs2773}, [matrix+1384]; + cvt.u32.u16 %r4322, %rs2773; + cvt.s32.s8 %r4323, %r4322; + cvt.u32.u16 %r4324, %rs2772; + cvt.s32.s8 %r4325, %r4324; + cvt.u32.u16 %r4326, %rs2771; + cvt.s32.s8 %r4327, %r4326; + cvt.u32.u16 %r4328, %rs2770; + cvt.s32.s8 %r4329, %r4328; + mad.lo.s32 %r4330, %r80, %r4329, %r4321; + mad.lo.s32 %r4331, %r81, %r4327, %r4330; + mad.lo.s32 %r4332, %r83, %r4325, %r4331; + mad.lo.s32 %r4333, %r84, %r4323, %r4332; + ld.const.v4.u8 {%rs2778, %rs2779, %rs2780, %rs2781}, [matrix+1388]; + cvt.u32.u16 %r4334, %rs2781; + cvt.s32.s8 %r4335, %r4334; + cvt.u32.u16 %r4336, %rs2780; + cvt.s32.s8 %r4337, %r4336; + cvt.u32.u16 %r4338, %rs2779; + cvt.s32.s8 %r4339, %r4338; + cvt.u32.u16 %r4340, %rs2778; + cvt.s32.s8 %r4341, %r4340; + mad.lo.s32 %r4342, %r86, %r4341, %r4333; + mad.lo.s32 %r4343, %r87, %r4339, %r4342; + mad.lo.s32 %r4344, %r88, %r4337, %r4343; + mad.lo.s32 %r4345, %r89, %r4335, %r4344; + ld.const.v4.u8 {%rs2786, %rs2787, %rs2788, %rs2789}, [matrix+1392]; + cvt.u32.u16 %r4346, %rs2789; + cvt.s32.s8 %r4347, %r4346; + cvt.u32.u16 %r4348, %rs2788; + cvt.s32.s8 %r4349, %r4348; + cvt.u32.u16 %r4350, %rs2787; + cvt.s32.s8 %r4351, %r4350; + cvt.u32.u16 %r4352, %rs2786; + cvt.s32.s8 %r4353, %r4352; + mad.lo.s32 %r4354, %r271, %r4353, %r4345; + mad.lo.s32 %r4355, %r91, %r4351, %r4354; + mad.lo.s32 %r4356, %r93, %r4349, %r4355; + mad.lo.s32 %r4357, %r94, %r4347, %r4356; + ld.const.v4.u8 {%rs2794, %rs2795, %rs2796, %rs2797}, [matrix+1396]; + cvt.u32.u16 %r4358, %rs2797; + cvt.s32.s8 %r4359, %r4358; + cvt.u32.u16 %r4360, %rs2796; + cvt.s32.s8 %r4361, %r4360; + cvt.u32.u16 %r4362, %rs2795; + cvt.s32.s8 %r4363, %r4362; 
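+ // Repeating unit of this generated body: each ld.const.v4.u8 fetches four
+ // matrix bytes from constant memory, each cvt.u32.u16/cvt.s32.s8 pair
+ // sign-extends one byte to s32, and the mad.lo.s32 chain folds it into the
+ // running dot product against the cached input registers
+ // (%r34-%r115, %r124, %r173, %r222, %r271).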
+ cvt.u32.u16 %r4364, %rs2794; + cvt.s32.s8 %r4365, %r4364; + mad.lo.s32 %r4366, %r96, %r4365, %r4357; + mad.lo.s32 %r4367, %r97, %r4363, %r4366; + mad.lo.s32 %r4368, %r99, %r4361, %r4367; + mad.lo.s32 %r4369, %r100, %r4359, %r4368; + ld.const.v4.u8 {%rs2802, %rs2803, %rs2804, %rs2805}, [matrix+1400]; + cvt.u32.u16 %r4370, %rs2805; + cvt.s32.s8 %r4371, %r4370; + cvt.u32.u16 %r4372, %rs2804; + cvt.s32.s8 %r4373, %r4372; + cvt.u32.u16 %r4374, %rs2803; + cvt.s32.s8 %r4375, %r4374; + cvt.u32.u16 %r4376, %rs2802; + cvt.s32.s8 %r4377, %r4376; + mad.lo.s32 %r4378, %r103, %r4377, %r4369; + mad.lo.s32 %r4379, %r104, %r4375, %r4378; + mad.lo.s32 %r4380, %r107, %r4373, %r4379; + mad.lo.s32 %r4381, %r108, %r4371, %r4380; + ld.const.v4.u8 {%rs2810, %rs2811, %rs2812, %rs2813}, [matrix+1404]; + cvt.u32.u16 %r4382, %rs2813; + cvt.s32.s8 %r4383, %r4382; + cvt.u32.u16 %r4384, %rs2812; + cvt.s32.s8 %r4385, %r4384; + cvt.u32.u16 %r4386, %rs2811; + cvt.s32.s8 %r4387, %r4386; + cvt.u32.u16 %r4388, %rs2810; + cvt.s32.s8 %r4389, %r4388; + mad.lo.s32 %r4390, %r111, %r4389, %r4381; + mad.lo.s32 %r4391, %r112, %r4387, %r4390; + mad.lo.s32 %r4392, %r114, %r4385, %r4391; + mad.lo.s32 %r4393, %r115, %r4383, %r4392; + shr.u32 %r4394, %r4201, 6; + and.b32 %r4395, %r4394, 240; + shr.u32 %r4396, %r4393, 10; + or.b32 %r4397, %r4396, %r4395; + xor.b32 %r4398, %r21, %r4397; + cvt.u64.u32 %rd388, %r4398; + ld.const.v4.u8 {%rs2818, %rs2819, %rs2820, %rs2821}, [matrix+1408]; + cvt.u32.u16 %r4399, %rs2821; + cvt.s32.s8 %r4400, %r4399; + cvt.u32.u16 %r4401, %rs2820; + cvt.s32.s8 %r4402, %r4401; + cvt.u32.u16 %r4403, %rs2818; + cvt.s32.s8 %r4404, %r4403; + cvt.u32.u16 %r4405, %rs2819; + cvt.s32.s8 %r4406, %r4405; + mul.lo.s32 %r4407, %r34, %r4406; + mad.lo.s32 %r4408, %r124, %r4404, %r4407; + mad.lo.s32 %r4409, %r35, %r4402, %r4408; + mad.lo.s32 %r4410, %r36, %r4400, %r4409; + ld.const.v4.u8 {%rs2826, %rs2827, %rs2828, %rs2829}, [matrix+1412]; + cvt.u32.u16 %r4411, %rs2829; + cvt.s32.s8 %r4412, %r4411; + cvt.u32.u16 %r4413, %rs2828; + cvt.s32.s8 %r4414, %r4413; + cvt.u32.u16 %r4415, %rs2827; + cvt.s32.s8 %r4416, %r4415; + cvt.u32.u16 %r4417, %rs2826; + cvt.s32.s8 %r4418, %r4417; + mad.lo.s32 %r4419, %r37, %r4418, %r4410; + mad.lo.s32 %r4420, %r38, %r4416, %r4419; + mad.lo.s32 %r4421, %r39, %r4414, %r4420; + mad.lo.s32 %r4422, %r40, %r4412, %r4421; + ld.const.v4.u8 {%rs2834, %rs2835, %rs2836, %rs2837}, [matrix+1416]; + cvt.u32.u16 %r4423, %rs2837; + cvt.s32.s8 %r4424, %r4423; + cvt.u32.u16 %r4425, %rs2836; + cvt.s32.s8 %r4426, %r4425; + cvt.u32.u16 %r4427, %rs2835; + cvt.s32.s8 %r4428, %r4427; + cvt.u32.u16 %r4429, %rs2834; + cvt.s32.s8 %r4430, %r4429; + mad.lo.s32 %r4431, %r42, %r4430, %r4422; + mad.lo.s32 %r4432, %r43, %r4428, %r4431; + mad.lo.s32 %r4433, %r45, %r4426, %r4432; + mad.lo.s32 %r4434, %r46, %r4424, %r4433; + ld.const.v4.u8 {%rs2842, %rs2843, %rs2844, %rs2845}, [matrix+1420]; + cvt.u32.u16 %r4435, %rs2845; + cvt.s32.s8 %r4436, %r4435; + cvt.u32.u16 %r4437, %rs2844; + cvt.s32.s8 %r4438, %r4437; + cvt.u32.u16 %r4439, %rs2843; + cvt.s32.s8 %r4440, %r4439; + cvt.u32.u16 %r4441, %rs2842; + cvt.s32.s8 %r4442, %r4441; + mad.lo.s32 %r4443, %r48, %r4442, %r4434; + mad.lo.s32 %r4444, %r49, %r4440, %r4443; + mad.lo.s32 %r4445, %r50, %r4438, %r4444; + mad.lo.s32 %r4446, %r51, %r4436, %r4445; + ld.const.v4.u8 {%rs2850, %rs2851, %rs2852, %rs2853}, [matrix+1424]; + cvt.u32.u16 %r4447, %rs2853; + cvt.s32.s8 %r4448, %r4447; + cvt.u32.u16 %r4449, %rs2852; + cvt.s32.s8 %r4450, %r4449; + cvt.u32.u16 %r4451, %rs2851; + cvt.s32.s8 %r4452, 
%r4451; + cvt.u32.u16 %r4453, %rs2850; + cvt.s32.s8 %r4454, %r4453; + mad.lo.s32 %r4455, %r173, %r4454, %r4446; + mad.lo.s32 %r4456, %r53, %r4452, %r4455; + mad.lo.s32 %r4457, %r54, %r4450, %r4456; + mad.lo.s32 %r4458, %r55, %r4448, %r4457; + ld.const.v4.u8 {%rs2858, %rs2859, %rs2860, %rs2861}, [matrix+1428]; + cvt.u32.u16 %r4459, %rs2861; + cvt.s32.s8 %r4460, %r4459; + cvt.u32.u16 %r4461, %rs2860; + cvt.s32.s8 %r4462, %r4461; + cvt.u32.u16 %r4463, %rs2859; + cvt.s32.s8 %r4464, %r4463; + cvt.u32.u16 %r4465, %rs2858; + cvt.s32.s8 %r4466, %r4465; + mad.lo.s32 %r4467, %r56, %r4466, %r4458; + mad.lo.s32 %r4468, %r57, %r4464, %r4467; + mad.lo.s32 %r4469, %r58, %r4462, %r4468; + mad.lo.s32 %r4470, %r59, %r4460, %r4469; + ld.const.v4.u8 {%rs2866, %rs2867, %rs2868, %rs2869}, [matrix+1432]; + cvt.u32.u16 %r4471, %rs2869; + cvt.s32.s8 %r4472, %r4471; + cvt.u32.u16 %r4473, %rs2868; + cvt.s32.s8 %r4474, %r4473; + cvt.u32.u16 %r4475, %rs2867; + cvt.s32.s8 %r4476, %r4475; + cvt.u32.u16 %r4477, %rs2866; + cvt.s32.s8 %r4478, %r4477; + mad.lo.s32 %r4479, %r61, %r4478, %r4470; + mad.lo.s32 %r4480, %r62, %r4476, %r4479; + mad.lo.s32 %r4481, %r64, %r4474, %r4480; + mad.lo.s32 %r4482, %r65, %r4472, %r4481; + ld.const.v4.u8 {%rs2874, %rs2875, %rs2876, %rs2877}, [matrix+1436]; + cvt.u32.u16 %r4483, %rs2877; + cvt.s32.s8 %r4484, %r4483; + cvt.u32.u16 %r4485, %rs2876; + cvt.s32.s8 %r4486, %r4485; + cvt.u32.u16 %r4487, %rs2875; + cvt.s32.s8 %r4488, %r4487; + cvt.u32.u16 %r4489, %rs2874; + cvt.s32.s8 %r4490, %r4489; + mad.lo.s32 %r4491, %r67, %r4490, %r4482; + mad.lo.s32 %r4492, %r68, %r4488, %r4491; + mad.lo.s32 %r4493, %r69, %r4486, %r4492; + mad.lo.s32 %r4494, %r70, %r4484, %r4493; + ld.const.v4.u8 {%rs2882, %rs2883, %rs2884, %rs2885}, [matrix+1440]; + cvt.u32.u16 %r4495, %rs2885; + cvt.s32.s8 %r4496, %r4495; + cvt.u32.u16 %r4497, %rs2884; + cvt.s32.s8 %r4498, %r4497; + cvt.u32.u16 %r4499, %rs2883; + cvt.s32.s8 %r4500, %r4499; + cvt.u32.u16 %r4501, %rs2882; + cvt.s32.s8 %r4502, %r4501; + mad.lo.s32 %r4503, %r222, %r4502, %r4494; + mad.lo.s32 %r4504, %r72, %r4500, %r4503; + mad.lo.s32 %r4505, %r73, %r4498, %r4504; + mad.lo.s32 %r4506, %r74, %r4496, %r4505; + ld.const.v4.u8 {%rs2890, %rs2891, %rs2892, %rs2893}, [matrix+1444]; + cvt.u32.u16 %r4507, %rs2893; + cvt.s32.s8 %r4508, %r4507; + cvt.u32.u16 %r4509, %rs2892; + cvt.s32.s8 %r4510, %r4509; + cvt.u32.u16 %r4511, %rs2891; + cvt.s32.s8 %r4512, %r4511; + cvt.u32.u16 %r4513, %rs2890; + cvt.s32.s8 %r4514, %r4513; + mad.lo.s32 %r4515, %r75, %r4514, %r4506; + mad.lo.s32 %r4516, %r76, %r4512, %r4515; + mad.lo.s32 %r4517, %r77, %r4510, %r4516; + mad.lo.s32 %r4518, %r78, %r4508, %r4517; + ld.const.v4.u8 {%rs2898, %rs2899, %rs2900, %rs2901}, [matrix+1448]; + cvt.u32.u16 %r4519, %rs2901; + cvt.s32.s8 %r4520, %r4519; + cvt.u32.u16 %r4521, %rs2900; + cvt.s32.s8 %r4522, %r4521; + cvt.u32.u16 %r4523, %rs2899; + cvt.s32.s8 %r4524, %r4523; + cvt.u32.u16 %r4525, %rs2898; + cvt.s32.s8 %r4526, %r4525; + mad.lo.s32 %r4527, %r80, %r4526, %r4518; + mad.lo.s32 %r4528, %r81, %r4524, %r4527; + mad.lo.s32 %r4529, %r83, %r4522, %r4528; + mad.lo.s32 %r4530, %r84, %r4520, %r4529; + ld.const.v4.u8 {%rs2906, %rs2907, %rs2908, %rs2909}, [matrix+1452]; + cvt.u32.u16 %r4531, %rs2909; + cvt.s32.s8 %r4532, %r4531; + cvt.u32.u16 %r4533, %rs2908; + cvt.s32.s8 %r4534, %r4533; + cvt.u32.u16 %r4535, %rs2907; + cvt.s32.s8 %r4536, %r4535; + cvt.u32.u16 %r4537, %rs2906; + cvt.s32.s8 %r4538, %r4537; + mad.lo.s32 %r4539, %r86, %r4538, %r4530; + mad.lo.s32 %r4540, %r87, %r4536, %r4539; + mad.lo.s32 %r4541, 
%r88, %r4534, %r4540; + mad.lo.s32 %r4542, %r89, %r4532, %r4541; + ld.const.v4.u8 {%rs2914, %rs2915, %rs2916, %rs2917}, [matrix+1456]; + cvt.u32.u16 %r4543, %rs2917; + cvt.s32.s8 %r4544, %r4543; + cvt.u32.u16 %r4545, %rs2916; + cvt.s32.s8 %r4546, %r4545; + cvt.u32.u16 %r4547, %rs2915; + cvt.s32.s8 %r4548, %r4547; + cvt.u32.u16 %r4549, %rs2914; + cvt.s32.s8 %r4550, %r4549; + mad.lo.s32 %r4551, %r271, %r4550, %r4542; + mad.lo.s32 %r4552, %r91, %r4548, %r4551; + mad.lo.s32 %r4553, %r93, %r4546, %r4552; + mad.lo.s32 %r4554, %r94, %r4544, %r4553; + ld.const.v4.u8 {%rs2922, %rs2923, %rs2924, %rs2925}, [matrix+1460]; + cvt.u32.u16 %r4555, %rs2925; + cvt.s32.s8 %r4556, %r4555; + cvt.u32.u16 %r4557, %rs2924; + cvt.s32.s8 %r4558, %r4557; + cvt.u32.u16 %r4559, %rs2923; + cvt.s32.s8 %r4560, %r4559; + cvt.u32.u16 %r4561, %rs2922; + cvt.s32.s8 %r4562, %r4561; + mad.lo.s32 %r4563, %r96, %r4562, %r4554; + mad.lo.s32 %r4564, %r97, %r4560, %r4563; + mad.lo.s32 %r4565, %r99, %r4558, %r4564; + mad.lo.s32 %r4566, %r100, %r4556, %r4565; + ld.const.v4.u8 {%rs2930, %rs2931, %rs2932, %rs2933}, [matrix+1464]; + cvt.u32.u16 %r4567, %rs2933; + cvt.s32.s8 %r4568, %r4567; + cvt.u32.u16 %r4569, %rs2932; + cvt.s32.s8 %r4570, %r4569; + cvt.u32.u16 %r4571, %rs2931; + cvt.s32.s8 %r4572, %r4571; + cvt.u32.u16 %r4573, %rs2930; + cvt.s32.s8 %r4574, %r4573; + mad.lo.s32 %r4575, %r103, %r4574, %r4566; + mad.lo.s32 %r4576, %r104, %r4572, %r4575; + mad.lo.s32 %r4577, %r107, %r4570, %r4576; + mad.lo.s32 %r4578, %r108, %r4568, %r4577; + ld.const.v4.u8 {%rs2938, %rs2939, %rs2940, %rs2941}, [matrix+1468]; + cvt.u32.u16 %r4579, %rs2941; + cvt.s32.s8 %r4580, %r4579; + cvt.u32.u16 %r4581, %rs2940; + cvt.s32.s8 %r4582, %r4581; + cvt.u32.u16 %r4583, %rs2939; + cvt.s32.s8 %r4584, %r4583; + cvt.u32.u16 %r4585, %rs2938; + cvt.s32.s8 %r4586, %r4585; + mad.lo.s32 %r4587, %r111, %r4586, %r4578; + mad.lo.s32 %r4588, %r112, %r4584, %r4587; + mad.lo.s32 %r4589, %r114, %r4582, %r4588; + mad.lo.s32 %r4590, %r115, %r4580, %r4589; + ld.const.v4.u8 {%rs2946, %rs2947, %rs2948, %rs2949}, [matrix+1472]; + cvt.u32.u16 %r4591, %rs2949; + cvt.s32.s8 %r4592, %r4591; + cvt.u32.u16 %r4593, %rs2948; + cvt.s32.s8 %r4594, %r4593; + cvt.u32.u16 %r4595, %rs2946; + cvt.s32.s8 %r4596, %r4595; + cvt.u32.u16 %r4597, %rs2947; + cvt.s32.s8 %r4598, %r4597; + mul.lo.s32 %r4599, %r34, %r4598; + mad.lo.s32 %r4600, %r124, %r4596, %r4599; + mad.lo.s32 %r4601, %r35, %r4594, %r4600; + mad.lo.s32 %r4602, %r36, %r4592, %r4601; + ld.const.v4.u8 {%rs2954, %rs2955, %rs2956, %rs2957}, [matrix+1476]; + cvt.u32.u16 %r4603, %rs2957; + cvt.s32.s8 %r4604, %r4603; + cvt.u32.u16 %r4605, %rs2956; + cvt.s32.s8 %r4606, %r4605; + cvt.u32.u16 %r4607, %rs2955; + cvt.s32.s8 %r4608, %r4607; + cvt.u32.u16 %r4609, %rs2954; + cvt.s32.s8 %r4610, %r4609; + mad.lo.s32 %r4611, %r37, %r4610, %r4602; + mad.lo.s32 %r4612, %r38, %r4608, %r4611; + mad.lo.s32 %r4613, %r39, %r4606, %r4612; + mad.lo.s32 %r4614, %r40, %r4604, %r4613; + ld.const.v4.u8 {%rs2962, %rs2963, %rs2964, %rs2965}, [matrix+1480]; + cvt.u32.u16 %r4615, %rs2965; + cvt.s32.s8 %r4616, %r4615; + cvt.u32.u16 %r4617, %rs2964; + cvt.s32.s8 %r4618, %r4617; + cvt.u32.u16 %r4619, %rs2963; + cvt.s32.s8 %r4620, %r4619; + cvt.u32.u16 %r4621, %rs2962; + cvt.s32.s8 %r4622, %r4621; + mad.lo.s32 %r4623, %r42, %r4622, %r4614; + mad.lo.s32 %r4624, %r43, %r4620, %r4623; + mad.lo.s32 %r4625, %r45, %r4618, %r4624; + mad.lo.s32 %r4626, %r46, %r4616, %r4625; + ld.const.v4.u8 {%rs2970, %rs2971, %rs2972, %rs2973}, [matrix+1484]; + cvt.u32.u16 %r4627, %rs2973; + 
cvt.s32.s8 %r4628, %r4627; + cvt.u32.u16 %r4629, %rs2972; + cvt.s32.s8 %r4630, %r4629; + cvt.u32.u16 %r4631, %rs2971; + cvt.s32.s8 %r4632, %r4631; + cvt.u32.u16 %r4633, %rs2970; + cvt.s32.s8 %r4634, %r4633; + mad.lo.s32 %r4635, %r48, %r4634, %r4626; + mad.lo.s32 %r4636, %r49, %r4632, %r4635; + mad.lo.s32 %r4637, %r50, %r4630, %r4636; + mad.lo.s32 %r4638, %r51, %r4628, %r4637; + ld.const.v4.u8 {%rs2978, %rs2979, %rs2980, %rs2981}, [matrix+1488]; + cvt.u32.u16 %r4639, %rs2981; + cvt.s32.s8 %r4640, %r4639; + cvt.u32.u16 %r4641, %rs2980; + cvt.s32.s8 %r4642, %r4641; + cvt.u32.u16 %r4643, %rs2979; + cvt.s32.s8 %r4644, %r4643; + cvt.u32.u16 %r4645, %rs2978; + cvt.s32.s8 %r4646, %r4645; + mad.lo.s32 %r4647, %r173, %r4646, %r4638; + mad.lo.s32 %r4648, %r53, %r4644, %r4647; + mad.lo.s32 %r4649, %r54, %r4642, %r4648; + mad.lo.s32 %r4650, %r55, %r4640, %r4649; + ld.const.v4.u8 {%rs2986, %rs2987, %rs2988, %rs2989}, [matrix+1492]; + cvt.u32.u16 %r4651, %rs2989; + cvt.s32.s8 %r4652, %r4651; + cvt.u32.u16 %r4653, %rs2988; + cvt.s32.s8 %r4654, %r4653; + cvt.u32.u16 %r4655, %rs2987; + cvt.s32.s8 %r4656, %r4655; + cvt.u32.u16 %r4657, %rs2986; + cvt.s32.s8 %r4658, %r4657; + mad.lo.s32 %r4659, %r56, %r4658, %r4650; + mad.lo.s32 %r4660, %r57, %r4656, %r4659; + mad.lo.s32 %r4661, %r58, %r4654, %r4660; + mad.lo.s32 %r4662, %r59, %r4652, %r4661; + ld.const.v4.u8 {%rs2994, %rs2995, %rs2996, %rs2997}, [matrix+1496]; + cvt.u32.u16 %r4663, %rs2997; + cvt.s32.s8 %r4664, %r4663; + cvt.u32.u16 %r4665, %rs2996; + cvt.s32.s8 %r4666, %r4665; + cvt.u32.u16 %r4667, %rs2995; + cvt.s32.s8 %r4668, %r4667; + cvt.u32.u16 %r4669, %rs2994; + cvt.s32.s8 %r4670, %r4669; + mad.lo.s32 %r4671, %r61, %r4670, %r4662; + mad.lo.s32 %r4672, %r62, %r4668, %r4671; + mad.lo.s32 %r4673, %r64, %r4666, %r4672; + mad.lo.s32 %r4674, %r65, %r4664, %r4673; + ld.const.v4.u8 {%rs3002, %rs3003, %rs3004, %rs3005}, [matrix+1500]; + cvt.u32.u16 %r4675, %rs3005; + cvt.s32.s8 %r4676, %r4675; + cvt.u32.u16 %r4677, %rs3004; + cvt.s32.s8 %r4678, %r4677; + cvt.u32.u16 %r4679, %rs3003; + cvt.s32.s8 %r4680, %r4679; + cvt.u32.u16 %r4681, %rs3002; + cvt.s32.s8 %r4682, %r4681; + mad.lo.s32 %r4683, %r67, %r4682, %r4674; + mad.lo.s32 %r4684, %r68, %r4680, %r4683; + mad.lo.s32 %r4685, %r69, %r4678, %r4684; + mad.lo.s32 %r4686, %r70, %r4676, %r4685; + ld.const.v4.u8 {%rs3010, %rs3011, %rs3012, %rs3013}, [matrix+1504]; + cvt.u32.u16 %r4687, %rs3013; + cvt.s32.s8 %r4688, %r4687; + cvt.u32.u16 %r4689, %rs3012; + cvt.s32.s8 %r4690, %r4689; + cvt.u32.u16 %r4691, %rs3011; + cvt.s32.s8 %r4692, %r4691; + cvt.u32.u16 %r4693, %rs3010; + cvt.s32.s8 %r4694, %r4693; + mad.lo.s32 %r4695, %r222, %r4694, %r4686; + mad.lo.s32 %r4696, %r72, %r4692, %r4695; + mad.lo.s32 %r4697, %r73, %r4690, %r4696; + mad.lo.s32 %r4698, %r74, %r4688, %r4697; + ld.const.v4.u8 {%rs3018, %rs3019, %rs3020, %rs3021}, [matrix+1508]; + cvt.u32.u16 %r4699, %rs3021; + cvt.s32.s8 %r4700, %r4699; + cvt.u32.u16 %r4701, %rs3020; + cvt.s32.s8 %r4702, %r4701; + cvt.u32.u16 %r4703, %rs3019; + cvt.s32.s8 %r4704, %r4703; + cvt.u32.u16 %r4705, %rs3018; + cvt.s32.s8 %r4706, %r4705; + mad.lo.s32 %r4707, %r75, %r4706, %r4698; + mad.lo.s32 %r4708, %r76, %r4704, %r4707; + mad.lo.s32 %r4709, %r77, %r4702, %r4708; + mad.lo.s32 %r4710, %r78, %r4700, %r4709; + ld.const.v4.u8 {%rs3026, %rs3027, %rs3028, %rs3029}, [matrix+1512]; + cvt.u32.u16 %r4711, %rs3029; + cvt.s32.s8 %r4712, %r4711; + cvt.u32.u16 %r4713, %rs3028; + cvt.s32.s8 %r4714, %r4713; + cvt.u32.u16 %r4715, %rs3027; + cvt.s32.s8 %r4716, %r4715; + cvt.u32.u16 %r4717, %rs3026; 
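+ // Each pair of 64-element dot products ends with a pack step (shr/and/or,
+ // visible below): bits 10-13 of the first sum are placed in the high nibble
+ // ((acc >> 6) & 0xF0), bits 10-13 of the second in the low nibble
+ // (acc >> 10), and the result is XORed into the per-lane input word before
+ // being widened to u64 (%rd387-%rd390 in this hunk).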
+ cvt.s32.s8 %r4718, %r4717; + mad.lo.s32 %r4719, %r80, %r4718, %r4710; + mad.lo.s32 %r4720, %r81, %r4716, %r4719; + mad.lo.s32 %r4721, %r83, %r4714, %r4720; + mad.lo.s32 %r4722, %r84, %r4712, %r4721; + ld.const.v4.u8 {%rs3034, %rs3035, %rs3036, %rs3037}, [matrix+1516]; + cvt.u32.u16 %r4723, %rs3037; + cvt.s32.s8 %r4724, %r4723; + cvt.u32.u16 %r4725, %rs3036; + cvt.s32.s8 %r4726, %r4725; + cvt.u32.u16 %r4727, %rs3035; + cvt.s32.s8 %r4728, %r4727; + cvt.u32.u16 %r4729, %rs3034; + cvt.s32.s8 %r4730, %r4729; + mad.lo.s32 %r4731, %r86, %r4730, %r4722; + mad.lo.s32 %r4732, %r87, %r4728, %r4731; + mad.lo.s32 %r4733, %r88, %r4726, %r4732; + mad.lo.s32 %r4734, %r89, %r4724, %r4733; + ld.const.v4.u8 {%rs3042, %rs3043, %rs3044, %rs3045}, [matrix+1520]; + cvt.u32.u16 %r4735, %rs3045; + cvt.s32.s8 %r4736, %r4735; + cvt.u32.u16 %r4737, %rs3044; + cvt.s32.s8 %r4738, %r4737; + cvt.u32.u16 %r4739, %rs3043; + cvt.s32.s8 %r4740, %r4739; + cvt.u32.u16 %r4741, %rs3042; + cvt.s32.s8 %r4742, %r4741; + mad.lo.s32 %r4743, %r271, %r4742, %r4734; + mad.lo.s32 %r4744, %r91, %r4740, %r4743; + mad.lo.s32 %r4745, %r93, %r4738, %r4744; + mad.lo.s32 %r4746, %r94, %r4736, %r4745; + ld.const.v4.u8 {%rs3050, %rs3051, %rs3052, %rs3053}, [matrix+1524]; + cvt.u32.u16 %r4747, %rs3053; + cvt.s32.s8 %r4748, %r4747; + cvt.u32.u16 %r4749, %rs3052; + cvt.s32.s8 %r4750, %r4749; + cvt.u32.u16 %r4751, %rs3051; + cvt.s32.s8 %r4752, %r4751; + cvt.u32.u16 %r4753, %rs3050; + cvt.s32.s8 %r4754, %r4753; + mad.lo.s32 %r4755, %r96, %r4754, %r4746; + mad.lo.s32 %r4756, %r97, %r4752, %r4755; + mad.lo.s32 %r4757, %r99, %r4750, %r4756; + mad.lo.s32 %r4758, %r100, %r4748, %r4757; + ld.const.v4.u8 {%rs3058, %rs3059, %rs3060, %rs3061}, [matrix+1528]; + cvt.u32.u16 %r4759, %rs3061; + cvt.s32.s8 %r4760, %r4759; + cvt.u32.u16 %r4761, %rs3060; + cvt.s32.s8 %r4762, %r4761; + cvt.u32.u16 %r4763, %rs3059; + cvt.s32.s8 %r4764, %r4763; + cvt.u32.u16 %r4765, %rs3058; + cvt.s32.s8 %r4766, %r4765; + mad.lo.s32 %r4767, %r103, %r4766, %r4758; + mad.lo.s32 %r4768, %r104, %r4764, %r4767; + mad.lo.s32 %r4769, %r107, %r4762, %r4768; + mad.lo.s32 %r4770, %r108, %r4760, %r4769; + ld.const.v4.u8 {%rs3066, %rs3067, %rs3068, %rs3069}, [matrix+1532]; + cvt.u32.u16 %r4771, %rs3069; + cvt.s32.s8 %r4772, %r4771; + cvt.u32.u16 %r4773, %rs3068; + cvt.s32.s8 %r4774, %r4773; + cvt.u32.u16 %r4775, %rs3067; + cvt.s32.s8 %r4776, %r4775; + cvt.u32.u16 %r4777, %rs3066; + cvt.s32.s8 %r4778, %r4777; + mad.lo.s32 %r4779, %r111, %r4778, %r4770; + mad.lo.s32 %r4780, %r112, %r4776, %r4779; + mad.lo.s32 %r4781, %r114, %r4774, %r4780; + mad.lo.s32 %r4782, %r115, %r4772, %r4781; + shr.u32 %r4783, %r4590, 6; + and.b32 %r4784, %r4783, 240; + shr.u32 %r4785, %r4782, 10; + or.b32 %r4786, %r4785, %r4784; + xor.b32 %r4787, %r22, %r4786; + cvt.u64.u32 %rd389, %r4787; + ld.const.v4.u8 {%rs3074, %rs3075, %rs3076, %rs3077}, [matrix+1536]; + cvt.u32.u16 %r4788, %rs3077; + cvt.s32.s8 %r4789, %r4788; + cvt.u32.u16 %r4790, %rs3076; + cvt.s32.s8 %r4791, %r4790; + cvt.u32.u16 %r4792, %rs3074; + cvt.s32.s8 %r4793, %r4792; + cvt.u32.u16 %r4794, %rs3075; + cvt.s32.s8 %r4795, %r4794; + mul.lo.s32 %r4796, %r34, %r4795; + mad.lo.s32 %r4797, %r124, %r4793, %r4796; + mad.lo.s32 %r4798, %r35, %r4791, %r4797; + mad.lo.s32 %r4799, %r36, %r4789, %r4798; + ld.const.v4.u8 {%rs3082, %rs3083, %rs3084, %rs3085}, [matrix+1540]; + cvt.u32.u16 %r4800, %rs3085; + cvt.s32.s8 %r4801, %r4800; + cvt.u32.u16 %r4802, %rs3084; + cvt.s32.s8 %r4803, %r4802; + cvt.u32.u16 %r4804, %rs3083; + cvt.s32.s8 %r4805, %r4804; + cvt.u32.u16 %r4806, 
%rs3082; + cvt.s32.s8 %r4807, %r4806; + mad.lo.s32 %r4808, %r37, %r4807, %r4799; + mad.lo.s32 %r4809, %r38, %r4805, %r4808; + mad.lo.s32 %r4810, %r39, %r4803, %r4809; + mad.lo.s32 %r4811, %r40, %r4801, %r4810; + ld.const.v4.u8 {%rs3090, %rs3091, %rs3092, %rs3093}, [matrix+1544]; + cvt.u32.u16 %r4812, %rs3093; + cvt.s32.s8 %r4813, %r4812; + cvt.u32.u16 %r4814, %rs3092; + cvt.s32.s8 %r4815, %r4814; + cvt.u32.u16 %r4816, %rs3091; + cvt.s32.s8 %r4817, %r4816; + cvt.u32.u16 %r4818, %rs3090; + cvt.s32.s8 %r4819, %r4818; + mad.lo.s32 %r4820, %r42, %r4819, %r4811; + mad.lo.s32 %r4821, %r43, %r4817, %r4820; + mad.lo.s32 %r4822, %r45, %r4815, %r4821; + mad.lo.s32 %r4823, %r46, %r4813, %r4822; + ld.const.v4.u8 {%rs3098, %rs3099, %rs3100, %rs3101}, [matrix+1548]; + cvt.u32.u16 %r4824, %rs3101; + cvt.s32.s8 %r4825, %r4824; + cvt.u32.u16 %r4826, %rs3100; + cvt.s32.s8 %r4827, %r4826; + cvt.u32.u16 %r4828, %rs3099; + cvt.s32.s8 %r4829, %r4828; + cvt.u32.u16 %r4830, %rs3098; + cvt.s32.s8 %r4831, %r4830; + mad.lo.s32 %r4832, %r48, %r4831, %r4823; + mad.lo.s32 %r4833, %r49, %r4829, %r4832; + mad.lo.s32 %r4834, %r50, %r4827, %r4833; + mad.lo.s32 %r4835, %r51, %r4825, %r4834; + ld.const.v4.u8 {%rs3106, %rs3107, %rs3108, %rs3109}, [matrix+1552]; + cvt.u32.u16 %r4836, %rs3109; + cvt.s32.s8 %r4837, %r4836; + cvt.u32.u16 %r4838, %rs3108; + cvt.s32.s8 %r4839, %r4838; + cvt.u32.u16 %r4840, %rs3107; + cvt.s32.s8 %r4841, %r4840; + cvt.u32.u16 %r4842, %rs3106; + cvt.s32.s8 %r4843, %r4842; + mad.lo.s32 %r4844, %r173, %r4843, %r4835; + mad.lo.s32 %r4845, %r53, %r4841, %r4844; + mad.lo.s32 %r4846, %r54, %r4839, %r4845; + mad.lo.s32 %r4847, %r55, %r4837, %r4846; + ld.const.v4.u8 {%rs3114, %rs3115, %rs3116, %rs3117}, [matrix+1556]; + cvt.u32.u16 %r4848, %rs3117; + cvt.s32.s8 %r4849, %r4848; + cvt.u32.u16 %r4850, %rs3116; + cvt.s32.s8 %r4851, %r4850; + cvt.u32.u16 %r4852, %rs3115; + cvt.s32.s8 %r4853, %r4852; + cvt.u32.u16 %r4854, %rs3114; + cvt.s32.s8 %r4855, %r4854; + mad.lo.s32 %r4856, %r56, %r4855, %r4847; + mad.lo.s32 %r4857, %r57, %r4853, %r4856; + mad.lo.s32 %r4858, %r58, %r4851, %r4857; + mad.lo.s32 %r4859, %r59, %r4849, %r4858; + ld.const.v4.u8 {%rs3122, %rs3123, %rs3124, %rs3125}, [matrix+1560]; + cvt.u32.u16 %r4860, %rs3125; + cvt.s32.s8 %r4861, %r4860; + cvt.u32.u16 %r4862, %rs3124; + cvt.s32.s8 %r4863, %r4862; + cvt.u32.u16 %r4864, %rs3123; + cvt.s32.s8 %r4865, %r4864; + cvt.u32.u16 %r4866, %rs3122; + cvt.s32.s8 %r4867, %r4866; + mad.lo.s32 %r4868, %r61, %r4867, %r4859; + mad.lo.s32 %r4869, %r62, %r4865, %r4868; + mad.lo.s32 %r4870, %r64, %r4863, %r4869; + mad.lo.s32 %r4871, %r65, %r4861, %r4870; + ld.const.v4.u8 {%rs3130, %rs3131, %rs3132, %rs3133}, [matrix+1564]; + cvt.u32.u16 %r4872, %rs3133; + cvt.s32.s8 %r4873, %r4872; + cvt.u32.u16 %r4874, %rs3132; + cvt.s32.s8 %r4875, %r4874; + cvt.u32.u16 %r4876, %rs3131; + cvt.s32.s8 %r4877, %r4876; + cvt.u32.u16 %r4878, %rs3130; + cvt.s32.s8 %r4879, %r4878; + mad.lo.s32 %r4880, %r67, %r4879, %r4871; + mad.lo.s32 %r4881, %r68, %r4877, %r4880; + mad.lo.s32 %r4882, %r69, %r4875, %r4881; + mad.lo.s32 %r4883, %r70, %r4873, %r4882; + ld.const.v4.u8 {%rs3138, %rs3139, %rs3140, %rs3141}, [matrix+1568]; + cvt.u32.u16 %r4884, %rs3141; + cvt.s32.s8 %r4885, %r4884; + cvt.u32.u16 %r4886, %rs3140; + cvt.s32.s8 %r4887, %r4886; + cvt.u32.u16 %r4888, %rs3139; + cvt.s32.s8 %r4889, %r4888; + cvt.u32.u16 %r4890, %rs3138; + cvt.s32.s8 %r4891, %r4890; + mad.lo.s32 %r4892, %r222, %r4891, %r4883; + mad.lo.s32 %r4893, %r72, %r4889, %r4892; + mad.lo.s32 %r4894, %r73, %r4887, %r4893; + 
mad.lo.s32 %r4895, %r74, %r4885, %r4894; + ld.const.v4.u8 {%rs3146, %rs3147, %rs3148, %rs3149}, [matrix+1572]; + cvt.u32.u16 %r4896, %rs3149; + cvt.s32.s8 %r4897, %r4896; + cvt.u32.u16 %r4898, %rs3148; + cvt.s32.s8 %r4899, %r4898; + cvt.u32.u16 %r4900, %rs3147; + cvt.s32.s8 %r4901, %r4900; + cvt.u32.u16 %r4902, %rs3146; + cvt.s32.s8 %r4903, %r4902; + mad.lo.s32 %r4904, %r75, %r4903, %r4895; + mad.lo.s32 %r4905, %r76, %r4901, %r4904; + mad.lo.s32 %r4906, %r77, %r4899, %r4905; + mad.lo.s32 %r4907, %r78, %r4897, %r4906; + ld.const.v4.u8 {%rs3154, %rs3155, %rs3156, %rs3157}, [matrix+1576]; + cvt.u32.u16 %r4908, %rs3157; + cvt.s32.s8 %r4909, %r4908; + cvt.u32.u16 %r4910, %rs3156; + cvt.s32.s8 %r4911, %r4910; + cvt.u32.u16 %r4912, %rs3155; + cvt.s32.s8 %r4913, %r4912; + cvt.u32.u16 %r4914, %rs3154; + cvt.s32.s8 %r4915, %r4914; + mad.lo.s32 %r4916, %r80, %r4915, %r4907; + mad.lo.s32 %r4917, %r81, %r4913, %r4916; + mad.lo.s32 %r4918, %r83, %r4911, %r4917; + mad.lo.s32 %r4919, %r84, %r4909, %r4918; + ld.const.v4.u8 {%rs3162, %rs3163, %rs3164, %rs3165}, [matrix+1580]; + cvt.u32.u16 %r4920, %rs3165; + cvt.s32.s8 %r4921, %r4920; + cvt.u32.u16 %r4922, %rs3164; + cvt.s32.s8 %r4923, %r4922; + cvt.u32.u16 %r4924, %rs3163; + cvt.s32.s8 %r4925, %r4924; + cvt.u32.u16 %r4926, %rs3162; + cvt.s32.s8 %r4927, %r4926; + mad.lo.s32 %r4928, %r86, %r4927, %r4919; + mad.lo.s32 %r4929, %r87, %r4925, %r4928; + mad.lo.s32 %r4930, %r88, %r4923, %r4929; + mad.lo.s32 %r4931, %r89, %r4921, %r4930; + ld.const.v4.u8 {%rs3170, %rs3171, %rs3172, %rs3173}, [matrix+1584]; + cvt.u32.u16 %r4932, %rs3173; + cvt.s32.s8 %r4933, %r4932; + cvt.u32.u16 %r4934, %rs3172; + cvt.s32.s8 %r4935, %r4934; + cvt.u32.u16 %r4936, %rs3171; + cvt.s32.s8 %r4937, %r4936; + cvt.u32.u16 %r4938, %rs3170; + cvt.s32.s8 %r4939, %r4938; + mad.lo.s32 %r4940, %r271, %r4939, %r4931; + mad.lo.s32 %r4941, %r91, %r4937, %r4940; + mad.lo.s32 %r4942, %r93, %r4935, %r4941; + mad.lo.s32 %r4943, %r94, %r4933, %r4942; + ld.const.v4.u8 {%rs3178, %rs3179, %rs3180, %rs3181}, [matrix+1588]; + cvt.u32.u16 %r4944, %rs3181; + cvt.s32.s8 %r4945, %r4944; + cvt.u32.u16 %r4946, %rs3180; + cvt.s32.s8 %r4947, %r4946; + cvt.u32.u16 %r4948, %rs3179; + cvt.s32.s8 %r4949, %r4948; + cvt.u32.u16 %r4950, %rs3178; + cvt.s32.s8 %r4951, %r4950; + mad.lo.s32 %r4952, %r96, %r4951, %r4943; + mad.lo.s32 %r4953, %r97, %r4949, %r4952; + mad.lo.s32 %r4954, %r99, %r4947, %r4953; + mad.lo.s32 %r4955, %r100, %r4945, %r4954; + ld.const.v4.u8 {%rs3186, %rs3187, %rs3188, %rs3189}, [matrix+1592]; + cvt.u32.u16 %r4956, %rs3189; + cvt.s32.s8 %r4957, %r4956; + cvt.u32.u16 %r4958, %rs3188; + cvt.s32.s8 %r4959, %r4958; + cvt.u32.u16 %r4960, %rs3187; + cvt.s32.s8 %r4961, %r4960; + cvt.u32.u16 %r4962, %rs3186; + cvt.s32.s8 %r4963, %r4962; + mad.lo.s32 %r4964, %r103, %r4963, %r4955; + mad.lo.s32 %r4965, %r104, %r4961, %r4964; + mad.lo.s32 %r4966, %r107, %r4959, %r4965; + mad.lo.s32 %r4967, %r108, %r4957, %r4966; + ld.const.v4.u8 {%rs3194, %rs3195, %rs3196, %rs3197}, [matrix+1596]; + cvt.u32.u16 %r4968, %rs3197; + cvt.s32.s8 %r4969, %r4968; + cvt.u32.u16 %r4970, %rs3196; + cvt.s32.s8 %r4971, %r4970; + cvt.u32.u16 %r4972, %rs3195; + cvt.s32.s8 %r4973, %r4972; + cvt.u32.u16 %r4974, %rs3194; + cvt.s32.s8 %r4975, %r4974; + mad.lo.s32 %r4976, %r111, %r4975, %r4967; + mad.lo.s32 %r4977, %r112, %r4973, %r4976; + mad.lo.s32 %r4978, %r114, %r4971, %r4977; + mad.lo.s32 %r4979, %r115, %r4969, %r4978; + ld.const.v4.u8 {%rs3202, %rs3203, %rs3204, %rs3205}, [matrix+1600]; + cvt.u32.u16 %r4980, %rs3205; + cvt.s32.s8 %r4981, %r4980; 
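+ // The constant-bank offsets advance in 4-byte steps; this hunk walks
+ // [matrix+1080] through [matrix+1684], i.e. the matrix rows are stored
+ // contiguously as signed 8-bit entries in constant memory.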
+ cvt.u32.u16 %r4982, %rs3204; + cvt.s32.s8 %r4983, %r4982; + cvt.u32.u16 %r4984, %rs3202; + cvt.s32.s8 %r4985, %r4984; + cvt.u32.u16 %r4986, %rs3203; + cvt.s32.s8 %r4987, %r4986; + mul.lo.s32 %r4988, %r34, %r4987; + mad.lo.s32 %r4989, %r124, %r4985, %r4988; + mad.lo.s32 %r4990, %r35, %r4983, %r4989; + mad.lo.s32 %r4991, %r36, %r4981, %r4990; + ld.const.v4.u8 {%rs3210, %rs3211, %rs3212, %rs3213}, [matrix+1604]; + cvt.u32.u16 %r4992, %rs3213; + cvt.s32.s8 %r4993, %r4992; + cvt.u32.u16 %r4994, %rs3212; + cvt.s32.s8 %r4995, %r4994; + cvt.u32.u16 %r4996, %rs3211; + cvt.s32.s8 %r4997, %r4996; + cvt.u32.u16 %r4998, %rs3210; + cvt.s32.s8 %r4999, %r4998; + mad.lo.s32 %r5000, %r37, %r4999, %r4991; + mad.lo.s32 %r5001, %r38, %r4997, %r5000; + mad.lo.s32 %r5002, %r39, %r4995, %r5001; + mad.lo.s32 %r5003, %r40, %r4993, %r5002; + ld.const.v4.u8 {%rs3218, %rs3219, %rs3220, %rs3221}, [matrix+1608]; + cvt.u32.u16 %r5004, %rs3221; + cvt.s32.s8 %r5005, %r5004; + cvt.u32.u16 %r5006, %rs3220; + cvt.s32.s8 %r5007, %r5006; + cvt.u32.u16 %r5008, %rs3219; + cvt.s32.s8 %r5009, %r5008; + cvt.u32.u16 %r5010, %rs3218; + cvt.s32.s8 %r5011, %r5010; + mad.lo.s32 %r5012, %r42, %r5011, %r5003; + mad.lo.s32 %r5013, %r43, %r5009, %r5012; + mad.lo.s32 %r5014, %r45, %r5007, %r5013; + mad.lo.s32 %r5015, %r46, %r5005, %r5014; + ld.const.v4.u8 {%rs3226, %rs3227, %rs3228, %rs3229}, [matrix+1612]; + cvt.u32.u16 %r5016, %rs3229; + cvt.s32.s8 %r5017, %r5016; + cvt.u32.u16 %r5018, %rs3228; + cvt.s32.s8 %r5019, %r5018; + cvt.u32.u16 %r5020, %rs3227; + cvt.s32.s8 %r5021, %r5020; + cvt.u32.u16 %r5022, %rs3226; + cvt.s32.s8 %r5023, %r5022; + mad.lo.s32 %r5024, %r48, %r5023, %r5015; + mad.lo.s32 %r5025, %r49, %r5021, %r5024; + mad.lo.s32 %r5026, %r50, %r5019, %r5025; + mad.lo.s32 %r5027, %r51, %r5017, %r5026; + ld.const.v4.u8 {%rs3234, %rs3235, %rs3236, %rs3237}, [matrix+1616]; + cvt.u32.u16 %r5028, %rs3237; + cvt.s32.s8 %r5029, %r5028; + cvt.u32.u16 %r5030, %rs3236; + cvt.s32.s8 %r5031, %r5030; + cvt.u32.u16 %r5032, %rs3235; + cvt.s32.s8 %r5033, %r5032; + cvt.u32.u16 %r5034, %rs3234; + cvt.s32.s8 %r5035, %r5034; + mad.lo.s32 %r5036, %r173, %r5035, %r5027; + mad.lo.s32 %r5037, %r53, %r5033, %r5036; + mad.lo.s32 %r5038, %r54, %r5031, %r5037; + mad.lo.s32 %r5039, %r55, %r5029, %r5038; + ld.const.v4.u8 {%rs3242, %rs3243, %rs3244, %rs3245}, [matrix+1620]; + cvt.u32.u16 %r5040, %rs3245; + cvt.s32.s8 %r5041, %r5040; + cvt.u32.u16 %r5042, %rs3244; + cvt.s32.s8 %r5043, %r5042; + cvt.u32.u16 %r5044, %rs3243; + cvt.s32.s8 %r5045, %r5044; + cvt.u32.u16 %r5046, %rs3242; + cvt.s32.s8 %r5047, %r5046; + mad.lo.s32 %r5048, %r56, %r5047, %r5039; + mad.lo.s32 %r5049, %r57, %r5045, %r5048; + mad.lo.s32 %r5050, %r58, %r5043, %r5049; + mad.lo.s32 %r5051, %r59, %r5041, %r5050; + ld.const.v4.u8 {%rs3250, %rs3251, %rs3252, %rs3253}, [matrix+1624]; + cvt.u32.u16 %r5052, %rs3253; + cvt.s32.s8 %r5053, %r5052; + cvt.u32.u16 %r5054, %rs3252; + cvt.s32.s8 %r5055, %r5054; + cvt.u32.u16 %r5056, %rs3251; + cvt.s32.s8 %r5057, %r5056; + cvt.u32.u16 %r5058, %rs3250; + cvt.s32.s8 %r5059, %r5058; + mad.lo.s32 %r5060, %r61, %r5059, %r5051; + mad.lo.s32 %r5061, %r62, %r5057, %r5060; + mad.lo.s32 %r5062, %r64, %r5055, %r5061; + mad.lo.s32 %r5063, %r65, %r5053, %r5062; + ld.const.v4.u8 {%rs3258, %rs3259, %rs3260, %rs3261}, [matrix+1628]; + cvt.u32.u16 %r5064, %rs3261; + cvt.s32.s8 %r5065, %r5064; + cvt.u32.u16 %r5066, %rs3260; + cvt.s32.s8 %r5067, %r5066; + cvt.u32.u16 %r5068, %rs3259; + cvt.s32.s8 %r5069, %r5068; + cvt.u32.u16 %r5070, %rs3258; + cvt.s32.s8 %r5071, %r5070; + 
mad.lo.s32 %r5072, %r67, %r5071, %r5063; + mad.lo.s32 %r5073, %r68, %r5069, %r5072; + mad.lo.s32 %r5074, %r69, %r5067, %r5073; + mad.lo.s32 %r5075, %r70, %r5065, %r5074; + ld.const.v4.u8 {%rs3266, %rs3267, %rs3268, %rs3269}, [matrix+1632]; + cvt.u32.u16 %r5076, %rs3269; + cvt.s32.s8 %r5077, %r5076; + cvt.u32.u16 %r5078, %rs3268; + cvt.s32.s8 %r5079, %r5078; + cvt.u32.u16 %r5080, %rs3267; + cvt.s32.s8 %r5081, %r5080; + cvt.u32.u16 %r5082, %rs3266; + cvt.s32.s8 %r5083, %r5082; + mad.lo.s32 %r5084, %r222, %r5083, %r5075; + mad.lo.s32 %r5085, %r72, %r5081, %r5084; + mad.lo.s32 %r5086, %r73, %r5079, %r5085; + mad.lo.s32 %r5087, %r74, %r5077, %r5086; + ld.const.v4.u8 {%rs3274, %rs3275, %rs3276, %rs3277}, [matrix+1636]; + cvt.u32.u16 %r5088, %rs3277; + cvt.s32.s8 %r5089, %r5088; + cvt.u32.u16 %r5090, %rs3276; + cvt.s32.s8 %r5091, %r5090; + cvt.u32.u16 %r5092, %rs3275; + cvt.s32.s8 %r5093, %r5092; + cvt.u32.u16 %r5094, %rs3274; + cvt.s32.s8 %r5095, %r5094; + mad.lo.s32 %r5096, %r75, %r5095, %r5087; + mad.lo.s32 %r5097, %r76, %r5093, %r5096; + mad.lo.s32 %r5098, %r77, %r5091, %r5097; + mad.lo.s32 %r5099, %r78, %r5089, %r5098; + ld.const.v4.u8 {%rs3282, %rs3283, %rs3284, %rs3285}, [matrix+1640]; + cvt.u32.u16 %r5100, %rs3285; + cvt.s32.s8 %r5101, %r5100; + cvt.u32.u16 %r5102, %rs3284; + cvt.s32.s8 %r5103, %r5102; + cvt.u32.u16 %r5104, %rs3283; + cvt.s32.s8 %r5105, %r5104; + cvt.u32.u16 %r5106, %rs3282; + cvt.s32.s8 %r5107, %r5106; + mad.lo.s32 %r5108, %r80, %r5107, %r5099; + mad.lo.s32 %r5109, %r81, %r5105, %r5108; + mad.lo.s32 %r5110, %r83, %r5103, %r5109; + mad.lo.s32 %r5111, %r84, %r5101, %r5110; + ld.const.v4.u8 {%rs3290, %rs3291, %rs3292, %rs3293}, [matrix+1644]; + cvt.u32.u16 %r5112, %rs3293; + cvt.s32.s8 %r5113, %r5112; + cvt.u32.u16 %r5114, %rs3292; + cvt.s32.s8 %r5115, %r5114; + cvt.u32.u16 %r5116, %rs3291; + cvt.s32.s8 %r5117, %r5116; + cvt.u32.u16 %r5118, %rs3290; + cvt.s32.s8 %r5119, %r5118; + mad.lo.s32 %r5120, %r86, %r5119, %r5111; + mad.lo.s32 %r5121, %r87, %r5117, %r5120; + mad.lo.s32 %r5122, %r88, %r5115, %r5121; + mad.lo.s32 %r5123, %r89, %r5113, %r5122; + ld.const.v4.u8 {%rs3298, %rs3299, %rs3300, %rs3301}, [matrix+1648]; + cvt.u32.u16 %r5124, %rs3301; + cvt.s32.s8 %r5125, %r5124; + cvt.u32.u16 %r5126, %rs3300; + cvt.s32.s8 %r5127, %r5126; + cvt.u32.u16 %r5128, %rs3299; + cvt.s32.s8 %r5129, %r5128; + cvt.u32.u16 %r5130, %rs3298; + cvt.s32.s8 %r5131, %r5130; + mad.lo.s32 %r5132, %r271, %r5131, %r5123; + mad.lo.s32 %r5133, %r91, %r5129, %r5132; + mad.lo.s32 %r5134, %r93, %r5127, %r5133; + mad.lo.s32 %r5135, %r94, %r5125, %r5134; + ld.const.v4.u8 {%rs3306, %rs3307, %rs3308, %rs3309}, [matrix+1652]; + cvt.u32.u16 %r5136, %rs3309; + cvt.s32.s8 %r5137, %r5136; + cvt.u32.u16 %r5138, %rs3308; + cvt.s32.s8 %r5139, %r5138; + cvt.u32.u16 %r5140, %rs3307; + cvt.s32.s8 %r5141, %r5140; + cvt.u32.u16 %r5142, %rs3306; + cvt.s32.s8 %r5143, %r5142; + mad.lo.s32 %r5144, %r96, %r5143, %r5135; + mad.lo.s32 %r5145, %r97, %r5141, %r5144; + mad.lo.s32 %r5146, %r99, %r5139, %r5145; + mad.lo.s32 %r5147, %r100, %r5137, %r5146; + ld.const.v4.u8 {%rs3314, %rs3315, %rs3316, %rs3317}, [matrix+1656]; + cvt.u32.u16 %r5148, %rs3317; + cvt.s32.s8 %r5149, %r5148; + cvt.u32.u16 %r5150, %rs3316; + cvt.s32.s8 %r5151, %r5150; + cvt.u32.u16 %r5152, %rs3315; + cvt.s32.s8 %r5153, %r5152; + cvt.u32.u16 %r5154, %rs3314; + cvt.s32.s8 %r5155, %r5154; + mad.lo.s32 %r5156, %r103, %r5155, %r5147; + mad.lo.s32 %r5157, %r104, %r5153, %r5156; + mad.lo.s32 %r5158, %r107, %r5151, %r5157; + mad.lo.s32 %r5159, %r108, %r5149, %r5158; + 
ld.const.v4.u8 {%rs3322, %rs3323, %rs3324, %rs3325}, [matrix+1660]; + cvt.u32.u16 %r5160, %rs3325; + cvt.s32.s8 %r5161, %r5160; + cvt.u32.u16 %r5162, %rs3324; + cvt.s32.s8 %r5163, %r5162; + cvt.u32.u16 %r5164, %rs3323; + cvt.s32.s8 %r5165, %r5164; + cvt.u32.u16 %r5166, %rs3322; + cvt.s32.s8 %r5167, %r5166; + mad.lo.s32 %r5168, %r111, %r5167, %r5159; + mad.lo.s32 %r5169, %r112, %r5165, %r5168; + mad.lo.s32 %r5170, %r114, %r5163, %r5169; + mad.lo.s32 %r5171, %r115, %r5161, %r5170; + shr.u32 %r5172, %r4979, 6; + and.b32 %r5173, %r5172, 240; + shr.u32 %r5174, %r5171, 10; + or.b32 %r5175, %r5174, %r5173; + xor.b32 %r5176, %r23, %r5175; + cvt.u64.u32 %rd390, %r5176; + ld.const.v4.u8 {%rs3330, %rs3331, %rs3332, %rs3333}, [matrix+1664]; + cvt.u32.u16 %r5177, %rs3333; + cvt.s32.s8 %r5178, %r5177; + cvt.u32.u16 %r5179, %rs3332; + cvt.s32.s8 %r5180, %r5179; + cvt.u32.u16 %r5181, %rs3330; + cvt.s32.s8 %r5182, %r5181; + cvt.u32.u16 %r5183, %rs3331; + cvt.s32.s8 %r5184, %r5183; + mul.lo.s32 %r5185, %r34, %r5184; + mad.lo.s32 %r5186, %r124, %r5182, %r5185; + mad.lo.s32 %r5187, %r35, %r5180, %r5186; + mad.lo.s32 %r5188, %r36, %r5178, %r5187; + ld.const.v4.u8 {%rs3338, %rs3339, %rs3340, %rs3341}, [matrix+1668]; + cvt.u32.u16 %r5189, %rs3341; + cvt.s32.s8 %r5190, %r5189; + cvt.u32.u16 %r5191, %rs3340; + cvt.s32.s8 %r5192, %r5191; + cvt.u32.u16 %r5193, %rs3339; + cvt.s32.s8 %r5194, %r5193; + cvt.u32.u16 %r5195, %rs3338; + cvt.s32.s8 %r5196, %r5195; + mad.lo.s32 %r5197, %r37, %r5196, %r5188; + mad.lo.s32 %r5198, %r38, %r5194, %r5197; + mad.lo.s32 %r5199, %r39, %r5192, %r5198; + mad.lo.s32 %r5200, %r40, %r5190, %r5199; + ld.const.v4.u8 {%rs3346, %rs3347, %rs3348, %rs3349}, [matrix+1672]; + cvt.u32.u16 %r5201, %rs3349; + cvt.s32.s8 %r5202, %r5201; + cvt.u32.u16 %r5203, %rs3348; + cvt.s32.s8 %r5204, %r5203; + cvt.u32.u16 %r5205, %rs3347; + cvt.s32.s8 %r5206, %r5205; + cvt.u32.u16 %r5207, %rs3346; + cvt.s32.s8 %r5208, %r5207; + mad.lo.s32 %r5209, %r42, %r5208, %r5200; + mad.lo.s32 %r5210, %r43, %r5206, %r5209; + mad.lo.s32 %r5211, %r45, %r5204, %r5210; + mad.lo.s32 %r5212, %r46, %r5202, %r5211; + ld.const.v4.u8 {%rs3354, %rs3355, %rs3356, %rs3357}, [matrix+1676]; + cvt.u32.u16 %r5213, %rs3357; + cvt.s32.s8 %r5214, %r5213; + cvt.u32.u16 %r5215, %rs3356; + cvt.s32.s8 %r5216, %r5215; + cvt.u32.u16 %r5217, %rs3355; + cvt.s32.s8 %r5218, %r5217; + cvt.u32.u16 %r5219, %rs3354; + cvt.s32.s8 %r5220, %r5219; + mad.lo.s32 %r5221, %r48, %r5220, %r5212; + mad.lo.s32 %r5222, %r49, %r5218, %r5221; + mad.lo.s32 %r5223, %r50, %r5216, %r5222; + mad.lo.s32 %r5224, %r51, %r5214, %r5223; + ld.const.v4.u8 {%rs3362, %rs3363, %rs3364, %rs3365}, [matrix+1680]; + cvt.u32.u16 %r5225, %rs3365; + cvt.s32.s8 %r5226, %r5225; + cvt.u32.u16 %r5227, %rs3364; + cvt.s32.s8 %r5228, %r5227; + cvt.u32.u16 %r5229, %rs3363; + cvt.s32.s8 %r5230, %r5229; + cvt.u32.u16 %r5231, %rs3362; + cvt.s32.s8 %r5232, %r5231; + mad.lo.s32 %r5233, %r173, %r5232, %r5224; + mad.lo.s32 %r5234, %r53, %r5230, %r5233; + mad.lo.s32 %r5235, %r54, %r5228, %r5234; + mad.lo.s32 %r5236, %r55, %r5226, %r5235; + ld.const.v4.u8 {%rs3370, %rs3371, %rs3372, %rs3373}, [matrix+1684]; + cvt.u32.u16 %r5237, %rs3373; + cvt.s32.s8 %r5238, %r5237; + cvt.u32.u16 %r5239, %rs3372; + cvt.s32.s8 %r5240, %r5239; + cvt.u32.u16 %r5241, %rs3371; + cvt.s32.s8 %r5242, %r5241; + cvt.u32.u16 %r5243, %rs3370; + cvt.s32.s8 %r5244, %r5243; + mad.lo.s32 %r5245, %r56, %r5244, %r5236; + mad.lo.s32 %r5246, %r57, %r5242, %r5245; + mad.lo.s32 %r5247, %r58, %r5240, %r5246; + mad.lo.s32 %r5248, %r59, %r5238, %r5247; 
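+ // [editor annotation, not compiler output] Each shr/and/or/xor block, such as the one
+ // producing %rd390 above, packs two 64-element row sums into one byte and mixes it
+ // with an input word. A rough C sketch of the assumed semantics:
+ //   byte = ((sum_even >> 6) & 0xF0) | (sum_odd >> 10);  // high nibble | low nibble
+ //   out  = input_word ^ byte;                           // then widened to u64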
+ ld.const.v4.u8 {%rs3378, %rs3379, %rs3380, %rs3381}, [matrix+1688]; + cvt.u32.u16 %r5249, %rs3381; + cvt.s32.s8 %r5250, %r5249; + cvt.u32.u16 %r5251, %rs3380; + cvt.s32.s8 %r5252, %r5251; + cvt.u32.u16 %r5253, %rs3379; + cvt.s32.s8 %r5254, %r5253; + cvt.u32.u16 %r5255, %rs3378; + cvt.s32.s8 %r5256, %r5255; + mad.lo.s32 %r5257, %r61, %r5256, %r5248; + mad.lo.s32 %r5258, %r62, %r5254, %r5257; + mad.lo.s32 %r5259, %r64, %r5252, %r5258; + mad.lo.s32 %r5260, %r65, %r5250, %r5259; + ld.const.v4.u8 {%rs3386, %rs3387, %rs3388, %rs3389}, [matrix+1692]; + cvt.u32.u16 %r5261, %rs3389; + cvt.s32.s8 %r5262, %r5261; + cvt.u32.u16 %r5263, %rs3388; + cvt.s32.s8 %r5264, %r5263; + cvt.u32.u16 %r5265, %rs3387; + cvt.s32.s8 %r5266, %r5265; + cvt.u32.u16 %r5267, %rs3386; + cvt.s32.s8 %r5268, %r5267; + mad.lo.s32 %r5269, %r67, %r5268, %r5260; + mad.lo.s32 %r5270, %r68, %r5266, %r5269; + mad.lo.s32 %r5271, %r69, %r5264, %r5270; + mad.lo.s32 %r5272, %r70, %r5262, %r5271; + ld.const.v4.u8 {%rs3394, %rs3395, %rs3396, %rs3397}, [matrix+1696]; + cvt.u32.u16 %r5273, %rs3397; + cvt.s32.s8 %r5274, %r5273; + cvt.u32.u16 %r5275, %rs3396; + cvt.s32.s8 %r5276, %r5275; + cvt.u32.u16 %r5277, %rs3395; + cvt.s32.s8 %r5278, %r5277; + cvt.u32.u16 %r5279, %rs3394; + cvt.s32.s8 %r5280, %r5279; + mad.lo.s32 %r5281, %r222, %r5280, %r5272; + mad.lo.s32 %r5282, %r72, %r5278, %r5281; + mad.lo.s32 %r5283, %r73, %r5276, %r5282; + mad.lo.s32 %r5284, %r74, %r5274, %r5283; + ld.const.v4.u8 {%rs3402, %rs3403, %rs3404, %rs3405}, [matrix+1700]; + cvt.u32.u16 %r5285, %rs3405; + cvt.s32.s8 %r5286, %r5285; + cvt.u32.u16 %r5287, %rs3404; + cvt.s32.s8 %r5288, %r5287; + cvt.u32.u16 %r5289, %rs3403; + cvt.s32.s8 %r5290, %r5289; + cvt.u32.u16 %r5291, %rs3402; + cvt.s32.s8 %r5292, %r5291; + mad.lo.s32 %r5293, %r75, %r5292, %r5284; + mad.lo.s32 %r5294, %r76, %r5290, %r5293; + mad.lo.s32 %r5295, %r77, %r5288, %r5294; + mad.lo.s32 %r5296, %r78, %r5286, %r5295; + ld.const.v4.u8 {%rs3410, %rs3411, %rs3412, %rs3413}, [matrix+1704]; + cvt.u32.u16 %r5297, %rs3413; + cvt.s32.s8 %r5298, %r5297; + cvt.u32.u16 %r5299, %rs3412; + cvt.s32.s8 %r5300, %r5299; + cvt.u32.u16 %r5301, %rs3411; + cvt.s32.s8 %r5302, %r5301; + cvt.u32.u16 %r5303, %rs3410; + cvt.s32.s8 %r5304, %r5303; + mad.lo.s32 %r5305, %r80, %r5304, %r5296; + mad.lo.s32 %r5306, %r81, %r5302, %r5305; + mad.lo.s32 %r5307, %r83, %r5300, %r5306; + mad.lo.s32 %r5308, %r84, %r5298, %r5307; + ld.const.v4.u8 {%rs3418, %rs3419, %rs3420, %rs3421}, [matrix+1708]; + cvt.u32.u16 %r5309, %rs3421; + cvt.s32.s8 %r5310, %r5309; + cvt.u32.u16 %r5311, %rs3420; + cvt.s32.s8 %r5312, %r5311; + cvt.u32.u16 %r5313, %rs3419; + cvt.s32.s8 %r5314, %r5313; + cvt.u32.u16 %r5315, %rs3418; + cvt.s32.s8 %r5316, %r5315; + mad.lo.s32 %r5317, %r86, %r5316, %r5308; + mad.lo.s32 %r5318, %r87, %r5314, %r5317; + mad.lo.s32 %r5319, %r88, %r5312, %r5318; + mad.lo.s32 %r5320, %r89, %r5310, %r5319; + ld.const.v4.u8 {%rs3426, %rs3427, %rs3428, %rs3429}, [matrix+1712]; + cvt.u32.u16 %r5321, %rs3429; + cvt.s32.s8 %r5322, %r5321; + cvt.u32.u16 %r5323, %rs3428; + cvt.s32.s8 %r5324, %r5323; + cvt.u32.u16 %r5325, %rs3427; + cvt.s32.s8 %r5326, %r5325; + cvt.u32.u16 %r5327, %rs3426; + cvt.s32.s8 %r5328, %r5327; + mad.lo.s32 %r5329, %r271, %r5328, %r5320; + mad.lo.s32 %r5330, %r91, %r5326, %r5329; + mad.lo.s32 %r5331, %r93, %r5324, %r5330; + mad.lo.s32 %r5332, %r94, %r5322, %r5331; + ld.const.v4.u8 {%rs3434, %rs3435, %rs3436, %rs3437}, [matrix+1716]; + cvt.u32.u16 %r5333, %rs3437; + cvt.s32.s8 %r5334, %r5333; + cvt.u32.u16 %r5335, %rs3436; + cvt.s32.s8 
%r5336, %r5335; + cvt.u32.u16 %r5337, %rs3435; + cvt.s32.s8 %r5338, %r5337; + cvt.u32.u16 %r5339, %rs3434; + cvt.s32.s8 %r5340, %r5339; + mad.lo.s32 %r5341, %r96, %r5340, %r5332; + mad.lo.s32 %r5342, %r97, %r5338, %r5341; + mad.lo.s32 %r5343, %r99, %r5336, %r5342; + mad.lo.s32 %r5344, %r100, %r5334, %r5343; + ld.const.v4.u8 {%rs3442, %rs3443, %rs3444, %rs3445}, [matrix+1720]; + cvt.u32.u16 %r5345, %rs3445; + cvt.s32.s8 %r5346, %r5345; + cvt.u32.u16 %r5347, %rs3444; + cvt.s32.s8 %r5348, %r5347; + cvt.u32.u16 %r5349, %rs3443; + cvt.s32.s8 %r5350, %r5349; + cvt.u32.u16 %r5351, %rs3442; + cvt.s32.s8 %r5352, %r5351; + mad.lo.s32 %r5353, %r103, %r5352, %r5344; + mad.lo.s32 %r5354, %r104, %r5350, %r5353; + mad.lo.s32 %r5355, %r107, %r5348, %r5354; + mad.lo.s32 %r5356, %r108, %r5346, %r5355; + ld.const.v4.u8 {%rs3450, %rs3451, %rs3452, %rs3453}, [matrix+1724]; + cvt.u32.u16 %r5357, %rs3453; + cvt.s32.s8 %r5358, %r5357; + cvt.u32.u16 %r5359, %rs3452; + cvt.s32.s8 %r5360, %r5359; + cvt.u32.u16 %r5361, %rs3451; + cvt.s32.s8 %r5362, %r5361; + cvt.u32.u16 %r5363, %rs3450; + cvt.s32.s8 %r5364, %r5363; + mad.lo.s32 %r5365, %r111, %r5364, %r5356; + mad.lo.s32 %r5366, %r112, %r5362, %r5365; + mad.lo.s32 %r5367, %r114, %r5360, %r5366; + mad.lo.s32 %r5368, %r115, %r5358, %r5367; + ld.const.v4.u8 {%rs3458, %rs3459, %rs3460, %rs3461}, [matrix+1728]; + cvt.u32.u16 %r5369, %rs3461; + cvt.s32.s8 %r5370, %r5369; + cvt.u32.u16 %r5371, %rs3460; + cvt.s32.s8 %r5372, %r5371; + cvt.u32.u16 %r5373, %rs3458; + cvt.s32.s8 %r5374, %r5373; + cvt.u32.u16 %r5375, %rs3459; + cvt.s32.s8 %r5376, %r5375; + mul.lo.s32 %r5377, %r34, %r5376; + mad.lo.s32 %r5378, %r124, %r5374, %r5377; + mad.lo.s32 %r5379, %r35, %r5372, %r5378; + mad.lo.s32 %r5380, %r36, %r5370, %r5379; + ld.const.v4.u8 {%rs3466, %rs3467, %rs3468, %rs3469}, [matrix+1732]; + cvt.u32.u16 %r5381, %rs3469; + cvt.s32.s8 %r5382, %r5381; + cvt.u32.u16 %r5383, %rs3468; + cvt.s32.s8 %r5384, %r5383; + cvt.u32.u16 %r5385, %rs3467; + cvt.s32.s8 %r5386, %r5385; + cvt.u32.u16 %r5387, %rs3466; + cvt.s32.s8 %r5388, %r5387; + mad.lo.s32 %r5389, %r37, %r5388, %r5380; + mad.lo.s32 %r5390, %r38, %r5386, %r5389; + mad.lo.s32 %r5391, %r39, %r5384, %r5390; + mad.lo.s32 %r5392, %r40, %r5382, %r5391; + ld.const.v4.u8 {%rs3474, %rs3475, %rs3476, %rs3477}, [matrix+1736]; + cvt.u32.u16 %r5393, %rs3477; + cvt.s32.s8 %r5394, %r5393; + cvt.u32.u16 %r5395, %rs3476; + cvt.s32.s8 %r5396, %r5395; + cvt.u32.u16 %r5397, %rs3475; + cvt.s32.s8 %r5398, %r5397; + cvt.u32.u16 %r5399, %rs3474; + cvt.s32.s8 %r5400, %r5399; + mad.lo.s32 %r5401, %r42, %r5400, %r5392; + mad.lo.s32 %r5402, %r43, %r5398, %r5401; + mad.lo.s32 %r5403, %r45, %r5396, %r5402; + mad.lo.s32 %r5404, %r46, %r5394, %r5403; + ld.const.v4.u8 {%rs3482, %rs3483, %rs3484, %rs3485}, [matrix+1740]; + cvt.u32.u16 %r5405, %rs3485; + cvt.s32.s8 %r5406, %r5405; + cvt.u32.u16 %r5407, %rs3484; + cvt.s32.s8 %r5408, %r5407; + cvt.u32.u16 %r5409, %rs3483; + cvt.s32.s8 %r5410, %r5409; + cvt.u32.u16 %r5411, %rs3482; + cvt.s32.s8 %r5412, %r5411; + mad.lo.s32 %r5413, %r48, %r5412, %r5404; + mad.lo.s32 %r5414, %r49, %r5410, %r5413; + mad.lo.s32 %r5415, %r50, %r5408, %r5414; + mad.lo.s32 %r5416, %r51, %r5406, %r5415; + ld.const.v4.u8 {%rs3490, %rs3491, %rs3492, %rs3493}, [matrix+1744]; + cvt.u32.u16 %r5417, %rs3493; + cvt.s32.s8 %r5418, %r5417; + cvt.u32.u16 %r5419, %rs3492; + cvt.s32.s8 %r5420, %r5419; + cvt.u32.u16 %r5421, %rs3491; + cvt.s32.s8 %r5422, %r5421; + cvt.u32.u16 %r5423, %rs3490; + cvt.s32.s8 %r5424, %r5423; + mad.lo.s32 %r5425, %r173, %r5424, 
%r5416; + mad.lo.s32 %r5426, %r53, %r5422, %r5425; + mad.lo.s32 %r5427, %r54, %r5420, %r5426; + mad.lo.s32 %r5428, %r55, %r5418, %r5427; + ld.const.v4.u8 {%rs3498, %rs3499, %rs3500, %rs3501}, [matrix+1748]; + cvt.u32.u16 %r5429, %rs3501; + cvt.s32.s8 %r5430, %r5429; + cvt.u32.u16 %r5431, %rs3500; + cvt.s32.s8 %r5432, %r5431; + cvt.u32.u16 %r5433, %rs3499; + cvt.s32.s8 %r5434, %r5433; + cvt.u32.u16 %r5435, %rs3498; + cvt.s32.s8 %r5436, %r5435; + mad.lo.s32 %r5437, %r56, %r5436, %r5428; + mad.lo.s32 %r5438, %r57, %r5434, %r5437; + mad.lo.s32 %r5439, %r58, %r5432, %r5438; + mad.lo.s32 %r5440, %r59, %r5430, %r5439; + ld.const.v4.u8 {%rs3506, %rs3507, %rs3508, %rs3509}, [matrix+1752]; + cvt.u32.u16 %r5441, %rs3509; + cvt.s32.s8 %r5442, %r5441; + cvt.u32.u16 %r5443, %rs3508; + cvt.s32.s8 %r5444, %r5443; + cvt.u32.u16 %r5445, %rs3507; + cvt.s32.s8 %r5446, %r5445; + cvt.u32.u16 %r5447, %rs3506; + cvt.s32.s8 %r5448, %r5447; + mad.lo.s32 %r5449, %r61, %r5448, %r5440; + mad.lo.s32 %r5450, %r62, %r5446, %r5449; + mad.lo.s32 %r5451, %r64, %r5444, %r5450; + mad.lo.s32 %r5452, %r65, %r5442, %r5451; + ld.const.v4.u8 {%rs3514, %rs3515, %rs3516, %rs3517}, [matrix+1756]; + cvt.u32.u16 %r5453, %rs3517; + cvt.s32.s8 %r5454, %r5453; + cvt.u32.u16 %r5455, %rs3516; + cvt.s32.s8 %r5456, %r5455; + cvt.u32.u16 %r5457, %rs3515; + cvt.s32.s8 %r5458, %r5457; + cvt.u32.u16 %r5459, %rs3514; + cvt.s32.s8 %r5460, %r5459; + mad.lo.s32 %r5461, %r67, %r5460, %r5452; + mad.lo.s32 %r5462, %r68, %r5458, %r5461; + mad.lo.s32 %r5463, %r69, %r5456, %r5462; + mad.lo.s32 %r5464, %r70, %r5454, %r5463; + ld.const.v4.u8 {%rs3522, %rs3523, %rs3524, %rs3525}, [matrix+1760]; + cvt.u32.u16 %r5465, %rs3525; + cvt.s32.s8 %r5466, %r5465; + cvt.u32.u16 %r5467, %rs3524; + cvt.s32.s8 %r5468, %r5467; + cvt.u32.u16 %r5469, %rs3523; + cvt.s32.s8 %r5470, %r5469; + cvt.u32.u16 %r5471, %rs3522; + cvt.s32.s8 %r5472, %r5471; + mad.lo.s32 %r5473, %r222, %r5472, %r5464; + mad.lo.s32 %r5474, %r72, %r5470, %r5473; + mad.lo.s32 %r5475, %r73, %r5468, %r5474; + mad.lo.s32 %r5476, %r74, %r5466, %r5475; + ld.const.v4.u8 {%rs3530, %rs3531, %rs3532, %rs3533}, [matrix+1764]; + cvt.u32.u16 %r5477, %rs3533; + cvt.s32.s8 %r5478, %r5477; + cvt.u32.u16 %r5479, %rs3532; + cvt.s32.s8 %r5480, %r5479; + cvt.u32.u16 %r5481, %rs3531; + cvt.s32.s8 %r5482, %r5481; + cvt.u32.u16 %r5483, %rs3530; + cvt.s32.s8 %r5484, %r5483; + mad.lo.s32 %r5485, %r75, %r5484, %r5476; + mad.lo.s32 %r5486, %r76, %r5482, %r5485; + mad.lo.s32 %r5487, %r77, %r5480, %r5486; + mad.lo.s32 %r5488, %r78, %r5478, %r5487; + ld.const.v4.u8 {%rs3538, %rs3539, %rs3540, %rs3541}, [matrix+1768]; + cvt.u32.u16 %r5489, %rs3541; + cvt.s32.s8 %r5490, %r5489; + cvt.u32.u16 %r5491, %rs3540; + cvt.s32.s8 %r5492, %r5491; + cvt.u32.u16 %r5493, %rs3539; + cvt.s32.s8 %r5494, %r5493; + cvt.u32.u16 %r5495, %rs3538; + cvt.s32.s8 %r5496, %r5495; + mad.lo.s32 %r5497, %r80, %r5496, %r5488; + mad.lo.s32 %r5498, %r81, %r5494, %r5497; + mad.lo.s32 %r5499, %r83, %r5492, %r5498; + mad.lo.s32 %r5500, %r84, %r5490, %r5499; + ld.const.v4.u8 {%rs3546, %rs3547, %rs3548, %rs3549}, [matrix+1772]; + cvt.u32.u16 %r5501, %rs3549; + cvt.s32.s8 %r5502, %r5501; + cvt.u32.u16 %r5503, %rs3548; + cvt.s32.s8 %r5504, %r5503; + cvt.u32.u16 %r5505, %rs3547; + cvt.s32.s8 %r5506, %r5505; + cvt.u32.u16 %r5507, %rs3546; + cvt.s32.s8 %r5508, %r5507; + mad.lo.s32 %r5509, %r86, %r5508, %r5500; + mad.lo.s32 %r5510, %r87, %r5506, %r5509; + mad.lo.s32 %r5511, %r88, %r5504, %r5510; + mad.lo.s32 %r5512, %r89, %r5502, %r5511; + ld.const.v4.u8 {%rs3554, %rs3555, 
%rs3556, %rs3557}, [matrix+1776]; + cvt.u32.u16 %r5513, %rs3557; + cvt.s32.s8 %r5514, %r5513; + cvt.u32.u16 %r5515, %rs3556; + cvt.s32.s8 %r5516, %r5515; + cvt.u32.u16 %r5517, %rs3555; + cvt.s32.s8 %r5518, %r5517; + cvt.u32.u16 %r5519, %rs3554; + cvt.s32.s8 %r5520, %r5519; + mad.lo.s32 %r5521, %r271, %r5520, %r5512; + mad.lo.s32 %r5522, %r91, %r5518, %r5521; + mad.lo.s32 %r5523, %r93, %r5516, %r5522; + mad.lo.s32 %r5524, %r94, %r5514, %r5523; + ld.const.v4.u8 {%rs3562, %rs3563, %rs3564, %rs3565}, [matrix+1780]; + cvt.u32.u16 %r5525, %rs3565; + cvt.s32.s8 %r5526, %r5525; + cvt.u32.u16 %r5527, %rs3564; + cvt.s32.s8 %r5528, %r5527; + cvt.u32.u16 %r5529, %rs3563; + cvt.s32.s8 %r5530, %r5529; + cvt.u32.u16 %r5531, %rs3562; + cvt.s32.s8 %r5532, %r5531; + mad.lo.s32 %r5533, %r96, %r5532, %r5524; + mad.lo.s32 %r5534, %r97, %r5530, %r5533; + mad.lo.s32 %r5535, %r99, %r5528, %r5534; + mad.lo.s32 %r5536, %r100, %r5526, %r5535; + ld.const.v4.u8 {%rs3570, %rs3571, %rs3572, %rs3573}, [matrix+1784]; + cvt.u32.u16 %r5537, %rs3573; + cvt.s32.s8 %r5538, %r5537; + cvt.u32.u16 %r5539, %rs3572; + cvt.s32.s8 %r5540, %r5539; + cvt.u32.u16 %r5541, %rs3571; + cvt.s32.s8 %r5542, %r5541; + cvt.u32.u16 %r5543, %rs3570; + cvt.s32.s8 %r5544, %r5543; + mad.lo.s32 %r5545, %r103, %r5544, %r5536; + mad.lo.s32 %r5546, %r104, %r5542, %r5545; + mad.lo.s32 %r5547, %r107, %r5540, %r5546; + mad.lo.s32 %r5548, %r108, %r5538, %r5547; + ld.const.v4.u8 {%rs3578, %rs3579, %rs3580, %rs3581}, [matrix+1788]; + cvt.u32.u16 %r5549, %rs3581; + cvt.s32.s8 %r5550, %r5549; + cvt.u32.u16 %r5551, %rs3580; + cvt.s32.s8 %r5552, %r5551; + cvt.u32.u16 %r5553, %rs3579; + cvt.s32.s8 %r5554, %r5553; + cvt.u32.u16 %r5555, %rs3578; + cvt.s32.s8 %r5556, %r5555; + mad.lo.s32 %r5557, %r111, %r5556, %r5548; + mad.lo.s32 %r5558, %r112, %r5554, %r5557; + mad.lo.s32 %r5559, %r114, %r5552, %r5558; + mad.lo.s32 %r5560, %r115, %r5550, %r5559; + shr.u32 %r5561, %r5368, 6; + and.b32 %r5562, %r5561, 240; + shr.u32 %r5563, %r5560, 10; + or.b32 %r5564, %r5563, %r5562; + xor.b32 %r5565, %r24, %r5564; + cvt.u64.u32 %rd391, %r5565; + ld.const.v4.u8 {%rs3586, %rs3587, %rs3588, %rs3589}, [matrix+1792]; + cvt.u32.u16 %r5566, %rs3589; + cvt.s32.s8 %r5567, %r5566; + cvt.u32.u16 %r5568, %rs3588; + cvt.s32.s8 %r5569, %r5568; + cvt.u32.u16 %r5570, %rs3586; + cvt.s32.s8 %r5571, %r5570; + cvt.u32.u16 %r5572, %rs3587; + cvt.s32.s8 %r5573, %r5572; + mul.lo.s32 %r5574, %r34, %r5573; + mad.lo.s32 %r5575, %r124, %r5571, %r5574; + mad.lo.s32 %r5576, %r35, %r5569, %r5575; + mad.lo.s32 %r5577, %r36, %r5567, %r5576; + ld.const.v4.u8 {%rs3594, %rs3595, %rs3596, %rs3597}, [matrix+1796]; + cvt.u32.u16 %r5578, %rs3597; + cvt.s32.s8 %r5579, %r5578; + cvt.u32.u16 %r5580, %rs3596; + cvt.s32.s8 %r5581, %r5580; + cvt.u32.u16 %r5582, %rs3595; + cvt.s32.s8 %r5583, %r5582; + cvt.u32.u16 %r5584, %rs3594; + cvt.s32.s8 %r5585, %r5584; + mad.lo.s32 %r5586, %r37, %r5585, %r5577; + mad.lo.s32 %r5587, %r38, %r5583, %r5586; + mad.lo.s32 %r5588, %r39, %r5581, %r5587; + mad.lo.s32 %r5589, %r40, %r5579, %r5588; + ld.const.v4.u8 {%rs3602, %rs3603, %rs3604, %rs3605}, [matrix+1800]; + cvt.u32.u16 %r5590, %rs3605; + cvt.s32.s8 %r5591, %r5590; + cvt.u32.u16 %r5592, %rs3604; + cvt.s32.s8 %r5593, %r5592; + cvt.u32.u16 %r5594, %rs3603; + cvt.s32.s8 %r5595, %r5594; + cvt.u32.u16 %r5596, %rs3602; + cvt.s32.s8 %r5597, %r5596; + mad.lo.s32 %r5598, %r42, %r5597, %r5589; + mad.lo.s32 %r5599, %r43, %r5595, %r5598; + mad.lo.s32 %r5600, %r45, %r5593, %r5599; + mad.lo.s32 %r5601, %r46, %r5591, %r5600; + ld.const.v4.u8 {%rs3610, 
%rs3611, %rs3612, %rs3613}, [matrix+1804]; + cvt.u32.u16 %r5602, %rs3613; + cvt.s32.s8 %r5603, %r5602; + cvt.u32.u16 %r5604, %rs3612; + cvt.s32.s8 %r5605, %r5604; + cvt.u32.u16 %r5606, %rs3611; + cvt.s32.s8 %r5607, %r5606; + cvt.u32.u16 %r5608, %rs3610; + cvt.s32.s8 %r5609, %r5608; + mad.lo.s32 %r5610, %r48, %r5609, %r5601; + mad.lo.s32 %r5611, %r49, %r5607, %r5610; + mad.lo.s32 %r5612, %r50, %r5605, %r5611; + mad.lo.s32 %r5613, %r51, %r5603, %r5612; + ld.const.v4.u8 {%rs3618, %rs3619, %rs3620, %rs3621}, [matrix+1808]; + cvt.u32.u16 %r5614, %rs3621; + cvt.s32.s8 %r5615, %r5614; + cvt.u32.u16 %r5616, %rs3620; + cvt.s32.s8 %r5617, %r5616; + cvt.u32.u16 %r5618, %rs3619; + cvt.s32.s8 %r5619, %r5618; + cvt.u32.u16 %r5620, %rs3618; + cvt.s32.s8 %r5621, %r5620; + mad.lo.s32 %r5622, %r173, %r5621, %r5613; + mad.lo.s32 %r5623, %r53, %r5619, %r5622; + mad.lo.s32 %r5624, %r54, %r5617, %r5623; + mad.lo.s32 %r5625, %r55, %r5615, %r5624; + ld.const.v4.u8 {%rs3626, %rs3627, %rs3628, %rs3629}, [matrix+1812]; + cvt.u32.u16 %r5626, %rs3629; + cvt.s32.s8 %r5627, %r5626; + cvt.u32.u16 %r5628, %rs3628; + cvt.s32.s8 %r5629, %r5628; + cvt.u32.u16 %r5630, %rs3627; + cvt.s32.s8 %r5631, %r5630; + cvt.u32.u16 %r5632, %rs3626; + cvt.s32.s8 %r5633, %r5632; + mad.lo.s32 %r5634, %r56, %r5633, %r5625; + mad.lo.s32 %r5635, %r57, %r5631, %r5634; + mad.lo.s32 %r5636, %r58, %r5629, %r5635; + mad.lo.s32 %r5637, %r59, %r5627, %r5636; + ld.const.v4.u8 {%rs3634, %rs3635, %rs3636, %rs3637}, [matrix+1816]; + cvt.u32.u16 %r5638, %rs3637; + cvt.s32.s8 %r5639, %r5638; + cvt.u32.u16 %r5640, %rs3636; + cvt.s32.s8 %r5641, %r5640; + cvt.u32.u16 %r5642, %rs3635; + cvt.s32.s8 %r5643, %r5642; + cvt.u32.u16 %r5644, %rs3634; + cvt.s32.s8 %r5645, %r5644; + mad.lo.s32 %r5646, %r61, %r5645, %r5637; + mad.lo.s32 %r5647, %r62, %r5643, %r5646; + mad.lo.s32 %r5648, %r64, %r5641, %r5647; + mad.lo.s32 %r5649, %r65, %r5639, %r5648; + ld.const.v4.u8 {%rs3642, %rs3643, %rs3644, %rs3645}, [matrix+1820]; + cvt.u32.u16 %r5650, %rs3645; + cvt.s32.s8 %r5651, %r5650; + cvt.u32.u16 %r5652, %rs3644; + cvt.s32.s8 %r5653, %r5652; + cvt.u32.u16 %r5654, %rs3643; + cvt.s32.s8 %r5655, %r5654; + cvt.u32.u16 %r5656, %rs3642; + cvt.s32.s8 %r5657, %r5656; + mad.lo.s32 %r5658, %r67, %r5657, %r5649; + mad.lo.s32 %r5659, %r68, %r5655, %r5658; + mad.lo.s32 %r5660, %r69, %r5653, %r5659; + mad.lo.s32 %r5661, %r70, %r5651, %r5660; + ld.const.v4.u8 {%rs3650, %rs3651, %rs3652, %rs3653}, [matrix+1824]; + cvt.u32.u16 %r5662, %rs3653; + cvt.s32.s8 %r5663, %r5662; + cvt.u32.u16 %r5664, %rs3652; + cvt.s32.s8 %r5665, %r5664; + cvt.u32.u16 %r5666, %rs3651; + cvt.s32.s8 %r5667, %r5666; + cvt.u32.u16 %r5668, %rs3650; + cvt.s32.s8 %r5669, %r5668; + mad.lo.s32 %r5670, %r222, %r5669, %r5661; + mad.lo.s32 %r5671, %r72, %r5667, %r5670; + mad.lo.s32 %r5672, %r73, %r5665, %r5671; + mad.lo.s32 %r5673, %r74, %r5663, %r5672; + ld.const.v4.u8 {%rs3658, %rs3659, %rs3660, %rs3661}, [matrix+1828]; + cvt.u32.u16 %r5674, %rs3661; + cvt.s32.s8 %r5675, %r5674; + cvt.u32.u16 %r5676, %rs3660; + cvt.s32.s8 %r5677, %r5676; + cvt.u32.u16 %r5678, %rs3659; + cvt.s32.s8 %r5679, %r5678; + cvt.u32.u16 %r5680, %rs3658; + cvt.s32.s8 %r5681, %r5680; + mad.lo.s32 %r5682, %r75, %r5681, %r5673; + mad.lo.s32 %r5683, %r76, %r5679, %r5682; + mad.lo.s32 %r5684, %r77, %r5677, %r5683; + mad.lo.s32 %r5685, %r78, %r5675, %r5684; + ld.const.v4.u8 {%rs3666, %rs3667, %rs3668, %rs3669}, [matrix+1832]; + cvt.u32.u16 %r5686, %rs3669; + cvt.s32.s8 %r5687, %r5686; + cvt.u32.u16 %r5688, %rs3668; + cvt.s32.s8 %r5689, %r5688; + cvt.u32.u16 
%r5690, %rs3667; + cvt.s32.s8 %r5691, %r5690; + cvt.u32.u16 %r5692, %rs3666; + cvt.s32.s8 %r5693, %r5692; + mad.lo.s32 %r5694, %r80, %r5693, %r5685; + mad.lo.s32 %r5695, %r81, %r5691, %r5694; + mad.lo.s32 %r5696, %r83, %r5689, %r5695; + mad.lo.s32 %r5697, %r84, %r5687, %r5696; + ld.const.v4.u8 {%rs3674, %rs3675, %rs3676, %rs3677}, [matrix+1836]; + cvt.u32.u16 %r5698, %rs3677; + cvt.s32.s8 %r5699, %r5698; + cvt.u32.u16 %r5700, %rs3676; + cvt.s32.s8 %r5701, %r5700; + cvt.u32.u16 %r5702, %rs3675; + cvt.s32.s8 %r5703, %r5702; + cvt.u32.u16 %r5704, %rs3674; + cvt.s32.s8 %r5705, %r5704; + mad.lo.s32 %r5706, %r86, %r5705, %r5697; + mad.lo.s32 %r5707, %r87, %r5703, %r5706; + mad.lo.s32 %r5708, %r88, %r5701, %r5707; + mad.lo.s32 %r5709, %r89, %r5699, %r5708; + ld.const.v4.u8 {%rs3682, %rs3683, %rs3684, %rs3685}, [matrix+1840]; + cvt.u32.u16 %r5710, %rs3685; + cvt.s32.s8 %r5711, %r5710; + cvt.u32.u16 %r5712, %rs3684; + cvt.s32.s8 %r5713, %r5712; + cvt.u32.u16 %r5714, %rs3683; + cvt.s32.s8 %r5715, %r5714; + cvt.u32.u16 %r5716, %rs3682; + cvt.s32.s8 %r5717, %r5716; + mad.lo.s32 %r5718, %r271, %r5717, %r5709; + mad.lo.s32 %r5719, %r91, %r5715, %r5718; + mad.lo.s32 %r5720, %r93, %r5713, %r5719; + mad.lo.s32 %r5721, %r94, %r5711, %r5720; + ld.const.v4.u8 {%rs3690, %rs3691, %rs3692, %rs3693}, [matrix+1844]; + cvt.u32.u16 %r5722, %rs3693; + cvt.s32.s8 %r5723, %r5722; + cvt.u32.u16 %r5724, %rs3692; + cvt.s32.s8 %r5725, %r5724; + cvt.u32.u16 %r5726, %rs3691; + cvt.s32.s8 %r5727, %r5726; + cvt.u32.u16 %r5728, %rs3690; + cvt.s32.s8 %r5729, %r5728; + mad.lo.s32 %r5730, %r96, %r5729, %r5721; + mad.lo.s32 %r5731, %r97, %r5727, %r5730; + mad.lo.s32 %r5732, %r99, %r5725, %r5731; + mad.lo.s32 %r5733, %r100, %r5723, %r5732; + ld.const.v4.u8 {%rs3698, %rs3699, %rs3700, %rs3701}, [matrix+1848]; + cvt.u32.u16 %r5734, %rs3701; + cvt.s32.s8 %r5735, %r5734; + cvt.u32.u16 %r5736, %rs3700; + cvt.s32.s8 %r5737, %r5736; + cvt.u32.u16 %r5738, %rs3699; + cvt.s32.s8 %r5739, %r5738; + cvt.u32.u16 %r5740, %rs3698; + cvt.s32.s8 %r5741, %r5740; + mad.lo.s32 %r5742, %r103, %r5741, %r5733; + mad.lo.s32 %r5743, %r104, %r5739, %r5742; + mad.lo.s32 %r5744, %r107, %r5737, %r5743; + mad.lo.s32 %r5745, %r108, %r5735, %r5744; + ld.const.v4.u8 {%rs3706, %rs3707, %rs3708, %rs3709}, [matrix+1852]; + cvt.u32.u16 %r5746, %rs3709; + cvt.s32.s8 %r5747, %r5746; + cvt.u32.u16 %r5748, %rs3708; + cvt.s32.s8 %r5749, %r5748; + cvt.u32.u16 %r5750, %rs3707; + cvt.s32.s8 %r5751, %r5750; + cvt.u32.u16 %r5752, %rs3706; + cvt.s32.s8 %r5753, %r5752; + mad.lo.s32 %r5754, %r111, %r5753, %r5745; + mad.lo.s32 %r5755, %r112, %r5751, %r5754; + mad.lo.s32 %r5756, %r114, %r5749, %r5755; + mad.lo.s32 %r5757, %r115, %r5747, %r5756; + ld.const.v4.u8 {%rs3714, %rs3715, %rs3716, %rs3717}, [matrix+1856]; + cvt.u32.u16 %r5758, %rs3717; + cvt.s32.s8 %r5759, %r5758; + cvt.u32.u16 %r5760, %rs3716; + cvt.s32.s8 %r5761, %r5760; + cvt.u32.u16 %r5762, %rs3714; + cvt.s32.s8 %r5763, %r5762; + cvt.u32.u16 %r5764, %rs3715; + cvt.s32.s8 %r5765, %r5764; + mul.lo.s32 %r5766, %r34, %r5765; + mad.lo.s32 %r5767, %r124, %r5763, %r5766; + mad.lo.s32 %r5768, %r35, %r5761, %r5767; + mad.lo.s32 %r5769, %r36, %r5759, %r5768; + ld.const.v4.u8 {%rs3722, %rs3723, %rs3724, %rs3725}, [matrix+1860]; + cvt.u32.u16 %r5770, %rs3725; + cvt.s32.s8 %r5771, %r5770; + cvt.u32.u16 %r5772, %rs3724; + cvt.s32.s8 %r5773, %r5772; + cvt.u32.u16 %r5774, %rs3723; + cvt.s32.s8 %r5775, %r5774; + cvt.u32.u16 %r5776, %rs3722; + cvt.s32.s8 %r5777, %r5776; + mad.lo.s32 %r5778, %r37, %r5777, %r5769; + mad.lo.s32 %r5779, %r38, 
%r5775, %r5778; + mad.lo.s32 %r5780, %r39, %r5773, %r5779; + mad.lo.s32 %r5781, %r40, %r5771, %r5780; + ld.const.v4.u8 {%rs3730, %rs3731, %rs3732, %rs3733}, [matrix+1864]; + cvt.u32.u16 %r5782, %rs3733; + cvt.s32.s8 %r5783, %r5782; + cvt.u32.u16 %r5784, %rs3732; + cvt.s32.s8 %r5785, %r5784; + cvt.u32.u16 %r5786, %rs3731; + cvt.s32.s8 %r5787, %r5786; + cvt.u32.u16 %r5788, %rs3730; + cvt.s32.s8 %r5789, %r5788; + mad.lo.s32 %r5790, %r42, %r5789, %r5781; + mad.lo.s32 %r5791, %r43, %r5787, %r5790; + mad.lo.s32 %r5792, %r45, %r5785, %r5791; + mad.lo.s32 %r5793, %r46, %r5783, %r5792; + ld.const.v4.u8 {%rs3738, %rs3739, %rs3740, %rs3741}, [matrix+1868]; + cvt.u32.u16 %r5794, %rs3741; + cvt.s32.s8 %r5795, %r5794; + cvt.u32.u16 %r5796, %rs3740; + cvt.s32.s8 %r5797, %r5796; + cvt.u32.u16 %r5798, %rs3739; + cvt.s32.s8 %r5799, %r5798; + cvt.u32.u16 %r5800, %rs3738; + cvt.s32.s8 %r5801, %r5800; + mad.lo.s32 %r5802, %r48, %r5801, %r5793; + mad.lo.s32 %r5803, %r49, %r5799, %r5802; + mad.lo.s32 %r5804, %r50, %r5797, %r5803; + mad.lo.s32 %r5805, %r51, %r5795, %r5804; + ld.const.v4.u8 {%rs3746, %rs3747, %rs3748, %rs3749}, [matrix+1872]; + cvt.u32.u16 %r5806, %rs3749; + cvt.s32.s8 %r5807, %r5806; + cvt.u32.u16 %r5808, %rs3748; + cvt.s32.s8 %r5809, %r5808; + cvt.u32.u16 %r5810, %rs3747; + cvt.s32.s8 %r5811, %r5810; + cvt.u32.u16 %r5812, %rs3746; + cvt.s32.s8 %r5813, %r5812; + mad.lo.s32 %r5814, %r173, %r5813, %r5805; + mad.lo.s32 %r5815, %r53, %r5811, %r5814; + mad.lo.s32 %r5816, %r54, %r5809, %r5815; + mad.lo.s32 %r5817, %r55, %r5807, %r5816; + ld.const.v4.u8 {%rs3754, %rs3755, %rs3756, %rs3757}, [matrix+1876]; + cvt.u32.u16 %r5818, %rs3757; + cvt.s32.s8 %r5819, %r5818; + cvt.u32.u16 %r5820, %rs3756; + cvt.s32.s8 %r5821, %r5820; + cvt.u32.u16 %r5822, %rs3755; + cvt.s32.s8 %r5823, %r5822; + cvt.u32.u16 %r5824, %rs3754; + cvt.s32.s8 %r5825, %r5824; + mad.lo.s32 %r5826, %r56, %r5825, %r5817; + mad.lo.s32 %r5827, %r57, %r5823, %r5826; + mad.lo.s32 %r5828, %r58, %r5821, %r5827; + mad.lo.s32 %r5829, %r59, %r5819, %r5828; + ld.const.v4.u8 {%rs3762, %rs3763, %rs3764, %rs3765}, [matrix+1880]; + cvt.u32.u16 %r5830, %rs3765; + cvt.s32.s8 %r5831, %r5830; + cvt.u32.u16 %r5832, %rs3764; + cvt.s32.s8 %r5833, %r5832; + cvt.u32.u16 %r5834, %rs3763; + cvt.s32.s8 %r5835, %r5834; + cvt.u32.u16 %r5836, %rs3762; + cvt.s32.s8 %r5837, %r5836; + mad.lo.s32 %r5838, %r61, %r5837, %r5829; + mad.lo.s32 %r5839, %r62, %r5835, %r5838; + mad.lo.s32 %r5840, %r64, %r5833, %r5839; + mad.lo.s32 %r5841, %r65, %r5831, %r5840; + ld.const.v4.u8 {%rs3770, %rs3771, %rs3772, %rs3773}, [matrix+1884]; + cvt.u32.u16 %r5842, %rs3773; + cvt.s32.s8 %r5843, %r5842; + cvt.u32.u16 %r5844, %rs3772; + cvt.s32.s8 %r5845, %r5844; + cvt.u32.u16 %r5846, %rs3771; + cvt.s32.s8 %r5847, %r5846; + cvt.u32.u16 %r5848, %rs3770; + cvt.s32.s8 %r5849, %r5848; + mad.lo.s32 %r5850, %r67, %r5849, %r5841; + mad.lo.s32 %r5851, %r68, %r5847, %r5850; + mad.lo.s32 %r5852, %r69, %r5845, %r5851; + mad.lo.s32 %r5853, %r70, %r5843, %r5852; + ld.const.v4.u8 {%rs3778, %rs3779, %rs3780, %rs3781}, [matrix+1888]; + cvt.u32.u16 %r5854, %rs3781; + cvt.s32.s8 %r5855, %r5854; + cvt.u32.u16 %r5856, %rs3780; + cvt.s32.s8 %r5857, %r5856; + cvt.u32.u16 %r5858, %rs3779; + cvt.s32.s8 %r5859, %r5858; + cvt.u32.u16 %r5860, %rs3778; + cvt.s32.s8 %r5861, %r5860; + mad.lo.s32 %r5862, %r222, %r5861, %r5853; + mad.lo.s32 %r5863, %r72, %r5859, %r5862; + mad.lo.s32 %r5864, %r73, %r5857, %r5863; + mad.lo.s32 %r5865, %r74, %r5855, %r5864; + ld.const.v4.u8 {%rs3786, %rs3787, %rs3788, %rs3789}, [matrix+1892]; + 
cvt.u32.u16 %r5866, %rs3789; + cvt.s32.s8 %r5867, %r5866; + cvt.u32.u16 %r5868, %rs3788; + cvt.s32.s8 %r5869, %r5868; + cvt.u32.u16 %r5870, %rs3787; + cvt.s32.s8 %r5871, %r5870; + cvt.u32.u16 %r5872, %rs3786; + cvt.s32.s8 %r5873, %r5872; + mad.lo.s32 %r5874, %r75, %r5873, %r5865; + mad.lo.s32 %r5875, %r76, %r5871, %r5874; + mad.lo.s32 %r5876, %r77, %r5869, %r5875; + mad.lo.s32 %r5877, %r78, %r5867, %r5876; + ld.const.v4.u8 {%rs3794, %rs3795, %rs3796, %rs3797}, [matrix+1896]; + cvt.u32.u16 %r5878, %rs3797; + cvt.s32.s8 %r5879, %r5878; + cvt.u32.u16 %r5880, %rs3796; + cvt.s32.s8 %r5881, %r5880; + cvt.u32.u16 %r5882, %rs3795; + cvt.s32.s8 %r5883, %r5882; + cvt.u32.u16 %r5884, %rs3794; + cvt.s32.s8 %r5885, %r5884; + mad.lo.s32 %r5886, %r80, %r5885, %r5877; + mad.lo.s32 %r5887, %r81, %r5883, %r5886; + mad.lo.s32 %r5888, %r83, %r5881, %r5887; + mad.lo.s32 %r5889, %r84, %r5879, %r5888; + ld.const.v4.u8 {%rs3802, %rs3803, %rs3804, %rs3805}, [matrix+1900]; + cvt.u32.u16 %r5890, %rs3805; + cvt.s32.s8 %r5891, %r5890; + cvt.u32.u16 %r5892, %rs3804; + cvt.s32.s8 %r5893, %r5892; + cvt.u32.u16 %r5894, %rs3803; + cvt.s32.s8 %r5895, %r5894; + cvt.u32.u16 %r5896, %rs3802; + cvt.s32.s8 %r5897, %r5896; + mad.lo.s32 %r5898, %r86, %r5897, %r5889; + mad.lo.s32 %r5899, %r87, %r5895, %r5898; + mad.lo.s32 %r5900, %r88, %r5893, %r5899; + mad.lo.s32 %r5901, %r89, %r5891, %r5900; + ld.const.v4.u8 {%rs3810, %rs3811, %rs3812, %rs3813}, [matrix+1904]; + cvt.u32.u16 %r5902, %rs3813; + cvt.s32.s8 %r5903, %r5902; + cvt.u32.u16 %r5904, %rs3812; + cvt.s32.s8 %r5905, %r5904; + cvt.u32.u16 %r5906, %rs3811; + cvt.s32.s8 %r5907, %r5906; + cvt.u32.u16 %r5908, %rs3810; + cvt.s32.s8 %r5909, %r5908; + mad.lo.s32 %r5910, %r271, %r5909, %r5901; + mad.lo.s32 %r5911, %r91, %r5907, %r5910; + mad.lo.s32 %r5912, %r93, %r5905, %r5911; + mad.lo.s32 %r5913, %r94, %r5903, %r5912; + ld.const.v4.u8 {%rs3818, %rs3819, %rs3820, %rs3821}, [matrix+1908]; + cvt.u32.u16 %r5914, %rs3821; + cvt.s32.s8 %r5915, %r5914; + cvt.u32.u16 %r5916, %rs3820; + cvt.s32.s8 %r5917, %r5916; + cvt.u32.u16 %r5918, %rs3819; + cvt.s32.s8 %r5919, %r5918; + cvt.u32.u16 %r5920, %rs3818; + cvt.s32.s8 %r5921, %r5920; + mad.lo.s32 %r5922, %r96, %r5921, %r5913; + mad.lo.s32 %r5923, %r97, %r5919, %r5922; + mad.lo.s32 %r5924, %r99, %r5917, %r5923; + mad.lo.s32 %r5925, %r100, %r5915, %r5924; + ld.const.v4.u8 {%rs3826, %rs3827, %rs3828, %rs3829}, [matrix+1912]; + cvt.u32.u16 %r5926, %rs3829; + cvt.s32.s8 %r5927, %r5926; + cvt.u32.u16 %r5928, %rs3828; + cvt.s32.s8 %r5929, %r5928; + cvt.u32.u16 %r5930, %rs3827; + cvt.s32.s8 %r5931, %r5930; + cvt.u32.u16 %r5932, %rs3826; + cvt.s32.s8 %r5933, %r5932; + mad.lo.s32 %r5934, %r103, %r5933, %r5925; + mad.lo.s32 %r5935, %r104, %r5931, %r5934; + mad.lo.s32 %r5936, %r107, %r5929, %r5935; + mad.lo.s32 %r5937, %r108, %r5927, %r5936; + ld.const.v4.u8 {%rs3834, %rs3835, %rs3836, %rs3837}, [matrix+1916]; + cvt.u32.u16 %r5938, %rs3837; + cvt.s32.s8 %r5939, %r5938; + cvt.u32.u16 %r5940, %rs3836; + cvt.s32.s8 %r5941, %r5940; + cvt.u32.u16 %r5942, %rs3835; + cvt.s32.s8 %r5943, %r5942; + cvt.u32.u16 %r5944, %rs3834; + cvt.s32.s8 %r5945, %r5944; + mad.lo.s32 %r5946, %r111, %r5945, %r5937; + mad.lo.s32 %r5947, %r112, %r5943, %r5946; + mad.lo.s32 %r5948, %r114, %r5941, %r5947; + mad.lo.s32 %r5949, %r115, %r5939, %r5948; + shr.u32 %r5950, %r5757, 6; + and.b32 %r5951, %r5950, 240; + shr.u32 %r5952, %r5949, 10; + or.b32 %r5953, %r5952, %r5951; + xor.b32 %r5954, %r25, %r5953; + cvt.u64.u32 %rd392, %r5954; + ld.const.v4.u8 {%rs3842, %rs3843, %rs3844, %rs3845}, 
[matrix+1920]; + cvt.u32.u16 %r5955, %rs3845; + cvt.s32.s8 %r5956, %r5955; + cvt.u32.u16 %r5957, %rs3844; + cvt.s32.s8 %r5958, %r5957; + cvt.u32.u16 %r5959, %rs3842; + cvt.s32.s8 %r5960, %r5959; + cvt.u32.u16 %r5961, %rs3843; + cvt.s32.s8 %r5962, %r5961; + mul.lo.s32 %r5963, %r34, %r5962; + mad.lo.s32 %r5964, %r124, %r5960, %r5963; + mad.lo.s32 %r5965, %r35, %r5958, %r5964; + mad.lo.s32 %r5966, %r36, %r5956, %r5965; + ld.const.v4.u8 {%rs3850, %rs3851, %rs3852, %rs3853}, [matrix+1924]; + cvt.u32.u16 %r5967, %rs3853; + cvt.s32.s8 %r5968, %r5967; + cvt.u32.u16 %r5969, %rs3852; + cvt.s32.s8 %r5970, %r5969; + cvt.u32.u16 %r5971, %rs3851; + cvt.s32.s8 %r5972, %r5971; + cvt.u32.u16 %r5973, %rs3850; + cvt.s32.s8 %r5974, %r5973; + mad.lo.s32 %r5975, %r37, %r5974, %r5966; + mad.lo.s32 %r5976, %r38, %r5972, %r5975; + mad.lo.s32 %r5977, %r39, %r5970, %r5976; + mad.lo.s32 %r5978, %r40, %r5968, %r5977; + ld.const.v4.u8 {%rs3858, %rs3859, %rs3860, %rs3861}, [matrix+1928]; + cvt.u32.u16 %r5979, %rs3861; + cvt.s32.s8 %r5980, %r5979; + cvt.u32.u16 %r5981, %rs3860; + cvt.s32.s8 %r5982, %r5981; + cvt.u32.u16 %r5983, %rs3859; + cvt.s32.s8 %r5984, %r5983; + cvt.u32.u16 %r5985, %rs3858; + cvt.s32.s8 %r5986, %r5985; + mad.lo.s32 %r5987, %r42, %r5986, %r5978; + mad.lo.s32 %r5988, %r43, %r5984, %r5987; + mad.lo.s32 %r5989, %r45, %r5982, %r5988; + mad.lo.s32 %r5990, %r46, %r5980, %r5989; + ld.const.v4.u8 {%rs3866, %rs3867, %rs3868, %rs3869}, [matrix+1932]; + cvt.u32.u16 %r5991, %rs3869; + cvt.s32.s8 %r5992, %r5991; + cvt.u32.u16 %r5993, %rs3868; + cvt.s32.s8 %r5994, %r5993; + cvt.u32.u16 %r5995, %rs3867; + cvt.s32.s8 %r5996, %r5995; + cvt.u32.u16 %r5997, %rs3866; + cvt.s32.s8 %r5998, %r5997; + mad.lo.s32 %r5999, %r48, %r5998, %r5990; + mad.lo.s32 %r6000, %r49, %r5996, %r5999; + mad.lo.s32 %r6001, %r50, %r5994, %r6000; + mad.lo.s32 %r6002, %r51, %r5992, %r6001; + ld.const.v4.u8 {%rs3874, %rs3875, %rs3876, %rs3877}, [matrix+1936]; + cvt.u32.u16 %r6003, %rs3877; + cvt.s32.s8 %r6004, %r6003; + cvt.u32.u16 %r6005, %rs3876; + cvt.s32.s8 %r6006, %r6005; + cvt.u32.u16 %r6007, %rs3875; + cvt.s32.s8 %r6008, %r6007; + cvt.u32.u16 %r6009, %rs3874; + cvt.s32.s8 %r6010, %r6009; + mad.lo.s32 %r6011, %r173, %r6010, %r6002; + mad.lo.s32 %r6012, %r53, %r6008, %r6011; + mad.lo.s32 %r6013, %r54, %r6006, %r6012; + mad.lo.s32 %r6014, %r55, %r6004, %r6013; + ld.const.v4.u8 {%rs3882, %rs3883, %rs3884, %rs3885}, [matrix+1940]; + cvt.u32.u16 %r6015, %rs3885; + cvt.s32.s8 %r6016, %r6015; + cvt.u32.u16 %r6017, %rs3884; + cvt.s32.s8 %r6018, %r6017; + cvt.u32.u16 %r6019, %rs3883; + cvt.s32.s8 %r6020, %r6019; + cvt.u32.u16 %r6021, %rs3882; + cvt.s32.s8 %r6022, %r6021; + mad.lo.s32 %r6023, %r56, %r6022, %r6014; + mad.lo.s32 %r6024, %r57, %r6020, %r6023; + mad.lo.s32 %r6025, %r58, %r6018, %r6024; + mad.lo.s32 %r6026, %r59, %r6016, %r6025; + ld.const.v4.u8 {%rs3890, %rs3891, %rs3892, %rs3893}, [matrix+1944]; + cvt.u32.u16 %r6027, %rs3893; + cvt.s32.s8 %r6028, %r6027; + cvt.u32.u16 %r6029, %rs3892; + cvt.s32.s8 %r6030, %r6029; + cvt.u32.u16 %r6031, %rs3891; + cvt.s32.s8 %r6032, %r6031; + cvt.u32.u16 %r6033, %rs3890; + cvt.s32.s8 %r6034, %r6033; + mad.lo.s32 %r6035, %r61, %r6034, %r6026; + mad.lo.s32 %r6036, %r62, %r6032, %r6035; + mad.lo.s32 %r6037, %r64, %r6030, %r6036; + mad.lo.s32 %r6038, %r65, %r6028, %r6037; + ld.const.v4.u8 {%rs3898, %rs3899, %rs3900, %rs3901}, [matrix+1948]; + cvt.u32.u16 %r6039, %rs3901; + cvt.s32.s8 %r6040, %r6039; + cvt.u32.u16 %r6041, %rs3900; + cvt.s32.s8 %r6042, %r6041; + cvt.u32.u16 %r6043, %rs3899; + cvt.s32.s8 %r6044, 
%r6043; + cvt.u32.u16 %r6045, %rs3898; + cvt.s32.s8 %r6046, %r6045; + mad.lo.s32 %r6047, %r67, %r6046, %r6038; + mad.lo.s32 %r6048, %r68, %r6044, %r6047; + mad.lo.s32 %r6049, %r69, %r6042, %r6048; + mad.lo.s32 %r6050, %r70, %r6040, %r6049; + ld.const.v4.u8 {%rs3906, %rs3907, %rs3908, %rs3909}, [matrix+1952]; + cvt.u32.u16 %r6051, %rs3909; + cvt.s32.s8 %r6052, %r6051; + cvt.u32.u16 %r6053, %rs3908; + cvt.s32.s8 %r6054, %r6053; + cvt.u32.u16 %r6055, %rs3907; + cvt.s32.s8 %r6056, %r6055; + cvt.u32.u16 %r6057, %rs3906; + cvt.s32.s8 %r6058, %r6057; + mad.lo.s32 %r6059, %r222, %r6058, %r6050; + mad.lo.s32 %r6060, %r72, %r6056, %r6059; + mad.lo.s32 %r6061, %r73, %r6054, %r6060; + mad.lo.s32 %r6062, %r74, %r6052, %r6061; + ld.const.v4.u8 {%rs3914, %rs3915, %rs3916, %rs3917}, [matrix+1956]; + cvt.u32.u16 %r6063, %rs3917; + cvt.s32.s8 %r6064, %r6063; + cvt.u32.u16 %r6065, %rs3916; + cvt.s32.s8 %r6066, %r6065; + cvt.u32.u16 %r6067, %rs3915; + cvt.s32.s8 %r6068, %r6067; + cvt.u32.u16 %r6069, %rs3914; + cvt.s32.s8 %r6070, %r6069; + mad.lo.s32 %r6071, %r75, %r6070, %r6062; + mad.lo.s32 %r6072, %r76, %r6068, %r6071; + mad.lo.s32 %r6073, %r77, %r6066, %r6072; + mad.lo.s32 %r6074, %r78, %r6064, %r6073; + ld.const.v4.u8 {%rs3922, %rs3923, %rs3924, %rs3925}, [matrix+1960]; + cvt.u32.u16 %r6075, %rs3925; + cvt.s32.s8 %r6076, %r6075; + cvt.u32.u16 %r6077, %rs3924; + cvt.s32.s8 %r6078, %r6077; + cvt.u32.u16 %r6079, %rs3923; + cvt.s32.s8 %r6080, %r6079; + cvt.u32.u16 %r6081, %rs3922; + cvt.s32.s8 %r6082, %r6081; + mad.lo.s32 %r6083, %r80, %r6082, %r6074; + mad.lo.s32 %r6084, %r81, %r6080, %r6083; + mad.lo.s32 %r6085, %r83, %r6078, %r6084; + mad.lo.s32 %r6086, %r84, %r6076, %r6085; + ld.const.v4.u8 {%rs3930, %rs3931, %rs3932, %rs3933}, [matrix+1964]; + cvt.u32.u16 %r6087, %rs3933; + cvt.s32.s8 %r6088, %r6087; + cvt.u32.u16 %r6089, %rs3932; + cvt.s32.s8 %r6090, %r6089; + cvt.u32.u16 %r6091, %rs3931; + cvt.s32.s8 %r6092, %r6091; + cvt.u32.u16 %r6093, %rs3930; + cvt.s32.s8 %r6094, %r6093; + mad.lo.s32 %r6095, %r86, %r6094, %r6086; + mad.lo.s32 %r6096, %r87, %r6092, %r6095; + mad.lo.s32 %r6097, %r88, %r6090, %r6096; + mad.lo.s32 %r6098, %r89, %r6088, %r6097; + ld.const.v4.u8 {%rs3938, %rs3939, %rs3940, %rs3941}, [matrix+1968]; + cvt.u32.u16 %r6099, %rs3941; + cvt.s32.s8 %r6100, %r6099; + cvt.u32.u16 %r6101, %rs3940; + cvt.s32.s8 %r6102, %r6101; + cvt.u32.u16 %r6103, %rs3939; + cvt.s32.s8 %r6104, %r6103; + cvt.u32.u16 %r6105, %rs3938; + cvt.s32.s8 %r6106, %r6105; + mad.lo.s32 %r6107, %r271, %r6106, %r6098; + mad.lo.s32 %r6108, %r91, %r6104, %r6107; + mad.lo.s32 %r6109, %r93, %r6102, %r6108; + mad.lo.s32 %r6110, %r94, %r6100, %r6109; + ld.const.v4.u8 {%rs3946, %rs3947, %rs3948, %rs3949}, [matrix+1972]; + cvt.u32.u16 %r6111, %rs3949; + cvt.s32.s8 %r6112, %r6111; + cvt.u32.u16 %r6113, %rs3948; + cvt.s32.s8 %r6114, %r6113; + cvt.u32.u16 %r6115, %rs3947; + cvt.s32.s8 %r6116, %r6115; + cvt.u32.u16 %r6117, %rs3946; + cvt.s32.s8 %r6118, %r6117; + mad.lo.s32 %r6119, %r96, %r6118, %r6110; + mad.lo.s32 %r6120, %r97, %r6116, %r6119; + mad.lo.s32 %r6121, %r99, %r6114, %r6120; + mad.lo.s32 %r6122, %r100, %r6112, %r6121; + ld.const.v4.u8 {%rs3954, %rs3955, %rs3956, %rs3957}, [matrix+1976]; + cvt.u32.u16 %r6123, %rs3957; + cvt.s32.s8 %r6124, %r6123; + cvt.u32.u16 %r6125, %rs3956; + cvt.s32.s8 %r6126, %r6125; + cvt.u32.u16 %r6127, %rs3955; + cvt.s32.s8 %r6128, %r6127; + cvt.u32.u16 %r6129, %rs3954; + cvt.s32.s8 %r6130, %r6129; + mad.lo.s32 %r6131, %r103, %r6130, %r6122; + mad.lo.s32 %r6132, %r104, %r6128, %r6131; + mad.lo.s32 %r6133, 
%r107, %r6126, %r6132; + mad.lo.s32 %r6134, %r108, %r6124, %r6133; + ld.const.v4.u8 {%rs3962, %rs3963, %rs3964, %rs3965}, [matrix+1980]; + cvt.u32.u16 %r6135, %rs3965; + cvt.s32.s8 %r6136, %r6135; + cvt.u32.u16 %r6137, %rs3964; + cvt.s32.s8 %r6138, %r6137; + cvt.u32.u16 %r6139, %rs3963; + cvt.s32.s8 %r6140, %r6139; + cvt.u32.u16 %r6141, %rs3962; + cvt.s32.s8 %r6142, %r6141; + mad.lo.s32 %r6143, %r111, %r6142, %r6134; + mad.lo.s32 %r6144, %r112, %r6140, %r6143; + mad.lo.s32 %r6145, %r114, %r6138, %r6144; + mad.lo.s32 %r6146, %r115, %r6136, %r6145; + ld.const.v4.u8 {%rs3970, %rs3971, %rs3972, %rs3973}, [matrix+1984]; + cvt.u32.u16 %r6147, %rs3973; + cvt.s32.s8 %r6148, %r6147; + cvt.u32.u16 %r6149, %rs3972; + cvt.s32.s8 %r6150, %r6149; + cvt.u32.u16 %r6151, %rs3970; + cvt.s32.s8 %r6152, %r6151; + cvt.u32.u16 %r6153, %rs3971; + cvt.s32.s8 %r6154, %r6153; + mul.lo.s32 %r6155, %r34, %r6154; + mad.lo.s32 %r6156, %r124, %r6152, %r6155; + mad.lo.s32 %r6157, %r35, %r6150, %r6156; + mad.lo.s32 %r6158, %r36, %r6148, %r6157; + ld.const.v4.u8 {%rs3978, %rs3979, %rs3980, %rs3981}, [matrix+1988]; + cvt.u32.u16 %r6159, %rs3981; + cvt.s32.s8 %r6160, %r6159; + cvt.u32.u16 %r6161, %rs3980; + cvt.s32.s8 %r6162, %r6161; + cvt.u32.u16 %r6163, %rs3979; + cvt.s32.s8 %r6164, %r6163; + cvt.u32.u16 %r6165, %rs3978; + cvt.s32.s8 %r6166, %r6165; + mad.lo.s32 %r6167, %r37, %r6166, %r6158; + mad.lo.s32 %r6168, %r38, %r6164, %r6167; + mad.lo.s32 %r6169, %r39, %r6162, %r6168; + mad.lo.s32 %r6170, %r40, %r6160, %r6169; + ld.const.v4.u8 {%rs3986, %rs3987, %rs3988, %rs3989}, [matrix+1992]; + cvt.u32.u16 %r6171, %rs3989; + cvt.s32.s8 %r6172, %r6171; + cvt.u32.u16 %r6173, %rs3988; + cvt.s32.s8 %r6174, %r6173; + cvt.u32.u16 %r6175, %rs3987; + cvt.s32.s8 %r6176, %r6175; + cvt.u32.u16 %r6177, %rs3986; + cvt.s32.s8 %r6178, %r6177; + mad.lo.s32 %r6179, %r42, %r6178, %r6170; + mad.lo.s32 %r6180, %r43, %r6176, %r6179; + mad.lo.s32 %r6181, %r45, %r6174, %r6180; + mad.lo.s32 %r6182, %r46, %r6172, %r6181; + ld.const.v4.u8 {%rs3994, %rs3995, %rs3996, %rs3997}, [matrix+1996]; + cvt.u32.u16 %r6183, %rs3997; + cvt.s32.s8 %r6184, %r6183; + cvt.u32.u16 %r6185, %rs3996; + cvt.s32.s8 %r6186, %r6185; + cvt.u32.u16 %r6187, %rs3995; + cvt.s32.s8 %r6188, %r6187; + cvt.u32.u16 %r6189, %rs3994; + cvt.s32.s8 %r6190, %r6189; + mad.lo.s32 %r6191, %r48, %r6190, %r6182; + mad.lo.s32 %r6192, %r49, %r6188, %r6191; + mad.lo.s32 %r6193, %r50, %r6186, %r6192; + mad.lo.s32 %r6194, %r51, %r6184, %r6193; + ld.const.v4.u8 {%rs4002, %rs4003, %rs4004, %rs4005}, [matrix+2000]; + cvt.u32.u16 %r6195, %rs4005; + cvt.s32.s8 %r6196, %r6195; + cvt.u32.u16 %r6197, %rs4004; + cvt.s32.s8 %r6198, %r6197; + cvt.u32.u16 %r6199, %rs4003; + cvt.s32.s8 %r6200, %r6199; + cvt.u32.u16 %r6201, %rs4002; + cvt.s32.s8 %r6202, %r6201; + mad.lo.s32 %r6203, %r173, %r6202, %r6194; + mad.lo.s32 %r6204, %r53, %r6200, %r6203; + mad.lo.s32 %r6205, %r54, %r6198, %r6204; + mad.lo.s32 %r6206, %r55, %r6196, %r6205; + ld.const.v4.u8 {%rs4010, %rs4011, %rs4012, %rs4013}, [matrix+2004]; + cvt.u32.u16 %r6207, %rs4013; + cvt.s32.s8 %r6208, %r6207; + cvt.u32.u16 %r6209, %rs4012; + cvt.s32.s8 %r6210, %r6209; + cvt.u32.u16 %r6211, %rs4011; + cvt.s32.s8 %r6212, %r6211; + cvt.u32.u16 %r6213, %rs4010; + cvt.s32.s8 %r6214, %r6213; + mad.lo.s32 %r6215, %r56, %r6214, %r6206; + mad.lo.s32 %r6216, %r57, %r6212, %r6215; + mad.lo.s32 %r6217, %r58, %r6210, %r6216; + mad.lo.s32 %r6218, %r59, %r6208, %r6217; + ld.const.v4.u8 {%rs4018, %rs4019, %rs4020, %rs4021}, [matrix+2008]; + cvt.u32.u16 %r6219, %rs4021; + cvt.s32.s8 
%r6220, %r6219; + cvt.u32.u16 %r6221, %rs4020; + cvt.s32.s8 %r6222, %r6221; + cvt.u32.u16 %r6223, %rs4019; + cvt.s32.s8 %r6224, %r6223; + cvt.u32.u16 %r6225, %rs4018; + cvt.s32.s8 %r6226, %r6225; + mad.lo.s32 %r6227, %r61, %r6226, %r6218; + mad.lo.s32 %r6228, %r62, %r6224, %r6227; + mad.lo.s32 %r6229, %r64, %r6222, %r6228; + mad.lo.s32 %r6230, %r65, %r6220, %r6229; + ld.const.v4.u8 {%rs4026, %rs4027, %rs4028, %rs4029}, [matrix+2012]; + cvt.u32.u16 %r6231, %rs4029; + cvt.s32.s8 %r6232, %r6231; + cvt.u32.u16 %r6233, %rs4028; + cvt.s32.s8 %r6234, %r6233; + cvt.u32.u16 %r6235, %rs4027; + cvt.s32.s8 %r6236, %r6235; + cvt.u32.u16 %r6237, %rs4026; + cvt.s32.s8 %r6238, %r6237; + mad.lo.s32 %r6239, %r67, %r6238, %r6230; + mad.lo.s32 %r6240, %r68, %r6236, %r6239; + mad.lo.s32 %r6241, %r69, %r6234, %r6240; + mad.lo.s32 %r6242, %r70, %r6232, %r6241; + ld.const.v4.u8 {%rs4034, %rs4035, %rs4036, %rs4037}, [matrix+2016]; + cvt.u32.u16 %r6243, %rs4037; + cvt.s32.s8 %r6244, %r6243; + cvt.u32.u16 %r6245, %rs4036; + cvt.s32.s8 %r6246, %r6245; + cvt.u32.u16 %r6247, %rs4035; + cvt.s32.s8 %r6248, %r6247; + cvt.u32.u16 %r6249, %rs4034; + cvt.s32.s8 %r6250, %r6249; + mad.lo.s32 %r6251, %r222, %r6250, %r6242; + mad.lo.s32 %r6252, %r72, %r6248, %r6251; + mad.lo.s32 %r6253, %r73, %r6246, %r6252; + mad.lo.s32 %r6254, %r74, %r6244, %r6253; + ld.const.v4.u8 {%rs4042, %rs4043, %rs4044, %rs4045}, [matrix+2020]; + cvt.u32.u16 %r6255, %rs4045; + cvt.s32.s8 %r6256, %r6255; + cvt.u32.u16 %r6257, %rs4044; + cvt.s32.s8 %r6258, %r6257; + cvt.u32.u16 %r6259, %rs4043; + cvt.s32.s8 %r6260, %r6259; + cvt.u32.u16 %r6261, %rs4042; + cvt.s32.s8 %r6262, %r6261; + mad.lo.s32 %r6263, %r75, %r6262, %r6254; + mad.lo.s32 %r6264, %r76, %r6260, %r6263; + mad.lo.s32 %r6265, %r77, %r6258, %r6264; + mad.lo.s32 %r6266, %r78, %r6256, %r6265; + ld.const.v4.u8 {%rs4050, %rs4051, %rs4052, %rs4053}, [matrix+2024]; + cvt.u32.u16 %r6267, %rs4053; + cvt.s32.s8 %r6268, %r6267; + cvt.u32.u16 %r6269, %rs4052; + cvt.s32.s8 %r6270, %r6269; + cvt.u32.u16 %r6271, %rs4051; + cvt.s32.s8 %r6272, %r6271; + cvt.u32.u16 %r6273, %rs4050; + cvt.s32.s8 %r6274, %r6273; + mad.lo.s32 %r6275, %r80, %r6274, %r6266; + mad.lo.s32 %r6276, %r81, %r6272, %r6275; + mad.lo.s32 %r6277, %r83, %r6270, %r6276; + mad.lo.s32 %r6278, %r84, %r6268, %r6277; + ld.const.v4.u8 {%rs4058, %rs4059, %rs4060, %rs4061}, [matrix+2028]; + cvt.u32.u16 %r6279, %rs4061; + cvt.s32.s8 %r6280, %r6279; + cvt.u32.u16 %r6281, %rs4060; + cvt.s32.s8 %r6282, %r6281; + cvt.u32.u16 %r6283, %rs4059; + cvt.s32.s8 %r6284, %r6283; + cvt.u32.u16 %r6285, %rs4058; + cvt.s32.s8 %r6286, %r6285; + mad.lo.s32 %r6287, %r86, %r6286, %r6278; + mad.lo.s32 %r6288, %r87, %r6284, %r6287; + mad.lo.s32 %r6289, %r88, %r6282, %r6288; + mad.lo.s32 %r6290, %r89, %r6280, %r6289; + ld.const.v4.u8 {%rs4066, %rs4067, %rs4068, %rs4069}, [matrix+2032]; + cvt.u32.u16 %r6291, %rs4069; + cvt.s32.s8 %r6292, %r6291; + cvt.u32.u16 %r6293, %rs4068; + cvt.s32.s8 %r6294, %r6293; + cvt.u32.u16 %r6295, %rs4067; + cvt.s32.s8 %r6296, %r6295; + cvt.u32.u16 %r6297, %rs4066; + cvt.s32.s8 %r6298, %r6297; + mad.lo.s32 %r6299, %r271, %r6298, %r6290; + mad.lo.s32 %r6300, %r91, %r6296, %r6299; + mad.lo.s32 %r6301, %r93, %r6294, %r6300; + mad.lo.s32 %r6302, %r94, %r6292, %r6301; + ld.const.v4.u8 {%rs4074, %rs4075, %rs4076, %rs4077}, [matrix+2036]; + cvt.u32.u16 %r6303, %rs4077; + cvt.s32.s8 %r6304, %r6303; + cvt.u32.u16 %r6305, %rs4076; + cvt.s32.s8 %r6306, %r6305; + cvt.u32.u16 %r6307, %rs4075; + cvt.s32.s8 %r6308, %r6307; + cvt.u32.u16 %r6309, %rs4074; + 
cvt.s32.s8 %r6310, %r6309; + mad.lo.s32 %r6311, %r96, %r6310, %r6302; + mad.lo.s32 %r6312, %r97, %r6308, %r6311; + mad.lo.s32 %r6313, %r99, %r6306, %r6312; + mad.lo.s32 %r6314, %r100, %r6304, %r6313; + ld.const.v4.u8 {%rs4082, %rs4083, %rs4084, %rs4085}, [matrix+2040]; + cvt.u32.u16 %r6315, %rs4085; + cvt.s32.s8 %r6316, %r6315; + cvt.u32.u16 %r6317, %rs4084; + cvt.s32.s8 %r6318, %r6317; + cvt.u32.u16 %r6319, %rs4083; + cvt.s32.s8 %r6320, %r6319; + cvt.u32.u16 %r6321, %rs4082; + cvt.s32.s8 %r6322, %r6321; + mad.lo.s32 %r6323, %r103, %r6322, %r6314; + mad.lo.s32 %r6324, %r104, %r6320, %r6323; + mad.lo.s32 %r6325, %r107, %r6318, %r6324; + mad.lo.s32 %r6326, %r108, %r6316, %r6325; + ld.const.v4.u8 {%rs4090, %rs4091, %rs4092, %rs4093}, [matrix+2044]; + cvt.u32.u16 %r6327, %rs4093; + cvt.s32.s8 %r6328, %r6327; + cvt.u32.u16 %r6329, %rs4092; + cvt.s32.s8 %r6330, %r6329; + cvt.u32.u16 %r6331, %rs4091; + cvt.s32.s8 %r6332, %r6331; + cvt.u32.u16 %r6333, %rs4090; + cvt.s32.s8 %r6334, %r6333; + mad.lo.s32 %r6335, %r111, %r6334, %r6326; + mad.lo.s32 %r6336, %r112, %r6332, %r6335; + mad.lo.s32 %r6337, %r114, %r6330, %r6336; + mad.lo.s32 %r6338, %r115, %r6328, %r6337; + shr.u32 %r6339, %r6146, 6; + and.b32 %r6340, %r6339, 240; + shr.u32 %r6341, %r6338, 10; + or.b32 %r6342, %r6341, %r6340; + xor.b32 %r6343, %r26, %r6342; + ld.const.v4.u8 {%rs4098, %rs4099, %rs4100, %rs4101}, [matrix+2048]; + cvt.u32.u16 %r6344, %rs4101; + cvt.s32.s8 %r6345, %r6344; + cvt.u32.u16 %r6346, %rs4100; + cvt.s32.s8 %r6347, %r6346; + cvt.u32.u16 %r6348, %rs4098; + cvt.s32.s8 %r6349, %r6348; + cvt.u32.u16 %r6350, %rs4099; + cvt.s32.s8 %r6351, %r6350; + mul.lo.s32 %r6352, %r34, %r6351; + mad.lo.s32 %r6353, %r124, %r6349, %r6352; + mad.lo.s32 %r6354, %r35, %r6347, %r6353; + mad.lo.s32 %r6355, %r36, %r6345, %r6354; + ld.const.v4.u8 {%rs4106, %rs4107, %rs4108, %rs4109}, [matrix+2052]; + cvt.u32.u16 %r6356, %rs4109; + cvt.s32.s8 %r6357, %r6356; + cvt.u32.u16 %r6358, %rs4108; + cvt.s32.s8 %r6359, %r6358; + cvt.u32.u16 %r6360, %rs4107; + cvt.s32.s8 %r6361, %r6360; + cvt.u32.u16 %r6362, %rs4106; + cvt.s32.s8 %r6363, %r6362; + mad.lo.s32 %r6364, %r37, %r6363, %r6355; + mad.lo.s32 %r6365, %r38, %r6361, %r6364; + mad.lo.s32 %r6366, %r39, %r6359, %r6365; + mad.lo.s32 %r6367, %r40, %r6357, %r6366; + ld.const.v4.u8 {%rs4114, %rs4115, %rs4116, %rs4117}, [matrix+2056]; + cvt.u32.u16 %r6368, %rs4117; + cvt.s32.s8 %r6369, %r6368; + cvt.u32.u16 %r6370, %rs4116; + cvt.s32.s8 %r6371, %r6370; + cvt.u32.u16 %r6372, %rs4115; + cvt.s32.s8 %r6373, %r6372; + cvt.u32.u16 %r6374, %rs4114; + cvt.s32.s8 %r6375, %r6374; + mad.lo.s32 %r6376, %r42, %r6375, %r6367; + mad.lo.s32 %r6377, %r43, %r6373, %r6376; + mad.lo.s32 %r6378, %r45, %r6371, %r6377; + mad.lo.s32 %r6379, %r46, %r6369, %r6378; + ld.const.v4.u8 {%rs4122, %rs4123, %rs4124, %rs4125}, [matrix+2060]; + cvt.u32.u16 %r6380, %rs4125; + cvt.s32.s8 %r6381, %r6380; + cvt.u32.u16 %r6382, %rs4124; + cvt.s32.s8 %r6383, %r6382; + cvt.u32.u16 %r6384, %rs4123; + cvt.s32.s8 %r6385, %r6384; + cvt.u32.u16 %r6386, %rs4122; + cvt.s32.s8 %r6387, %r6386; + mad.lo.s32 %r6388, %r48, %r6387, %r6379; + mad.lo.s32 %r6389, %r49, %r6385, %r6388; + mad.lo.s32 %r6390, %r50, %r6383, %r6389; + mad.lo.s32 %r6391, %r51, %r6381, %r6390; + ld.const.v4.u8 {%rs4130, %rs4131, %rs4132, %rs4133}, [matrix+2064]; + cvt.u32.u16 %r6392, %rs4133; + cvt.s32.s8 %r6393, %r6392; + cvt.u32.u16 %r6394, %rs4132; + cvt.s32.s8 %r6395, %r6394; + cvt.u32.u16 %r6396, %rs4131; + cvt.s32.s8 %r6397, %r6396; + cvt.u32.u16 %r6398, %rs4130; + cvt.s32.s8 %r6399, 
+	// ... (generated PTX continues in the same fully unrolled pattern: each
+	// 64-byte signed row of the constant matrix, from [matrix+2048] through
+	// [matrix+2676], is loaded four bytes at a time with ld.const.v4.u8,
+	// sign-extended via cvt.u32.u16 / cvt.s32.s8, and folded into a dot
+	// product with one mul.lo.s32 followed by 63 mad.lo.s32 terms; each pair
+	// of consecutive row sums is then packed as ((hi >> 6) & 0xF0) | (lo >> 10)
+	// with shr.u32 / and.b32 / or.b32, xor.b32'ed with a state word
+	// (%r71, %r27, %r28, %r29, ...), and widened with cvt.u64.u32 into
+	// %rd393, %rd394, %rd395, ...)
mad.lo.s32 %r8257, %r97, %r8253, %r8256; + mad.lo.s32 %r8258, %r99, %r8251, %r8257; + mad.lo.s32 %r8259, %r100, %r8249, %r8258; + ld.const.v4.u8 {%rs5362, %rs5363, %rs5364, %rs5365}, [matrix+2680]; + cvt.u32.u16 %r8260, %rs5365; + cvt.s32.s8 %r8261, %r8260; + cvt.u32.u16 %r8262, %rs5364; + cvt.s32.s8 %r8263, %r8262; + cvt.u32.u16 %r8264, %rs5363; + cvt.s32.s8 %r8265, %r8264; + cvt.u32.u16 %r8266, %rs5362; + cvt.s32.s8 %r8267, %r8266; + mad.lo.s32 %r8268, %r103, %r8267, %r8259; + mad.lo.s32 %r8269, %r104, %r8265, %r8268; + mad.lo.s32 %r8270, %r107, %r8263, %r8269; + mad.lo.s32 %r8271, %r108, %r8261, %r8270; + ld.const.v4.u8 {%rs5370, %rs5371, %rs5372, %rs5373}, [matrix+2684]; + cvt.u32.u16 %r8272, %rs5373; + cvt.s32.s8 %r8273, %r8272; + cvt.u32.u16 %r8274, %rs5372; + cvt.s32.s8 %r8275, %r8274; + cvt.u32.u16 %r8276, %rs5371; + cvt.s32.s8 %r8277, %r8276; + cvt.u32.u16 %r8278, %rs5370; + cvt.s32.s8 %r8279, %r8278; + mad.lo.s32 %r8280, %r111, %r8279, %r8271; + mad.lo.s32 %r8281, %r112, %r8277, %r8280; + mad.lo.s32 %r8282, %r114, %r8275, %r8281; + mad.lo.s32 %r8283, %r115, %r8273, %r8282; + shr.u32 %r8284, %r8091, 6; + and.b32 %r8285, %r8284, 240; + shr.u32 %r8286, %r8283, 10; + or.b32 %r8287, %r8286, %r8285; + xor.b32 %r8288, %r30, %r8287; + cvt.u64.u32 %rd396, %r8288; + ld.const.v4.u8 {%rs5378, %rs5379, %rs5380, %rs5381}, [matrix+2688]; + cvt.u32.u16 %r8289, %rs5381; + cvt.s32.s8 %r8290, %r8289; + cvt.u32.u16 %r8291, %rs5380; + cvt.s32.s8 %r8292, %r8291; + cvt.u32.u16 %r8293, %rs5378; + cvt.s32.s8 %r8294, %r8293; + cvt.u32.u16 %r8295, %rs5379; + cvt.s32.s8 %r8296, %r8295; + mul.lo.s32 %r8297, %r34, %r8296; + mad.lo.s32 %r8298, %r124, %r8294, %r8297; + mad.lo.s32 %r8299, %r35, %r8292, %r8298; + mad.lo.s32 %r8300, %r36, %r8290, %r8299; + ld.const.v4.u8 {%rs5386, %rs5387, %rs5388, %rs5389}, [matrix+2692]; + cvt.u32.u16 %r8301, %rs5389; + cvt.s32.s8 %r8302, %r8301; + cvt.u32.u16 %r8303, %rs5388; + cvt.s32.s8 %r8304, %r8303; + cvt.u32.u16 %r8305, %rs5387; + cvt.s32.s8 %r8306, %r8305; + cvt.u32.u16 %r8307, %rs5386; + cvt.s32.s8 %r8308, %r8307; + mad.lo.s32 %r8309, %r37, %r8308, %r8300; + mad.lo.s32 %r8310, %r38, %r8306, %r8309; + mad.lo.s32 %r8311, %r39, %r8304, %r8310; + mad.lo.s32 %r8312, %r40, %r8302, %r8311; + ld.const.v4.u8 {%rs5394, %rs5395, %rs5396, %rs5397}, [matrix+2696]; + cvt.u32.u16 %r8313, %rs5397; + cvt.s32.s8 %r8314, %r8313; + cvt.u32.u16 %r8315, %rs5396; + cvt.s32.s8 %r8316, %r8315; + cvt.u32.u16 %r8317, %rs5395; + cvt.s32.s8 %r8318, %r8317; + cvt.u32.u16 %r8319, %rs5394; + cvt.s32.s8 %r8320, %r8319; + mad.lo.s32 %r8321, %r42, %r8320, %r8312; + mad.lo.s32 %r8322, %r43, %r8318, %r8321; + mad.lo.s32 %r8323, %r45, %r8316, %r8322; + mad.lo.s32 %r8324, %r46, %r8314, %r8323; + ld.const.v4.u8 {%rs5402, %rs5403, %rs5404, %rs5405}, [matrix+2700]; + cvt.u32.u16 %r8325, %rs5405; + cvt.s32.s8 %r8326, %r8325; + cvt.u32.u16 %r8327, %rs5404; + cvt.s32.s8 %r8328, %r8327; + cvt.u32.u16 %r8329, %rs5403; + cvt.s32.s8 %r8330, %r8329; + cvt.u32.u16 %r8331, %rs5402; + cvt.s32.s8 %r8332, %r8331; + mad.lo.s32 %r8333, %r48, %r8332, %r8324; + mad.lo.s32 %r8334, %r49, %r8330, %r8333; + mad.lo.s32 %r8335, %r50, %r8328, %r8334; + mad.lo.s32 %r8336, %r51, %r8326, %r8335; + ld.const.v4.u8 {%rs5410, %rs5411, %rs5412, %rs5413}, [matrix+2704]; + cvt.u32.u16 %r8337, %rs5413; + cvt.s32.s8 %r8338, %r8337; + cvt.u32.u16 %r8339, %rs5412; + cvt.s32.s8 %r8340, %r8339; + cvt.u32.u16 %r8341, %rs5411; + cvt.s32.s8 %r8342, %r8341; + cvt.u32.u16 %r8343, %rs5410; + cvt.s32.s8 %r8344, %r8343; + mad.lo.s32 %r8345, %r173, %r8344, 
%r8336; + mad.lo.s32 %r8346, %r53, %r8342, %r8345; + mad.lo.s32 %r8347, %r54, %r8340, %r8346; + mad.lo.s32 %r8348, %r55, %r8338, %r8347; + ld.const.v4.u8 {%rs5418, %rs5419, %rs5420, %rs5421}, [matrix+2708]; + cvt.u32.u16 %r8349, %rs5421; + cvt.s32.s8 %r8350, %r8349; + cvt.u32.u16 %r8351, %rs5420; + cvt.s32.s8 %r8352, %r8351; + cvt.u32.u16 %r8353, %rs5419; + cvt.s32.s8 %r8354, %r8353; + cvt.u32.u16 %r8355, %rs5418; + cvt.s32.s8 %r8356, %r8355; + mad.lo.s32 %r8357, %r56, %r8356, %r8348; + mad.lo.s32 %r8358, %r57, %r8354, %r8357; + mad.lo.s32 %r8359, %r58, %r8352, %r8358; + mad.lo.s32 %r8360, %r59, %r8350, %r8359; + ld.const.v4.u8 {%rs5426, %rs5427, %rs5428, %rs5429}, [matrix+2712]; + cvt.u32.u16 %r8361, %rs5429; + cvt.s32.s8 %r8362, %r8361; + cvt.u32.u16 %r8363, %rs5428; + cvt.s32.s8 %r8364, %r8363; + cvt.u32.u16 %r8365, %rs5427; + cvt.s32.s8 %r8366, %r8365; + cvt.u32.u16 %r8367, %rs5426; + cvt.s32.s8 %r8368, %r8367; + mad.lo.s32 %r8369, %r61, %r8368, %r8360; + mad.lo.s32 %r8370, %r62, %r8366, %r8369; + mad.lo.s32 %r8371, %r64, %r8364, %r8370; + mad.lo.s32 %r8372, %r65, %r8362, %r8371; + ld.const.v4.u8 {%rs5434, %rs5435, %rs5436, %rs5437}, [matrix+2716]; + cvt.u32.u16 %r8373, %rs5437; + cvt.s32.s8 %r8374, %r8373; + cvt.u32.u16 %r8375, %rs5436; + cvt.s32.s8 %r8376, %r8375; + cvt.u32.u16 %r8377, %rs5435; + cvt.s32.s8 %r8378, %r8377; + cvt.u32.u16 %r8379, %rs5434; + cvt.s32.s8 %r8380, %r8379; + mad.lo.s32 %r8381, %r67, %r8380, %r8372; + mad.lo.s32 %r8382, %r68, %r8378, %r8381; + mad.lo.s32 %r8383, %r69, %r8376, %r8382; + mad.lo.s32 %r8384, %r70, %r8374, %r8383; + ld.const.v4.u8 {%rs5442, %rs5443, %rs5444, %rs5445}, [matrix+2720]; + cvt.u32.u16 %r8385, %rs5445; + cvt.s32.s8 %r8386, %r8385; + cvt.u32.u16 %r8387, %rs5444; + cvt.s32.s8 %r8388, %r8387; + cvt.u32.u16 %r8389, %rs5443; + cvt.s32.s8 %r8390, %r8389; + cvt.u32.u16 %r8391, %rs5442; + cvt.s32.s8 %r8392, %r8391; + mad.lo.s32 %r8393, %r222, %r8392, %r8384; + mad.lo.s32 %r8394, %r72, %r8390, %r8393; + mad.lo.s32 %r8395, %r73, %r8388, %r8394; + mad.lo.s32 %r8396, %r74, %r8386, %r8395; + ld.const.v4.u8 {%rs5450, %rs5451, %rs5452, %rs5453}, [matrix+2724]; + cvt.u32.u16 %r8397, %rs5453; + cvt.s32.s8 %r8398, %r8397; + cvt.u32.u16 %r8399, %rs5452; + cvt.s32.s8 %r8400, %r8399; + cvt.u32.u16 %r8401, %rs5451; + cvt.s32.s8 %r8402, %r8401; + cvt.u32.u16 %r8403, %rs5450; + cvt.s32.s8 %r8404, %r8403; + mad.lo.s32 %r8405, %r75, %r8404, %r8396; + mad.lo.s32 %r8406, %r76, %r8402, %r8405; + mad.lo.s32 %r8407, %r77, %r8400, %r8406; + mad.lo.s32 %r8408, %r78, %r8398, %r8407; + ld.const.v4.u8 {%rs5458, %rs5459, %rs5460, %rs5461}, [matrix+2728]; + cvt.u32.u16 %r8409, %rs5461; + cvt.s32.s8 %r8410, %r8409; + cvt.u32.u16 %r8411, %rs5460; + cvt.s32.s8 %r8412, %r8411; + cvt.u32.u16 %r8413, %rs5459; + cvt.s32.s8 %r8414, %r8413; + cvt.u32.u16 %r8415, %rs5458; + cvt.s32.s8 %r8416, %r8415; + mad.lo.s32 %r8417, %r80, %r8416, %r8408; + mad.lo.s32 %r8418, %r81, %r8414, %r8417; + mad.lo.s32 %r8419, %r83, %r8412, %r8418; + mad.lo.s32 %r8420, %r84, %r8410, %r8419; + ld.const.v4.u8 {%rs5466, %rs5467, %rs5468, %rs5469}, [matrix+2732]; + cvt.u32.u16 %r8421, %rs5469; + cvt.s32.s8 %r8422, %r8421; + cvt.u32.u16 %r8423, %rs5468; + cvt.s32.s8 %r8424, %r8423; + cvt.u32.u16 %r8425, %rs5467; + cvt.s32.s8 %r8426, %r8425; + cvt.u32.u16 %r8427, %rs5466; + cvt.s32.s8 %r8428, %r8427; + mad.lo.s32 %r8429, %r86, %r8428, %r8420; + mad.lo.s32 %r8430, %r87, %r8426, %r8429; + mad.lo.s32 %r8431, %r88, %r8424, %r8430; + mad.lo.s32 %r8432, %r89, %r8422, %r8431; + ld.const.v4.u8 {%rs5474, %rs5475, 
%rs5476, %rs5477}, [matrix+2736]; + cvt.u32.u16 %r8433, %rs5477; + cvt.s32.s8 %r8434, %r8433; + cvt.u32.u16 %r8435, %rs5476; + cvt.s32.s8 %r8436, %r8435; + cvt.u32.u16 %r8437, %rs5475; + cvt.s32.s8 %r8438, %r8437; + cvt.u32.u16 %r8439, %rs5474; + cvt.s32.s8 %r8440, %r8439; + mad.lo.s32 %r8441, %r271, %r8440, %r8432; + mad.lo.s32 %r8442, %r91, %r8438, %r8441; + mad.lo.s32 %r8443, %r93, %r8436, %r8442; + mad.lo.s32 %r8444, %r94, %r8434, %r8443; + ld.const.v4.u8 {%rs5482, %rs5483, %rs5484, %rs5485}, [matrix+2740]; + cvt.u32.u16 %r8445, %rs5485; + cvt.s32.s8 %r8446, %r8445; + cvt.u32.u16 %r8447, %rs5484; + cvt.s32.s8 %r8448, %r8447; + cvt.u32.u16 %r8449, %rs5483; + cvt.s32.s8 %r8450, %r8449; + cvt.u32.u16 %r8451, %rs5482; + cvt.s32.s8 %r8452, %r8451; + mad.lo.s32 %r8453, %r96, %r8452, %r8444; + mad.lo.s32 %r8454, %r97, %r8450, %r8453; + mad.lo.s32 %r8455, %r99, %r8448, %r8454; + mad.lo.s32 %r8456, %r100, %r8446, %r8455; + ld.const.v4.u8 {%rs5490, %rs5491, %rs5492, %rs5493}, [matrix+2744]; + cvt.u32.u16 %r8457, %rs5493; + cvt.s32.s8 %r8458, %r8457; + cvt.u32.u16 %r8459, %rs5492; + cvt.s32.s8 %r8460, %r8459; + cvt.u32.u16 %r8461, %rs5491; + cvt.s32.s8 %r8462, %r8461; + cvt.u32.u16 %r8463, %rs5490; + cvt.s32.s8 %r8464, %r8463; + mad.lo.s32 %r8465, %r103, %r8464, %r8456; + mad.lo.s32 %r8466, %r104, %r8462, %r8465; + mad.lo.s32 %r8467, %r107, %r8460, %r8466; + mad.lo.s32 %r8468, %r108, %r8458, %r8467; + ld.const.v4.u8 {%rs5498, %rs5499, %rs5500, %rs5501}, [matrix+2748]; + cvt.u32.u16 %r8469, %rs5501; + cvt.s32.s8 %r8470, %r8469; + cvt.u32.u16 %r8471, %rs5500; + cvt.s32.s8 %r8472, %r8471; + cvt.u32.u16 %r8473, %rs5499; + cvt.s32.s8 %r8474, %r8473; + cvt.u32.u16 %r8475, %rs5498; + cvt.s32.s8 %r8476, %r8475; + mad.lo.s32 %r8477, %r111, %r8476, %r8468; + mad.lo.s32 %r8478, %r112, %r8474, %r8477; + mad.lo.s32 %r8479, %r114, %r8472, %r8478; + mad.lo.s32 %r8480, %r115, %r8470, %r8479; + ld.const.v4.u8 {%rs5506, %rs5507, %rs5508, %rs5509}, [matrix+2752]; + cvt.u32.u16 %r8481, %rs5509; + cvt.s32.s8 %r8482, %r8481; + cvt.u32.u16 %r8483, %rs5508; + cvt.s32.s8 %r8484, %r8483; + cvt.u32.u16 %r8485, %rs5506; + cvt.s32.s8 %r8486, %r8485; + cvt.u32.u16 %r8487, %rs5507; + cvt.s32.s8 %r8488, %r8487; + mul.lo.s32 %r8489, %r34, %r8488; + mad.lo.s32 %r8490, %r124, %r8486, %r8489; + mad.lo.s32 %r8491, %r35, %r8484, %r8490; + mad.lo.s32 %r8492, %r36, %r8482, %r8491; + ld.const.v4.u8 {%rs5514, %rs5515, %rs5516, %rs5517}, [matrix+2756]; + cvt.u32.u16 %r8493, %rs5517; + cvt.s32.s8 %r8494, %r8493; + cvt.u32.u16 %r8495, %rs5516; + cvt.s32.s8 %r8496, %r8495; + cvt.u32.u16 %r8497, %rs5515; + cvt.s32.s8 %r8498, %r8497; + cvt.u32.u16 %r8499, %rs5514; + cvt.s32.s8 %r8500, %r8499; + mad.lo.s32 %r8501, %r37, %r8500, %r8492; + mad.lo.s32 %r8502, %r38, %r8498, %r8501; + mad.lo.s32 %r8503, %r39, %r8496, %r8502; + mad.lo.s32 %r8504, %r40, %r8494, %r8503; + ld.const.v4.u8 {%rs5522, %rs5523, %rs5524, %rs5525}, [matrix+2760]; + cvt.u32.u16 %r8505, %rs5525; + cvt.s32.s8 %r8506, %r8505; + cvt.u32.u16 %r8507, %rs5524; + cvt.s32.s8 %r8508, %r8507; + cvt.u32.u16 %r8509, %rs5523; + cvt.s32.s8 %r8510, %r8509; + cvt.u32.u16 %r8511, %rs5522; + cvt.s32.s8 %r8512, %r8511; + mad.lo.s32 %r8513, %r42, %r8512, %r8504; + mad.lo.s32 %r8514, %r43, %r8510, %r8513; + mad.lo.s32 %r8515, %r45, %r8508, %r8514; + mad.lo.s32 %r8516, %r46, %r8506, %r8515; + ld.const.v4.u8 {%rs5530, %rs5531, %rs5532, %rs5533}, [matrix+2764]; + cvt.u32.u16 %r8517, %rs5533; + cvt.s32.s8 %r8518, %r8517; + cvt.u32.u16 %r8519, %rs5532; + cvt.s32.s8 %r8520, %r8519; + cvt.u32.u16 %r8521, 
%rs5531; + cvt.s32.s8 %r8522, %r8521; + cvt.u32.u16 %r8523, %rs5530; + cvt.s32.s8 %r8524, %r8523; + mad.lo.s32 %r8525, %r48, %r8524, %r8516; + mad.lo.s32 %r8526, %r49, %r8522, %r8525; + mad.lo.s32 %r8527, %r50, %r8520, %r8526; + mad.lo.s32 %r8528, %r51, %r8518, %r8527; + ld.const.v4.u8 {%rs5538, %rs5539, %rs5540, %rs5541}, [matrix+2768]; + cvt.u32.u16 %r8529, %rs5541; + cvt.s32.s8 %r8530, %r8529; + cvt.u32.u16 %r8531, %rs5540; + cvt.s32.s8 %r8532, %r8531; + cvt.u32.u16 %r8533, %rs5539; + cvt.s32.s8 %r8534, %r8533; + cvt.u32.u16 %r8535, %rs5538; + cvt.s32.s8 %r8536, %r8535; + mad.lo.s32 %r8537, %r173, %r8536, %r8528; + mad.lo.s32 %r8538, %r53, %r8534, %r8537; + mad.lo.s32 %r8539, %r54, %r8532, %r8538; + mad.lo.s32 %r8540, %r55, %r8530, %r8539; + ld.const.v4.u8 {%rs5546, %rs5547, %rs5548, %rs5549}, [matrix+2772]; + cvt.u32.u16 %r8541, %rs5549; + cvt.s32.s8 %r8542, %r8541; + cvt.u32.u16 %r8543, %rs5548; + cvt.s32.s8 %r8544, %r8543; + cvt.u32.u16 %r8545, %rs5547; + cvt.s32.s8 %r8546, %r8545; + cvt.u32.u16 %r8547, %rs5546; + cvt.s32.s8 %r8548, %r8547; + mad.lo.s32 %r8549, %r56, %r8548, %r8540; + mad.lo.s32 %r8550, %r57, %r8546, %r8549; + mad.lo.s32 %r8551, %r58, %r8544, %r8550; + mad.lo.s32 %r8552, %r59, %r8542, %r8551; + ld.const.v4.u8 {%rs5554, %rs5555, %rs5556, %rs5557}, [matrix+2776]; + cvt.u32.u16 %r8553, %rs5557; + cvt.s32.s8 %r8554, %r8553; + cvt.u32.u16 %r8555, %rs5556; + cvt.s32.s8 %r8556, %r8555; + cvt.u32.u16 %r8557, %rs5555; + cvt.s32.s8 %r8558, %r8557; + cvt.u32.u16 %r8559, %rs5554; + cvt.s32.s8 %r8560, %r8559; + mad.lo.s32 %r8561, %r61, %r8560, %r8552; + mad.lo.s32 %r8562, %r62, %r8558, %r8561; + mad.lo.s32 %r8563, %r64, %r8556, %r8562; + mad.lo.s32 %r8564, %r65, %r8554, %r8563; + ld.const.v4.u8 {%rs5562, %rs5563, %rs5564, %rs5565}, [matrix+2780]; + cvt.u32.u16 %r8565, %rs5565; + cvt.s32.s8 %r8566, %r8565; + cvt.u32.u16 %r8567, %rs5564; + cvt.s32.s8 %r8568, %r8567; + cvt.u32.u16 %r8569, %rs5563; + cvt.s32.s8 %r8570, %r8569; + cvt.u32.u16 %r8571, %rs5562; + cvt.s32.s8 %r8572, %r8571; + mad.lo.s32 %r8573, %r67, %r8572, %r8564; + mad.lo.s32 %r8574, %r68, %r8570, %r8573; + mad.lo.s32 %r8575, %r69, %r8568, %r8574; + mad.lo.s32 %r8576, %r70, %r8566, %r8575; + ld.const.v4.u8 {%rs5570, %rs5571, %rs5572, %rs5573}, [matrix+2784]; + cvt.u32.u16 %r8577, %rs5573; + cvt.s32.s8 %r8578, %r8577; + cvt.u32.u16 %r8579, %rs5572; + cvt.s32.s8 %r8580, %r8579; + cvt.u32.u16 %r8581, %rs5571; + cvt.s32.s8 %r8582, %r8581; + cvt.u32.u16 %r8583, %rs5570; + cvt.s32.s8 %r8584, %r8583; + mad.lo.s32 %r8585, %r222, %r8584, %r8576; + mad.lo.s32 %r8586, %r72, %r8582, %r8585; + mad.lo.s32 %r8587, %r73, %r8580, %r8586; + mad.lo.s32 %r8588, %r74, %r8578, %r8587; + ld.const.v4.u8 {%rs5578, %rs5579, %rs5580, %rs5581}, [matrix+2788]; + cvt.u32.u16 %r8589, %rs5581; + cvt.s32.s8 %r8590, %r8589; + cvt.u32.u16 %r8591, %rs5580; + cvt.s32.s8 %r8592, %r8591; + cvt.u32.u16 %r8593, %rs5579; + cvt.s32.s8 %r8594, %r8593; + cvt.u32.u16 %r8595, %rs5578; + cvt.s32.s8 %r8596, %r8595; + mad.lo.s32 %r8597, %r75, %r8596, %r8588; + mad.lo.s32 %r8598, %r76, %r8594, %r8597; + mad.lo.s32 %r8599, %r77, %r8592, %r8598; + mad.lo.s32 %r8600, %r78, %r8590, %r8599; + ld.const.v4.u8 {%rs5586, %rs5587, %rs5588, %rs5589}, [matrix+2792]; + cvt.u32.u16 %r8601, %rs5589; + cvt.s32.s8 %r8602, %r8601; + cvt.u32.u16 %r8603, %rs5588; + cvt.s32.s8 %r8604, %r8603; + cvt.u32.u16 %r8605, %rs5587; + cvt.s32.s8 %r8606, %r8605; + cvt.u32.u16 %r8607, %rs5586; + cvt.s32.s8 %r8608, %r8607; + mad.lo.s32 %r8609, %r80, %r8608, %r8600; + mad.lo.s32 %r8610, %r81, %r8606, 
%r8609; + mad.lo.s32 %r8611, %r83, %r8604, %r8610; + mad.lo.s32 %r8612, %r84, %r8602, %r8611; + ld.const.v4.u8 {%rs5594, %rs5595, %rs5596, %rs5597}, [matrix+2796]; + cvt.u32.u16 %r8613, %rs5597; + cvt.s32.s8 %r8614, %r8613; + cvt.u32.u16 %r8615, %rs5596; + cvt.s32.s8 %r8616, %r8615; + cvt.u32.u16 %r8617, %rs5595; + cvt.s32.s8 %r8618, %r8617; + cvt.u32.u16 %r8619, %rs5594; + cvt.s32.s8 %r8620, %r8619; + mad.lo.s32 %r8621, %r86, %r8620, %r8612; + mad.lo.s32 %r8622, %r87, %r8618, %r8621; + mad.lo.s32 %r8623, %r88, %r8616, %r8622; + mad.lo.s32 %r8624, %r89, %r8614, %r8623; + ld.const.v4.u8 {%rs5602, %rs5603, %rs5604, %rs5605}, [matrix+2800]; + cvt.u32.u16 %r8625, %rs5605; + cvt.s32.s8 %r8626, %r8625; + cvt.u32.u16 %r8627, %rs5604; + cvt.s32.s8 %r8628, %r8627; + cvt.u32.u16 %r8629, %rs5603; + cvt.s32.s8 %r8630, %r8629; + cvt.u32.u16 %r8631, %rs5602; + cvt.s32.s8 %r8632, %r8631; + mad.lo.s32 %r8633, %r271, %r8632, %r8624; + mad.lo.s32 %r8634, %r91, %r8630, %r8633; + mad.lo.s32 %r8635, %r93, %r8628, %r8634; + mad.lo.s32 %r8636, %r94, %r8626, %r8635; + ld.const.v4.u8 {%rs5610, %rs5611, %rs5612, %rs5613}, [matrix+2804]; + cvt.u32.u16 %r8637, %rs5613; + cvt.s32.s8 %r8638, %r8637; + cvt.u32.u16 %r8639, %rs5612; + cvt.s32.s8 %r8640, %r8639; + cvt.u32.u16 %r8641, %rs5611; + cvt.s32.s8 %r8642, %r8641; + cvt.u32.u16 %r8643, %rs5610; + cvt.s32.s8 %r8644, %r8643; + mad.lo.s32 %r8645, %r96, %r8644, %r8636; + mad.lo.s32 %r8646, %r97, %r8642, %r8645; + mad.lo.s32 %r8647, %r99, %r8640, %r8646; + mad.lo.s32 %r8648, %r100, %r8638, %r8647; + ld.const.v4.u8 {%rs5618, %rs5619, %rs5620, %rs5621}, [matrix+2808]; + cvt.u32.u16 %r8649, %rs5621; + cvt.s32.s8 %r8650, %r8649; + cvt.u32.u16 %r8651, %rs5620; + cvt.s32.s8 %r8652, %r8651; + cvt.u32.u16 %r8653, %rs5619; + cvt.s32.s8 %r8654, %r8653; + cvt.u32.u16 %r8655, %rs5618; + cvt.s32.s8 %r8656, %r8655; + mad.lo.s32 %r8657, %r103, %r8656, %r8648; + mad.lo.s32 %r8658, %r104, %r8654, %r8657; + mad.lo.s32 %r8659, %r107, %r8652, %r8658; + mad.lo.s32 %r8660, %r108, %r8650, %r8659; + ld.const.v4.u8 {%rs5626, %rs5627, %rs5628, %rs5629}, [matrix+2812]; + cvt.u32.u16 %r8661, %rs5629; + cvt.s32.s8 %r8662, %r8661; + cvt.u32.u16 %r8663, %rs5628; + cvt.s32.s8 %r8664, %r8663; + cvt.u32.u16 %r8665, %rs5627; + cvt.s32.s8 %r8666, %r8665; + cvt.u32.u16 %r8667, %rs5626; + cvt.s32.s8 %r8668, %r8667; + mad.lo.s32 %r8669, %r111, %r8668, %r8660; + mad.lo.s32 %r8670, %r112, %r8666, %r8669; + mad.lo.s32 %r8671, %r114, %r8664, %r8670; + mad.lo.s32 %r8672, %r115, %r8662, %r8671; + shr.u32 %r8673, %r8480, 6; + and.b32 %r8674, %r8673, 240; + shr.u32 %r8675, %r8672, 10; + or.b32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r31, %r8676; + cvt.u64.u32 %rd397, %r8677; + ld.const.v4.u8 {%rs5634, %rs5635, %rs5636, %rs5637}, [matrix+2816]; + cvt.u32.u16 %r8678, %rs5637; + cvt.s32.s8 %r8679, %r8678; + cvt.u32.u16 %r8680, %rs5636; + cvt.s32.s8 %r8681, %r8680; + cvt.u32.u16 %r8682, %rs5634; + cvt.s32.s8 %r8683, %r8682; + cvt.u32.u16 %r8684, %rs5635; + cvt.s32.s8 %r8685, %r8684; + mul.lo.s32 %r8686, %r34, %r8685; + mad.lo.s32 %r8687, %r124, %r8683, %r8686; + mad.lo.s32 %r8688, %r35, %r8681, %r8687; + mad.lo.s32 %r8689, %r36, %r8679, %r8688; + ld.const.v4.u8 {%rs5642, %rs5643, %rs5644, %rs5645}, [matrix+2820]; + cvt.u32.u16 %r8690, %rs5645; + cvt.s32.s8 %r8691, %r8690; + cvt.u32.u16 %r8692, %rs5644; + cvt.s32.s8 %r8693, %r8692; + cvt.u32.u16 %r8694, %rs5643; + cvt.s32.s8 %r8695, %r8694; + cvt.u32.u16 %r8696, %rs5642; + cvt.s32.s8 %r8697, %r8696; + mad.lo.s32 %r8698, %r37, %r8697, %r8689; + mad.lo.s32 %r8699, %r38, 
%r8695, %r8698; + mad.lo.s32 %r8700, %r39, %r8693, %r8699; + mad.lo.s32 %r8701, %r40, %r8691, %r8700; + ld.const.v4.u8 {%rs5650, %rs5651, %rs5652, %rs5653}, [matrix+2824]; + cvt.u32.u16 %r8702, %rs5653; + cvt.s32.s8 %r8703, %r8702; + cvt.u32.u16 %r8704, %rs5652; + cvt.s32.s8 %r8705, %r8704; + cvt.u32.u16 %r8706, %rs5651; + cvt.s32.s8 %r8707, %r8706; + cvt.u32.u16 %r8708, %rs5650; + cvt.s32.s8 %r8709, %r8708; + mad.lo.s32 %r8710, %r42, %r8709, %r8701; + mad.lo.s32 %r8711, %r43, %r8707, %r8710; + mad.lo.s32 %r8712, %r45, %r8705, %r8711; + mad.lo.s32 %r8713, %r46, %r8703, %r8712; + ld.const.v4.u8 {%rs5658, %rs5659, %rs5660, %rs5661}, [matrix+2828]; + cvt.u32.u16 %r8714, %rs5661; + cvt.s32.s8 %r8715, %r8714; + cvt.u32.u16 %r8716, %rs5660; + cvt.s32.s8 %r8717, %r8716; + cvt.u32.u16 %r8718, %rs5659; + cvt.s32.s8 %r8719, %r8718; + cvt.u32.u16 %r8720, %rs5658; + cvt.s32.s8 %r8721, %r8720; + mad.lo.s32 %r8722, %r48, %r8721, %r8713; + mad.lo.s32 %r8723, %r49, %r8719, %r8722; + mad.lo.s32 %r8724, %r50, %r8717, %r8723; + mad.lo.s32 %r8725, %r51, %r8715, %r8724; + ld.const.v4.u8 {%rs5666, %rs5667, %rs5668, %rs5669}, [matrix+2832]; + cvt.u32.u16 %r8726, %rs5669; + cvt.s32.s8 %r8727, %r8726; + cvt.u32.u16 %r8728, %rs5668; + cvt.s32.s8 %r8729, %r8728; + cvt.u32.u16 %r8730, %rs5667; + cvt.s32.s8 %r8731, %r8730; + cvt.u32.u16 %r8732, %rs5666; + cvt.s32.s8 %r8733, %r8732; + mad.lo.s32 %r8734, %r173, %r8733, %r8725; + mad.lo.s32 %r8735, %r53, %r8731, %r8734; + mad.lo.s32 %r8736, %r54, %r8729, %r8735; + mad.lo.s32 %r8737, %r55, %r8727, %r8736; + ld.const.v4.u8 {%rs5674, %rs5675, %rs5676, %rs5677}, [matrix+2836]; + cvt.u32.u16 %r8738, %rs5677; + cvt.s32.s8 %r8739, %r8738; + cvt.u32.u16 %r8740, %rs5676; + cvt.s32.s8 %r8741, %r8740; + cvt.u32.u16 %r8742, %rs5675; + cvt.s32.s8 %r8743, %r8742; + cvt.u32.u16 %r8744, %rs5674; + cvt.s32.s8 %r8745, %r8744; + mad.lo.s32 %r8746, %r56, %r8745, %r8737; + mad.lo.s32 %r8747, %r57, %r8743, %r8746; + mad.lo.s32 %r8748, %r58, %r8741, %r8747; + mad.lo.s32 %r8749, %r59, %r8739, %r8748; + ld.const.v4.u8 {%rs5682, %rs5683, %rs5684, %rs5685}, [matrix+2840]; + cvt.u32.u16 %r8750, %rs5685; + cvt.s32.s8 %r8751, %r8750; + cvt.u32.u16 %r8752, %rs5684; + cvt.s32.s8 %r8753, %r8752; + cvt.u32.u16 %r8754, %rs5683; + cvt.s32.s8 %r8755, %r8754; + cvt.u32.u16 %r8756, %rs5682; + cvt.s32.s8 %r8757, %r8756; + mad.lo.s32 %r8758, %r61, %r8757, %r8749; + mad.lo.s32 %r8759, %r62, %r8755, %r8758; + mad.lo.s32 %r8760, %r64, %r8753, %r8759; + mad.lo.s32 %r8761, %r65, %r8751, %r8760; + ld.const.v4.u8 {%rs5690, %rs5691, %rs5692, %rs5693}, [matrix+2844]; + cvt.u32.u16 %r8762, %rs5693; + cvt.s32.s8 %r8763, %r8762; + cvt.u32.u16 %r8764, %rs5692; + cvt.s32.s8 %r8765, %r8764; + cvt.u32.u16 %r8766, %rs5691; + cvt.s32.s8 %r8767, %r8766; + cvt.u32.u16 %r8768, %rs5690; + cvt.s32.s8 %r8769, %r8768; + mad.lo.s32 %r8770, %r67, %r8769, %r8761; + mad.lo.s32 %r8771, %r68, %r8767, %r8770; + mad.lo.s32 %r8772, %r69, %r8765, %r8771; + mad.lo.s32 %r8773, %r70, %r8763, %r8772; + ld.const.v4.u8 {%rs5698, %rs5699, %rs5700, %rs5701}, [matrix+2848]; + cvt.u32.u16 %r8774, %rs5701; + cvt.s32.s8 %r8775, %r8774; + cvt.u32.u16 %r8776, %rs5700; + cvt.s32.s8 %r8777, %r8776; + cvt.u32.u16 %r8778, %rs5699; + cvt.s32.s8 %r8779, %r8778; + cvt.u32.u16 %r8780, %rs5698; + cvt.s32.s8 %r8781, %r8780; + mad.lo.s32 %r8782, %r222, %r8781, %r8773; + mad.lo.s32 %r8783, %r72, %r8779, %r8782; + mad.lo.s32 %r8784, %r73, %r8777, %r8783; + mad.lo.s32 %r8785, %r74, %r8775, %r8784; + ld.const.v4.u8 {%rs5706, %rs5707, %rs5708, %rs5709}, [matrix+2852]; + 
cvt.u32.u16 %r8786, %rs5709; + cvt.s32.s8 %r8787, %r8786; + cvt.u32.u16 %r8788, %rs5708; + cvt.s32.s8 %r8789, %r8788; + cvt.u32.u16 %r8790, %rs5707; + cvt.s32.s8 %r8791, %r8790; + cvt.u32.u16 %r8792, %rs5706; + cvt.s32.s8 %r8793, %r8792; + mad.lo.s32 %r8794, %r75, %r8793, %r8785; + mad.lo.s32 %r8795, %r76, %r8791, %r8794; + mad.lo.s32 %r8796, %r77, %r8789, %r8795; + mad.lo.s32 %r8797, %r78, %r8787, %r8796; + ld.const.v4.u8 {%rs5714, %rs5715, %rs5716, %rs5717}, [matrix+2856]; + cvt.u32.u16 %r8798, %rs5717; + cvt.s32.s8 %r8799, %r8798; + cvt.u32.u16 %r8800, %rs5716; + cvt.s32.s8 %r8801, %r8800; + cvt.u32.u16 %r8802, %rs5715; + cvt.s32.s8 %r8803, %r8802; + cvt.u32.u16 %r8804, %rs5714; + cvt.s32.s8 %r8805, %r8804; + mad.lo.s32 %r8806, %r80, %r8805, %r8797; + mad.lo.s32 %r8807, %r81, %r8803, %r8806; + mad.lo.s32 %r8808, %r83, %r8801, %r8807; + mad.lo.s32 %r8809, %r84, %r8799, %r8808; + ld.const.v4.u8 {%rs5722, %rs5723, %rs5724, %rs5725}, [matrix+2860]; + cvt.u32.u16 %r8810, %rs5725; + cvt.s32.s8 %r8811, %r8810; + cvt.u32.u16 %r8812, %rs5724; + cvt.s32.s8 %r8813, %r8812; + cvt.u32.u16 %r8814, %rs5723; + cvt.s32.s8 %r8815, %r8814; + cvt.u32.u16 %r8816, %rs5722; + cvt.s32.s8 %r8817, %r8816; + mad.lo.s32 %r8818, %r86, %r8817, %r8809; + mad.lo.s32 %r8819, %r87, %r8815, %r8818; + mad.lo.s32 %r8820, %r88, %r8813, %r8819; + mad.lo.s32 %r8821, %r89, %r8811, %r8820; + ld.const.v4.u8 {%rs5730, %rs5731, %rs5732, %rs5733}, [matrix+2864]; + cvt.u32.u16 %r8822, %rs5733; + cvt.s32.s8 %r8823, %r8822; + cvt.u32.u16 %r8824, %rs5732; + cvt.s32.s8 %r8825, %r8824; + cvt.u32.u16 %r8826, %rs5731; + cvt.s32.s8 %r8827, %r8826; + cvt.u32.u16 %r8828, %rs5730; + cvt.s32.s8 %r8829, %r8828; + mad.lo.s32 %r8830, %r271, %r8829, %r8821; + mad.lo.s32 %r8831, %r91, %r8827, %r8830; + mad.lo.s32 %r8832, %r93, %r8825, %r8831; + mad.lo.s32 %r8833, %r94, %r8823, %r8832; + ld.const.v4.u8 {%rs5738, %rs5739, %rs5740, %rs5741}, [matrix+2868]; + cvt.u32.u16 %r8834, %rs5741; + cvt.s32.s8 %r8835, %r8834; + cvt.u32.u16 %r8836, %rs5740; + cvt.s32.s8 %r8837, %r8836; + cvt.u32.u16 %r8838, %rs5739; + cvt.s32.s8 %r8839, %r8838; + cvt.u32.u16 %r8840, %rs5738; + cvt.s32.s8 %r8841, %r8840; + mad.lo.s32 %r8842, %r96, %r8841, %r8833; + mad.lo.s32 %r8843, %r97, %r8839, %r8842; + mad.lo.s32 %r8844, %r99, %r8837, %r8843; + mad.lo.s32 %r8845, %r100, %r8835, %r8844; + ld.const.v4.u8 {%rs5746, %rs5747, %rs5748, %rs5749}, [matrix+2872]; + cvt.u32.u16 %r8846, %rs5749; + cvt.s32.s8 %r8847, %r8846; + cvt.u32.u16 %r8848, %rs5748; + cvt.s32.s8 %r8849, %r8848; + cvt.u32.u16 %r8850, %rs5747; + cvt.s32.s8 %r8851, %r8850; + cvt.u32.u16 %r8852, %rs5746; + cvt.s32.s8 %r8853, %r8852; + mad.lo.s32 %r8854, %r103, %r8853, %r8845; + mad.lo.s32 %r8855, %r104, %r8851, %r8854; + mad.lo.s32 %r8856, %r107, %r8849, %r8855; + mad.lo.s32 %r8857, %r108, %r8847, %r8856; + ld.const.v4.u8 {%rs5754, %rs5755, %rs5756, %rs5757}, [matrix+2876]; + cvt.u32.u16 %r8858, %rs5757; + cvt.s32.s8 %r8859, %r8858; + cvt.u32.u16 %r8860, %rs5756; + cvt.s32.s8 %r8861, %r8860; + cvt.u32.u16 %r8862, %rs5755; + cvt.s32.s8 %r8863, %r8862; + cvt.u32.u16 %r8864, %rs5754; + cvt.s32.s8 %r8865, %r8864; + mad.lo.s32 %r8866, %r111, %r8865, %r8857; + mad.lo.s32 %r8867, %r112, %r8863, %r8866; + mad.lo.s32 %r8868, %r114, %r8861, %r8867; + mad.lo.s32 %r8869, %r115, %r8859, %r8868; + ld.const.v4.u8 {%rs5762, %rs5763, %rs5764, %rs5765}, [matrix+2880]; + cvt.u32.u16 %r8870, %rs5765; + cvt.s32.s8 %r8871, %r8870; + cvt.u32.u16 %r8872, %rs5764; + cvt.s32.s8 %r8873, %r8872; + cvt.u32.u16 %r8874, %rs5762; + cvt.s32.s8 %r8875, 
%r8874; + cvt.u32.u16 %r8876, %rs5763; + cvt.s32.s8 %r8877, %r8876; + mul.lo.s32 %r8878, %r34, %r8877; + mad.lo.s32 %r8879, %r124, %r8875, %r8878; + mad.lo.s32 %r8880, %r35, %r8873, %r8879; + mad.lo.s32 %r8881, %r36, %r8871, %r8880; + ld.const.v4.u8 {%rs5770, %rs5771, %rs5772, %rs5773}, [matrix+2884]; + cvt.u32.u16 %r8882, %rs5773; + cvt.s32.s8 %r8883, %r8882; + cvt.u32.u16 %r8884, %rs5772; + cvt.s32.s8 %r8885, %r8884; + cvt.u32.u16 %r8886, %rs5771; + cvt.s32.s8 %r8887, %r8886; + cvt.u32.u16 %r8888, %rs5770; + cvt.s32.s8 %r8889, %r8888; + mad.lo.s32 %r8890, %r37, %r8889, %r8881; + mad.lo.s32 %r8891, %r38, %r8887, %r8890; + mad.lo.s32 %r8892, %r39, %r8885, %r8891; + mad.lo.s32 %r8893, %r40, %r8883, %r8892; + ld.const.v4.u8 {%rs5778, %rs5779, %rs5780, %rs5781}, [matrix+2888]; + cvt.u32.u16 %r8894, %rs5781; + cvt.s32.s8 %r8895, %r8894; + cvt.u32.u16 %r8896, %rs5780; + cvt.s32.s8 %r8897, %r8896; + cvt.u32.u16 %r8898, %rs5779; + cvt.s32.s8 %r8899, %r8898; + cvt.u32.u16 %r8900, %rs5778; + cvt.s32.s8 %r8901, %r8900; + mad.lo.s32 %r8902, %r42, %r8901, %r8893; + mad.lo.s32 %r8903, %r43, %r8899, %r8902; + mad.lo.s32 %r8904, %r45, %r8897, %r8903; + mad.lo.s32 %r8905, %r46, %r8895, %r8904; + ld.const.v4.u8 {%rs5786, %rs5787, %rs5788, %rs5789}, [matrix+2892]; + cvt.u32.u16 %r8906, %rs5789; + cvt.s32.s8 %r8907, %r8906; + cvt.u32.u16 %r8908, %rs5788; + cvt.s32.s8 %r8909, %r8908; + cvt.u32.u16 %r8910, %rs5787; + cvt.s32.s8 %r8911, %r8910; + cvt.u32.u16 %r8912, %rs5786; + cvt.s32.s8 %r8913, %r8912; + mad.lo.s32 %r8914, %r48, %r8913, %r8905; + mad.lo.s32 %r8915, %r49, %r8911, %r8914; + mad.lo.s32 %r8916, %r50, %r8909, %r8915; + mad.lo.s32 %r8917, %r51, %r8907, %r8916; + ld.const.v4.u8 {%rs5794, %rs5795, %rs5796, %rs5797}, [matrix+2896]; + cvt.u32.u16 %r8918, %rs5797; + cvt.s32.s8 %r8919, %r8918; + cvt.u32.u16 %r8920, %rs5796; + cvt.s32.s8 %r8921, %r8920; + cvt.u32.u16 %r8922, %rs5795; + cvt.s32.s8 %r8923, %r8922; + cvt.u32.u16 %r8924, %rs5794; + cvt.s32.s8 %r8925, %r8924; + mad.lo.s32 %r8926, %r173, %r8925, %r8917; + mad.lo.s32 %r8927, %r53, %r8923, %r8926; + mad.lo.s32 %r8928, %r54, %r8921, %r8927; + mad.lo.s32 %r8929, %r55, %r8919, %r8928; + ld.const.v4.u8 {%rs5802, %rs5803, %rs5804, %rs5805}, [matrix+2900]; + cvt.u32.u16 %r8930, %rs5805; + cvt.s32.s8 %r8931, %r8930; + cvt.u32.u16 %r8932, %rs5804; + cvt.s32.s8 %r8933, %r8932; + cvt.u32.u16 %r8934, %rs5803; + cvt.s32.s8 %r8935, %r8934; + cvt.u32.u16 %r8936, %rs5802; + cvt.s32.s8 %r8937, %r8936; + mad.lo.s32 %r8938, %r56, %r8937, %r8929; + mad.lo.s32 %r8939, %r57, %r8935, %r8938; + mad.lo.s32 %r8940, %r58, %r8933, %r8939; + mad.lo.s32 %r8941, %r59, %r8931, %r8940; + ld.const.v4.u8 {%rs5810, %rs5811, %rs5812, %rs5813}, [matrix+2904]; + cvt.u32.u16 %r8942, %rs5813; + cvt.s32.s8 %r8943, %r8942; + cvt.u32.u16 %r8944, %rs5812; + cvt.s32.s8 %r8945, %r8944; + cvt.u32.u16 %r8946, %rs5811; + cvt.s32.s8 %r8947, %r8946; + cvt.u32.u16 %r8948, %rs5810; + cvt.s32.s8 %r8949, %r8948; + mad.lo.s32 %r8950, %r61, %r8949, %r8941; + mad.lo.s32 %r8951, %r62, %r8947, %r8950; + mad.lo.s32 %r8952, %r64, %r8945, %r8951; + mad.lo.s32 %r8953, %r65, %r8943, %r8952; + ld.const.v4.u8 {%rs5818, %rs5819, %rs5820, %rs5821}, [matrix+2908]; + cvt.u32.u16 %r8954, %rs5821; + cvt.s32.s8 %r8955, %r8954; + cvt.u32.u16 %r8956, %rs5820; + cvt.s32.s8 %r8957, %r8956; + cvt.u32.u16 %r8958, %rs5819; + cvt.s32.s8 %r8959, %r8958; + cvt.u32.u16 %r8960, %rs5818; + cvt.s32.s8 %r8961, %r8960; + mad.lo.s32 %r8962, %r67, %r8961, %r8953; + mad.lo.s32 %r8963, %r68, %r8959, %r8962; + mad.lo.s32 %r8964, %r69, 
%r8957, %r8963; + mad.lo.s32 %r8965, %r70, %r8955, %r8964; + ld.const.v4.u8 {%rs5826, %rs5827, %rs5828, %rs5829}, [matrix+2912]; + cvt.u32.u16 %r8966, %rs5829; + cvt.s32.s8 %r8967, %r8966; + cvt.u32.u16 %r8968, %rs5828; + cvt.s32.s8 %r8969, %r8968; + cvt.u32.u16 %r8970, %rs5827; + cvt.s32.s8 %r8971, %r8970; + cvt.u32.u16 %r8972, %rs5826; + cvt.s32.s8 %r8973, %r8972; + mad.lo.s32 %r8974, %r222, %r8973, %r8965; + mad.lo.s32 %r8975, %r72, %r8971, %r8974; + mad.lo.s32 %r8976, %r73, %r8969, %r8975; + mad.lo.s32 %r8977, %r74, %r8967, %r8976; + ld.const.v4.u8 {%rs5834, %rs5835, %rs5836, %rs5837}, [matrix+2916]; + cvt.u32.u16 %r8978, %rs5837; + cvt.s32.s8 %r8979, %r8978; + cvt.u32.u16 %r8980, %rs5836; + cvt.s32.s8 %r8981, %r8980; + cvt.u32.u16 %r8982, %rs5835; + cvt.s32.s8 %r8983, %r8982; + cvt.u32.u16 %r8984, %rs5834; + cvt.s32.s8 %r8985, %r8984; + mad.lo.s32 %r8986, %r75, %r8985, %r8977; + mad.lo.s32 %r8987, %r76, %r8983, %r8986; + mad.lo.s32 %r8988, %r77, %r8981, %r8987; + mad.lo.s32 %r8989, %r78, %r8979, %r8988; + ld.const.v4.u8 {%rs5842, %rs5843, %rs5844, %rs5845}, [matrix+2920]; + cvt.u32.u16 %r8990, %rs5845; + cvt.s32.s8 %r8991, %r8990; + cvt.u32.u16 %r8992, %rs5844; + cvt.s32.s8 %r8993, %r8992; + cvt.u32.u16 %r8994, %rs5843; + cvt.s32.s8 %r8995, %r8994; + cvt.u32.u16 %r8996, %rs5842; + cvt.s32.s8 %r8997, %r8996; + mad.lo.s32 %r8998, %r80, %r8997, %r8989; + mad.lo.s32 %r8999, %r81, %r8995, %r8998; + mad.lo.s32 %r9000, %r83, %r8993, %r8999; + mad.lo.s32 %r9001, %r84, %r8991, %r9000; + ld.const.v4.u8 {%rs5850, %rs5851, %rs5852, %rs5853}, [matrix+2924]; + cvt.u32.u16 %r9002, %rs5853; + cvt.s32.s8 %r9003, %r9002; + cvt.u32.u16 %r9004, %rs5852; + cvt.s32.s8 %r9005, %r9004; + cvt.u32.u16 %r9006, %rs5851; + cvt.s32.s8 %r9007, %r9006; + cvt.u32.u16 %r9008, %rs5850; + cvt.s32.s8 %r9009, %r9008; + mad.lo.s32 %r9010, %r86, %r9009, %r9001; + mad.lo.s32 %r9011, %r87, %r9007, %r9010; + mad.lo.s32 %r9012, %r88, %r9005, %r9011; + mad.lo.s32 %r9013, %r89, %r9003, %r9012; + ld.const.v4.u8 {%rs5858, %rs5859, %rs5860, %rs5861}, [matrix+2928]; + cvt.u32.u16 %r9014, %rs5861; + cvt.s32.s8 %r9015, %r9014; + cvt.u32.u16 %r9016, %rs5860; + cvt.s32.s8 %r9017, %r9016; + cvt.u32.u16 %r9018, %rs5859; + cvt.s32.s8 %r9019, %r9018; + cvt.u32.u16 %r9020, %rs5858; + cvt.s32.s8 %r9021, %r9020; + mad.lo.s32 %r9022, %r271, %r9021, %r9013; + mad.lo.s32 %r9023, %r91, %r9019, %r9022; + mad.lo.s32 %r9024, %r93, %r9017, %r9023; + mad.lo.s32 %r9025, %r94, %r9015, %r9024; + ld.const.v4.u8 {%rs5866, %rs5867, %rs5868, %rs5869}, [matrix+2932]; + cvt.u32.u16 %r9026, %rs5869; + cvt.s32.s8 %r9027, %r9026; + cvt.u32.u16 %r9028, %rs5868; + cvt.s32.s8 %r9029, %r9028; + cvt.u32.u16 %r9030, %rs5867; + cvt.s32.s8 %r9031, %r9030; + cvt.u32.u16 %r9032, %rs5866; + cvt.s32.s8 %r9033, %r9032; + mad.lo.s32 %r9034, %r96, %r9033, %r9025; + mad.lo.s32 %r9035, %r97, %r9031, %r9034; + mad.lo.s32 %r9036, %r99, %r9029, %r9035; + mad.lo.s32 %r9037, %r100, %r9027, %r9036; + ld.const.v4.u8 {%rs5874, %rs5875, %rs5876, %rs5877}, [matrix+2936]; + cvt.u32.u16 %r9038, %rs5877; + cvt.s32.s8 %r9039, %r9038; + cvt.u32.u16 %r9040, %rs5876; + cvt.s32.s8 %r9041, %r9040; + cvt.u32.u16 %r9042, %rs5875; + cvt.s32.s8 %r9043, %r9042; + cvt.u32.u16 %r9044, %rs5874; + cvt.s32.s8 %r9045, %r9044; + mad.lo.s32 %r9046, %r103, %r9045, %r9037; + mad.lo.s32 %r9047, %r104, %r9043, %r9046; + mad.lo.s32 %r9048, %r107, %r9041, %r9047; + mad.lo.s32 %r9049, %r108, %r9039, %r9048; + ld.const.v4.u8 {%rs5882, %rs5883, %rs5884, %rs5885}, [matrix+2940]; + cvt.u32.u16 %r9050, %rs5885; + cvt.s32.s8 
%r9051, %r9050; + cvt.u32.u16 %r9052, %rs5884; + cvt.s32.s8 %r9053, %r9052; + cvt.u32.u16 %r9054, %rs5883; + cvt.s32.s8 %r9055, %r9054; + cvt.u32.u16 %r9056, %rs5882; + cvt.s32.s8 %r9057, %r9056; + mad.lo.s32 %r9058, %r111, %r9057, %r9049; + mad.lo.s32 %r9059, %r112, %r9055, %r9058; + mad.lo.s32 %r9060, %r114, %r9053, %r9059; + mad.lo.s32 %r9061, %r115, %r9051, %r9060; + shr.u32 %r9062, %r8869, 6; + and.b32 %r9063, %r9062, 240; + shr.u32 %r9064, %r9061, 10; + or.b32 %r9065, %r9064, %r9063; + xor.b32 %r9066, %r32, %r9065; + cvt.u64.u32 %rd398, %r9066; + ld.const.v4.u8 {%rs5890, %rs5891, %rs5892, %rs5893}, [matrix+2944]; + cvt.u32.u16 %r9067, %rs5893; + cvt.s32.s8 %r9068, %r9067; + cvt.u32.u16 %r9069, %rs5892; + cvt.s32.s8 %r9070, %r9069; + cvt.u32.u16 %r9071, %rs5890; + cvt.s32.s8 %r9072, %r9071; + cvt.u32.u16 %r9073, %rs5891; + cvt.s32.s8 %r9074, %r9073; + mul.lo.s32 %r9075, %r34, %r9074; + mad.lo.s32 %r9076, %r124, %r9072, %r9075; + mad.lo.s32 %r9077, %r35, %r9070, %r9076; + mad.lo.s32 %r9078, %r36, %r9068, %r9077; + ld.const.v4.u8 {%rs5898, %rs5899, %rs5900, %rs5901}, [matrix+2948]; + cvt.u32.u16 %r9079, %rs5901; + cvt.s32.s8 %r9080, %r9079; + cvt.u32.u16 %r9081, %rs5900; + cvt.s32.s8 %r9082, %r9081; + cvt.u32.u16 %r9083, %rs5899; + cvt.s32.s8 %r9084, %r9083; + cvt.u32.u16 %r9085, %rs5898; + cvt.s32.s8 %r9086, %r9085; + mad.lo.s32 %r9087, %r37, %r9086, %r9078; + mad.lo.s32 %r9088, %r38, %r9084, %r9087; + mad.lo.s32 %r9089, %r39, %r9082, %r9088; + mad.lo.s32 %r9090, %r40, %r9080, %r9089; + ld.const.v4.u8 {%rs5906, %rs5907, %rs5908, %rs5909}, [matrix+2952]; + cvt.u32.u16 %r9091, %rs5909; + cvt.s32.s8 %r9092, %r9091; + cvt.u32.u16 %r9093, %rs5908; + cvt.s32.s8 %r9094, %r9093; + cvt.u32.u16 %r9095, %rs5907; + cvt.s32.s8 %r9096, %r9095; + cvt.u32.u16 %r9097, %rs5906; + cvt.s32.s8 %r9098, %r9097; + mad.lo.s32 %r9099, %r42, %r9098, %r9090; + mad.lo.s32 %r9100, %r43, %r9096, %r9099; + mad.lo.s32 %r9101, %r45, %r9094, %r9100; + mad.lo.s32 %r9102, %r46, %r9092, %r9101; + ld.const.v4.u8 {%rs5914, %rs5915, %rs5916, %rs5917}, [matrix+2956]; + cvt.u32.u16 %r9103, %rs5917; + cvt.s32.s8 %r9104, %r9103; + cvt.u32.u16 %r9105, %rs5916; + cvt.s32.s8 %r9106, %r9105; + cvt.u32.u16 %r9107, %rs5915; + cvt.s32.s8 %r9108, %r9107; + cvt.u32.u16 %r9109, %rs5914; + cvt.s32.s8 %r9110, %r9109; + mad.lo.s32 %r9111, %r48, %r9110, %r9102; + mad.lo.s32 %r9112, %r49, %r9108, %r9111; + mad.lo.s32 %r9113, %r50, %r9106, %r9112; + mad.lo.s32 %r9114, %r51, %r9104, %r9113; + ld.const.v4.u8 {%rs5922, %rs5923, %rs5924, %rs5925}, [matrix+2960]; + cvt.u32.u16 %r9115, %rs5925; + cvt.s32.s8 %r9116, %r9115; + cvt.u32.u16 %r9117, %rs5924; + cvt.s32.s8 %r9118, %r9117; + cvt.u32.u16 %r9119, %rs5923; + cvt.s32.s8 %r9120, %r9119; + cvt.u32.u16 %r9121, %rs5922; + cvt.s32.s8 %r9122, %r9121; + mad.lo.s32 %r9123, %r173, %r9122, %r9114; + mad.lo.s32 %r9124, %r53, %r9120, %r9123; + mad.lo.s32 %r9125, %r54, %r9118, %r9124; + mad.lo.s32 %r9126, %r55, %r9116, %r9125; + ld.const.v4.u8 {%rs5930, %rs5931, %rs5932, %rs5933}, [matrix+2964]; + cvt.u32.u16 %r9127, %rs5933; + cvt.s32.s8 %r9128, %r9127; + cvt.u32.u16 %r9129, %rs5932; + cvt.s32.s8 %r9130, %r9129; + cvt.u32.u16 %r9131, %rs5931; + cvt.s32.s8 %r9132, %r9131; + cvt.u32.u16 %r9133, %rs5930; + cvt.s32.s8 %r9134, %r9133; + mad.lo.s32 %r9135, %r56, %r9134, %r9126; + mad.lo.s32 %r9136, %r57, %r9132, %r9135; + mad.lo.s32 %r9137, %r58, %r9130, %r9136; + mad.lo.s32 %r9138, %r59, %r9128, %r9137; + ld.const.v4.u8 {%rs5938, %rs5939, %rs5940, %rs5941}, [matrix+2968]; + cvt.u32.u16 %r9139, %rs5941; + 
cvt.s32.s8 %r9140, %r9139; + cvt.u32.u16 %r9141, %rs5940; + cvt.s32.s8 %r9142, %r9141; + cvt.u32.u16 %r9143, %rs5939; + cvt.s32.s8 %r9144, %r9143; + cvt.u32.u16 %r9145, %rs5938; + cvt.s32.s8 %r9146, %r9145; + mad.lo.s32 %r9147, %r61, %r9146, %r9138; + mad.lo.s32 %r9148, %r62, %r9144, %r9147; + mad.lo.s32 %r9149, %r64, %r9142, %r9148; + mad.lo.s32 %r9150, %r65, %r9140, %r9149; + ld.const.v4.u8 {%rs5946, %rs5947, %rs5948, %rs5949}, [matrix+2972]; + cvt.u32.u16 %r9151, %rs5949; + cvt.s32.s8 %r9152, %r9151; + cvt.u32.u16 %r9153, %rs5948; + cvt.s32.s8 %r9154, %r9153; + cvt.u32.u16 %r9155, %rs5947; + cvt.s32.s8 %r9156, %r9155; + cvt.u32.u16 %r9157, %rs5946; + cvt.s32.s8 %r9158, %r9157; + mad.lo.s32 %r9159, %r67, %r9158, %r9150; + mad.lo.s32 %r9160, %r68, %r9156, %r9159; + mad.lo.s32 %r9161, %r69, %r9154, %r9160; + mad.lo.s32 %r9162, %r70, %r9152, %r9161; + ld.const.v4.u8 {%rs5954, %rs5955, %rs5956, %rs5957}, [matrix+2976]; + cvt.u32.u16 %r9163, %rs5957; + cvt.s32.s8 %r9164, %r9163; + cvt.u32.u16 %r9165, %rs5956; + cvt.s32.s8 %r9166, %r9165; + cvt.u32.u16 %r9167, %rs5955; + cvt.s32.s8 %r9168, %r9167; + cvt.u32.u16 %r9169, %rs5954; + cvt.s32.s8 %r9170, %r9169; + mad.lo.s32 %r9171, %r222, %r9170, %r9162; + mad.lo.s32 %r9172, %r72, %r9168, %r9171; + mad.lo.s32 %r9173, %r73, %r9166, %r9172; + mad.lo.s32 %r9174, %r74, %r9164, %r9173; + ld.const.v4.u8 {%rs5962, %rs5963, %rs5964, %rs5965}, [matrix+2980]; + cvt.u32.u16 %r9175, %rs5965; + cvt.s32.s8 %r9176, %r9175; + cvt.u32.u16 %r9177, %rs5964; + cvt.s32.s8 %r9178, %r9177; + cvt.u32.u16 %r9179, %rs5963; + cvt.s32.s8 %r9180, %r9179; + cvt.u32.u16 %r9181, %rs5962; + cvt.s32.s8 %r9182, %r9181; + mad.lo.s32 %r9183, %r75, %r9182, %r9174; + mad.lo.s32 %r9184, %r76, %r9180, %r9183; + mad.lo.s32 %r9185, %r77, %r9178, %r9184; + mad.lo.s32 %r9186, %r78, %r9176, %r9185; + ld.const.v4.u8 {%rs5970, %rs5971, %rs5972, %rs5973}, [matrix+2984]; + cvt.u32.u16 %r9187, %rs5973; + cvt.s32.s8 %r9188, %r9187; + cvt.u32.u16 %r9189, %rs5972; + cvt.s32.s8 %r9190, %r9189; + cvt.u32.u16 %r9191, %rs5971; + cvt.s32.s8 %r9192, %r9191; + cvt.u32.u16 %r9193, %rs5970; + cvt.s32.s8 %r9194, %r9193; + mad.lo.s32 %r9195, %r80, %r9194, %r9186; + mad.lo.s32 %r9196, %r81, %r9192, %r9195; + mad.lo.s32 %r9197, %r83, %r9190, %r9196; + mad.lo.s32 %r9198, %r84, %r9188, %r9197; + ld.const.v4.u8 {%rs5978, %rs5979, %rs5980, %rs5981}, [matrix+2988]; + cvt.u32.u16 %r9199, %rs5981; + cvt.s32.s8 %r9200, %r9199; + cvt.u32.u16 %r9201, %rs5980; + cvt.s32.s8 %r9202, %r9201; + cvt.u32.u16 %r9203, %rs5979; + cvt.s32.s8 %r9204, %r9203; + cvt.u32.u16 %r9205, %rs5978; + cvt.s32.s8 %r9206, %r9205; + mad.lo.s32 %r9207, %r86, %r9206, %r9198; + mad.lo.s32 %r9208, %r87, %r9204, %r9207; + mad.lo.s32 %r9209, %r88, %r9202, %r9208; + mad.lo.s32 %r9210, %r89, %r9200, %r9209; + ld.const.v4.u8 {%rs5986, %rs5987, %rs5988, %rs5989}, [matrix+2992]; + cvt.u32.u16 %r9211, %rs5989; + cvt.s32.s8 %r9212, %r9211; + cvt.u32.u16 %r9213, %rs5988; + cvt.s32.s8 %r9214, %r9213; + cvt.u32.u16 %r9215, %rs5987; + cvt.s32.s8 %r9216, %r9215; + cvt.u32.u16 %r9217, %rs5986; + cvt.s32.s8 %r9218, %r9217; + mad.lo.s32 %r9219, %r271, %r9218, %r9210; + mad.lo.s32 %r9220, %r91, %r9216, %r9219; + mad.lo.s32 %r9221, %r93, %r9214, %r9220; + mad.lo.s32 %r9222, %r94, %r9212, %r9221; + ld.const.v4.u8 {%rs5994, %rs5995, %rs5996, %rs5997}, [matrix+2996]; + cvt.u32.u16 %r9223, %rs5997; + cvt.s32.s8 %r9224, %r9223; + cvt.u32.u16 %r9225, %rs5996; + cvt.s32.s8 %r9226, %r9225; + cvt.u32.u16 %r9227, %rs5995; + cvt.s32.s8 %r9228, %r9227; + cvt.u32.u16 %r9229, %rs5994; 
+ cvt.s32.s8 %r9230, %r9229; + mad.lo.s32 %r9231, %r96, %r9230, %r9222; + mad.lo.s32 %r9232, %r97, %r9228, %r9231; + mad.lo.s32 %r9233, %r99, %r9226, %r9232; + mad.lo.s32 %r9234, %r100, %r9224, %r9233; + ld.const.v4.u8 {%rs6002, %rs6003, %rs6004, %rs6005}, [matrix+3000]; + cvt.u32.u16 %r9235, %rs6005; + cvt.s32.s8 %r9236, %r9235; + cvt.u32.u16 %r9237, %rs6004; + cvt.s32.s8 %r9238, %r9237; + cvt.u32.u16 %r9239, %rs6003; + cvt.s32.s8 %r9240, %r9239; + cvt.u32.u16 %r9241, %rs6002; + cvt.s32.s8 %r9242, %r9241; + mad.lo.s32 %r9243, %r103, %r9242, %r9234; + mad.lo.s32 %r9244, %r104, %r9240, %r9243; + mad.lo.s32 %r9245, %r107, %r9238, %r9244; + mad.lo.s32 %r9246, %r108, %r9236, %r9245; + ld.const.v4.u8 {%rs6010, %rs6011, %rs6012, %rs6013}, [matrix+3004]; + cvt.u32.u16 %r9247, %rs6013; + cvt.s32.s8 %r9248, %r9247; + cvt.u32.u16 %r9249, %rs6012; + cvt.s32.s8 %r9250, %r9249; + cvt.u32.u16 %r9251, %rs6011; + cvt.s32.s8 %r9252, %r9251; + cvt.u32.u16 %r9253, %rs6010; + cvt.s32.s8 %r9254, %r9253; + mad.lo.s32 %r9255, %r111, %r9254, %r9246; + mad.lo.s32 %r9256, %r112, %r9252, %r9255; + mad.lo.s32 %r9257, %r114, %r9250, %r9256; + mad.lo.s32 %r9258, %r115, %r9248, %r9257; + ld.const.v4.u8 {%rs6018, %rs6019, %rs6020, %rs6021}, [matrix+3008]; + cvt.u32.u16 %r9259, %rs6021; + cvt.s32.s8 %r9260, %r9259; + cvt.u32.u16 %r9261, %rs6020; + cvt.s32.s8 %r9262, %r9261; + cvt.u32.u16 %r9263, %rs6018; + cvt.s32.s8 %r9264, %r9263; + cvt.u32.u16 %r9265, %rs6019; + cvt.s32.s8 %r9266, %r9265; + mul.lo.s32 %r9267, %r34, %r9266; + mad.lo.s32 %r9268, %r124, %r9264, %r9267; + mad.lo.s32 %r9269, %r35, %r9262, %r9268; + mad.lo.s32 %r9270, %r36, %r9260, %r9269; + ld.const.v4.u8 {%rs6026, %rs6027, %rs6028, %rs6029}, [matrix+3012]; + cvt.u32.u16 %r9271, %rs6029; + cvt.s32.s8 %r9272, %r9271; + cvt.u32.u16 %r9273, %rs6028; + cvt.s32.s8 %r9274, %r9273; + cvt.u32.u16 %r9275, %rs6027; + cvt.s32.s8 %r9276, %r9275; + cvt.u32.u16 %r9277, %rs6026; + cvt.s32.s8 %r9278, %r9277; + mad.lo.s32 %r9279, %r37, %r9278, %r9270; + mad.lo.s32 %r9280, %r38, %r9276, %r9279; + mad.lo.s32 %r9281, %r39, %r9274, %r9280; + mad.lo.s32 %r9282, %r40, %r9272, %r9281; + ld.const.v4.u8 {%rs6034, %rs6035, %rs6036, %rs6037}, [matrix+3016]; + cvt.u32.u16 %r9283, %rs6037; + cvt.s32.s8 %r9284, %r9283; + cvt.u32.u16 %r9285, %rs6036; + cvt.s32.s8 %r9286, %r9285; + cvt.u32.u16 %r9287, %rs6035; + cvt.s32.s8 %r9288, %r9287; + cvt.u32.u16 %r9289, %rs6034; + cvt.s32.s8 %r9290, %r9289; + mad.lo.s32 %r9291, %r42, %r9290, %r9282; + mad.lo.s32 %r9292, %r43, %r9288, %r9291; + mad.lo.s32 %r9293, %r45, %r9286, %r9292; + mad.lo.s32 %r9294, %r46, %r9284, %r9293; + ld.const.v4.u8 {%rs6042, %rs6043, %rs6044, %rs6045}, [matrix+3020]; + cvt.u32.u16 %r9295, %rs6045; + cvt.s32.s8 %r9296, %r9295; + cvt.u32.u16 %r9297, %rs6044; + cvt.s32.s8 %r9298, %r9297; + cvt.u32.u16 %r9299, %rs6043; + cvt.s32.s8 %r9300, %r9299; + cvt.u32.u16 %r9301, %rs6042; + cvt.s32.s8 %r9302, %r9301; + mad.lo.s32 %r9303, %r48, %r9302, %r9294; + mad.lo.s32 %r9304, %r49, %r9300, %r9303; + mad.lo.s32 %r9305, %r50, %r9298, %r9304; + mad.lo.s32 %r9306, %r51, %r9296, %r9305; + ld.const.v4.u8 {%rs6050, %rs6051, %rs6052, %rs6053}, [matrix+3024]; + cvt.u32.u16 %r9307, %rs6053; + cvt.s32.s8 %r9308, %r9307; + cvt.u32.u16 %r9309, %rs6052; + cvt.s32.s8 %r9310, %r9309; + cvt.u32.u16 %r9311, %rs6051; + cvt.s32.s8 %r9312, %r9311; + cvt.u32.u16 %r9313, %rs6050; + cvt.s32.s8 %r9314, %r9313; + mad.lo.s32 %r9315, %r173, %r9314, %r9306; + mad.lo.s32 %r9316, %r53, %r9312, %r9315; + mad.lo.s32 %r9317, %r54, %r9310, %r9316; + mad.lo.s32 
%r9318, %r55, %r9308, %r9317; + ld.const.v4.u8 {%rs6058, %rs6059, %rs6060, %rs6061}, [matrix+3028]; + cvt.u32.u16 %r9319, %rs6061; + cvt.s32.s8 %r9320, %r9319; + cvt.u32.u16 %r9321, %rs6060; + cvt.s32.s8 %r9322, %r9321; + cvt.u32.u16 %r9323, %rs6059; + cvt.s32.s8 %r9324, %r9323; + cvt.u32.u16 %r9325, %rs6058; + cvt.s32.s8 %r9326, %r9325; + mad.lo.s32 %r9327, %r56, %r9326, %r9318; + mad.lo.s32 %r9328, %r57, %r9324, %r9327; + mad.lo.s32 %r9329, %r58, %r9322, %r9328; + mad.lo.s32 %r9330, %r59, %r9320, %r9329; + ld.const.v4.u8 {%rs6066, %rs6067, %rs6068, %rs6069}, [matrix+3032]; + cvt.u32.u16 %r9331, %rs6069; + cvt.s32.s8 %r9332, %r9331; + cvt.u32.u16 %r9333, %rs6068; + cvt.s32.s8 %r9334, %r9333; + cvt.u32.u16 %r9335, %rs6067; + cvt.s32.s8 %r9336, %r9335; + cvt.u32.u16 %r9337, %rs6066; + cvt.s32.s8 %r9338, %r9337; + mad.lo.s32 %r9339, %r61, %r9338, %r9330; + mad.lo.s32 %r9340, %r62, %r9336, %r9339; + mad.lo.s32 %r9341, %r64, %r9334, %r9340; + mad.lo.s32 %r9342, %r65, %r9332, %r9341; + ld.const.v4.u8 {%rs6074, %rs6075, %rs6076, %rs6077}, [matrix+3036]; + cvt.u32.u16 %r9343, %rs6077; + cvt.s32.s8 %r9344, %r9343; + cvt.u32.u16 %r9345, %rs6076; + cvt.s32.s8 %r9346, %r9345; + cvt.u32.u16 %r9347, %rs6075; + cvt.s32.s8 %r9348, %r9347; + cvt.u32.u16 %r9349, %rs6074; + cvt.s32.s8 %r9350, %r9349; + mad.lo.s32 %r9351, %r67, %r9350, %r9342; + mad.lo.s32 %r9352, %r68, %r9348, %r9351; + mad.lo.s32 %r9353, %r69, %r9346, %r9352; + mad.lo.s32 %r9354, %r70, %r9344, %r9353; + ld.const.v4.u8 {%rs6082, %rs6083, %rs6084, %rs6085}, [matrix+3040]; + cvt.u32.u16 %r9355, %rs6085; + cvt.s32.s8 %r9356, %r9355; + cvt.u32.u16 %r9357, %rs6084; + cvt.s32.s8 %r9358, %r9357; + cvt.u32.u16 %r9359, %rs6083; + cvt.s32.s8 %r9360, %r9359; + cvt.u32.u16 %r9361, %rs6082; + cvt.s32.s8 %r9362, %r9361; + mad.lo.s32 %r9363, %r222, %r9362, %r9354; + mad.lo.s32 %r9364, %r72, %r9360, %r9363; + mad.lo.s32 %r9365, %r73, %r9358, %r9364; + mad.lo.s32 %r9366, %r74, %r9356, %r9365; + ld.const.v4.u8 {%rs6090, %rs6091, %rs6092, %rs6093}, [matrix+3044]; + cvt.u32.u16 %r9367, %rs6093; + cvt.s32.s8 %r9368, %r9367; + cvt.u32.u16 %r9369, %rs6092; + cvt.s32.s8 %r9370, %r9369; + cvt.u32.u16 %r9371, %rs6091; + cvt.s32.s8 %r9372, %r9371; + cvt.u32.u16 %r9373, %rs6090; + cvt.s32.s8 %r9374, %r9373; + mad.lo.s32 %r9375, %r75, %r9374, %r9366; + mad.lo.s32 %r9376, %r76, %r9372, %r9375; + mad.lo.s32 %r9377, %r77, %r9370, %r9376; + mad.lo.s32 %r9378, %r78, %r9368, %r9377; + ld.const.v4.u8 {%rs6098, %rs6099, %rs6100, %rs6101}, [matrix+3048]; + cvt.u32.u16 %r9379, %rs6101; + cvt.s32.s8 %r9380, %r9379; + cvt.u32.u16 %r9381, %rs6100; + cvt.s32.s8 %r9382, %r9381; + cvt.u32.u16 %r9383, %rs6099; + cvt.s32.s8 %r9384, %r9383; + cvt.u32.u16 %r9385, %rs6098; + cvt.s32.s8 %r9386, %r9385; + mad.lo.s32 %r9387, %r80, %r9386, %r9378; + mad.lo.s32 %r9388, %r81, %r9384, %r9387; + mad.lo.s32 %r9389, %r83, %r9382, %r9388; + mad.lo.s32 %r9390, %r84, %r9380, %r9389; + ld.const.v4.u8 {%rs6106, %rs6107, %rs6108, %rs6109}, [matrix+3052]; + cvt.u32.u16 %r9391, %rs6109; + cvt.s32.s8 %r9392, %r9391; + cvt.u32.u16 %r9393, %rs6108; + cvt.s32.s8 %r9394, %r9393; + cvt.u32.u16 %r9395, %rs6107; + cvt.s32.s8 %r9396, %r9395; + cvt.u32.u16 %r9397, %rs6106; + cvt.s32.s8 %r9398, %r9397; + mad.lo.s32 %r9399, %r86, %r9398, %r9390; + mad.lo.s32 %r9400, %r87, %r9396, %r9399; + mad.lo.s32 %r9401, %r88, %r9394, %r9400; + mad.lo.s32 %r9402, %r89, %r9392, %r9401; + ld.const.v4.u8 {%rs6114, %rs6115, %rs6116, %rs6117}, [matrix+3056]; + cvt.u32.u16 %r9403, %rs6117; + cvt.s32.s8 %r9404, %r9403; + cvt.u32.u16 
%r9405, %rs6116; + cvt.s32.s8 %r9406, %r9405; + cvt.u32.u16 %r9407, %rs6115; + cvt.s32.s8 %r9408, %r9407; + cvt.u32.u16 %r9409, %rs6114; + cvt.s32.s8 %r9410, %r9409; + mad.lo.s32 %r9411, %r271, %r9410, %r9402; + mad.lo.s32 %r9412, %r91, %r9408, %r9411; + mad.lo.s32 %r9413, %r93, %r9406, %r9412; + mad.lo.s32 %r9414, %r94, %r9404, %r9413; + ld.const.v4.u8 {%rs6122, %rs6123, %rs6124, %rs6125}, [matrix+3060]; + cvt.u32.u16 %r9415, %rs6125; + cvt.s32.s8 %r9416, %r9415; + cvt.u32.u16 %r9417, %rs6124; + cvt.s32.s8 %r9418, %r9417; + cvt.u32.u16 %r9419, %rs6123; + cvt.s32.s8 %r9420, %r9419; + cvt.u32.u16 %r9421, %rs6122; + cvt.s32.s8 %r9422, %r9421; + mad.lo.s32 %r9423, %r96, %r9422, %r9414; + mad.lo.s32 %r9424, %r97, %r9420, %r9423; + mad.lo.s32 %r9425, %r99, %r9418, %r9424; + mad.lo.s32 %r9426, %r100, %r9416, %r9425; + ld.const.v4.u8 {%rs6130, %rs6131, %rs6132, %rs6133}, [matrix+3064]; + cvt.u32.u16 %r9427, %rs6133; + cvt.s32.s8 %r9428, %r9427; + cvt.u32.u16 %r9429, %rs6132; + cvt.s32.s8 %r9430, %r9429; + cvt.u32.u16 %r9431, %rs6131; + cvt.s32.s8 %r9432, %r9431; + cvt.u32.u16 %r9433, %rs6130; + cvt.s32.s8 %r9434, %r9433; + mad.lo.s32 %r9435, %r103, %r9434, %r9426; + mad.lo.s32 %r9436, %r104, %r9432, %r9435; + mad.lo.s32 %r9437, %r107, %r9430, %r9436; + mad.lo.s32 %r9438, %r108, %r9428, %r9437; + ld.const.v4.u8 {%rs6138, %rs6139, %rs6140, %rs6141}, [matrix+3068]; + cvt.u32.u16 %r9439, %rs6141; + cvt.s32.s8 %r9440, %r9439; + cvt.u32.u16 %r9441, %rs6140; + cvt.s32.s8 %r9442, %r9441; + cvt.u32.u16 %r9443, %rs6139; + cvt.s32.s8 %r9444, %r9443; + cvt.u32.u16 %r9445, %rs6138; + cvt.s32.s8 %r9446, %r9445; + mad.lo.s32 %r9447, %r111, %r9446, %r9438; + mad.lo.s32 %r9448, %r112, %r9444, %r9447; + mad.lo.s32 %r9449, %r114, %r9442, %r9448; + mad.lo.s32 %r9450, %r115, %r9440, %r9449; + shr.u32 %r9451, %r9258, 6; + and.b32 %r9452, %r9451, 240; + shr.u32 %r9453, %r9450, 10; + or.b32 %r9454, %r9453, %r9452; + xor.b32 %r9455, %r33, %r9454; + ld.const.v4.u8 {%rs6146, %rs6147, %rs6148, %rs6149}, [matrix+3072]; + cvt.u32.u16 %r9456, %rs6149; + cvt.s32.s8 %r9457, %r9456; + cvt.u32.u16 %r9458, %rs6148; + cvt.s32.s8 %r9459, %r9458; + cvt.u32.u16 %r9460, %rs6146; + cvt.s32.s8 %r9461, %r9460; + cvt.u32.u16 %r9462, %rs6147; + cvt.s32.s8 %r9463, %r9462; + mul.lo.s32 %r9464, %r34, %r9463; + mad.lo.s32 %r9465, %r124, %r9461, %r9464; + mad.lo.s32 %r9466, %r35, %r9459, %r9465; + mad.lo.s32 %r9467, %r36, %r9457, %r9466; + ld.const.v4.u8 {%rs6154, %rs6155, %rs6156, %rs6157}, [matrix+3076]; + cvt.u32.u16 %r9468, %rs6157; + cvt.s32.s8 %r9469, %r9468; + cvt.u32.u16 %r9470, %rs6156; + cvt.s32.s8 %r9471, %r9470; + cvt.u32.u16 %r9472, %rs6155; + cvt.s32.s8 %r9473, %r9472; + cvt.u32.u16 %r9474, %rs6154; + cvt.s32.s8 %r9475, %r9474; + mad.lo.s32 %r9476, %r37, %r9475, %r9467; + mad.lo.s32 %r9477, %r38, %r9473, %r9476; + mad.lo.s32 %r9478, %r39, %r9471, %r9477; + mad.lo.s32 %r9479, %r40, %r9469, %r9478; + ld.const.v4.u8 {%rs6162, %rs6163, %rs6164, %rs6165}, [matrix+3080]; + cvt.u32.u16 %r9480, %rs6165; + cvt.s32.s8 %r9481, %r9480; + cvt.u32.u16 %r9482, %rs6164; + cvt.s32.s8 %r9483, %r9482; + cvt.u32.u16 %r9484, %rs6163; + cvt.s32.s8 %r9485, %r9484; + cvt.u32.u16 %r9486, %rs6162; + cvt.s32.s8 %r9487, %r9486; + mad.lo.s32 %r9488, %r42, %r9487, %r9479; + mad.lo.s32 %r9489, %r43, %r9485, %r9488; + mad.lo.s32 %r9490, %r45, %r9483, %r9489; + mad.lo.s32 %r9491, %r46, %r9481, %r9490; + ld.const.v4.u8 {%rs6170, %rs6171, %rs6172, %rs6173}, [matrix+3084]; + cvt.u32.u16 %r9492, %rs6173; + cvt.s32.s8 %r9493, %r9492; + cvt.u32.u16 %r9494, %rs6172; + 
cvt.s32.s8 %r9495, %r9494; + cvt.u32.u16 %r9496, %rs6171; + cvt.s32.s8 %r9497, %r9496; + cvt.u32.u16 %r9498, %rs6170; + cvt.s32.s8 %r9499, %r9498; + mad.lo.s32 %r9500, %r48, %r9499, %r9491; + mad.lo.s32 %r9501, %r49, %r9497, %r9500; + mad.lo.s32 %r9502, %r50, %r9495, %r9501; + mad.lo.s32 %r9503, %r51, %r9493, %r9502; + ld.const.v4.u8 {%rs6178, %rs6179, %rs6180, %rs6181}, [matrix+3088]; + cvt.u32.u16 %r9504, %rs6181; + cvt.s32.s8 %r9505, %r9504; + cvt.u32.u16 %r9506, %rs6180; + cvt.s32.s8 %r9507, %r9506; + cvt.u32.u16 %r9508, %rs6179; + cvt.s32.s8 %r9509, %r9508; + cvt.u32.u16 %r9510, %rs6178; + cvt.s32.s8 %r9511, %r9510; + mad.lo.s32 %r9512, %r173, %r9511, %r9503; + mad.lo.s32 %r9513, %r53, %r9509, %r9512; + mad.lo.s32 %r9514, %r54, %r9507, %r9513; + mad.lo.s32 %r9515, %r55, %r9505, %r9514; + ld.const.v4.u8 {%rs6186, %rs6187, %rs6188, %rs6189}, [matrix+3092]; + cvt.u32.u16 %r9516, %rs6189; + cvt.s32.s8 %r9517, %r9516; + cvt.u32.u16 %r9518, %rs6188; + cvt.s32.s8 %r9519, %r9518; + cvt.u32.u16 %r9520, %rs6187; + cvt.s32.s8 %r9521, %r9520; + cvt.u32.u16 %r9522, %rs6186; + cvt.s32.s8 %r9523, %r9522; + mad.lo.s32 %r9524, %r56, %r9523, %r9515; + mad.lo.s32 %r9525, %r57, %r9521, %r9524; + mad.lo.s32 %r9526, %r58, %r9519, %r9525; + mad.lo.s32 %r9527, %r59, %r9517, %r9526; + ld.const.v4.u8 {%rs6194, %rs6195, %rs6196, %rs6197}, [matrix+3096]; + cvt.u32.u16 %r9528, %rs6197; + cvt.s32.s8 %r9529, %r9528; + cvt.u32.u16 %r9530, %rs6196; + cvt.s32.s8 %r9531, %r9530; + cvt.u32.u16 %r9532, %rs6195; + cvt.s32.s8 %r9533, %r9532; + cvt.u32.u16 %r9534, %rs6194; + cvt.s32.s8 %r9535, %r9534; + mad.lo.s32 %r9536, %r61, %r9535, %r9527; + mad.lo.s32 %r9537, %r62, %r9533, %r9536; + mad.lo.s32 %r9538, %r64, %r9531, %r9537; + mad.lo.s32 %r9539, %r65, %r9529, %r9538; + ld.const.v4.u8 {%rs6202, %rs6203, %rs6204, %rs6205}, [matrix+3100]; + cvt.u32.u16 %r9540, %rs6205; + cvt.s32.s8 %r9541, %r9540; + cvt.u32.u16 %r9542, %rs6204; + cvt.s32.s8 %r9543, %r9542; + cvt.u32.u16 %r9544, %rs6203; + cvt.s32.s8 %r9545, %r9544; + cvt.u32.u16 %r9546, %rs6202; + cvt.s32.s8 %r9547, %r9546; + mad.lo.s32 %r9548, %r67, %r9547, %r9539; + mad.lo.s32 %r9549, %r68, %r9545, %r9548; + mad.lo.s32 %r9550, %r69, %r9543, %r9549; + mad.lo.s32 %r9551, %r70, %r9541, %r9550; + ld.const.v4.u8 {%rs6210, %rs6211, %rs6212, %rs6213}, [matrix+3104]; + cvt.u32.u16 %r9552, %rs6213; + cvt.s32.s8 %r9553, %r9552; + cvt.u32.u16 %r9554, %rs6212; + cvt.s32.s8 %r9555, %r9554; + cvt.u32.u16 %r9556, %rs6211; + cvt.s32.s8 %r9557, %r9556; + cvt.u32.u16 %r9558, %rs6210; + cvt.s32.s8 %r9559, %r9558; + mad.lo.s32 %r9560, %r222, %r9559, %r9551; + mad.lo.s32 %r9561, %r72, %r9557, %r9560; + mad.lo.s32 %r9562, %r73, %r9555, %r9561; + mad.lo.s32 %r9563, %r74, %r9553, %r9562; + ld.const.v4.u8 {%rs6218, %rs6219, %rs6220, %rs6221}, [matrix+3108]; + cvt.u32.u16 %r9564, %rs6221; + cvt.s32.s8 %r9565, %r9564; + cvt.u32.u16 %r9566, %rs6220; + cvt.s32.s8 %r9567, %r9566; + cvt.u32.u16 %r9568, %rs6219; + cvt.s32.s8 %r9569, %r9568; + cvt.u32.u16 %r9570, %rs6218; + cvt.s32.s8 %r9571, %r9570; + mad.lo.s32 %r9572, %r75, %r9571, %r9563; + mad.lo.s32 %r9573, %r76, %r9569, %r9572; + mad.lo.s32 %r9574, %r77, %r9567, %r9573; + mad.lo.s32 %r9575, %r78, %r9565, %r9574; + ld.const.v4.u8 {%rs6226, %rs6227, %rs6228, %rs6229}, [matrix+3112]; + cvt.u32.u16 %r9576, %rs6229; + cvt.s32.s8 %r9577, %r9576; + cvt.u32.u16 %r9578, %rs6228; + cvt.s32.s8 %r9579, %r9578; + cvt.u32.u16 %r9580, %rs6227; + cvt.s32.s8 %r9581, %r9580; + cvt.u32.u16 %r9582, %rs6226; + cvt.s32.s8 %r9583, %r9582; + mad.lo.s32 %r9584, %r80, 
%r9583, %r9575; + mad.lo.s32 %r9585, %r81, %r9581, %r9584; + mad.lo.s32 %r9586, %r83, %r9579, %r9585; + mad.lo.s32 %r9587, %r84, %r9577, %r9586; + ld.const.v4.u8 {%rs6234, %rs6235, %rs6236, %rs6237}, [matrix+3116]; + cvt.u32.u16 %r9588, %rs6237; + cvt.s32.s8 %r9589, %r9588; + cvt.u32.u16 %r9590, %rs6236; + cvt.s32.s8 %r9591, %r9590; + cvt.u32.u16 %r9592, %rs6235; + cvt.s32.s8 %r9593, %r9592; + cvt.u32.u16 %r9594, %rs6234; + cvt.s32.s8 %r9595, %r9594; + mad.lo.s32 %r9596, %r86, %r9595, %r9587; + mad.lo.s32 %r9597, %r87, %r9593, %r9596; + mad.lo.s32 %r9598, %r88, %r9591, %r9597; + mad.lo.s32 %r9599, %r89, %r9589, %r9598; + ld.const.v4.u8 {%rs6242, %rs6243, %rs6244, %rs6245}, [matrix+3120]; + cvt.u32.u16 %r9600, %rs6245; + cvt.s32.s8 %r9601, %r9600; + cvt.u32.u16 %r9602, %rs6244; + cvt.s32.s8 %r9603, %r9602; + cvt.u32.u16 %r9604, %rs6243; + cvt.s32.s8 %r9605, %r9604; + cvt.u32.u16 %r9606, %rs6242; + cvt.s32.s8 %r9607, %r9606; + mad.lo.s32 %r9608, %r271, %r9607, %r9599; + mad.lo.s32 %r9609, %r91, %r9605, %r9608; + mad.lo.s32 %r9610, %r93, %r9603, %r9609; + mad.lo.s32 %r9611, %r94, %r9601, %r9610; + ld.const.v4.u8 {%rs6250, %rs6251, %rs6252, %rs6253}, [matrix+3124]; + cvt.u32.u16 %r9612, %rs6253; + cvt.s32.s8 %r9613, %r9612; + cvt.u32.u16 %r9614, %rs6252; + cvt.s32.s8 %r9615, %r9614; + cvt.u32.u16 %r9616, %rs6251; + cvt.s32.s8 %r9617, %r9616; + cvt.u32.u16 %r9618, %rs6250; + cvt.s32.s8 %r9619, %r9618; + mad.lo.s32 %r9620, %r96, %r9619, %r9611; + mad.lo.s32 %r9621, %r97, %r9617, %r9620; + mad.lo.s32 %r9622, %r99, %r9615, %r9621; + mad.lo.s32 %r9623, %r100, %r9613, %r9622; + ld.const.v4.u8 {%rs6258, %rs6259, %rs6260, %rs6261}, [matrix+3128]; + cvt.u32.u16 %r9624, %rs6261; + cvt.s32.s8 %r9625, %r9624; + cvt.u32.u16 %r9626, %rs6260; + cvt.s32.s8 %r9627, %r9626; + cvt.u32.u16 %r9628, %rs6259; + cvt.s32.s8 %r9629, %r9628; + cvt.u32.u16 %r9630, %rs6258; + cvt.s32.s8 %r9631, %r9630; + mad.lo.s32 %r9632, %r103, %r9631, %r9623; + mad.lo.s32 %r9633, %r104, %r9629, %r9632; + mad.lo.s32 %r9634, %r107, %r9627, %r9633; + mad.lo.s32 %r9635, %r108, %r9625, %r9634; + ld.const.v4.u8 {%rs6266, %rs6267, %rs6268, %rs6269}, [matrix+3132]; + cvt.u32.u16 %r9636, %rs6269; + cvt.s32.s8 %r9637, %r9636; + cvt.u32.u16 %r9638, %rs6268; + cvt.s32.s8 %r9639, %r9638; + cvt.u32.u16 %r9640, %rs6267; + cvt.s32.s8 %r9641, %r9640; + cvt.u32.u16 %r9642, %rs6266; + cvt.s32.s8 %r9643, %r9642; + mad.lo.s32 %r9644, %r111, %r9643, %r9635; + mad.lo.s32 %r9645, %r112, %r9641, %r9644; + mad.lo.s32 %r9646, %r114, %r9639, %r9645; + mad.lo.s32 %r9647, %r115, %r9637, %r9646; + ld.const.v4.u8 {%rs6274, %rs6275, %rs6276, %rs6277}, [matrix+3136]; + cvt.u32.u16 %r9648, %rs6277; + cvt.s32.s8 %r9649, %r9648; + cvt.u32.u16 %r9650, %rs6276; + cvt.s32.s8 %r9651, %r9650; + cvt.u32.u16 %r9652, %rs6274; + cvt.s32.s8 %r9653, %r9652; + cvt.u32.u16 %r9654, %rs6275; + cvt.s32.s8 %r9655, %r9654; + mul.lo.s32 %r9656, %r34, %r9655; + mad.lo.s32 %r9657, %r124, %r9653, %r9656; + mad.lo.s32 %r9658, %r35, %r9651, %r9657; + mad.lo.s32 %r9659, %r36, %r9649, %r9658; + ld.const.v4.u8 {%rs6282, %rs6283, %rs6284, %rs6285}, [matrix+3140]; + cvt.u32.u16 %r9660, %rs6285; + cvt.s32.s8 %r9661, %r9660; + cvt.u32.u16 %r9662, %rs6284; + cvt.s32.s8 %r9663, %r9662; + cvt.u32.u16 %r9664, %rs6283; + cvt.s32.s8 %r9665, %r9664; + cvt.u32.u16 %r9666, %rs6282; + cvt.s32.s8 %r9667, %r9666; + mad.lo.s32 %r9668, %r37, %r9667, %r9659; + mad.lo.s32 %r9669, %r38, %r9665, %r9668; + mad.lo.s32 %r9670, %r39, %r9663, %r9669; + mad.lo.s32 %r9671, %r40, %r9661, %r9670; + ld.const.v4.u8 {%rs6290, 
%rs6291, %rs6292, %rs6293}, [matrix+3144]; + cvt.u32.u16 %r9672, %rs6293; + cvt.s32.s8 %r9673, %r9672; + cvt.u32.u16 %r9674, %rs6292; + cvt.s32.s8 %r9675, %r9674; + cvt.u32.u16 %r9676, %rs6291; + cvt.s32.s8 %r9677, %r9676; + cvt.u32.u16 %r9678, %rs6290; + cvt.s32.s8 %r9679, %r9678; + mad.lo.s32 %r9680, %r42, %r9679, %r9671; + mad.lo.s32 %r9681, %r43, %r9677, %r9680; + mad.lo.s32 %r9682, %r45, %r9675, %r9681; + mad.lo.s32 %r9683, %r46, %r9673, %r9682; + ld.const.v4.u8 {%rs6298, %rs6299, %rs6300, %rs6301}, [matrix+3148]; + cvt.u32.u16 %r9684, %rs6301; + cvt.s32.s8 %r9685, %r9684; + cvt.u32.u16 %r9686, %rs6300; + cvt.s32.s8 %r9687, %r9686; + cvt.u32.u16 %r9688, %rs6299; + cvt.s32.s8 %r9689, %r9688; + cvt.u32.u16 %r9690, %rs6298; + cvt.s32.s8 %r9691, %r9690; + mad.lo.s32 %r9692, %r48, %r9691, %r9683; + mad.lo.s32 %r9693, %r49, %r9689, %r9692; + mad.lo.s32 %r9694, %r50, %r9687, %r9693; + mad.lo.s32 %r9695, %r51, %r9685, %r9694; + ld.const.v4.u8 {%rs6306, %rs6307, %rs6308, %rs6309}, [matrix+3152]; + cvt.u32.u16 %r9696, %rs6309; + cvt.s32.s8 %r9697, %r9696; + cvt.u32.u16 %r9698, %rs6308; + cvt.s32.s8 %r9699, %r9698; + cvt.u32.u16 %r9700, %rs6307; + cvt.s32.s8 %r9701, %r9700; + cvt.u32.u16 %r9702, %rs6306; + cvt.s32.s8 %r9703, %r9702; + mad.lo.s32 %r9704, %r173, %r9703, %r9695; + mad.lo.s32 %r9705, %r53, %r9701, %r9704; + mad.lo.s32 %r9706, %r54, %r9699, %r9705; + mad.lo.s32 %r9707, %r55, %r9697, %r9706; + ld.const.v4.u8 {%rs6314, %rs6315, %rs6316, %rs6317}, [matrix+3156]; + cvt.u32.u16 %r9708, %rs6317; + cvt.s32.s8 %r9709, %r9708; + cvt.u32.u16 %r9710, %rs6316; + cvt.s32.s8 %r9711, %r9710; + cvt.u32.u16 %r9712, %rs6315; + cvt.s32.s8 %r9713, %r9712; + cvt.u32.u16 %r9714, %rs6314; + cvt.s32.s8 %r9715, %r9714; + mad.lo.s32 %r9716, %r56, %r9715, %r9707; + mad.lo.s32 %r9717, %r57, %r9713, %r9716; + mad.lo.s32 %r9718, %r58, %r9711, %r9717; + mad.lo.s32 %r9719, %r59, %r9709, %r9718; + ld.const.v4.u8 {%rs6322, %rs6323, %rs6324, %rs6325}, [matrix+3160]; + cvt.u32.u16 %r9720, %rs6325; + cvt.s32.s8 %r9721, %r9720; + cvt.u32.u16 %r9722, %rs6324; + cvt.s32.s8 %r9723, %r9722; + cvt.u32.u16 %r9724, %rs6323; + cvt.s32.s8 %r9725, %r9724; + cvt.u32.u16 %r9726, %rs6322; + cvt.s32.s8 %r9727, %r9726; + mad.lo.s32 %r9728, %r61, %r9727, %r9719; + mad.lo.s32 %r9729, %r62, %r9725, %r9728; + mad.lo.s32 %r9730, %r64, %r9723, %r9729; + mad.lo.s32 %r9731, %r65, %r9721, %r9730; + ld.const.v4.u8 {%rs6330, %rs6331, %rs6332, %rs6333}, [matrix+3164]; + cvt.u32.u16 %r9732, %rs6333; + cvt.s32.s8 %r9733, %r9732; + cvt.u32.u16 %r9734, %rs6332; + cvt.s32.s8 %r9735, %r9734; + cvt.u32.u16 %r9736, %rs6331; + cvt.s32.s8 %r9737, %r9736; + cvt.u32.u16 %r9738, %rs6330; + cvt.s32.s8 %r9739, %r9738; + mad.lo.s32 %r9740, %r67, %r9739, %r9731; + mad.lo.s32 %r9741, %r68, %r9737, %r9740; + mad.lo.s32 %r9742, %r69, %r9735, %r9741; + mad.lo.s32 %r9743, %r70, %r9733, %r9742; + ld.const.v4.u8 {%rs6338, %rs6339, %rs6340, %rs6341}, [matrix+3168]; + cvt.u32.u16 %r9744, %rs6341; + cvt.s32.s8 %r9745, %r9744; + cvt.u32.u16 %r9746, %rs6340; + cvt.s32.s8 %r9747, %r9746; + cvt.u32.u16 %r9748, %rs6339; + cvt.s32.s8 %r9749, %r9748; + cvt.u32.u16 %r9750, %rs6338; + cvt.s32.s8 %r9751, %r9750; + mad.lo.s32 %r9752, %r222, %r9751, %r9743; + mad.lo.s32 %r9753, %r72, %r9749, %r9752; + mad.lo.s32 %r9754, %r73, %r9747, %r9753; + mad.lo.s32 %r9755, %r74, %r9745, %r9754; + ld.const.v4.u8 {%rs6346, %rs6347, %rs6348, %rs6349}, [matrix+3172]; + cvt.u32.u16 %r9756, %rs6349; + cvt.s32.s8 %r9757, %r9756; + cvt.u32.u16 %r9758, %rs6348; + cvt.s32.s8 %r9759, %r9758; + cvt.u32.u16 
%r9760, %rs6347; + cvt.s32.s8 %r9761, %r9760; + cvt.u32.u16 %r9762, %rs6346; + cvt.s32.s8 %r9763, %r9762; + mad.lo.s32 %r9764, %r75, %r9763, %r9755; + mad.lo.s32 %r9765, %r76, %r9761, %r9764; + mad.lo.s32 %r9766, %r77, %r9759, %r9765; + mad.lo.s32 %r9767, %r78, %r9757, %r9766; + ld.const.v4.u8 {%rs6354, %rs6355, %rs6356, %rs6357}, [matrix+3176]; + cvt.u32.u16 %r9768, %rs6357; + cvt.s32.s8 %r9769, %r9768; + cvt.u32.u16 %r9770, %rs6356; + cvt.s32.s8 %r9771, %r9770; + cvt.u32.u16 %r9772, %rs6355; + cvt.s32.s8 %r9773, %r9772; + cvt.u32.u16 %r9774, %rs6354; + cvt.s32.s8 %r9775, %r9774; + mad.lo.s32 %r9776, %r80, %r9775, %r9767; + mad.lo.s32 %r9777, %r81, %r9773, %r9776; + mad.lo.s32 %r9778, %r83, %r9771, %r9777; + mad.lo.s32 %r9779, %r84, %r9769, %r9778; + ld.const.v4.u8 {%rs6362, %rs6363, %rs6364, %rs6365}, [matrix+3180]; + cvt.u32.u16 %r9780, %rs6365; + cvt.s32.s8 %r9781, %r9780; + cvt.u32.u16 %r9782, %rs6364; + cvt.s32.s8 %r9783, %r9782; + cvt.u32.u16 %r9784, %rs6363; + cvt.s32.s8 %r9785, %r9784; + cvt.u32.u16 %r9786, %rs6362; + cvt.s32.s8 %r9787, %r9786; + mad.lo.s32 %r9788, %r86, %r9787, %r9779; + mad.lo.s32 %r9789, %r87, %r9785, %r9788; + mad.lo.s32 %r9790, %r88, %r9783, %r9789; + mad.lo.s32 %r9791, %r89, %r9781, %r9790; + ld.const.v4.u8 {%rs6370, %rs6371, %rs6372, %rs6373}, [matrix+3184]; + cvt.u32.u16 %r9792, %rs6373; + cvt.s32.s8 %r9793, %r9792; + cvt.u32.u16 %r9794, %rs6372; + cvt.s32.s8 %r9795, %r9794; + cvt.u32.u16 %r9796, %rs6371; + cvt.s32.s8 %r9797, %r9796; + cvt.u32.u16 %r9798, %rs6370; + cvt.s32.s8 %r9799, %r9798; + mad.lo.s32 %r9800, %r271, %r9799, %r9791; + mad.lo.s32 %r9801, %r91, %r9797, %r9800; + mad.lo.s32 %r9802, %r93, %r9795, %r9801; + mad.lo.s32 %r9803, %r94, %r9793, %r9802; + ld.const.v4.u8 {%rs6378, %rs6379, %rs6380, %rs6381}, [matrix+3188]; + cvt.u32.u16 %r9804, %rs6381; + cvt.s32.s8 %r9805, %r9804; + cvt.u32.u16 %r9806, %rs6380; + cvt.s32.s8 %r9807, %r9806; + cvt.u32.u16 %r9808, %rs6379; + cvt.s32.s8 %r9809, %r9808; + cvt.u32.u16 %r9810, %rs6378; + cvt.s32.s8 %r9811, %r9810; + mad.lo.s32 %r9812, %r96, %r9811, %r9803; + mad.lo.s32 %r9813, %r97, %r9809, %r9812; + mad.lo.s32 %r9814, %r99, %r9807, %r9813; + mad.lo.s32 %r9815, %r100, %r9805, %r9814; + ld.const.v4.u8 {%rs6386, %rs6387, %rs6388, %rs6389}, [matrix+3192]; + cvt.u32.u16 %r9816, %rs6389; + cvt.s32.s8 %r9817, %r9816; + cvt.u32.u16 %r9818, %rs6388; + cvt.s32.s8 %r9819, %r9818; + cvt.u32.u16 %r9820, %rs6387; + cvt.s32.s8 %r9821, %r9820; + cvt.u32.u16 %r9822, %rs6386; + cvt.s32.s8 %r9823, %r9822; + mad.lo.s32 %r9824, %r103, %r9823, %r9815; + mad.lo.s32 %r9825, %r104, %r9821, %r9824; + mad.lo.s32 %r9826, %r107, %r9819, %r9825; + mad.lo.s32 %r9827, %r108, %r9817, %r9826; + ld.const.v4.u8 {%rs6394, %rs6395, %rs6396, %rs6397}, [matrix+3196]; + cvt.u32.u16 %r9828, %rs6397; + cvt.s32.s8 %r9829, %r9828; + cvt.u32.u16 %r9830, %rs6396; + cvt.s32.s8 %r9831, %r9830; + cvt.u32.u16 %r9832, %rs6395; + cvt.s32.s8 %r9833, %r9832; + cvt.u32.u16 %r9834, %rs6394; + cvt.s32.s8 %r9835, %r9834; + mad.lo.s32 %r9836, %r111, %r9835, %r9827; + mad.lo.s32 %r9837, %r112, %r9833, %r9836; + mad.lo.s32 %r9838, %r114, %r9831, %r9837; + mad.lo.s32 %r9839, %r115, %r9829, %r9838; + shr.u32 %r9840, %r9647, 6; + and.b32 %r9841, %r9840, 240; + shr.u32 %r9842, %r9839, 10; + or.b32 %r9843, %r9842, %r9841; + xor.b32 %r9844, %r90, %r9843; + ld.const.v4.u8 {%rs6402, %rs6403, %rs6404, %rs6405}, [matrix+3200]; + cvt.u32.u16 %r9845, %rs6405; + cvt.s32.s8 %r9846, %r9845; + cvt.u32.u16 %r9847, %rs6404; + cvt.s32.s8 %r9848, %r9847; + cvt.u32.u16 %r9849, 
%rs6402; + cvt.s32.s8 %r9850, %r9849; + cvt.u32.u16 %r9851, %rs6403; + cvt.s32.s8 %r9852, %r9851; + mul.lo.s32 %r9853, %r34, %r9852; + mad.lo.s32 %r9854, %r124, %r9850, %r9853; + mad.lo.s32 %r9855, %r35, %r9848, %r9854; + mad.lo.s32 %r9856, %r36, %r9846, %r9855; + ld.const.v4.u8 {%rs6410, %rs6411, %rs6412, %rs6413}, [matrix+3204]; + cvt.u32.u16 %r9857, %rs6413; + cvt.s32.s8 %r9858, %r9857; + cvt.u32.u16 %r9859, %rs6412; + cvt.s32.s8 %r9860, %r9859; + cvt.u32.u16 %r9861, %rs6411; + cvt.s32.s8 %r9862, %r9861; + cvt.u32.u16 %r9863, %rs6410; + cvt.s32.s8 %r9864, %r9863; + mad.lo.s32 %r9865, %r37, %r9864, %r9856; + mad.lo.s32 %r9866, %r38, %r9862, %r9865; + mad.lo.s32 %r9867, %r39, %r9860, %r9866; + mad.lo.s32 %r9868, %r40, %r9858, %r9867; + ld.const.v4.u8 {%rs6418, %rs6419, %rs6420, %rs6421}, [matrix+3208]; + cvt.u32.u16 %r9869, %rs6421; + cvt.s32.s8 %r9870, %r9869; + cvt.u32.u16 %r9871, %rs6420; + cvt.s32.s8 %r9872, %r9871; + cvt.u32.u16 %r9873, %rs6419; + cvt.s32.s8 %r9874, %r9873; + cvt.u32.u16 %r9875, %rs6418; + cvt.s32.s8 %r9876, %r9875; + mad.lo.s32 %r9877, %r42, %r9876, %r9868; + mad.lo.s32 %r9878, %r43, %r9874, %r9877; + mad.lo.s32 %r9879, %r45, %r9872, %r9878; + mad.lo.s32 %r9880, %r46, %r9870, %r9879; + ld.const.v4.u8 {%rs6426, %rs6427, %rs6428, %rs6429}, [matrix+3212]; + cvt.u32.u16 %r9881, %rs6429; + cvt.s32.s8 %r9882, %r9881; + cvt.u32.u16 %r9883, %rs6428; + cvt.s32.s8 %r9884, %r9883; + cvt.u32.u16 %r9885, %rs6427; + cvt.s32.s8 %r9886, %r9885; + cvt.u32.u16 %r9887, %rs6426; + cvt.s32.s8 %r9888, %r9887; + mad.lo.s32 %r9889, %r48, %r9888, %r9880; + mad.lo.s32 %r9890, %r49, %r9886, %r9889; + mad.lo.s32 %r9891, %r50, %r9884, %r9890; + mad.lo.s32 %r9892, %r51, %r9882, %r9891; + ld.const.v4.u8 {%rs6434, %rs6435, %rs6436, %rs6437}, [matrix+3216]; + cvt.u32.u16 %r9893, %rs6437; + cvt.s32.s8 %r9894, %r9893; + cvt.u32.u16 %r9895, %rs6436; + cvt.s32.s8 %r9896, %r9895; + cvt.u32.u16 %r9897, %rs6435; + cvt.s32.s8 %r9898, %r9897; + cvt.u32.u16 %r9899, %rs6434; + cvt.s32.s8 %r9900, %r9899; + mad.lo.s32 %r9901, %r173, %r9900, %r9892; + mad.lo.s32 %r9902, %r53, %r9898, %r9901; + mad.lo.s32 %r9903, %r54, %r9896, %r9902; + mad.lo.s32 %r9904, %r55, %r9894, %r9903; + ld.const.v4.u8 {%rs6442, %rs6443, %rs6444, %rs6445}, [matrix+3220]; + cvt.u32.u16 %r9905, %rs6445; + cvt.s32.s8 %r9906, %r9905; + cvt.u32.u16 %r9907, %rs6444; + cvt.s32.s8 %r9908, %r9907; + cvt.u32.u16 %r9909, %rs6443; + cvt.s32.s8 %r9910, %r9909; + cvt.u32.u16 %r9911, %rs6442; + cvt.s32.s8 %r9912, %r9911; + mad.lo.s32 %r9913, %r56, %r9912, %r9904; + mad.lo.s32 %r9914, %r57, %r9910, %r9913; + mad.lo.s32 %r9915, %r58, %r9908, %r9914; + mad.lo.s32 %r9916, %r59, %r9906, %r9915; + ld.const.v4.u8 {%rs6450, %rs6451, %rs6452, %rs6453}, [matrix+3224]; + cvt.u32.u16 %r9917, %rs6453; + cvt.s32.s8 %r9918, %r9917; + cvt.u32.u16 %r9919, %rs6452; + cvt.s32.s8 %r9920, %r9919; + cvt.u32.u16 %r9921, %rs6451; + cvt.s32.s8 %r9922, %r9921; + cvt.u32.u16 %r9923, %rs6450; + cvt.s32.s8 %r9924, %r9923; + mad.lo.s32 %r9925, %r61, %r9924, %r9916; + mad.lo.s32 %r9926, %r62, %r9922, %r9925; + mad.lo.s32 %r9927, %r64, %r9920, %r9926; + mad.lo.s32 %r9928, %r65, %r9918, %r9927; + ld.const.v4.u8 {%rs6458, %rs6459, %rs6460, %rs6461}, [matrix+3228]; + cvt.u32.u16 %r9929, %rs6461; + cvt.s32.s8 %r9930, %r9929; + cvt.u32.u16 %r9931, %rs6460; + cvt.s32.s8 %r9932, %r9931; + cvt.u32.u16 %r9933, %rs6459; + cvt.s32.s8 %r9934, %r9933; + cvt.u32.u16 %r9935, %rs6458; + cvt.s32.s8 %r9936, %r9935; + mad.lo.s32 %r9937, %r67, %r9936, %r9928; + mad.lo.s32 %r9938, %r68, %r9934, %r9937; + 
mad.lo.s32 %r9939, %r69, %r9932, %r9938; + mad.lo.s32 %r9940, %r70, %r9930, %r9939; + ld.const.v4.u8 {%rs6466, %rs6467, %rs6468, %rs6469}, [matrix+3232]; + cvt.u32.u16 %r9941, %rs6469; + cvt.s32.s8 %r9942, %r9941; + cvt.u32.u16 %r9943, %rs6468; + cvt.s32.s8 %r9944, %r9943; + cvt.u32.u16 %r9945, %rs6467; + cvt.s32.s8 %r9946, %r9945; + cvt.u32.u16 %r9947, %rs6466; + cvt.s32.s8 %r9948, %r9947; + mad.lo.s32 %r9949, %r222, %r9948, %r9940; + mad.lo.s32 %r9950, %r72, %r9946, %r9949; + mad.lo.s32 %r9951, %r73, %r9944, %r9950; + mad.lo.s32 %r9952, %r74, %r9942, %r9951; + ld.const.v4.u8 {%rs6474, %rs6475, %rs6476, %rs6477}, [matrix+3236]; + cvt.u32.u16 %r9953, %rs6477; + cvt.s32.s8 %r9954, %r9953; + cvt.u32.u16 %r9955, %rs6476; + cvt.s32.s8 %r9956, %r9955; + cvt.u32.u16 %r9957, %rs6475; + cvt.s32.s8 %r9958, %r9957; + cvt.u32.u16 %r9959, %rs6474; + cvt.s32.s8 %r9960, %r9959; + mad.lo.s32 %r9961, %r75, %r9960, %r9952; + mad.lo.s32 %r9962, %r76, %r9958, %r9961; + mad.lo.s32 %r9963, %r77, %r9956, %r9962; + mad.lo.s32 %r9964, %r78, %r9954, %r9963; + ld.const.v4.u8 {%rs6482, %rs6483, %rs6484, %rs6485}, [matrix+3240]; + cvt.u32.u16 %r9965, %rs6485; + cvt.s32.s8 %r9966, %r9965; + cvt.u32.u16 %r9967, %rs6484; + cvt.s32.s8 %r9968, %r9967; + cvt.u32.u16 %r9969, %rs6483; + cvt.s32.s8 %r9970, %r9969; + cvt.u32.u16 %r9971, %rs6482; + cvt.s32.s8 %r9972, %r9971; + mad.lo.s32 %r9973, %r80, %r9972, %r9964; + mad.lo.s32 %r9974, %r81, %r9970, %r9973; + mad.lo.s32 %r9975, %r83, %r9968, %r9974; + mad.lo.s32 %r9976, %r84, %r9966, %r9975; + ld.const.v4.u8 {%rs6490, %rs6491, %rs6492, %rs6493}, [matrix+3244]; + cvt.u32.u16 %r9977, %rs6493; + cvt.s32.s8 %r9978, %r9977; + cvt.u32.u16 %r9979, %rs6492; + cvt.s32.s8 %r9980, %r9979; + cvt.u32.u16 %r9981, %rs6491; + cvt.s32.s8 %r9982, %r9981; + cvt.u32.u16 %r9983, %rs6490; + cvt.s32.s8 %r9984, %r9983; + mad.lo.s32 %r9985, %r86, %r9984, %r9976; + mad.lo.s32 %r9986, %r87, %r9982, %r9985; + mad.lo.s32 %r9987, %r88, %r9980, %r9986; + mad.lo.s32 %r9988, %r89, %r9978, %r9987; + ld.const.v4.u8 {%rs6498, %rs6499, %rs6500, %rs6501}, [matrix+3248]; + cvt.u32.u16 %r9989, %rs6501; + cvt.s32.s8 %r9990, %r9989; + cvt.u32.u16 %r9991, %rs6500; + cvt.s32.s8 %r9992, %r9991; + cvt.u32.u16 %r9993, %rs6499; + cvt.s32.s8 %r9994, %r9993; + cvt.u32.u16 %r9995, %rs6498; + cvt.s32.s8 %r9996, %r9995; + mad.lo.s32 %r9997, %r271, %r9996, %r9988; + mad.lo.s32 %r9998, %r91, %r9994, %r9997; + mad.lo.s32 %r9999, %r93, %r9992, %r9998; + mad.lo.s32 %r10000, %r94, %r9990, %r9999; + ld.const.v4.u8 {%rs6506, %rs6507, %rs6508, %rs6509}, [matrix+3252]; + cvt.u32.u16 %r10001, %rs6509; + cvt.s32.s8 %r10002, %r10001; + cvt.u32.u16 %r10003, %rs6508; + cvt.s32.s8 %r10004, %r10003; + cvt.u32.u16 %r10005, %rs6507; + cvt.s32.s8 %r10006, %r10005; + cvt.u32.u16 %r10007, %rs6506; + cvt.s32.s8 %r10008, %r10007; + mad.lo.s32 %r10009, %r96, %r10008, %r10000; + mad.lo.s32 %r10010, %r97, %r10006, %r10009; + mad.lo.s32 %r10011, %r99, %r10004, %r10010; + mad.lo.s32 %r10012, %r100, %r10002, %r10011; + ld.const.v4.u8 {%rs6514, %rs6515, %rs6516, %rs6517}, [matrix+3256]; + cvt.u32.u16 %r10013, %rs6517; + cvt.s32.s8 %r10014, %r10013; + cvt.u32.u16 %r10015, %rs6516; + cvt.s32.s8 %r10016, %r10015; + cvt.u32.u16 %r10017, %rs6515; + cvt.s32.s8 %r10018, %r10017; + cvt.u32.u16 %r10019, %rs6514; + cvt.s32.s8 %r10020, %r10019; + mad.lo.s32 %r10021, %r103, %r10020, %r10012; + mad.lo.s32 %r10022, %r104, %r10018, %r10021; + mad.lo.s32 %r10023, %r107, %r10016, %r10022; + mad.lo.s32 %r10024, %r108, %r10014, %r10023; + ld.const.v4.u8 {%rs6522, %rs6523, 
%rs6524, %rs6525}, [matrix+3260]; + cvt.u32.u16 %r10025, %rs6525; + cvt.s32.s8 %r10026, %r10025; + cvt.u32.u16 %r10027, %rs6524; + cvt.s32.s8 %r10028, %r10027; + cvt.u32.u16 %r10029, %rs6523; + cvt.s32.s8 %r10030, %r10029; + cvt.u32.u16 %r10031, %rs6522; + cvt.s32.s8 %r10032, %r10031; + mad.lo.s32 %r10033, %r111, %r10032, %r10024; + mad.lo.s32 %r10034, %r112, %r10030, %r10033; + mad.lo.s32 %r10035, %r114, %r10028, %r10034; + mad.lo.s32 %r10036, %r115, %r10026, %r10035; + ld.const.v4.u8 {%rs6530, %rs6531, %rs6532, %rs6533}, [matrix+3264]; + cvt.u32.u16 %r10037, %rs6533; + cvt.s32.s8 %r10038, %r10037; + cvt.u32.u16 %r10039, %rs6532; + cvt.s32.s8 %r10040, %r10039; + cvt.u32.u16 %r10041, %rs6530; + cvt.s32.s8 %r10042, %r10041; + cvt.u32.u16 %r10043, %rs6531; + cvt.s32.s8 %r10044, %r10043; + mul.lo.s32 %r10045, %r34, %r10044; + mad.lo.s32 %r10046, %r124, %r10042, %r10045; + mad.lo.s32 %r10047, %r35, %r10040, %r10046; + mad.lo.s32 %r10048, %r36, %r10038, %r10047; + ld.const.v4.u8 {%rs6538, %rs6539, %rs6540, %rs6541}, [matrix+3268]; + cvt.u32.u16 %r10049, %rs6541; + cvt.s32.s8 %r10050, %r10049; + cvt.u32.u16 %r10051, %rs6540; + cvt.s32.s8 %r10052, %r10051; + cvt.u32.u16 %r10053, %rs6539; + cvt.s32.s8 %r10054, %r10053; + cvt.u32.u16 %r10055, %rs6538; + cvt.s32.s8 %r10056, %r10055; + mad.lo.s32 %r10057, %r37, %r10056, %r10048; + mad.lo.s32 %r10058, %r38, %r10054, %r10057; + mad.lo.s32 %r10059, %r39, %r10052, %r10058; + mad.lo.s32 %r10060, %r40, %r10050, %r10059; + ld.const.v4.u8 {%rs6546, %rs6547, %rs6548, %rs6549}, [matrix+3272]; + cvt.u32.u16 %r10061, %rs6549; + cvt.s32.s8 %r10062, %r10061; + cvt.u32.u16 %r10063, %rs6548; + cvt.s32.s8 %r10064, %r10063; + cvt.u32.u16 %r10065, %rs6547; + cvt.s32.s8 %r10066, %r10065; + cvt.u32.u16 %r10067, %rs6546; + cvt.s32.s8 %r10068, %r10067; + mad.lo.s32 %r10069, %r42, %r10068, %r10060; + mad.lo.s32 %r10070, %r43, %r10066, %r10069; + mad.lo.s32 %r10071, %r45, %r10064, %r10070; + mad.lo.s32 %r10072, %r46, %r10062, %r10071; + ld.const.v4.u8 {%rs6554, %rs6555, %rs6556, %rs6557}, [matrix+3276]; + cvt.u32.u16 %r10073, %rs6557; + cvt.s32.s8 %r10074, %r10073; + cvt.u32.u16 %r10075, %rs6556; + cvt.s32.s8 %r10076, %r10075; + cvt.u32.u16 %r10077, %rs6555; + cvt.s32.s8 %r10078, %r10077; + cvt.u32.u16 %r10079, %rs6554; + cvt.s32.s8 %r10080, %r10079; + mad.lo.s32 %r10081, %r48, %r10080, %r10072; + mad.lo.s32 %r10082, %r49, %r10078, %r10081; + mad.lo.s32 %r10083, %r50, %r10076, %r10082; + mad.lo.s32 %r10084, %r51, %r10074, %r10083; + ld.const.v4.u8 {%rs6562, %rs6563, %rs6564, %rs6565}, [matrix+3280]; + cvt.u32.u16 %r10085, %rs6565; + cvt.s32.s8 %r10086, %r10085; + cvt.u32.u16 %r10087, %rs6564; + cvt.s32.s8 %r10088, %r10087; + cvt.u32.u16 %r10089, %rs6563; + cvt.s32.s8 %r10090, %r10089; + cvt.u32.u16 %r10091, %rs6562; + cvt.s32.s8 %r10092, %r10091; + mad.lo.s32 %r10093, %r173, %r10092, %r10084; + mad.lo.s32 %r10094, %r53, %r10090, %r10093; + mad.lo.s32 %r10095, %r54, %r10088, %r10094; + mad.lo.s32 %r10096, %r55, %r10086, %r10095; + ld.const.v4.u8 {%rs6570, %rs6571, %rs6572, %rs6573}, [matrix+3284]; + cvt.u32.u16 %r10097, %rs6573; + cvt.s32.s8 %r10098, %r10097; + cvt.u32.u16 %r10099, %rs6572; + cvt.s32.s8 %r10100, %r10099; + cvt.u32.u16 %r10101, %rs6571; + cvt.s32.s8 %r10102, %r10101; + cvt.u32.u16 %r10103, %rs6570; + cvt.s32.s8 %r10104, %r10103; + mad.lo.s32 %r10105, %r56, %r10104, %r10096; + mad.lo.s32 %r10106, %r57, %r10102, %r10105; + mad.lo.s32 %r10107, %r58, %r10100, %r10106; + mad.lo.s32 %r10108, %r59, %r10098, %r10107; + ld.const.v4.u8 {%rs6578, %rs6579, %rs6580, 
%rs6581}, [matrix+3288]; + cvt.u32.u16 %r10109, %rs6581; + cvt.s32.s8 %r10110, %r10109; + cvt.u32.u16 %r10111, %rs6580; + cvt.s32.s8 %r10112, %r10111; + cvt.u32.u16 %r10113, %rs6579; + cvt.s32.s8 %r10114, %r10113; + cvt.u32.u16 %r10115, %rs6578; + cvt.s32.s8 %r10116, %r10115; + mad.lo.s32 %r10117, %r61, %r10116, %r10108; + mad.lo.s32 %r10118, %r62, %r10114, %r10117; + mad.lo.s32 %r10119, %r64, %r10112, %r10118; + mad.lo.s32 %r10120, %r65, %r10110, %r10119; + ld.const.v4.u8 {%rs6586, %rs6587, %rs6588, %rs6589}, [matrix+3292]; + cvt.u32.u16 %r10121, %rs6589; + cvt.s32.s8 %r10122, %r10121; + cvt.u32.u16 %r10123, %rs6588; + cvt.s32.s8 %r10124, %r10123; + cvt.u32.u16 %r10125, %rs6587; + cvt.s32.s8 %r10126, %r10125; + cvt.u32.u16 %r10127, %rs6586; + cvt.s32.s8 %r10128, %r10127; + mad.lo.s32 %r10129, %r67, %r10128, %r10120; + mad.lo.s32 %r10130, %r68, %r10126, %r10129; + mad.lo.s32 %r10131, %r69, %r10124, %r10130; + mad.lo.s32 %r10132, %r70, %r10122, %r10131; + ld.const.v4.u8 {%rs6594, %rs6595, %rs6596, %rs6597}, [matrix+3296]; + cvt.u32.u16 %r10133, %rs6597; + cvt.s32.s8 %r10134, %r10133; + cvt.u32.u16 %r10135, %rs6596; + cvt.s32.s8 %r10136, %r10135; + cvt.u32.u16 %r10137, %rs6595; + cvt.s32.s8 %r10138, %r10137; + cvt.u32.u16 %r10139, %rs6594; + cvt.s32.s8 %r10140, %r10139; + mad.lo.s32 %r10141, %r222, %r10140, %r10132; + mad.lo.s32 %r10142, %r72, %r10138, %r10141; + mad.lo.s32 %r10143, %r73, %r10136, %r10142; + mad.lo.s32 %r10144, %r74, %r10134, %r10143; + ld.const.v4.u8 {%rs6602, %rs6603, %rs6604, %rs6605}, [matrix+3300]; + cvt.u32.u16 %r10145, %rs6605; + cvt.s32.s8 %r10146, %r10145; + cvt.u32.u16 %r10147, %rs6604; + cvt.s32.s8 %r10148, %r10147; + cvt.u32.u16 %r10149, %rs6603; + cvt.s32.s8 %r10150, %r10149; + cvt.u32.u16 %r10151, %rs6602; + cvt.s32.s8 %r10152, %r10151; + mad.lo.s32 %r10153, %r75, %r10152, %r10144; + mad.lo.s32 %r10154, %r76, %r10150, %r10153; + mad.lo.s32 %r10155, %r77, %r10148, %r10154; + mad.lo.s32 %r10156, %r78, %r10146, %r10155; + ld.const.v4.u8 {%rs6610, %rs6611, %rs6612, %rs6613}, [matrix+3304]; + cvt.u32.u16 %r10157, %rs6613; + cvt.s32.s8 %r10158, %r10157; + cvt.u32.u16 %r10159, %rs6612; + cvt.s32.s8 %r10160, %r10159; + cvt.u32.u16 %r10161, %rs6611; + cvt.s32.s8 %r10162, %r10161; + cvt.u32.u16 %r10163, %rs6610; + cvt.s32.s8 %r10164, %r10163; + mad.lo.s32 %r10165, %r80, %r10164, %r10156; + mad.lo.s32 %r10166, %r81, %r10162, %r10165; + mad.lo.s32 %r10167, %r83, %r10160, %r10166; + mad.lo.s32 %r10168, %r84, %r10158, %r10167; + ld.const.v4.u8 {%rs6618, %rs6619, %rs6620, %rs6621}, [matrix+3308]; + cvt.u32.u16 %r10169, %rs6621; + cvt.s32.s8 %r10170, %r10169; + cvt.u32.u16 %r10171, %rs6620; + cvt.s32.s8 %r10172, %r10171; + cvt.u32.u16 %r10173, %rs6619; + cvt.s32.s8 %r10174, %r10173; + cvt.u32.u16 %r10175, %rs6618; + cvt.s32.s8 %r10176, %r10175; + mad.lo.s32 %r10177, %r86, %r10176, %r10168; + mad.lo.s32 %r10178, %r87, %r10174, %r10177; + mad.lo.s32 %r10179, %r88, %r10172, %r10178; + mad.lo.s32 %r10180, %r89, %r10170, %r10179; + ld.const.v4.u8 {%rs6626, %rs6627, %rs6628, %rs6629}, [matrix+3312]; + cvt.u32.u16 %r10181, %rs6629; + cvt.s32.s8 %r10182, %r10181; + cvt.u32.u16 %r10183, %rs6628; + cvt.s32.s8 %r10184, %r10183; + cvt.u32.u16 %r10185, %rs6627; + cvt.s32.s8 %r10186, %r10185; + cvt.u32.u16 %r10187, %rs6626; + cvt.s32.s8 %r10188, %r10187; + mad.lo.s32 %r10189, %r271, %r10188, %r10180; + mad.lo.s32 %r10190, %r91, %r10186, %r10189; + mad.lo.s32 %r10191, %r93, %r10184, %r10190; + mad.lo.s32 %r10192, %r94, %r10182, %r10191; + ld.const.v4.u8 {%rs6634, %rs6635, %rs6636, %rs6637}, 
[matrix+3316]; + cvt.u32.u16 %r10193, %rs6637; + cvt.s32.s8 %r10194, %r10193; + cvt.u32.u16 %r10195, %rs6636; + cvt.s32.s8 %r10196, %r10195; + cvt.u32.u16 %r10197, %rs6635; + cvt.s32.s8 %r10198, %r10197; + cvt.u32.u16 %r10199, %rs6634; + cvt.s32.s8 %r10200, %r10199; + mad.lo.s32 %r10201, %r96, %r10200, %r10192; + mad.lo.s32 %r10202, %r97, %r10198, %r10201; + mad.lo.s32 %r10203, %r99, %r10196, %r10202; + mad.lo.s32 %r10204, %r100, %r10194, %r10203; + ld.const.v4.u8 {%rs6642, %rs6643, %rs6644, %rs6645}, [matrix+3320]; + cvt.u32.u16 %r10205, %rs6645; + cvt.s32.s8 %r10206, %r10205; + cvt.u32.u16 %r10207, %rs6644; + cvt.s32.s8 %r10208, %r10207; + cvt.u32.u16 %r10209, %rs6643; + cvt.s32.s8 %r10210, %r10209; + cvt.u32.u16 %r10211, %rs6642; + cvt.s32.s8 %r10212, %r10211; + mad.lo.s32 %r10213, %r103, %r10212, %r10204; + mad.lo.s32 %r10214, %r104, %r10210, %r10213; + mad.lo.s32 %r10215, %r107, %r10208, %r10214; + mad.lo.s32 %r10216, %r108, %r10206, %r10215; + ld.const.v4.u8 {%rs6650, %rs6651, %rs6652, %rs6653}, [matrix+3324]; + cvt.u32.u16 %r10217, %rs6653; + cvt.s32.s8 %r10218, %r10217; + cvt.u32.u16 %r10219, %rs6652; + cvt.s32.s8 %r10220, %r10219; + cvt.u32.u16 %r10221, %rs6651; + cvt.s32.s8 %r10222, %r10221; + cvt.u32.u16 %r10223, %rs6650; + cvt.s32.s8 %r10224, %r10223; + mad.lo.s32 %r10225, %r111, %r10224, %r10216; + mad.lo.s32 %r10226, %r112, %r10222, %r10225; + mad.lo.s32 %r10227, %r114, %r10220, %r10226; + mad.lo.s32 %r10228, %r115, %r10218, %r10227; + shr.u32 %r10229, %r10036, 6; + and.b32 %r10230, %r10229, 240; + shr.u32 %r10231, %r10228, 10; + or.b32 %r10232, %r10231, %r10230; + xor.b32 %r10233, %r92, %r10232; + cvt.u64.u32 %rd399, %r10233; + ld.const.v4.u8 {%rs6658, %rs6659, %rs6660, %rs6661}, [matrix+3328]; + cvt.u32.u16 %r10234, %rs6661; + cvt.s32.s8 %r10235, %r10234; + cvt.u32.u16 %r10236, %rs6660; + cvt.s32.s8 %r10237, %r10236; + cvt.u32.u16 %r10238, %rs6658; + cvt.s32.s8 %r10239, %r10238; + cvt.u32.u16 %r10240, %rs6659; + cvt.s32.s8 %r10241, %r10240; + mul.lo.s32 %r10242, %r34, %r10241; + mad.lo.s32 %r10243, %r124, %r10239, %r10242; + mad.lo.s32 %r10244, %r35, %r10237, %r10243; + mad.lo.s32 %r10245, %r36, %r10235, %r10244; + ld.const.v4.u8 {%rs6666, %rs6667, %rs6668, %rs6669}, [matrix+3332]; + cvt.u32.u16 %r10246, %rs6669; + cvt.s32.s8 %r10247, %r10246; + cvt.u32.u16 %r10248, %rs6668; + cvt.s32.s8 %r10249, %r10248; + cvt.u32.u16 %r10250, %rs6667; + cvt.s32.s8 %r10251, %r10250; + cvt.u32.u16 %r10252, %rs6666; + cvt.s32.s8 %r10253, %r10252; + mad.lo.s32 %r10254, %r37, %r10253, %r10245; + mad.lo.s32 %r10255, %r38, %r10251, %r10254; + mad.lo.s32 %r10256, %r39, %r10249, %r10255; + mad.lo.s32 %r10257, %r40, %r10247, %r10256; + ld.const.v4.u8 {%rs6674, %rs6675, %rs6676, %rs6677}, [matrix+3336]; + cvt.u32.u16 %r10258, %rs6677; + cvt.s32.s8 %r10259, %r10258; + cvt.u32.u16 %r10260, %rs6676; + cvt.s32.s8 %r10261, %r10260; + cvt.u32.u16 %r10262, %rs6675; + cvt.s32.s8 %r10263, %r10262; + cvt.u32.u16 %r10264, %rs6674; + cvt.s32.s8 %r10265, %r10264; + mad.lo.s32 %r10266, %r42, %r10265, %r10257; + mad.lo.s32 %r10267, %r43, %r10263, %r10266; + mad.lo.s32 %r10268, %r45, %r10261, %r10267; + mad.lo.s32 %r10269, %r46, %r10259, %r10268; + ld.const.v4.u8 {%rs6682, %rs6683, %rs6684, %rs6685}, [matrix+3340]; + cvt.u32.u16 %r10270, %rs6685; + cvt.s32.s8 %r10271, %r10270; + cvt.u32.u16 %r10272, %rs6684; + cvt.s32.s8 %r10273, %r10272; + cvt.u32.u16 %r10274, %rs6683; + cvt.s32.s8 %r10275, %r10274; + cvt.u32.u16 %r10276, %rs6682; + cvt.s32.s8 %r10277, %r10276; + mad.lo.s32 %r10278, %r48, %r10277, %r10269; + 
mad.lo.s32 %r10279, %r49, %r10275, %r10278; + mad.lo.s32 %r10280, %r50, %r10273, %r10279; + mad.lo.s32 %r10281, %r51, %r10271, %r10280; + ld.const.v4.u8 {%rs6690, %rs6691, %rs6692, %rs6693}, [matrix+3344]; + cvt.u32.u16 %r10282, %rs6693; + cvt.s32.s8 %r10283, %r10282; + cvt.u32.u16 %r10284, %rs6692; + cvt.s32.s8 %r10285, %r10284; + cvt.u32.u16 %r10286, %rs6691; + cvt.s32.s8 %r10287, %r10286; + cvt.u32.u16 %r10288, %rs6690; + cvt.s32.s8 %r10289, %r10288; + mad.lo.s32 %r10290, %r173, %r10289, %r10281; + mad.lo.s32 %r10291, %r53, %r10287, %r10290; + mad.lo.s32 %r10292, %r54, %r10285, %r10291; + mad.lo.s32 %r10293, %r55, %r10283, %r10292; + ld.const.v4.u8 {%rs6698, %rs6699, %rs6700, %rs6701}, [matrix+3348]; + cvt.u32.u16 %r10294, %rs6701; + cvt.s32.s8 %r10295, %r10294; + cvt.u32.u16 %r10296, %rs6700; + cvt.s32.s8 %r10297, %r10296; + cvt.u32.u16 %r10298, %rs6699; + cvt.s32.s8 %r10299, %r10298; + cvt.u32.u16 %r10300, %rs6698; + cvt.s32.s8 %r10301, %r10300; + mad.lo.s32 %r10302, %r56, %r10301, %r10293; + mad.lo.s32 %r10303, %r57, %r10299, %r10302; + mad.lo.s32 %r10304, %r58, %r10297, %r10303; + mad.lo.s32 %r10305, %r59, %r10295, %r10304; + ld.const.v4.u8 {%rs6706, %rs6707, %rs6708, %rs6709}, [matrix+3352]; + cvt.u32.u16 %r10306, %rs6709; + cvt.s32.s8 %r10307, %r10306; + cvt.u32.u16 %r10308, %rs6708; + cvt.s32.s8 %r10309, %r10308; + cvt.u32.u16 %r10310, %rs6707; + cvt.s32.s8 %r10311, %r10310; + cvt.u32.u16 %r10312, %rs6706; + cvt.s32.s8 %r10313, %r10312; + mad.lo.s32 %r10314, %r61, %r10313, %r10305; + mad.lo.s32 %r10315, %r62, %r10311, %r10314; + mad.lo.s32 %r10316, %r64, %r10309, %r10315; + mad.lo.s32 %r10317, %r65, %r10307, %r10316; + ld.const.v4.u8 {%rs6714, %rs6715, %rs6716, %rs6717}, [matrix+3356]; + cvt.u32.u16 %r10318, %rs6717; + cvt.s32.s8 %r10319, %r10318; + cvt.u32.u16 %r10320, %rs6716; + cvt.s32.s8 %r10321, %r10320; + cvt.u32.u16 %r10322, %rs6715; + cvt.s32.s8 %r10323, %r10322; + cvt.u32.u16 %r10324, %rs6714; + cvt.s32.s8 %r10325, %r10324; + mad.lo.s32 %r10326, %r67, %r10325, %r10317; + mad.lo.s32 %r10327, %r68, %r10323, %r10326; + mad.lo.s32 %r10328, %r69, %r10321, %r10327; + mad.lo.s32 %r10329, %r70, %r10319, %r10328; + ld.const.v4.u8 {%rs6722, %rs6723, %rs6724, %rs6725}, [matrix+3360]; + cvt.u32.u16 %r10330, %rs6725; + cvt.s32.s8 %r10331, %r10330; + cvt.u32.u16 %r10332, %rs6724; + cvt.s32.s8 %r10333, %r10332; + cvt.u32.u16 %r10334, %rs6723; + cvt.s32.s8 %r10335, %r10334; + cvt.u32.u16 %r10336, %rs6722; + cvt.s32.s8 %r10337, %r10336; + mad.lo.s32 %r10338, %r222, %r10337, %r10329; + mad.lo.s32 %r10339, %r72, %r10335, %r10338; + mad.lo.s32 %r10340, %r73, %r10333, %r10339; + mad.lo.s32 %r10341, %r74, %r10331, %r10340; + ld.const.v4.u8 {%rs6730, %rs6731, %rs6732, %rs6733}, [matrix+3364]; + cvt.u32.u16 %r10342, %rs6733; + cvt.s32.s8 %r10343, %r10342; + cvt.u32.u16 %r10344, %rs6732; + cvt.s32.s8 %r10345, %r10344; + cvt.u32.u16 %r10346, %rs6731; + cvt.s32.s8 %r10347, %r10346; + cvt.u32.u16 %r10348, %rs6730; + cvt.s32.s8 %r10349, %r10348; + mad.lo.s32 %r10350, %r75, %r10349, %r10341; + mad.lo.s32 %r10351, %r76, %r10347, %r10350; + mad.lo.s32 %r10352, %r77, %r10345, %r10351; + mad.lo.s32 %r10353, %r78, %r10343, %r10352; + ld.const.v4.u8 {%rs6738, %rs6739, %rs6740, %rs6741}, [matrix+3368]; + cvt.u32.u16 %r10354, %rs6741; + cvt.s32.s8 %r10355, %r10354; + cvt.u32.u16 %r10356, %rs6740; + cvt.s32.s8 %r10357, %r10356; + cvt.u32.u16 %r10358, %rs6739; + cvt.s32.s8 %r10359, %r10358; + cvt.u32.u16 %r10360, %rs6738; + cvt.s32.s8 %r10361, %r10360; + mad.lo.s32 %r10362, %r80, %r10361, %r10353; + mad.lo.s32 
%r10363, %r81, %r10359, %r10362; + mad.lo.s32 %r10364, %r83, %r10357, %r10363; + mad.lo.s32 %r10365, %r84, %r10355, %r10364; + ld.const.v4.u8 {%rs6746, %rs6747, %rs6748, %rs6749}, [matrix+3372]; + cvt.u32.u16 %r10366, %rs6749; + cvt.s32.s8 %r10367, %r10366; + cvt.u32.u16 %r10368, %rs6748; + cvt.s32.s8 %r10369, %r10368; + cvt.u32.u16 %r10370, %rs6747; + cvt.s32.s8 %r10371, %r10370; + cvt.u32.u16 %r10372, %rs6746; + cvt.s32.s8 %r10373, %r10372; + mad.lo.s32 %r10374, %r86, %r10373, %r10365; + mad.lo.s32 %r10375, %r87, %r10371, %r10374; + mad.lo.s32 %r10376, %r88, %r10369, %r10375; + mad.lo.s32 %r10377, %r89, %r10367, %r10376; + ld.const.v4.u8 {%rs6754, %rs6755, %rs6756, %rs6757}, [matrix+3376]; + cvt.u32.u16 %r10378, %rs6757; + cvt.s32.s8 %r10379, %r10378; + cvt.u32.u16 %r10380, %rs6756; + cvt.s32.s8 %r10381, %r10380; + cvt.u32.u16 %r10382, %rs6755; + cvt.s32.s8 %r10383, %r10382; + cvt.u32.u16 %r10384, %rs6754; + cvt.s32.s8 %r10385, %r10384; + mad.lo.s32 %r10386, %r271, %r10385, %r10377; + mad.lo.s32 %r10387, %r91, %r10383, %r10386; + mad.lo.s32 %r10388, %r93, %r10381, %r10387; + mad.lo.s32 %r10389, %r94, %r10379, %r10388; + ld.const.v4.u8 {%rs6762, %rs6763, %rs6764, %rs6765}, [matrix+3380]; + cvt.u32.u16 %r10390, %rs6765; + cvt.s32.s8 %r10391, %r10390; + cvt.u32.u16 %r10392, %rs6764; + cvt.s32.s8 %r10393, %r10392; + cvt.u32.u16 %r10394, %rs6763; + cvt.s32.s8 %r10395, %r10394; + cvt.u32.u16 %r10396, %rs6762; + cvt.s32.s8 %r10397, %r10396; + mad.lo.s32 %r10398, %r96, %r10397, %r10389; + mad.lo.s32 %r10399, %r97, %r10395, %r10398; + mad.lo.s32 %r10400, %r99, %r10393, %r10399; + mad.lo.s32 %r10401, %r100, %r10391, %r10400; + ld.const.v4.u8 {%rs6770, %rs6771, %rs6772, %rs6773}, [matrix+3384]; + cvt.u32.u16 %r10402, %rs6773; + cvt.s32.s8 %r10403, %r10402; + cvt.u32.u16 %r10404, %rs6772; + cvt.s32.s8 %r10405, %r10404; + cvt.u32.u16 %r10406, %rs6771; + cvt.s32.s8 %r10407, %r10406; + cvt.u32.u16 %r10408, %rs6770; + cvt.s32.s8 %r10409, %r10408; + mad.lo.s32 %r10410, %r103, %r10409, %r10401; + mad.lo.s32 %r10411, %r104, %r10407, %r10410; + mad.lo.s32 %r10412, %r107, %r10405, %r10411; + mad.lo.s32 %r10413, %r108, %r10403, %r10412; + ld.const.v4.u8 {%rs6778, %rs6779, %rs6780, %rs6781}, [matrix+3388]; + cvt.u32.u16 %r10414, %rs6781; + cvt.s32.s8 %r10415, %r10414; + cvt.u32.u16 %r10416, %rs6780; + cvt.s32.s8 %r10417, %r10416; + cvt.u32.u16 %r10418, %rs6779; + cvt.s32.s8 %r10419, %r10418; + cvt.u32.u16 %r10420, %rs6778; + cvt.s32.s8 %r10421, %r10420; + mad.lo.s32 %r10422, %r111, %r10421, %r10413; + mad.lo.s32 %r10423, %r112, %r10419, %r10422; + mad.lo.s32 %r10424, %r114, %r10417, %r10423; + mad.lo.s32 %r10425, %r115, %r10415, %r10424; + ld.const.v4.u8 {%rs6786, %rs6787, %rs6788, %rs6789}, [matrix+3392]; + cvt.u32.u16 %r10426, %rs6789; + cvt.s32.s8 %r10427, %r10426; + cvt.u32.u16 %r10428, %rs6788; + cvt.s32.s8 %r10429, %r10428; + cvt.u32.u16 %r10430, %rs6786; + cvt.s32.s8 %r10431, %r10430; + cvt.u32.u16 %r10432, %rs6787; + cvt.s32.s8 %r10433, %r10432; + mul.lo.s32 %r10434, %r34, %r10433; + mad.lo.s32 %r10435, %r124, %r10431, %r10434; + mad.lo.s32 %r10436, %r35, %r10429, %r10435; + mad.lo.s32 %r10437, %r36, %r10427, %r10436; + ld.const.v4.u8 {%rs6794, %rs6795, %rs6796, %rs6797}, [matrix+3396]; + cvt.u32.u16 %r10438, %rs6797; + cvt.s32.s8 %r10439, %r10438; + cvt.u32.u16 %r10440, %rs6796; + cvt.s32.s8 %r10441, %r10440; + cvt.u32.u16 %r10442, %rs6795; + cvt.s32.s8 %r10443, %r10442; + cvt.u32.u16 %r10444, %rs6794; + cvt.s32.s8 %r10445, %r10444; + mad.lo.s32 %r10446, %r37, %r10445, %r10437; + mad.lo.s32 %r10447, 
%r38, %r10443, %r10446; + mad.lo.s32 %r10448, %r39, %r10441, %r10447; + mad.lo.s32 %r10449, %r40, %r10439, %r10448; + ld.const.v4.u8 {%rs6802, %rs6803, %rs6804, %rs6805}, [matrix+3400]; + cvt.u32.u16 %r10450, %rs6805; + cvt.s32.s8 %r10451, %r10450; + cvt.u32.u16 %r10452, %rs6804; + cvt.s32.s8 %r10453, %r10452; + cvt.u32.u16 %r10454, %rs6803; + cvt.s32.s8 %r10455, %r10454; + cvt.u32.u16 %r10456, %rs6802; + cvt.s32.s8 %r10457, %r10456; + mad.lo.s32 %r10458, %r42, %r10457, %r10449; + mad.lo.s32 %r10459, %r43, %r10455, %r10458; + mad.lo.s32 %r10460, %r45, %r10453, %r10459; + mad.lo.s32 %r10461, %r46, %r10451, %r10460; + ld.const.v4.u8 {%rs6810, %rs6811, %rs6812, %rs6813}, [matrix+3404]; + cvt.u32.u16 %r10462, %rs6813; + cvt.s32.s8 %r10463, %r10462; + cvt.u32.u16 %r10464, %rs6812; + cvt.s32.s8 %r10465, %r10464; + cvt.u32.u16 %r10466, %rs6811; + cvt.s32.s8 %r10467, %r10466; + cvt.u32.u16 %r10468, %rs6810; + cvt.s32.s8 %r10469, %r10468; + mad.lo.s32 %r10470, %r48, %r10469, %r10461; + mad.lo.s32 %r10471, %r49, %r10467, %r10470; + mad.lo.s32 %r10472, %r50, %r10465, %r10471; + mad.lo.s32 %r10473, %r51, %r10463, %r10472; + ld.const.v4.u8 {%rs6818, %rs6819, %rs6820, %rs6821}, [matrix+3408]; + cvt.u32.u16 %r10474, %rs6821; + cvt.s32.s8 %r10475, %r10474; + cvt.u32.u16 %r10476, %rs6820; + cvt.s32.s8 %r10477, %r10476; + cvt.u32.u16 %r10478, %rs6819; + cvt.s32.s8 %r10479, %r10478; + cvt.u32.u16 %r10480, %rs6818; + cvt.s32.s8 %r10481, %r10480; + mad.lo.s32 %r10482, %r173, %r10481, %r10473; + mad.lo.s32 %r10483, %r53, %r10479, %r10482; + mad.lo.s32 %r10484, %r54, %r10477, %r10483; + mad.lo.s32 %r10485, %r55, %r10475, %r10484; + ld.const.v4.u8 {%rs6826, %rs6827, %rs6828, %rs6829}, [matrix+3412]; + cvt.u32.u16 %r10486, %rs6829; + cvt.s32.s8 %r10487, %r10486; + cvt.u32.u16 %r10488, %rs6828; + cvt.s32.s8 %r10489, %r10488; + cvt.u32.u16 %r10490, %rs6827; + cvt.s32.s8 %r10491, %r10490; + cvt.u32.u16 %r10492, %rs6826; + cvt.s32.s8 %r10493, %r10492; + mad.lo.s32 %r10494, %r56, %r10493, %r10485; + mad.lo.s32 %r10495, %r57, %r10491, %r10494; + mad.lo.s32 %r10496, %r58, %r10489, %r10495; + mad.lo.s32 %r10497, %r59, %r10487, %r10496; + ld.const.v4.u8 {%rs6834, %rs6835, %rs6836, %rs6837}, [matrix+3416]; + cvt.u32.u16 %r10498, %rs6837; + cvt.s32.s8 %r10499, %r10498; + cvt.u32.u16 %r10500, %rs6836; + cvt.s32.s8 %r10501, %r10500; + cvt.u32.u16 %r10502, %rs6835; + cvt.s32.s8 %r10503, %r10502; + cvt.u32.u16 %r10504, %rs6834; + cvt.s32.s8 %r10505, %r10504; + mad.lo.s32 %r10506, %r61, %r10505, %r10497; + mad.lo.s32 %r10507, %r62, %r10503, %r10506; + mad.lo.s32 %r10508, %r64, %r10501, %r10507; + mad.lo.s32 %r10509, %r65, %r10499, %r10508; + ld.const.v4.u8 {%rs6842, %rs6843, %rs6844, %rs6845}, [matrix+3420]; + cvt.u32.u16 %r10510, %rs6845; + cvt.s32.s8 %r10511, %r10510; + cvt.u32.u16 %r10512, %rs6844; + cvt.s32.s8 %r10513, %r10512; + cvt.u32.u16 %r10514, %rs6843; + cvt.s32.s8 %r10515, %r10514; + cvt.u32.u16 %r10516, %rs6842; + cvt.s32.s8 %r10517, %r10516; + mad.lo.s32 %r10518, %r67, %r10517, %r10509; + mad.lo.s32 %r10519, %r68, %r10515, %r10518; + mad.lo.s32 %r10520, %r69, %r10513, %r10519; + mad.lo.s32 %r10521, %r70, %r10511, %r10520; + ld.const.v4.u8 {%rs6850, %rs6851, %rs6852, %rs6853}, [matrix+3424]; + cvt.u32.u16 %r10522, %rs6853; + cvt.s32.s8 %r10523, %r10522; + cvt.u32.u16 %r10524, %rs6852; + cvt.s32.s8 %r10525, %r10524; + cvt.u32.u16 %r10526, %rs6851; + cvt.s32.s8 %r10527, %r10526; + cvt.u32.u16 %r10528, %rs6850; + cvt.s32.s8 %r10529, %r10528; + mad.lo.s32 %r10530, %r222, %r10529, %r10521; + mad.lo.s32 %r10531, %r72, 
%r10527, %r10530; + mad.lo.s32 %r10532, %r73, %r10525, %r10531; + mad.lo.s32 %r10533, %r74, %r10523, %r10532; + ld.const.v4.u8 {%rs6858, %rs6859, %rs6860, %rs6861}, [matrix+3428]; + cvt.u32.u16 %r10534, %rs6861; + cvt.s32.s8 %r10535, %r10534; + cvt.u32.u16 %r10536, %rs6860; + cvt.s32.s8 %r10537, %r10536; + cvt.u32.u16 %r10538, %rs6859; + cvt.s32.s8 %r10539, %r10538; + cvt.u32.u16 %r10540, %rs6858; + cvt.s32.s8 %r10541, %r10540; + mad.lo.s32 %r10542, %r75, %r10541, %r10533; + mad.lo.s32 %r10543, %r76, %r10539, %r10542; + mad.lo.s32 %r10544, %r77, %r10537, %r10543; + mad.lo.s32 %r10545, %r78, %r10535, %r10544; + ld.const.v4.u8 {%rs6866, %rs6867, %rs6868, %rs6869}, [matrix+3432]; + cvt.u32.u16 %r10546, %rs6869; + cvt.s32.s8 %r10547, %r10546; + cvt.u32.u16 %r10548, %rs6868; + cvt.s32.s8 %r10549, %r10548; + cvt.u32.u16 %r10550, %rs6867; + cvt.s32.s8 %r10551, %r10550; + cvt.u32.u16 %r10552, %rs6866; + cvt.s32.s8 %r10553, %r10552; + mad.lo.s32 %r10554, %r80, %r10553, %r10545; + mad.lo.s32 %r10555, %r81, %r10551, %r10554; + mad.lo.s32 %r10556, %r83, %r10549, %r10555; + mad.lo.s32 %r10557, %r84, %r10547, %r10556; + ld.const.v4.u8 {%rs6874, %rs6875, %rs6876, %rs6877}, [matrix+3436]; + cvt.u32.u16 %r10558, %rs6877; + cvt.s32.s8 %r10559, %r10558; + cvt.u32.u16 %r10560, %rs6876; + cvt.s32.s8 %r10561, %r10560; + cvt.u32.u16 %r10562, %rs6875; + cvt.s32.s8 %r10563, %r10562; + cvt.u32.u16 %r10564, %rs6874; + cvt.s32.s8 %r10565, %r10564; + mad.lo.s32 %r10566, %r86, %r10565, %r10557; + mad.lo.s32 %r10567, %r87, %r10563, %r10566; + mad.lo.s32 %r10568, %r88, %r10561, %r10567; + mad.lo.s32 %r10569, %r89, %r10559, %r10568; + ld.const.v4.u8 {%rs6882, %rs6883, %rs6884, %rs6885}, [matrix+3440]; + cvt.u32.u16 %r10570, %rs6885; + cvt.s32.s8 %r10571, %r10570; + cvt.u32.u16 %r10572, %rs6884; + cvt.s32.s8 %r10573, %r10572; + cvt.u32.u16 %r10574, %rs6883; + cvt.s32.s8 %r10575, %r10574; + cvt.u32.u16 %r10576, %rs6882; + cvt.s32.s8 %r10577, %r10576; + mad.lo.s32 %r10578, %r271, %r10577, %r10569; + mad.lo.s32 %r10579, %r91, %r10575, %r10578; + mad.lo.s32 %r10580, %r93, %r10573, %r10579; + mad.lo.s32 %r10581, %r94, %r10571, %r10580; + ld.const.v4.u8 {%rs6890, %rs6891, %rs6892, %rs6893}, [matrix+3444]; + cvt.u32.u16 %r10582, %rs6893; + cvt.s32.s8 %r10583, %r10582; + cvt.u32.u16 %r10584, %rs6892; + cvt.s32.s8 %r10585, %r10584; + cvt.u32.u16 %r10586, %rs6891; + cvt.s32.s8 %r10587, %r10586; + cvt.u32.u16 %r10588, %rs6890; + cvt.s32.s8 %r10589, %r10588; + mad.lo.s32 %r10590, %r96, %r10589, %r10581; + mad.lo.s32 %r10591, %r97, %r10587, %r10590; + mad.lo.s32 %r10592, %r99, %r10585, %r10591; + mad.lo.s32 %r10593, %r100, %r10583, %r10592; + ld.const.v4.u8 {%rs6898, %rs6899, %rs6900, %rs6901}, [matrix+3448]; + cvt.u32.u16 %r10594, %rs6901; + cvt.s32.s8 %r10595, %r10594; + cvt.u32.u16 %r10596, %rs6900; + cvt.s32.s8 %r10597, %r10596; + cvt.u32.u16 %r10598, %rs6899; + cvt.s32.s8 %r10599, %r10598; + cvt.u32.u16 %r10600, %rs6898; + cvt.s32.s8 %r10601, %r10600; + mad.lo.s32 %r10602, %r103, %r10601, %r10593; + mad.lo.s32 %r10603, %r104, %r10599, %r10602; + mad.lo.s32 %r10604, %r107, %r10597, %r10603; + mad.lo.s32 %r10605, %r108, %r10595, %r10604; + ld.const.v4.u8 {%rs6906, %rs6907, %rs6908, %rs6909}, [matrix+3452]; + cvt.u32.u16 %r10606, %rs6909; + cvt.s32.s8 %r10607, %r10606; + cvt.u32.u16 %r10608, %rs6908; + cvt.s32.s8 %r10609, %r10608; + cvt.u32.u16 %r10610, %rs6907; + cvt.s32.s8 %r10611, %r10610; + cvt.u32.u16 %r10612, %rs6906; + cvt.s32.s8 %r10613, %r10612; + mad.lo.s32 %r10614, %r111, %r10613, %r10605; + mad.lo.s32 %r10615, %r112, 
%r10611, %r10614; + mad.lo.s32 %r10616, %r114, %r10609, %r10615; + mad.lo.s32 %r10617, %r115, %r10607, %r10616; + shr.u32 %r10618, %r10425, 6; + and.b32 %r10619, %r10618, 240; + shr.u32 %r10620, %r10617, 10; + or.b32 %r10621, %r10620, %r10619; + xor.b32 %r10622, %r95, %r10621; + cvt.u64.u32 %rd400, %r10622; + ld.const.v4.u8 {%rs6914, %rs6915, %rs6916, %rs6917}, [matrix+3456]; + cvt.u32.u16 %r10623, %rs6917; + cvt.s32.s8 %r10624, %r10623; + cvt.u32.u16 %r10625, %rs6916; + cvt.s32.s8 %r10626, %r10625; + cvt.u32.u16 %r10627, %rs6914; + cvt.s32.s8 %r10628, %r10627; + cvt.u32.u16 %r10629, %rs6915; + cvt.s32.s8 %r10630, %r10629; + mul.lo.s32 %r10631, %r34, %r10630; + mad.lo.s32 %r10632, %r124, %r10628, %r10631; + mad.lo.s32 %r10633, %r35, %r10626, %r10632; + mad.lo.s32 %r10634, %r36, %r10624, %r10633; + ld.const.v4.u8 {%rs6922, %rs6923, %rs6924, %rs6925}, [matrix+3460]; + cvt.u32.u16 %r10635, %rs6925; + cvt.s32.s8 %r10636, %r10635; + cvt.u32.u16 %r10637, %rs6924; + cvt.s32.s8 %r10638, %r10637; + cvt.u32.u16 %r10639, %rs6923; + cvt.s32.s8 %r10640, %r10639; + cvt.u32.u16 %r10641, %rs6922; + cvt.s32.s8 %r10642, %r10641; + mad.lo.s32 %r10643, %r37, %r10642, %r10634; + mad.lo.s32 %r10644, %r38, %r10640, %r10643; + mad.lo.s32 %r10645, %r39, %r10638, %r10644; + mad.lo.s32 %r10646, %r40, %r10636, %r10645; + ld.const.v4.u8 {%rs6930, %rs6931, %rs6932, %rs6933}, [matrix+3464]; + cvt.u32.u16 %r10647, %rs6933; + cvt.s32.s8 %r10648, %r10647; + cvt.u32.u16 %r10649, %rs6932; + cvt.s32.s8 %r10650, %r10649; + cvt.u32.u16 %r10651, %rs6931; + cvt.s32.s8 %r10652, %r10651; + cvt.u32.u16 %r10653, %rs6930; + cvt.s32.s8 %r10654, %r10653; + mad.lo.s32 %r10655, %r42, %r10654, %r10646; + mad.lo.s32 %r10656, %r43, %r10652, %r10655; + mad.lo.s32 %r10657, %r45, %r10650, %r10656; + mad.lo.s32 %r10658, %r46, %r10648, %r10657; + ld.const.v4.u8 {%rs6938, %rs6939, %rs6940, %rs6941}, [matrix+3468]; + cvt.u32.u16 %r10659, %rs6941; + cvt.s32.s8 %r10660, %r10659; + cvt.u32.u16 %r10661, %rs6940; + cvt.s32.s8 %r10662, %r10661; + cvt.u32.u16 %r10663, %rs6939; + cvt.s32.s8 %r10664, %r10663; + cvt.u32.u16 %r10665, %rs6938; + cvt.s32.s8 %r10666, %r10665; + mad.lo.s32 %r10667, %r48, %r10666, %r10658; + mad.lo.s32 %r10668, %r49, %r10664, %r10667; + mad.lo.s32 %r10669, %r50, %r10662, %r10668; + mad.lo.s32 %r10670, %r51, %r10660, %r10669; + ld.const.v4.u8 {%rs6946, %rs6947, %rs6948, %rs6949}, [matrix+3472]; + cvt.u32.u16 %r10671, %rs6949; + cvt.s32.s8 %r10672, %r10671; + cvt.u32.u16 %r10673, %rs6948; + cvt.s32.s8 %r10674, %r10673; + cvt.u32.u16 %r10675, %rs6947; + cvt.s32.s8 %r10676, %r10675; + cvt.u32.u16 %r10677, %rs6946; + cvt.s32.s8 %r10678, %r10677; + mad.lo.s32 %r10679, %r173, %r10678, %r10670; + mad.lo.s32 %r10680, %r53, %r10676, %r10679; + mad.lo.s32 %r10681, %r54, %r10674, %r10680; + mad.lo.s32 %r10682, %r55, %r10672, %r10681; + ld.const.v4.u8 {%rs6954, %rs6955, %rs6956, %rs6957}, [matrix+3476]; + cvt.u32.u16 %r10683, %rs6957; + cvt.s32.s8 %r10684, %r10683; + cvt.u32.u16 %r10685, %rs6956; + cvt.s32.s8 %r10686, %r10685; + cvt.u32.u16 %r10687, %rs6955; + cvt.s32.s8 %r10688, %r10687; + cvt.u32.u16 %r10689, %rs6954; + cvt.s32.s8 %r10690, %r10689; + mad.lo.s32 %r10691, %r56, %r10690, %r10682; + mad.lo.s32 %r10692, %r57, %r10688, %r10691; + mad.lo.s32 %r10693, %r58, %r10686, %r10692; + mad.lo.s32 %r10694, %r59, %r10684, %r10693; + ld.const.v4.u8 {%rs6962, %rs6963, %rs6964, %rs6965}, [matrix+3480]; + cvt.u32.u16 %r10695, %rs6965; + cvt.s32.s8 %r10696, %r10695; + cvt.u32.u16 %r10697, %rs6964; + cvt.s32.s8 %r10698, %r10697; + cvt.u32.u16 
%r10699, %rs6963; + cvt.s32.s8 %r10700, %r10699; + cvt.u32.u16 %r10701, %rs6962; + cvt.s32.s8 %r10702, %r10701; + mad.lo.s32 %r10703, %r61, %r10702, %r10694; + mad.lo.s32 %r10704, %r62, %r10700, %r10703; + mad.lo.s32 %r10705, %r64, %r10698, %r10704; + mad.lo.s32 %r10706, %r65, %r10696, %r10705; + ld.const.v4.u8 {%rs6970, %rs6971, %rs6972, %rs6973}, [matrix+3484]; + cvt.u32.u16 %r10707, %rs6973; + cvt.s32.s8 %r10708, %r10707; + cvt.u32.u16 %r10709, %rs6972; + cvt.s32.s8 %r10710, %r10709; + cvt.u32.u16 %r10711, %rs6971; + cvt.s32.s8 %r10712, %r10711; + cvt.u32.u16 %r10713, %rs6970; + cvt.s32.s8 %r10714, %r10713; + mad.lo.s32 %r10715, %r67, %r10714, %r10706; + mad.lo.s32 %r10716, %r68, %r10712, %r10715; + mad.lo.s32 %r10717, %r69, %r10710, %r10716; + mad.lo.s32 %r10718, %r70, %r10708, %r10717; + ld.const.v4.u8 {%rs6978, %rs6979, %rs6980, %rs6981}, [matrix+3488]; + cvt.u32.u16 %r10719, %rs6981; + cvt.s32.s8 %r10720, %r10719; + cvt.u32.u16 %r10721, %rs6980; + cvt.s32.s8 %r10722, %r10721; + cvt.u32.u16 %r10723, %rs6979; + cvt.s32.s8 %r10724, %r10723; + cvt.u32.u16 %r10725, %rs6978; + cvt.s32.s8 %r10726, %r10725; + mad.lo.s32 %r10727, %r222, %r10726, %r10718; + mad.lo.s32 %r10728, %r72, %r10724, %r10727; + mad.lo.s32 %r10729, %r73, %r10722, %r10728; + mad.lo.s32 %r10730, %r74, %r10720, %r10729; + ld.const.v4.u8 {%rs6986, %rs6987, %rs6988, %rs6989}, [matrix+3492]; + cvt.u32.u16 %r10731, %rs6989; + cvt.s32.s8 %r10732, %r10731; + cvt.u32.u16 %r10733, %rs6988; + cvt.s32.s8 %r10734, %r10733; + cvt.u32.u16 %r10735, %rs6987; + cvt.s32.s8 %r10736, %r10735; + cvt.u32.u16 %r10737, %rs6986; + cvt.s32.s8 %r10738, %r10737; + mad.lo.s32 %r10739, %r75, %r10738, %r10730; + mad.lo.s32 %r10740, %r76, %r10736, %r10739; + mad.lo.s32 %r10741, %r77, %r10734, %r10740; + mad.lo.s32 %r10742, %r78, %r10732, %r10741; + ld.const.v4.u8 {%rs6994, %rs6995, %rs6996, %rs6997}, [matrix+3496]; + cvt.u32.u16 %r10743, %rs6997; + cvt.s32.s8 %r10744, %r10743; + cvt.u32.u16 %r10745, %rs6996; + cvt.s32.s8 %r10746, %r10745; + cvt.u32.u16 %r10747, %rs6995; + cvt.s32.s8 %r10748, %r10747; + cvt.u32.u16 %r10749, %rs6994; + cvt.s32.s8 %r10750, %r10749; + mad.lo.s32 %r10751, %r80, %r10750, %r10742; + mad.lo.s32 %r10752, %r81, %r10748, %r10751; + mad.lo.s32 %r10753, %r83, %r10746, %r10752; + mad.lo.s32 %r10754, %r84, %r10744, %r10753; + ld.const.v4.u8 {%rs7002, %rs7003, %rs7004, %rs7005}, [matrix+3500]; + cvt.u32.u16 %r10755, %rs7005; + cvt.s32.s8 %r10756, %r10755; + cvt.u32.u16 %r10757, %rs7004; + cvt.s32.s8 %r10758, %r10757; + cvt.u32.u16 %r10759, %rs7003; + cvt.s32.s8 %r10760, %r10759; + cvt.u32.u16 %r10761, %rs7002; + cvt.s32.s8 %r10762, %r10761; + mad.lo.s32 %r10763, %r86, %r10762, %r10754; + mad.lo.s32 %r10764, %r87, %r10760, %r10763; + mad.lo.s32 %r10765, %r88, %r10758, %r10764; + mad.lo.s32 %r10766, %r89, %r10756, %r10765; + ld.const.v4.u8 {%rs7010, %rs7011, %rs7012, %rs7013}, [matrix+3504]; + cvt.u32.u16 %r10767, %rs7013; + cvt.s32.s8 %r10768, %r10767; + cvt.u32.u16 %r10769, %rs7012; + cvt.s32.s8 %r10770, %r10769; + cvt.u32.u16 %r10771, %rs7011; + cvt.s32.s8 %r10772, %r10771; + cvt.u32.u16 %r10773, %rs7010; + cvt.s32.s8 %r10774, %r10773; + mad.lo.s32 %r10775, %r271, %r10774, %r10766; + mad.lo.s32 %r10776, %r91, %r10772, %r10775; + mad.lo.s32 %r10777, %r93, %r10770, %r10776; + mad.lo.s32 %r10778, %r94, %r10768, %r10777; + ld.const.v4.u8 {%rs7018, %rs7019, %rs7020, %rs7021}, [matrix+3508]; + cvt.u32.u16 %r10779, %rs7021; + cvt.s32.s8 %r10780, %r10779; + cvt.u32.u16 %r10781, %rs7020; + cvt.s32.s8 %r10782, %r10781; + cvt.u32.u16 %r10783, 
%rs7019; + cvt.s32.s8 %r10784, %r10783; + cvt.u32.u16 %r10785, %rs7018; + cvt.s32.s8 %r10786, %r10785; + mad.lo.s32 %r10787, %r96, %r10786, %r10778; + mad.lo.s32 %r10788, %r97, %r10784, %r10787; + mad.lo.s32 %r10789, %r99, %r10782, %r10788; + mad.lo.s32 %r10790, %r100, %r10780, %r10789; + ld.const.v4.u8 {%rs7026, %rs7027, %rs7028, %rs7029}, [matrix+3512]; + cvt.u32.u16 %r10791, %rs7029; + cvt.s32.s8 %r10792, %r10791; + cvt.u32.u16 %r10793, %rs7028; + cvt.s32.s8 %r10794, %r10793; + cvt.u32.u16 %r10795, %rs7027; + cvt.s32.s8 %r10796, %r10795; + cvt.u32.u16 %r10797, %rs7026; + cvt.s32.s8 %r10798, %r10797; + mad.lo.s32 %r10799, %r103, %r10798, %r10790; + mad.lo.s32 %r10800, %r104, %r10796, %r10799; + mad.lo.s32 %r10801, %r107, %r10794, %r10800; + mad.lo.s32 %r10802, %r108, %r10792, %r10801; + ld.const.v4.u8 {%rs7034, %rs7035, %rs7036, %rs7037}, [matrix+3516]; + cvt.u32.u16 %r10803, %rs7037; + cvt.s32.s8 %r10804, %r10803; + cvt.u32.u16 %r10805, %rs7036; + cvt.s32.s8 %r10806, %r10805; + cvt.u32.u16 %r10807, %rs7035; + cvt.s32.s8 %r10808, %r10807; + cvt.u32.u16 %r10809, %rs7034; + cvt.s32.s8 %r10810, %r10809; + mad.lo.s32 %r10811, %r111, %r10810, %r10802; + mad.lo.s32 %r10812, %r112, %r10808, %r10811; + mad.lo.s32 %r10813, %r114, %r10806, %r10812; + mad.lo.s32 %r10814, %r115, %r10804, %r10813; + ld.const.v4.u8 {%rs7042, %rs7043, %rs7044, %rs7045}, [matrix+3520]; + cvt.u32.u16 %r10815, %rs7045; + cvt.s32.s8 %r10816, %r10815; + cvt.u32.u16 %r10817, %rs7044; + cvt.s32.s8 %r10818, %r10817; + cvt.u32.u16 %r10819, %rs7042; + cvt.s32.s8 %r10820, %r10819; + cvt.u32.u16 %r10821, %rs7043; + cvt.s32.s8 %r10822, %r10821; + mul.lo.s32 %r10823, %r34, %r10822; + mad.lo.s32 %r10824, %r124, %r10820, %r10823; + mad.lo.s32 %r10825, %r35, %r10818, %r10824; + mad.lo.s32 %r10826, %r36, %r10816, %r10825; + ld.const.v4.u8 {%rs7050, %rs7051, %rs7052, %rs7053}, [matrix+3524]; + cvt.u32.u16 %r10827, %rs7053; + cvt.s32.s8 %r10828, %r10827; + cvt.u32.u16 %r10829, %rs7052; + cvt.s32.s8 %r10830, %r10829; + cvt.u32.u16 %r10831, %rs7051; + cvt.s32.s8 %r10832, %r10831; + cvt.u32.u16 %r10833, %rs7050; + cvt.s32.s8 %r10834, %r10833; + mad.lo.s32 %r10835, %r37, %r10834, %r10826; + mad.lo.s32 %r10836, %r38, %r10832, %r10835; + mad.lo.s32 %r10837, %r39, %r10830, %r10836; + mad.lo.s32 %r10838, %r40, %r10828, %r10837; + ld.const.v4.u8 {%rs7058, %rs7059, %rs7060, %rs7061}, [matrix+3528]; + cvt.u32.u16 %r10839, %rs7061; + cvt.s32.s8 %r10840, %r10839; + cvt.u32.u16 %r10841, %rs7060; + cvt.s32.s8 %r10842, %r10841; + cvt.u32.u16 %r10843, %rs7059; + cvt.s32.s8 %r10844, %r10843; + cvt.u32.u16 %r10845, %rs7058; + cvt.s32.s8 %r10846, %r10845; + mad.lo.s32 %r10847, %r42, %r10846, %r10838; + mad.lo.s32 %r10848, %r43, %r10844, %r10847; + mad.lo.s32 %r10849, %r45, %r10842, %r10848; + mad.lo.s32 %r10850, %r46, %r10840, %r10849; + ld.const.v4.u8 {%rs7066, %rs7067, %rs7068, %rs7069}, [matrix+3532]; + cvt.u32.u16 %r10851, %rs7069; + cvt.s32.s8 %r10852, %r10851; + cvt.u32.u16 %r10853, %rs7068; + cvt.s32.s8 %r10854, %r10853; + cvt.u32.u16 %r10855, %rs7067; + cvt.s32.s8 %r10856, %r10855; + cvt.u32.u16 %r10857, %rs7066; + cvt.s32.s8 %r10858, %r10857; + mad.lo.s32 %r10859, %r48, %r10858, %r10850; + mad.lo.s32 %r10860, %r49, %r10856, %r10859; + mad.lo.s32 %r10861, %r50, %r10854, %r10860; + mad.lo.s32 %r10862, %r51, %r10852, %r10861; + ld.const.v4.u8 {%rs7074, %rs7075, %rs7076, %rs7077}, [matrix+3536]; + cvt.u32.u16 %r10863, %rs7077; + cvt.s32.s8 %r10864, %r10863; + cvt.u32.u16 %r10865, %rs7076; + cvt.s32.s8 %r10866, %r10865; + cvt.u32.u16 %r10867, %rs7075; + 
cvt.s32.s8 %r10868, %r10867; + cvt.u32.u16 %r10869, %rs7074; + cvt.s32.s8 %r10870, %r10869; + mad.lo.s32 %r10871, %r173, %r10870, %r10862; + mad.lo.s32 %r10872, %r53, %r10868, %r10871; + mad.lo.s32 %r10873, %r54, %r10866, %r10872; + mad.lo.s32 %r10874, %r55, %r10864, %r10873; + ld.const.v4.u8 {%rs7082, %rs7083, %rs7084, %rs7085}, [matrix+3540]; + cvt.u32.u16 %r10875, %rs7085; + cvt.s32.s8 %r10876, %r10875; + cvt.u32.u16 %r10877, %rs7084; + cvt.s32.s8 %r10878, %r10877; + cvt.u32.u16 %r10879, %rs7083; + cvt.s32.s8 %r10880, %r10879; + cvt.u32.u16 %r10881, %rs7082; + cvt.s32.s8 %r10882, %r10881; + mad.lo.s32 %r10883, %r56, %r10882, %r10874; + mad.lo.s32 %r10884, %r57, %r10880, %r10883; + mad.lo.s32 %r10885, %r58, %r10878, %r10884; + mad.lo.s32 %r10886, %r59, %r10876, %r10885; + ld.const.v4.u8 {%rs7090, %rs7091, %rs7092, %rs7093}, [matrix+3544]; + cvt.u32.u16 %r10887, %rs7093; + cvt.s32.s8 %r10888, %r10887; + cvt.u32.u16 %r10889, %rs7092; + cvt.s32.s8 %r10890, %r10889; + cvt.u32.u16 %r10891, %rs7091; + cvt.s32.s8 %r10892, %r10891; + cvt.u32.u16 %r10893, %rs7090; + cvt.s32.s8 %r10894, %r10893; + mad.lo.s32 %r10895, %r61, %r10894, %r10886; + mad.lo.s32 %r10896, %r62, %r10892, %r10895; + mad.lo.s32 %r10897, %r64, %r10890, %r10896; + mad.lo.s32 %r10898, %r65, %r10888, %r10897; + ld.const.v4.u8 {%rs7098, %rs7099, %rs7100, %rs7101}, [matrix+3548]; + cvt.u32.u16 %r10899, %rs7101; + cvt.s32.s8 %r10900, %r10899; + cvt.u32.u16 %r10901, %rs7100; + cvt.s32.s8 %r10902, %r10901; + cvt.u32.u16 %r10903, %rs7099; + cvt.s32.s8 %r10904, %r10903; + cvt.u32.u16 %r10905, %rs7098; + cvt.s32.s8 %r10906, %r10905; + mad.lo.s32 %r10907, %r67, %r10906, %r10898; + mad.lo.s32 %r10908, %r68, %r10904, %r10907; + mad.lo.s32 %r10909, %r69, %r10902, %r10908; + mad.lo.s32 %r10910, %r70, %r10900, %r10909; + ld.const.v4.u8 {%rs7106, %rs7107, %rs7108, %rs7109}, [matrix+3552]; + cvt.u32.u16 %r10911, %rs7109; + cvt.s32.s8 %r10912, %r10911; + cvt.u32.u16 %r10913, %rs7108; + cvt.s32.s8 %r10914, %r10913; + cvt.u32.u16 %r10915, %rs7107; + cvt.s32.s8 %r10916, %r10915; + cvt.u32.u16 %r10917, %rs7106; + cvt.s32.s8 %r10918, %r10917; + mad.lo.s32 %r10919, %r222, %r10918, %r10910; + mad.lo.s32 %r10920, %r72, %r10916, %r10919; + mad.lo.s32 %r10921, %r73, %r10914, %r10920; + mad.lo.s32 %r10922, %r74, %r10912, %r10921; + ld.const.v4.u8 {%rs7114, %rs7115, %rs7116, %rs7117}, [matrix+3556]; + cvt.u32.u16 %r10923, %rs7117; + cvt.s32.s8 %r10924, %r10923; + cvt.u32.u16 %r10925, %rs7116; + cvt.s32.s8 %r10926, %r10925; + cvt.u32.u16 %r10927, %rs7115; + cvt.s32.s8 %r10928, %r10927; + cvt.u32.u16 %r10929, %rs7114; + cvt.s32.s8 %r10930, %r10929; + mad.lo.s32 %r10931, %r75, %r10930, %r10922; + mad.lo.s32 %r10932, %r76, %r10928, %r10931; + mad.lo.s32 %r10933, %r77, %r10926, %r10932; + mad.lo.s32 %r10934, %r78, %r10924, %r10933; + ld.const.v4.u8 {%rs7122, %rs7123, %rs7124, %rs7125}, [matrix+3560]; + cvt.u32.u16 %r10935, %rs7125; + cvt.s32.s8 %r10936, %r10935; + cvt.u32.u16 %r10937, %rs7124; + cvt.s32.s8 %r10938, %r10937; + cvt.u32.u16 %r10939, %rs7123; + cvt.s32.s8 %r10940, %r10939; + cvt.u32.u16 %r10941, %rs7122; + cvt.s32.s8 %r10942, %r10941; + mad.lo.s32 %r10943, %r80, %r10942, %r10934; + mad.lo.s32 %r10944, %r81, %r10940, %r10943; + mad.lo.s32 %r10945, %r83, %r10938, %r10944; + mad.lo.s32 %r10946, %r84, %r10936, %r10945; + ld.const.v4.u8 {%rs7130, %rs7131, %rs7132, %rs7133}, [matrix+3564]; + cvt.u32.u16 %r10947, %rs7133; + cvt.s32.s8 %r10948, %r10947; + cvt.u32.u16 %r10949, %rs7132; + cvt.s32.s8 %r10950, %r10949; + cvt.u32.u16 %r10951, %rs7131; + cvt.s32.s8 
%r10952, %r10951; + cvt.u32.u16 %r10953, %rs7130; + cvt.s32.s8 %r10954, %r10953; + mad.lo.s32 %r10955, %r86, %r10954, %r10946; + mad.lo.s32 %r10956, %r87, %r10952, %r10955; + mad.lo.s32 %r10957, %r88, %r10950, %r10956; + mad.lo.s32 %r10958, %r89, %r10948, %r10957; + ld.const.v4.u8 {%rs7138, %rs7139, %rs7140, %rs7141}, [matrix+3568]; + cvt.u32.u16 %r10959, %rs7141; + cvt.s32.s8 %r10960, %r10959; + cvt.u32.u16 %r10961, %rs7140; + cvt.s32.s8 %r10962, %r10961; + cvt.u32.u16 %r10963, %rs7139; + cvt.s32.s8 %r10964, %r10963; + cvt.u32.u16 %r10965, %rs7138; + cvt.s32.s8 %r10966, %r10965; + mad.lo.s32 %r10967, %r271, %r10966, %r10958; + mad.lo.s32 %r10968, %r91, %r10964, %r10967; + mad.lo.s32 %r10969, %r93, %r10962, %r10968; + mad.lo.s32 %r10970, %r94, %r10960, %r10969; + ld.const.v4.u8 {%rs7146, %rs7147, %rs7148, %rs7149}, [matrix+3572]; + cvt.u32.u16 %r10971, %rs7149; + cvt.s32.s8 %r10972, %r10971; + cvt.u32.u16 %r10973, %rs7148; + cvt.s32.s8 %r10974, %r10973; + cvt.u32.u16 %r10975, %rs7147; + cvt.s32.s8 %r10976, %r10975; + cvt.u32.u16 %r10977, %rs7146; + cvt.s32.s8 %r10978, %r10977; + mad.lo.s32 %r10979, %r96, %r10978, %r10970; + mad.lo.s32 %r10980, %r97, %r10976, %r10979; + mad.lo.s32 %r10981, %r99, %r10974, %r10980; + mad.lo.s32 %r10982, %r100, %r10972, %r10981; + ld.const.v4.u8 {%rs7154, %rs7155, %rs7156, %rs7157}, [matrix+3576]; + cvt.u32.u16 %r10983, %rs7157; + cvt.s32.s8 %r10984, %r10983; + cvt.u32.u16 %r10985, %rs7156; + cvt.s32.s8 %r10986, %r10985; + cvt.u32.u16 %r10987, %rs7155; + cvt.s32.s8 %r10988, %r10987; + cvt.u32.u16 %r10989, %rs7154; + cvt.s32.s8 %r10990, %r10989; + mad.lo.s32 %r10991, %r103, %r10990, %r10982; + mad.lo.s32 %r10992, %r104, %r10988, %r10991; + mad.lo.s32 %r10993, %r107, %r10986, %r10992; + mad.lo.s32 %r10994, %r108, %r10984, %r10993; + ld.const.v4.u8 {%rs7162, %rs7163, %rs7164, %rs7165}, [matrix+3580]; + cvt.u32.u16 %r10995, %rs7165; + cvt.s32.s8 %r10996, %r10995; + cvt.u32.u16 %r10997, %rs7164; + cvt.s32.s8 %r10998, %r10997; + cvt.u32.u16 %r10999, %rs7163; + cvt.s32.s8 %r11000, %r10999; + cvt.u32.u16 %r11001, %rs7162; + cvt.s32.s8 %r11002, %r11001; + mad.lo.s32 %r11003, %r111, %r11002, %r10994; + mad.lo.s32 %r11004, %r112, %r11000, %r11003; + mad.lo.s32 %r11005, %r114, %r10998, %r11004; + mad.lo.s32 %r11006, %r115, %r10996, %r11005; + shr.u32 %r11007, %r10814, 6; + and.b32 %r11008, %r11007, 240; + shr.u32 %r11009, %r11006, 10; + or.b32 %r11010, %r11009, %r11008; + xor.b32 %r11011, %r98, %r11010; + cvt.u64.u32 %rd401, %r11011; + ld.const.v4.u8 {%rs7170, %rs7171, %rs7172, %rs7173}, [matrix+3584]; + cvt.u32.u16 %r11012, %rs7173; + cvt.s32.s8 %r11013, %r11012; + cvt.u32.u16 %r11014, %rs7172; + cvt.s32.s8 %r11015, %r11014; + cvt.u32.u16 %r11016, %rs7170; + cvt.s32.s8 %r11017, %r11016; + cvt.u32.u16 %r11018, %rs7171; + cvt.s32.s8 %r11019, %r11018; + mul.lo.s32 %r11020, %r34, %r11019; + mad.lo.s32 %r11021, %r124, %r11017, %r11020; + mad.lo.s32 %r11022, %r35, %r11015, %r11021; + mad.lo.s32 %r11023, %r36, %r11013, %r11022; + ld.const.v4.u8 {%rs7178, %rs7179, %rs7180, %rs7181}, [matrix+3588]; + cvt.u32.u16 %r11024, %rs7181; + cvt.s32.s8 %r11025, %r11024; + cvt.u32.u16 %r11026, %rs7180; + cvt.s32.s8 %r11027, %r11026; + cvt.u32.u16 %r11028, %rs7179; + cvt.s32.s8 %r11029, %r11028; + cvt.u32.u16 %r11030, %rs7178; + cvt.s32.s8 %r11031, %r11030; + mad.lo.s32 %r11032, %r37, %r11031, %r11023; + mad.lo.s32 %r11033, %r38, %r11029, %r11032; + mad.lo.s32 %r11034, %r39, %r11027, %r11033; + mad.lo.s32 %r11035, %r40, %r11025, %r11034; + ld.const.v4.u8 {%rs7186, %rs7187, %rs7188, %rs7189}, 
[matrix+3592]; + cvt.u32.u16 %r11036, %rs7189; + cvt.s32.s8 %r11037, %r11036; + cvt.u32.u16 %r11038, %rs7188; + cvt.s32.s8 %r11039, %r11038; + cvt.u32.u16 %r11040, %rs7187; + cvt.s32.s8 %r11041, %r11040; + cvt.u32.u16 %r11042, %rs7186; + cvt.s32.s8 %r11043, %r11042; + mad.lo.s32 %r11044, %r42, %r11043, %r11035; + mad.lo.s32 %r11045, %r43, %r11041, %r11044; + mad.lo.s32 %r11046, %r45, %r11039, %r11045; + mad.lo.s32 %r11047, %r46, %r11037, %r11046; + ld.const.v4.u8 {%rs7194, %rs7195, %rs7196, %rs7197}, [matrix+3596]; + cvt.u32.u16 %r11048, %rs7197; + cvt.s32.s8 %r11049, %r11048; + cvt.u32.u16 %r11050, %rs7196; + cvt.s32.s8 %r11051, %r11050; + cvt.u32.u16 %r11052, %rs7195; + cvt.s32.s8 %r11053, %r11052; + cvt.u32.u16 %r11054, %rs7194; + cvt.s32.s8 %r11055, %r11054; + mad.lo.s32 %r11056, %r48, %r11055, %r11047; + mad.lo.s32 %r11057, %r49, %r11053, %r11056; + mad.lo.s32 %r11058, %r50, %r11051, %r11057; + mad.lo.s32 %r11059, %r51, %r11049, %r11058; + ld.const.v4.u8 {%rs7202, %rs7203, %rs7204, %rs7205}, [matrix+3600]; + cvt.u32.u16 %r11060, %rs7205; + cvt.s32.s8 %r11061, %r11060; + cvt.u32.u16 %r11062, %rs7204; + cvt.s32.s8 %r11063, %r11062; + cvt.u32.u16 %r11064, %rs7203; + cvt.s32.s8 %r11065, %r11064; + cvt.u32.u16 %r11066, %rs7202; + cvt.s32.s8 %r11067, %r11066; + mad.lo.s32 %r11068, %r173, %r11067, %r11059; + mad.lo.s32 %r11069, %r53, %r11065, %r11068; + mad.lo.s32 %r11070, %r54, %r11063, %r11069; + mad.lo.s32 %r11071, %r55, %r11061, %r11070; + ld.const.v4.u8 {%rs7210, %rs7211, %rs7212, %rs7213}, [matrix+3604]; + cvt.u32.u16 %r11072, %rs7213; + cvt.s32.s8 %r11073, %r11072; + cvt.u32.u16 %r11074, %rs7212; + cvt.s32.s8 %r11075, %r11074; + cvt.u32.u16 %r11076, %rs7211; + cvt.s32.s8 %r11077, %r11076; + cvt.u32.u16 %r11078, %rs7210; + cvt.s32.s8 %r11079, %r11078; + mad.lo.s32 %r11080, %r56, %r11079, %r11071; + mad.lo.s32 %r11081, %r57, %r11077, %r11080; + mad.lo.s32 %r11082, %r58, %r11075, %r11081; + mad.lo.s32 %r11083, %r59, %r11073, %r11082; + ld.const.v4.u8 {%rs7218, %rs7219, %rs7220, %rs7221}, [matrix+3608]; + cvt.u32.u16 %r11084, %rs7221; + cvt.s32.s8 %r11085, %r11084; + cvt.u32.u16 %r11086, %rs7220; + cvt.s32.s8 %r11087, %r11086; + cvt.u32.u16 %r11088, %rs7219; + cvt.s32.s8 %r11089, %r11088; + cvt.u32.u16 %r11090, %rs7218; + cvt.s32.s8 %r11091, %r11090; + mad.lo.s32 %r11092, %r61, %r11091, %r11083; + mad.lo.s32 %r11093, %r62, %r11089, %r11092; + mad.lo.s32 %r11094, %r64, %r11087, %r11093; + mad.lo.s32 %r11095, %r65, %r11085, %r11094; + ld.const.v4.u8 {%rs7226, %rs7227, %rs7228, %rs7229}, [matrix+3612]; + cvt.u32.u16 %r11096, %rs7229; + cvt.s32.s8 %r11097, %r11096; + cvt.u32.u16 %r11098, %rs7228; + cvt.s32.s8 %r11099, %r11098; + cvt.u32.u16 %r11100, %rs7227; + cvt.s32.s8 %r11101, %r11100; + cvt.u32.u16 %r11102, %rs7226; + cvt.s32.s8 %r11103, %r11102; + mad.lo.s32 %r11104, %r67, %r11103, %r11095; + mad.lo.s32 %r11105, %r68, %r11101, %r11104; + mad.lo.s32 %r11106, %r69, %r11099, %r11105; + mad.lo.s32 %r11107, %r70, %r11097, %r11106; + ld.const.v4.u8 {%rs7234, %rs7235, %rs7236, %rs7237}, [matrix+3616]; + cvt.u32.u16 %r11108, %rs7237; + cvt.s32.s8 %r11109, %r11108; + cvt.u32.u16 %r11110, %rs7236; + cvt.s32.s8 %r11111, %r11110; + cvt.u32.u16 %r11112, %rs7235; + cvt.s32.s8 %r11113, %r11112; + cvt.u32.u16 %r11114, %rs7234; + cvt.s32.s8 %r11115, %r11114; + mad.lo.s32 %r11116, %r222, %r11115, %r11107; + mad.lo.s32 %r11117, %r72, %r11113, %r11116; + mad.lo.s32 %r11118, %r73, %r11111, %r11117; + mad.lo.s32 %r11119, %r74, %r11109, %r11118; + ld.const.v4.u8 {%rs7242, %rs7243, %rs7244, %rs7245}, 
[matrix+3620]; + cvt.u32.u16 %r11120, %rs7245; + cvt.s32.s8 %r11121, %r11120; + cvt.u32.u16 %r11122, %rs7244; + cvt.s32.s8 %r11123, %r11122; + cvt.u32.u16 %r11124, %rs7243; + cvt.s32.s8 %r11125, %r11124; + cvt.u32.u16 %r11126, %rs7242; + cvt.s32.s8 %r11127, %r11126; + mad.lo.s32 %r11128, %r75, %r11127, %r11119; + mad.lo.s32 %r11129, %r76, %r11125, %r11128; + mad.lo.s32 %r11130, %r77, %r11123, %r11129; + mad.lo.s32 %r11131, %r78, %r11121, %r11130; + ld.const.v4.u8 {%rs7250, %rs7251, %rs7252, %rs7253}, [matrix+3624]; + cvt.u32.u16 %r11132, %rs7253; + cvt.s32.s8 %r11133, %r11132; + cvt.u32.u16 %r11134, %rs7252; + cvt.s32.s8 %r11135, %r11134; + cvt.u32.u16 %r11136, %rs7251; + cvt.s32.s8 %r11137, %r11136; + cvt.u32.u16 %r11138, %rs7250; + cvt.s32.s8 %r11139, %r11138; + mad.lo.s32 %r11140, %r80, %r11139, %r11131; + mad.lo.s32 %r11141, %r81, %r11137, %r11140; + mad.lo.s32 %r11142, %r83, %r11135, %r11141; + mad.lo.s32 %r11143, %r84, %r11133, %r11142; + ld.const.v4.u8 {%rs7258, %rs7259, %rs7260, %rs7261}, [matrix+3628]; + cvt.u32.u16 %r11144, %rs7261; + cvt.s32.s8 %r11145, %r11144; + cvt.u32.u16 %r11146, %rs7260; + cvt.s32.s8 %r11147, %r11146; + cvt.u32.u16 %r11148, %rs7259; + cvt.s32.s8 %r11149, %r11148; + cvt.u32.u16 %r11150, %rs7258; + cvt.s32.s8 %r11151, %r11150; + mad.lo.s32 %r11152, %r86, %r11151, %r11143; + mad.lo.s32 %r11153, %r87, %r11149, %r11152; + mad.lo.s32 %r11154, %r88, %r11147, %r11153; + mad.lo.s32 %r11155, %r89, %r11145, %r11154; + ld.const.v4.u8 {%rs7266, %rs7267, %rs7268, %rs7269}, [matrix+3632]; + cvt.u32.u16 %r11156, %rs7269; + cvt.s32.s8 %r11157, %r11156; + cvt.u32.u16 %r11158, %rs7268; + cvt.s32.s8 %r11159, %r11158; + cvt.u32.u16 %r11160, %rs7267; + cvt.s32.s8 %r11161, %r11160; + cvt.u32.u16 %r11162, %rs7266; + cvt.s32.s8 %r11163, %r11162; + mad.lo.s32 %r11164, %r271, %r11163, %r11155; + mad.lo.s32 %r11165, %r91, %r11161, %r11164; + mad.lo.s32 %r11166, %r93, %r11159, %r11165; + mad.lo.s32 %r11167, %r94, %r11157, %r11166; + ld.const.v4.u8 {%rs7274, %rs7275, %rs7276, %rs7277}, [matrix+3636]; + cvt.u32.u16 %r11168, %rs7277; + cvt.s32.s8 %r11169, %r11168; + cvt.u32.u16 %r11170, %rs7276; + cvt.s32.s8 %r11171, %r11170; + cvt.u32.u16 %r11172, %rs7275; + cvt.s32.s8 %r11173, %r11172; + cvt.u32.u16 %r11174, %rs7274; + cvt.s32.s8 %r11175, %r11174; + mad.lo.s32 %r11176, %r96, %r11175, %r11167; + mad.lo.s32 %r11177, %r97, %r11173, %r11176; + mad.lo.s32 %r11178, %r99, %r11171, %r11177; + mad.lo.s32 %r11179, %r100, %r11169, %r11178; + ld.const.v4.u8 {%rs7282, %rs7283, %rs7284, %rs7285}, [matrix+3640]; + cvt.u32.u16 %r11180, %rs7285; + cvt.s32.s8 %r11181, %r11180; + cvt.u32.u16 %r11182, %rs7284; + cvt.s32.s8 %r11183, %r11182; + cvt.u32.u16 %r11184, %rs7283; + cvt.s32.s8 %r11185, %r11184; + cvt.u32.u16 %r11186, %rs7282; + cvt.s32.s8 %r11187, %r11186; + mad.lo.s32 %r11188, %r103, %r11187, %r11179; + mad.lo.s32 %r11189, %r104, %r11185, %r11188; + mad.lo.s32 %r11190, %r107, %r11183, %r11189; + mad.lo.s32 %r11191, %r108, %r11181, %r11190; + ld.const.v4.u8 {%rs7290, %rs7291, %rs7292, %rs7293}, [matrix+3644]; + cvt.u32.u16 %r11192, %rs7293; + cvt.s32.s8 %r11193, %r11192; + cvt.u32.u16 %r11194, %rs7292; + cvt.s32.s8 %r11195, %r11194; + cvt.u32.u16 %r11196, %rs7291; + cvt.s32.s8 %r11197, %r11196; + cvt.u32.u16 %r11198, %rs7290; + cvt.s32.s8 %r11199, %r11198; + mad.lo.s32 %r11200, %r111, %r11199, %r11191; + mad.lo.s32 %r11201, %r112, %r11197, %r11200; + mad.lo.s32 %r11202, %r114, %r11195, %r11201; + mad.lo.s32 %r11203, %r115, %r11193, %r11202; + ld.const.v4.u8 {%rs7298, %rs7299, %rs7300, %rs7301}, 
[matrix+3648]; + cvt.u32.u16 %r11204, %rs7301; + cvt.s32.s8 %r11205, %r11204; + cvt.u32.u16 %r11206, %rs7300; + cvt.s32.s8 %r11207, %r11206; + cvt.u32.u16 %r11208, %rs7298; + cvt.s32.s8 %r11209, %r11208; + cvt.u32.u16 %r11210, %rs7299; + cvt.s32.s8 %r11211, %r11210; + mul.lo.s32 %r11212, %r34, %r11211; + mad.lo.s32 %r11213, %r124, %r11209, %r11212; + mad.lo.s32 %r11214, %r35, %r11207, %r11213; + mad.lo.s32 %r11215, %r36, %r11205, %r11214; + ld.const.v4.u8 {%rs7306, %rs7307, %rs7308, %rs7309}, [matrix+3652]; + cvt.u32.u16 %r11216, %rs7309; + cvt.s32.s8 %r11217, %r11216; + cvt.u32.u16 %r11218, %rs7308; + cvt.s32.s8 %r11219, %r11218; + cvt.u32.u16 %r11220, %rs7307; + cvt.s32.s8 %r11221, %r11220; + cvt.u32.u16 %r11222, %rs7306; + cvt.s32.s8 %r11223, %r11222; + mad.lo.s32 %r11224, %r37, %r11223, %r11215; + mad.lo.s32 %r11225, %r38, %r11221, %r11224; + mad.lo.s32 %r11226, %r39, %r11219, %r11225; + mad.lo.s32 %r11227, %r40, %r11217, %r11226; + ld.const.v4.u8 {%rs7314, %rs7315, %rs7316, %rs7317}, [matrix+3656]; + cvt.u32.u16 %r11228, %rs7317; + cvt.s32.s8 %r11229, %r11228; + cvt.u32.u16 %r11230, %rs7316; + cvt.s32.s8 %r11231, %r11230; + cvt.u32.u16 %r11232, %rs7315; + cvt.s32.s8 %r11233, %r11232; + cvt.u32.u16 %r11234, %rs7314; + cvt.s32.s8 %r11235, %r11234; + mad.lo.s32 %r11236, %r42, %r11235, %r11227; + mad.lo.s32 %r11237, %r43, %r11233, %r11236; + mad.lo.s32 %r11238, %r45, %r11231, %r11237; + mad.lo.s32 %r11239, %r46, %r11229, %r11238; + ld.const.v4.u8 {%rs7322, %rs7323, %rs7324, %rs7325}, [matrix+3660]; + cvt.u32.u16 %r11240, %rs7325; + cvt.s32.s8 %r11241, %r11240; + cvt.u32.u16 %r11242, %rs7324; + cvt.s32.s8 %r11243, %r11242; + cvt.u32.u16 %r11244, %rs7323; + cvt.s32.s8 %r11245, %r11244; + cvt.u32.u16 %r11246, %rs7322; + cvt.s32.s8 %r11247, %r11246; + mad.lo.s32 %r11248, %r48, %r11247, %r11239; + mad.lo.s32 %r11249, %r49, %r11245, %r11248; + mad.lo.s32 %r11250, %r50, %r11243, %r11249; + mad.lo.s32 %r11251, %r51, %r11241, %r11250; + ld.const.v4.u8 {%rs7330, %rs7331, %rs7332, %rs7333}, [matrix+3664]; + cvt.u32.u16 %r11252, %rs7333; + cvt.s32.s8 %r11253, %r11252; + cvt.u32.u16 %r11254, %rs7332; + cvt.s32.s8 %r11255, %r11254; + cvt.u32.u16 %r11256, %rs7331; + cvt.s32.s8 %r11257, %r11256; + cvt.u32.u16 %r11258, %rs7330; + cvt.s32.s8 %r11259, %r11258; + mad.lo.s32 %r11260, %r173, %r11259, %r11251; + mad.lo.s32 %r11261, %r53, %r11257, %r11260; + mad.lo.s32 %r11262, %r54, %r11255, %r11261; + mad.lo.s32 %r11263, %r55, %r11253, %r11262; + ld.const.v4.u8 {%rs7338, %rs7339, %rs7340, %rs7341}, [matrix+3668]; + cvt.u32.u16 %r11264, %rs7341; + cvt.s32.s8 %r11265, %r11264; + cvt.u32.u16 %r11266, %rs7340; + cvt.s32.s8 %r11267, %r11266; + cvt.u32.u16 %r11268, %rs7339; + cvt.s32.s8 %r11269, %r11268; + cvt.u32.u16 %r11270, %rs7338; + cvt.s32.s8 %r11271, %r11270; + mad.lo.s32 %r11272, %r56, %r11271, %r11263; + mad.lo.s32 %r11273, %r57, %r11269, %r11272; + mad.lo.s32 %r11274, %r58, %r11267, %r11273; + mad.lo.s32 %r11275, %r59, %r11265, %r11274; + ld.const.v4.u8 {%rs7346, %rs7347, %rs7348, %rs7349}, [matrix+3672]; + cvt.u32.u16 %r11276, %rs7349; + cvt.s32.s8 %r11277, %r11276; + cvt.u32.u16 %r11278, %rs7348; + cvt.s32.s8 %r11279, %r11278; + cvt.u32.u16 %r11280, %rs7347; + cvt.s32.s8 %r11281, %r11280; + cvt.u32.u16 %r11282, %rs7346; + cvt.s32.s8 %r11283, %r11282; + mad.lo.s32 %r11284, %r61, %r11283, %r11275; + mad.lo.s32 %r11285, %r62, %r11281, %r11284; + mad.lo.s32 %r11286, %r64, %r11279, %r11285; + mad.lo.s32 %r11287, %r65, %r11277, %r11286; + ld.const.v4.u8 {%rs7354, %rs7355, %rs7356, %rs7357}, [matrix+3676]; + 
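// Unrolled dot product: each ld.const.v4.u8 fetches four signed matrix bytes, the cvt.u32.u16/cvt.s32.s8 pairs sign-extend them, and mad.lo.s32 folds them into the 32-bit accumulator.
+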
cvt.u32.u16 %r11288, %rs7357; + cvt.s32.s8 %r11289, %r11288; + cvt.u32.u16 %r11290, %rs7356; + cvt.s32.s8 %r11291, %r11290; + cvt.u32.u16 %r11292, %rs7355; + cvt.s32.s8 %r11293, %r11292; + cvt.u32.u16 %r11294, %rs7354; + cvt.s32.s8 %r11295, %r11294; + mad.lo.s32 %r11296, %r67, %r11295, %r11287; + mad.lo.s32 %r11297, %r68, %r11293, %r11296; + mad.lo.s32 %r11298, %r69, %r11291, %r11297; + mad.lo.s32 %r11299, %r70, %r11289, %r11298; + ld.const.v4.u8 {%rs7362, %rs7363, %rs7364, %rs7365}, [matrix+3680]; + cvt.u32.u16 %r11300, %rs7365; + cvt.s32.s8 %r11301, %r11300; + cvt.u32.u16 %r11302, %rs7364; + cvt.s32.s8 %r11303, %r11302; + cvt.u32.u16 %r11304, %rs7363; + cvt.s32.s8 %r11305, %r11304; + cvt.u32.u16 %r11306, %rs7362; + cvt.s32.s8 %r11307, %r11306; + mad.lo.s32 %r11308, %r222, %r11307, %r11299; + mad.lo.s32 %r11309, %r72, %r11305, %r11308; + mad.lo.s32 %r11310, %r73, %r11303, %r11309; + mad.lo.s32 %r11311, %r74, %r11301, %r11310; + ld.const.v4.u8 {%rs7370, %rs7371, %rs7372, %rs7373}, [matrix+3684]; + cvt.u32.u16 %r11312, %rs7373; + cvt.s32.s8 %r11313, %r11312; + cvt.u32.u16 %r11314, %rs7372; + cvt.s32.s8 %r11315, %r11314; + cvt.u32.u16 %r11316, %rs7371; + cvt.s32.s8 %r11317, %r11316; + cvt.u32.u16 %r11318, %rs7370; + cvt.s32.s8 %r11319, %r11318; + mad.lo.s32 %r11320, %r75, %r11319, %r11311; + mad.lo.s32 %r11321, %r76, %r11317, %r11320; + mad.lo.s32 %r11322, %r77, %r11315, %r11321; + mad.lo.s32 %r11323, %r78, %r11313, %r11322; + ld.const.v4.u8 {%rs7378, %rs7379, %rs7380, %rs7381}, [matrix+3688]; + cvt.u32.u16 %r11324, %rs7381; + cvt.s32.s8 %r11325, %r11324; + cvt.u32.u16 %r11326, %rs7380; + cvt.s32.s8 %r11327, %r11326; + cvt.u32.u16 %r11328, %rs7379; + cvt.s32.s8 %r11329, %r11328; + cvt.u32.u16 %r11330, %rs7378; + cvt.s32.s8 %r11331, %r11330; + mad.lo.s32 %r11332, %r80, %r11331, %r11323; + mad.lo.s32 %r11333, %r81, %r11329, %r11332; + mad.lo.s32 %r11334, %r83, %r11327, %r11333; + mad.lo.s32 %r11335, %r84, %r11325, %r11334; + ld.const.v4.u8 {%rs7386, %rs7387, %rs7388, %rs7389}, [matrix+3692]; + cvt.u32.u16 %r11336, %rs7389; + cvt.s32.s8 %r11337, %r11336; + cvt.u32.u16 %r11338, %rs7388; + cvt.s32.s8 %r11339, %r11338; + cvt.u32.u16 %r11340, %rs7387; + cvt.s32.s8 %r11341, %r11340; + cvt.u32.u16 %r11342, %rs7386; + cvt.s32.s8 %r11343, %r11342; + mad.lo.s32 %r11344, %r86, %r11343, %r11335; + mad.lo.s32 %r11345, %r87, %r11341, %r11344; + mad.lo.s32 %r11346, %r88, %r11339, %r11345; + mad.lo.s32 %r11347, %r89, %r11337, %r11346; + ld.const.v4.u8 {%rs7394, %rs7395, %rs7396, %rs7397}, [matrix+3696]; + cvt.u32.u16 %r11348, %rs7397; + cvt.s32.s8 %r11349, %r11348; + cvt.u32.u16 %r11350, %rs7396; + cvt.s32.s8 %r11351, %r11350; + cvt.u32.u16 %r11352, %rs7395; + cvt.s32.s8 %r11353, %r11352; + cvt.u32.u16 %r11354, %rs7394; + cvt.s32.s8 %r11355, %r11354; + mad.lo.s32 %r11356, %r271, %r11355, %r11347; + mad.lo.s32 %r11357, %r91, %r11353, %r11356; + mad.lo.s32 %r11358, %r93, %r11351, %r11357; + mad.lo.s32 %r11359, %r94, %r11349, %r11358; + ld.const.v4.u8 {%rs7402, %rs7403, %rs7404, %rs7405}, [matrix+3700]; + cvt.u32.u16 %r11360, %rs7405; + cvt.s32.s8 %r11361, %r11360; + cvt.u32.u16 %r11362, %rs7404; + cvt.s32.s8 %r11363, %r11362; + cvt.u32.u16 %r11364, %rs7403; + cvt.s32.s8 %r11365, %r11364; + cvt.u32.u16 %r11366, %rs7402; + cvt.s32.s8 %r11367, %r11366; + mad.lo.s32 %r11368, %r96, %r11367, %r11359; + mad.lo.s32 %r11369, %r97, %r11365, %r11368; + mad.lo.s32 %r11370, %r99, %r11363, %r11369; + mad.lo.s32 %r11371, %r100, %r11361, %r11370; + ld.const.v4.u8 {%rs7410, %rs7411, %rs7412, %rs7413}, [matrix+3704]; + 
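// Below, two accumulators are reduced to one byte: (acc_hi >> 6) & 240 gives the high nibble, acc_lo >> 10 the low nibble; the or.b32 result is XORed with a per-byte operand (%r101 in this block) and widened to 64 bits.
+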
cvt.u32.u16 %r11372, %rs7413; + cvt.s32.s8 %r11373, %r11372; + cvt.u32.u16 %r11374, %rs7412; + cvt.s32.s8 %r11375, %r11374; + cvt.u32.u16 %r11376, %rs7411; + cvt.s32.s8 %r11377, %r11376; + cvt.u32.u16 %r11378, %rs7410; + cvt.s32.s8 %r11379, %r11378; + mad.lo.s32 %r11380, %r103, %r11379, %r11371; + mad.lo.s32 %r11381, %r104, %r11377, %r11380; + mad.lo.s32 %r11382, %r107, %r11375, %r11381; + mad.lo.s32 %r11383, %r108, %r11373, %r11382; + ld.const.v4.u8 {%rs7418, %rs7419, %rs7420, %rs7421}, [matrix+3708]; + cvt.u32.u16 %r11384, %rs7421; + cvt.s32.s8 %r11385, %r11384; + cvt.u32.u16 %r11386, %rs7420; + cvt.s32.s8 %r11387, %r11386; + cvt.u32.u16 %r11388, %rs7419; + cvt.s32.s8 %r11389, %r11388; + cvt.u32.u16 %r11390, %rs7418; + cvt.s32.s8 %r11391, %r11390; + mad.lo.s32 %r11392, %r111, %r11391, %r11383; + mad.lo.s32 %r11393, %r112, %r11389, %r11392; + mad.lo.s32 %r11394, %r114, %r11387, %r11393; + mad.lo.s32 %r11395, %r115, %r11385, %r11394; + shr.u32 %r11396, %r11203, 6; + and.b32 %r11397, %r11396, 240; + shr.u32 %r11398, %r11395, 10; + or.b32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r101, %r11399; + cvt.u64.u32 %rd402, %r11400; + and.b64 %rd403, %rd402, 255; + ld.const.v4.u8 {%rs7426, %rs7427, %rs7428, %rs7429}, [matrix+3712]; + cvt.u32.u16 %r11401, %rs7429; + cvt.s32.s8 %r11402, %r11401; + cvt.u32.u16 %r11403, %rs7428; + cvt.s32.s8 %r11404, %r11403; + cvt.u32.u16 %r11405, %rs7426; + cvt.s32.s8 %r11406, %r11405; + cvt.u32.u16 %r11407, %rs7427; + cvt.s32.s8 %r11408, %r11407; + mul.lo.s32 %r11409, %r34, %r11408; + mad.lo.s32 %r11410, %r124, %r11406, %r11409; + mad.lo.s32 %r11411, %r35, %r11404, %r11410; + mad.lo.s32 %r11412, %r36, %r11402, %r11411; + ld.const.v4.u8 {%rs7434, %rs7435, %rs7436, %rs7437}, [matrix+3716]; + cvt.u32.u16 %r11413, %rs7437; + cvt.s32.s8 %r11414, %r11413; + cvt.u32.u16 %r11415, %rs7436; + cvt.s32.s8 %r11416, %r11415; + cvt.u32.u16 %r11417, %rs7435; + cvt.s32.s8 %r11418, %r11417; + cvt.u32.u16 %r11419, %rs7434; + cvt.s32.s8 %r11420, %r11419; + mad.lo.s32 %r11421, %r37, %r11420, %r11412; + mad.lo.s32 %r11422, %r38, %r11418, %r11421; + mad.lo.s32 %r11423, %r39, %r11416, %r11422; + mad.lo.s32 %r11424, %r40, %r11414, %r11423; + ld.const.v4.u8 {%rs7442, %rs7443, %rs7444, %rs7445}, [matrix+3720]; + cvt.u32.u16 %r11425, %rs7445; + cvt.s32.s8 %r11426, %r11425; + cvt.u32.u16 %r11427, %rs7444; + cvt.s32.s8 %r11428, %r11427; + cvt.u32.u16 %r11429, %rs7443; + cvt.s32.s8 %r11430, %r11429; + cvt.u32.u16 %r11431, %rs7442; + cvt.s32.s8 %r11432, %r11431; + mad.lo.s32 %r11433, %r42, %r11432, %r11424; + mad.lo.s32 %r11434, %r43, %r11430, %r11433; + mad.lo.s32 %r11435, %r45, %r11428, %r11434; + mad.lo.s32 %r11436, %r46, %r11426, %r11435; + ld.const.v4.u8 {%rs7450, %rs7451, %rs7452, %rs7453}, [matrix+3724]; + cvt.u32.u16 %r11437, %rs7453; + cvt.s32.s8 %r11438, %r11437; + cvt.u32.u16 %r11439, %rs7452; + cvt.s32.s8 %r11440, %r11439; + cvt.u32.u16 %r11441, %rs7451; + cvt.s32.s8 %r11442, %r11441; + cvt.u32.u16 %r11443, %rs7450; + cvt.s32.s8 %r11444, %r11443; + mad.lo.s32 %r11445, %r48, %r11444, %r11436; + mad.lo.s32 %r11446, %r49, %r11442, %r11445; + mad.lo.s32 %r11447, %r50, %r11440, %r11446; + mad.lo.s32 %r11448, %r51, %r11438, %r11447; + ld.const.v4.u8 {%rs7458, %rs7459, %rs7460, %rs7461}, [matrix+3728]; + cvt.u32.u16 %r11449, %rs7461; + cvt.s32.s8 %r11450, %r11449; + cvt.u32.u16 %r11451, %rs7460; + cvt.s32.s8 %r11452, %r11451; + cvt.u32.u16 %r11453, %rs7459; + cvt.s32.s8 %r11454, %r11453; + cvt.u32.u16 %r11455, %rs7458; + cvt.s32.s8 %r11456, %r11455; + mad.lo.s32 %r11457, %r173, %r11456, 
%r11448; + mad.lo.s32 %r11458, %r53, %r11454, %r11457; + mad.lo.s32 %r11459, %r54, %r11452, %r11458; + mad.lo.s32 %r11460, %r55, %r11450, %r11459; + ld.const.v4.u8 {%rs7466, %rs7467, %rs7468, %rs7469}, [matrix+3732]; + cvt.u32.u16 %r11461, %rs7469; + cvt.s32.s8 %r11462, %r11461; + cvt.u32.u16 %r11463, %rs7468; + cvt.s32.s8 %r11464, %r11463; + cvt.u32.u16 %r11465, %rs7467; + cvt.s32.s8 %r11466, %r11465; + cvt.u32.u16 %r11467, %rs7466; + cvt.s32.s8 %r11468, %r11467; + mad.lo.s32 %r11469, %r56, %r11468, %r11460; + mad.lo.s32 %r11470, %r57, %r11466, %r11469; + mad.lo.s32 %r11471, %r58, %r11464, %r11470; + mad.lo.s32 %r11472, %r59, %r11462, %r11471; + ld.const.v4.u8 {%rs7474, %rs7475, %rs7476, %rs7477}, [matrix+3736]; + cvt.u32.u16 %r11473, %rs7477; + cvt.s32.s8 %r11474, %r11473; + cvt.u32.u16 %r11475, %rs7476; + cvt.s32.s8 %r11476, %r11475; + cvt.u32.u16 %r11477, %rs7475; + cvt.s32.s8 %r11478, %r11477; + cvt.u32.u16 %r11479, %rs7474; + cvt.s32.s8 %r11480, %r11479; + mad.lo.s32 %r11481, %r61, %r11480, %r11472; + mad.lo.s32 %r11482, %r62, %r11478, %r11481; + mad.lo.s32 %r11483, %r64, %r11476, %r11482; + mad.lo.s32 %r11484, %r65, %r11474, %r11483; + ld.const.v4.u8 {%rs7482, %rs7483, %rs7484, %rs7485}, [matrix+3740]; + cvt.u32.u16 %r11485, %rs7485; + cvt.s32.s8 %r11486, %r11485; + cvt.u32.u16 %r11487, %rs7484; + cvt.s32.s8 %r11488, %r11487; + cvt.u32.u16 %r11489, %rs7483; + cvt.s32.s8 %r11490, %r11489; + cvt.u32.u16 %r11491, %rs7482; + cvt.s32.s8 %r11492, %r11491; + mad.lo.s32 %r11493, %r67, %r11492, %r11484; + mad.lo.s32 %r11494, %r68, %r11490, %r11493; + mad.lo.s32 %r11495, %r69, %r11488, %r11494; + mad.lo.s32 %r11496, %r70, %r11486, %r11495; + ld.const.v4.u8 {%rs7490, %rs7491, %rs7492, %rs7493}, [matrix+3744]; + cvt.u32.u16 %r11497, %rs7493; + cvt.s32.s8 %r11498, %r11497; + cvt.u32.u16 %r11499, %rs7492; + cvt.s32.s8 %r11500, %r11499; + cvt.u32.u16 %r11501, %rs7491; + cvt.s32.s8 %r11502, %r11501; + cvt.u32.u16 %r11503, %rs7490; + cvt.s32.s8 %r11504, %r11503; + mad.lo.s32 %r11505, %r222, %r11504, %r11496; + mad.lo.s32 %r11506, %r72, %r11502, %r11505; + mad.lo.s32 %r11507, %r73, %r11500, %r11506; + mad.lo.s32 %r11508, %r74, %r11498, %r11507; + ld.const.v4.u8 {%rs7498, %rs7499, %rs7500, %rs7501}, [matrix+3748]; + cvt.u32.u16 %r11509, %rs7501; + cvt.s32.s8 %r11510, %r11509; + cvt.u32.u16 %r11511, %rs7500; + cvt.s32.s8 %r11512, %r11511; + cvt.u32.u16 %r11513, %rs7499; + cvt.s32.s8 %r11514, %r11513; + cvt.u32.u16 %r11515, %rs7498; + cvt.s32.s8 %r11516, %r11515; + mad.lo.s32 %r11517, %r75, %r11516, %r11508; + mad.lo.s32 %r11518, %r76, %r11514, %r11517; + mad.lo.s32 %r11519, %r77, %r11512, %r11518; + mad.lo.s32 %r11520, %r78, %r11510, %r11519; + ld.const.v4.u8 {%rs7506, %rs7507, %rs7508, %rs7509}, [matrix+3752]; + cvt.u32.u16 %r11521, %rs7509; + cvt.s32.s8 %r11522, %r11521; + cvt.u32.u16 %r11523, %rs7508; + cvt.s32.s8 %r11524, %r11523; + cvt.u32.u16 %r11525, %rs7507; + cvt.s32.s8 %r11526, %r11525; + cvt.u32.u16 %r11527, %rs7506; + cvt.s32.s8 %r11528, %r11527; + mad.lo.s32 %r11529, %r80, %r11528, %r11520; + mad.lo.s32 %r11530, %r81, %r11526, %r11529; + mad.lo.s32 %r11531, %r83, %r11524, %r11530; + mad.lo.s32 %r11532, %r84, %r11522, %r11531; + ld.const.v4.u8 {%rs7514, %rs7515, %rs7516, %rs7517}, [matrix+3756]; + cvt.u32.u16 %r11533, %rs7517; + cvt.s32.s8 %r11534, %r11533; + cvt.u32.u16 %r11535, %rs7516; + cvt.s32.s8 %r11536, %r11535; + cvt.u32.u16 %r11537, %rs7515; + cvt.s32.s8 %r11538, %r11537; + cvt.u32.u16 %r11539, %rs7514; + cvt.s32.s8 %r11540, %r11539; + mad.lo.s32 %r11541, %r86, %r11540, %r11532; + 
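// The multiplier registers (%r34-%r115 plus %r124, %r173, %r222, %r271) are reused unchanged across every unrolled dot product in this sequence.
+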
mad.lo.s32 %r11542, %r87, %r11538, %r11541; + mad.lo.s32 %r11543, %r88, %r11536, %r11542; + mad.lo.s32 %r11544, %r89, %r11534, %r11543; + ld.const.v4.u8 {%rs7522, %rs7523, %rs7524, %rs7525}, [matrix+3760]; + cvt.u32.u16 %r11545, %rs7525; + cvt.s32.s8 %r11546, %r11545; + cvt.u32.u16 %r11547, %rs7524; + cvt.s32.s8 %r11548, %r11547; + cvt.u32.u16 %r11549, %rs7523; + cvt.s32.s8 %r11550, %r11549; + cvt.u32.u16 %r11551, %rs7522; + cvt.s32.s8 %r11552, %r11551; + mad.lo.s32 %r11553, %r271, %r11552, %r11544; + mad.lo.s32 %r11554, %r91, %r11550, %r11553; + mad.lo.s32 %r11555, %r93, %r11548, %r11554; + mad.lo.s32 %r11556, %r94, %r11546, %r11555; + ld.const.v4.u8 {%rs7530, %rs7531, %rs7532, %rs7533}, [matrix+3764]; + cvt.u32.u16 %r11557, %rs7533; + cvt.s32.s8 %r11558, %r11557; + cvt.u32.u16 %r11559, %rs7532; + cvt.s32.s8 %r11560, %r11559; + cvt.u32.u16 %r11561, %rs7531; + cvt.s32.s8 %r11562, %r11561; + cvt.u32.u16 %r11563, %rs7530; + cvt.s32.s8 %r11564, %r11563; + mad.lo.s32 %r11565, %r96, %r11564, %r11556; + mad.lo.s32 %r11566, %r97, %r11562, %r11565; + mad.lo.s32 %r11567, %r99, %r11560, %r11566; + mad.lo.s32 %r11568, %r100, %r11558, %r11567; + ld.const.v4.u8 {%rs7538, %rs7539, %rs7540, %rs7541}, [matrix+3768]; + cvt.u32.u16 %r11569, %rs7541; + cvt.s32.s8 %r11570, %r11569; + cvt.u32.u16 %r11571, %rs7540; + cvt.s32.s8 %r11572, %r11571; + cvt.u32.u16 %r11573, %rs7539; + cvt.s32.s8 %r11574, %r11573; + cvt.u32.u16 %r11575, %rs7538; + cvt.s32.s8 %r11576, %r11575; + mad.lo.s32 %r11577, %r103, %r11576, %r11568; + mad.lo.s32 %r11578, %r104, %r11574, %r11577; + mad.lo.s32 %r11579, %r107, %r11572, %r11578; + mad.lo.s32 %r11580, %r108, %r11570, %r11579; + ld.const.v4.u8 {%rs7546, %rs7547, %rs7548, %rs7549}, [matrix+3772]; + cvt.u32.u16 %r11581, %rs7549; + cvt.s32.s8 %r11582, %r11581; + cvt.u32.u16 %r11583, %rs7548; + cvt.s32.s8 %r11584, %r11583; + cvt.u32.u16 %r11585, %rs7547; + cvt.s32.s8 %r11586, %r11585; + cvt.u32.u16 %r11587, %rs7546; + cvt.s32.s8 %r11588, %r11587; + mad.lo.s32 %r11589, %r111, %r11588, %r11580; + mad.lo.s32 %r11590, %r112, %r11586, %r11589; + mad.lo.s32 %r11591, %r114, %r11584, %r11590; + mad.lo.s32 %r11592, %r115, %r11582, %r11591; + ld.const.v4.u8 {%rs7554, %rs7555, %rs7556, %rs7557}, [matrix+3776]; + cvt.u32.u16 %r11593, %rs7557; + cvt.s32.s8 %r11594, %r11593; + cvt.u32.u16 %r11595, %rs7556; + cvt.s32.s8 %r11596, %r11595; + cvt.u32.u16 %r11597, %rs7554; + cvt.s32.s8 %r11598, %r11597; + cvt.u32.u16 %r11599, %rs7555; + cvt.s32.s8 %r11600, %r11599; + mul.lo.s32 %r11601, %r34, %r11600; + mad.lo.s32 %r11602, %r124, %r11598, %r11601; + mad.lo.s32 %r11603, %r35, %r11596, %r11602; + mad.lo.s32 %r11604, %r36, %r11594, %r11603; + ld.const.v4.u8 {%rs7562, %rs7563, %rs7564, %rs7565}, [matrix+3780]; + cvt.u32.u16 %r11605, %rs7565; + cvt.s32.s8 %r11606, %r11605; + cvt.u32.u16 %r11607, %rs7564; + cvt.s32.s8 %r11608, %r11607; + cvt.u32.u16 %r11609, %rs7563; + cvt.s32.s8 %r11610, %r11609; + cvt.u32.u16 %r11611, %rs7562; + cvt.s32.s8 %r11612, %r11611; + mad.lo.s32 %r11613, %r37, %r11612, %r11604; + mad.lo.s32 %r11614, %r38, %r11610, %r11613; + mad.lo.s32 %r11615, %r39, %r11608, %r11614; + mad.lo.s32 %r11616, %r40, %r11606, %r11615; + ld.const.v4.u8 {%rs7570, %rs7571, %rs7572, %rs7573}, [matrix+3784]; + cvt.u32.u16 %r11617, %rs7573; + cvt.s32.s8 %r11618, %r11617; + cvt.u32.u16 %r11619, %rs7572; + cvt.s32.s8 %r11620, %r11619; + cvt.u32.u16 %r11621, %rs7571; + cvt.s32.s8 %r11622, %r11621; + cvt.u32.u16 %r11623, %rs7570; + cvt.s32.s8 %r11624, %r11623; + mad.lo.s32 %r11625, %r42, %r11624, %r11616; + mad.lo.s32 
%r11626, %r43, %r11622, %r11625; + mad.lo.s32 %r11627, %r45, %r11620, %r11626; + mad.lo.s32 %r11628, %r46, %r11618, %r11627; + ld.const.v4.u8 {%rs7578, %rs7579, %rs7580, %rs7581}, [matrix+3788]; + cvt.u32.u16 %r11629, %rs7581; + cvt.s32.s8 %r11630, %r11629; + cvt.u32.u16 %r11631, %rs7580; + cvt.s32.s8 %r11632, %r11631; + cvt.u32.u16 %r11633, %rs7579; + cvt.s32.s8 %r11634, %r11633; + cvt.u32.u16 %r11635, %rs7578; + cvt.s32.s8 %r11636, %r11635; + mad.lo.s32 %r11637, %r48, %r11636, %r11628; + mad.lo.s32 %r11638, %r49, %r11634, %r11637; + mad.lo.s32 %r11639, %r50, %r11632, %r11638; + mad.lo.s32 %r11640, %r51, %r11630, %r11639; + ld.const.v4.u8 {%rs7586, %rs7587, %rs7588, %rs7589}, [matrix+3792]; + cvt.u32.u16 %r11641, %rs7589; + cvt.s32.s8 %r11642, %r11641; + cvt.u32.u16 %r11643, %rs7588; + cvt.s32.s8 %r11644, %r11643; + cvt.u32.u16 %r11645, %rs7587; + cvt.s32.s8 %r11646, %r11645; + cvt.u32.u16 %r11647, %rs7586; + cvt.s32.s8 %r11648, %r11647; + mad.lo.s32 %r11649, %r173, %r11648, %r11640; + mad.lo.s32 %r11650, %r53, %r11646, %r11649; + mad.lo.s32 %r11651, %r54, %r11644, %r11650; + mad.lo.s32 %r11652, %r55, %r11642, %r11651; + ld.const.v4.u8 {%rs7594, %rs7595, %rs7596, %rs7597}, [matrix+3796]; + cvt.u32.u16 %r11653, %rs7597; + cvt.s32.s8 %r11654, %r11653; + cvt.u32.u16 %r11655, %rs7596; + cvt.s32.s8 %r11656, %r11655; + cvt.u32.u16 %r11657, %rs7595; + cvt.s32.s8 %r11658, %r11657; + cvt.u32.u16 %r11659, %rs7594; + cvt.s32.s8 %r11660, %r11659; + mad.lo.s32 %r11661, %r56, %r11660, %r11652; + mad.lo.s32 %r11662, %r57, %r11658, %r11661; + mad.lo.s32 %r11663, %r58, %r11656, %r11662; + mad.lo.s32 %r11664, %r59, %r11654, %r11663; + ld.const.v4.u8 {%rs7602, %rs7603, %rs7604, %rs7605}, [matrix+3800]; + cvt.u32.u16 %r11665, %rs7605; + cvt.s32.s8 %r11666, %r11665; + cvt.u32.u16 %r11667, %rs7604; + cvt.s32.s8 %r11668, %r11667; + cvt.u32.u16 %r11669, %rs7603; + cvt.s32.s8 %r11670, %r11669; + cvt.u32.u16 %r11671, %rs7602; + cvt.s32.s8 %r11672, %r11671; + mad.lo.s32 %r11673, %r61, %r11672, %r11664; + mad.lo.s32 %r11674, %r62, %r11670, %r11673; + mad.lo.s32 %r11675, %r64, %r11668, %r11674; + mad.lo.s32 %r11676, %r65, %r11666, %r11675; + ld.const.v4.u8 {%rs7610, %rs7611, %rs7612, %rs7613}, [matrix+3804]; + cvt.u32.u16 %r11677, %rs7613; + cvt.s32.s8 %r11678, %r11677; + cvt.u32.u16 %r11679, %rs7612; + cvt.s32.s8 %r11680, %r11679; + cvt.u32.u16 %r11681, %rs7611; + cvt.s32.s8 %r11682, %r11681; + cvt.u32.u16 %r11683, %rs7610; + cvt.s32.s8 %r11684, %r11683; + mad.lo.s32 %r11685, %r67, %r11684, %r11676; + mad.lo.s32 %r11686, %r68, %r11682, %r11685; + mad.lo.s32 %r11687, %r69, %r11680, %r11686; + mad.lo.s32 %r11688, %r70, %r11678, %r11687; + ld.const.v4.u8 {%rs7618, %rs7619, %rs7620, %rs7621}, [matrix+3808]; + cvt.u32.u16 %r11689, %rs7621; + cvt.s32.s8 %r11690, %r11689; + cvt.u32.u16 %r11691, %rs7620; + cvt.s32.s8 %r11692, %r11691; + cvt.u32.u16 %r11693, %rs7619; + cvt.s32.s8 %r11694, %r11693; + cvt.u32.u16 %r11695, %rs7618; + cvt.s32.s8 %r11696, %r11695; + mad.lo.s32 %r11697, %r222, %r11696, %r11688; + mad.lo.s32 %r11698, %r72, %r11694, %r11697; + mad.lo.s32 %r11699, %r73, %r11692, %r11698; + mad.lo.s32 %r11700, %r74, %r11690, %r11699; + ld.const.v4.u8 {%rs7626, %rs7627, %rs7628, %rs7629}, [matrix+3812]; + cvt.u32.u16 %r11701, %rs7629; + cvt.s32.s8 %r11702, %r11701; + cvt.u32.u16 %r11703, %rs7628; + cvt.s32.s8 %r11704, %r11703; + cvt.u32.u16 %r11705, %rs7627; + cvt.s32.s8 %r11706, %r11705; + cvt.u32.u16 %r11707, %rs7626; + cvt.s32.s8 %r11708, %r11707; + mad.lo.s32 %r11709, %r75, %r11708, %r11700; + mad.lo.s32 %r11710, 
%r76, %r11706, %r11709; + mad.lo.s32 %r11711, %r77, %r11704, %r11710; + mad.lo.s32 %r11712, %r78, %r11702, %r11711; + ld.const.v4.u8 {%rs7634, %rs7635, %rs7636, %rs7637}, [matrix+3816]; + cvt.u32.u16 %r11713, %rs7637; + cvt.s32.s8 %r11714, %r11713; + cvt.u32.u16 %r11715, %rs7636; + cvt.s32.s8 %r11716, %r11715; + cvt.u32.u16 %r11717, %rs7635; + cvt.s32.s8 %r11718, %r11717; + cvt.u32.u16 %r11719, %rs7634; + cvt.s32.s8 %r11720, %r11719; + mad.lo.s32 %r11721, %r80, %r11720, %r11712; + mad.lo.s32 %r11722, %r81, %r11718, %r11721; + mad.lo.s32 %r11723, %r83, %r11716, %r11722; + mad.lo.s32 %r11724, %r84, %r11714, %r11723; + ld.const.v4.u8 {%rs7642, %rs7643, %rs7644, %rs7645}, [matrix+3820]; + cvt.u32.u16 %r11725, %rs7645; + cvt.s32.s8 %r11726, %r11725; + cvt.u32.u16 %r11727, %rs7644; + cvt.s32.s8 %r11728, %r11727; + cvt.u32.u16 %r11729, %rs7643; + cvt.s32.s8 %r11730, %r11729; + cvt.u32.u16 %r11731, %rs7642; + cvt.s32.s8 %r11732, %r11731; + mad.lo.s32 %r11733, %r86, %r11732, %r11724; + mad.lo.s32 %r11734, %r87, %r11730, %r11733; + mad.lo.s32 %r11735, %r88, %r11728, %r11734; + mad.lo.s32 %r11736, %r89, %r11726, %r11735; + ld.const.v4.u8 {%rs7650, %rs7651, %rs7652, %rs7653}, [matrix+3824]; + cvt.u32.u16 %r11737, %rs7653; + cvt.s32.s8 %r11738, %r11737; + cvt.u32.u16 %r11739, %rs7652; + cvt.s32.s8 %r11740, %r11739; + cvt.u32.u16 %r11741, %rs7651; + cvt.s32.s8 %r11742, %r11741; + cvt.u32.u16 %r11743, %rs7650; + cvt.s32.s8 %r11744, %r11743; + mad.lo.s32 %r11745, %r271, %r11744, %r11736; + mad.lo.s32 %r11746, %r91, %r11742, %r11745; + mad.lo.s32 %r11747, %r93, %r11740, %r11746; + mad.lo.s32 %r11748, %r94, %r11738, %r11747; + ld.const.v4.u8 {%rs7658, %rs7659, %rs7660, %rs7661}, [matrix+3828]; + cvt.u32.u16 %r11749, %rs7661; + cvt.s32.s8 %r11750, %r11749; + cvt.u32.u16 %r11751, %rs7660; + cvt.s32.s8 %r11752, %r11751; + cvt.u32.u16 %r11753, %rs7659; + cvt.s32.s8 %r11754, %r11753; + cvt.u32.u16 %r11755, %rs7658; + cvt.s32.s8 %r11756, %r11755; + mad.lo.s32 %r11757, %r96, %r11756, %r11748; + mad.lo.s32 %r11758, %r97, %r11754, %r11757; + mad.lo.s32 %r11759, %r99, %r11752, %r11758; + mad.lo.s32 %r11760, %r100, %r11750, %r11759; + ld.const.v4.u8 {%rs7666, %rs7667, %rs7668, %rs7669}, [matrix+3832]; + cvt.u32.u16 %r11761, %rs7669; + cvt.s32.s8 %r11762, %r11761; + cvt.u32.u16 %r11763, %rs7668; + cvt.s32.s8 %r11764, %r11763; + cvt.u32.u16 %r11765, %rs7667; + cvt.s32.s8 %r11766, %r11765; + cvt.u32.u16 %r11767, %rs7666; + cvt.s32.s8 %r11768, %r11767; + mad.lo.s32 %r11769, %r103, %r11768, %r11760; + mad.lo.s32 %r11770, %r104, %r11766, %r11769; + mad.lo.s32 %r11771, %r107, %r11764, %r11770; + mad.lo.s32 %r11772, %r108, %r11762, %r11771; + ld.const.v4.u8 {%rs7674, %rs7675, %rs7676, %rs7677}, [matrix+3836]; + cvt.u32.u16 %r11773, %rs7677; + cvt.s32.s8 %r11774, %r11773; + cvt.u32.u16 %r11775, %rs7676; + cvt.s32.s8 %r11776, %r11775; + cvt.u32.u16 %r11777, %rs7675; + cvt.s32.s8 %r11778, %r11777; + cvt.u32.u16 %r11779, %rs7674; + cvt.s32.s8 %r11780, %r11779; + mad.lo.s32 %r11781, %r111, %r11780, %r11772; + mad.lo.s32 %r11782, %r112, %r11778, %r11781; + mad.lo.s32 %r11783, %r114, %r11776, %r11782; + mad.lo.s32 %r11784, %r115, %r11774, %r11783; + shr.u32 %r11785, %r11592, 6; + and.b32 %r11786, %r11785, 240; + shr.u32 %r11787, %r11784, 10; + or.b32 %r11788, %r11787, %r11786; + xor.b32 %r11789, %r105, %r11788; + cvt.u64.u32 %rd404, %r11789; + ld.const.v4.u8 {%rs7682, %rs7683, %rs7684, %rs7685}, [matrix+3840]; + cvt.u32.u16 %r11790, %rs7685; + cvt.s32.s8 %r11791, %r11790; + cvt.u32.u16 %r11792, %rs7684; + cvt.s32.s8 %r11793, 
%r11792; + cvt.u32.u16 %r11794, %rs7682; + cvt.s32.s8 %r11795, %r11794; + cvt.u32.u16 %r11796, %rs7683; + cvt.s32.s8 %r11797, %r11796; + mul.lo.s32 %r11798, %r34, %r11797; + mad.lo.s32 %r11799, %r124, %r11795, %r11798; + mad.lo.s32 %r11800, %r35, %r11793, %r11799; + mad.lo.s32 %r11801, %r36, %r11791, %r11800; + ld.const.v4.u8 {%rs7690, %rs7691, %rs7692, %rs7693}, [matrix+3844]; + cvt.u32.u16 %r11802, %rs7693; + cvt.s32.s8 %r11803, %r11802; + cvt.u32.u16 %r11804, %rs7692; + cvt.s32.s8 %r11805, %r11804; + cvt.u32.u16 %r11806, %rs7691; + cvt.s32.s8 %r11807, %r11806; + cvt.u32.u16 %r11808, %rs7690; + cvt.s32.s8 %r11809, %r11808; + mad.lo.s32 %r11810, %r37, %r11809, %r11801; + mad.lo.s32 %r11811, %r38, %r11807, %r11810; + mad.lo.s32 %r11812, %r39, %r11805, %r11811; + mad.lo.s32 %r11813, %r40, %r11803, %r11812; + ld.const.v4.u8 {%rs7698, %rs7699, %rs7700, %rs7701}, [matrix+3848]; + cvt.u32.u16 %r11814, %rs7701; + cvt.s32.s8 %r11815, %r11814; + cvt.u32.u16 %r11816, %rs7700; + cvt.s32.s8 %r11817, %r11816; + cvt.u32.u16 %r11818, %rs7699; + cvt.s32.s8 %r11819, %r11818; + cvt.u32.u16 %r11820, %rs7698; + cvt.s32.s8 %r11821, %r11820; + mad.lo.s32 %r11822, %r42, %r11821, %r11813; + mad.lo.s32 %r11823, %r43, %r11819, %r11822; + mad.lo.s32 %r11824, %r45, %r11817, %r11823; + mad.lo.s32 %r11825, %r46, %r11815, %r11824; + ld.const.v4.u8 {%rs7706, %rs7707, %rs7708, %rs7709}, [matrix+3852]; + cvt.u32.u16 %r11826, %rs7709; + cvt.s32.s8 %r11827, %r11826; + cvt.u32.u16 %r11828, %rs7708; + cvt.s32.s8 %r11829, %r11828; + cvt.u32.u16 %r11830, %rs7707; + cvt.s32.s8 %r11831, %r11830; + cvt.u32.u16 %r11832, %rs7706; + cvt.s32.s8 %r11833, %r11832; + mad.lo.s32 %r11834, %r48, %r11833, %r11825; + mad.lo.s32 %r11835, %r49, %r11831, %r11834; + mad.lo.s32 %r11836, %r50, %r11829, %r11835; + mad.lo.s32 %r11837, %r51, %r11827, %r11836; + ld.const.v4.u8 {%rs7714, %rs7715, %rs7716, %rs7717}, [matrix+3856]; + cvt.u32.u16 %r11838, %rs7717; + cvt.s32.s8 %r11839, %r11838; + cvt.u32.u16 %r11840, %rs7716; + cvt.s32.s8 %r11841, %r11840; + cvt.u32.u16 %r11842, %rs7715; + cvt.s32.s8 %r11843, %r11842; + cvt.u32.u16 %r11844, %rs7714; + cvt.s32.s8 %r11845, %r11844; + mad.lo.s32 %r11846, %r173, %r11845, %r11837; + mad.lo.s32 %r11847, %r53, %r11843, %r11846; + mad.lo.s32 %r11848, %r54, %r11841, %r11847; + mad.lo.s32 %r11849, %r55, %r11839, %r11848; + ld.const.v4.u8 {%rs7722, %rs7723, %rs7724, %rs7725}, [matrix+3860]; + cvt.u32.u16 %r11850, %rs7725; + cvt.s32.s8 %r11851, %r11850; + cvt.u32.u16 %r11852, %rs7724; + cvt.s32.s8 %r11853, %r11852; + cvt.u32.u16 %r11854, %rs7723; + cvt.s32.s8 %r11855, %r11854; + cvt.u32.u16 %r11856, %rs7722; + cvt.s32.s8 %r11857, %r11856; + mad.lo.s32 %r11858, %r56, %r11857, %r11849; + mad.lo.s32 %r11859, %r57, %r11855, %r11858; + mad.lo.s32 %r11860, %r58, %r11853, %r11859; + mad.lo.s32 %r11861, %r59, %r11851, %r11860; + ld.const.v4.u8 {%rs7730, %rs7731, %rs7732, %rs7733}, [matrix+3864]; + cvt.u32.u16 %r11862, %rs7733; + cvt.s32.s8 %r11863, %r11862; + cvt.u32.u16 %r11864, %rs7732; + cvt.s32.s8 %r11865, %r11864; + cvt.u32.u16 %r11866, %rs7731; + cvt.s32.s8 %r11867, %r11866; + cvt.u32.u16 %r11868, %rs7730; + cvt.s32.s8 %r11869, %r11868; + mad.lo.s32 %r11870, %r61, %r11869, %r11861; + mad.lo.s32 %r11871, %r62, %r11867, %r11870; + mad.lo.s32 %r11872, %r64, %r11865, %r11871; + mad.lo.s32 %r11873, %r65, %r11863, %r11872; + ld.const.v4.u8 {%rs7738, %rs7739, %rs7740, %rs7741}, [matrix+3868]; + cvt.u32.u16 %r11874, %rs7741; + cvt.s32.s8 %r11875, %r11874; + cvt.u32.u16 %r11876, %rs7740; + cvt.s32.s8 %r11877, %r11876; + 
cvt.u32.u16 %r11878, %rs7739; + cvt.s32.s8 %r11879, %r11878; + cvt.u32.u16 %r11880, %rs7738; + cvt.s32.s8 %r11881, %r11880; + mad.lo.s32 %r11882, %r67, %r11881, %r11873; + mad.lo.s32 %r11883, %r68, %r11879, %r11882; + mad.lo.s32 %r11884, %r69, %r11877, %r11883; + mad.lo.s32 %r11885, %r70, %r11875, %r11884; + ld.const.v4.u8 {%rs7746, %rs7747, %rs7748, %rs7749}, [matrix+3872]; + cvt.u32.u16 %r11886, %rs7749; + cvt.s32.s8 %r11887, %r11886; + cvt.u32.u16 %r11888, %rs7748; + cvt.s32.s8 %r11889, %r11888; + cvt.u32.u16 %r11890, %rs7747; + cvt.s32.s8 %r11891, %r11890; + cvt.u32.u16 %r11892, %rs7746; + cvt.s32.s8 %r11893, %r11892; + mad.lo.s32 %r11894, %r222, %r11893, %r11885; + mad.lo.s32 %r11895, %r72, %r11891, %r11894; + mad.lo.s32 %r11896, %r73, %r11889, %r11895; + mad.lo.s32 %r11897, %r74, %r11887, %r11896; + ld.const.v4.u8 {%rs7754, %rs7755, %rs7756, %rs7757}, [matrix+3876]; + cvt.u32.u16 %r11898, %rs7757; + cvt.s32.s8 %r11899, %r11898; + cvt.u32.u16 %r11900, %rs7756; + cvt.s32.s8 %r11901, %r11900; + cvt.u32.u16 %r11902, %rs7755; + cvt.s32.s8 %r11903, %r11902; + cvt.u32.u16 %r11904, %rs7754; + cvt.s32.s8 %r11905, %r11904; + mad.lo.s32 %r11906, %r75, %r11905, %r11897; + mad.lo.s32 %r11907, %r76, %r11903, %r11906; + mad.lo.s32 %r11908, %r77, %r11901, %r11907; + mad.lo.s32 %r11909, %r78, %r11899, %r11908; + ld.const.v4.u8 {%rs7762, %rs7763, %rs7764, %rs7765}, [matrix+3880]; + cvt.u32.u16 %r11910, %rs7765; + cvt.s32.s8 %r11911, %r11910; + cvt.u32.u16 %r11912, %rs7764; + cvt.s32.s8 %r11913, %r11912; + cvt.u32.u16 %r11914, %rs7763; + cvt.s32.s8 %r11915, %r11914; + cvt.u32.u16 %r11916, %rs7762; + cvt.s32.s8 %r11917, %r11916; + mad.lo.s32 %r11918, %r80, %r11917, %r11909; + mad.lo.s32 %r11919, %r81, %r11915, %r11918; + mad.lo.s32 %r11920, %r83, %r11913, %r11919; + mad.lo.s32 %r11921, %r84, %r11911, %r11920; + ld.const.v4.u8 {%rs7770, %rs7771, %rs7772, %rs7773}, [matrix+3884]; + cvt.u32.u16 %r11922, %rs7773; + cvt.s32.s8 %r11923, %r11922; + cvt.u32.u16 %r11924, %rs7772; + cvt.s32.s8 %r11925, %r11924; + cvt.u32.u16 %r11926, %rs7771; + cvt.s32.s8 %r11927, %r11926; + cvt.u32.u16 %r11928, %rs7770; + cvt.s32.s8 %r11929, %r11928; + mad.lo.s32 %r11930, %r86, %r11929, %r11921; + mad.lo.s32 %r11931, %r87, %r11927, %r11930; + mad.lo.s32 %r11932, %r88, %r11925, %r11931; + mad.lo.s32 %r11933, %r89, %r11923, %r11932; + ld.const.v4.u8 {%rs7778, %rs7779, %rs7780, %rs7781}, [matrix+3888]; + cvt.u32.u16 %r11934, %rs7781; + cvt.s32.s8 %r11935, %r11934; + cvt.u32.u16 %r11936, %rs7780; + cvt.s32.s8 %r11937, %r11936; + cvt.u32.u16 %r11938, %rs7779; + cvt.s32.s8 %r11939, %r11938; + cvt.u32.u16 %r11940, %rs7778; + cvt.s32.s8 %r11941, %r11940; + mad.lo.s32 %r11942, %r271, %r11941, %r11933; + mad.lo.s32 %r11943, %r91, %r11939, %r11942; + mad.lo.s32 %r11944, %r93, %r11937, %r11943; + mad.lo.s32 %r11945, %r94, %r11935, %r11944; + ld.const.v4.u8 {%rs7786, %rs7787, %rs7788, %rs7789}, [matrix+3892]; + cvt.u32.u16 %r11946, %rs7789; + cvt.s32.s8 %r11947, %r11946; + cvt.u32.u16 %r11948, %rs7788; + cvt.s32.s8 %r11949, %r11948; + cvt.u32.u16 %r11950, %rs7787; + cvt.s32.s8 %r11951, %r11950; + cvt.u32.u16 %r11952, %rs7786; + cvt.s32.s8 %r11953, %r11952; + mad.lo.s32 %r11954, %r96, %r11953, %r11945; + mad.lo.s32 %r11955, %r97, %r11951, %r11954; + mad.lo.s32 %r11956, %r99, %r11949, %r11955; + mad.lo.s32 %r11957, %r100, %r11947, %r11956; + ld.const.v4.u8 {%rs7794, %rs7795, %rs7796, %rs7797}, [matrix+3896]; + cvt.u32.u16 %r11958, %rs7797; + cvt.s32.s8 %r11959, %r11958; + cvt.u32.u16 %r11960, %rs7796; + cvt.s32.s8 %r11961, %r11960; + 
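// Coefficients are consumed in 4-byte groups through [matrix+4092], so the constant bank holds 4096 bytes in total.
+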
cvt.u32.u16 %r11962, %rs7795; + cvt.s32.s8 %r11963, %r11962; + cvt.u32.u16 %r11964, %rs7794; + cvt.s32.s8 %r11965, %r11964; + mad.lo.s32 %r11966, %r103, %r11965, %r11957; + mad.lo.s32 %r11967, %r104, %r11963, %r11966; + mad.lo.s32 %r11968, %r107, %r11961, %r11967; + mad.lo.s32 %r11969, %r108, %r11959, %r11968; + ld.const.v4.u8 {%rs7802, %rs7803, %rs7804, %rs7805}, [matrix+3900]; + cvt.u32.u16 %r11970, %rs7805; + cvt.s32.s8 %r11971, %r11970; + cvt.u32.u16 %r11972, %rs7804; + cvt.s32.s8 %r11973, %r11972; + cvt.u32.u16 %r11974, %rs7803; + cvt.s32.s8 %r11975, %r11974; + cvt.u32.u16 %r11976, %rs7802; + cvt.s32.s8 %r11977, %r11976; + mad.lo.s32 %r11978, %r111, %r11977, %r11969; + mad.lo.s32 %r11979, %r112, %r11975, %r11978; + mad.lo.s32 %r11980, %r114, %r11973, %r11979; + mad.lo.s32 %r11981, %r115, %r11971, %r11980; + ld.const.v4.u8 {%rs7810, %rs7811, %rs7812, %rs7813}, [matrix+3904]; + cvt.u32.u16 %r11982, %rs7813; + cvt.s32.s8 %r11983, %r11982; + cvt.u32.u16 %r11984, %rs7812; + cvt.s32.s8 %r11985, %r11984; + cvt.u32.u16 %r11986, %rs7810; + cvt.s32.s8 %r11987, %r11986; + cvt.u32.u16 %r11988, %rs7811; + cvt.s32.s8 %r11989, %r11988; + mul.lo.s32 %r11990, %r34, %r11989; + mad.lo.s32 %r11991, %r124, %r11987, %r11990; + mad.lo.s32 %r11992, %r35, %r11985, %r11991; + mad.lo.s32 %r11993, %r36, %r11983, %r11992; + ld.const.v4.u8 {%rs7818, %rs7819, %rs7820, %rs7821}, [matrix+3908]; + cvt.u32.u16 %r11994, %rs7821; + cvt.s32.s8 %r11995, %r11994; + cvt.u32.u16 %r11996, %rs7820; + cvt.s32.s8 %r11997, %r11996; + cvt.u32.u16 %r11998, %rs7819; + cvt.s32.s8 %r11999, %r11998; + cvt.u32.u16 %r12000, %rs7818; + cvt.s32.s8 %r12001, %r12000; + mad.lo.s32 %r12002, %r37, %r12001, %r11993; + mad.lo.s32 %r12003, %r38, %r11999, %r12002; + mad.lo.s32 %r12004, %r39, %r11997, %r12003; + mad.lo.s32 %r12005, %r40, %r11995, %r12004; + ld.const.v4.u8 {%rs7826, %rs7827, %rs7828, %rs7829}, [matrix+3912]; + cvt.u32.u16 %r12006, %rs7829; + cvt.s32.s8 %r12007, %r12006; + cvt.u32.u16 %r12008, %rs7828; + cvt.s32.s8 %r12009, %r12008; + cvt.u32.u16 %r12010, %rs7827; + cvt.s32.s8 %r12011, %r12010; + cvt.u32.u16 %r12012, %rs7826; + cvt.s32.s8 %r12013, %r12012; + mad.lo.s32 %r12014, %r42, %r12013, %r12005; + mad.lo.s32 %r12015, %r43, %r12011, %r12014; + mad.lo.s32 %r12016, %r45, %r12009, %r12015; + mad.lo.s32 %r12017, %r46, %r12007, %r12016; + ld.const.v4.u8 {%rs7834, %rs7835, %rs7836, %rs7837}, [matrix+3916]; + cvt.u32.u16 %r12018, %rs7837; + cvt.s32.s8 %r12019, %r12018; + cvt.u32.u16 %r12020, %rs7836; + cvt.s32.s8 %r12021, %r12020; + cvt.u32.u16 %r12022, %rs7835; + cvt.s32.s8 %r12023, %r12022; + cvt.u32.u16 %r12024, %rs7834; + cvt.s32.s8 %r12025, %r12024; + mad.lo.s32 %r12026, %r48, %r12025, %r12017; + mad.lo.s32 %r12027, %r49, %r12023, %r12026; + mad.lo.s32 %r12028, %r50, %r12021, %r12027; + mad.lo.s32 %r12029, %r51, %r12019, %r12028; + ld.const.v4.u8 {%rs7842, %rs7843, %rs7844, %rs7845}, [matrix+3920]; + cvt.u32.u16 %r12030, %rs7845; + cvt.s32.s8 %r12031, %r12030; + cvt.u32.u16 %r12032, %rs7844; + cvt.s32.s8 %r12033, %r12032; + cvt.u32.u16 %r12034, %rs7843; + cvt.s32.s8 %r12035, %r12034; + cvt.u32.u16 %r12036, %rs7842; + cvt.s32.s8 %r12037, %r12036; + mad.lo.s32 %r12038, %r173, %r12037, %r12029; + mad.lo.s32 %r12039, %r53, %r12035, %r12038; + mad.lo.s32 %r12040, %r54, %r12033, %r12039; + mad.lo.s32 %r12041, %r55, %r12031, %r12040; + ld.const.v4.u8 {%rs7850, %rs7851, %rs7852, %rs7853}, [matrix+3924]; + cvt.u32.u16 %r12042, %rs7853; + cvt.s32.s8 %r12043, %r12042; + cvt.u32.u16 %r12044, %rs7852; + cvt.s32.s8 %r12045, %r12044; + cvt.u32.u16 
%r12046, %rs7851; + cvt.s32.s8 %r12047, %r12046; + cvt.u32.u16 %r12048, %rs7850; + cvt.s32.s8 %r12049, %r12048; + mad.lo.s32 %r12050, %r56, %r12049, %r12041; + mad.lo.s32 %r12051, %r57, %r12047, %r12050; + mad.lo.s32 %r12052, %r58, %r12045, %r12051; + mad.lo.s32 %r12053, %r59, %r12043, %r12052; + ld.const.v4.u8 {%rs7858, %rs7859, %rs7860, %rs7861}, [matrix+3928]; + cvt.u32.u16 %r12054, %rs7861; + cvt.s32.s8 %r12055, %r12054; + cvt.u32.u16 %r12056, %rs7860; + cvt.s32.s8 %r12057, %r12056; + cvt.u32.u16 %r12058, %rs7859; + cvt.s32.s8 %r12059, %r12058; + cvt.u32.u16 %r12060, %rs7858; + cvt.s32.s8 %r12061, %r12060; + mad.lo.s32 %r12062, %r61, %r12061, %r12053; + mad.lo.s32 %r12063, %r62, %r12059, %r12062; + mad.lo.s32 %r12064, %r64, %r12057, %r12063; + mad.lo.s32 %r12065, %r65, %r12055, %r12064; + ld.const.v4.u8 {%rs7866, %rs7867, %rs7868, %rs7869}, [matrix+3932]; + cvt.u32.u16 %r12066, %rs7869; + cvt.s32.s8 %r12067, %r12066; + cvt.u32.u16 %r12068, %rs7868; + cvt.s32.s8 %r12069, %r12068; + cvt.u32.u16 %r12070, %rs7867; + cvt.s32.s8 %r12071, %r12070; + cvt.u32.u16 %r12072, %rs7866; + cvt.s32.s8 %r12073, %r12072; + mad.lo.s32 %r12074, %r67, %r12073, %r12065; + mad.lo.s32 %r12075, %r68, %r12071, %r12074; + mad.lo.s32 %r12076, %r69, %r12069, %r12075; + mad.lo.s32 %r12077, %r70, %r12067, %r12076; + ld.const.v4.u8 {%rs7874, %rs7875, %rs7876, %rs7877}, [matrix+3936]; + cvt.u32.u16 %r12078, %rs7877; + cvt.s32.s8 %r12079, %r12078; + cvt.u32.u16 %r12080, %rs7876; + cvt.s32.s8 %r12081, %r12080; + cvt.u32.u16 %r12082, %rs7875; + cvt.s32.s8 %r12083, %r12082; + cvt.u32.u16 %r12084, %rs7874; + cvt.s32.s8 %r12085, %r12084; + mad.lo.s32 %r12086, %r222, %r12085, %r12077; + mad.lo.s32 %r12087, %r72, %r12083, %r12086; + mad.lo.s32 %r12088, %r73, %r12081, %r12087; + mad.lo.s32 %r12089, %r74, %r12079, %r12088; + ld.const.v4.u8 {%rs7882, %rs7883, %rs7884, %rs7885}, [matrix+3940]; + cvt.u32.u16 %r12090, %rs7885; + cvt.s32.s8 %r12091, %r12090; + cvt.u32.u16 %r12092, %rs7884; + cvt.s32.s8 %r12093, %r12092; + cvt.u32.u16 %r12094, %rs7883; + cvt.s32.s8 %r12095, %r12094; + cvt.u32.u16 %r12096, %rs7882; + cvt.s32.s8 %r12097, %r12096; + mad.lo.s32 %r12098, %r75, %r12097, %r12089; + mad.lo.s32 %r12099, %r76, %r12095, %r12098; + mad.lo.s32 %r12100, %r77, %r12093, %r12099; + mad.lo.s32 %r12101, %r78, %r12091, %r12100; + ld.const.v4.u8 {%rs7890, %rs7891, %rs7892, %rs7893}, [matrix+3944]; + cvt.u32.u16 %r12102, %rs7893; + cvt.s32.s8 %r12103, %r12102; + cvt.u32.u16 %r12104, %rs7892; + cvt.s32.s8 %r12105, %r12104; + cvt.u32.u16 %r12106, %rs7891; + cvt.s32.s8 %r12107, %r12106; + cvt.u32.u16 %r12108, %rs7890; + cvt.s32.s8 %r12109, %r12108; + mad.lo.s32 %r12110, %r80, %r12109, %r12101; + mad.lo.s32 %r12111, %r81, %r12107, %r12110; + mad.lo.s32 %r12112, %r83, %r12105, %r12111; + mad.lo.s32 %r12113, %r84, %r12103, %r12112; + ld.const.v4.u8 {%rs7898, %rs7899, %rs7900, %rs7901}, [matrix+3948]; + cvt.u32.u16 %r12114, %rs7901; + cvt.s32.s8 %r12115, %r12114; + cvt.u32.u16 %r12116, %rs7900; + cvt.s32.s8 %r12117, %r12116; + cvt.u32.u16 %r12118, %rs7899; + cvt.s32.s8 %r12119, %r12118; + cvt.u32.u16 %r12120, %rs7898; + cvt.s32.s8 %r12121, %r12120; + mad.lo.s32 %r12122, %r86, %r12121, %r12113; + mad.lo.s32 %r12123, %r87, %r12119, %r12122; + mad.lo.s32 %r12124, %r88, %r12117, %r12123; + mad.lo.s32 %r12125, %r89, %r12115, %r12124; + ld.const.v4.u8 {%rs7906, %rs7907, %rs7908, %rs7909}, [matrix+3952]; + cvt.u32.u16 %r12126, %rs7909; + cvt.s32.s8 %r12127, %r12126; + cvt.u32.u16 %r12128, %rs7908; + cvt.s32.s8 %r12129, %r12128; + cvt.u32.u16 %r12130, 
%rs7907; + cvt.s32.s8 %r12131, %r12130; + cvt.u32.u16 %r12132, %rs7906; + cvt.s32.s8 %r12133, %r12132; + mad.lo.s32 %r12134, %r271, %r12133, %r12125; + mad.lo.s32 %r12135, %r91, %r12131, %r12134; + mad.lo.s32 %r12136, %r93, %r12129, %r12135; + mad.lo.s32 %r12137, %r94, %r12127, %r12136; + ld.const.v4.u8 {%rs7914, %rs7915, %rs7916, %rs7917}, [matrix+3956]; + cvt.u32.u16 %r12138, %rs7917; + cvt.s32.s8 %r12139, %r12138; + cvt.u32.u16 %r12140, %rs7916; + cvt.s32.s8 %r12141, %r12140; + cvt.u32.u16 %r12142, %rs7915; + cvt.s32.s8 %r12143, %r12142; + cvt.u32.u16 %r12144, %rs7914; + cvt.s32.s8 %r12145, %r12144; + mad.lo.s32 %r12146, %r96, %r12145, %r12137; + mad.lo.s32 %r12147, %r97, %r12143, %r12146; + mad.lo.s32 %r12148, %r99, %r12141, %r12147; + mad.lo.s32 %r12149, %r100, %r12139, %r12148; + ld.const.v4.u8 {%rs7922, %rs7923, %rs7924, %rs7925}, [matrix+3960]; + cvt.u32.u16 %r12150, %rs7925; + cvt.s32.s8 %r12151, %r12150; + cvt.u32.u16 %r12152, %rs7924; + cvt.s32.s8 %r12153, %r12152; + cvt.u32.u16 %r12154, %rs7923; + cvt.s32.s8 %r12155, %r12154; + cvt.u32.u16 %r12156, %rs7922; + cvt.s32.s8 %r12157, %r12156; + mad.lo.s32 %r12158, %r103, %r12157, %r12149; + mad.lo.s32 %r12159, %r104, %r12155, %r12158; + mad.lo.s32 %r12160, %r107, %r12153, %r12159; + mad.lo.s32 %r12161, %r108, %r12151, %r12160; + ld.const.v4.u8 {%rs7930, %rs7931, %rs7932, %rs7933}, [matrix+3964]; + cvt.u32.u16 %r12162, %rs7933; + cvt.s32.s8 %r12163, %r12162; + cvt.u32.u16 %r12164, %rs7932; + cvt.s32.s8 %r12165, %r12164; + cvt.u32.u16 %r12166, %rs7931; + cvt.s32.s8 %r12167, %r12166; + cvt.u32.u16 %r12168, %rs7930; + cvt.s32.s8 %r12169, %r12168; + mad.lo.s32 %r12170, %r111, %r12169, %r12161; + mad.lo.s32 %r12171, %r112, %r12167, %r12170; + mad.lo.s32 %r12172, %r114, %r12165, %r12171; + mad.lo.s32 %r12173, %r115, %r12163, %r12172; + shr.u32 %r12174, %r11981, 6; + and.b32 %r12175, %r12174, 240; + shr.u32 %r12176, %r12173, 10; + or.b32 %r12177, %r12176, %r12175; + xor.b32 %r12178, %r109, %r12177; + cvt.u64.u32 %rd405, %r12178; + ld.const.v4.u8 {%rs7938, %rs7939, %rs7940, %rs7941}, [matrix+3968]; + cvt.u32.u16 %r12179, %rs7941; + cvt.s32.s8 %r12180, %r12179; + cvt.u32.u16 %r12181, %rs7940; + cvt.s32.s8 %r12182, %r12181; + cvt.u32.u16 %r12183, %rs7938; + cvt.s32.s8 %r12184, %r12183; + cvt.u32.u16 %r12185, %rs7939; + cvt.s32.s8 %r12186, %r12185; + mul.lo.s32 %r12187, %r34, %r12186; + mad.lo.s32 %r12188, %r124, %r12184, %r12187; + mad.lo.s32 %r12189, %r35, %r12182, %r12188; + mad.lo.s32 %r12190, %r36, %r12180, %r12189; + ld.const.v4.u8 {%rs7946, %rs7947, %rs7948, %rs7949}, [matrix+3972]; + cvt.u32.u16 %r12191, %rs7949; + cvt.s32.s8 %r12192, %r12191; + cvt.u32.u16 %r12193, %rs7948; + cvt.s32.s8 %r12194, %r12193; + cvt.u32.u16 %r12195, %rs7947; + cvt.s32.s8 %r12196, %r12195; + cvt.u32.u16 %r12197, %rs7946; + cvt.s32.s8 %r12198, %r12197; + mad.lo.s32 %r12199, %r37, %r12198, %r12190; + mad.lo.s32 %r12200, %r38, %r12196, %r12199; + mad.lo.s32 %r12201, %r39, %r12194, %r12200; + mad.lo.s32 %r12202, %r40, %r12192, %r12201; + ld.const.v4.u8 {%rs7954, %rs7955, %rs7956, %rs7957}, [matrix+3976]; + cvt.u32.u16 %r12203, %rs7957; + cvt.s32.s8 %r12204, %r12203; + cvt.u32.u16 %r12205, %rs7956; + cvt.s32.s8 %r12206, %r12205; + cvt.u32.u16 %r12207, %rs7955; + cvt.s32.s8 %r12208, %r12207; + cvt.u32.u16 %r12209, %rs7954; + cvt.s32.s8 %r12210, %r12209; + mad.lo.s32 %r12211, %r42, %r12210, %r12202; + mad.lo.s32 %r12212, %r43, %r12208, %r12211; + mad.lo.s32 %r12213, %r45, %r12206, %r12212; + mad.lo.s32 %r12214, %r46, %r12204, %r12213; + ld.const.v4.u8 {%rs7962, 
%rs7963, %rs7964, %rs7965}, [matrix+3980]; + cvt.u32.u16 %r12215, %rs7965; + cvt.s32.s8 %r12216, %r12215; + cvt.u32.u16 %r12217, %rs7964; + cvt.s32.s8 %r12218, %r12217; + cvt.u32.u16 %r12219, %rs7963; + cvt.s32.s8 %r12220, %r12219; + cvt.u32.u16 %r12221, %rs7962; + cvt.s32.s8 %r12222, %r12221; + mad.lo.s32 %r12223, %r48, %r12222, %r12214; + mad.lo.s32 %r12224, %r49, %r12220, %r12223; + mad.lo.s32 %r12225, %r50, %r12218, %r12224; + mad.lo.s32 %r12226, %r51, %r12216, %r12225; + ld.const.v4.u8 {%rs7970, %rs7971, %rs7972, %rs7973}, [matrix+3984]; + cvt.u32.u16 %r12227, %rs7973; + cvt.s32.s8 %r12228, %r12227; + cvt.u32.u16 %r12229, %rs7972; + cvt.s32.s8 %r12230, %r12229; + cvt.u32.u16 %r12231, %rs7971; + cvt.s32.s8 %r12232, %r12231; + cvt.u32.u16 %r12233, %rs7970; + cvt.s32.s8 %r12234, %r12233; + mad.lo.s32 %r12235, %r173, %r12234, %r12226; + mad.lo.s32 %r12236, %r53, %r12232, %r12235; + mad.lo.s32 %r12237, %r54, %r12230, %r12236; + mad.lo.s32 %r12238, %r55, %r12228, %r12237; + ld.const.v4.u8 {%rs7978, %rs7979, %rs7980, %rs7981}, [matrix+3988]; + cvt.u32.u16 %r12239, %rs7981; + cvt.s32.s8 %r12240, %r12239; + cvt.u32.u16 %r12241, %rs7980; + cvt.s32.s8 %r12242, %r12241; + cvt.u32.u16 %r12243, %rs7979; + cvt.s32.s8 %r12244, %r12243; + cvt.u32.u16 %r12245, %rs7978; + cvt.s32.s8 %r12246, %r12245; + mad.lo.s32 %r12247, %r56, %r12246, %r12238; + mad.lo.s32 %r12248, %r57, %r12244, %r12247; + mad.lo.s32 %r12249, %r58, %r12242, %r12248; + mad.lo.s32 %r12250, %r59, %r12240, %r12249; + ld.const.v4.u8 {%rs7986, %rs7987, %rs7988, %rs7989}, [matrix+3992]; + cvt.u32.u16 %r12251, %rs7989; + cvt.s32.s8 %r12252, %r12251; + cvt.u32.u16 %r12253, %rs7988; + cvt.s32.s8 %r12254, %r12253; + cvt.u32.u16 %r12255, %rs7987; + cvt.s32.s8 %r12256, %r12255; + cvt.u32.u16 %r12257, %rs7986; + cvt.s32.s8 %r12258, %r12257; + mad.lo.s32 %r12259, %r61, %r12258, %r12250; + mad.lo.s32 %r12260, %r62, %r12256, %r12259; + mad.lo.s32 %r12261, %r64, %r12254, %r12260; + mad.lo.s32 %r12262, %r65, %r12252, %r12261; + ld.const.v4.u8 {%rs7994, %rs7995, %rs7996, %rs7997}, [matrix+3996]; + cvt.u32.u16 %r12263, %rs7997; + cvt.s32.s8 %r12264, %r12263; + cvt.u32.u16 %r12265, %rs7996; + cvt.s32.s8 %r12266, %r12265; + cvt.u32.u16 %r12267, %rs7995; + cvt.s32.s8 %r12268, %r12267; + cvt.u32.u16 %r12269, %rs7994; + cvt.s32.s8 %r12270, %r12269; + mad.lo.s32 %r12271, %r67, %r12270, %r12262; + mad.lo.s32 %r12272, %r68, %r12268, %r12271; + mad.lo.s32 %r12273, %r69, %r12266, %r12272; + mad.lo.s32 %r12274, %r70, %r12264, %r12273; + ld.const.v4.u8 {%rs8002, %rs8003, %rs8004, %rs8005}, [matrix+4000]; + cvt.u32.u16 %r12275, %rs8005; + cvt.s32.s8 %r12276, %r12275; + cvt.u32.u16 %r12277, %rs8004; + cvt.s32.s8 %r12278, %r12277; + cvt.u32.u16 %r12279, %rs8003; + cvt.s32.s8 %r12280, %r12279; + cvt.u32.u16 %r12281, %rs8002; + cvt.s32.s8 %r12282, %r12281; + mad.lo.s32 %r12283, %r222, %r12282, %r12274; + mad.lo.s32 %r12284, %r72, %r12280, %r12283; + mad.lo.s32 %r12285, %r73, %r12278, %r12284; + mad.lo.s32 %r12286, %r74, %r12276, %r12285; + ld.const.v4.u8 {%rs8010, %rs8011, %rs8012, %rs8013}, [matrix+4004]; + cvt.u32.u16 %r12287, %rs8013; + cvt.s32.s8 %r12288, %r12287; + cvt.u32.u16 %r12289, %rs8012; + cvt.s32.s8 %r12290, %r12289; + cvt.u32.u16 %r12291, %rs8011; + cvt.s32.s8 %r12292, %r12291; + cvt.u32.u16 %r12293, %rs8010; + cvt.s32.s8 %r12294, %r12293; + mad.lo.s32 %r12295, %r75, %r12294, %r12286; + mad.lo.s32 %r12296, %r76, %r12292, %r12295; + mad.lo.s32 %r12297, %r77, %r12290, %r12296; + mad.lo.s32 %r12298, %r78, %r12288, %r12297; + ld.const.v4.u8 {%rs8018, %rs8019, 
%rs8020, %rs8021}, [matrix+4008]; + cvt.u32.u16 %r12299, %rs8021; + cvt.s32.s8 %r12300, %r12299; + cvt.u32.u16 %r12301, %rs8020; + cvt.s32.s8 %r12302, %r12301; + cvt.u32.u16 %r12303, %rs8019; + cvt.s32.s8 %r12304, %r12303; + cvt.u32.u16 %r12305, %rs8018; + cvt.s32.s8 %r12306, %r12305; + mad.lo.s32 %r12307, %r80, %r12306, %r12298; + mad.lo.s32 %r12308, %r81, %r12304, %r12307; + mad.lo.s32 %r12309, %r83, %r12302, %r12308; + mad.lo.s32 %r12310, %r84, %r12300, %r12309; + ld.const.v4.u8 {%rs8026, %rs8027, %rs8028, %rs8029}, [matrix+4012]; + cvt.u32.u16 %r12311, %rs8029; + cvt.s32.s8 %r12312, %r12311; + cvt.u32.u16 %r12313, %rs8028; + cvt.s32.s8 %r12314, %r12313; + cvt.u32.u16 %r12315, %rs8027; + cvt.s32.s8 %r12316, %r12315; + cvt.u32.u16 %r12317, %rs8026; + cvt.s32.s8 %r12318, %r12317; + mad.lo.s32 %r12319, %r86, %r12318, %r12310; + mad.lo.s32 %r12320, %r87, %r12316, %r12319; + mad.lo.s32 %r12321, %r88, %r12314, %r12320; + mad.lo.s32 %r12322, %r89, %r12312, %r12321; + ld.const.v4.u8 {%rs8034, %rs8035, %rs8036, %rs8037}, [matrix+4016]; + cvt.u32.u16 %r12323, %rs8037; + cvt.s32.s8 %r12324, %r12323; + cvt.u32.u16 %r12325, %rs8036; + cvt.s32.s8 %r12326, %r12325; + cvt.u32.u16 %r12327, %rs8035; + cvt.s32.s8 %r12328, %r12327; + cvt.u32.u16 %r12329, %rs8034; + cvt.s32.s8 %r12330, %r12329; + mad.lo.s32 %r12331, %r271, %r12330, %r12322; + mad.lo.s32 %r12332, %r91, %r12328, %r12331; + mad.lo.s32 %r12333, %r93, %r12326, %r12332; + mad.lo.s32 %r12334, %r94, %r12324, %r12333; + ld.const.v4.u8 {%rs8042, %rs8043, %rs8044, %rs8045}, [matrix+4020]; + cvt.u32.u16 %r12335, %rs8045; + cvt.s32.s8 %r12336, %r12335; + cvt.u32.u16 %r12337, %rs8044; + cvt.s32.s8 %r12338, %r12337; + cvt.u32.u16 %r12339, %rs8043; + cvt.s32.s8 %r12340, %r12339; + cvt.u32.u16 %r12341, %rs8042; + cvt.s32.s8 %r12342, %r12341; + mad.lo.s32 %r12343, %r96, %r12342, %r12334; + mad.lo.s32 %r12344, %r97, %r12340, %r12343; + mad.lo.s32 %r12345, %r99, %r12338, %r12344; + mad.lo.s32 %r12346, %r100, %r12336, %r12345; + ld.const.v4.u8 {%rs8050, %rs8051, %rs8052, %rs8053}, [matrix+4024]; + cvt.u32.u16 %r12347, %rs8053; + cvt.s32.s8 %r12348, %r12347; + cvt.u32.u16 %r12349, %rs8052; + cvt.s32.s8 %r12350, %r12349; + cvt.u32.u16 %r12351, %rs8051; + cvt.s32.s8 %r12352, %r12351; + cvt.u32.u16 %r12353, %rs8050; + cvt.s32.s8 %r12354, %r12353; + mad.lo.s32 %r12355, %r103, %r12354, %r12346; + mad.lo.s32 %r12356, %r104, %r12352, %r12355; + mad.lo.s32 %r12357, %r107, %r12350, %r12356; + mad.lo.s32 %r12358, %r108, %r12348, %r12357; + ld.const.v4.u8 {%rs8058, %rs8059, %rs8060, %rs8061}, [matrix+4028]; + cvt.u32.u16 %r12359, %rs8061; + cvt.s32.s8 %r12360, %r12359; + cvt.u32.u16 %r12361, %rs8060; + cvt.s32.s8 %r12362, %r12361; + cvt.u32.u16 %r12363, %rs8059; + cvt.s32.s8 %r12364, %r12363; + cvt.u32.u16 %r12365, %rs8058; + cvt.s32.s8 %r12366, %r12365; + mad.lo.s32 %r12367, %r111, %r12366, %r12358; + mad.lo.s32 %r12368, %r112, %r12364, %r12367; + mad.lo.s32 %r12369, %r114, %r12362, %r12368; + mad.lo.s32 %r12370, %r115, %r12360, %r12369; + ld.const.v4.u8 {%rs8066, %rs8067, %rs8068, %rs8069}, [matrix+4032]; + cvt.u32.u16 %r12371, %rs8069; + cvt.s32.s8 %r12372, %r12371; + cvt.u32.u16 %r12373, %rs8068; + cvt.s32.s8 %r12374, %r12373; + cvt.u32.u16 %r12375, %rs8066; + cvt.s32.s8 %r12376, %r12375; + cvt.u32.u16 %r12377, %rs8067; + cvt.s32.s8 %r12378, %r12377; + mul.lo.s32 %r12379, %r34, %r12378; + mad.lo.s32 %r12380, %r124, %r12376, %r12379; + mad.lo.s32 %r12381, %r35, %r12374, %r12380; + mad.lo.s32 %r12382, %r36, %r12372, %r12381; + ld.const.v4.u8 {%rs8074, %rs8075, %rs8076, 
%rs8077}, [matrix+4036]; + cvt.u32.u16 %r12383, %rs8077; + cvt.s32.s8 %r12384, %r12383; + cvt.u32.u16 %r12385, %rs8076; + cvt.s32.s8 %r12386, %r12385; + cvt.u32.u16 %r12387, %rs8075; + cvt.s32.s8 %r12388, %r12387; + cvt.u32.u16 %r12389, %rs8074; + cvt.s32.s8 %r12390, %r12389; + mad.lo.s32 %r12391, %r37, %r12390, %r12382; + mad.lo.s32 %r12392, %r38, %r12388, %r12391; + mad.lo.s32 %r12393, %r39, %r12386, %r12392; + mad.lo.s32 %r12394, %r40, %r12384, %r12393; + ld.const.v4.u8 {%rs8082, %rs8083, %rs8084, %rs8085}, [matrix+4040]; + cvt.u32.u16 %r12395, %rs8085; + cvt.s32.s8 %r12396, %r12395; + cvt.u32.u16 %r12397, %rs8084; + cvt.s32.s8 %r12398, %r12397; + cvt.u32.u16 %r12399, %rs8083; + cvt.s32.s8 %r12400, %r12399; + cvt.u32.u16 %r12401, %rs8082; + cvt.s32.s8 %r12402, %r12401; + mad.lo.s32 %r12403, %r42, %r12402, %r12394; + mad.lo.s32 %r12404, %r43, %r12400, %r12403; + mad.lo.s32 %r12405, %r45, %r12398, %r12404; + mad.lo.s32 %r12406, %r46, %r12396, %r12405; + ld.const.v4.u8 {%rs8090, %rs8091, %rs8092, %rs8093}, [matrix+4044]; + cvt.u32.u16 %r12407, %rs8093; + cvt.s32.s8 %r12408, %r12407; + cvt.u32.u16 %r12409, %rs8092; + cvt.s32.s8 %r12410, %r12409; + cvt.u32.u16 %r12411, %rs8091; + cvt.s32.s8 %r12412, %r12411; + cvt.u32.u16 %r12413, %rs8090; + cvt.s32.s8 %r12414, %r12413; + mad.lo.s32 %r12415, %r48, %r12414, %r12406; + mad.lo.s32 %r12416, %r49, %r12412, %r12415; + mad.lo.s32 %r12417, %r50, %r12410, %r12416; + mad.lo.s32 %r12418, %r51, %r12408, %r12417; + ld.const.v4.u8 {%rs8098, %rs8099, %rs8100, %rs8101}, [matrix+4048]; + cvt.u32.u16 %r12419, %rs8101; + cvt.s32.s8 %r12420, %r12419; + cvt.u32.u16 %r12421, %rs8100; + cvt.s32.s8 %r12422, %r12421; + cvt.u32.u16 %r12423, %rs8099; + cvt.s32.s8 %r12424, %r12423; + cvt.u32.u16 %r12425, %rs8098; + cvt.s32.s8 %r12426, %r12425; + mad.lo.s32 %r12427, %r173, %r12426, %r12418; + mad.lo.s32 %r12428, %r53, %r12424, %r12427; + mad.lo.s32 %r12429, %r54, %r12422, %r12428; + mad.lo.s32 %r12430, %r55, %r12420, %r12429; + ld.const.v4.u8 {%rs8106, %rs8107, %rs8108, %rs8109}, [matrix+4052]; + cvt.u32.u16 %r12431, %rs8109; + cvt.s32.s8 %r12432, %r12431; + cvt.u32.u16 %r12433, %rs8108; + cvt.s32.s8 %r12434, %r12433; + cvt.u32.u16 %r12435, %rs8107; + cvt.s32.s8 %r12436, %r12435; + cvt.u32.u16 %r12437, %rs8106; + cvt.s32.s8 %r12438, %r12437; + mad.lo.s32 %r12439, %r56, %r12438, %r12430; + mad.lo.s32 %r12440, %r57, %r12436, %r12439; + mad.lo.s32 %r12441, %r58, %r12434, %r12440; + mad.lo.s32 %r12442, %r59, %r12432, %r12441; + ld.const.v4.u8 {%rs8114, %rs8115, %rs8116, %rs8117}, [matrix+4056]; + cvt.u32.u16 %r12443, %rs8117; + cvt.s32.s8 %r12444, %r12443; + cvt.u32.u16 %r12445, %rs8116; + cvt.s32.s8 %r12446, %r12445; + cvt.u32.u16 %r12447, %rs8115; + cvt.s32.s8 %r12448, %r12447; + cvt.u32.u16 %r12449, %rs8114; + cvt.s32.s8 %r12450, %r12449; + mad.lo.s32 %r12451, %r61, %r12450, %r12442; + mad.lo.s32 %r12452, %r62, %r12448, %r12451; + mad.lo.s32 %r12453, %r64, %r12446, %r12452; + mad.lo.s32 %r12454, %r65, %r12444, %r12453; + ld.const.v4.u8 {%rs8122, %rs8123, %rs8124, %rs8125}, [matrix+4060]; + cvt.u32.u16 %r12455, %rs8125; + cvt.s32.s8 %r12456, %r12455; + cvt.u32.u16 %r12457, %rs8124; + cvt.s32.s8 %r12458, %r12457; + cvt.u32.u16 %r12459, %rs8123; + cvt.s32.s8 %r12460, %r12459; + cvt.u32.u16 %r12461, %rs8122; + cvt.s32.s8 %r12462, %r12461; + mad.lo.s32 %r12463, %r67, %r12462, %r12454; + mad.lo.s32 %r12464, %r68, %r12460, %r12463; + mad.lo.s32 %r12465, %r69, %r12458, %r12464; + mad.lo.s32 %r12466, %r70, %r12456, %r12465; + ld.const.v4.u8 {%rs8130, %rs8131, %rs8132, %rs8133}, 
[matrix+4064]; + cvt.u32.u16 %r12467, %rs8133; + cvt.s32.s8 %r12468, %r12467; + cvt.u32.u16 %r12469, %rs8132; + cvt.s32.s8 %r12470, %r12469; + cvt.u32.u16 %r12471, %rs8131; + cvt.s32.s8 %r12472, %r12471; + cvt.u32.u16 %r12473, %rs8130; + cvt.s32.s8 %r12474, %r12473; + mad.lo.s32 %r12475, %r222, %r12474, %r12466; + mad.lo.s32 %r12476, %r72, %r12472, %r12475; + mad.lo.s32 %r12477, %r73, %r12470, %r12476; + mad.lo.s32 %r12478, %r74, %r12468, %r12477; + ld.const.v4.u8 {%rs8138, %rs8139, %rs8140, %rs8141}, [matrix+4068]; + cvt.u32.u16 %r12479, %rs8141; + cvt.s32.s8 %r12480, %r12479; + cvt.u32.u16 %r12481, %rs8140; + cvt.s32.s8 %r12482, %r12481; + cvt.u32.u16 %r12483, %rs8139; + cvt.s32.s8 %r12484, %r12483; + cvt.u32.u16 %r12485, %rs8138; + cvt.s32.s8 %r12486, %r12485; + mad.lo.s32 %r12487, %r75, %r12486, %r12478; + mad.lo.s32 %r12488, %r76, %r12484, %r12487; + mad.lo.s32 %r12489, %r77, %r12482, %r12488; + mad.lo.s32 %r12490, %r78, %r12480, %r12489; + ld.const.v4.u8 {%rs8146, %rs8147, %rs8148, %rs8149}, [matrix+4072]; + cvt.u32.u16 %r12491, %rs8149; + cvt.s32.s8 %r12492, %r12491; + cvt.u32.u16 %r12493, %rs8148; + cvt.s32.s8 %r12494, %r12493; + cvt.u32.u16 %r12495, %rs8147; + cvt.s32.s8 %r12496, %r12495; + cvt.u32.u16 %r12497, %rs8146; + cvt.s32.s8 %r12498, %r12497; + mad.lo.s32 %r12499, %r80, %r12498, %r12490; + mad.lo.s32 %r12500, %r81, %r12496, %r12499; + mad.lo.s32 %r12501, %r83, %r12494, %r12500; + mad.lo.s32 %r12502, %r84, %r12492, %r12501; + ld.const.v4.u8 {%rs8154, %rs8155, %rs8156, %rs8157}, [matrix+4076]; + cvt.u32.u16 %r12503, %rs8157; + cvt.s32.s8 %r12504, %r12503; + cvt.u32.u16 %r12505, %rs8156; + cvt.s32.s8 %r12506, %r12505; + cvt.u32.u16 %r12507, %rs8155; + cvt.s32.s8 %r12508, %r12507; + cvt.u32.u16 %r12509, %rs8154; + cvt.s32.s8 %r12510, %r12509; + mad.lo.s32 %r12511, %r86, %r12510, %r12502; + mad.lo.s32 %r12512, %r87, %r12508, %r12511; + mad.lo.s32 %r12513, %r88, %r12506, %r12512; + mad.lo.s32 %r12514, %r89, %r12504, %r12513; + ld.const.v4.u8 {%rs8162, %rs8163, %rs8164, %rs8165}, [matrix+4080]; + cvt.u32.u16 %r12515, %rs8165; + cvt.s32.s8 %r12516, %r12515; + cvt.u32.u16 %r12517, %rs8164; + cvt.s32.s8 %r12518, %r12517; + cvt.u32.u16 %r12519, %rs8163; + cvt.s32.s8 %r12520, %r12519; + cvt.u32.u16 %r12521, %rs8162; + cvt.s32.s8 %r12522, %r12521; + mad.lo.s32 %r12523, %r271, %r12522, %r12514; + mad.lo.s32 %r12524, %r91, %r12520, %r12523; + mad.lo.s32 %r12525, %r93, %r12518, %r12524; + mad.lo.s32 %r12526, %r94, %r12516, %r12525; + ld.const.v4.u8 {%rs8170, %rs8171, %rs8172, %rs8173}, [matrix+4084]; + cvt.u32.u16 %r12527, %rs8173; + cvt.s32.s8 %r12528, %r12527; + cvt.u32.u16 %r12529, %rs8172; + cvt.s32.s8 %r12530, %r12529; + cvt.u32.u16 %r12531, %rs8171; + cvt.s32.s8 %r12532, %r12531; + cvt.u32.u16 %r12533, %rs8170; + cvt.s32.s8 %r12534, %r12533; + mad.lo.s32 %r12535, %r96, %r12534, %r12526; + mad.lo.s32 %r12536, %r97, %r12532, %r12535; + mad.lo.s32 %r12537, %r99, %r12530, %r12536; + mad.lo.s32 %r12538, %r100, %r12528, %r12537; + ld.const.v4.u8 {%rs8178, %rs8179, %rs8180, %rs8181}, [matrix+4088]; + cvt.u32.u16 %r12539, %rs8181; + cvt.s32.s8 %r12540, %r12539; + cvt.u32.u16 %r12541, %rs8180; + cvt.s32.s8 %r12542, %r12541; + cvt.u32.u16 %r12543, %rs8179; + cvt.s32.s8 %r12544, %r12543; + cvt.u32.u16 %r12545, %rs8178; + cvt.s32.s8 %r12546, %r12545; + mad.lo.s32 %r12547, %r103, %r12546, %r12538; + mad.lo.s32 %r12548, %r104, %r12544, %r12547; + mad.lo.s32 %r12549, %r107, %r12542, %r12548; + mad.lo.s32 %r12550, %r108, %r12540, %r12549; + ld.const.v4.u8 {%rs8186, %rs8187, %rs8188, %rs8189}, 
[matrix+4092]; + cvt.u32.u16 %r12551, %rs8189; + cvt.s32.s8 %r12552, %r12551; + cvt.u32.u16 %r12553, %rs8188; + cvt.s32.s8 %r12554, %r12553; + cvt.u32.u16 %r12555, %rs8187; + cvt.s32.s8 %r12556, %r12555; + cvt.u32.u16 %r12557, %rs8186; + cvt.s32.s8 %r12558, %r12557; + mad.lo.s32 %r12559, %r111, %r12558, %r12550; + mad.lo.s32 %r12560, %r112, %r12556, %r12559; + mad.lo.s32 %r12561, %r114, %r12554, %r12560; + mad.lo.s32 %r12562, %r115, %r12552, %r12561; + shr.u32 %r12563, %r12370, 6; + and.b32 %r12564, %r12563, 240; + shr.u32 %r12565, %r12562, 10; + or.b32 %r12566, %r12565, %r12564; + xor.b32 %r12567, %r113, %r12566; + and.b64 %rd406, %rd386, 255; + and.b64 %rd407, %rd385, 255; + and.b64 %rd408, %rd384, 255; + and.b64 %rd409, %rd383, 255; + shl.b64 %rd410, %rd409, 24; + and.b64 %rd411, %rd382, 255; + shl.b64 %rd412, %rd411, 16; + shl.b32 %r12568, %r897, 8; + cvt.u64.u32 %rd413, %r12568; + cvt.u64.u32 %rd414, %r3231; + and.b64 %rd415, %rd392, 255; + and.b64 %rd416, %rd391, 255; + and.b64 %rd417, %rd390, 255; + and.b64 %rd418, %rd389, 255; + shl.b64 %rd419, %rd418, 24; + and.b64 %rd420, %rd388, 255; + shl.b64 %rd421, %rd420, 16; + shl.b32 %r12569, %r4009, 8; + cvt.u64.u32 %rd422, %r12569; + cvt.u64.u32 %rd423, %r6343; + and.b64 %rd424, %rd398, 255; + and.b64 %rd425, %rd397, 255; + and.b64 %rd426, %rd396, 255; + and.b64 %rd427, %rd395, 255; + shl.b64 %rd428, %rd427, 24; + and.b64 %rd429, %rd394, 255; + shl.b64 %rd430, %rd429, 16; + shl.b32 %r12570, %r7121, 8; + cvt.u64.u32 %rd431, %r12570; + cvt.u64.u32 %rd432, %r9455; + cvt.u64.u32 %rd433, %r9844; + cvt.u64.u32 %rd434, %r12567; + shl.b64 %rd435, %rd414, 56; + shl.b64 %rd436, %rd406, 48; + or.b64 %rd437, %rd435, %rd436; + shl.b64 %rd438, %rd407, 40; + or.b64 %rd439, %rd437, %rd438; + shl.b64 %rd440, %rd408, 32; + or.b64 %rd441, %rd439, %rd440; + or.b64 %rd442, %rd441, %rd410; + or.b64 %rd443, %rd442, %rd412; + and.b64 %rd444, %rd381, 255; + and.b64 %rd445, %rd413, 65280; + or.b64 %rd446, %rd443, %rd445; + or.b64 %rd447, %rd446, %rd444; + xor.b64 %rd123, %rd447, 4239941492252378377; + shl.b64 %rd448, %rd423, 56; + shl.b64 %rd449, %rd415, 48; + or.b64 %rd450, %rd448, %rd449; + shl.b64 %rd451, %rd416, 40; + or.b64 %rd452, %rd450, %rd451; + shl.b64 %rd453, %rd417, 32; + or.b64 %rd454, %rd452, %rd453; + or.b64 %rd455, %rd454, %rd419; + or.b64 %rd456, %rd455, %rd421; + and.b64 %rd457, %rd387, 255; + and.b64 %rd458, %rd422, 65280; + or.b64 %rd459, %rd456, %rd458; + or.b64 %rd460, %rd459, %rd457; + xor.b64 %rd681, %rd460, 8746723911537738262; + shl.b64 %rd461, %rd432, 56; + shl.b64 %rd462, %rd424, 48; + or.b64 %rd463, %rd461, %rd462; + shl.b64 %rd464, %rd425, 40; + or.b64 %rd465, %rd463, %rd464; + shl.b64 %rd466, %rd426, 32; + or.b64 %rd467, %rd465, %rd466; + or.b64 %rd468, %rd467, %rd428; + or.b64 %rd469, %rd468, %rd430; + and.b64 %rd470, %rd393, 255; + and.b64 %rd471, %rd431, 65280; + or.b64 %rd472, %rd469, %rd471; + or.b64 %rd473, %rd472, %rd470; + xor.b64 %rd676, %rd473, 8796936657246353646; + shl.b64 %rd474, %rd434, 56; + and.b64 %rd475, %rd405, 255; + shl.b64 %rd476, %rd475, 48; + or.b64 %rd477, %rd474, %rd476; + and.b64 %rd478, %rd404, 255; + shl.b64 %rd479, %rd478, 40; + or.b64 %rd480, %rd477, %rd479; + shl.b64 %rd481, %rd403, 32; + or.b64 %rd482, %rd480, %rd481; + and.b64 %rd483, %rd401, 255; + shl.b64 %rd484, %rd483, 24; + or.b64 %rd485, %rd482, %rd484; + and.b64 %rd486, %rd400, 255; + shl.b64 %rd487, %rd486, 16; + and.b64 %rd488, %rd399, 255; + shl.b64 %rd489, %rd488, 8; + or.b64 %rd490, %rd485, %rd487; + and.b64 %rd491, %rd433, 255; + or.b64 
%rd492, %rd490, %rd489; + or.b64 %rd493, %rd492, %rd491; + xor.b64 %rd671, %rd493, 1272090201925444760; + mov.u64 %rd685, 8270816933120786537; + mov.u64 %rd684, -850687345431043546; + mov.u64 %rd683, 8596393687355028144; + mov.u64 %rd682, -4073852189716399785; + mov.u64 %rd680, -4539347866060507718; + mov.u64 %rd679, -3233781605604422593; + mov.u64 %rd678, 570094237299545110; + mov.u64 %rd677, 5171152063242093102; + mov.u64 %rd675, 6782861118970774626; + mov.u64 %rd674, 7812475424661425213; + mov.u64 %rd673, 9119540418498120711; + mov.u64 %rd672, -7873636174015165430; + mov.u64 %rd670, -9207053471590684088; + mov.u64 %rd669, 3370482334374859748; + mov.u64 %rd668, -1544774801229058759; + mov.u64 %rd667, 6096431547456407061; + mov.u64 %rd666, -1792185402154627366; + mov.u64 %rd665, -6864424130110145268; + mov.u64 %rd664, 5690099369266491460; + mov.u64 %rd663, -5074726839974049192; + mov.u64 %rd662, 1592359455985097269; + mov.u64 %rd661, RC; + mov.u32 %r12572, -24; + +BB0_9: + xor.b64 %rd494, %rd685, %rd123; + xor.b64 %rd495, %rd494, %rd684; + xor.b64 %rd496, %rd495, %rd683; + xor.b64 %rd497, %rd496, %rd682; + xor.b64 %rd498, %rd680, %rd681; + xor.b64 %rd499, %rd498, %rd679; + xor.b64 %rd500, %rd499, %rd678; + xor.b64 %rd501, %rd500, %rd677; + xor.b64 %rd502, %rd675, %rd676; + xor.b64 %rd503, %rd502, %rd674; + xor.b64 %rd504, %rd503, %rd673; + xor.b64 %rd505, %rd504, %rd672; + xor.b64 %rd506, %rd670, %rd671; + xor.b64 %rd507, %rd506, %rd669; + xor.b64 %rd508, %rd507, %rd668; + xor.b64 %rd509, %rd508, %rd667; + xor.b64 %rd510, %rd665, %rd666; + xor.b64 %rd511, %rd510, %rd664; + xor.b64 %rd512, %rd511, %rd663; + xor.b64 %rd513, %rd512, %rd662; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd501, 1; + shr.b64 %rhs, %rd501, 63; + add.u64 %rd514, %lhs, %rhs; + } + xor.b64 %rd515, %rd513, %rd514; + xor.b64 %rd516, %rd123, %rd515; + xor.b64 %rd517, %rd685, %rd515; + xor.b64 %rd518, %rd684, %rd515; + xor.b64 %rd519, %rd683, %rd515; + xor.b64 %rd520, %rd682, %rd515; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd505, 1; + shr.b64 %rhs, %rd505, 63; + add.u64 %rd521, %lhs, %rhs; + } + xor.b64 %rd522, %rd521, %rd497; + xor.b64 %rd523, %rd681, %rd522; + xor.b64 %rd524, %rd680, %rd522; + xor.b64 %rd525, %rd679, %rd522; + xor.b64 %rd526, %rd678, %rd522; + xor.b64 %rd527, %rd677, %rd522; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd509, 1; + shr.b64 %rhs, %rd509, 63; + add.u64 %rd528, %lhs, %rhs; + } + xor.b64 %rd529, %rd528, %rd501; + xor.b64 %rd530, %rd676, %rd529; + xor.b64 %rd531, %rd675, %rd529; + xor.b64 %rd532, %rd674, %rd529; + xor.b64 %rd533, %rd673, %rd529; + xor.b64 %rd534, %rd672, %rd529; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd513, 1; + shr.b64 %rhs, %rd513, 63; + add.u64 %rd535, %lhs, %rhs; + } + xor.b64 %rd536, %rd535, %rd505; + xor.b64 %rd537, %rd671, %rd536; + xor.b64 %rd538, %rd670, %rd536; + xor.b64 %rd539, %rd669, %rd536; + xor.b64 %rd540, %rd668, %rd536; + xor.b64 %rd541, %rd667, %rd536; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd497, 1; + shr.b64 %rhs, %rd497, 63; + add.u64 %rd542, %lhs, %rhs; + } + xor.b64 %rd543, %rd542, %rd509; + xor.b64 %rd544, %rd666, %rd543; + xor.b64 %rd545, %rd665, %rd543; + xor.b64 %rd546, %rd664, %rd543; + xor.b64 %rd547, %rd663, %rd543; + xor.b64 %rd548, %rd662, %rd543; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd523, 1; + shr.b64 %rhs, %rd523, 63; + add.u64 %rd549, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd518, 3; + shr.b64 %rhs, %rd518, 
61; + add.u64 %rd550, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd531, 6; + shr.b64 %rhs, %rd531, 58; + add.u64 %rd551, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd525, 10; + shr.b64 %rhs, %rd525, 54; + add.u64 %rd552, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd533, 15; + shr.b64 %rhs, %rd533, 49; + add.u64 %rd553, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd540, 21; + shr.b64 %rhs, %rd540, 43; + add.u64 %rd554, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd537, 28; + shr.b64 %rhs, %rd537, 36; + add.u64 %rd555, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd517, 36; + shr.b64 %rhs, %rd517, 28; + add.u64 %rd556, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd526, 45; + shr.b64 %rhs, %rd526, 19; + add.u64 %rd557, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd538, 55; + shr.b64 %rhs, %rd538, 9; + add.u64 %rd558, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd527, 2; + shr.b64 %rhs, %rd527, 62; + add.u64 %rd559, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd548, 14; + shr.b64 %rhs, %rd548, 50; + add.u64 %rd560, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd544, 27; + shr.b64 %rhs, %rd544, 37; + add.u64 %rd561, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd519, 41; + shr.b64 %rhs, %rd519, 23; + add.u64 %rd562, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd541, 56; + shr.b64 %rhs, %rd541, 8; + add.u64 %rd563, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd547, 8; + shr.b64 %rhs, %rd547, 56; + add.u64 %rd564, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd539, 25; + shr.b64 %rhs, %rd539, 39; + add.u64 %rd565, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd532, 43; + shr.b64 %rhs, %rd532, 21; + add.u64 %rd566, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd530, 62; + shr.b64 %rhs, %rd530, 2; + add.u64 %rd567, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd520, 18; + shr.b64 %rhs, %rd520, 46; + add.u64 %rd568, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd546, 39; + shr.b64 %rhs, %rd546, 25; + add.u64 %rd569, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd534, 61; + shr.b64 %rhs, %rd534, 3; + add.u64 %rd570, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd545, 20; + shr.b64 %rhs, %rd545, 44; + add.u64 %rd571, %lhs, %rhs; + } + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd524, 44; + shr.b64 %rhs, %rd524, 20; + add.u64 %rd572, %lhs, %rhs; + } + not.b64 %rd573, %rd572; + and.b64 %rd574, %rd566, %rd573; + xor.b64 %rd575, %rd574, %rd516; + not.b64 %rd576, %rd566; + and.b64 %rd577, %rd554, %rd576; + xor.b64 %rd681, %rd577, %rd572; + not.b64 %rd578, %rd554; + and.b64 %rd579, %rd560, %rd578; + xor.b64 %rd676, %rd579, %rd566; + not.b64 %rd580, %rd560; + and.b64 %rd581, %rd516, %rd580; + xor.b64 %rd671, %rd581, %rd554; + not.b64 %rd582, %rd516; + and.b64 %rd583, %rd572, %rd582; + xor.b64 %rd666, %rd560, %rd583; + not.b64 %rd584, %rd571; + and.b64 %rd585, %rd550, %rd584; + xor.b64 %rd685, %rd585, %rd555; + not.b64 %rd586, %rd550; + and.b64 %rd587, %rd557, %rd586; + xor.b64 %rd680, %rd587, %rd571; + 
not.b64 %rd588, %rd557; + and.b64 %rd589, %rd570, %rd588; + xor.b64 %rd675, %rd589, %rd550; + not.b64 %rd590, %rd570; + and.b64 %rd591, %rd555, %rd590; + xor.b64 %rd670, %rd591, %rd557; + not.b64 %rd592, %rd555; + and.b64 %rd593, %rd571, %rd592; + xor.b64 %rd665, %rd570, %rd593; + not.b64 %rd594, %rd551; + and.b64 %rd595, %rd565, %rd594; + xor.b64 %rd684, %rd595, %rd549; + not.b64 %rd596, %rd565; + and.b64 %rd597, %rd564, %rd596; + xor.b64 %rd679, %rd597, %rd551; + not.b64 %rd598, %rd564; + and.b64 %rd599, %rd568, %rd598; + xor.b64 %rd674, %rd599, %rd565; + not.b64 %rd600, %rd568; + and.b64 %rd601, %rd549, %rd600; + xor.b64 %rd669, %rd601, %rd564; + not.b64 %rd602, %rd549; + and.b64 %rd603, %rd551, %rd602; + xor.b64 %rd664, %rd568, %rd603; + not.b64 %rd604, %rd556; + and.b64 %rd605, %rd552, %rd604; + xor.b64 %rd683, %rd605, %rd561; + not.b64 %rd606, %rd552; + and.b64 %rd607, %rd553, %rd606; + xor.b64 %rd678, %rd607, %rd556; + not.b64 %rd608, %rd553; + and.b64 %rd609, %rd563, %rd608; + xor.b64 %rd673, %rd609, %rd552; + not.b64 %rd610, %rd563; + and.b64 %rd611, %rd561, %rd610; + xor.b64 %rd668, %rd611, %rd553; + not.b64 %rd612, %rd561; + and.b64 %rd613, %rd556, %rd612; + xor.b64 %rd663, %rd563, %rd613; + not.b64 %rd614, %rd558; + and.b64 %rd615, %rd569, %rd614; + xor.b64 %rd682, %rd615, %rd567; + not.b64 %rd616, %rd569; + and.b64 %rd617, %rd562, %rd616; + xor.b64 %rd677, %rd617, %rd558; + not.b64 %rd618, %rd562; + and.b64 %rd619, %rd559, %rd618; + xor.b64 %rd672, %rd619, %rd569; + not.b64 %rd620, %rd559; + and.b64 %rd621, %rd567, %rd620; + xor.b64 %rd667, %rd621, %rd562; + not.b64 %rd622, %rd567; + and.b64 %rd623, %rd558, %rd622; + xor.b64 %rd662, %rd559, %rd623; + ld.global.u64 %rd624, [%rd661]; + xor.b64 %rd123, %rd575, %rd624; + add.s64 %rd661, %rd661, 8; + add.s32 %r12572, %r12572, 1; + setp.ne.s32 %p10, %r12572, 0; + @%p10 bra BB0_9; + + ld.const.u64 %rd125, [target+24]; + setp.eq.s64 %p11, %rd671, %rd125; + @%p11 bra BB0_12; + bra.uni BB0_11; + +BB0_12: + ld.const.u64 %rd126, [target+16]; + setp.eq.s64 %p12, %rd676, %rd126; + @%p12 bra BB0_14; + bra.uni BB0_13; + +BB0_14: + ld.const.u64 %rd127, [target+8]; + setp.eq.s64 %p13, %rd681, %rd127; + @%p13 bra BB0_16; + bra.uni BB0_15; + +BB0_16: + ld.const.u64 %rd625, [target]; + setp.lt.u64 %p4, %rd123, %rd625; + @!%p4 bra BB0_18; + bra.uni BB0_17; + +BB0_11: + setp.lt.u64 %p1, %rd671, %rd125; + @!%p1 bra BB0_18; + bra.uni BB0_17; + +BB0_13: + setp.lt.u64 %p2, %rd676, %rd126; + @!%p2 bra BB0_18; + bra.uni BB0_17; + +BB0_15: + setp.lt.u64 %p3, %rd681, %rd127; + @!%p3 bra BB0_18; + bra.uni BB0_17; + +BB0_17: + ld.param.u64 %rd633, [heavy_hash_param_0]; + ld.param.u64 %rd632, [heavy_hash_param_1]; + and.b64 %rd631, %rd634, %rd633; + or.b64 %rd630, %rd631, %rd632; + ld.param.u64 %rd629, [heavy_hash_param_5]; + cvta.to.global.u64 %rd628, %rd629; + mov.u64 %rd626, 0; + atom.global.cas.b64 %rd627, [%rd628], %rd626, %rd630; + +BB0_18: + ret; +} + + diff --git a/plugins/cuda/resources/karlsen-cuda-sm61.ptx b/plugins/cuda/resources/karlsen-cuda-sm61.ptx new file mode 100644 index 0000000..339bdb9 --- /dev/null +++ b/plugins/cuda/resources/karlsen-cuda-sm61.ptx @@ -0,0 +1,42129 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-31833905 +// Cuda compilation tools, release 11.8, V11.8.89 +// Based on NVVM 7.0.1 +// + +.version 7.8 +.target sm_61 +.address_size 64 + +.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 
205, 224, 91}; +.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; +.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; +.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; +.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; +.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; +.const .align 8 .b8 target[32]; + +.func (.param .b64 func_retval0) _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh( + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3, + .param .b32 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4, + .param 
.b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5 +) +{ + .local .align 16 .b8 __local_depot0[224]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<28>; + .reg .b16 %rs<233>; + .reg .b32 %r<3965>; + .reg .b64 %rd<174>; + + + mov.u64 %SPL, __local_depot0; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs75, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4]; + ld.param.u64 %rd69, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd170, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + ld.param.u64 %rd71, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2]; + ld.param.u64 %rd164, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + ld.param.u64 %rd73, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd154, %rd73; + cvta.to.local.u64 %rd2, %rd71; + add.u64 %rd152, %SPL, 16; + add.u64 %rd148, %SP, 96; + cvta.to.local.u64 %rd4, %rd148; + setp.lt.u64 %p1, %rd170, 1025; + @%p1 bra $L__BB0_14; + bra.uni $L__BB0_1; + +$L__BB0_14: + add.u64 %rd161, %SPL, 0; + setp.ne.s64 %p16, %rd170, 1024; + mov.u64 %rd158, 0; + mov.u64 %rd150, %rd158; + @%p16 bra $L__BB0_16; + + mov.u64 %rd170, 0; + st.local.u64 [%rd161], %rd69; + mov.u64 %rd150, 1; + mov.u64 %rd158, 1024; + +$L__BB0_16: + setp.eq.s64 %p17, %rd150, 0; + @%p17 bra $L__BB0_21; + + or.b16 %rs1, %rs75, 1; + mov.u64 %rd162, %rd150; + +$L__BB0_18: + ld.local.u64 %rd165, [%rd161]; + ld.local.u8 %r1060, [%rd2]; + ld.local.u8 %r1061, [%rd2+1]; + prmt.b32 %r1062, %r1061, %r1060, 30212; + ld.local.u8 %r1063, [%rd2+2]; + ld.local.u8 %r1064, [%rd2+3]; + prmt.b32 %r1065, %r1064, %r1063, 30212; + prmt.b32 %r3948, %r1065, %r1062, 4180; + ld.local.u8 %r1066, [%rd2+4]; + ld.local.u8 %r1067, [%rd2+5]; + prmt.b32 %r1068, %r1067, %r1066, 30212; + ld.local.u8 %r1069, [%rd2+6]; + ld.local.u8 %r1070, [%rd2+7]; + prmt.b32 %r1071, %r1070, %r1069, 30212; + prmt.b32 %r3947, %r1071, %r1068, 4180; + ld.local.u8 %r1072, [%rd2+8]; + ld.local.u8 %r1073, [%rd2+9]; + prmt.b32 %r1074, %r1073, %r1072, 30212; + ld.local.u8 %r1075, [%rd2+10]; + ld.local.u8 %r1076, [%rd2+11]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + prmt.b32 %r3946, %r1077, %r1074, 4180; + ld.local.u8 %r1078, [%rd2+12]; + ld.local.u8 %r1079, [%rd2+13]; + prmt.b32 %r1080, %r1079, %r1078, 30212; + ld.local.u8 %r1081, [%rd2+14]; + ld.local.u8 %r1082, [%rd2+15]; + prmt.b32 %r1083, %r1082, %r1081, 30212; + prmt.b32 %r3945, %r1083, %r1080, 4180; + mov.u64 %rd166, 16; + ld.local.u8 %r1084, [%rd2+16]; + ld.local.u8 %r1085, [%rd2+17]; + prmt.b32 %r1086, %r1085, %r1084, 30212; + ld.local.u8 %r1087, [%rd2+18]; + ld.local.u8 %r1088, [%rd2+19]; + prmt.b32 %r1089, %r1088, %r1087, 30212; + prmt.b32 %r3944, %r1089, %r1086, 4180; + ld.local.u8 %r1090, [%rd2+20]; + ld.local.u8 %r1091, [%rd2+21]; + prmt.b32 %r1092, %r1091, %r1090, 30212; + ld.local.u8 %r1093, [%rd2+22]; + ld.local.u8 %r1094, [%rd2+23]; + prmt.b32 %r1095, %r1094, %r1093, 30212; + prmt.b32 %r3943, %r1095, %r1092, 4180; + ld.local.u8 %r1096, [%rd2+24]; + ld.local.u8 %r1097, [%rd2+25]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd2+26]; + ld.local.u8 %r1100, [%rd2+27]; + prmt.b32 %r1101, %r1100, %r1099, 30212; + prmt.b32 %r3942, %r1101, 
%r1098, 4180; + ld.local.u8 %r1102, [%rd2+28]; + ld.local.u8 %r1103, [%rd2+29]; + prmt.b32 %r1104, %r1103, %r1102, 30212; + ld.local.u8 %r1105, [%rd2+30]; + ld.local.u8 %r1106, [%rd2+31]; + prmt.b32 %r1107, %r1106, %r1105, 30212; + prmt.b32 %r3941, %r1107, %r1104, 4180; + mov.u16 %rs197, %rs1; + +$L__BB0_19: + shr.u64 %rd142, %rd164, 32; + cvt.u32.u64 %r3940, %rd142; + cvt.u32.u64 %r3939, %rd164; + setp.eq.s64 %p18, %rd166, 1; + selp.b16 %rs79, 2, 0, %p18; + or.b16 %rs80, %rs79, %rs197; + ld.u8 %r1108, [%rd165]; + ld.u8 %r1109, [%rd165+1]; + prmt.b32 %r1110, %r1109, %r1108, 30212; + ld.u8 %r1111, [%rd165+2]; + prmt.b32 %r1112, %r1111, %r1110, 28756; + ld.u8 %r1113, [%rd165+3]; + prmt.b32 %r1114, %r1113, %r1112, 1620; + ld.u8 %r1115, [%rd165+4]; + ld.u8 %r1116, [%rd165+5]; + prmt.b32 %r1117, %r1116, %r1115, 30212; + ld.u8 %r1118, [%rd165+6]; + prmt.b32 %r1119, %r1118, %r1117, 28756; + ld.u8 %r1120, [%rd165+7]; + prmt.b32 %r1121, %r1120, %r1119, 1620; + ld.u8 %r1122, [%rd165+8]; + ld.u8 %r1123, [%rd165+9]; + prmt.b32 %r1124, %r1123, %r1122, 30212; + ld.u8 %r1125, [%rd165+10]; + prmt.b32 %r1126, %r1125, %r1124, 28756; + ld.u8 %r1127, [%rd165+11]; + prmt.b32 %r1128, %r1127, %r1126, 1620; + ld.u8 %r1129, [%rd165+12]; + ld.u8 %r1130, [%rd165+13]; + prmt.b32 %r1131, %r1130, %r1129, 30212; + ld.u8 %r1132, [%rd165+14]; + prmt.b32 %r1133, %r1132, %r1131, 28756; + ld.u8 %r1134, [%rd165+15]; + prmt.b32 %r1135, %r1134, %r1133, 1620; + ld.u8 %r1136, [%rd165+16]; + ld.u8 %r1137, [%rd165+17]; + prmt.b32 %r1138, %r1137, %r1136, 30212; + ld.u8 %r1139, [%rd165+18]; + prmt.b32 %r1140, %r1139, %r1138, 28756; + ld.u8 %r1141, [%rd165+19]; + prmt.b32 %r1142, %r1141, %r1140, 1620; + ld.u8 %r1143, [%rd165+20]; + ld.u8 %r1144, [%rd165+21]; + prmt.b32 %r1145, %r1144, %r1143, 30212; + ld.u8 %r1146, [%rd165+22]; + prmt.b32 %r1147, %r1146, %r1145, 28756; + ld.u8 %r1148, [%rd165+23]; + prmt.b32 %r1149, %r1148, %r1147, 1620; + ld.u8 %r1150, [%rd165+24]; + ld.u8 %r1151, [%rd165+25]; + prmt.b32 %r1152, %r1151, %r1150, 30212; + ld.u8 %r1153, [%rd165+26]; + prmt.b32 %r1154, %r1153, %r1152, 28756; + ld.u8 %r1155, [%rd165+27]; + prmt.b32 %r1156, %r1155, %r1154, 1620; + ld.u8 %r1157, [%rd165+28]; + ld.u8 %r1158, [%rd165+29]; + prmt.b32 %r1159, %r1158, %r1157, 30212; + ld.u8 %r1160, [%rd165+30]; + prmt.b32 %r1161, %r1160, %r1159, 28756; + ld.u8 %r1162, [%rd165+31]; + prmt.b32 %r1163, %r1162, %r1161, 1620; + ld.u8 %r1164, [%rd165+32]; + ld.u8 %r1165, [%rd165+33]; + prmt.b32 %r1166, %r1165, %r1164, 30212; + ld.u8 %r1167, [%rd165+34]; + prmt.b32 %r1168, %r1167, %r1166, 28756; + ld.u8 %r1169, [%rd165+35]; + prmt.b32 %r1170, %r1169, %r1168, 1620; + ld.u8 %r1171, [%rd165+36]; + ld.u8 %r1172, [%rd165+37]; + prmt.b32 %r1173, %r1172, %r1171, 30212; + ld.u8 %r1174, [%rd165+38]; + prmt.b32 %r1175, %r1174, %r1173, 28756; + ld.u8 %r1176, [%rd165+39]; + prmt.b32 %r1177, %r1176, %r1175, 1620; + ld.u8 %r1178, [%rd165+40]; + ld.u8 %r1179, [%rd165+41]; + prmt.b32 %r1180, %r1179, %r1178, 30212; + ld.u8 %r1181, [%rd165+42]; + prmt.b32 %r1182, %r1181, %r1180, 28756; + ld.u8 %r1183, [%rd165+43]; + prmt.b32 %r1184, %r1183, %r1182, 1620; + ld.u8 %r1185, [%rd165+44]; + ld.u8 %r1186, [%rd165+45]; + prmt.b32 %r1187, %r1186, %r1185, 30212; + ld.u8 %r1188, [%rd165+46]; + prmt.b32 %r1189, %r1188, %r1187, 28756; + ld.u8 %r1190, [%rd165+47]; + prmt.b32 %r1191, %r1190, %r1189, 1620; + ld.u8 %r1192, [%rd165+48]; + ld.u8 %r1193, [%rd165+49]; + prmt.b32 %r1194, %r1193, %r1192, 30212; + ld.u8 %r1195, [%rd165+50]; + prmt.b32 %r1196, %r1195, %r1194, 28756; + ld.u8 
%r1197, [%rd165+51]; + prmt.b32 %r1198, %r1197, %r1196, 1620; + ld.u8 %r1199, [%rd165+52]; + ld.u8 %r1200, [%rd165+53]; + prmt.b32 %r1201, %r1200, %r1199, 30212; + ld.u8 %r1202, [%rd165+54]; + prmt.b32 %r1203, %r1202, %r1201, 28756; + ld.u8 %r1204, [%rd165+55]; + prmt.b32 %r1205, %r1204, %r1203, 1620; + ld.u8 %r1206, [%rd165+56]; + ld.u8 %r1207, [%rd165+57]; + prmt.b32 %r1208, %r1207, %r1206, 30212; + ld.u8 %r1209, [%rd165+58]; + prmt.b32 %r1210, %r1209, %r1208, 28756; + ld.u8 %r1211, [%rd165+59]; + prmt.b32 %r1212, %r1211, %r1210, 1620; + ld.u8 %r1213, [%rd165+60]; + ld.u8 %r1214, [%rd165+61]; + prmt.b32 %r1215, %r1214, %r1213, 30212; + ld.u8 %r1216, [%rd165+62]; + prmt.b32 %r1217, %r1216, %r1215, 28756; + ld.u8 %r1218, [%rd165+63]; + prmt.b32 %r1219, %r1218, %r1217, 1620; + cvt.u32.u16 %r1220, %rs80; + and.b32 %r1221, %r1220, 255; + add.s32 %r1222, %r3944, %r3948; + add.s32 %r1223, %r1222, %r1114; + xor.b32 %r1224, %r1223, %r3939; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 16; + add.s32 %r1226, %r1225, 1779033703; + xor.b32 %r1227, %r1226, %r3944; + shf.l.wrap.b32 %r1228, %r1227, %r1227, 20; + add.s32 %r1229, %r1121, %r1223; + add.s32 %r1230, %r1229, %r1228; + xor.b32 %r1231, %r1230, %r1225; + shf.l.wrap.b32 %r1232, %r1231, %r1231, 24; + add.s32 %r1233, %r1232, %r1226; + xor.b32 %r1234, %r1233, %r1228; + shf.l.wrap.b32 %r1235, %r1234, %r1234, 25; + add.s32 %r1236, %r3943, %r3947; + add.s32 %r1237, %r1236, %r1128; + xor.b32 %r1238, %r1237, %r3940; + shf.l.wrap.b32 %r1239, %r1238, %r1238, 16; + add.s32 %r1240, %r1239, -1150833019; + xor.b32 %r1241, %r1240, %r3943; + shf.l.wrap.b32 %r1242, %r1241, %r1241, 20; + add.s32 %r1243, %r1135, %r1237; + add.s32 %r1244, %r1243, %r1242; + xor.b32 %r1245, %r1244, %r1239; + shf.l.wrap.b32 %r1246, %r1245, %r1245, 24; + add.s32 %r1247, %r1246, %r1240; + xor.b32 %r1248, %r1247, %r1242; + shf.l.wrap.b32 %r1249, %r1248, %r1248, 25; + add.s32 %r1250, %r3942, %r3946; + add.s32 %r1251, %r1250, %r1142; + shr.u32 %r1252, %r1251, 16; + shl.b32 %r1253, %r1251, 16; + xor.b32 %r1254, %r1253, 4194304; + or.b32 %r1255, %r1254, %r1252; + add.s32 %r1256, %r1255, 1013904242; + xor.b32 %r1257, %r1256, %r3942; + shf.l.wrap.b32 %r1258, %r1257, %r1257, 20; + add.s32 %r1259, %r1149, %r1251; + add.s32 %r1260, %r1259, %r1258; + xor.b32 %r1261, %r1260, %r1255; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 24; + add.s32 %r1263, %r1262, %r1256; + xor.b32 %r1264, %r1263, %r1258; + shf.l.wrap.b32 %r1265, %r1264, %r1264, 25; + add.s32 %r1266, %r3941, %r3945; + add.s32 %r1267, %r1266, %r1156; + xor.b32 %r1268, %r1267, %r1221; + shr.u32 %r1269, %r1267, 16; + shl.b32 %r1270, %r1268, 16; + or.b32 %r1271, %r1270, %r1269; + add.s32 %r1272, %r1271, -1521486534; + xor.b32 %r1273, %r1272, %r3941; + shf.l.wrap.b32 %r1274, %r1273, %r1273, 20; + add.s32 %r1275, %r1163, %r1267; + add.s32 %r1276, %r1275, %r1274; + xor.b32 %r1277, %r1276, %r1271; + shf.l.wrap.b32 %r1278, %r1277, %r1277, 24; + add.s32 %r1279, %r1278, %r1272; + xor.b32 %r1280, %r1279, %r1274; + shf.l.wrap.b32 %r1281, %r1280, %r1280, 25; + add.s32 %r1282, %r1249, %r1230; + add.s32 %r1283, %r1282, %r1170; + xor.b32 %r1284, %r1278, %r1283; + shf.l.wrap.b32 %r1285, %r1284, %r1284, 16; + add.s32 %r1286, %r1285, %r1263; + xor.b32 %r1287, %r1286, %r1249; + shf.l.wrap.b32 %r1288, %r1287, %r1287, 20; + add.s32 %r1289, %r1177, %r1283; + add.s32 %r1290, %r1289, %r1288; + xor.b32 %r1291, %r1290, %r1285; + shf.l.wrap.b32 %r1292, %r1291, %r1291, 24; + add.s32 %r1293, %r1292, %r1286; + xor.b32 %r1294, %r1293, %r1288; + shf.l.wrap.b32 %r1295, %r1294, 
%r1294, 25; + add.s32 %r1296, %r1265, %r1244; + add.s32 %r1297, %r1296, %r1184; + xor.b32 %r1298, %r1297, %r1232; + shf.l.wrap.b32 %r1299, %r1298, %r1298, 16; + add.s32 %r1300, %r1299, %r1279; + xor.b32 %r1301, %r1300, %r1265; + shf.l.wrap.b32 %r1302, %r1301, %r1301, 20; + add.s32 %r1303, %r1191, %r1297; + add.s32 %r1304, %r1303, %r1302; + xor.b32 %r1305, %r1304, %r1299; + shf.l.wrap.b32 %r1306, %r1305, %r1305, 24; + add.s32 %r1307, %r1306, %r1300; + xor.b32 %r1308, %r1307, %r1302; + shf.l.wrap.b32 %r1309, %r1308, %r1308, 25; + add.s32 %r1310, %r1281, %r1260; + add.s32 %r1311, %r1310, %r1198; + xor.b32 %r1312, %r1311, %r1246; + shf.l.wrap.b32 %r1313, %r1312, %r1312, 16; + add.s32 %r1314, %r1313, %r1233; + xor.b32 %r1315, %r1314, %r1281; + shf.l.wrap.b32 %r1316, %r1315, %r1315, 20; + add.s32 %r1317, %r1205, %r1311; + add.s32 %r1318, %r1317, %r1316; + xor.b32 %r1319, %r1318, %r1313; + shf.l.wrap.b32 %r1320, %r1319, %r1319, 24; + add.s32 %r1321, %r1320, %r1314; + xor.b32 %r1322, %r1321, %r1316; + shf.l.wrap.b32 %r1323, %r1322, %r1322, 25; + add.s32 %r1324, %r1276, %r1235; + add.s32 %r1325, %r1324, %r1212; + xor.b32 %r1326, %r1325, %r1262; + shf.l.wrap.b32 %r1327, %r1326, %r1326, 16; + add.s32 %r1328, %r1327, %r1247; + xor.b32 %r1329, %r1328, %r1235; + shf.l.wrap.b32 %r1330, %r1329, %r1329, 20; + add.s32 %r1331, %r1219, %r1325; + add.s32 %r1332, %r1331, %r1330; + xor.b32 %r1333, %r1332, %r1327; + shf.l.wrap.b32 %r1334, %r1333, %r1333, 24; + add.s32 %r1335, %r1334, %r1328; + xor.b32 %r1336, %r1335, %r1330; + shf.l.wrap.b32 %r1337, %r1336, %r1336, 25; + add.s32 %r1338, %r1290, %r1128; + add.s32 %r1339, %r1338, %r1337; + xor.b32 %r1340, %r1339, %r1306; + shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; + add.s32 %r1342, %r1341, %r1321; + xor.b32 %r1343, %r1342, %r1337; + shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; + add.s32 %r1345, %r1339, %r1156; + add.s32 %r1346, %r1345, %r1344; + xor.b32 %r1347, %r1346, %r1341; + shf.l.wrap.b32 %r1348, %r1347, %r1347, 24; + add.s32 %r1349, %r1348, %r1342; + xor.b32 %r1350, %r1349, %r1344; + shf.l.wrap.b32 %r1351, %r1350, %r1350, 25; + add.s32 %r1352, %r1304, %r1135; + add.s32 %r1353, %r1352, %r1295; + xor.b32 %r1354, %r1320, %r1353; + shf.l.wrap.b32 %r1355, %r1354, %r1354, 16; + add.s32 %r1356, %r1335, %r1355; + xor.b32 %r1357, %r1356, %r1295; + shf.l.wrap.b32 %r1358, %r1357, %r1357, 20; + add.s32 %r1359, %r1353, %r1184; + add.s32 %r1360, %r1359, %r1358; + xor.b32 %r1361, %r1360, %r1355; + shf.l.wrap.b32 %r1362, %r1361, %r1361, 24; + add.s32 %r1363, %r1362, %r1356; + xor.b32 %r1364, %r1363, %r1358; + shf.l.wrap.b32 %r1365, %r1364, %r1364, 25; + add.s32 %r1366, %r1309, %r1163; + add.s32 %r1367, %r1366, %r1318; + xor.b32 %r1368, %r1334, %r1367; + shf.l.wrap.b32 %r1369, %r1368, %r1368, 16; + add.s32 %r1370, %r1369, %r1293; + xor.b32 %r1371, %r1370, %r1309; + shf.l.wrap.b32 %r1372, %r1371, %r1371, 20; + add.s32 %r1373, %r1367, %r1114; + add.s32 %r1374, %r1373, %r1372; + xor.b32 %r1375, %r1374, %r1369; + shf.l.wrap.b32 %r1376, %r1375, %r1375, 24; + add.s32 %r1377, %r1376, %r1370; + xor.b32 %r1378, %r1377, %r1372; + shf.l.wrap.b32 %r1379, %r1378, %r1378, 25; + add.s32 %r1380, %r1323, %r1142; + add.s32 %r1381, %r1380, %r1332; + xor.b32 %r1382, %r1381, %r1292; + shf.l.wrap.b32 %r1383, %r1382, %r1382, 16; + add.s32 %r1384, %r1383, %r1307; + xor.b32 %r1385, %r1384, %r1323; + shf.l.wrap.b32 %r1386, %r1385, %r1385, 20; + add.s32 %r1387, %r1381, %r1205; + add.s32 %r1388, %r1387, %r1386; + xor.b32 %r1389, %r1388, %r1383; + shf.l.wrap.b32 %r1390, %r1389, %r1389, 24; + add.s32 
%r1391, %r1390, %r1384; + xor.b32 %r1392, %r1391, %r1386; + shf.l.wrap.b32 %r1393, %r1392, %r1392, 25; + add.s32 %r1394, %r1365, %r1121; + add.s32 %r1395, %r1394, %r1346; + xor.b32 %r1396, %r1395, %r1390; + shf.l.wrap.b32 %r1397, %r1396, %r1396, 16; + add.s32 %r1398, %r1397, %r1377; + xor.b32 %r1399, %r1398, %r1365; + shf.l.wrap.b32 %r1400, %r1399, %r1399, 20; + add.s32 %r1401, %r1395, %r1191; + add.s32 %r1402, %r1401, %r1400; + xor.b32 %r1403, %r1402, %r1397; + shf.l.wrap.b32 %r1404, %r1403, %r1403, 24; + add.s32 %r1405, %r1404, %r1398; + xor.b32 %r1406, %r1405, %r1400; + shf.l.wrap.b32 %r1407, %r1406, %r1406, 25; + add.s32 %r1408, %r1360, %r1198; + add.s32 %r1409, %r1408, %r1379; + xor.b32 %r1410, %r1348, %r1409; + shf.l.wrap.b32 %r1411, %r1410, %r1410, 16; + add.s32 %r1412, %r1411, %r1391; + xor.b32 %r1413, %r1412, %r1379; + shf.l.wrap.b32 %r1414, %r1413, %r1413, 20; + add.s32 %r1415, %r1409, %r1149; + add.s32 %r1416, %r1415, %r1414; + xor.b32 %r1417, %r1416, %r1411; + shf.l.wrap.b32 %r1418, %r1417, %r1417, 24; + add.s32 %r1419, %r1418, %r1412; + xor.b32 %r1420, %r1419, %r1414; + shf.l.wrap.b32 %r1421, %r1420, %r1420, 25; + add.s32 %r1422, %r1374, %r1177; + add.s32 %r1423, %r1422, %r1393; + xor.b32 %r1424, %r1423, %r1362; + shf.l.wrap.b32 %r1425, %r1424, %r1424, 16; + add.s32 %r1426, %r1425, %r1349; + xor.b32 %r1427, %r1426, %r1393; + shf.l.wrap.b32 %r1428, %r1427, %r1427, 20; + add.s32 %r1429, %r1423, %r1212; + add.s32 %r1430, %r1429, %r1428; + xor.b32 %r1431, %r1430, %r1425; + shf.l.wrap.b32 %r1432, %r1431, %r1431, 24; + add.s32 %r1433, %r1432, %r1426; + xor.b32 %r1434, %r1433, %r1428; + shf.l.wrap.b32 %r1435, %r1434, %r1434, 25; + add.s32 %r1436, %r1388, %r1219; + add.s32 %r1437, %r1436, %r1351; + xor.b32 %r1438, %r1437, %r1376; + shf.l.wrap.b32 %r1439, %r1438, %r1438, 16; + add.s32 %r1440, %r1439, %r1363; + xor.b32 %r1441, %r1440, %r1351; + shf.l.wrap.b32 %r1442, %r1441, %r1441, 20; + add.s32 %r1443, %r1437, %r1170; + add.s32 %r1444, %r1443, %r1442; + xor.b32 %r1445, %r1444, %r1439; + shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; + add.s32 %r1447, %r1446, %r1440; + xor.b32 %r1448, %r1447, %r1442; + shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; + add.s32 %r1450, %r1402, %r1135; + add.s32 %r1451, %r1450, %r1449; + xor.b32 %r1452, %r1451, %r1418; + shf.l.wrap.b32 %r1453, %r1452, %r1452, 16; + add.s32 %r1454, %r1453, %r1433; + xor.b32 %r1455, %r1454, %r1449; + shf.l.wrap.b32 %r1456, %r1455, %r1455, 20; + add.s32 %r1457, %r1451, %r1142; + add.s32 %r1458, %r1457, %r1456; + xor.b32 %r1459, %r1458, %r1453; + shf.l.wrap.b32 %r1460, %r1459, %r1459, 24; + add.s32 %r1461, %r1460, %r1454; + xor.b32 %r1462, %r1461, %r1456; + shf.l.wrap.b32 %r1463, %r1462, %r1462, 25; + add.s32 %r1464, %r1416, %r1184; + add.s32 %r1465, %r1464, %r1407; + xor.b32 %r1466, %r1465, %r1432; + shf.l.wrap.b32 %r1467, %r1466, %r1466, 16; + add.s32 %r1468, %r1467, %r1447; + xor.b32 %r1469, %r1468, %r1407; + shf.l.wrap.b32 %r1470, %r1469, %r1469, 20; + add.s32 %r1471, %r1465, %r1198; + add.s32 %r1472, %r1471, %r1470; + xor.b32 %r1473, %r1472, %r1467; + shf.l.wrap.b32 %r1474, %r1473, %r1473, 24; + add.s32 %r1475, %r1474, %r1468; + xor.b32 %r1476, %r1475, %r1470; + shf.l.wrap.b32 %r1477, %r1476, %r1476, 25; + add.s32 %r1478, %r1430, %r1205; + add.s32 %r1479, %r1478, %r1421; + xor.b32 %r1480, %r1446, %r1479; + shf.l.wrap.b32 %r1481, %r1480, %r1480, 16; + add.s32 %r1482, %r1481, %r1405; + xor.b32 %r1483, %r1482, %r1421; + shf.l.wrap.b32 %r1484, %r1483, %r1483, 20; + add.s32 %r1485, %r1479, %r1128; + add.s32 %r1486, %r1485, %r1484; + 
xor.b32 %r1487, %r1486, %r1481; + shf.l.wrap.b32 %r1488, %r1487, %r1487, 24; + add.s32 %r1489, %r1488, %r1482; + xor.b32 %r1490, %r1489, %r1484; + shf.l.wrap.b32 %r1491, %r1490, %r1490, 25; + add.s32 %r1492, %r1435, %r1163; + add.s32 %r1493, %r1492, %r1444; + xor.b32 %r1494, %r1493, %r1404; + shf.l.wrap.b32 %r1495, %r1494, %r1494, 16; + add.s32 %r1496, %r1495, %r1419; + xor.b32 %r1497, %r1496, %r1435; + shf.l.wrap.b32 %r1498, %r1497, %r1497, 20; + add.s32 %r1499, %r1493, %r1212; + add.s32 %r1500, %r1499, %r1498; + xor.b32 %r1501, %r1500, %r1495; + shf.l.wrap.b32 %r1502, %r1501, %r1501, 24; + add.s32 %r1503, %r1502, %r1496; + xor.b32 %r1504, %r1503, %r1498; + shf.l.wrap.b32 %r1505, %r1504, %r1504, 25; + add.s32 %r1506, %r1477, %r1156; + add.s32 %r1507, %r1506, %r1458; + xor.b32 %r1508, %r1507, %r1502; + shf.l.wrap.b32 %r1509, %r1508, %r1508, 16; + add.s32 %r1510, %r1509, %r1489; + xor.b32 %r1511, %r1510, %r1477; + shf.l.wrap.b32 %r1512, %r1511, %r1511, 20; + add.s32 %r1513, %r1507, %r1149; + add.s32 %r1514, %r1513, %r1512; + xor.b32 %r1515, %r1514, %r1509; + shf.l.wrap.b32 %r1516, %r1515, %r1515, 24; + add.s32 %r1517, %r1516, %r1510; + xor.b32 %r1518, %r1517, %r1512; + shf.l.wrap.b32 %r1519, %r1518, %r1518, 25; + add.s32 %r1520, %r1472, %r1177; + add.s32 %r1521, %r1520, %r1491; + xor.b32 %r1522, %r1460, %r1521; + shf.l.wrap.b32 %r1523, %r1522, %r1522, 16; + add.s32 %r1524, %r1523, %r1503; + xor.b32 %r1525, %r1524, %r1491; + shf.l.wrap.b32 %r1526, %r1525, %r1525, 20; + add.s32 %r1527, %r1521, %r1114; + add.s32 %r1528, %r1527, %r1526; + xor.b32 %r1529, %r1528, %r1523; + shf.l.wrap.b32 %r1530, %r1529, %r1529, 24; + add.s32 %r1531, %r1530, %r1524; + xor.b32 %r1532, %r1531, %r1526; + shf.l.wrap.b32 %r1533, %r1532, %r1532, 25; + add.s32 %r1534, %r1486, %r1191; + add.s32 %r1535, %r1534, %r1505; + xor.b32 %r1536, %r1535, %r1474; + shf.l.wrap.b32 %r1537, %r1536, %r1536, 16; + add.s32 %r1538, %r1537, %r1461; + xor.b32 %r1539, %r1538, %r1505; + shf.l.wrap.b32 %r1540, %r1539, %r1539, 20; + add.s32 %r1541, %r1535, %r1219; + add.s32 %r1542, %r1541, %r1540; + xor.b32 %r1543, %r1542, %r1537; + shf.l.wrap.b32 %r1544, %r1543, %r1543, 24; + add.s32 %r1545, %r1544, %r1538; + xor.b32 %r1546, %r1545, %r1540; + shf.l.wrap.b32 %r1547, %r1546, %r1546, 25; + add.s32 %r1548, %r1500, %r1170; + add.s32 %r1549, %r1548, %r1463; + xor.b32 %r1550, %r1549, %r1488; + shf.l.wrap.b32 %r1551, %r1550, %r1550, 16; + add.s32 %r1552, %r1551, %r1475; + xor.b32 %r1553, %r1552, %r1463; + shf.l.wrap.b32 %r1554, %r1553, %r1553, 20; + add.s32 %r1555, %r1549, %r1121; + add.s32 %r1556, %r1555, %r1554; + xor.b32 %r1557, %r1556, %r1551; + shf.l.wrap.b32 %r1558, %r1557, %r1557, 24; + add.s32 %r1559, %r1558, %r1552; + xor.b32 %r1560, %r1559, %r1554; + shf.l.wrap.b32 %r1561, %r1560, %r1560, 25; + add.s32 %r1562, %r1514, %r1184; + add.s32 %r1563, %r1562, %r1561; + xor.b32 %r1564, %r1563, %r1530; + shf.l.wrap.b32 %r1565, %r1564, %r1564, 16; + add.s32 %r1566, %r1565, %r1545; + xor.b32 %r1567, %r1566, %r1561; + shf.l.wrap.b32 %r1568, %r1567, %r1567, 20; + add.s32 %r1569, %r1563, %r1163; + add.s32 %r1570, %r1569, %r1568; + xor.b32 %r1571, %r1570, %r1565; + shf.l.wrap.b32 %r1572, %r1571, %r1571, 24; + add.s32 %r1573, %r1572, %r1566; + xor.b32 %r1574, %r1573, %r1568; + shf.l.wrap.b32 %r1575, %r1574, %r1574, 25; + add.s32 %r1576, %r1528, %r1198; + add.s32 %r1577, %r1576, %r1519; + xor.b32 %r1578, %r1577, %r1544; + shf.l.wrap.b32 %r1579, %r1578, %r1578, 16; + add.s32 %r1580, %r1579, %r1559; + xor.b32 %r1581, %r1580, %r1519; + shf.l.wrap.b32 %r1582, 
%r1581, %r1581, 20; + add.s32 %r1583, %r1577, %r1177; + add.s32 %r1584, %r1583, %r1582; + xor.b32 %r1585, %r1584, %r1579; + shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; + add.s32 %r1587, %r1586, %r1580; + xor.b32 %r1588, %r1587, %r1582; + shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; + add.s32 %r1590, %r1542, %r1212; + add.s32 %r1591, %r1590, %r1533; + xor.b32 %r1592, %r1558, %r1591; + shf.l.wrap.b32 %r1593, %r1592, %r1592, 16; + add.s32 %r1594, %r1593, %r1517; + xor.b32 %r1595, %r1594, %r1533; + shf.l.wrap.b32 %r1596, %r1595, %r1595, 20; + add.s32 %r1597, %r1591, %r1135; + add.s32 %r1598, %r1597, %r1596; + xor.b32 %r1599, %r1598, %r1593; + shf.l.wrap.b32 %r1600, %r1599, %r1599, 24; + add.s32 %r1601, %r1600, %r1594; + xor.b32 %r1602, %r1601, %r1596; + shf.l.wrap.b32 %r1603, %r1602, %r1602, 25; + add.s32 %r1604, %r1547, %r1205; + add.s32 %r1605, %r1604, %r1556; + xor.b32 %r1606, %r1605, %r1516; + shf.l.wrap.b32 %r1607, %r1606, %r1606, 16; + add.s32 %r1608, %r1607, %r1531; + xor.b32 %r1609, %r1608, %r1547; + shf.l.wrap.b32 %r1610, %r1609, %r1609, 20; + add.s32 %r1611, %r1605, %r1219; + add.s32 %r1612, %r1611, %r1610; + xor.b32 %r1613, %r1612, %r1607; + shf.l.wrap.b32 %r1614, %r1613, %r1613, 24; + add.s32 %r1615, %r1614, %r1608; + xor.b32 %r1616, %r1615, %r1610; + shf.l.wrap.b32 %r1617, %r1616, %r1616, 25; + add.s32 %r1618, %r1589, %r1142; + add.s32 %r1619, %r1618, %r1570; + xor.b32 %r1620, %r1619, %r1614; + shf.l.wrap.b32 %r1621, %r1620, %r1620, 16; + add.s32 %r1622, %r1621, %r1601; + xor.b32 %r1623, %r1622, %r1589; + shf.l.wrap.b32 %r1624, %r1623, %r1623, 20; + add.s32 %r1625, %r1619, %r1114; + add.s32 %r1626, %r1625, %r1624; + xor.b32 %r1627, %r1626, %r1621; + shf.l.wrap.b32 %r1628, %r1627, %r1627, 24; + add.s32 %r1629, %r1628, %r1622; + xor.b32 %r1630, %r1629, %r1624; + shf.l.wrap.b32 %r1631, %r1630, %r1630, 25; + add.s32 %r1632, %r1584, %r1191; + add.s32 %r1633, %r1632, %r1603; + xor.b32 %r1634, %r1572, %r1633; + shf.l.wrap.b32 %r1635, %r1634, %r1634, 16; + add.s32 %r1636, %r1635, %r1615; + xor.b32 %r1637, %r1636, %r1603; + shf.l.wrap.b32 %r1638, %r1637, %r1637, 20; + add.s32 %r1639, %r1633, %r1128; + add.s32 %r1640, %r1639, %r1638; + xor.b32 %r1641, %r1640, %r1635; + shf.l.wrap.b32 %r1642, %r1641, %r1641, 24; + add.s32 %r1643, %r1642, %r1636; + xor.b32 %r1644, %r1643, %r1638; + shf.l.wrap.b32 %r1645, %r1644, %r1644, 25; + add.s32 %r1646, %r1598, %r1149; + add.s32 %r1647, %r1646, %r1617; + xor.b32 %r1648, %r1647, %r1586; + shf.l.wrap.b32 %r1649, %r1648, %r1648, 16; + add.s32 %r1650, %r1649, %r1573; + xor.b32 %r1651, %r1650, %r1617; + shf.l.wrap.b32 %r1652, %r1651, %r1651, 20; + add.s32 %r1653, %r1647, %r1170; + add.s32 %r1654, %r1653, %r1652; + xor.b32 %r1655, %r1654, %r1649; + shf.l.wrap.b32 %r1656, %r1655, %r1655, 24; + add.s32 %r1657, %r1656, %r1650; + xor.b32 %r1658, %r1657, %r1652; + shf.l.wrap.b32 %r1659, %r1658, %r1658, 25; + add.s32 %r1660, %r1612, %r1121; + add.s32 %r1661, %r1660, %r1575; + xor.b32 %r1662, %r1661, %r1600; + shf.l.wrap.b32 %r1663, %r1662, %r1662, 16; + add.s32 %r1664, %r1663, %r1587; + xor.b32 %r1665, %r1664, %r1575; + shf.l.wrap.b32 %r1666, %r1665, %r1665, 20; + add.s32 %r1667, %r1661, %r1156; + add.s32 %r1668, %r1667, %r1666; + xor.b32 %r1669, %r1668, %r1663; + shf.l.wrap.b32 %r1670, %r1669, %r1669, 24; + add.s32 %r1671, %r1670, %r1664; + xor.b32 %r1672, %r1671, %r1666; + shf.l.wrap.b32 %r1673, %r1672, %r1672, 25; + add.s32 %r1674, %r1626, %r1198; + add.s32 %r1675, %r1674, %r1673; + xor.b32 %r1676, %r1675, %r1642; + shf.l.wrap.b32 %r1677, %r1676, %r1676, 16; + 
add.s32 %r1678, %r1677, %r1657; + xor.b32 %r1679, %r1678, %r1673; + shf.l.wrap.b32 %r1680, %r1679, %r1679, 20; + add.s32 %r1681, %r1675, %r1205; + add.s32 %r1682, %r1681, %r1680; + xor.b32 %r1683, %r1682, %r1677; + shf.l.wrap.b32 %r1684, %r1683, %r1683, 24; + add.s32 %r1685, %r1684, %r1678; + xor.b32 %r1686, %r1685, %r1680; + shf.l.wrap.b32 %r1687, %r1686, %r1686, 25; + add.s32 %r1688, %r1640, %r1177; + add.s32 %r1689, %r1688, %r1631; + xor.b32 %r1690, %r1689, %r1656; + shf.l.wrap.b32 %r1691, %r1690, %r1690, 16; + add.s32 %r1692, %r1691, %r1671; + xor.b32 %r1693, %r1692, %r1631; + shf.l.wrap.b32 %r1694, %r1693, %r1693, 20; + add.s32 %r1695, %r1689, %r1191; + add.s32 %r1696, %r1695, %r1694; + xor.b32 %r1697, %r1696, %r1691; + shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; + add.s32 %r1699, %r1698, %r1692; + xor.b32 %r1700, %r1699, %r1694; + shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; + add.s32 %r1702, %r1654, %r1219; + add.s32 %r1703, %r1702, %r1645; + xor.b32 %r1704, %r1670, %r1703; + shf.l.wrap.b32 %r1705, %r1704, %r1704, 16; + add.s32 %r1706, %r1705, %r1629; + xor.b32 %r1707, %r1706, %r1645; + shf.l.wrap.b32 %r1708, %r1707, %r1707, 20; + add.s32 %r1709, %r1703, %r1184; + add.s32 %r1710, %r1709, %r1708; + xor.b32 %r1711, %r1710, %r1705; + shf.l.wrap.b32 %r1712, %r1711, %r1711, 24; + add.s32 %r1713, %r1712, %r1706; + xor.b32 %r1714, %r1713, %r1708; + shf.l.wrap.b32 %r1715, %r1714, %r1714, 25; + add.s32 %r1716, %r1659, %r1212; + add.s32 %r1717, %r1716, %r1668; + xor.b32 %r1718, %r1717, %r1628; + shf.l.wrap.b32 %r1719, %r1718, %r1718, 16; + add.s32 %r1720, %r1719, %r1643; + xor.b32 %r1721, %r1720, %r1659; + shf.l.wrap.b32 %r1722, %r1721, %r1721, 20; + add.s32 %r1723, %r1717, %r1170; + add.s32 %r1724, %r1723, %r1722; + xor.b32 %r1725, %r1724, %r1719; + shf.l.wrap.b32 %r1726, %r1725, %r1725, 24; + add.s32 %r1727, %r1726, %r1720; + xor.b32 %r1728, %r1727, %r1722; + shf.l.wrap.b32 %r1729, %r1728, %r1728, 25; + add.s32 %r1730, %r1701, %r1163; + add.s32 %r1731, %r1730, %r1682; + xor.b32 %r1732, %r1731, %r1726; + shf.l.wrap.b32 %r1733, %r1732, %r1732, 16; + add.s32 %r1734, %r1733, %r1713; + xor.b32 %r1735, %r1734, %r1701; + shf.l.wrap.b32 %r1736, %r1735, %r1735, 20; + add.s32 %r1737, %r1731, %r1128; + add.s32 %r1738, %r1737, %r1736; + xor.b32 %r1739, %r1738, %r1733; + shf.l.wrap.b32 %r1740, %r1739, %r1739, 24; + add.s32 %r1741, %r1740, %r1734; + xor.b32 %r1742, %r1741, %r1736; + shf.l.wrap.b32 %r1743, %r1742, %r1742, 25; + add.s32 %r1744, %r1696, %r1149; + add.s32 %r1745, %r1744, %r1715; + xor.b32 %r1746, %r1684, %r1745; + shf.l.wrap.b32 %r1747, %r1746, %r1746, 16; + add.s32 %r1748, %r1747, %r1727; + xor.b32 %r1749, %r1748, %r1715; + shf.l.wrap.b32 %r1750, %r1749, %r1749, 20; + add.s32 %r1751, %r1745, %r1135; + add.s32 %r1752, %r1751, %r1750; + xor.b32 %r1753, %r1752, %r1747; + shf.l.wrap.b32 %r1754, %r1753, %r1753, 24; + add.s32 %r1755, %r1754, %r1748; + xor.b32 %r1756, %r1755, %r1750; + shf.l.wrap.b32 %r1757, %r1756, %r1756, 25; + add.s32 %r1758, %r1710, %r1114; + add.s32 %r1759, %r1758, %r1729; + xor.b32 %r1760, %r1759, %r1698; + shf.l.wrap.b32 %r1761, %r1760, %r1760, 16; + add.s32 %r1762, %r1761, %r1685; + xor.b32 %r1763, %r1762, %r1729; + shf.l.wrap.b32 %r1764, %r1763, %r1763, 20; + add.s32 %r1765, %r1759, %r1121; + add.s32 %r1766, %r1765, %r1764; + xor.b32 %r1767, %r1766, %r1761; + shf.l.wrap.b32 %r1768, %r1767, %r1767, 24; + add.s32 %r1769, %r1768, %r1762; + xor.b32 %r1770, %r1769, %r1764; + shf.l.wrap.b32 %r1771, %r1770, %r1770, 25; + add.s32 %r1772, %r1724, %r1156; + add.s32 %r1773, %r1772, 
%r1687; + xor.b32 %r1774, %r1773, %r1712; + shf.l.wrap.b32 %r1775, %r1774, %r1774, 16; + add.s32 %r1776, %r1775, %r1699; + xor.b32 %r1777, %r1776, %r1687; + shf.l.wrap.b32 %r1778, %r1777, %r1777, 20; + add.s32 %r1779, %r1773, %r1142; + add.s32 %r1780, %r1779, %r1778; + xor.b32 %r1781, %r1780, %r1775; + shf.l.wrap.b32 %r1782, %r1781, %r1781, 24; + add.s32 %r1783, %r1782, %r1776; + xor.b32 %r1784, %r1783, %r1778; + shf.l.wrap.b32 %r1785, %r1784, %r1784, 25; + add.s32 %r1786, %r1738, %r1177; + add.s32 %r1787, %r1786, %r1785; + xor.b32 %r1788, %r1787, %r1754; + shf.l.wrap.b32 %r1789, %r1788, %r1788, 16; + add.s32 %r1790, %r1789, %r1769; + xor.b32 %r1791, %r1790, %r1785; + shf.l.wrap.b32 %r1792, %r1791, %r1791, 20; + add.s32 %r1793, %r1787, %r1212; + add.s32 %r1794, %r1793, %r1792; + xor.b32 %r1795, %r1794, %r1789; + shf.l.wrap.b32 %r1796, %r1795, %r1795, 24; + add.s32 %r1797, %r1796, %r1790; + xor.b32 %r1798, %r1797, %r1792; + shf.l.wrap.b32 %r1799, %r1798, %r1798, 25; + add.s32 %r1800, %r1752, %r1191; + add.s32 %r1801, %r1800, %r1743; + xor.b32 %r1802, %r1801, %r1768; + shf.l.wrap.b32 %r1803, %r1802, %r1802, 16; + add.s32 %r1804, %r1803, %r1783; + xor.b32 %r1805, %r1804, %r1743; + shf.l.wrap.b32 %r1806, %r1805, %r1805, 20; + add.s32 %r1807, %r1801, %r1149; + add.s32 %r1808, %r1807, %r1806; + xor.b32 %r1809, %r1808, %r1803; + shf.l.wrap.b32 %r1810, %r1809, %r1809, 24; + add.s32 %r1811, %r1810, %r1804; + xor.b32 %r1812, %r1811, %r1806; + shf.l.wrap.b32 %r1813, %r1812, %r1812, 25; + add.s32 %r1814, %r1766, %r1170; + add.s32 %r1815, %r1814, %r1757; + xor.b32 %r1816, %r1782, %r1815; + shf.l.wrap.b32 %r1817, %r1816, %r1816, 16; + add.s32 %r1818, %r1817, %r1741; + xor.b32 %r1819, %r1818, %r1757; + shf.l.wrap.b32 %r1820, %r1819, %r1819, 20; + add.s32 %r1821, %r1815, %r1198; + add.s32 %r1822, %r1821, %r1820; + xor.b32 %r1823, %r1822, %r1817; + shf.l.wrap.b32 %r1824, %r1823, %r1823, 24; + add.s32 %r1825, %r1824, %r1818; + xor.b32 %r1826, %r1825, %r1820; + shf.l.wrap.b32 %r1827, %r1826, %r1826, 25; + add.s32 %r1828, %r1771, %r1219; + add.s32 %r1829, %r1828, %r1780; + xor.b32 %r1830, %r1829, %r1740; + shf.l.wrap.b32 %r1831, %r1830, %r1830, 16; + add.s32 %r1832, %r1831, %r1755; + xor.b32 %r1833, %r1832, %r1771; + shf.l.wrap.b32 %r1834, %r1833, %r1833, 20; + add.s32 %r1835, %r1829, %r1121; + add.s32 %r1836, %r1835, %r1834; + xor.b32 %r1837, %r1836, %r1831; + shf.l.wrap.b32 %r1838, %r1837, %r1837, 24; + add.s32 %r1839, %r1838, %r1832; + xor.b32 %r1840, %r1839, %r1834; + shf.l.wrap.b32 %r1841, %r1840, %r1840, 25; + add.s32 %r1842, %r1813, %r1205; + add.s32 %r1843, %r1842, %r1794; + xor.b32 %r1844, %r1843, %r1838; + shf.l.wrap.b32 %r1845, %r1844, %r1844, 16; + add.s32 %r1846, %r1845, %r1825; + xor.b32 %r1847, %r1846, %r1813; + shf.l.wrap.b32 %r1848, %r1847, %r1847, 20; + add.s32 %r1849, %r1843, %r1135; + add.s32 %r1850, %r1849, %r1848; + xor.b32 %r1851, %r1850, %r1845; + shf.l.wrap.b32 %r1852, %r1851, %r1851, 24; + add.s32 %r1853, %r1852, %r1846; + xor.b32 %r1854, %r1853, %r1848; + shf.l.wrap.b32 %r1855, %r1854, %r1854, 25; + add.s32 %r1856, %r1808, %r1114; + add.s32 %r1857, %r1856, %r1827; + xor.b32 %r1858, %r1796, %r1857; + shf.l.wrap.b32 %r1859, %r1858, %r1858, 16; + add.s32 %r1860, %r1859, %r1839; + xor.b32 %r1861, %r1860, %r1827; + shf.l.wrap.b32 %r1862, %r1861, %r1861, 20; + add.s32 %r1863, %r1857, %r1184; + add.s32 %r1864, %r1863, %r1862; + xor.b32 %r1865, %r1864, %r1859; + shf.l.wrap.b32 %r1866, %r1865, %r1865, 24; + add.s32 %r1867, %r1866, %r1860; + xor.b32 %r1868, %r1867, %r1862; + shf.l.wrap.b32 
%r1869, %r1868, %r1868, 25; + add.s32 %r1870, %r1822, %r1128; + add.s32 %r1871, %r1870, %r1841; + xor.b32 %r1872, %r1871, %r1810; + shf.l.wrap.b32 %r1873, %r1872, %r1872, 16; + add.s32 %r1874, %r1873, %r1797; + xor.b32 %r1875, %r1874, %r1841; + shf.l.wrap.b32 %r1876, %r1875, %r1875, 20; + add.s32 %r1877, %r1871, %r1156; + add.s32 %r1878, %r1877, %r1876; + xor.b32 %r1879, %r1878, %r1873; + shf.l.wrap.b32 %r1880, %r1879, %r1879, 24; + add.s32 %r1881, %r1880, %r1874; + xor.b32 %r1882, %r1881, %r1876; + shf.l.wrap.b32 %r1883, %r1882, %r1882, 25; + add.s32 %r1884, %r1836, %r1142; + add.s32 %r1885, %r1884, %r1799; + xor.b32 %r1886, %r1885, %r1824; + shf.l.wrap.b32 %r1887, %r1886, %r1886, 16; + add.s32 %r1888, %r1887, %r1811; + xor.b32 %r1889, %r1888, %r1799; + shf.l.wrap.b32 %r1890, %r1889, %r1889, 20; + add.s32 %r1891, %r1885, %r1163; + add.s32 %r1892, %r1891, %r1890; + xor.b32 %r1893, %r1892, %r1887; + shf.l.wrap.b32 %r1894, %r1893, %r1893, 24; + add.s32 %r1895, %r1894, %r1888; + xor.b32 %r1896, %r1895, %r1890; + shf.l.wrap.b32 %r1897, %r1896, %r1896, 25; + add.s32 %r1898, %r1850, %r1191; + add.s32 %r1899, %r1898, %r1897; + xor.b32 %r1900, %r1899, %r1866; + shf.l.wrap.b32 %r1901, %r1900, %r1900, 16; + add.s32 %r1902, %r1901, %r1881; + xor.b32 %r1903, %r1902, %r1897; + shf.l.wrap.b32 %r1904, %r1903, %r1903, 20; + add.s32 %r1905, %r1899, %r1219; + add.s32 %r1906, %r1905, %r1904; + xor.b32 %r1907, %r1906, %r1901; + shf.l.wrap.b32 %r1908, %r1907, %r1907, 24; + add.s32 %r1909, %r1908, %r1902; + xor.b32 %r1910, %r1909, %r1904; + shf.l.wrap.b32 %r1911, %r1910, %r1910, 25; + add.s32 %r1912, %r1864, %r1149; + add.s32 %r1913, %r1912, %r1855; + xor.b32 %r1914, %r1913, %r1880; + shf.l.wrap.b32 %r1915, %r1914, %r1914, 16; + add.s32 %r1916, %r1915, %r1895; + xor.b32 %r1917, %r1916, %r1855; + shf.l.wrap.b32 %r1918, %r1917, %r1917, 20; + add.s32 %r1919, %r1913, %r1114; + add.s32 %r1920, %r1919, %r1918; + xor.b32 %r1921, %r1920, %r1915; + shf.l.wrap.b32 %r1922, %r1921, %r1921, 24; + add.s32 %r1923, %r1922, %r1916; + xor.b32 %r1924, %r1923, %r1918; + shf.l.wrap.b32 %r1925, %r1924, %r1924, 25; + add.s32 %r1926, %r1878, %r1121; + add.s32 %r1927, %r1926, %r1869; + xor.b32 %r1928, %r1894, %r1927; + shf.l.wrap.b32 %r1929, %r1928, %r1928, 16; + add.s32 %r1930, %r1929, %r1853; + xor.b32 %r1931, %r1930, %r1869; + shf.l.wrap.b32 %r1932, %r1931, %r1931, 20; + add.s32 %r1933, %r1927, %r1177; + add.s32 %r1934, %r1933, %r1932; + xor.b32 %r1935, %r1934, %r1929; + shf.l.wrap.b32 %r1936, %r1935, %r1935, 24; + add.s32 %r1937, %r1936, %r1930; + xor.b32 %r1938, %r1937, %r1932; + shf.l.wrap.b32 %r1939, %r1938, %r1938, 25; + add.s32 %r1940, %r1883, %r1170; + add.s32 %r1941, %r1940, %r1892; + xor.b32 %r1942, %r1941, %r1852; + shf.l.wrap.b32 %r1943, %r1942, %r1942, 16; + add.s32 %r1944, %r1943, %r1867; + xor.b32 %r1945, %r1944, %r1883; + shf.l.wrap.b32 %r1946, %r1945, %r1945, 20; + add.s32 %r1947, %r1941, %r1156; + add.s32 %r1948, %r1947, %r1946; + xor.b32 %r1949, %r1948, %r1943; + shf.l.wrap.b32 %r1950, %r1949, %r1949, 24; + add.s32 %r1951, %r1950, %r1944; + xor.b32 %r1952, %r1951, %r1946; + shf.l.wrap.b32 %r1953, %r1952, %r1952, 25; + add.s32 %r1954, %r1925, %r1212; + add.s32 %r1955, %r1954, %r1906; + xor.b32 %r1956, %r1955, %r1950; + shf.l.wrap.b32 %r1957, %r1956, %r1956, 16; + add.s32 %r1958, %r1957, %r1937; + xor.b32 %r1959, %r1958, %r1925; + shf.l.wrap.b32 %r1960, %r1959, %r1959, 20; + add.s32 %r1961, %r1955, %r1184; + add.s32 %r1962, %r1961, %r1960; + xor.b32 %r1963, %r1962, %r1957; + shf.l.wrap.b32 %r1964, %r1963, %r1963, 24; 
+ add.s32 %r1965, %r1964, %r1958; + xor.b32 %r1966, %r1965, %r1960; + shf.l.wrap.b32 %r1967, %r1966, %r1966, 25; + add.s32 %r1968, %r1920, %r1128; + add.s32 %r1969, %r1968, %r1939; + xor.b32 %r1970, %r1908, %r1969; + shf.l.wrap.b32 %r1971, %r1970, %r1970, 16; + add.s32 %r1972, %r1971, %r1951; + xor.b32 %r1973, %r1972, %r1939; + shf.l.wrap.b32 %r1974, %r1973, %r1973, 20; + add.s32 %r1975, %r1969, %r1198; + add.s32 %r1976, %r1975, %r1974; + xor.b32 %r1977, %r1976, %r1971; + shf.l.wrap.b32 %r1978, %r1977, %r1977, 24; + add.s32 %r1979, %r1978, %r1972; + xor.b32 %r1980, %r1979, %r1974; + shf.l.wrap.b32 %r1981, %r1980, %r1980, 25; + add.s32 %r1982, %r1934, %r1135; + add.s32 %r1983, %r1982, %r1953; + xor.b32 %r1984, %r1983, %r1922; + shf.l.wrap.b32 %r1985, %r1984, %r1984, 16; + add.s32 %r1986, %r1985, %r1909; + xor.b32 %r1987, %r1986, %r1953; + shf.l.wrap.b32 %r1988, %r1987, %r1987, 20; + add.s32 %r1989, %r1983, %r1142; + add.s32 %r1990, %r1989, %r1988; + xor.b32 %r1991, %r1990, %r1985; + shf.l.wrap.b32 %r1992, %r1991, %r1991, 24; + add.s32 %r1993, %r1992, %r1986; + xor.b32 %r1994, %r1993, %r1988; + shf.l.wrap.b32 %r1995, %r1994, %r1994, 25; + add.s32 %r1996, %r1948, %r1163; + add.s32 %r1997, %r1996, %r1911; + xor.b32 %r1998, %r1997, %r1936; + shf.l.wrap.b32 %r1999, %r1998, %r1998, 16; + add.s32 %r2000, %r1999, %r1923; + xor.b32 %r2001, %r2000, %r1911; + shf.l.wrap.b32 %r2002, %r2001, %r2001, 20; + add.s32 %r2003, %r1997, %r1205; + add.s32 %r2004, %r2003, %r2002; + xor.b32 %r2005, %r2004, %r1999; + shf.l.wrap.b32 %r2006, %r2005, %r2005, 24; + add.s32 %r2007, %r2006, %r2000; + xor.b32 %r2008, %r2007, %r2002; + shf.l.wrap.b32 %r2009, %r2008, %r2008, 25; + xor.b32 %r3948, %r1993, %r1962; + xor.b32 %r3947, %r2007, %r1976; + xor.b32 %r3946, %r1965, %r1990; + xor.b32 %r3945, %r2004, %r1979; + xor.b32 %r3944, %r2009, %r1978; + xor.b32 %r3943, %r1967, %r1992; + xor.b32 %r3942, %r2006, %r1981; + xor.b32 %r3941, %r1995, %r1964; + add.s64 %rd165, %rd165, 64; + add.s64 %rd166, %rd166, -1; + setp.ne.s64 %p19, %rd166, 0; + mov.u16 %rs197, %rs75; + @%p19 bra $L__BB0_19; + + st.local.u8 [%rd154], %r3948; + shr.u32 %r2010, %r3948, 8; + st.local.u8 [%rd154+1], %r2010; + shr.u32 %r2011, %r3948, 16; + st.local.u8 [%rd154+2], %r2011; + shr.u32 %r2012, %r3948, 24; + st.local.u8 [%rd154+3], %r2012; + st.local.u8 [%rd154+4], %r3947; + shr.u32 %r2013, %r3947, 8; + st.local.u8 [%rd154+5], %r2013; + shr.u32 %r2014, %r3947, 16; + st.local.u8 [%rd154+6], %r2014; + shr.u32 %r2015, %r3947, 24; + st.local.u8 [%rd154+7], %r2015; + st.local.u8 [%rd154+8], %r3946; + shr.u32 %r2016, %r3946, 8; + st.local.u8 [%rd154+9], %r2016; + shr.u32 %r2017, %r3946, 16; + st.local.u8 [%rd154+10], %r2017; + shr.u32 %r2018, %r3946, 24; + st.local.u8 [%rd154+11], %r2018; + st.local.u8 [%rd154+12], %r3945; + shr.u32 %r2019, %r3945, 8; + st.local.u8 [%rd154+13], %r2019; + shr.u32 %r2020, %r3945, 16; + st.local.u8 [%rd154+14], %r2020; + shr.u32 %r2021, %r3945, 24; + st.local.u8 [%rd154+15], %r2021; + st.local.u8 [%rd154+16], %r3944; + shr.u32 %r2022, %r3944, 8; + st.local.u8 [%rd154+17], %r2022; + shr.u32 %r2023, %r3944, 16; + st.local.u8 [%rd154+18], %r2023; + shr.u32 %r2024, %r3944, 24; + st.local.u8 [%rd154+19], %r2024; + st.local.u8 [%rd154+20], %r3943; + shr.u32 %r2025, %r3943, 8; + st.local.u8 [%rd154+21], %r2025; + shr.u32 %r2026, %r3943, 16; + st.local.u8 [%rd154+22], %r2026; + shr.u32 %r2027, %r3943, 24; + st.local.u8 [%rd154+23], %r2027; + st.local.u8 [%rd154+24], %r3942; + shr.u32 %r2028, %r3942, 8; + st.local.u8 [%rd154+25], %r2028; + 
+ shr.u32 %r2029, %r3942, 16;
+ st.local.u8 [%rd154+26], %r2029;
+ shr.u32 %r2030, %r3942, 24;
+ st.local.u8 [%rd154+27], %r2030;
+ st.local.u8 [%rd154+28], %r3941;
+ shr.u32 %r2031, %r3941, 8;
+ st.local.u8 [%rd154+29], %r2031;
+ shr.u32 %r2032, %r3941, 16;
+ st.local.u8 [%rd154+30], %r2032;
+ shr.u32 %r2033, %r3941, 24;
+ st.local.u8 [%rd154+31], %r2033;
+ add.s64 %rd164, %rd164, 1;
+ add.s64 %rd161, %rd161, 8;
+ add.s64 %rd154, %rd154, 32;
+ add.s64 %rd162, %rd162, -1;
+ setp.ne.s64 %p20, %rd162, 0;
+ @%p20 bra $L__BB0_18;
+
+$L__BB0_21:
+ ld.param.u64 %rd138, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1];
+ setp.ge.u64 %p21, %rd158, %rd138;
+ @%p21 bra $L__BB0_30;
+
+ ld.param.u64 %rd139, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0];
+ ld.param.u64 %rd134, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3];
+ add.s64 %rd126, %rd150, %rd134;
+ ld.local.u8 %r2034, [%rd2];
+ ld.local.u8 %r2035, [%rd2+1];
+ prmt.b32 %r2036, %r2035, %r2034, 30212;
+ ld.local.u8 %r2037, [%rd2+2];
+ ld.local.u8 %r2038, [%rd2+3];
+ prmt.b32 %r2039, %r2038, %r2037, 30212;
+ prmt.b32 %r3964, %r2039, %r2036, 4180;
+ ld.local.u8 %r2040, [%rd2+4];
+ ld.local.u8 %r2041, [%rd2+5];
+ prmt.b32 %r2042, %r2041, %r2040, 30212;
+ ld.local.u8 %r2043, [%rd2+6];
+ ld.local.u8 %r2044, [%rd2+7];
+ prmt.b32 %r2045, %r2044, %r2043, 30212;
+ prmt.b32 %r3963, %r2045, %r2042, 4180;
+ ld.local.u8 %r2046, [%rd2+8];
+ ld.local.u8 %r2047, [%rd2+9];
+ prmt.b32 %r2048, %r2047, %r2046, 30212;
+ ld.local.u8 %r2049, [%rd2+10];
+ ld.local.u8 %r2050, [%rd2+11];
+ prmt.b32 %r2051, %r2050, %r2049, 30212;
+ prmt.b32 %r3962, %r2051, %r2048, 4180;
+ ld.local.u8 %r2052, [%rd2+12];
+ ld.local.u8 %r2053, [%rd2+13];
+ prmt.b32 %r2054, %r2053, %r2052, 30212;
+ ld.local.u8 %r2055, [%rd2+14];
+ ld.local.u8 %r2056, [%rd2+15];
+ prmt.b32 %r2057, %r2056, %r2055, 30212;
+ prmt.b32 %r3961, %r2057, %r2054, 4180;
+ ld.local.u8 %r2058, [%rd2+16];
+ ld.local.u8 %r2059, [%rd2+17];
+ prmt.b32 %r2060, %r2059, %r2058, 30212;
+ ld.local.u8 %r2061, [%rd2+18];
+ ld.local.u8 %r2062, [%rd2+19];
+ prmt.b32 %r2063, %r2062, %r2061, 30212;
+ prmt.b32 %r3960, %r2063, %r2060, 4180;
+ ld.local.u8 %r2064, [%rd2+20];
+ ld.local.u8 %r2065, [%rd2+21];
+ prmt.b32 %r2066, %r2065, %r2064, 30212;
+ ld.local.u8 %r2067, [%rd2+22];
+ ld.local.u8 %r2068, [%rd2+23];
+ prmt.b32 %r2069, %r2068, %r2067, 30212;
+ prmt.b32 %r3959, %r2069, %r2066, 4180;
+ ld.local.u8 %r2070, [%rd2+24];
+ ld.local.u8 %r2071, [%rd2+25];
+ prmt.b32 %r2072, %r2071, %r2070, 30212;
+ ld.local.u8 %r2073, [%rd2+26];
+ ld.local.u8 %r2074, [%rd2+27];
+ prmt.b32 %r2075, %r2074, %r2073, 30212;
+ prmt.b32 %r3958, %r2075, %r2072, 4180;
+ ld.local.u8 %r2076, [%rd2+28];
+ ld.local.u8 %r2077, [%rd2+29];
+ prmt.b32 %r2078, %r2077, %r2076, 30212;
+ ld.local.u8 %r2079, [%rd2+30];
+ ld.local.u8 %r2080, [%rd2+31];
+ prmt.b32 %r2081, %r2080, %r2079, 30212;
+ prmt.b32 %r3957, %r2081, %r2078, 4180;
+ add.u64 %rd53, %SPL, 16;
+ mov.u32 %r2082, 0;
+ st.local.v2.u32 [%rd53], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+8], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+16], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+24], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+32], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+40], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+48], {%r2082, %r2082};
+ st.local.v2.u32 [%rd53+56], {%r2082, %r2082};
+ mov.u16 %rs199, 0;
+ st.local.v2.u8 [%rd53+64], {%rs199, %rs199};
+ st.local.u8 [%rd53+66], %rs75;
+ add.s64 %rd169, %rd139, %rd158;
+ cvt.u32.u64 %r36, %rd126;
+ shr.u64 %rd128, %rd126, 32;
+ cvt.u32.u64 %r37, %rd128;
+ setp.lt.u64 %p22, %rd170, 65;
+ @%p22 bra $L__BB0_25;
+
+ add.s64 %rd56, %rd53, 64;
+ mov.u16 %rs198, 0;
+
+$L__BB0_24:
+ and.b16 %rs83, %rs198, 255;
+ setp.eq.s16 %p23, %rs83, 0;
+ selp.u16 %rs84, 1, 0, %p23;
+ or.b16 %rs85, %rs84, %rs75;
+ ld.u8 %r2083, [%rd169];
+ ld.u8 %r2084, [%rd169+1];
+ prmt.b32 %r2085, %r2084, %r2083, 30212;
+ ld.u8 %r2086, [%rd169+2];
+ prmt.b32 %r2087, %r2086, %r2085, 28756;
+ ld.u8 %r2088, [%rd169+3];
+ prmt.b32 %r2089, %r2088, %r2087, 1620;
+ ld.u8 %r2090, [%rd169+4];
+ ld.u8 %r2091, [%rd169+5];
+ prmt.b32 %r2092, %r2091, %r2090, 30212;
+ ld.u8 %r2093, [%rd169+6];
+ prmt.b32 %r2094, %r2093, %r2092, 28756;
+ ld.u8 %r2095, [%rd169+7];
+ prmt.b32 %r2096, %r2095, %r2094, 1620;
+ ld.u8 %r2097, [%rd169+8];
+ ld.u8 %r2098, [%rd169+9];
+ prmt.b32 %r2099, %r2098, %r2097, 30212;
+ ld.u8 %r2100, [%rd169+10];
+ prmt.b32 %r2101, %r2100, %r2099, 28756;
+ ld.u8 %r2102, [%rd169+11];
+ prmt.b32 %r2103, %r2102, %r2101, 1620;
+ ld.u8 %r2104, [%rd169+12];
+ ld.u8 %r2105, [%rd169+13];
+ prmt.b32 %r2106, %r2105, %r2104, 30212;
+ ld.u8 %r2107, [%rd169+14];
+ prmt.b32 %r2108, %r2107, %r2106, 28756;
+ ld.u8 %r2109, [%rd169+15];
+ prmt.b32 %r2110, %r2109, %r2108, 1620;
+ ld.u8 %r2111, [%rd169+16];
+ ld.u8 %r2112, [%rd169+17];
+ prmt.b32 %r2113, %r2112, %r2111, 30212;
+ ld.u8 %r2114, [%rd169+18];
+ prmt.b32 %r2115, %r2114, %r2113, 28756;
+ ld.u8 %r2116, [%rd169+19];
+ prmt.b32 %r2117, %r2116, %r2115, 1620;
+ ld.u8 %r2118, [%rd169+20];
+ ld.u8 %r2119, [%rd169+21];
+ prmt.b32 %r2120, %r2119, %r2118, 30212;
+ ld.u8 %r2121, [%rd169+22];
+ prmt.b32 %r2122, %r2121, %r2120, 28756;
+ ld.u8 %r2123, [%rd169+23];
+ prmt.b32 %r2124, %r2123, %r2122, 1620;
+ ld.u8 %r2125, [%rd169+24];
+ ld.u8 %r2126, [%rd169+25];
+ prmt.b32 %r2127, %r2126, %r2125, 30212;
+ ld.u8 %r2128, [%rd169+26];
+ prmt.b32 %r2129, %r2128, %r2127, 28756;
+ ld.u8 %r2130, [%rd169+27];
+ prmt.b32 %r2131, %r2130, %r2129, 1620;
+ ld.u8 %r2132, [%rd169+28];
+ ld.u8 %r2133, [%rd169+29];
+ prmt.b32 %r2134, %r2133, %r2132, 30212;
+ ld.u8 %r2135, [%rd169+30];
+ prmt.b32 %r2136, %r2135, %r2134, 28756;
+ ld.u8 %r2137, [%rd169+31];
+ prmt.b32 %r2138, %r2137, %r2136, 1620;
+ ld.u8 %r2139, [%rd169+32];
+ ld.u8 %r2140, [%rd169+33];
+ prmt.b32 %r2141, %r2140, %r2139, 30212;
+ ld.u8 %r2142, [%rd169+34];
+ prmt.b32 %r2143, %r2142, %r2141, 28756;
+ ld.u8 %r2144, [%rd169+35];
+ prmt.b32 %r2145, %r2144, %r2143, 1620;
+ ld.u8 %r2146, [%rd169+36];
+ ld.u8 %r2147, [%rd169+37];
+ prmt.b32 %r2148, %r2147, %r2146, 30212;
+ ld.u8 %r2149, [%rd169+38];
+ prmt.b32 %r2150, %r2149, %r2148, 28756;
+ ld.u8 %r2151, [%rd169+39];
+ prmt.b32 %r2152, %r2151, %r2150, 1620;
+ ld.u8 %r2153, [%rd169+40];
+ ld.u8 %r2154, [%rd169+41];
+ prmt.b32 %r2155, %r2154, %r2153, 30212;
+ ld.u8 %r2156, [%rd169+42];
+ prmt.b32 %r2157, %r2156, %r2155, 28756;
+ ld.u8 %r2158, [%rd169+43];
+ prmt.b32 %r2159, %r2158, %r2157, 1620;
+ ld.u8 %r2160, [%rd169+44];
+ ld.u8 %r2161, [%rd169+45];
+ prmt.b32 %r2162, %r2161, %r2160, 30212;
+ ld.u8 %r2163, [%rd169+46];
+ prmt.b32 %r2164, %r2163, %r2162, 28756;
+ ld.u8 %r2165, [%rd169+47];
+ prmt.b32 %r2166, %r2165, %r2164, 1620;
+ ld.u8 %r2167, [%rd169+48];
+ ld.u8 %r2168, [%rd169+49];
+ prmt.b32 %r2169, %r2168, %r2167, 30212;
+ ld.u8 %r2170, [%rd169+50];
+ prmt.b32 %r2171, %r2170, %r2169, 28756;
+ ld.u8 %r2172, [%rd169+51];
+ prmt.b32 %r2173, %r2172, %r2171, 1620;
+ ld.u8 %r2174, [%rd169+52];
+ ld.u8 %r2175, [%rd169+53];
+ prmt.b32 %r2176, %r2175, %r2174, 30212;
+ ld.u8 %r2177, [%rd169+54];
+ prmt.b32 %r2178, %r2177, %r2176, 28756;
+ ld.u8 %r2179, [%rd169+55];
+ prmt.b32 %r2180, %r2179, %r2178, 1620;
+ ld.u8 %r2181, [%rd169+56];
+ ld.u8 %r2182, [%rd169+57];
+ prmt.b32 %r2183, %r2182, %r2181, 30212;
+ ld.u8 %r2184, [%rd169+58];
+ prmt.b32 %r2185, %r2184, %r2183, 28756;
+ ld.u8 %r2186, [%rd169+59];
+ prmt.b32 %r2187, %r2186, %r2185, 1620;
+ ld.u8 %r2188, [%rd169+60];
+ ld.u8 %r2189, [%rd169+61];
+ prmt.b32 %r2190, %r2189, %r2188, 30212;
+ ld.u8 %r2191, [%rd169+62];
+ prmt.b32 %r2192, %r2191, %r2190, 28756;
+ ld.u8 %r2193, [%rd169+63];
+ prmt.b32 %r2194, %r2193, %r2192, 1620;
+ cvt.u32.u16 %r2195, %rs85;
+ add.s32 %r2196, %r3964, %r2089;
+ add.s32 %r2197, %r2196, %r3960;
+ xor.b32 %r2198, %r2197, %r36;
+ shf.l.wrap.b32 %r2199, %r2198, %r2198, 16;
+ add.s32 %r2200, %r2199, 1779033703;
+ xor.b32 %r2201, %r2200, %r3960;
+ shf.l.wrap.b32 %r2202, %r2201, %r2201, 20;
+ add.s32 %r2203, %r2197, %r2096;
+ add.s32 %r2204, %r2203, %r2202;
+ xor.b32 %r2205, %r2204, %r2199;
+ shf.l.wrap.b32 %r2206, %r2205, %r2205, 24;
+ add.s32 %r2207, %r2206, %r2200;
+ xor.b32 %r2208, %r2207, %r2202;
+ shf.l.wrap.b32 %r2209, %r2208, %r2208, 25;
+ add.s32 %r2210, %r3963, %r2103;
+ add.s32 %r2211, %r2210, %r3959;
+ xor.b32 %r2212, %r2211, %r37;
+ shf.l.wrap.b32 %r2213, %r2212, %r2212, 16;
+ add.s32 %r2214, %r2213, -1150833019;
+ xor.b32 %r2215, %r2214, %r3959;
+ shf.l.wrap.b32 %r2216, %r2215, %r2215, 20;
+ add.s32 %r2217, %r2211, %r2110;
+ add.s32 %r2218, %r2217, %r2216;
+ xor.b32 %r2219, %r2218, %r2213;
+ shf.l.wrap.b32 %r2220, %r2219, %r2219, 24;
+ add.s32 %r2221, %r2220, %r2214;
+ xor.b32 %r2222, %r2221, %r2216;
+ shf.l.wrap.b32 %r2223, %r2222, %r2222, 25;
+ add.s32 %r2224, %r3962, %r2117;
+ add.s32 %r2225, %r2224, %r3958;
+ shr.u32 %r2226, %r2225, 16;
+ shl.b32 %r2227, %r2225, 16;
+ xor.b32 %r2228, %r2227, 4194304;
+ or.b32 %r2229, %r2228, %r2226;
+ add.s32 %r2230, %r2229, 1013904242;
+ xor.b32 %r2231, %r2230, %r3958;
+ shf.l.wrap.b32 %r2232, %r2231, %r2231, 20;
+ add.s32 %r2233, %r2225, %r2124;
+ add.s32 %r2234, %r2233, %r2232;
+ xor.b32 %r2235, %r2234, %r2229;
+ shf.l.wrap.b32 %r2236, %r2235, %r2235, 24;
+ add.s32 %r2237, %r2236, %r2230;
+ xor.b32 %r2238, %r2237, %r2232;
+ shf.l.wrap.b32 %r2239, %r2238, %r2238, 25;
+ add.s32 %r2240, %r3961, %r2131;
+ add.s32 %r2241, %r2240, %r3957;
+ xor.b32 %r2242, %r2241, %r2195;
+ shr.u32 %r2243, %r2241, 16;
+ shl.b32 %r2244, %r2242, 16;
+ or.b32 %r2245, %r2244, %r2243;
+ add.s32 %r2246, %r2245, -1521486534;
+ xor.b32 %r2247, %r2246, %r3957;
+ shf.l.wrap.b32 %r2248, %r2247, %r2247, 20;
+ add.s32 %r2249, %r2241, %r2138;
+ add.s32 %r2250, %r2249, %r2248;
+ xor.b32 %r2251, %r2250, %r2245;
+ shf.l.wrap.b32 %r2252, %r2251, %r2251, 24;
+ add.s32 %r2253, %r2252, %r2246;
+ xor.b32 %r2254, %r2253, %r2248;
+ shf.l.wrap.b32 %r2255, %r2254, %r2254, 25;
+ add.s32 %r2256, %r2204, %r2145;
+ add.s32 %r2257, %r2256, %r2223;
+ xor.b32 %r2258, %r2257, %r2252;
+ shf.l.wrap.b32 %r2259, %r2258, %r2258, 16;
+ add.s32 %r2260, %r2259, %r2237;
+ xor.b32 %r2261, %r2260, %r2223;
+ shf.l.wrap.b32 %r2262, %r2261, %r2261, 20;
+ add.s32 %r2263, %r2257, %r2152;
+ add.s32 %r2264, %r2263, %r2262;
+ xor.b32 %r2265, %r2264, %r2259;
+ shf.l.wrap.b32 %r2266, %r2265, %r2265, 24;
+ add.s32 %r2267, %r2266, %r2260;
+ xor.b32 %r2268, %r2267, %r2262;
+ shf.l.wrap.b32 %r2269, %r2268, %r2268, 25;
+ add.s32 %r2270, %r2218, %r2159;
+ add.s32 %r2271, %r2270, %r2239;
+ xor.b32 %r2272, %r2271, %r2206;
+ shf.l.wrap.b32 %r2273, %r2272, %r2272, 16;
+ add.s32 %r2274, %r2273, %r2253;
+ xor.b32 %r2275, %r2274, %r2239;
+ shf.l.wrap.b32 %r2276, %r2275, %r2275, 20;
+ add.s32 %r2277, %r2271, %r2166;
+ add.s32 %r2278, %r2277, %r2276;
+ xor.b32 %r2279, %r2278, %r2273;
+ shf.l.wrap.b32 %r2280, %r2279, %r2279, 24;
+ add.s32 %r2281, %r2280, %r2274;
+ xor.b32 %r2282, %r2281, %r2276;
+ shf.l.wrap.b32 %r2283, %r2282, %r2282, 25;
+ add.s32 %r2284, %r2234, %r2173;
+ add.s32 %r2285, %r2284, %r2255;
+ xor.b32 %r2286, %r2285, %r2220;
+ shf.l.wrap.b32 %r2287, %r2286, %r2286, 16;
+ add.s32 %r2288, %r2287, %r2207;
+ xor.b32 %r2289, %r2288, %r2255;
+ shf.l.wrap.b32 %r2290, %r2289, %r2289, 20;
+ add.s32 %r2291, %r2285, %r2180;
+ add.s32 %r2292, %r2291, %r2290;
+ xor.b32 %r2293, %r2292, %r2287;
+ shf.l.wrap.b32 %r2294, %r2293, %r2293, 24;
+ add.s32 %r2295, %r2294, %r2288;
+ xor.b32 %r2296, %r2295, %r2290;
+ shf.l.wrap.b32 %r2297, %r2296, %r2296, 25;
+ add.s32 %r2298, %r2250, %r2187;
+ add.s32 %r2299, %r2298, %r2209;
+ xor.b32 %r2300, %r2299, %r2236;
+ shf.l.wrap.b32 %r2301, %r2300, %r2300, 16;
+ add.s32 %r2302, %r2301, %r2221;
+ xor.b32 %r2303, %r2302, %r2209;
+ shf.l.wrap.b32 %r2304, %r2303, %r2303, 20;
+ add.s32 %r2305, %r2299, %r2194;
+ add.s32 %r2306, %r2305, %r2304;
+ xor.b32 %r2307, %r2306, %r2301;
+ shf.l.wrap.b32 %r2308, %r2307, %r2307, 24;
+ add.s32 %r2309, %r2308, %r2302;
+ xor.b32 %r2310, %r2309, %r2304;
+ shf.l.wrap.b32 %r2311, %r2310, %r2310, 25;
+ add.s32 %r2312, %r2264, %r2103;
+ add.s32 %r2313, %r2312, %r2311;
+ xor.b32 %r2314, %r2313, %r2280;
+ shf.l.wrap.b32 %r2315, %r2314, %r2314, 16;
+ add.s32 %r2316, %r2315, %r2295;
+ xor.b32 %r2317, %r2316, %r2311;
+ shf.l.wrap.b32 %r2318, %r2317, %r2317, 20;
+ add.s32 %r2319, %r2313, %r2131;
+ add.s32 %r2320, %r2319, %r2318;
+ xor.b32 %r2321, %r2320, %r2315;
+ shf.l.wrap.b32 %r2322, %r2321, %r2321, 24;
+ add.s32 %r2323, %r2322, %r2316;
+ xor.b32 %r2324, %r2323, %r2318;
+ shf.l.wrap.b32 %r2325, %r2324, %r2324, 25;
+ add.s32 %r2326, %r2278, %r2110;
+ add.s32 %r2327, %r2326, %r2269;
+ xor.b32 %r2328, %r2327, %r2294;
+ shf.l.wrap.b32 %r2329, %r2328, %r2328, 16;
+ add.s32 %r2330, %r2329, %r2309;
+ xor.b32 %r2331, %r2330, %r2269;
+ shf.l.wrap.b32 %r2332, %r2331, %r2331, 20;
+ add.s32 %r2333, %r2327, %r2159;
+ add.s32 %r2334, %r2333, %r2332;
+ xor.b32 %r2335, %r2334, %r2329;
+ shf.l.wrap.b32 %r2336, %r2335, %r2335, 24;
+ add.s32 %r2337, %r2336, %r2330;
+ xor.b32 %r2338, %r2337, %r2332;
+ shf.l.wrap.b32 %r2339, %r2338, %r2338, 25;
+ add.s32 %r2340, %r2292, %r2138;
+ add.s32 %r2341, %r2340, %r2283;
+ xor.b32 %r2342, %r2341, %r2308;
+ shf.l.wrap.b32 %r2343, %r2342, %r2342, 16;
+ add.s32 %r2344, %r2343, %r2267;
+ xor.b32 %r2345, %r2344, %r2283;
+ shf.l.wrap.b32 %r2346, %r2345, %r2345, 20;
+ add.s32 %r2347, %r2341, %r2089;
+ add.s32 %r2348, %r2347, %r2346;
+ xor.b32 %r2349, %r2348, %r2343;
+ shf.l.wrap.b32 %r2350, %r2349, %r2349, 24;
+ add.s32 %r2351, %r2350, %r2344;
+ xor.b32 %r2352, %r2351, %r2346;
+ shf.l.wrap.b32 %r2353, %r2352, %r2352, 25;
+ add.s32 %r2354, %r2306, %r2117;
+ add.s32 %r2355, %r2354, %r2297;
+ xor.b32 %r2356, %r2355, %r2266;
+ shf.l.wrap.b32 %r2357, %r2356, %r2356, 16;
+ add.s32 %r2358, %r2357, %r2281;
+ xor.b32 %r2359, %r2358, %r2297;
+ shf.l.wrap.b32 %r2360, %r2359, %r2359, 20;
+ add.s32 %r2361, %r2355, %r2180;
+ add.s32 %r2362, %r2361, %r2360;
+ xor.b32 %r2363, %r2362, %r2357;
+ shf.l.wrap.b32 %r2364, %r2363, %r2363, 24;
+ add.s32 %r2365, %r2364, %r2358;
+ xor.b32 %r2366, %r2365, %r2360;
+ shf.l.wrap.b32 %r2367, %r2366, %r2366, 25;
+ add.s32 %r2368, %r2320, %r2096;
+ add.s32 %r2369, %r2368, %r2339;
+ xor.b32 %r2370, %r2369, %r2364;
+ shf.l.wrap.b32 %r2371, %r2370, %r2370, 16;
+ add.s32 %r2372, %r2371, %r2351;
+ xor.b32 %r2373, %r2372, %r2339;
+ shf.l.wrap.b32 %r2374, %r2373, %r2373, 20;
+ add.s32 %r2375, %r2369, %r2166;
+ add.s32 %r2376, %r2375, %r2374;
+ xor.b32 %r2377, %r2376, %r2371;
+ shf.l.wrap.b32 %r2378, %r2377, %r2377, 24;
+ add.s32 %r2379, %r2378, %r2372;
+ xor.b32 %r2380, %r2379, %r2374;
+ shf.l.wrap.b32 %r2381, %r2380, %r2380, 25;
+ add.s32 %r2382, %r2334, %r2173;
+ add.s32 %r2383, %r2382, %r2353;
+ xor.b32 %r2384, %r2383, %r2322;
+ shf.l.wrap.b32 %r2385, %r2384, %r2384, 16;
+ add.s32 %r2386, %r2385, %r2365;
+ xor.b32 %r2387, %r2386, %r2353;
+ shf.l.wrap.b32 %r2388, %r2387, %r2387, 20;
+ add.s32 %r2389, %r2383, %r2124;
+ add.s32 %r2390, %r2389, %r2388;
+ xor.b32 %r2391, %r2390, %r2385;
+ shf.l.wrap.b32 %r2392, %r2391, %r2391, 24;
+ add.s32 %r2393, %r2392, %r2386;
+ xor.b32 %r2394, %r2393, %r2388;
+ shf.l.wrap.b32 %r2395, %r2394, %r2394, 25;
+ add.s32 %r2396, %r2348, %r2152;
+ add.s32 %r2397, %r2396, %r2367;
+ xor.b32 %r2398, %r2397, %r2336;
+ shf.l.wrap.b32 %r2399, %r2398, %r2398, 16;
+ add.s32 %r2400, %r2399, %r2323;
+ xor.b32 %r2401, %r2400, %r2367;
+ shf.l.wrap.b32 %r2402, %r2401, %r2401, 20;
+ add.s32 %r2403, %r2397, %r2187;
+ add.s32 %r2404, %r2403, %r2402;
+ xor.b32 %r2405, %r2404, %r2399;
+ shf.l.wrap.b32 %r2406, %r2405, %r2405, 24;
+ add.s32 %r2407, %r2406, %r2400;
+ xor.b32 %r2408, %r2407, %r2402;
+ shf.l.wrap.b32 %r2409, %r2408, %r2408, 25;
+ add.s32 %r2410, %r2362, %r2194;
+ add.s32 %r2411, %r2410, %r2325;
+ xor.b32 %r2412, %r2411, %r2350;
+ shf.l.wrap.b32 %r2413, %r2412, %r2412, 16;
+ add.s32 %r2414, %r2413, %r2337;
+ xor.b32 %r2415, %r2414, %r2325;
+ shf.l.wrap.b32 %r2416, %r2415, %r2415, 20;
+ add.s32 %r2417, %r2411, %r2145;
+ add.s32 %r2418, %r2417, %r2416;
+ xor.b32 %r2419, %r2418, %r2413;
+ shf.l.wrap.b32 %r2420, %r2419, %r2419, 24;
+ add.s32 %r2421, %r2420, %r2414;
+ xor.b32 %r2422, %r2421, %r2416;
+ shf.l.wrap.b32 %r2423, %r2422, %r2422, 25;
+ add.s32 %r2424, %r2376, %r2110;
+ add.s32 %r2425, %r2424, %r2423;
+ xor.b32 %r2426, %r2425, %r2392;
+ shf.l.wrap.b32 %r2427, %r2426, %r2426, 16;
+ add.s32 %r2428, %r2427, %r2407;
+ xor.b32 %r2429, %r2428, %r2423;
+ shf.l.wrap.b32 %r2430, %r2429, %r2429, 20;
+ add.s32 %r2431, %r2425, %r2117;
+ add.s32 %r2432, %r2431, %r2430;
+ xor.b32 %r2433, %r2432, %r2427;
+ shf.l.wrap.b32 %r2434, %r2433, %r2433, 24;
+ add.s32 %r2435, %r2434, %r2428;
+ xor.b32 %r2436, %r2435, %r2430;
+ shf.l.wrap.b32 %r2437, %r2436, %r2436, 25;
+ add.s32 %r2438, %r2390, %r2159;
+ add.s32 %r2439, %r2438, %r2381;
+ xor.b32 %r2440, %r2439, %r2406;
+ shf.l.wrap.b32 %r2441, %r2440, %r2440, 16;
+ add.s32 %r2442, %r2441, %r2421;
+ xor.b32 %r2443, %r2442, %r2381;
+ shf.l.wrap.b32 %r2444, %r2443, %r2443, 20;
+ add.s32 %r2445, %r2439, %r2173;
+ add.s32 %r2446, %r2445, %r2444;
+ xor.b32 %r2447, %r2446, %r2441;
+ shf.l.wrap.b32 %r2448, %r2447, %r2447, 24;
+ add.s32 %r2449, %r2448, %r2442;
+ xor.b32 %r2450, %r2449, %r2444;
+ shf.l.wrap.b32 %r2451, %r2450, %r2450, 25;
+ add.s32 %r2452, %r2404, %r2180;
+ add.s32 %r2453, %r2452, %r2395;
+ xor.b32 %r2454, %r2453, %r2420;
+ shf.l.wrap.b32 %r2455, %r2454, %r2454, 16;
+ add.s32 %r2456, %r2455, %r2379;
+ xor.b32 %r2457, %r2456, %r2395;
+ shf.l.wrap.b32 %r2458, %r2457, %r2457, 20;
+ add.s32 %r2459, %r2453, %r2103;
+ add.s32 %r2460, %r2459, %r2458;
+ xor.b32 %r2461, %r2460, %r2455;
+ shf.l.wrap.b32 %r2462, %r2461, %r2461, 24;
+ add.s32 %r2463, %r2462, %r2456;
+ xor.b32 %r2464, %r2463, %r2458;
+ shf.l.wrap.b32 %r2465, %r2464, %r2464, 25;
+ add.s32 %r2466, %r2418, %r2138;
+ add.s32 %r2467, %r2466, %r2409;
+ xor.b32 %r2468, %r2467, %r2378;
+ shf.l.wrap.b32 %r2469, %r2468, %r2468, 16;
+ add.s32 %r2470, %r2469, %r2393;
+ xor.b32 %r2471, %r2470, %r2409;
+ shf.l.wrap.b32 %r2472, %r2471, %r2471, 20;
+ add.s32 %r2473, %r2467, %r2187;
+ add.s32 %r2474, %r2473, %r2472;
+ xor.b32 %r2475, %r2474, %r2469;
+ shf.l.wrap.b32 %r2476, %r2475, %r2475, 24;
+ add.s32 %r2477, %r2476, %r2470;
+ xor.b32 %r2478, %r2477, %r2472;
+ shf.l.wrap.b32 %r2479, %r2478, %r2478, 25;
+ add.s32 %r2480, %r2432, %r2131;
+ add.s32 %r2481, %r2480, %r2451;
+ xor.b32 %r2482, %r2481, %r2476;
+ shf.l.wrap.b32 %r2483, %r2482, %r2482, 16;
+ add.s32 %r2484, %r2483, %r2463;
+ xor.b32 %r2485, %r2484, %r2451;
+ shf.l.wrap.b32 %r2486, %r2485, %r2485, 20;
+ add.s32 %r2487, %r2481, %r2124;
+ add.s32 %r2488, %r2487, %r2486;
+ xor.b32 %r2489, %r2488, %r2483;
+ shf.l.wrap.b32 %r2490, %r2489, %r2489, 24;
+ add.s32 %r2491, %r2490, %r2484;
+ xor.b32 %r2492, %r2491, %r2486;
+ shf.l.wrap.b32 %r2493, %r2492, %r2492, 25;
+ add.s32 %r2494, %r2446, %r2152;
+ add.s32 %r2495, %r2494, %r2465;
+ xor.b32 %r2496, %r2495, %r2434;
+ shf.l.wrap.b32 %r2497, %r2496, %r2496, 16;
+ add.s32 %r2498, %r2497, %r2477;
+ xor.b32 %r2499, %r2498, %r2465;
+ shf.l.wrap.b32 %r2500, %r2499, %r2499, 20;
+ add.s32 %r2501, %r2495, %r2089;
+ add.s32 %r2502, %r2501, %r2500;
+ xor.b32 %r2503, %r2502, %r2497;
+ shf.l.wrap.b32 %r2504, %r2503, %r2503, 24;
+ add.s32 %r2505, %r2504, %r2498;
+ xor.b32 %r2506, %r2505, %r2500;
+ shf.l.wrap.b32 %r2507, %r2506, %r2506, 25;
+ add.s32 %r2508, %r2460, %r2166;
+ add.s32 %r2509, %r2508, %r2479;
+ xor.b32 %r2510, %r2509, %r2448;
+ shf.l.wrap.b32 %r2511, %r2510, %r2510, 16;
+ add.s32 %r2512, %r2511, %r2435;
+ xor.b32 %r2513, %r2512, %r2479;
+ shf.l.wrap.b32 %r2514, %r2513, %r2513, 20;
+ add.s32 %r2515, %r2509, %r2194;
+ add.s32 %r2516, %r2515, %r2514;
+ xor.b32 %r2517, %r2516, %r2511;
+ shf.l.wrap.b32 %r2518, %r2517, %r2517, 24;
+ add.s32 %r2519, %r2518, %r2512;
+ xor.b32 %r2520, %r2519, %r2514;
+ shf.l.wrap.b32 %r2521, %r2520, %r2520, 25;
+ add.s32 %r2522, %r2474, %r2145;
+ add.s32 %r2523, %r2522, %r2437;
+ xor.b32 %r2524, %r2523, %r2462;
+ shf.l.wrap.b32 %r2525, %r2524, %r2524, 16;
+ add.s32 %r2526, %r2525, %r2449;
+ xor.b32 %r2527, %r2526, %r2437;
+ shf.l.wrap.b32 %r2528, %r2527, %r2527, 20;
+ add.s32 %r2529, %r2523, %r2096;
+ add.s32 %r2530, %r2529, %r2528;
+ xor.b32 %r2531, %r2530, %r2525;
+ shf.l.wrap.b32 %r2532, %r2531, %r2531, 24;
+ add.s32 %r2533, %r2532, %r2526;
+ xor.b32 %r2534, %r2533, %r2528;
+ shf.l.wrap.b32 %r2535, %r2534, %r2534, 25;
+ add.s32 %r2536, %r2488, %r2159;
+ add.s32 %r2537, %r2536, %r2535;
+ xor.b32 %r2538, %r2537, %r2504;
+ shf.l.wrap.b32 %r2539, %r2538, %r2538, 16;
+ add.s32 %r2540, %r2539, %r2519;
+ xor.b32 %r2541, %r2540, %r2535;
+ shf.l.wrap.b32 %r2542, %r2541, %r2541, 20;
+ add.s32 %r2543, %r2537, %r2138;
+ add.s32 %r2544, %r2543, %r2542;
+ xor.b32 %r2545, %r2544, %r2539;
+ shf.l.wrap.b32 %r2546, %r2545, %r2545, 24;
+ add.s32 %r2547, %r2546, %r2540;
+ xor.b32 %r2548, %r2547, %r2542;
+ shf.l.wrap.b32 %r2549, %r2548, %r2548, 25;
+ add.s32 %r2550, %r2502, %r2173;
+ add.s32 %r2551, %r2550, %r2493;
+ xor.b32 %r2552, %r2551, %r2518;
+ shf.l.wrap.b32 %r2553, %r2552, %r2552, 16;
+ add.s32 %r2554, %r2553, %r2533;
+ xor.b32 %r2555, %r2554, %r2493;
+ shf.l.wrap.b32 %r2556, %r2555, %r2555, 20;
+ add.s32 %r2557, %r2551, %r2152;
+ add.s32 %r2558, %r2557, %r2556;
+ xor.b32 %r2559, %r2558, %r2553;
+ shf.l.wrap.b32 %r2560, %r2559, %r2559, 24;
+ add.s32 %r2561, %r2560, %r2554;
+ xor.b32 %r2562, %r2561, %r2556;
+ shf.l.wrap.b32 %r2563, %r2562, %r2562, 25;
+ add.s32 %r2564, %r2516, %r2187;
+ add.s32 %r2565, %r2564, %r2507;
+ xor.b32 %r2566, %r2565, %r2532;
+ shf.l.wrap.b32 %r2567, %r2566, %r2566, 16;
+ add.s32 %r2568, %r2567, %r2491;
+ xor.b32 %r2569, %r2568, %r2507;
+ shf.l.wrap.b32 %r2570, %r2569, %r2569, 20;
+ add.s32 %r2571, %r2565, %r2110;
+ add.s32 %r2572, %r2571, %r2570;
+ xor.b32 %r2573, %r2572, %r2567;
+ shf.l.wrap.b32 %r2574, %r2573, %r2573, 24;
+ add.s32 %r2575, %r2574, %r2568;
+ xor.b32 %r2576, %r2575, %r2570;
+ shf.l.wrap.b32 %r2577, %r2576, %r2576, 25;
+ add.s32 %r2578, %r2530, %r2180;
+ add.s32 %r2579, %r2578, %r2521;
+ xor.b32 %r2580, %r2579, %r2490;
+ shf.l.wrap.b32 %r2581, %r2580, %r2580, 16;
+ add.s32 %r2582, %r2581, %r2505;
+ xor.b32 %r2583, %r2582, %r2521;
+ shf.l.wrap.b32 %r2584, %r2583, %r2583, 20;
+ add.s32 %r2585, %r2579, %r2194;
+ add.s32 %r2586, %r2585, %r2584;
+ xor.b32 %r2587, %r2586, %r2581;
+ shf.l.wrap.b32 %r2588, %r2587, %r2587, 24;
+ add.s32 %r2589, %r2588, %r2582;
+ xor.b32 %r2590, %r2589, %r2584;
+ shf.l.wrap.b32 %r2591, %r2590, %r2590, 25;
+ add.s32 %r2592, %r2544, %r2117;
+ add.s32 %r2593, %r2592, %r2563;
+ xor.b32 %r2594, %r2593, %r2588;
+ shf.l.wrap.b32 %r2595, %r2594, %r2594, 16;
+ add.s32 %r2596, %r2595, %r2575;
+ xor.b32 %r2597, %r2596, %r2563;
+ shf.l.wrap.b32 %r2598, %r2597, %r2597, 20;
+ add.s32 %r2599, %r2593, %r2089;
+ add.s32 %r2600, %r2599, %r2598;
+ xor.b32 %r2601, %r2600, %r2595;
+ shf.l.wrap.b32 %r2602, %r2601, %r2601, 24;
+ add.s32 %r2603, %r2602, %r2596;
+ xor.b32 %r2604, %r2603, %r2598;
+ shf.l.wrap.b32 %r2605, %r2604, %r2604, 25;
+ add.s32 %r2606, %r2558, %r2166;
+ add.s32 %r2607, %r2606, %r2577;
+ xor.b32 %r2608, %r2607, %r2546;
+ shf.l.wrap.b32 %r2609, %r2608, %r2608, 16;
+ add.s32 %r2610, %r2609, %r2589;
+ xor.b32 %r2611, %r2610, %r2577;
+ shf.l.wrap.b32 %r2612, %r2611, %r2611, 20;
+ add.s32 %r2613, %r2607, %r2103;
+ add.s32 %r2614, %r2613, %r2612;
+ xor.b32 %r2615, %r2614, %r2609;
+ shf.l.wrap.b32 %r2616, %r2615, %r2615, 24;
+ add.s32 %r2617, %r2616, %r2610;
+ xor.b32 %r2618, %r2617, %r2612;
+ shf.l.wrap.b32 %r2619, %r2618, %r2618, 25;
+ add.s32 %r2620, %r2572, %r2124;
+ add.s32 %r2621, %r2620, %r2591;
+ xor.b32 %r2622, %r2621, %r2560;
+ shf.l.wrap.b32 %r2623, %r2622, %r2622, 16;
+ add.s32 %r2624, %r2623, %r2547;
+ xor.b32 %r2625, %r2624, %r2591;
+ shf.l.wrap.b32 %r2626, %r2625, %r2625, 20;
+ add.s32 %r2627, %r2621, %r2145;
+ add.s32 %r2628, %r2627, %r2626;
+ xor.b32 %r2629, %r2628, %r2623;
+ shf.l.wrap.b32 %r2630, %r2629, %r2629, 24;
+ add.s32 %r2631, %r2630, %r2624;
+ xor.b32 %r2632, %r2631, %r2626;
+ shf.l.wrap.b32 %r2633, %r2632, %r2632, 25;
+ add.s32 %r2634, %r2586, %r2096;
+ add.s32 %r2635, %r2634, %r2549;
+ xor.b32 %r2636, %r2635, %r2574;
+ shf.l.wrap.b32 %r2637, %r2636, %r2636, 16;
+ add.s32 %r2638, %r2637, %r2561;
+ xor.b32 %r2639, %r2638, %r2549;
+ shf.l.wrap.b32 %r2640, %r2639, %r2639, 20;
+ add.s32 %r2641, %r2635, %r2131;
+ add.s32 %r2642, %r2641, %r2640;
+ xor.b32 %r2643, %r2642, %r2637;
+ shf.l.wrap.b32 %r2644, %r2643, %r2643, 24;
+ add.s32 %r2645, %r2644, %r2638;
+ xor.b32 %r2646, %r2645, %r2640;
+ shf.l.wrap.b32 %r2647, %r2646, %r2646, 25;
+ add.s32 %r2648, %r2600, %r2173;
+ add.s32 %r2649, %r2648, %r2647;
+ xor.b32 %r2650, %r2649, %r2616;
+ shf.l.wrap.b32 %r2651, %r2650, %r2650, 16;
+ add.s32 %r2652, %r2651, %r2631;
+ xor.b32 %r2653, %r2652, %r2647;
+ shf.l.wrap.b32 %r2654, %r2653, %r2653, 20;
+ add.s32 %r2655, %r2649, %r2180;
+ add.s32 %r2656, %r2655, %r2654;
+ xor.b32 %r2657, %r2656, %r2651;
+ shf.l.wrap.b32 %r2658, %r2657, %r2657, 24;
+ add.s32 %r2659, %r2658, %r2652;
+ xor.b32 %r2660, %r2659, %r2654;
+ shf.l.wrap.b32 %r2661, %r2660, %r2660, 25;
+ add.s32 %r2662, %r2614, %r2152;
+ add.s32 %r2663, %r2662, %r2605;
+ xor.b32 %r2664, %r2663, %r2630;
+ shf.l.wrap.b32 %r2665, %r2664, %r2664, 16;
+ add.s32 %r2666, %r2665, %r2645;
+ xor.b32 %r2667, %r2666, %r2605;
+ shf.l.wrap.b32 %r2668, %r2667, %r2667, 20;
+ add.s32 %r2669, %r2663, %r2166;
+ add.s32 %r2670, %r2669, %r2668;
+ xor.b32 %r2671, %r2670, %r2665;
+ shf.l.wrap.b32 %r2672, %r2671, %r2671, 24;
+ add.s32 %r2673, %r2672, %r2666;
+ xor.b32 %r2674, %r2673, %r2668;
+ shf.l.wrap.b32 %r2675, %r2674, %r2674, 25;
+ add.s32 %r2676, %r2628, %r2194;
+ add.s32 %r2677, %r2676, %r2619;
+ xor.b32 %r2678, %r2677, %r2644;
+ shf.l.wrap.b32 %r2679, %r2678, %r2678, 16;
+ add.s32 %r2680, %r2679, %r2603;
+ xor.b32 %r2681, %r2680, %r2619;
+ shf.l.wrap.b32 %r2682, %r2681, %r2681, 20;
+ add.s32 %r2683, %r2677, %r2159;
+ add.s32 %r2684, %r2683, %r2682;
+ xor.b32 %r2685, %r2684, %r2679;
+ shf.l.wrap.b32 %r2686, %r2685, %r2685, 24;
+ add.s32 %r2687, %r2686, %r2680;
+ xor.b32 %r2688, %r2687, %r2682;
+ shf.l.wrap.b32 %r2689, %r2688, %r2688, 25;
+ add.s32 %r2690, %r2642, %r2187;
+ add.s32 %r2691, %r2690, %r2633;
+ xor.b32 %r2692, %r2691, %r2602;
+ shf.l.wrap.b32 %r2693, %r2692, %r2692, 16;
+ add.s32 %r2694, %r2693, %r2617;
+ xor.b32 %r2695, %r2694, %r2633;
+ shf.l.wrap.b32 %r2696, %r2695, %r2695, 20;
+ add.s32 %r2697, %r2691, %r2145;
+ add.s32 %r2698, %r2697, %r2696;
+ xor.b32 %r2699, %r2698, %r2693;
+ shf.l.wrap.b32 %r2700, %r2699, %r2699, 24;
+ add.s32 %r2701, %r2700, %r2694;
+ xor.b32 %r2702, %r2701, %r2696;
+ shf.l.wrap.b32 %r2703, %r2702, %r2702, 25;
+ add.s32 %r2704, %r2656, %r2138;
+ add.s32 %r2705, %r2704, %r2675;
+ xor.b32 %r2706, %r2705, %r2700;
+ shf.l.wrap.b32 %r2707, %r2706, %r2706, 16;
+ add.s32 %r2708, %r2707, %r2687;
+ xor.b32 %r2709, %r2708, %r2675;
+ shf.l.wrap.b32 %r2710, %r2709, %r2709, 20;
+ add.s32 %r2711, %r2705, %r2103;
+ add.s32 %r2712, %r2711, %r2710;
+ xor.b32 %r2713, %r2712, %r2707;
+ shf.l.wrap.b32 %r2714, %r2713, %r2713, 24;
+ add.s32 %r2715, %r2714, %r2708;
+ xor.b32 %r2716, %r2715, %r2710;
+ shf.l.wrap.b32 %r2717, %r2716, %r2716, 25;
+ add.s32 %r2718, %r2670, %r2124;
+ add.s32 %r2719, %r2718, %r2689;
+ xor.b32 %r2720, %r2719, %r2658;
+ shf.l.wrap.b32 %r2721, %r2720, %r2720, 16;
+ add.s32 %r2722, %r2721, %r2701;
+ xor.b32 %r2723, %r2722, %r2689;
+ shf.l.wrap.b32 %r2724, %r2723, %r2723, 20;
+ add.s32 %r2725, %r2719, %r2110;
+ add.s32 %r2726, %r2725, %r2724;
+ xor.b32 %r2727, %r2726, %r2721;
+ shf.l.wrap.b32 %r2728, %r2727, %r2727, 24;
+ add.s32 %r2729, %r2728, %r2722;
+ xor.b32 %r2730, %r2729, %r2724;
+ shf.l.wrap.b32 %r2731, %r2730, %r2730, 25;
+ add.s32 %r2732, %r2684, %r2089;
+ add.s32 %r2733, %r2732, %r2703;
+ xor.b32 %r2734, %r2733, %r2672;
+ shf.l.wrap.b32 %r2735, %r2734, %r2734, 16;
+ add.s32 %r2736, %r2735, %r2659;
+ xor.b32 %r2737, %r2736, %r2703;
+ shf.l.wrap.b32 %r2738, %r2737, %r2737, 20;
+ add.s32 %r2739, %r2733, %r2096;
+ add.s32 %r2740, %r2739, %r2738;
+ xor.b32 %r2741, %r2740, %r2735;
+ shf.l.wrap.b32 %r2742, %r2741, %r2741, 24;
+ add.s32 %r2743, %r2742, %r2736;
+ xor.b32 %r2744, %r2743, %r2738;
+ shf.l.wrap.b32 %r2745, %r2744, %r2744, 25;
+ add.s32 %r2746, %r2698, %r2131;
+ add.s32 %r2747, %r2746, %r2661;
+ xor.b32 %r2748, %r2747, %r2686;
+ shf.l.wrap.b32 %r2749, %r2748, %r2748, 16;
+ add.s32 %r2750, %r2749, %r2673;
+ xor.b32 %r2751, %r2750, %r2661;
+ shf.l.wrap.b32 %r2752, %r2751, %r2751, 20;
+ add.s32 %r2753, %r2747, %r2117;
+ add.s32 %r2754, %r2753, %r2752;
+ xor.b32 %r2755, %r2754, %r2749;
+ shf.l.wrap.b32 %r2756, %r2755, %r2755, 24;
+ add.s32 %r2757, %r2756, %r2750;
+ xor.b32 %r2758, %r2757, %r2752;
+ shf.l.wrap.b32 %r2759, %r2758, %r2758, 25;
+ add.s32 %r2760, %r2712, %r2152;
+ add.s32 %r2761, %r2760, %r2759;
+ xor.b32 %r2762, %r2761, %r2728;
+ shf.l.wrap.b32 %r2763, %r2762, %r2762, 16;
+ add.s32 %r2764, %r2763, %r2743;
+ xor.b32 %r2765, %r2764, %r2759;
+ shf.l.wrap.b32 %r2766, %r2765, %r2765, 20;
+ add.s32 %r2767, %r2761, %r2187;
+ add.s32 %r2768, %r2767, %r2766;
+ xor.b32 %r2769, %r2768, %r2763;
+ shf.l.wrap.b32 %r2770, %r2769, %r2769, 24;
+ add.s32 %r2771, %r2770, %r2764;
+ xor.b32 %r2772, %r2771, %r2766;
+ shf.l.wrap.b32 %r2773, %r2772, %r2772, 25;
+ add.s32 %r2774, %r2726, %r2166;
+ add.s32 %r2775, %r2774, %r2717;
+ xor.b32 %r2776, %r2775, %r2742;
+ shf.l.wrap.b32 %r2777, %r2776, %r2776, 16;
+ add.s32 %r2778, %r2777, %r2757;
+ xor.b32 %r2779, %r2778, %r2717;
+ shf.l.wrap.b32 %r2780, %r2779, %r2779, 20;
+ add.s32 %r2781, %r2775, %r2124;
+ add.s32 %r2782, %r2781, %r2780;
+ xor.b32 %r2783, %r2782, %r2777;
+ shf.l.wrap.b32 %r2784, %r2783, %r2783, 24;
+ add.s32 %r2785, %r2784, %r2778;
+ xor.b32 %r2786, %r2785, %r2780;
+ shf.l.wrap.b32 %r2787, %r2786, %r2786, 25;
+ add.s32 %r2788, %r2740, %r2145;
+ add.s32 %r2789, %r2788, %r2731;
+ xor.b32 %r2790, %r2789, %r2756;
+ shf.l.wrap.b32 %r2791, %r2790, %r2790, 16;
+ add.s32 %r2792, %r2791, %r2715;
+ xor.b32 %r2793, %r2792, %r2731;
+ shf.l.wrap.b32 %r2794, %r2793, %r2793, 20;
+ add.s32 %r2795, %r2789, %r2173;
+ add.s32 %r2796, %r2795, %r2794;
+ xor.b32 %r2797, %r2796, %r2791;
+ shf.l.wrap.b32 %r2798, %r2797, %r2797, 24;
+ add.s32 %r2799, %r2798, %r2792;
+ xor.b32 %r2800, %r2799, %r2794;
+ shf.l.wrap.b32 %r2801, %r2800, %r2800, 25;
+ add.s32 %r2802, %r2754, %r2194;
+ add.s32 %r2803, %r2802, %r2745;
+ xor.b32 %r2804, %r2803, %r2714;
+ shf.l.wrap.b32 %r2805, %r2804, %r2804, 16;
+ add.s32 %r2806, %r2805, %r2729;
+ xor.b32 %r2807, %r2806, %r2745;
+ shf.l.wrap.b32 %r2808, %r2807, %r2807, 20;
+ add.s32 %r2809, %r2803, %r2096;
+ add.s32 %r2810, %r2809, %r2808;
+ xor.b32 %r2811, %r2810, %r2805;
+ shf.l.wrap.b32 %r2812, %r2811, %r2811, 24;
+ add.s32 %r2813, %r2812, %r2806;
+ xor.b32 %r2814, %r2813, %r2808;
+ shf.l.wrap.b32 %r2815, %r2814, %r2814, 25;
+ add.s32 %r2816, %r2768, %r2180;
+ add.s32 %r2817, %r2816, %r2787;
+ xor.b32 %r2818, %r2817, %r2812;
+ shf.l.wrap.b32 %r2819, %r2818, %r2818, 16;
+ add.s32 %r2820, %r2819, %r2799;
+ xor.b32 %r2821, %r2820, %r2787;
+ shf.l.wrap.b32 %r2822, %r2821, %r2821, 20;
+ add.s32 %r2823, %r2817, %r2110;
+ add.s32 %r2824, %r2823, %r2822;
+ xor.b32 %r2825, %r2824, %r2819;
+ shf.l.wrap.b32 %r2826, %r2825, %r2825, 24;
+ add.s32 %r2827, %r2826, %r2820;
+ xor.b32 %r2828, %r2827, %r2822;
+ shf.l.wrap.b32 %r2829, %r2828, %r2828, 25;
+ add.s32 %r2830, %r2782, %r2089;
+ add.s32 %r2831, %r2830, %r2801;
+ xor.b32 %r2832, %r2831, %r2770;
+ shf.l.wrap.b32 %r2833, %r2832, %r2832, 16;
+ add.s32 %r2834, %r2833, %r2813;
+ xor.b32 %r2835, %r2834, %r2801;
+ shf.l.wrap.b32 %r2836, %r2835, %r2835, 20;
+ add.s32 %r2837, %r2831, %r2159;
+ add.s32 %r2838, %r2837, %r2836;
+ xor.b32 %r2839, %r2838, %r2833;
+ shf.l.wrap.b32 %r2840, %r2839, %r2839, 24;
+ add.s32 %r2841, %r2840, %r2834;
+ xor.b32 %r2842, %r2841, %r2836;
+ shf.l.wrap.b32 %r2843, %r2842, %r2842, 25;
+ add.s32 %r2844, %r2796, %r2103;
+ add.s32 %r2845, %r2844, %r2815;
+ xor.b32 %r2846, %r2845, %r2784;
+ shf.l.wrap.b32 %r2847, %r2846, %r2846, 16;
+ add.s32 %r2848, %r2847, %r2771;
+ xor.b32 %r2849, %r2848, %r2815;
+ shf.l.wrap.b32 %r2850, %r2849, %r2849, 20;
+ add.s32 %r2851, %r2845, %r2131;
+ add.s32 %r2852, %r2851, %r2850;
+ xor.b32 %r2853, %r2852, %r2847;
+ shf.l.wrap.b32 %r2854, %r2853, %r2853, 24;
+ add.s32 %r2855, %r2854, %r2848;
+ xor.b32 %r2856, %r2855, %r2850;
+ shf.l.wrap.b32 %r2857, %r2856, %r2856, 25;
+ add.s32 %r2858, %r2810, %r2117;
+ add.s32 %r2859, %r2858, %r2773;
+ xor.b32 %r2860, %r2859, %r2798;
+ shf.l.wrap.b32 %r2861, %r2860, %r2860, 16;
+ add.s32 %r2862, %r2861, %r2785;
+ xor.b32 %r2863, %r2862, %r2773;
+ shf.l.wrap.b32 %r2864, %r2863, %r2863, 20;
+ add.s32 %r2865, %r2859, %r2138;
+ add.s32 %r2866, %r2865, %r2864;
+ xor.b32 %r2867, %r2866, %r2861;
+ shf.l.wrap.b32 %r2868, %r2867, %r2867, 24;
+ add.s32 %r2869, %r2868, %r2862;
+ xor.b32 %r2870, %r2869, %r2864;
+ shf.l.wrap.b32 %r2871, %r2870, %r2870, 25;
+ add.s32 %r2872, %r2824, %r2166;
+ add.s32 %r2873, %r2872, %r2871;
+ xor.b32 %r2874, %r2873, %r2840;
+ shf.l.wrap.b32 %r2875, %r2874, %r2874, 16;
+ add.s32 %r2876, %r2875, %r2855;
+ xor.b32 %r2877, %r2876, %r2871;
+ shf.l.wrap.b32 %r2878, %r2877, %r2877, 20;
+ add.s32 %r2879, %r2873, %r2194;
+ add.s32 %r2880, %r2879, %r2878;
+ xor.b32 %r2881, %r2880, %r2875;
+ shf.l.wrap.b32 %r2882, %r2881, %r2881, 24;
+ add.s32 %r2883, %r2882, %r2876;
+ xor.b32 %r2884, %r2883, %r2878;
+ shf.l.wrap.b32 %r2885, %r2884, %r2884, 25;
+ add.s32 %r2886, %r2838, %r2124;
+ add.s32 %r2887, %r2886, %r2829;
+ xor.b32 %r2888, %r2887, %r2854;
+ shf.l.wrap.b32 %r2889, %r2888, %r2888, 16;
+ add.s32 %r2890, %r2889, %r2869;
+ xor.b32 %r2891, %r2890, %r2829;
+ shf.l.wrap.b32 %r2892, %r2891, %r2891, 20;
+ add.s32 %r2893, %r2887, %r2089;
+ add.s32 %r2894, %r2893, %r2892;
+ xor.b32 %r2895, %r2894, %r2889;
+ shf.l.wrap.b32 %r2896, %r2895, %r2895, 24;
+ add.s32 %r2897, %r2896, %r2890;
+ xor.b32 %r2898, %r2897, %r2892;
+ shf.l.wrap.b32 %r2899, %r2898, %r2898, 25;
+ add.s32 %r2900, %r2852, %r2096;
+ add.s32 %r2901, %r2900, %r2843;
+ xor.b32 %r2902, %r2901, %r2868;
+ shf.l.wrap.b32 %r2903, %r2902, %r2902, 16;
+ add.s32 %r2904, %r2903, %r2827;
+ xor.b32 %r2905, %r2904, %r2843;
+ shf.l.wrap.b32 %r2906, %r2905, %r2905, 20;
+ add.s32 %r2907, %r2901, %r2152;
+ add.s32 %r2908, %r2907, %r2906;
+ xor.b32 %r2909, %r2908, %r2903;
+ shf.l.wrap.b32 %r2910, %r2909, %r2909, 24;
+ add.s32 %r2911, %r2910, %r2904;
+ xor.b32 %r2912, %r2911, %r2906;
+ shf.l.wrap.b32 %r2913, %r2912, %r2912, 25;
+ add.s32 %r2914, %r2866, %r2145;
+ add.s32 %r2915, %r2914, %r2857;
+ xor.b32 %r2916, %r2915, %r2826;
+ shf.l.wrap.b32 %r2917, %r2916, %r2916, 16;
+ add.s32 %r2918, %r2917, %r2841;
+ xor.b32 %r2919, %r2918, %r2857;
+ shf.l.wrap.b32 %r2920, %r2919, %r2919, 20;
+ add.s32 %r2921, %r2915, %r2131;
+ add.s32 %r2922, %r2921, %r2920;
+ xor.b32 %r2923, %r2922, %r2917;
+ shf.l.wrap.b32 %r2924, %r2923, %r2923, 24;
+ add.s32 %r2925, %r2924, %r2918;
+ xor.b32 %r2926, %r2925, %r2920;
+ shf.l.wrap.b32 %r2927, %r2926, %r2926, 25;
+ add.s32 %r2928, %r2880, %r2187;
+ add.s32 %r2929, %r2928, %r2899;
+ xor.b32 %r2930, %r2929, %r2924;
+ shf.l.wrap.b32 %r2931, %r2930, %r2930, 16;
+ add.s32 %r2932, %r2931, %r2911;
+ xor.b32 %r2933, %r2932, %r2899;
+ shf.l.wrap.b32 %r2934, %r2933, %r2933, 20;
+ add.s32 %r2935, %r2929, %r2159;
+ add.s32 %r2936, %r2935, %r2934;
+ xor.b32 %r2937, %r2936, %r2931;
+ shf.l.wrap.b32 %r2938, %r2937, %r2937, 24;
+ add.s32 %r2939, %r2938, %r2932;
+ xor.b32 %r2940, %r2939, %r2934;
+ shf.l.wrap.b32 %r2941, %r2940, %r2940, 25;
+ add.s32 %r2942, %r2894, %r2103;
+ add.s32 %r2943, %r2942, %r2913;
+ xor.b32 %r2944, %r2943, %r2882;
+ shf.l.wrap.b32 %r2945, %r2944, %r2944, 16;
+ add.s32 %r2946, %r2945, %r2925;
+ xor.b32 %r2947, %r2946, %r2913;
+ shf.l.wrap.b32 %r2948, %r2947, %r2947, 20;
+ add.s32 %r2949, %r2943, %r2173;
+ add.s32 %r2950, %r2949, %r2948;
+ xor.b32 %r2951, %r2950, %r2945;
+ shf.l.wrap.b32 %r2952, %r2951, %r2951, 24;
+ add.s32 %r2953, %r2952, %r2946;
+ xor.b32 %r2954, %r2953, %r2948;
+ shf.l.wrap.b32 %r2955, %r2954, %r2954, 25;
+ add.s32 %r2956, %r2908, %r2110;
+ add.s32 %r2957, %r2956, %r2927;
+ xor.b32 %r2958, %r2957, %r2896;
+ shf.l.wrap.b32 %r2959, %r2958, %r2958, 16;
+ add.s32 %r2960, %r2959, %r2883;
+ xor.b32 %r2961, %r2960, %r2927;
+ shf.l.wrap.b32 %r2962, %r2961, %r2961, 20;
+ add.s32 %r2963, %r2957, %r2117;
+ add.s32 %r2964, %r2963, %r2962;
+ xor.b32 %r2965, %r2964, %r2959;
+ shf.l.wrap.b32 %r2966, %r2965, %r2965, 24;
+ add.s32 %r2967, %r2966, %r2960;
+ xor.b32 %r2968, %r2967, %r2962;
+ shf.l.wrap.b32 %r2969, %r2968, %r2968, 25;
+ add.s32 %r2970, %r2922, %r2138;
+ add.s32 %r2971, %r2970, %r2885;
+ xor.b32 %r2972, %r2971, %r2910;
+ shf.l.wrap.b32 %r2973, %r2972, %r2972, 16;
+ add.s32 %r2974, %r2973, %r2897;
+ xor.b32 %r2975, %r2974, %r2885;
+ shf.l.wrap.b32 %r2976, %r2975, %r2975, 20;
+ add.s32 %r2977, %r2971, %r2180;
+ add.s32 %r2978, %r2977, %r2976;
+ xor.b32 %r2979, %r2978, %r2973;
+ shf.l.wrap.b32 %r2980, %r2979, %r2979, 24;
+ add.s32 %r2981, %r2980, %r2974;
+ xor.b32 %r2982, %r2981, %r2976;
+ shf.l.wrap.b32 %r2983, %r2982, %r2982, 25;
+ xor.b32 %r3964, %r2967, %r2936;
+ xor.b32 %r3963, %r2981, %r2950;
+ xor.b32 %r3962, %r2939, %r2964;
+ xor.b32 %r3961, %r2953, %r2978;
+ xor.b32 %r3960, %r2983, %r2952;
+ xor.b32 %r3959, %r2941, %r2966;
+ xor.b32 %r3958, %r2955, %r2980;
+ xor.b32 %r3957, %r2969, %r2938;
+ add.s16 %rs198, %rs198, 1;
+ st.local.u8 [%rd56+1], %rs198;
+ add.s64 %rd169, %rd169, 64;
+ add.s64 %rd170, %rd170, -64;
+ setp.gt.u64 %p24, %rd170, 64;
+ @%p24 bra $L__BB0_24;
+
+$L__BB0_25:
+ min.u64 %rd63, %rd170, 64;
+ setp.eq.s64 %p25, %rd63, 0;
+ mov.u16 %rs200, %rs199;
+ mov.u16 %rs201, %rs199;
+ mov.u16 %rs202, %rs199;
+ mov.u16 %rs203, %rs199;
+ mov.u16 %rs204, %rs199;
+ mov.u16 %rs205, %rs199;
+ mov.u16 %rs206, %rs199;
+ mov.u16 %rs207, %rs199;
+ mov.u16 %rs208, %rs199;
+ mov.u16 %rs209, %rs199;
+ mov.u16 %rs210, %rs199;
+ mov.u16 %rs211, %rs199;
+ mov.u16 %rs212, %rs199;
+ mov.u16 %rs213, %rs199;
+ mov.u16 %rs214, %rs199;
+ mov.u16 %rs215, %rs199;
+ mov.u16 %rs216, %rs199;
+ mov.u16 %rs217, %rs199;
+ mov.u16 %rs218, %rs199;
+ mov.u16 %rs219, %rs199;
+ mov.u16 %rs220, %rs199;
+ mov.u16 %rs221, %rs199;
+ mov.u16 %rs222, %rs199;
+ mov.u16 %rs223, %rs199;
+ mov.u16 %rs224, %rs199;
+ mov.u16 %rs225, %rs199;
+ mov.u16 %rs226, %rs199;
+ mov.u16 %rs227, %rs199;
+ mov.u16 %rs228, %rs199;
+ mov.u16 %rs229, %rs199;
+ mov.u16 %rs230, %rs199;
+ mov.u16 %rs231, %rs199;
+ mov.u16 %rs232, %rs199;
+ @%p25 bra $L__BB0_29;
+
+ mov.u64 %rd171, 0;
+
+$L__BB0_27:
+ add.s64 %rd130, %rd169, %rd171;
+ ld.u8 %rs121, [%rd130];
+ add.s64 %rd131, %rd53, %rd171;
+ st.local.u8 [%rd131], %rs121;
+ add.s64 %rd171, %rd171, 1;
+ setp.lt.u64 %p26, %rd171, %rd63;
+ @%p26 bra $L__BB0_27;
+
+ ld.local.v4.u16 {%rs229, %rs230, %rs231, %rs232}, [%rd53];
+ ld.local.v4.u16 {%rs225, %rs226, %rs227, %rs228}, [%rd53+8];
+ ld.local.v4.u16 {%rs221, %rs222, %rs223, %rs224}, [%rd53+16];
+ ld.local.v4.u16 {%rs217, %rs218, %rs219, %rs220}, [%rd53+24];
+ ld.local.v4.u16 {%rs213, %rs214, %rs215, %rs216}, [%rd53+32];
+ ld.local.v4.u16 {%rs209, %rs210, %rs211, %rs212}, [%rd53+40];
+ ld.local.v4.u16 {%rs205, %rs206, %rs207, %rs208}, [%rd53+48];
+ ld.local.v4.u16 {%rs202, %rs203, %rs204, %rs153}, [%rd53+56];
+ ld.local.u8 %rs201, [%rd53+61];
+ ld.local.v2.u8 {%rs199, %rs200}, [%rd53+62];
+
+$L__BB0_29:
+ ld.param.u64 %rd137, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5];
+ cvta.to.local.u64 %rd136, %rd137;
+ ld.local.v4.u8 {%rs156, %rs157, %rs158, %rs159}, [%rd53+64];
+ cvt.u16.u64 %rs162, %rd63;
+ add.s16 %rs163, %rs156, %rs162;
+ st.local.u8 [%rd53+64], %rs163;
+ setp.eq.s16 %p27, %rs157, 0;
+ selp.u16 %rs164, 1, 0, %p27;
+ or.b16 %rs165, %rs158, %rs164;
+ or.b16 %rs166, %rs165, 2;
+ shr.u16 %rs167, %rs229, 8;
+ shr.u16 %rs168, %rs230, 8;
+ shr.u16 %rs169, %rs231, 8;
+ shr.u16 %rs170, %rs232, 8;
+ shr.u16 %rs171, %rs225, 8;
+ shr.u16 %rs172, %rs226, 8;
+ shr.u16 %rs173, %rs227, 8;
+ shr.u16 %rs174, %rs228, 8;
+ shr.u16 %rs175, %rs221, 8;
+ shr.u16 %rs176, %rs222, 8;
+ shr.u16 %rs177, %rs223, 8;
+ shr.u16 %rs178, %rs224, 8;
+ shr.u16 %rs179, %rs217, 8;
+ shr.u16 %rs180, %rs218, 8;
+ shr.u16 %rs181, %rs219, 8;
+ shr.u16 %rs182, %rs220, 8;
+ shr.u16 %rs183, %rs213, 8;
+ shr.u16 %rs184, %rs214, 8;
+ shr.u16 %rs185, %rs215, 8;
+ shr.u16 %rs186, %rs216, 8;
+ shr.u16 %rs187, %rs209, 8;
+ shr.u16 %rs188, %rs210, 8;
+ shr.u16 %rs189, %rs211, 8;
+ shr.u16 %rs190, %rs212, 8;
+ shr.u16 %rs191, %rs205, 8;
+ shr.u16 %rs192, %rs206, 8;
+ shr.u16 %rs193, %rs207, 8;
+ shr.u16 %rs194, %rs208, 8;
+ shr.u16 %rs195, %rs202, 8;
+ shr.u16 %rs196, %rs203, 8;
+ shl.b64 %rd132, %rd150, 5;
+ add.s64 %rd133, %rd136, %rd132;
+ cvt.u32.u16 %r2984, %rs229;
+ and.b32 %r2985, %r2984, 255;
+ cvt.u32.u16 %r2986, %rs167;
+ prmt.b32 %r2987, %r2986, %r2985, 30212;
+ cvt.u32.u16 %r2988, %rs230;
+ prmt.b32 %r2989, %r2988, %r2987, 28756;
+ cvt.u32.u16 %r2990, %rs168;
+ prmt.b32 %r2991, %r2990, %r2989, 1620;
+ cvt.u32.u16 %r2992, %rs231;
+ and.b32 %r2993, %r2992, 255;
+ cvt.u32.u16 %r2994, %rs169;
+ prmt.b32 %r2995, %r2994, %r2993, 30212;
+ cvt.u32.u16 %r2996, %rs232;
+ prmt.b32 %r2997, %r2996, %r2995, 28756;
+ cvt.u32.u16 %r2998, %rs170;
+ prmt.b32 %r2999, %r2998, %r2997, 1620;
+ cvt.u32.u16 %r3000, %rs225;
+ and.b32 %r3001, %r3000, 255;
+ cvt.u32.u16 %r3002, %rs171;
+ prmt.b32 %r3003, %r3002, %r3001, 30212;
+ cvt.u32.u16 %r3004, %rs226;
+ prmt.b32 %r3005, %r3004, %r3003, 28756;
+ cvt.u32.u16 %r3006, %rs172;
+ prmt.b32 %r3007, %r3006, %r3005, 1620;
+ cvt.u32.u16 %r3008, %rs227;
+ and.b32 %r3009, %r3008, 255;
+ cvt.u32.u16 %r3010, %rs173;
+ prmt.b32 %r3011, %r3010, %r3009, 30212;
+ cvt.u32.u16 %r3012, %rs228;
+ prmt.b32 %r3013, %r3012, %r3011, 28756;
+ cvt.u32.u16 %r3014, %rs174;
+ prmt.b32 %r3015, %r3014, %r3013, 1620;
+ cvt.u32.u16 %r3016, %rs221;
+ and.b32 %r3017, %r3016, 255;
+ cvt.u32.u16 %r3018, %rs175;
+ prmt.b32 %r3019, %r3018, %r3017, 30212;
+ cvt.u32.u16 %r3020, %rs222;
+ prmt.b32 %r3021, %r3020, %r3019, 28756;
+ cvt.u32.u16 %r3022, %rs176;
+ prmt.b32 %r3023, %r3022, %r3021, 1620;
+ cvt.u32.u16 %r3024, %rs223;
+ and.b32 %r3025, %r3024, 255;
+ cvt.u32.u16 %r3026, %rs177;
+ prmt.b32 %r3027, %r3026, %r3025, 30212;
+ cvt.u32.u16 %r3028, %rs224;
+ prmt.b32 %r3029, %r3028, %r3027, 28756;
+ cvt.u32.u16 %r3030, %rs178;
+ prmt.b32 %r3031, %r3030, %r3029, 1620;
+ cvt.u32.u16 %r3032, %rs217;
+ and.b32 %r3033, %r3032, 255;
+ cvt.u32.u16 %r3034, %rs179;
+ prmt.b32 %r3035, %r3034, %r3033, 30212;
+ cvt.u32.u16 %r3036, %rs218;
+ prmt.b32 %r3037, %r3036, %r3035, 28756;
+ cvt.u32.u16 %r3038, %rs180;
+ prmt.b32 %r3039, %r3038, %r3037, 1620;
+ cvt.u32.u16 %r3040, %rs219;
+ and.b32 %r3041, %r3040, 255;
+ cvt.u32.u16 %r3042, %rs181;
+ prmt.b32 %r3043, %r3042, %r3041, 30212;
+ cvt.u32.u16 %r3044, %rs220;
+ prmt.b32 %r3045, %r3044, %r3043, 28756;
+ cvt.u32.u16 %r3046, %rs182;
+ prmt.b32 %r3047, %r3046, %r3045, 1620;
+ cvt.u32.u16 %r3048, %rs213;
+ and.b32 %r3049, %r3048, 255;
+ cvt.u32.u16 %r3050, %rs183;
+ prmt.b32 %r3051, %r3050, %r3049, 30212;
+ cvt.u32.u16 %r3052, %rs214;
+ prmt.b32 %r3053, %r3052, %r3051, 28756;
+ cvt.u32.u16 %r3054, %rs184;
+ prmt.b32 %r3055, %r3054, %r3053, 1620;
+ cvt.u32.u16 %r3056, %rs215;
+ and.b32 %r3057, %r3056, 255;
+ cvt.u32.u16 %r3058, %rs185;
+ prmt.b32 %r3059, %r3058, %r3057, 30212;
+ cvt.u32.u16 %r3060, %rs216;
+ prmt.b32 %r3061, %r3060, %r3059, 28756;
+ cvt.u32.u16 %r3062, %rs186;
+ prmt.b32 %r3063, %r3062, %r3061, 1620;
+ cvt.u32.u16 %r3064, %rs209;
+ and.b32 %r3065, %r3064, 255;
+ cvt.u32.u16 %r3066, %rs187;
+ prmt.b32 %r3067, %r3066, %r3065, 30212;
+ cvt.u32.u16 %r3068, %rs210;
+ prmt.b32 %r3069, %r3068, %r3067, 28756;
+ cvt.u32.u16 %r3070, %rs188;
+ prmt.b32 %r3071, %r3070, %r3069, 1620;
+ cvt.u32.u16 %r3072, %rs211;
+ and.b32 %r3073, %r3072, 255;
+ cvt.u32.u16 %r3074, %rs189;
+ prmt.b32 %r3075, %r3074, %r3073, 30212;
+ cvt.u32.u16 %r3076, %rs212;
+ prmt.b32 %r3077, %r3076, %r3075, 28756;
+ cvt.u32.u16 %r3078, %rs190;
+ prmt.b32 %r3079, %r3078, %r3077, 1620;
+ cvt.u32.u16 %r3080, %rs205;
+ and.b32 %r3081, %r3080, 255;
+ cvt.u32.u16 %r3082, %rs191;
+ prmt.b32 %r3083, %r3082, %r3081, 30212;
+ cvt.u32.u16 %r3084, %rs206;
+ prmt.b32 %r3085, %r3084, %r3083, 28756;
+ cvt.u32.u16 %r3086, %rs192;
+ prmt.b32 %r3087, %r3086, %r3085, 1620;
+ cvt.u32.u16 %r3088, %rs207;
+ and.b32 %r3089, %r3088, 255;
+ cvt.u32.u16 %r3090, %rs193;
+ prmt.b32 %r3091, %r3090, %r3089, 30212;
+ cvt.u32.u16 %r3092, %rs208;
+ prmt.b32 %r3093, %r3092, %r3091, 28756;
+ cvt.u32.u16 %r3094, %rs194;
+ prmt.b32 %r3095, %r3094, %r3093, 1620;
+ cvt.u32.u16 %r3096, %rs202;
+ and.b32 %r3097, %r3096, 255;
+ cvt.u32.u16 %r3098, %rs195;
+ prmt.b32 %r3099, %r3098, %r3097, 30212;
+ cvt.u32.u16 %r3100, %rs203;
+ prmt.b32 %r3101, %r3100, %r3099, 28756;
+ cvt.u32.u16 %r3102, %rs196;
+ prmt.b32 %r3103, %r3102, %r3101, 1620;
+ cvt.u32.u16 %r3104, %rs204;
+ and.b32 %r3105, %r3104, 255;
+ cvt.u32.u16 %r3106, %rs201;
+ prmt.b32 %r3107, %r3106, %r3105, 30212;
+ cvt.u32.u16 %r3108, %rs199;
+ shl.b32 %r3109, %r3108, 16;
+ and.b32 %r3110, %r3109, 16711680;
+ or.b32 %r3111, %r3107, %r3110;
+ cvt.u32.u16 %r3112, %rs200;
+ shl.b32 %r3113, %r3112, 24;
+ or.b32 %r3114, %r3111, %r3113;
+ cvt.u32.u16 %r3115, %rs163;
+ and.b32 %r3116, %r3115, 255;
+ cvt.u32.u16 %r3117, %rs166;
+ and.b32 %r3118, %r3117, 255;
+ add.s32 %r3119, %r3960, %r3964;
+ add.s32 %r3120, %r3119, %r2991;
+ xor.b32 %r3121, %r3120, %r36;
+ shf.l.wrap.b32 %r3122, %r3121, %r3121, 16;
+ add.s32 %r3123, %r3122, 1779033703;
+ xor.b32 %r3124, %r3123, %r3960;
+ shf.l.wrap.b32 %r3125, %r3124, %r3124, 20;
+ add.s32 %r3126, %r2999, %r3120;
+ add.s32 %r3127, %r3126, %r3125;
+ xor.b32 %r3128, %r3127, %r3122;
+ shf.l.wrap.b32 %r3129, %r3128, %r3128, 24;
+ add.s32 %r3130, %r3129, %r3123;
+ xor.b32 %r3131, %r3130, %r3125;
+ shf.l.wrap.b32 %r3132, %r3131, %r3131, 25;
+ add.s32 %r3133, %r3959, %r3963;
+ add.s32 %r3134, %r3133, %r3007;
+ xor.b32 %r3135, %r3134, %r37;
+ shf.l.wrap.b32 %r3136, %r3135, %r3135, 16;
+ add.s32 %r3137, %r3136, -1150833019;
+ xor.b32 %r3138, %r3137, %r3959;
+ shf.l.wrap.b32 %r3139, %r3138, %r3138, 20;
+ add.s32 %r3140, %r3015, %r3134;
+ add.s32 %r3141, %r3140, %r3139;
+ xor.b32 %r3142, %r3141, %r3136;
+ shf.l.wrap.b32 %r3143, %r3142, %r3142, 24;
+ add.s32 %r3144, %r3143, %r3137;
+ xor.b32 %r3145, %r3144, %r3139;
+ shf.l.wrap.b32 %r3146, %r3145, %r3145, 25;
+ add.s32 %r3147, %r3958, %r3962;
+ add.s32 %r3148, %r3147, %r3023;
+ xor.b32 %r3149, %r3148, %r3116;
+ shr.u32 %r3150, %r3148, 16;
+ shl.b32 %r3151, %r3149, 16;
+ or.b32 %r3152, %r3151, %r3150;
+ add.s32 %r3153, %r3152, 1013904242;
+ xor.b32 %r3154, %r3153, %r3958;
+ shf.l.wrap.b32 %r3155, %r3154, %r3154, 20;
+ add.s32 %r3156, %r3031, %r3148;
+ add.s32 %r3157, %r3156, %r3155;
+ xor.b32 %r3158, %r3157, %r3152;
+ shf.l.wrap.b32 %r3159, %r3158, %r3158, 24;
+ add.s32 %r3160, %r3159, %r3153;
+ xor.b32 %r3161, %r3160, %r3155;
+ shf.l.wrap.b32 %r3162, %r3161, %r3161, 25;
+ add.s32 %r3163, %r3957, %r3961;
+ add.s32 %r3164, %r3163, %r3039;
+ xor.b32 %r3165, %r3164, %r3118;
+ shr.u32 %r3166, %r3164, 16;
+ shl.b32 %r3167, %r3165, 16;
+ or.b32 %r3168, %r3167, %r3166;
+ add.s32 %r3169, %r3168, -1521486534;
+ xor.b32 %r3170, %r3169, %r3957;
+ shf.l.wrap.b32 %r3171, %r3170, %r3170, 20;
+ add.s32 %r3172, %r3047, %r3164;
+ add.s32 %r3173, %r3172, %r3171;
+ xor.b32 %r3174, %r3173, %r3168;
+ shf.l.wrap.b32 %r3175, %r3174, %r3174, 24;
+ add.s32 %r3176, %r3175, %r3169;
+ xor.b32 %r3177, %r3176, %r3171;
+ shf.l.wrap.b32 %r3178, %r3177, %r3177, 25;
+ add.s32 %r3179, %r3146, %r3127;
+ add.s32 %r3180, %r3179, %r3055;
+ xor.b32 %r3181, %r3175, %r3180;
+ shf.l.wrap.b32 %r3182, %r3181, %r3181, 16;
+ add.s32 %r3183, %r3182, %r3160;
+ xor.b32 %r3184, %r3183, %r3146;
+ shf.l.wrap.b32 %r3185, %r3184, %r3184, 20;
+ add.s32 %r3186, %r3063, %r3180;
+ add.s32 %r3187, %r3186, %r3185;
+ xor.b32 %r3188, %r3187, %r3182;
+ shf.l.wrap.b32 %r3189, %r3188, %r3188, 24;
+ add.s32 %r3190, %r3189, %r3183;
+ xor.b32 %r3191, %r3190, %r3185;
+ shf.l.wrap.b32 %r3192, %r3191, %r3191, 25;
+ add.s32 %r3193, %r3162, %r3141;
+ add.s32 %r3194, %r3193, %r3071;
+ xor.b32 %r3195, %r3194, %r3129;
+ shf.l.wrap.b32 %r3196, %r3195, %r3195, 16;
+ add.s32 %r3197, %r3196, %r3176;
+ xor.b32 %r3198, %r3197, %r3162;
+ shf.l.wrap.b32 %r3199, %r3198, %r3198, 20;
+ add.s32 %r3200, %r3079, %r3194;
+ add.s32 %r3201, %r3200, %r3199;
+ xor.b32 %r3202, %r3201, %r3196;
+ shf.l.wrap.b32 %r3203, %r3202, %r3202, 24;
+ add.s32 %r3204, %r3203, %r3197;
+ xor.b32 %r3205, %r3204, %r3199;
+ shf.l.wrap.b32 %r3206, %r3205, %r3205, 25;
+ add.s32 %r3207, %r3178, %r3157;
+ add.s32 %r3208, %r3207, %r3087;
+ xor.b32 %r3209, %r3208, %r3143;
+ shf.l.wrap.b32 %r3210, %r3209, %r3209, 16;
+ add.s32 %r3211, %r3210, %r3130;
+ xor.b32 %r3212, %r3211, %r3178;
+ shf.l.wrap.b32 %r3213, %r3212, %r3212, 20;
+ add.s32 %r3214, %r3095, %r3208;
+ add.s32 %r3215, %r3214, %r3213;
+ xor.b32 %r3216, %r3215, %r3210;
+ shf.l.wrap.b32 %r3217, %r3216, %r3216, 24;
+ add.s32 %r3218, %r3217, %r3211;
+ xor.b32 %r3219, %r3218, %r3213;
+ shf.l.wrap.b32 %r3220, %r3219, %r3219, 25;
+ add.s32 %r3221, %r3173, %r3132;
+ add.s32 %r3222, %r3221, %r3103;
+ xor.b32 %r3223, %r3222, %r3159;
+ shf.l.wrap.b32 %r3224, %r3223, %r3223, 16;
+ add.s32 %r3225, %r3224, %r3144;
+ xor.b32 %r3226, %r3225, %r3132;
+ shf.l.wrap.b32 %r3227, %r3226, %r3226, 20;
+ add.s32 %r3228, %r3114, %r3222;
+ add.s32 %r3229, %r3228, %r3227;
+ xor.b32 %r3230, %r3229, %r3224;
+ shf.l.wrap.b32 %r3231, %r3230, %r3230, 24;
+ add.s32 %r3232, %r3231, %r3225;
+ xor.b32 %r3233, %r3232, %r3227;
+ shf.l.wrap.b32 %r3234, %r3233, %r3233, 25;
+ add.s32 %r3235, %r3187, %r3007;
+ add.s32 %r3236, %r3235, %r3234;
+ xor.b32 %r3237, %r3236, %r3203;
+ shf.l.wrap.b32 %r3238, %r3237, %r3237, 16;
+ add.s32 %r3239, %r3238, %r3218;
+ xor.b32 %r3240, %r3239, %r3234;
+ shf.l.wrap.b32 %r3241, %r3240, %r3240, 20;
+ add.s32 %r3242, %r3236, %r3039;
+ add.s32 %r3243, %r3242, %r3241;
+ xor.b32 %r3244, %r3243, %r3238;
+ shf.l.wrap.b32 %r3245, %r3244, %r3244, 24;
+ add.s32 %r3246, %r3245, %r3239;
+ xor.b32 %r3247, %r3246, %r3241;
+ shf.l.wrap.b32 %r3248, %r3247, %r3247, 25;
+ add.s32 %r3249, %r3201, %r3015;
+ add.s32 %r3250, %r3249, %r3192;
+ xor.b32 %r3251, %r3217, %r3250;
+ shf.l.wrap.b32 %r3252, %r3251, %r3251, 16;
+ add.s32 %r3253, %r3232, %r3252;
+ xor.b32 %r3254, %r3253, %r3192;
+ shf.l.wrap.b32 %r3255, %r3254, %r3254, 20;
+ add.s32 %r3256, %r3250, %r3071;
+ add.s32 %r3257, %r3256, %r3255;
+ xor.b32 %r3258, %r3257, %r3252;
+ shf.l.wrap.b32 %r3259, %r3258, %r3258, 24;
+ add.s32 %r3260, %r3259, %r3253;
+ xor.b32 %r3261, %r3260, %r3255;
+ shf.l.wrap.b32 %r3262, %r3261, %r3261, 25;
+ add.s32 %r3263, %r3206, %r3047;
+ add.s32 %r3264, %r3263, %r3215;
+ xor.b32 %r3265, %r3231, %r3264;
+ shf.l.wrap.b32 %r3266, %r3265, %r3265, 16;
+ add.s32 %r3267, %r3266, %r3190;
+ xor.b32 %r3268, %r3267, %r3206;
+ shf.l.wrap.b32 %r3269, %r3268, %r3268, 20;
+ add.s32 %r3270, %r3264, %r2991;
+ add.s32 %r3271, %r3270, %r3269;
+ xor.b32 %r3272, %r3271, %r3266;
+ shf.l.wrap.b32 %r3273, %r3272, %r3272, 24;
+ add.s32 %r3274, %r3273, %r3267;
+ xor.b32 %r3275, %r3274, %r3269;
+ shf.l.wrap.b32 %r3276, %r3275, %r3275, 25;
+ add.s32 %r3277, %r3220, %r3023;
+ add.s32 %r3278, %r3277, %r3229;
+ xor.b32 %r3279, %r3278, %r3189;
+ shf.l.wrap.b32 %r3280, %r3279, %r3279, 16;
+ add.s32 %r3281, %r3280, %r3204;
+ xor.b32 %r3282, %r3281, %r3220;
+ shf.l.wrap.b32 %r3283, %r3282, %r3282, 20;
+ add.s32 %r3284, %r3278, %r3095;
+ add.s32 %r3285, %r3284, %r3283;
+ xor.b32 %r3286, %r3285, %r3280;
+ shf.l.wrap.b32 %r3287, %r3286, %r3286, 24;
+ add.s32 %r3288, %r3287, %r3281;
+ xor.b32 %r3289, %r3288, %r3283;
+ shf.l.wrap.b32 %r3290, %r3289, %r3289, 25;
+ add.s32 %r3291, %r3262, %r2999;
+ add.s32 %r3292, %r3291, %r3243;
+ xor.b32 %r3293, %r3292, %r3287;
+ shf.l.wrap.b32 %r3294, %r3293, %r3293, 16;
+ add.s32 %r3295, %r3294, %r3274;
+ xor.b32 %r3296, %r3295, %r3262;
+ shf.l.wrap.b32 %r3297, %r3296, %r3296, 20;
+ add.s32 %r3298, %r3292, %r3079;
+ add.s32 %r3299, %r3298, %r3297;
+ xor.b32 %r3300, %r3299, %r3294;
+ shf.l.wrap.b32 %r3301, %r3300, %r3300, 24;
+ add.s32 %r3302, %r3301, %r3295;
+ xor.b32 %r3303, %r3302, %r3297;
+ shf.l.wrap.b32 %r3304, %r3303, %r3303, 25;
+ add.s32 %r3305, %r3257, %r3087;
+ add.s32 %r3306, %r3305, %r3276;
+ xor.b32 %r3307, %r3245, %r3306;
+ shf.l.wrap.b32 %r3308, %r3307, %r3307, 16;
+ add.s32 %r3309, %r3308, %r3288;
+ xor.b32 %r3310, %r3309, %r3276;
+ shf.l.wrap.b32 %r3311, %r3310, %r3310, 20;
+ add.s32 %r3312, %r3306, %r3031;
+ add.s32 %r3313, %r3312, %r3311;
+ xor.b32 %r3314, %r3313, %r3308;
+ shf.l.wrap.b32 %r3315, %r3314, %r3314, 24;
+ add.s32 %r3316, %r3315, %r3309;
+ xor.b32 %r3317, %r3316, %r3311;
+ shf.l.wrap.b32 %r3318, %r3317, %r3317, 25;
+ add.s32 %r3319, %r3271, %r3063;
+ add.s32 %r3320, %r3319, %r3290;
+ xor.b32 %r3321, %r3320, %r3259;
+ shf.l.wrap.b32 %r3322, %r3321, %r3321, 16;
+ add.s32 %r3323, %r3322, %r3246;
+ xor.b32 %r3324, %r3323, %r3290;
+ shf.l.wrap.b32 %r3325, %r3324, %r3324, 20;
+ add.s32 %r3326, %r3320, %r3103;
+ add.s32 %r3327, %r3326, %r3325;
+ xor.b32 %r3328, %r3327, %r3322;
+ shf.l.wrap.b32 %r3329, %r3328, %r3328, 24;
+ add.s32 %r3330, %r3329, %r3323;
+ xor.b32 %r3331, %r3330, %r3325;
+ shf.l.wrap.b32 %r3332, %r3331, %r3331, 25;
+ add.s32 %r3333, %r3285, %r3114;
+ add.s32 %r3334, %r3333, %r3248;
+ xor.b32 %r3335, %r3334, %r3273;
+ shf.l.wrap.b32 %r3336, %r3335, %r3335, 16;
+ add.s32 %r3337, %r3336, %r3260;
+ xor.b32 %r3338, %r3337, %r3248;
+ shf.l.wrap.b32 %r3339, %r3338, %r3338, 20;
+ add.s32 %r3340, %r3334, %r3055;
+ add.s32 %r3341, %r3340, %r3339;
+ xor.b32 %r3342, %r3341, %r3336;
+ shf.l.wrap.b32 %r3343, %r3342, %r3342, 24;
+ add.s32 %r3344, %r3343, %r3337;
+ xor.b32 %r3345, %r3344, %r3339;
+ shf.l.wrap.b32 %r3346, %r3345, %r3345, 25;
+ add.s32 %r3347, %r3299, %r3015;
+ add.s32 %r3348, %r3347, %r3346;
+ xor.b32 %r3349, %r3348, %r3315;
+ shf.l.wrap.b32 %r3350, %r3349, %r3349, 16;
+ add.s32 %r3351, %r3350, %r3330;
+ xor.b32 %r3352, %r3351, %r3346;
+ shf.l.wrap.b32 %r3353, %r3352, %r3352, 20;
+ add.s32 %r3354, %r3348, %r3023;
+ add.s32 %r3355, %r3354, %r3353;
+ xor.b32 %r3356, %r3355, %r3350;
+ shf.l.wrap.b32 %r3357, %r3356, %r3356, 24;
+ add.s32 %r3358, %r3357, %r3351;
+ xor.b32 %r3359, %r3358, %r3353;
+ shf.l.wrap.b32 %r3360, %r3359, %r3359, 25;
+ add.s32 %r3361, %r3313, %r3071;
+ add.s32 %r3362, %r3361, %r3304;
+ xor.b32 %r3363, %r3362, %r3329;
+ shf.l.wrap.b32 %r3364, %r3363, %r3363, 16;
+ add.s32 %r3365, %r3364, %r3344;
+ xor.b32 %r3366, %r3365, %r3304;
+ shf.l.wrap.b32 %r3367, %r3366, %r3366, 20;
+ add.s32 %r3368, %r3362, %r3087;
+ add.s32 %r3369, %r3368, %r3367;
+ xor.b32 %r3370, %r3369, %r3364;
+ shf.l.wrap.b32 %r3371, %r3370, %r3370, 24;
+ add.s32 %r3372, %r3371, %r3365;
+ xor.b32 %r3373, %r3372, %r3367;
+ shf.l.wrap.b32 %r3374, %r3373, %r3373, 25;
+ add.s32 %r3375, %r3327, %r3095;
+ add.s32 %r3376, %r3375, %r3318;
+ xor.b32 %r3377, %r3343, %r3376;
+ shf.l.wrap.b32 %r3378, %r3377, %r3377, 16;
+ add.s32 %r3379, %r3378, %r3302;
+ xor.b32 %r3380, %r3379, %r3318;
+ shf.l.wrap.b32 %r3381, %r3380, %r3380, 20;
+ add.s32 %r3382, %r3376, %r3007;
+ add.s32 %r3383, %r3382, %r3381;
+ xor.b32 %r3384, %r3383, %r3378;
+ shf.l.wrap.b32 %r3385, %r3384, %r3384, 24;
+ add.s32 %r3386, %r3385, %r3379;
+ xor.b32 %r3387, %r3386, %r3381;
+ shf.l.wrap.b32 %r3388, %r3387, %r3387, 25;
+ add.s32 %r3389, %r3332, %r3047;
+ add.s32 %r3390, %r3389, %r3341;
+ xor.b32 %r3391, %r3390, %r3301;
+ shf.l.wrap.b32 %r3392, %r3391, %r3391, 16;
+ add.s32 %r3393, %r3392, %r3316;
+ xor.b32 %r3394, %r3393, %r3332;
+ shf.l.wrap.b32 %r3395, %r3394, %r3394, 20;
+ add.s32 %r3396, %r3390, %r3103;
+ add.s32 %r3397, %r3396, %r3395;
+ xor.b32 %r3398, %r3397, %r3392;
+ shf.l.wrap.b32 %r3399, %r3398, %r3398, 24;
+ add.s32 %r3400, %r3399, %r3393;
+ xor.b32 %r3401, %r3400, %r3395;
+ shf.l.wrap.b32 %r3402, %r3401, %r3401, 25;
+ add.s32 %r3403, %r3374, %r3039;
+ add.s32 %r3404, %r3403, %r3355;
+ xor.b32 %r3405, %r3404, %r3399;
+ shf.l.wrap.b32 %r3406, %r3405, %r3405, 16;
+ add.s32 %r3407, %r3406, %r3386;
+ xor.b32 %r3408, %r3407, %r3374;
+ shf.l.wrap.b32 %r3409, %r3408, %r3408, 20;
+ add.s32 %r3410, %r3404, %r3031;
+ add.s32 %r3411, %r3410, %r3409;
+ xor.b32 %r3412, %r3411, %r3406;
+ shf.l.wrap.b32 %r3413, %r3412, %r3412, 24;
+ add.s32 %r3414, %r3413, %r3407;
+ xor.b32 %r3415, %r3414, %r3409;
+ shf.l.wrap.b32 %r3416, %r3415, %r3415, 25;
+ add.s32 %r3417, %r3369, %r3063;
+ add.s32 %r3418, %r3417, %r3388;
+ xor.b32 %r3419, %r3357, %r3418;
+ shf.l.wrap.b32 %r3420, %r3419, %r3419, 16;
+ add.s32 %r3421, %r3420, %r3400;
+ xor.b32 %r3422, %r3421, %r3388;
+ shf.l.wrap.b32 %r3423, %r3422, %r3422, 20;
+ add.s32 %r3424, %r3418, %r2991;
+ add.s32 %r3425, %r3424, %r3423;
+ xor.b32 %r3426, %r3425, %r3420;
+ shf.l.wrap.b32 %r3427, %r3426, %r3426, 24;
+ add.s32 %r3428, %r3427, %r3421;
+ xor.b32 %r3429, %r3428, %r3423;
+ shf.l.wrap.b32 %r3430, %r3429, %r3429, 25;
+ add.s32 %r3431, %r3383, %r3079;
+ add.s32 %r3432, %r3431, %r3402;
+ xor.b32 %r3433, %r3432, %r3371;
+ shf.l.wrap.b32 %r3434, %r3433, %r3433, 16;
+ add.s32 %r3435, %r3434, %r3358;
+ xor.b32 %r3436, %r3435, %r3402;
+ shf.l.wrap.b32 %r3437, %r3436, %r3436, 20;
+ add.s32 %r3438, %r3432, %r3114;
+ add.s32 %r3439, %r3438, %r3437;
+ xor.b32 %r3440, %r3439, %r3434;
+ shf.l.wrap.b32 %r3441, %r3440, %r3440, 24;
+ add.s32 %r3442, %r3441, %r3435;
+ xor.b32 %r3443, %r3442, %r3437;
+ shf.l.wrap.b32 %r3444, %r3443, %r3443, 25;
+ add.s32 %r3445, %r3397, %r3055;
+ add.s32 %r3446, %r3445, %r3360;
+ xor.b32 %r3447, %r3446, %r3385;
+ shf.l.wrap.b32 %r3448, %r3447, %r3447, 16;
+ add.s32 %r3449, %r3448, %r3372;
+ xor.b32 %r3450, %r3449, %r3360;
+ shf.l.wrap.b32 %r3451, %r3450, %r3450, 20;
+ add.s32 %r3452, %r3446, %r2999;
+ add.s32 %r3453, %r3452, %r3451;
+ xor.b32 %r3454, %r3453, %r3448;
+ shf.l.wrap.b32 %r3455, %r3454, %r3454, 24;
+ add.s32 %r3456, %r3455, %r3449;
+ xor.b32 %r3457, %r3456, %r3451;
+ shf.l.wrap.b32 %r3458, %r3457, %r3457, 25;
+ add.s32 %r3459, %r3411, %r3071;
+ add.s32 %r3460, %r3459, %r3458;
+ xor.b32 %r3461, %r3460, %r3427;
+ shf.l.wrap.b32 %r3462, %r3461, %r3461, 16;
+ add.s32 %r3463, %r3462, %r3442;
+ xor.b32 %r3464, %r3463, %r3458;
+ shf.l.wrap.b32 %r3465, %r3464, %r3464, 20;
+ add.s32 %r3466, %r3460, %r3047;
+ add.s32 %r3467, %r3466, %r3465;
+ xor.b32 %r3468, %r3467, %r3462;
+ shf.l.wrap.b32 %r3469, %r3468, %r3468, 24;
+ add.s32 %r3470, %r3469, %r3463;
+ xor.b32 %r3471, %r3470, %r3465;
+ shf.l.wrap.b32 %r3472, %r3471, %r3471, 25;
+ add.s32 %r3473, %r3425, %r3087;
+ add.s32 %r3474, %r3473, %r3416;
+ xor.b32 %r3475, %r3474, %r3441;
+ shf.l.wrap.b32 %r3476, %r3475, %r3475, 16;
+ add.s32 %r3477, %r3476, %r3456;
+ xor.b32 %r3478, %r3477, %r3416;
+ shf.l.wrap.b32 %r3479, %r3478, %r3478, 20;
+ add.s32 %r3480, %r3474, %r3063;
+ add.s32 %r3481, %r3480, %r3479;
+ xor.b32 %r3482, %r3481, %r3476;
+ shf.l.wrap.b32 %r3483, %r3482, %r3482, 24;
+ add.s32 %r3484, %r3483, %r3477;
+ xor.b32 %r3485, %r3484, %r3479;
+ shf.l.wrap.b32 %r3486, %r3485, %r3485, 25;
+ add.s32 %r3487, %r3439, %r3103;
+ add.s32 %r3488, %r3487, %r3430;
+ xor.b32 %r3489, %r3455, %r3488;
+ shf.l.wrap.b32 %r3490, %r3489, %r3489, 16;
+ add.s32 %r3491, %r3490, %r3414;
+ xor.b32 %r3492, %r3491, %r3430;
+ shf.l.wrap.b32 %r3493, %r3492, %r3492, 20;
+ add.s32 %r3494, %r3488, %r3015;
+ add.s32 %r3495, %r3494, %r3493;
+ xor.b32 %r3496, %r3495, %r3490;
+ shf.l.wrap.b32 %r3497, %r3496, %r3496, 24;
+ add.s32 %r3498, %r3497, %r3491;
+ xor.b32 %r3499, %r3498, %r3493;
+ shf.l.wrap.b32 %r3500, %r3499, %r3499, 25;
+ add.s32 %r3501, %r3444, %r3095;
+ add.s32 %r3502, %r3501, %r3453;
+ xor.b32 %r3503, %r3502, %r3413;
+ shf.l.wrap.b32 %r3504, %r3503, %r3503, 16;
+ add.s32 %r3505, %r3504, %r3428;
+ xor.b32 %r3506, %r3505, %r3444;
+ shf.l.wrap.b32 %r3507, %r3506, %r3506, 20;
+ add.s32 %r3508, %r3502, %r3114;
+ add.s32 %r3509, %r3508, %r3507;
+ xor.b32 %r3510, %r3509, %r3504;
+ shf.l.wrap.b32 %r3511, %r3510, %r3510, 24;
+ add.s32 %r3512, %r3511, %r3505;
+ xor.b32 %r3513, %r3512, %r3507;
+ shf.l.wrap.b32 %r3514, %r3513, %r3513, 25;
+ add.s32 %r3515, %r3486, %r3023;
+ add.s32 %r3516, %r3515, %r3467;
+ xor.b32 %r3517, %r3516, %r3511;
+ shf.l.wrap.b32 %r3518, %r3517, %r3517, 16;
+ add.s32 %r3519, %r3518, %r3498;
+ xor.b32 %r3520, %r3519, %r3486;
+ shf.l.wrap.b32 %r3521, %r3520, %r3520, 20;
+ add.s32 %r3522, %r3516, %r2991;
+ add.s32 %r3523, %r3522, %r3521;
+ xor.b32 %r3524, %r3523, %r3518;
+ shf.l.wrap.b32 %r3525, %r3524, %r3524, 24;
%r3524, %r3524, 24; + add.s32 %r3526, %r3525, %r3519; + xor.b32 %r3527, %r3526, %r3521; + shf.l.wrap.b32 %r3528, %r3527, %r3527, 25; + add.s32 %r3529, %r3481, %r3079; + add.s32 %r3530, %r3529, %r3500; + xor.b32 %r3531, %r3469, %r3530; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 16; + add.s32 %r3533, %r3532, %r3512; + xor.b32 %r3534, %r3533, %r3500; + shf.l.wrap.b32 %r3535, %r3534, %r3534, 20; + add.s32 %r3536, %r3530, %r3007; + add.s32 %r3537, %r3536, %r3535; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 24; + add.s32 %r3540, %r3539, %r3533; + xor.b32 %r3541, %r3540, %r3535; + shf.l.wrap.b32 %r3542, %r3541, %r3541, 25; + add.s32 %r3543, %r3495, %r3031; + add.s32 %r3544, %r3543, %r3514; + xor.b32 %r3545, %r3544, %r3483; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 16; + add.s32 %r3547, %r3546, %r3470; + xor.b32 %r3548, %r3547, %r3514; + shf.l.wrap.b32 %r3549, %r3548, %r3548, 20; + add.s32 %r3550, %r3544, %r3055; + add.s32 %r3551, %r3550, %r3549; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 24; + add.s32 %r3554, %r3553, %r3547; + xor.b32 %r3555, %r3554, %r3549; + shf.l.wrap.b32 %r3556, %r3555, %r3555, 25; + add.s32 %r3557, %r3509, %r2999; + add.s32 %r3558, %r3557, %r3472; + xor.b32 %r3559, %r3558, %r3497; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 16; + add.s32 %r3561, %r3560, %r3484; + xor.b32 %r3562, %r3561, %r3472; + shf.l.wrap.b32 %r3563, %r3562, %r3562, 20; + add.s32 %r3564, %r3558, %r3039; + add.s32 %r3565, %r3564, %r3563; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 24; + add.s32 %r3568, %r3567, %r3561; + xor.b32 %r3569, %r3568, %r3563; + shf.l.wrap.b32 %r3570, %r3569, %r3569, 25; + add.s32 %r3571, %r3523, %r3087; + add.s32 %r3572, %r3571, %r3570; + xor.b32 %r3573, %r3572, %r3539; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 16; + add.s32 %r3575, %r3574, %r3554; + xor.b32 %r3576, %r3575, %r3570; + shf.l.wrap.b32 %r3577, %r3576, %r3576, 20; + add.s32 %r3578, %r3572, %r3095; + add.s32 %r3579, %r3578, %r3577; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 24; + add.s32 %r3582, %r3581, %r3575; + xor.b32 %r3583, %r3582, %r3577; + shf.l.wrap.b32 %r3584, %r3583, %r3583, 25; + add.s32 %r3585, %r3537, %r3063; + add.s32 %r3586, %r3585, %r3528; + xor.b32 %r3587, %r3586, %r3553; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 16; + add.s32 %r3589, %r3588, %r3568; + xor.b32 %r3590, %r3589, %r3528; + shf.l.wrap.b32 %r3591, %r3590, %r3590, 20; + add.s32 %r3592, %r3586, %r3079; + add.s32 %r3593, %r3592, %r3591; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 24; + add.s32 %r3596, %r3595, %r3589; + xor.b32 %r3597, %r3596, %r3591; + shf.l.wrap.b32 %r3598, %r3597, %r3597, 25; + add.s32 %r3599, %r3551, %r3114; + add.s32 %r3600, %r3599, %r3542; + xor.b32 %r3601, %r3567, %r3600; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 16; + add.s32 %r3603, %r3602, %r3526; + xor.b32 %r3604, %r3603, %r3542; + shf.l.wrap.b32 %r3605, %r3604, %r3604, 20; + add.s32 %r3606, %r3600, %r3071; + add.s32 %r3607, %r3606, %r3605; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 24; + add.s32 %r3610, %r3609, %r3603; + xor.b32 %r3611, %r3610, %r3605; + shf.l.wrap.b32 %r3612, %r3611, %r3611, 25; + add.s32 %r3613, %r3556, %r3103; + add.s32 %r3614, %r3613, %r3565; + xor.b32 %r3615, %r3614, %r3525; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 16; + add.s32 %r3617, %r3616, %r3540; + xor.b32 %r3618, %r3617, %r3556; + shf.l.wrap.b32 %r3619, %r3618, %r3618, 20; + add.s32 %r3620, %r3614, %r3055; + 
add.s32 %r3621, %r3620, %r3619; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 24; + add.s32 %r3624, %r3623, %r3617; + xor.b32 %r3625, %r3624, %r3619; + shf.l.wrap.b32 %r3626, %r3625, %r3625, 25; + add.s32 %r3627, %r3598, %r3047; + add.s32 %r3628, %r3627, %r3579; + xor.b32 %r3629, %r3628, %r3623; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 16; + add.s32 %r3631, %r3630, %r3610; + xor.b32 %r3632, %r3631, %r3598; + shf.l.wrap.b32 %r3633, %r3632, %r3632, 20; + add.s32 %r3634, %r3628, %r3007; + add.s32 %r3635, %r3634, %r3633; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 24; + add.s32 %r3638, %r3637, %r3631; + xor.b32 %r3639, %r3638, %r3633; + shf.l.wrap.b32 %r3640, %r3639, %r3639, 25; + add.s32 %r3641, %r3593, %r3031; + add.s32 %r3642, %r3641, %r3612; + xor.b32 %r3643, %r3581, %r3642; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 16; + add.s32 %r3645, %r3644, %r3624; + xor.b32 %r3646, %r3645, %r3612; + shf.l.wrap.b32 %r3647, %r3646, %r3646, 20; + add.s32 %r3648, %r3642, %r3015; + add.s32 %r3649, %r3648, %r3647; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 24; + add.s32 %r3652, %r3651, %r3645; + xor.b32 %r3653, %r3652, %r3647; + shf.l.wrap.b32 %r3654, %r3653, %r3653, 25; + add.s32 %r3655, %r3607, %r2991; + add.s32 %r3656, %r3655, %r3626; + xor.b32 %r3657, %r3656, %r3595; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 16; + add.s32 %r3659, %r3658, %r3582; + xor.b32 %r3660, %r3659, %r3626; + shf.l.wrap.b32 %r3661, %r3660, %r3660, 20; + add.s32 %r3662, %r3656, %r2999; + add.s32 %r3663, %r3662, %r3661; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 24; + add.s32 %r3666, %r3665, %r3659; + xor.b32 %r3667, %r3666, %r3661; + shf.l.wrap.b32 %r3668, %r3667, %r3667, 25; + add.s32 %r3669, %r3621, %r3039; + add.s32 %r3670, %r3669, %r3584; + xor.b32 %r3671, %r3670, %r3609; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 16; + add.s32 %r3673, %r3672, %r3596; + xor.b32 %r3674, %r3673, %r3584; + shf.l.wrap.b32 %r3675, %r3674, %r3674, 20; + add.s32 %r3676, %r3670, %r3023; + add.s32 %r3677, %r3676, %r3675; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 24; + add.s32 %r3680, %r3679, %r3673; + xor.b32 %r3681, %r3680, %r3675; + shf.l.wrap.b32 %r3682, %r3681, %r3681, 25; + add.s32 %r3683, %r3635, %r3063; + add.s32 %r3684, %r3683, %r3682; + xor.b32 %r3685, %r3684, %r3651; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 16; + add.s32 %r3687, %r3686, %r3666; + xor.b32 %r3688, %r3687, %r3682; + shf.l.wrap.b32 %r3689, %r3688, %r3688, 20; + add.s32 %r3690, %r3684, %r3103; + add.s32 %r3691, %r3690, %r3689; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 24; + add.s32 %r3694, %r3693, %r3687; + xor.b32 %r3695, %r3694, %r3689; + shf.l.wrap.b32 %r3696, %r3695, %r3695, 25; + add.s32 %r3697, %r3649, %r3079; + add.s32 %r3698, %r3697, %r3640; + xor.b32 %r3699, %r3698, %r3665; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 16; + add.s32 %r3701, %r3700, %r3680; + xor.b32 %r3702, %r3701, %r3640; + shf.l.wrap.b32 %r3703, %r3702, %r3702, 20; + add.s32 %r3704, %r3698, %r3031; + add.s32 %r3705, %r3704, %r3703; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 24; + add.s32 %r3708, %r3707, %r3701; + xor.b32 %r3709, %r3708, %r3703; + shf.l.wrap.b32 %r3710, %r3709, %r3709, 25; + add.s32 %r3711, %r3663, %r3055; + add.s32 %r3712, %r3711, %r3654; + xor.b32 %r3713, %r3679, %r3712; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 16; + add.s32 %r3715, %r3714, %r3638; + xor.b32 %r3716, %r3715, 
%r3654; + shf.l.wrap.b32 %r3717, %r3716, %r3716, 20; + add.s32 %r3718, %r3712, %r3087; + add.s32 %r3719, %r3718, %r3717; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 24; + add.s32 %r3722, %r3721, %r3715; + xor.b32 %r3723, %r3722, %r3717; + shf.l.wrap.b32 %r3724, %r3723, %r3723, 25; + add.s32 %r3725, %r3668, %r3114; + add.s32 %r3726, %r3725, %r3677; + xor.b32 %r3727, %r3726, %r3637; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 16; + add.s32 %r3729, %r3728, %r3652; + xor.b32 %r3730, %r3729, %r3668; + shf.l.wrap.b32 %r3731, %r3730, %r3730, 20; + add.s32 %r3732, %r3726, %r2999; + add.s32 %r3733, %r3732, %r3731; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 24; + add.s32 %r3736, %r3735, %r3729; + xor.b32 %r3737, %r3736, %r3731; + shf.l.wrap.b32 %r3738, %r3737, %r3737, 25; + add.s32 %r3739, %r3710, %r3095; + add.s32 %r3740, %r3739, %r3691; + xor.b32 %r3741, %r3740, %r3735; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 16; + add.s32 %r3743, %r3742, %r3722; + xor.b32 %r3744, %r3743, %r3710; + shf.l.wrap.b32 %r3745, %r3744, %r3744, 20; + add.s32 %r3746, %r3740, %r3015; + add.s32 %r3747, %r3746, %r3745; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 24; + add.s32 %r3750, %r3749, %r3743; + xor.b32 %r3751, %r3750, %r3745; + shf.l.wrap.b32 %r3752, %r3751, %r3751, 25; + add.s32 %r3753, %r3705, %r2991; + add.s32 %r3754, %r3753, %r3724; + xor.b32 %r3755, %r3693, %r3754; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 16; + add.s32 %r3757, %r3756, %r3736; + xor.b32 %r3758, %r3757, %r3724; + shf.l.wrap.b32 %r3759, %r3758, %r3758, 20; + add.s32 %r3760, %r3754, %r3071; + add.s32 %r3761, %r3760, %r3759; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 24; + add.s32 %r3764, %r3763, %r3757; + xor.b32 %r3765, %r3764, %r3759; + shf.l.wrap.b32 %r3766, %r3765, %r3765, 25; + add.s32 %r3767, %r3719, %r3007; + add.s32 %r3768, %r3767, %r3738; + xor.b32 %r3769, %r3768, %r3707; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 16; + add.s32 %r3771, %r3770, %r3694; + xor.b32 %r3772, %r3771, %r3738; + shf.l.wrap.b32 %r3773, %r3772, %r3772, 20; + add.s32 %r3774, %r3768, %r3039; + add.s32 %r3775, %r3774, %r3773; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 24; + add.s32 %r3778, %r3777, %r3771; + xor.b32 %r3779, %r3778, %r3773; + shf.l.wrap.b32 %r3780, %r3779, %r3779, 25; + add.s32 %r3781, %r3733, %r3023; + add.s32 %r3782, %r3781, %r3696; + xor.b32 %r3783, %r3782, %r3721; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 16; + add.s32 %r3785, %r3784, %r3708; + xor.b32 %r3786, %r3785, %r3696; + shf.l.wrap.b32 %r3787, %r3786, %r3786, 20; + add.s32 %r3788, %r3782, %r3047; + add.s32 %r3789, %r3788, %r3787; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 24; + add.s32 %r3792, %r3791, %r3785; + xor.b32 %r3793, %r3792, %r3787; + shf.l.wrap.b32 %r3794, %r3793, %r3793, 25; + add.s32 %r3795, %r3747, %r3079; + add.s32 %r3796, %r3795, %r3794; + xor.b32 %r3797, %r3796, %r3763; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 16; + add.s32 %r3799, %r3798, %r3778; + xor.b32 %r3800, %r3799, %r3794; + shf.l.wrap.b32 %r3801, %r3800, %r3800, 20; + add.s32 %r3802, %r3796, %r3114; + add.s32 %r3803, %r3802, %r3801; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 24; + add.s32 %r3806, %r3805, %r3799; + xor.b32 %r3807, %r3806, %r3801; + shf.l.wrap.b32 %r3808, %r3807, %r3807, 25; + add.s32 %r3809, %r3761, %r3031; + add.s32 %r3810, %r3809, %r3752; + xor.b32 %r3811, %r3810, %r3777; + shf.l.wrap.b32 
%r3812, %r3811, %r3811, 16; + add.s32 %r3813, %r3812, %r3792; + xor.b32 %r3814, %r3813, %r3752; + shf.l.wrap.b32 %r3815, %r3814, %r3814, 20; + add.s32 %r3816, %r3810, %r2991; + add.s32 %r3817, %r3816, %r3815; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 24; + add.s32 %r3820, %r3819, %r3813; + xor.b32 %r3821, %r3820, %r3815; + shf.l.wrap.b32 %r3822, %r3821, %r3821, 25; + add.s32 %r3823, %r3775, %r2999; + add.s32 %r3824, %r3823, %r3766; + xor.b32 %r3825, %r3791, %r3824; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 16; + add.s32 %r3827, %r3826, %r3750; + xor.b32 %r3828, %r3827, %r3766; + shf.l.wrap.b32 %r3829, %r3828, %r3828, 20; + add.s32 %r3830, %r3824, %r3063; + add.s32 %r3831, %r3830, %r3829; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 24; + add.s32 %r3834, %r3833, %r3827; + xor.b32 %r3835, %r3834, %r3829; + shf.l.wrap.b32 %r3836, %r3835, %r3835, 25; + add.s32 %r3837, %r3780, %r3055; + add.s32 %r3838, %r3837, %r3789; + xor.b32 %r3839, %r3838, %r3749; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 16; + add.s32 %r3841, %r3840, %r3764; + xor.b32 %r3842, %r3841, %r3780; + shf.l.wrap.b32 %r3843, %r3842, %r3842, 20; + add.s32 %r3844, %r3838, %r3039; + add.s32 %r3845, %r3844, %r3843; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 24; + add.s32 %r3848, %r3847, %r3841; + xor.b32 %r3849, %r3848, %r3843; + shf.l.wrap.b32 %r3850, %r3849, %r3849, 25; + add.s32 %r3851, %r3822, %r3103; + add.s32 %r3852, %r3851, %r3803; + xor.b32 %r3853, %r3852, %r3847; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 16; + add.s32 %r3855, %r3854, %r3834; + xor.b32 %r3856, %r3855, %r3822; + shf.l.wrap.b32 %r3857, %r3856, %r3856, 20; + add.s32 %r3858, %r3852, %r3071; + add.s32 %r3859, %r3858, %r3857; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 24; + add.s32 %r3862, %r3861, %r3855; + xor.b32 %r3863, %r3862, %r3857; + shf.l.wrap.b32 %r3864, %r3863, %r3863, 25; + add.s32 %r3865, %r3817, %r3007; + add.s32 %r3866, %r3865, %r3836; + xor.b32 %r3867, %r3805, %r3866; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 16; + add.s32 %r3869, %r3868, %r3848; + xor.b32 %r3870, %r3869, %r3836; + shf.l.wrap.b32 %r3871, %r3870, %r3870, 20; + add.s32 %r3872, %r3866, %r3087; + add.s32 %r3873, %r3872, %r3871; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 24; + add.s32 %r3876, %r3875, %r3869; + xor.b32 %r3877, %r3876, %r3871; + shf.l.wrap.b32 %r3878, %r3877, %r3877, 25; + add.s32 %r3879, %r3831, %r3015; + add.s32 %r3880, %r3879, %r3850; + xor.b32 %r3881, %r3880, %r3819; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 16; + add.s32 %r3883, %r3882, %r3806; + xor.b32 %r3884, %r3883, %r3850; + shf.l.wrap.b32 %r3885, %r3884, %r3884, 20; + add.s32 %r3886, %r3880, %r3023; + add.s32 %r3887, %r3886, %r3885; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 24; + add.s32 %r3890, %r3889, %r3883; + xor.b32 %r3891, %r3890, %r3885; + shf.l.wrap.b32 %r3892, %r3891, %r3891, 25; + add.s32 %r3893, %r3845, %r3047; + add.s32 %r3894, %r3893, %r3808; + xor.b32 %r3895, %r3894, %r3833; + shf.l.wrap.b32 %r3896, %r3895, %r3895, 16; + add.s32 %r3897, %r3896, %r3820; + xor.b32 %r3898, %r3897, %r3808; + shf.l.wrap.b32 %r3899, %r3898, %r3898, 20; + add.s32 %r3900, %r3894, %r3095; + add.s32 %r3901, %r3900, %r3899; + xor.b32 %r3902, %r3901, %r3896; + shf.l.wrap.b32 %r3903, %r3902, %r3902, 24; + add.s32 %r3904, %r3903, %r3897; + xor.b32 %r3905, %r3904, %r3899; + shf.l.wrap.b32 %r3906, %r3905, %r3905, 25; + xor.b32 %r3907, %r3890, %r3859; 
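+ // ---- editorial annotation, not compiler output ----
+ // Assumption: this PTX is nvcc output for a CUDA port of the reference
+ // BLAKE3 compression loop. Each unrolled quarter-round above is the G
+ // function; `shf.l.wrap.b32 %x, %y, %y, N` is a 32-bit rotate-left by N,
+ // so the N = 16/20/24/25 funnel shifts realize BLAKE3's rotr by
+ // 16/12/8/7. In CUDA C one G application would read roughly:
+ //   a += b + m0; d = __funnelshift_l(d ^ a, d ^ a, 16); // rotr32(d^a,16)
+ //   c += d;      b = __funnelshift_l(b ^ c, b ^ c, 20); // rotr32(b^c,12)
+ //   a += b + m1; d = __funnelshift_l(d ^ a, d ^ a, 24); // rotr32(d^a, 8)
+ //   c += d;      b = __funnelshift_l(b ^ c, b ^ c, 25); // rotr32(b^c, 7)
+ // The xor.b32 chain just below (%r3907..%r3914) is the final
+ // feed-forward, v[i] ^ v[i+8], and the st.local.u8/shr.u32 runs store
+ // the resulting 32-byte chaining value little-endian at [%rd133].
+ // Further down, $L__BB0_1 computes the left/right subtree split: the
+ // selp chain is a branchless floor(log2((len - 1) / 1024)), the shl.b64
+ // of 1024 by %r72 rounds down to a power-of-two number of 1024-byte
+ // chunks (consistent with the reference left_len()), and callseq 0/1
+ // then recurse into blake3_compress_subtree_wide on the two halves.
+ // ---- end annotation ----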
+ xor.b32 %r3908, %r3904, %r3873; + xor.b32 %r3909, %r3862, %r3887; + xor.b32 %r3910, %r3901, %r3876; + xor.b32 %r3911, %r3906, %r3875; + xor.b32 %r3912, %r3864, %r3889; + xor.b32 %r3913, %r3903, %r3878; + xor.b32 %r3914, %r3892, %r3861; + st.local.u8 [%rd133], %r3907; + shr.u32 %r3915, %r3907, 8; + st.local.u8 [%rd133+1], %r3915; + shr.u32 %r3916, %r3907, 16; + st.local.u8 [%rd133+2], %r3916; + shr.u32 %r3917, %r3907, 24; + st.local.u8 [%rd133+3], %r3917; + st.local.u8 [%rd133+4], %r3908; + shr.u32 %r3918, %r3908, 8; + st.local.u8 [%rd133+5], %r3918; + shr.u32 %r3919, %r3908, 16; + st.local.u8 [%rd133+6], %r3919; + shr.u32 %r3920, %r3908, 24; + st.local.u8 [%rd133+7], %r3920; + st.local.u8 [%rd133+8], %r3909; + shr.u32 %r3921, %r3909, 8; + st.local.u8 [%rd133+9], %r3921; + shr.u32 %r3922, %r3909, 16; + st.local.u8 [%rd133+10], %r3922; + shr.u32 %r3923, %r3909, 24; + st.local.u8 [%rd133+11], %r3923; + st.local.u8 [%rd133+12], %r3910; + shr.u32 %r3924, %r3910, 8; + st.local.u8 [%rd133+13], %r3924; + shr.u32 %r3925, %r3910, 16; + st.local.u8 [%rd133+14], %r3925; + shr.u32 %r3926, %r3910, 24; + st.local.u8 [%rd133+15], %r3926; + st.local.u8 [%rd133+16], %r3911; + shr.u32 %r3927, %r3911, 8; + st.local.u8 [%rd133+17], %r3927; + shr.u32 %r3928, %r3911, 16; + st.local.u8 [%rd133+18], %r3928; + shr.u32 %r3929, %r3911, 24; + st.local.u8 [%rd133+19], %r3929; + st.local.u8 [%rd133+20], %r3912; + shr.u32 %r3930, %r3912, 8; + st.local.u8 [%rd133+21], %r3930; + shr.u32 %r3931, %r3912, 16; + st.local.u8 [%rd133+22], %r3931; + shr.u32 %r3932, %r3912, 24; + st.local.u8 [%rd133+23], %r3932; + st.local.u8 [%rd133+24], %r3913; + shr.u32 %r3933, %r3913, 8; + st.local.u8 [%rd133+25], %r3933; + shr.u32 %r3934, %r3913, 16; + st.local.u8 [%rd133+26], %r3934; + shr.u32 %r3935, %r3913, 24; + st.local.u8 [%rd133+27], %r3935; + st.local.u8 [%rd133+28], %r3914; + shr.u32 %r3936, %r3914, 8; + st.local.u8 [%rd133+29], %r3936; + shr.u32 %r3937, %r3914, 16; + st.local.u8 [%rd133+30], %r3937; + shr.u32 %r3938, %r3914, 24; + st.local.u8 [%rd133+31], %r3938; + add.s64 %rd150, %rd150, 1; + bra.uni $L__BB0_30; + +$L__BB0_1: + add.s64 %rd76, %rd170, -1; + shr.u64 %rd77, %rd76, 10; + or.b64 %rd78, %rd77, 1; + setp.gt.u64 %p2, %rd78, 4294967295; + shr.u64 %rd79, %rd76, 42; + selp.b64 %rd80, %rd79, %rd78, %p2; + selp.b32 %r62, 32, 0, %p2; + and.b64 %rd81, %rd80, 4294901760; + setp.ne.s64 %p3, %rd81, 0; + shr.u64 %rd82, %rd80, 16; + or.b32 %r63, %r62, 16; + selp.b64 %rd83, %rd82, %rd80, %p3; + selp.b32 %r64, %r63, %r62, %p3; + and.b64 %rd84, %rd83, 65280; + setp.ne.s64 %p4, %rd84, 0; + shr.u64 %rd85, %rd83, 8; + or.b32 %r65, %r64, 8; + selp.b64 %rd86, %rd85, %rd83, %p4; + selp.b32 %r66, %r65, %r64, %p4; + and.b64 %rd87, %rd86, 240; + setp.ne.s64 %p5, %rd87, 0; + shr.u64 %rd88, %rd86, 4; + or.b32 %r67, %r66, 4; + selp.b64 %rd89, %rd88, %rd86, %p5; + selp.b32 %r68, %r67, %r66, %p5; + and.b64 %rd90, %rd89, 12; + setp.ne.s64 %p6, %rd90, 0; + shr.u64 %rd91, %rd89, 2; + add.s32 %r69, %r68, 2; + selp.b64 %rd92, %rd91, %rd89, %p6; + selp.b32 %r70, %r69, %r68, %p6; + bfe.u64 %rd93, %rd92, 1, 1; + cvt.u32.u64 %r71, %rd93; + add.s32 %r72, %r70, %r71; + mov.u64 %rd94, 1024; + shl.b64 %rd95, %rd94, %r72; + sub.s64 %rd96, %rd170, %rd95; + add.s64 %rd97, %rd69, %rd95; + shr.u64 %rd98, %rd95, 10; + add.s64 %rd99, %rd98, %rd164; + setp.gt.u64 %p7, %rd95, 1024; + selp.b64 %rd100, 64, 32, %p7; + add.s64 %rd102, %rd148, %rd100; + cvt.u32.u16 %r73, %rs75; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], 
%rd69; + .param .b64 param1; + st.param.b64 [param1+0], %rd95; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd164; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd148; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd5, [retval0+0]; + } // callseq 0 + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd97; + .param .b64 param1; + st.param.b64 [param1+0], %rd96; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd99; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd102; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd6, [retval0+0]; + } // callseq 1 + setp.eq.s64 %p8, %rd5, 1; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_2; + +$L__BB0_12: + mov.u64 %rd157, 0; + +$L__BB0_13: + add.s64 %rd116, %rd4, %rd157; + ld.local.u8 %rs78, [%rd116]; + add.s64 %rd117, %rd154, %rd157; + st.local.u8 [%rd117], %rs78; + add.s64 %rd157, %rd157, 1; + setp.lt.u64 %p15, %rd157, 64; + mov.u64 %rd150, 2; + @%p15 bra $L__BB0_13; + bra.uni $L__BB0_30; + +$L__BB0_2: + add.s64 %rd7, %rd6, %rd5; + setp.lt.u64 %p9, %rd7, 2; + mov.u64 %rd150, 0; + mov.u64 %rd151, %rd150; + @%p9 bra $L__BB0_5; + + mov.u64 %rd145, %rd152; + mov.u64 %rd146, %rd7; + +$L__BB0_4: + st.local.u64 [%rd145], %rd148; + add.s64 %rd150, %rd150, 1; + add.s64 %rd148, %rd148, 64; + add.s64 %rd151, %rd151, 2; + add.s64 %rd145, %rd145, 8; + add.s64 %rd146, %rd146, -2; + setp.gt.u64 %p10, %rd146, 1; + @%p10 bra $L__BB0_4; + +$L__BB0_5: + setp.eq.s64 %p11, %rd150, 0; + @%p11 bra $L__BB0_8; + + or.b16 %rs76, %rs75, 4; + cvt.u32.u16 %r1, %rs76; + mov.u64 %rd153, %rd150; + +$L__BB0_7: + ld.local.u64 %rd108, [%rd152]; + ld.u8 %r74, [%rd108]; + ld.u8 %r75, [%rd108+1]; + prmt.b32 %r76, %r75, %r74, 30212; + ld.u8 %r77, [%rd108+2]; + prmt.b32 %r78, %r77, %r76, 28756; + ld.u8 %r79, [%rd108+3]; + prmt.b32 %r80, %r79, %r78, 1620; + ld.u8 %r81, [%rd108+4]; + ld.u8 %r82, [%rd108+5]; + prmt.b32 %r83, %r82, %r81, 30212; + ld.u8 %r84, [%rd108+6]; + prmt.b32 %r85, %r84, %r83, 28756; + ld.u8 %r86, [%rd108+7]; + prmt.b32 %r87, %r86, %r85, 1620; + ld.u8 %r88, [%rd108+8]; + ld.u8 %r89, [%rd108+9]; + prmt.b32 %r90, %r89, %r88, 30212; + ld.u8 %r91, [%rd108+10]; + prmt.b32 %r92, %r91, %r90, 28756; + ld.u8 %r93, [%rd108+11]; + prmt.b32 %r94, %r93, %r92, 1620; + ld.u8 %r95, [%rd108+12]; + ld.u8 %r96, [%rd108+13]; + prmt.b32 %r97, %r96, %r95, 30212; + ld.u8 %r98, [%rd108+14]; + prmt.b32 %r99, %r98, %r97, 28756; + ld.u8 %r100, [%rd108+15]; + prmt.b32 %r101, %r100, %r99, 1620; + ld.u8 %r102, [%rd108+16]; + ld.u8 %r103, [%rd108+17]; + prmt.b32 %r104, %r103, %r102, 30212; + ld.u8 %r105, [%rd108+18]; + prmt.b32 %r106, %r105, %r104, 28756; + ld.u8 %r107, [%rd108+19]; + prmt.b32 %r108, %r107, %r106, 1620; + ld.u8 %r109, [%rd108+20]; + ld.u8 %r110, [%rd108+21]; + prmt.b32 %r111, %r110, %r109, 30212; + ld.u8 %r112, [%rd108+22]; + prmt.b32 %r113, %r112, %r111, 28756; + ld.u8 %r114, [%rd108+23]; + prmt.b32 %r115, %r114, %r113, 1620; + ld.u8 %r116, [%rd108+24]; + ld.u8 
%r117, [%rd108+25]; + prmt.b32 %r118, %r117, %r116, 30212; + ld.u8 %r119, [%rd108+26]; + prmt.b32 %r120, %r119, %r118, 28756; + ld.u8 %r121, [%rd108+27]; + prmt.b32 %r122, %r121, %r120, 1620; + ld.u8 %r123, [%rd108+28]; + ld.u8 %r124, [%rd108+29]; + prmt.b32 %r125, %r124, %r123, 30212; + ld.u8 %r126, [%rd108+30]; + prmt.b32 %r127, %r126, %r125, 28756; + ld.u8 %r128, [%rd108+31]; + prmt.b32 %r129, %r128, %r127, 1620; + ld.u8 %r130, [%rd108+32]; + ld.u8 %r131, [%rd108+33]; + prmt.b32 %r132, %r131, %r130, 30212; + ld.u8 %r133, [%rd108+34]; + prmt.b32 %r134, %r133, %r132, 28756; + ld.u8 %r135, [%rd108+35]; + prmt.b32 %r136, %r135, %r134, 1620; + ld.u8 %r137, [%rd108+36]; + ld.u8 %r138, [%rd108+37]; + prmt.b32 %r139, %r138, %r137, 30212; + ld.u8 %r140, [%rd108+38]; + prmt.b32 %r141, %r140, %r139, 28756; + ld.u8 %r142, [%rd108+39]; + prmt.b32 %r143, %r142, %r141, 1620; + ld.u8 %r144, [%rd108+40]; + ld.u8 %r145, [%rd108+41]; + prmt.b32 %r146, %r145, %r144, 30212; + ld.u8 %r147, [%rd108+42]; + prmt.b32 %r148, %r147, %r146, 28756; + ld.u8 %r149, [%rd108+43]; + prmt.b32 %r150, %r149, %r148, 1620; + ld.u8 %r151, [%rd108+44]; + ld.u8 %r152, [%rd108+45]; + prmt.b32 %r153, %r152, %r151, 30212; + ld.u8 %r154, [%rd108+46]; + prmt.b32 %r155, %r154, %r153, 28756; + ld.u8 %r156, [%rd108+47]; + prmt.b32 %r157, %r156, %r155, 1620; + ld.u8 %r158, [%rd108+48]; + ld.u8 %r159, [%rd108+49]; + prmt.b32 %r160, %r159, %r158, 30212; + ld.u8 %r161, [%rd108+50]; + prmt.b32 %r162, %r161, %r160, 28756; + ld.u8 %r163, [%rd108+51]; + prmt.b32 %r164, %r163, %r162, 1620; + ld.u8 %r165, [%rd108+52]; + ld.u8 %r166, [%rd108+53]; + prmt.b32 %r167, %r166, %r165, 30212; + ld.u8 %r168, [%rd108+54]; + prmt.b32 %r169, %r168, %r167, 28756; + ld.u8 %r170, [%rd108+55]; + prmt.b32 %r171, %r170, %r169, 1620; + ld.u8 %r172, [%rd108+56]; + ld.u8 %r173, [%rd108+57]; + prmt.b32 %r174, %r173, %r172, 30212; + ld.u8 %r175, [%rd108+58]; + prmt.b32 %r176, %r175, %r174, 28756; + ld.u8 %r177, [%rd108+59]; + prmt.b32 %r178, %r177, %r176, 1620; + ld.u8 %r179, [%rd108+60]; + ld.u8 %r180, [%rd108+61]; + prmt.b32 %r181, %r180, %r179, 30212; + ld.u8 %r182, [%rd108+62]; + prmt.b32 %r183, %r182, %r181, 28756; + ld.u8 %r184, [%rd108+63]; + prmt.b32 %r185, %r184, %r183, 1620; + ld.local.u8 %r186, [%rd2+16]; + ld.local.u8 %r187, [%rd2+17]; + prmt.b32 %r188, %r187, %r186, 30212; + ld.local.u8 %r189, [%rd2+18]; + ld.local.u8 %r190, [%rd2+19]; + prmt.b32 %r191, %r190, %r189, 30212; + prmt.b32 %r192, %r191, %r188, 4180; + ld.local.u8 %r193, [%rd2]; + ld.local.u8 %r194, [%rd2+1]; + prmt.b32 %r195, %r194, %r193, 30212; + ld.local.u8 %r196, [%rd2+2]; + ld.local.u8 %r197, [%rd2+3]; + prmt.b32 %r198, %r197, %r196, 30212; + prmt.b32 %r199, %r198, %r195, 4180; + add.s32 %r200, %r192, %r199; + add.s32 %r201, %r200, %r80; + shf.l.wrap.b32 %r202, %r201, %r201, 16; + add.s32 %r203, %r202, 1779033703; + xor.b32 %r204, %r203, %r192; + shf.l.wrap.b32 %r205, %r204, %r204, 20; + add.s32 %r206, %r87, %r201; + add.s32 %r207, %r206, %r205; + xor.b32 %r208, %r207, %r202; + shf.l.wrap.b32 %r209, %r208, %r208, 24; + add.s32 %r210, %r209, %r203; + xor.b32 %r211, %r210, %r205; + shf.l.wrap.b32 %r212, %r211, %r211, 25; + ld.local.u8 %r213, [%rd2+20]; + ld.local.u8 %r214, [%rd2+21]; + prmt.b32 %r215, %r214, %r213, 30212; + ld.local.u8 %r216, [%rd2+22]; + ld.local.u8 %r217, [%rd2+23]; + prmt.b32 %r218, %r217, %r216, 30212; + prmt.b32 %r219, %r218, %r215, 4180; + ld.local.u8 %r220, [%rd2+4]; + ld.local.u8 %r221, [%rd2+5]; + prmt.b32 %r222, %r221, %r220, 30212; + ld.local.u8 %r223, [%rd2+6]; + 
ld.local.u8 %r224, [%rd2+7]; + prmt.b32 %r225, %r224, %r223, 30212; + prmt.b32 %r226, %r225, %r222, 4180; + add.s32 %r227, %r219, %r226; + add.s32 %r228, %r227, %r94; + shf.l.wrap.b32 %r229, %r228, %r228, 16; + add.s32 %r230, %r229, -1150833019; + xor.b32 %r231, %r230, %r219; + shf.l.wrap.b32 %r232, %r231, %r231, 20; + add.s32 %r233, %r101, %r228; + add.s32 %r234, %r233, %r232; + xor.b32 %r235, %r234, %r229; + shf.l.wrap.b32 %r236, %r235, %r235, 24; + add.s32 %r237, %r236, %r230; + xor.b32 %r238, %r237, %r232; + shf.l.wrap.b32 %r239, %r238, %r238, 25; + ld.local.u8 %r240, [%rd2+24]; + ld.local.u8 %r241, [%rd2+25]; + prmt.b32 %r242, %r241, %r240, 30212; + ld.local.u8 %r243, [%rd2+26]; + ld.local.u8 %r244, [%rd2+27]; + prmt.b32 %r245, %r244, %r243, 30212; + prmt.b32 %r246, %r245, %r242, 4180; + ld.local.u8 %r247, [%rd2+8]; + ld.local.u8 %r248, [%rd2+9]; + prmt.b32 %r249, %r248, %r247, 30212; + ld.local.u8 %r250, [%rd2+10]; + ld.local.u8 %r251, [%rd2+11]; + prmt.b32 %r252, %r251, %r250, 30212; + prmt.b32 %r253, %r252, %r249, 4180; + add.s32 %r254, %r246, %r253; + add.s32 %r255, %r254, %r108; + shr.u32 %r256, %r255, 16; + shl.b32 %r257, %r255, 16; + xor.b32 %r258, %r257, 4194304; + or.b32 %r259, %r258, %r256; + add.s32 %r260, %r259, 1013904242; + xor.b32 %r261, %r260, %r246; + shf.l.wrap.b32 %r262, %r261, %r261, 20; + add.s32 %r263, %r115, %r255; + add.s32 %r264, %r263, %r262; + xor.b32 %r265, %r264, %r259; + shf.l.wrap.b32 %r266, %r265, %r265, 24; + add.s32 %r267, %r266, %r260; + xor.b32 %r268, %r267, %r262; + shf.l.wrap.b32 %r269, %r268, %r268, 25; + ld.local.u8 %r270, [%rd2+28]; + ld.local.u8 %r271, [%rd2+29]; + prmt.b32 %r272, %r271, %r270, 30212; + ld.local.u8 %r273, [%rd2+30]; + ld.local.u8 %r274, [%rd2+31]; + prmt.b32 %r275, %r274, %r273, 30212; + prmt.b32 %r276, %r275, %r272, 4180; + ld.local.u8 %r277, [%rd2+12]; + ld.local.u8 %r278, [%rd2+13]; + prmt.b32 %r279, %r278, %r277, 30212; + ld.local.u8 %r280, [%rd2+14]; + ld.local.u8 %r281, [%rd2+15]; + prmt.b32 %r282, %r281, %r280, 30212; + prmt.b32 %r283, %r282, %r279, 4180; + add.s32 %r284, %r276, %r283; + add.s32 %r285, %r284, %r122; + xor.b32 %r286, %r285, %r1; + shr.u32 %r287, %r285, 16; + shl.b32 %r288, %r286, 16; + or.b32 %r289, %r288, %r287; + add.s32 %r290, %r289, -1521486534; + xor.b32 %r291, %r290, %r276; + shf.l.wrap.b32 %r292, %r291, %r291, 20; + add.s32 %r293, %r129, %r285; + add.s32 %r294, %r293, %r292; + xor.b32 %r295, %r294, %r289; + shf.l.wrap.b32 %r296, %r295, %r295, 24; + add.s32 %r297, %r296, %r290; + xor.b32 %r298, %r297, %r292; + shf.l.wrap.b32 %r299, %r298, %r298, 25; + add.s32 %r300, %r239, %r207; + add.s32 %r301, %r300, %r136; + xor.b32 %r302, %r296, %r301; + shf.l.wrap.b32 %r303, %r302, %r302, 16; + add.s32 %r304, %r303, %r267; + xor.b32 %r305, %r304, %r239; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r143, %r301; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + add.s32 %r314, %r269, %r234; + add.s32 %r315, %r314, %r150; + xor.b32 %r316, %r315, %r209; + shf.l.wrap.b32 %r317, %r316, %r316, 16; + add.s32 %r318, %r317, %r297; + xor.b32 %r319, %r318, %r269; + shf.l.wrap.b32 %r320, %r319, %r319, 20; + add.s32 %r321, %r157, %r315; + add.s32 %r322, %r321, %r320; + xor.b32 %r323, %r322, %r317; + shf.l.wrap.b32 %r324, %r323, %r323, 24; + add.s32 %r325, %r324, %r318; + xor.b32 %r326, %r325, %r320; + shf.l.wrap.b32 %r327, %r326, %r326, 25; + add.s32 
%r328, %r299, %r264; + add.s32 %r329, %r328, %r164; + xor.b32 %r330, %r329, %r236; + shf.l.wrap.b32 %r331, %r330, %r330, 16; + add.s32 %r332, %r331, %r210; + xor.b32 %r333, %r332, %r299; + shf.l.wrap.b32 %r334, %r333, %r333, 20; + add.s32 %r335, %r171, %r329; + add.s32 %r336, %r335, %r334; + xor.b32 %r337, %r336, %r331; + shf.l.wrap.b32 %r338, %r337, %r337, 24; + add.s32 %r339, %r338, %r332; + xor.b32 %r340, %r339, %r334; + shf.l.wrap.b32 %r341, %r340, %r340, 25; + add.s32 %r342, %r294, %r212; + add.s32 %r343, %r342, %r178; + xor.b32 %r344, %r343, %r266; + shf.l.wrap.b32 %r345, %r344, %r344, 16; + add.s32 %r346, %r345, %r237; + xor.b32 %r347, %r346, %r212; + shf.l.wrap.b32 %r348, %r347, %r347, 20; + add.s32 %r349, %r185, %r343; + add.s32 %r350, %r349, %r348; + xor.b32 %r351, %r350, %r345; + shf.l.wrap.b32 %r352, %r351, %r351, 24; + add.s32 %r353, %r352, %r346; + xor.b32 %r354, %r353, %r348; + shf.l.wrap.b32 %r355, %r354, %r354, 25; + add.s32 %r356, %r308, %r94; + add.s32 %r357, %r356, %r355; + xor.b32 %r358, %r357, %r324; + shf.l.wrap.b32 %r359, %r358, %r358, 16; + add.s32 %r360, %r359, %r339; + xor.b32 %r361, %r360, %r355; + shf.l.wrap.b32 %r362, %r361, %r361, 20; + add.s32 %r363, %r357, %r122; + add.s32 %r364, %r363, %r362; + xor.b32 %r365, %r364, %r359; + shf.l.wrap.b32 %r366, %r365, %r365, 24; + add.s32 %r367, %r366, %r360; + xor.b32 %r368, %r367, %r362; + shf.l.wrap.b32 %r369, %r368, %r368, 25; + add.s32 %r370, %r322, %r101; + add.s32 %r371, %r370, %r313; + xor.b32 %r372, %r338, %r371; + shf.l.wrap.b32 %r373, %r372, %r372, 16; + add.s32 %r374, %r353, %r373; + xor.b32 %r375, %r374, %r313; + shf.l.wrap.b32 %r376, %r375, %r375, 20; + add.s32 %r377, %r371, %r150; + add.s32 %r378, %r377, %r376; + xor.b32 %r379, %r378, %r373; + shf.l.wrap.b32 %r380, %r379, %r379, 24; + add.s32 %r381, %r380, %r374; + xor.b32 %r382, %r381, %r376; + shf.l.wrap.b32 %r383, %r382, %r382, 25; + add.s32 %r384, %r327, %r129; + add.s32 %r385, %r384, %r336; + xor.b32 %r386, %r352, %r385; + shf.l.wrap.b32 %r387, %r386, %r386, 16; + add.s32 %r388, %r387, %r311; + xor.b32 %r389, %r388, %r327; + shf.l.wrap.b32 %r390, %r389, %r389, 20; + add.s32 %r391, %r385, %r80; + add.s32 %r392, %r391, %r390; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 24; + add.s32 %r395, %r394, %r388; + xor.b32 %r396, %r395, %r390; + shf.l.wrap.b32 %r397, %r396, %r396, 25; + add.s32 %r398, %r341, %r108; + add.s32 %r399, %r398, %r350; + xor.b32 %r400, %r399, %r310; + shf.l.wrap.b32 %r401, %r400, %r400, 16; + add.s32 %r402, %r401, %r325; + xor.b32 %r403, %r402, %r341; + shf.l.wrap.b32 %r404, %r403, %r403, 20; + add.s32 %r405, %r399, %r171; + add.s32 %r406, %r405, %r404; + xor.b32 %r407, %r406, %r401; + shf.l.wrap.b32 %r408, %r407, %r407, 24; + add.s32 %r409, %r408, %r402; + xor.b32 %r410, %r409, %r404; + shf.l.wrap.b32 %r411, %r410, %r410, 25; + add.s32 %r412, %r383, %r87; + add.s32 %r413, %r412, %r364; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 16; + add.s32 %r416, %r415, %r395; + xor.b32 %r417, %r416, %r383; + shf.l.wrap.b32 %r418, %r417, %r417, 20; + add.s32 %r419, %r413, %r157; + add.s32 %r420, %r419, %r418; + xor.b32 %r421, %r420, %r415; + shf.l.wrap.b32 %r422, %r421, %r421, 24; + add.s32 %r423, %r422, %r416; + xor.b32 %r424, %r423, %r418; + shf.l.wrap.b32 %r425, %r424, %r424, 25; + add.s32 %r426, %r378, %r164; + add.s32 %r427, %r426, %r397; + xor.b32 %r428, %r366, %r427; + shf.l.wrap.b32 %r429, %r428, %r428, 16; + add.s32 %r430, %r429, %r409; + xor.b32 %r431, %r430, %r397; + shf.l.wrap.b32 %r432, 
%r431, %r431, 20; + add.s32 %r433, %r427, %r115; + add.s32 %r434, %r433, %r432; + xor.b32 %r435, %r434, %r429; + shf.l.wrap.b32 %r436, %r435, %r435, 24; + add.s32 %r437, %r436, %r430; + xor.b32 %r438, %r437, %r432; + shf.l.wrap.b32 %r439, %r438, %r438, 25; + add.s32 %r440, %r392, %r143; + add.s32 %r441, %r440, %r411; + xor.b32 %r442, %r441, %r380; + shf.l.wrap.b32 %r443, %r442, %r442, 16; + add.s32 %r444, %r443, %r367; + xor.b32 %r445, %r444, %r411; + shf.l.wrap.b32 %r446, %r445, %r445, 20; + add.s32 %r447, %r441, %r178; + add.s32 %r448, %r447, %r446; + xor.b32 %r449, %r448, %r443; + shf.l.wrap.b32 %r450, %r449, %r449, 24; + add.s32 %r451, %r450, %r444; + xor.b32 %r452, %r451, %r446; + shf.l.wrap.b32 %r453, %r452, %r452, 25; + add.s32 %r454, %r406, %r185; + add.s32 %r455, %r454, %r369; + xor.b32 %r456, %r455, %r394; + shf.l.wrap.b32 %r457, %r456, %r456, 16; + add.s32 %r458, %r457, %r381; + xor.b32 %r459, %r458, %r369; + shf.l.wrap.b32 %r460, %r459, %r459, 20; + add.s32 %r461, %r455, %r136; + add.s32 %r462, %r461, %r460; + xor.b32 %r463, %r462, %r457; + shf.l.wrap.b32 %r464, %r463, %r463, 24; + add.s32 %r465, %r464, %r458; + xor.b32 %r466, %r465, %r460; + shf.l.wrap.b32 %r467, %r466, %r466, 25; + add.s32 %r468, %r420, %r101; + add.s32 %r469, %r468, %r467; + xor.b32 %r470, %r469, %r436; + shf.l.wrap.b32 %r471, %r470, %r470, 16; + add.s32 %r472, %r471, %r451; + xor.b32 %r473, %r472, %r467; + shf.l.wrap.b32 %r474, %r473, %r473, 20; + add.s32 %r475, %r469, %r108; + add.s32 %r476, %r475, %r474; + xor.b32 %r477, %r476, %r471; + shf.l.wrap.b32 %r478, %r477, %r477, 24; + add.s32 %r479, %r478, %r472; + xor.b32 %r480, %r479, %r474; + shf.l.wrap.b32 %r481, %r480, %r480, 25; + add.s32 %r482, %r434, %r150; + add.s32 %r483, %r482, %r425; + xor.b32 %r484, %r483, %r450; + shf.l.wrap.b32 %r485, %r484, %r484, 16; + add.s32 %r486, %r485, %r465; + xor.b32 %r487, %r486, %r425; + shf.l.wrap.b32 %r488, %r487, %r487, 20; + add.s32 %r489, %r483, %r164; + add.s32 %r490, %r489, %r488; + xor.b32 %r491, %r490, %r485; + shf.l.wrap.b32 %r492, %r491, %r491, 24; + add.s32 %r493, %r492, %r486; + xor.b32 %r494, %r493, %r488; + shf.l.wrap.b32 %r495, %r494, %r494, 25; + add.s32 %r496, %r448, %r171; + add.s32 %r497, %r496, %r439; + xor.b32 %r498, %r464, %r497; + shf.l.wrap.b32 %r499, %r498, %r498, 16; + add.s32 %r500, %r499, %r423; + xor.b32 %r501, %r500, %r439; + shf.l.wrap.b32 %r502, %r501, %r501, 20; + add.s32 %r503, %r497, %r94; + add.s32 %r504, %r503, %r502; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 24; + add.s32 %r507, %r506, %r500; + xor.b32 %r508, %r507, %r502; + shf.l.wrap.b32 %r509, %r508, %r508, 25; + add.s32 %r510, %r453, %r129; + add.s32 %r511, %r510, %r462; + xor.b32 %r512, %r511, %r422; + shf.l.wrap.b32 %r513, %r512, %r512, 16; + add.s32 %r514, %r513, %r437; + xor.b32 %r515, %r514, %r453; + shf.l.wrap.b32 %r516, %r515, %r515, 20; + add.s32 %r517, %r511, %r178; + add.s32 %r518, %r517, %r516; + xor.b32 %r519, %r518, %r513; + shf.l.wrap.b32 %r520, %r519, %r519, 24; + add.s32 %r521, %r520, %r514; + xor.b32 %r522, %r521, %r516; + shf.l.wrap.b32 %r523, %r522, %r522, 25; + add.s32 %r524, %r495, %r122; + add.s32 %r525, %r524, %r476; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 16; + add.s32 %r528, %r527, %r507; + xor.b32 %r529, %r528, %r495; + shf.l.wrap.b32 %r530, %r529, %r529, 20; + add.s32 %r531, %r525, %r115; + add.s32 %r532, %r531, %r530; + xor.b32 %r533, %r532, %r527; + shf.l.wrap.b32 %r534, %r533, %r533, 24; + add.s32 %r535, %r534, %r528; + xor.b32 %r536, %r535, 
%r530; + shf.l.wrap.b32 %r537, %r536, %r536, 25; + add.s32 %r538, %r490, %r143; + add.s32 %r539, %r538, %r509; + xor.b32 %r540, %r478, %r539; + shf.l.wrap.b32 %r541, %r540, %r540, 16; + add.s32 %r542, %r541, %r521; + xor.b32 %r543, %r542, %r509; + shf.l.wrap.b32 %r544, %r543, %r543, 20; + add.s32 %r545, %r539, %r80; + add.s32 %r546, %r545, %r544; + xor.b32 %r547, %r546, %r541; + shf.l.wrap.b32 %r548, %r547, %r547, 24; + add.s32 %r549, %r548, %r542; + xor.b32 %r550, %r549, %r544; + shf.l.wrap.b32 %r551, %r550, %r550, 25; + add.s32 %r552, %r504, %r157; + add.s32 %r553, %r552, %r523; + xor.b32 %r554, %r553, %r492; + shf.l.wrap.b32 %r555, %r554, %r554, 16; + add.s32 %r556, %r555, %r479; + xor.b32 %r557, %r556, %r523; + shf.l.wrap.b32 %r558, %r557, %r557, 20; + add.s32 %r559, %r553, %r185; + add.s32 %r560, %r559, %r558; + xor.b32 %r561, %r560, %r555; + shf.l.wrap.b32 %r562, %r561, %r561, 24; + add.s32 %r563, %r562, %r556; + xor.b32 %r564, %r563, %r558; + shf.l.wrap.b32 %r565, %r564, %r564, 25; + add.s32 %r566, %r518, %r136; + add.s32 %r567, %r566, %r481; + xor.b32 %r568, %r567, %r506; + shf.l.wrap.b32 %r569, %r568, %r568, 16; + add.s32 %r570, %r569, %r493; + xor.b32 %r571, %r570, %r481; + shf.l.wrap.b32 %r572, %r571, %r571, 20; + add.s32 %r573, %r567, %r87; + add.s32 %r574, %r573, %r572; + xor.b32 %r575, %r574, %r569; + shf.l.wrap.b32 %r576, %r575, %r575, 24; + add.s32 %r577, %r576, %r570; + xor.b32 %r578, %r577, %r572; + shf.l.wrap.b32 %r579, %r578, %r578, 25; + add.s32 %r580, %r532, %r150; + add.s32 %r581, %r580, %r579; + xor.b32 %r582, %r581, %r548; + shf.l.wrap.b32 %r583, %r582, %r582, 16; + add.s32 %r584, %r583, %r563; + xor.b32 %r585, %r584, %r579; + shf.l.wrap.b32 %r586, %r585, %r585, 20; + add.s32 %r587, %r581, %r129; + add.s32 %r588, %r587, %r586; + xor.b32 %r589, %r588, %r583; + shf.l.wrap.b32 %r590, %r589, %r589, 24; + add.s32 %r591, %r590, %r584; + xor.b32 %r592, %r591, %r586; + shf.l.wrap.b32 %r593, %r592, %r592, 25; + add.s32 %r594, %r546, %r164; + add.s32 %r595, %r594, %r537; + xor.b32 %r596, %r595, %r562; + shf.l.wrap.b32 %r597, %r596, %r596, 16; + add.s32 %r598, %r597, %r577; + xor.b32 %r599, %r598, %r537; + shf.l.wrap.b32 %r600, %r599, %r599, 20; + add.s32 %r601, %r595, %r143; + add.s32 %r602, %r601, %r600; + xor.b32 %r603, %r602, %r597; + shf.l.wrap.b32 %r604, %r603, %r603, 24; + add.s32 %r605, %r604, %r598; + xor.b32 %r606, %r605, %r600; + shf.l.wrap.b32 %r607, %r606, %r606, 25; + add.s32 %r608, %r560, %r178; + add.s32 %r609, %r608, %r551; + xor.b32 %r610, %r576, %r609; + shf.l.wrap.b32 %r611, %r610, %r610, 16; + add.s32 %r612, %r611, %r535; + xor.b32 %r613, %r612, %r551; + shf.l.wrap.b32 %r614, %r613, %r613, 20; + add.s32 %r615, %r609, %r101; + add.s32 %r616, %r615, %r614; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 24; + add.s32 %r619, %r618, %r612; + xor.b32 %r620, %r619, %r614; + shf.l.wrap.b32 %r621, %r620, %r620, 25; + add.s32 %r622, %r565, %r171; + add.s32 %r623, %r622, %r574; + xor.b32 %r624, %r623, %r534; + shf.l.wrap.b32 %r625, %r624, %r624, 16; + add.s32 %r626, %r625, %r549; + xor.b32 %r627, %r626, %r565; + shf.l.wrap.b32 %r628, %r627, %r627, 20; + add.s32 %r629, %r623, %r185; + add.s32 %r630, %r629, %r628; + xor.b32 %r631, %r630, %r625; + shf.l.wrap.b32 %r632, %r631, %r631, 24; + add.s32 %r633, %r632, %r626; + xor.b32 %r634, %r633, %r628; + shf.l.wrap.b32 %r635, %r634, %r634, 25; + add.s32 %r636, %r607, %r108; + add.s32 %r637, %r636, %r588; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 16; + add.s32 %r640, %r639, 
%r619; + xor.b32 %r641, %r640, %r607; + shf.l.wrap.b32 %r642, %r641, %r641, 20; + add.s32 %r643, %r637, %r80; + add.s32 %r644, %r643, %r642; + xor.b32 %r645, %r644, %r639; + shf.l.wrap.b32 %r646, %r645, %r645, 24; + add.s32 %r647, %r646, %r640; + xor.b32 %r648, %r647, %r642; + shf.l.wrap.b32 %r649, %r648, %r648, 25; + add.s32 %r650, %r602, %r157; + add.s32 %r651, %r650, %r621; + xor.b32 %r652, %r590, %r651; + shf.l.wrap.b32 %r653, %r652, %r652, 16; + add.s32 %r654, %r653, %r633; + xor.b32 %r655, %r654, %r621; + shf.l.wrap.b32 %r656, %r655, %r655, 20; + add.s32 %r657, %r651, %r94; + add.s32 %r658, %r657, %r656; + xor.b32 %r659, %r658, %r653; + shf.l.wrap.b32 %r660, %r659, %r659, 24; + add.s32 %r661, %r660, %r654; + xor.b32 %r662, %r661, %r656; + shf.l.wrap.b32 %r663, %r662, %r662, 25; + add.s32 %r664, %r616, %r115; + add.s32 %r665, %r664, %r635; + xor.b32 %r666, %r665, %r604; + shf.l.wrap.b32 %r667, %r666, %r666, 16; + add.s32 %r668, %r667, %r591; + xor.b32 %r669, %r668, %r635; + shf.l.wrap.b32 %r670, %r669, %r669, 20; + add.s32 %r671, %r665, %r136; + add.s32 %r672, %r671, %r670; + xor.b32 %r673, %r672, %r667; + shf.l.wrap.b32 %r674, %r673, %r673, 24; + add.s32 %r675, %r674, %r668; + xor.b32 %r676, %r675, %r670; + shf.l.wrap.b32 %r677, %r676, %r676, 25; + add.s32 %r678, %r630, %r87; + add.s32 %r679, %r678, %r593; + xor.b32 %r680, %r679, %r618; + shf.l.wrap.b32 %r681, %r680, %r680, 16; + add.s32 %r682, %r681, %r605; + xor.b32 %r683, %r682, %r593; + shf.l.wrap.b32 %r684, %r683, %r683, 20; + add.s32 %r685, %r679, %r122; + add.s32 %r686, %r685, %r684; + xor.b32 %r687, %r686, %r681; + shf.l.wrap.b32 %r688, %r687, %r687, 24; + add.s32 %r689, %r688, %r682; + xor.b32 %r690, %r689, %r684; + shf.l.wrap.b32 %r691, %r690, %r690, 25; + add.s32 %r692, %r644, %r164; + add.s32 %r693, %r692, %r691; + xor.b32 %r694, %r693, %r660; + shf.l.wrap.b32 %r695, %r694, %r694, 16; + add.s32 %r696, %r695, %r675; + xor.b32 %r697, %r696, %r691; + shf.l.wrap.b32 %r698, %r697, %r697, 20; + add.s32 %r699, %r693, %r171; + add.s32 %r700, %r699, %r698; + xor.b32 %r701, %r700, %r695; + shf.l.wrap.b32 %r702, %r701, %r701, 24; + add.s32 %r703, %r702, %r696; + xor.b32 %r704, %r703, %r698; + shf.l.wrap.b32 %r705, %r704, %r704, 25; + add.s32 %r706, %r658, %r143; + add.s32 %r707, %r706, %r649; + xor.b32 %r708, %r707, %r674; + shf.l.wrap.b32 %r709, %r708, %r708, 16; + add.s32 %r710, %r709, %r689; + xor.b32 %r711, %r710, %r649; + shf.l.wrap.b32 %r712, %r711, %r711, 20; + add.s32 %r713, %r707, %r157; + add.s32 %r714, %r713, %r712; + xor.b32 %r715, %r714, %r709; + shf.l.wrap.b32 %r716, %r715, %r715, 24; + add.s32 %r717, %r716, %r710; + xor.b32 %r718, %r717, %r712; + shf.l.wrap.b32 %r719, %r718, %r718, 25; + add.s32 %r720, %r672, %r185; + add.s32 %r721, %r720, %r663; + xor.b32 %r722, %r688, %r721; + shf.l.wrap.b32 %r723, %r722, %r722, 16; + add.s32 %r724, %r723, %r647; + xor.b32 %r725, %r724, %r663; + shf.l.wrap.b32 %r726, %r725, %r725, 20; + add.s32 %r727, %r721, %r150; + add.s32 %r728, %r727, %r726; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 24; + add.s32 %r731, %r730, %r724; + xor.b32 %r732, %r731, %r726; + shf.l.wrap.b32 %r733, %r732, %r732, 25; + add.s32 %r734, %r677, %r178; + add.s32 %r735, %r734, %r686; + xor.b32 %r736, %r735, %r646; + shf.l.wrap.b32 %r737, %r736, %r736, 16; + add.s32 %r738, %r737, %r661; + xor.b32 %r739, %r738, %r677; + shf.l.wrap.b32 %r740, %r739, %r739, 20; + add.s32 %r741, %r735, %r136; + add.s32 %r742, %r741, %r740; + xor.b32 %r743, %r742, %r737; + shf.l.wrap.b32 %r744, %r743, %r743, 
24; + add.s32 %r745, %r744, %r738; + xor.b32 %r746, %r745, %r740; + shf.l.wrap.b32 %r747, %r746, %r746, 25; + add.s32 %r748, %r719, %r129; + add.s32 %r749, %r748, %r700; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 16; + add.s32 %r752, %r751, %r731; + xor.b32 %r753, %r752, %r719; + shf.l.wrap.b32 %r754, %r753, %r753, 20; + add.s32 %r755, %r749, %r94; + add.s32 %r756, %r755, %r754; + xor.b32 %r757, %r756, %r751; + shf.l.wrap.b32 %r758, %r757, %r757, 24; + add.s32 %r759, %r758, %r752; + xor.b32 %r760, %r759, %r754; + shf.l.wrap.b32 %r761, %r760, %r760, 25; + add.s32 %r762, %r714, %r115; + add.s32 %r763, %r762, %r733; + xor.b32 %r764, %r702, %r763; + shf.l.wrap.b32 %r765, %r764, %r764, 16; + add.s32 %r766, %r765, %r745; + xor.b32 %r767, %r766, %r733; + shf.l.wrap.b32 %r768, %r767, %r767, 20; + add.s32 %r769, %r763, %r101; + add.s32 %r770, %r769, %r768; + xor.b32 %r771, %r770, %r765; + shf.l.wrap.b32 %r772, %r771, %r771, 24; + add.s32 %r773, %r772, %r766; + xor.b32 %r774, %r773, %r768; + shf.l.wrap.b32 %r775, %r774, %r774, 25; + add.s32 %r776, %r728, %r80; + add.s32 %r777, %r776, %r747; + xor.b32 %r778, %r777, %r716; + shf.l.wrap.b32 %r779, %r778, %r778, 16; + add.s32 %r780, %r779, %r703; + xor.b32 %r781, %r780, %r747; + shf.l.wrap.b32 %r782, %r781, %r781, 20; + add.s32 %r783, %r777, %r87; + add.s32 %r784, %r783, %r782; + xor.b32 %r785, %r784, %r779; + shf.l.wrap.b32 %r786, %r785, %r785, 24; + add.s32 %r787, %r786, %r780; + xor.b32 %r788, %r787, %r782; + shf.l.wrap.b32 %r789, %r788, %r788, 25; + add.s32 %r790, %r742, %r122; + add.s32 %r791, %r790, %r705; + xor.b32 %r792, %r791, %r730; + shf.l.wrap.b32 %r793, %r792, %r792, 16; + add.s32 %r794, %r793, %r717; + xor.b32 %r795, %r794, %r705; + shf.l.wrap.b32 %r796, %r795, %r795, 20; + add.s32 %r797, %r791, %r108; + add.s32 %r798, %r797, %r796; + xor.b32 %r799, %r798, %r793; + shf.l.wrap.b32 %r800, %r799, %r799, 24; + add.s32 %r801, %r800, %r794; + xor.b32 %r802, %r801, %r796; + shf.l.wrap.b32 %r803, %r802, %r802, 25; + add.s32 %r804, %r756, %r143; + add.s32 %r805, %r804, %r803; + xor.b32 %r806, %r805, %r772; + shf.l.wrap.b32 %r807, %r806, %r806, 16; + add.s32 %r808, %r807, %r787; + xor.b32 %r809, %r808, %r803; + shf.l.wrap.b32 %r810, %r809, %r809, 20; + add.s32 %r811, %r805, %r178; + add.s32 %r812, %r811, %r810; + xor.b32 %r813, %r812, %r807; + shf.l.wrap.b32 %r814, %r813, %r813, 24; + add.s32 %r815, %r814, %r808; + xor.b32 %r816, %r815, %r810; + shf.l.wrap.b32 %r817, %r816, %r816, 25; + add.s32 %r818, %r770, %r157; + add.s32 %r819, %r818, %r761; + xor.b32 %r820, %r819, %r786; + shf.l.wrap.b32 %r821, %r820, %r820, 16; + add.s32 %r822, %r821, %r801; + xor.b32 %r823, %r822, %r761; + shf.l.wrap.b32 %r824, %r823, %r823, 20; + add.s32 %r825, %r819, %r115; + add.s32 %r826, %r825, %r824; + xor.b32 %r827, %r826, %r821; + shf.l.wrap.b32 %r828, %r827, %r827, 24; + add.s32 %r829, %r828, %r822; + xor.b32 %r830, %r829, %r824; + shf.l.wrap.b32 %r831, %r830, %r830, 25; + add.s32 %r832, %r784, %r136; + add.s32 %r833, %r832, %r775; + xor.b32 %r834, %r800, %r833; + shf.l.wrap.b32 %r835, %r834, %r834, 16; + add.s32 %r836, %r835, %r759; + xor.b32 %r837, %r836, %r775; + shf.l.wrap.b32 %r838, %r837, %r837, 20; + add.s32 %r839, %r833, %r164; + add.s32 %r840, %r839, %r838; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 24; + add.s32 %r843, %r842, %r836; + xor.b32 %r844, %r843, %r838; + shf.l.wrap.b32 %r845, %r844, %r844, 25; + add.s32 %r846, %r789, %r185; + add.s32 %r847, %r846, %r798; + xor.b32 %r848, %r847, %r758; + 
shf.l.wrap.b32 %r849, %r848, %r848, 16; + add.s32 %r850, %r849, %r773; + xor.b32 %r851, %r850, %r789; + shf.l.wrap.b32 %r852, %r851, %r851, 20; + add.s32 %r853, %r847, %r87; + add.s32 %r854, %r853, %r852; + xor.b32 %r855, %r854, %r849; + shf.l.wrap.b32 %r856, %r855, %r855, 24; + add.s32 %r857, %r856, %r850; + xor.b32 %r858, %r857, %r852; + shf.l.wrap.b32 %r859, %r858, %r858, 25; + add.s32 %r860, %r831, %r171; + add.s32 %r861, %r860, %r812; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 16; + add.s32 %r864, %r863, %r843; + xor.b32 %r865, %r864, %r831; + shf.l.wrap.b32 %r866, %r865, %r865, 20; + add.s32 %r867, %r861, %r101; + add.s32 %r868, %r867, %r866; + xor.b32 %r869, %r868, %r863; + shf.l.wrap.b32 %r870, %r869, %r869, 24; + add.s32 %r871, %r870, %r864; + xor.b32 %r872, %r871, %r866; + shf.l.wrap.b32 %r873, %r872, %r872, 25; + add.s32 %r874, %r826, %r80; + add.s32 %r875, %r874, %r845; + xor.b32 %r876, %r814, %r875; + shf.l.wrap.b32 %r877, %r876, %r876, 16; + add.s32 %r878, %r877, %r857; + xor.b32 %r879, %r878, %r845; + shf.l.wrap.b32 %r880, %r879, %r879, 20; + add.s32 %r881, %r875, %r150; + add.s32 %r882, %r881, %r880; + xor.b32 %r883, %r882, %r877; + shf.l.wrap.b32 %r884, %r883, %r883, 24; + add.s32 %r885, %r884, %r878; + xor.b32 %r886, %r885, %r880; + shf.l.wrap.b32 %r887, %r886, %r886, 25; + add.s32 %r888, %r840, %r94; + add.s32 %r889, %r888, %r859; + xor.b32 %r890, %r889, %r828; + shf.l.wrap.b32 %r891, %r890, %r890, 16; + add.s32 %r892, %r891, %r815; + xor.b32 %r893, %r892, %r859; + shf.l.wrap.b32 %r894, %r893, %r893, 20; + add.s32 %r895, %r889, %r122; + add.s32 %r896, %r895, %r894; + xor.b32 %r897, %r896, %r891; + shf.l.wrap.b32 %r898, %r897, %r897, 24; + add.s32 %r899, %r898, %r892; + xor.b32 %r900, %r899, %r894; + shf.l.wrap.b32 %r901, %r900, %r900, 25; + add.s32 %r902, %r854, %r108; + add.s32 %r903, %r902, %r817; + xor.b32 %r904, %r903, %r842; + shf.l.wrap.b32 %r905, %r904, %r904, 16; + add.s32 %r906, %r905, %r829; + xor.b32 %r907, %r906, %r817; + shf.l.wrap.b32 %r908, %r907, %r907, 20; + add.s32 %r909, %r903, %r129; + add.s32 %r910, %r909, %r908; + xor.b32 %r911, %r910, %r905; + shf.l.wrap.b32 %r912, %r911, %r911, 24; + add.s32 %r913, %r912, %r906; + xor.b32 %r914, %r913, %r908; + shf.l.wrap.b32 %r915, %r914, %r914, 25; + add.s32 %r916, %r868, %r157; + add.s32 %r917, %r916, %r915; + xor.b32 %r918, %r917, %r884; + shf.l.wrap.b32 %r919, %r918, %r918, 16; + add.s32 %r920, %r919, %r899; + xor.b32 %r921, %r920, %r915; + shf.l.wrap.b32 %r922, %r921, %r921, 20; + add.s32 %r923, %r917, %r185; + add.s32 %r924, %r923, %r922; + xor.b32 %r925, %r924, %r919; + shf.l.wrap.b32 %r926, %r925, %r925, 24; + add.s32 %r927, %r926, %r920; + xor.b32 %r928, %r927, %r922; + shf.l.wrap.b32 %r929, %r928, %r928, 25; + add.s32 %r930, %r882, %r115; + add.s32 %r931, %r930, %r873; + xor.b32 %r932, %r931, %r898; + shf.l.wrap.b32 %r933, %r932, %r932, 16; + add.s32 %r934, %r933, %r913; + xor.b32 %r935, %r934, %r873; + shf.l.wrap.b32 %r936, %r935, %r935, 20; + add.s32 %r937, %r931, %r80; + add.s32 %r938, %r937, %r936; + xor.b32 %r939, %r938, %r933; + shf.l.wrap.b32 %r940, %r939, %r939, 24; + add.s32 %r941, %r940, %r934; + xor.b32 %r942, %r941, %r936; + shf.l.wrap.b32 %r943, %r942, %r942, 25; + add.s32 %r944, %r896, %r87; + add.s32 %r945, %r944, %r887; + xor.b32 %r946, %r912, %r945; + shf.l.wrap.b32 %r947, %r946, %r946, 16; + add.s32 %r948, %r947, %r871; + xor.b32 %r949, %r948, %r887; + shf.l.wrap.b32 %r950, %r949, %r949, 20; + add.s32 %r951, %r945, %r143; + add.s32 %r952, %r951, %r950; + 
xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 24; + add.s32 %r955, %r954, %r948; + xor.b32 %r956, %r955, %r950; + shf.l.wrap.b32 %r957, %r956, %r956, 25; + add.s32 %r958, %r901, %r136; + add.s32 %r959, %r958, %r910; + xor.b32 %r960, %r959, %r870; + shf.l.wrap.b32 %r961, %r960, %r960, 16; + add.s32 %r962, %r961, %r885; + xor.b32 %r963, %r962, %r901; + shf.l.wrap.b32 %r964, %r963, %r963, 20; + add.s32 %r965, %r959, %r122; + add.s32 %r966, %r965, %r964; + xor.b32 %r967, %r966, %r961; + shf.l.wrap.b32 %r968, %r967, %r967, 24; + add.s32 %r969, %r968, %r962; + xor.b32 %r970, %r969, %r964; + shf.l.wrap.b32 %r971, %r970, %r970, 25; + add.s32 %r972, %r943, %r178; + add.s32 %r973, %r972, %r924; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 16; + add.s32 %r976, %r975, %r955; + xor.b32 %r977, %r976, %r943; + shf.l.wrap.b32 %r978, %r977, %r977, 20; + add.s32 %r979, %r973, %r150; + add.s32 %r980, %r979, %r978; + xor.b32 %r981, %r980, %r975; + shf.l.wrap.b32 %r982, %r981, %r981, 24; + add.s32 %r983, %r982, %r976; + xor.b32 %r984, %r983, %r978; + shf.l.wrap.b32 %r985, %r984, %r984, 25; + add.s32 %r986, %r938, %r94; + add.s32 %r987, %r986, %r957; + xor.b32 %r988, %r926, %r987; + shf.l.wrap.b32 %r989, %r988, %r988, 16; + add.s32 %r990, %r989, %r969; + xor.b32 %r991, %r990, %r957; + shf.l.wrap.b32 %r992, %r991, %r991, 20; + add.s32 %r993, %r987, %r164; + add.s32 %r994, %r993, %r992; + xor.b32 %r995, %r994, %r989; + shf.l.wrap.b32 %r996, %r995, %r995, 24; + add.s32 %r997, %r996, %r990; + xor.b32 %r998, %r997, %r992; + shf.l.wrap.b32 %r999, %r998, %r998, 25; + add.s32 %r1000, %r952, %r101; + add.s32 %r1001, %r1000, %r971; + xor.b32 %r1002, %r1001, %r940; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 16; + add.s32 %r1004, %r1003, %r927; + xor.b32 %r1005, %r1004, %r971; + shf.l.wrap.b32 %r1006, %r1005, %r1005, 20; + add.s32 %r1007, %r1001, %r108; + add.s32 %r1008, %r1007, %r1006; + xor.b32 %r1009, %r1008, %r1003; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 24; + add.s32 %r1011, %r1010, %r1004; + xor.b32 %r1012, %r1011, %r1006; + shf.l.wrap.b32 %r1013, %r1012, %r1012, 25; + add.s32 %r1014, %r966, %r129; + add.s32 %r1015, %r1014, %r929; + xor.b32 %r1016, %r1015, %r954; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 16; + add.s32 %r1018, %r1017, %r941; + xor.b32 %r1019, %r1018, %r929; + shf.l.wrap.b32 %r1020, %r1019, %r1019, 20; + add.s32 %r1021, %r1015, %r171; + add.s32 %r1022, %r1021, %r1020; + xor.b32 %r1023, %r1022, %r1017; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 24; + add.s32 %r1025, %r1024, %r1018; + xor.b32 %r1026, %r1025, %r1020; + shf.l.wrap.b32 %r1027, %r1026, %r1026, 25; + xor.b32 %r1028, %r1011, %r980; + xor.b32 %r1029, %r1025, %r994; + xor.b32 %r1030, %r983, %r1008; + xor.b32 %r1031, %r1022, %r997; + xor.b32 %r1032, %r1027, %r996; + xor.b32 %r1033, %r985, %r1010; + xor.b32 %r1034, %r1024, %r999; + xor.b32 %r1035, %r1013, %r982; + st.local.u8 [%rd154], %r1028; + shr.u32 %r1036, %r1028, 8; + st.local.u8 [%rd154+1], %r1036; + shr.u32 %r1037, %r1028, 16; + st.local.u8 [%rd154+2], %r1037; + shr.u32 %r1038, %r1028, 24; + st.local.u8 [%rd154+3], %r1038; + st.local.u8 [%rd154+4], %r1029; + shr.u32 %r1039, %r1029, 8; + st.local.u8 [%rd154+5], %r1039; + shr.u32 %r1040, %r1029, 16; + st.local.u8 [%rd154+6], %r1040; + shr.u32 %r1041, %r1029, 24; + st.local.u8 [%rd154+7], %r1041; + st.local.u8 [%rd154+8], %r1030; + shr.u32 %r1042, %r1030, 8; + st.local.u8 [%rd154+9], %r1042; + shr.u32 %r1043, %r1030, 16; + st.local.u8 [%rd154+10], %r1043; + shr.u32 %r1044, %r1030, 24; + st.local.u8 
[%rd154+11], %r1044; + st.local.u8 [%rd154+12], %r1031; + shr.u32 %r1045, %r1031, 8; + st.local.u8 [%rd154+13], %r1045; + shr.u32 %r1046, %r1031, 16; + st.local.u8 [%rd154+14], %r1046; + shr.u32 %r1047, %r1031, 24; + st.local.u8 [%rd154+15], %r1047; + st.local.u8 [%rd154+16], %r1032; + shr.u32 %r1048, %r1032, 8; + st.local.u8 [%rd154+17], %r1048; + shr.u32 %r1049, %r1032, 16; + st.local.u8 [%rd154+18], %r1049; + shr.u32 %r1050, %r1032, 24; + st.local.u8 [%rd154+19], %r1050; + st.local.u8 [%rd154+20], %r1033; + shr.u32 %r1051, %r1033, 8; + st.local.u8 [%rd154+21], %r1051; + shr.u32 %r1052, %r1033, 16; + st.local.u8 [%rd154+22], %r1052; + shr.u32 %r1053, %r1033, 24; + st.local.u8 [%rd154+23], %r1053; + st.local.u8 [%rd154+24], %r1034; + shr.u32 %r1054, %r1034, 8; + st.local.u8 [%rd154+25], %r1054; + shr.u32 %r1055, %r1034, 16; + st.local.u8 [%rd154+26], %r1055; + shr.u32 %r1056, %r1034, 24; + st.local.u8 [%rd154+27], %r1056; + st.local.u8 [%rd154+28], %r1035; + shr.u32 %r1057, %r1035, 8; + st.local.u8 [%rd154+29], %r1057; + shr.u32 %r1058, %r1035, 16; + st.local.u8 [%rd154+30], %r1058; + shr.u32 %r1059, %r1035, 24; + st.local.u8 [%rd154+31], %r1059; + add.s64 %rd152, %rd152, 8; + add.s64 %rd154, %rd154, 32; + add.s64 %rd153, %rd153, -1; + setp.ne.s64 %p12, %rd153, 0; + @%p12 bra $L__BB0_7; + +$L__BB0_8: + setp.le.u64 %p13, %rd7, %rd151; + @%p13 bra $L__BB0_30; + + add.u64 %rd143, %SPL, 96; + ld.param.u64 %rd141, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd140, %rd141; + shl.b64 %rd110, %rd150, 6; + shl.b64 %rd111, %rd150, 5; + add.s64 %rd27, %rd140, %rd111; + add.s64 %rd28, %rd143, %rd110; + mov.u64 %rd155, 0; + +$L__BB0_10: + add.s64 %rd112, %rd28, %rd155; + ld.local.u8 %rs77, [%rd112]; + add.s64 %rd113, %rd27, %rd155; + st.local.u8 [%rd113], %rs77; + add.s64 %rd155, %rd155, 1; + setp.lt.u64 %p14, %rd155, 32; + @%p14 bra $L__BB0_10; + + add.s64 %rd150, %rd150, 1; + +$L__BB0_30: + st.param.b64 [func_retval0+0], %rd150; + ret; + +} +.func _Z20blake3_hasher_updateP13blake3_hasherPKvy( + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0, + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1 +) +{ + .local .align 16 .b8 __local_depot1[144]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<54>; + .reg .b16 %rs<393>; + .reg .b32 %r<11690>; + .reg .b64 %rd<272>; + + + mov.u64 %SPL, __local_depot1; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd98, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + ld.param.u64 %rd253, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd260, %rd253; + cvta.to.local.u64 %rd2, %rd98; + add.s64 %rd3, %rd2, 136; + mov.u64 %rd261, 32; + ld.local.v2.u8 {%rs102, %rs103}, [%rd2+136]; + cvt.u64.u16 %rd4, %rs103; + cvt.u32.u16 %r144, %rs103; + mul.wide.u32 %rd101, %r144, 64; + cvt.u64.u16 %rd5, %rs102; + neg.s64 %rd102, %rd5; + setp.eq.s64 %p1, %rd101, %rd102; + @%p1 bra $L__BB1_24; + + shl.b64 %rd103, %rd4, 6; + mov.u64 %rd104, 1024; + sub.s64 %rd105, %rd104, %rd5; + sub.s64 %rd106, %rd105, %rd103; + min.u64 %rd6, %rd106, 32; + setp.eq.s16 %p2, %rs102, 0; + mov.u16 %rs351, 0; + mov.u64 %rd243, %rd6; + @%p2 bra $L__BB1_9; + + cvt.u32.u16 %r145, %rs102; + prmt.b32 %r147, %r144, %r145, 30212; + cvt.u16.u32 %rs350, %r147; + mov.u64 %rd107, 64; + sub.s64 %rd108, %rd107, %rd5; + min.u64 %rd7, %rd108, %rd6; + setp.eq.s64 %p3, %rd7, 0; + @%p3 bra $L__BB1_6; + + add.s64 %rd110, %rd2, %rd5; + add.s64 %rd8, %rd110, 72; + mov.u64 %rd236, 0; + 
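+ // ---- editorial annotation, not compiler output ----
+ // Assumption: $L__BB1_4 below mirrors the buffered-input path of the
+ // reference blake3_hasher_update. %rd7 was computed above as roughly
+ //   take = min(64 - buf_len, want);   // room left in the 64-byte block
+ // and the loop is a byte-wise
+ //   memcpy(chunk.buf + buf_len, input, take); buf_len += take;
+ // with the updated buf_len re-stored at [%rd3]. The long ld.local.u8 +
+ // prmt.b32 runs that follow (selectors 30212/28756/1620) assemble
+ // little-endian u32 message and state words from individual bytes, and
+ // the constants 1779033703, -1150833019, 1013904242, -1521486534 are
+ // IV[0..3] (0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A), seeding
+ // v[8..11] of the compression state.
+ // ---- end annotation ----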
+$L__BB1_4: + add.s64 %rd111, %rd260, %rd236; + ld.local.u8 %rs107, [%rd111]; + add.s64 %rd112, %rd8, %rd236; + st.local.u8 [%rd112], %rs107; + add.s64 %rd236, %rd236, 1; + setp.lt.u64 %p4, %rd236, %rd7; + @%p4 bra $L__BB1_4; + + ld.local.u8 %rs350, [%rd3]; + +$L__BB1_6: + cvt.u16.u64 %rs108, %rd7; + add.s16 %rs351, %rs350, %rs108; + mov.u64 %rd243, 0; + st.local.u8 [%rd3], %rs351; + add.s64 %rd260, %rd260, %rd7; + sub.s64 %rd12, %rd6, %rd7; + setp.eq.s64 %p5, %rd12, 0; + @%p5 bra $L__BB1_9; + + add.s64 %rd13, %rd2, 72; + ld.local.u8 %rs109, [%rd3+1]; + mov.u64 %rd237, 0; + setp.eq.s16 %p6, %rs109, 0; + mov.u16 %rs351, 0; + selp.u16 %rs111, 1, 0, %p6; + ld.local.u8 %rs112, [%rd3+2]; + or.b16 %rs113, %rs112, %rs111; + ld.local.u8 %r148, [%rd3+-64]; + ld.local.u8 %r149, [%rd3+-63]; + prmt.b32 %r150, %r149, %r148, 30212; + ld.local.u8 %r151, [%rd3+-62]; + prmt.b32 %r152, %r151, %r150, 28756; + ld.local.u8 %r153, [%rd3+-61]; + prmt.b32 %r154, %r153, %r152, 1620; + ld.local.u8 %r155, [%rd3+-60]; + ld.local.u8 %r156, [%rd3+-59]; + prmt.b32 %r157, %r156, %r155, 30212; + ld.local.u8 %r158, [%rd3+-58]; + prmt.b32 %r159, %r158, %r157, 28756; + ld.local.u8 %r160, [%rd3+-57]; + prmt.b32 %r161, %r160, %r159, 1620; + ld.local.u8 %r162, [%rd3+-56]; + ld.local.u8 %r163, [%rd3+-55]; + prmt.b32 %r164, %r163, %r162, 30212; + ld.local.u8 %r165, [%rd3+-54]; + prmt.b32 %r166, %r165, %r164, 28756; + ld.local.u8 %r167, [%rd3+-53]; + prmt.b32 %r168, %r167, %r166, 1620; + ld.local.u8 %r169, [%rd3+-52]; + ld.local.u8 %r170, [%rd3+-51]; + prmt.b32 %r171, %r170, %r169, 30212; + ld.local.u8 %r172, [%rd3+-50]; + prmt.b32 %r173, %r172, %r171, 28756; + ld.local.u8 %r174, [%rd3+-49]; + prmt.b32 %r175, %r174, %r173, 1620; + ld.local.u8 %r176, [%rd3+-48]; + ld.local.u8 %r177, [%rd3+-47]; + prmt.b32 %r178, %r177, %r176, 30212; + ld.local.u8 %r179, [%rd3+-46]; + prmt.b32 %r180, %r179, %r178, 28756; + ld.local.u8 %r181, [%rd3+-45]; + prmt.b32 %r182, %r181, %r180, 1620; + ld.local.u8 %r183, [%rd3+-44]; + ld.local.u8 %r184, [%rd3+-43]; + prmt.b32 %r185, %r184, %r183, 30212; + ld.local.u8 %r186, [%rd3+-42]; + prmt.b32 %r187, %r186, %r185, 28756; + ld.local.u8 %r188, [%rd3+-41]; + prmt.b32 %r189, %r188, %r187, 1620; + ld.local.u8 %r190, [%rd3+-40]; + ld.local.u8 %r191, [%rd3+-39]; + prmt.b32 %r192, %r191, %r190, 30212; + ld.local.u8 %r193, [%rd3+-38]; + prmt.b32 %r194, %r193, %r192, 28756; + ld.local.u8 %r195, [%rd3+-37]; + prmt.b32 %r196, %r195, %r194, 1620; + ld.local.u8 %r197, [%rd3+-36]; + ld.local.u8 %r198, [%rd3+-35]; + prmt.b32 %r199, %r198, %r197, 30212; + ld.local.u8 %r200, [%rd3+-34]; + prmt.b32 %r201, %r200, %r199, 28756; + ld.local.u8 %r202, [%rd3+-33]; + prmt.b32 %r203, %r202, %r201, 1620; + ld.local.u8 %r204, [%rd3+-32]; + ld.local.u8 %r205, [%rd3+-31]; + prmt.b32 %r206, %r205, %r204, 30212; + ld.local.u8 %r207, [%rd3+-30]; + prmt.b32 %r208, %r207, %r206, 28756; + ld.local.u8 %r209, [%rd3+-29]; + prmt.b32 %r210, %r209, %r208, 1620; + ld.local.u8 %r211, [%rd3+-28]; + ld.local.u8 %r212, [%rd3+-27]; + prmt.b32 %r213, %r212, %r211, 30212; + ld.local.u8 %r214, [%rd3+-26]; + prmt.b32 %r215, %r214, %r213, 28756; + ld.local.u8 %r216, [%rd3+-25]; + prmt.b32 %r217, %r216, %r215, 1620; + ld.local.u8 %r218, [%rd3+-24]; + ld.local.u8 %r219, [%rd3+-23]; + prmt.b32 %r220, %r219, %r218, 30212; + ld.local.u8 %r221, [%rd3+-22]; + prmt.b32 %r222, %r221, %r220, 28756; + ld.local.u8 %r223, [%rd3+-21]; + prmt.b32 %r224, %r223, %r222, 1620; + ld.local.u8 %r225, [%rd3+-20]; + ld.local.u8 %r226, [%rd3+-19]; + prmt.b32 %r227, %r226, %r225, 
30212; + ld.local.u8 %r228, [%rd3+-18]; + prmt.b32 %r229, %r228, %r227, 28756; + ld.local.u8 %r230, [%rd3+-17]; + prmt.b32 %r231, %r230, %r229, 1620; + ld.local.u8 %r232, [%rd3+-16]; + ld.local.u8 %r233, [%rd3+-15]; + prmt.b32 %r234, %r233, %r232, 30212; + ld.local.u8 %r235, [%rd3+-14]; + prmt.b32 %r236, %r235, %r234, 28756; + ld.local.u8 %r237, [%rd3+-13]; + prmt.b32 %r238, %r237, %r236, 1620; + ld.local.u8 %r239, [%rd3+-12]; + ld.local.u8 %r240, [%rd3+-11]; + prmt.b32 %r241, %r240, %r239, 30212; + ld.local.u8 %r242, [%rd3+-10]; + prmt.b32 %r243, %r242, %r241, 28756; + ld.local.u8 %r244, [%rd3+-9]; + prmt.b32 %r245, %r244, %r243, 1620; + ld.local.u8 %r246, [%rd3+-8]; + ld.local.u8 %r247, [%rd3+-7]; + prmt.b32 %r248, %r247, %r246, 30212; + ld.local.u8 %r249, [%rd3+-6]; + prmt.b32 %r250, %r249, %r248, 28756; + ld.local.u8 %r251, [%rd3+-5]; + prmt.b32 %r252, %r251, %r250, 1620; + ld.local.u8 %r253, [%rd3+-4]; + ld.local.u8 %r254, [%rd3+-3]; + prmt.b32 %r255, %r254, %r253, 30212; + ld.local.u8 %r256, [%rd3+-2]; + prmt.b32 %r257, %r256, %r255, 28756; + ld.local.u8 %r258, [%rd3+-1]; + prmt.b32 %r259, %r258, %r257, 1620; + ld.local.u64 %rd115, [%rd3+-72]; + cvt.u32.u64 %r260, %rd115; + shr.u64 %rd116, %rd115, 32; + cvt.u32.u64 %r261, %rd116; + cvt.u32.u16 %r262, %rs113; + and.b32 %r263, %r262, 255; + ld.local.u32 %r264, [%rd3+-104]; + add.s32 %r265, %r264, %r154; + ld.local.u32 %r266, [%rd3+-88]; + add.s32 %r267, %r265, %r266; + xor.b32 %r268, %r267, %r260; + shf.l.wrap.b32 %r269, %r268, %r268, 16; + add.s32 %r270, %r269, 1779033703; + xor.b32 %r271, %r270, %r266; + shf.l.wrap.b32 %r272, %r271, %r271, 20; + add.s32 %r273, %r267, %r161; + add.s32 %r274, %r273, %r272; + xor.b32 %r275, %r274, %r269; + shf.l.wrap.b32 %r276, %r275, %r275, 24; + add.s32 %r277, %r276, %r270; + xor.b32 %r278, %r277, %r272; + shf.l.wrap.b32 %r279, %r278, %r278, 25; + ld.local.u32 %r280, [%rd3+-100]; + add.s32 %r281, %r280, %r168; + ld.local.u32 %r282, [%rd3+-84]; + add.s32 %r283, %r281, %r282; + xor.b32 %r284, %r283, %r261; + shf.l.wrap.b32 %r285, %r284, %r284, 16; + add.s32 %r286, %r285, -1150833019; + xor.b32 %r287, %r286, %r282; + shf.l.wrap.b32 %r288, %r287, %r287, 20; + add.s32 %r289, %r283, %r175; + add.s32 %r290, %r289, %r288; + xor.b32 %r291, %r290, %r285; + shf.l.wrap.b32 %r292, %r291, %r291, 24; + add.s32 %r293, %r292, %r286; + xor.b32 %r294, %r293, %r288; + shf.l.wrap.b32 %r295, %r294, %r294, 25; + ld.local.u32 %r296, [%rd3+-96]; + add.s32 %r297, %r296, %r182; + ld.local.u32 %r298, [%rd3+-80]; + add.s32 %r299, %r297, %r298; + shr.u32 %r300, %r299, 16; + shl.b32 %r301, %r299, 16; + xor.b32 %r302, %r301, 4194304; + or.b32 %r303, %r302, %r300; + add.s32 %r304, %r303, 1013904242; + xor.b32 %r305, %r304, %r298; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r299, %r189; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + ld.local.u32 %r314, [%rd3+-92]; + add.s32 %r315, %r314, %r196; + ld.local.u32 %r316, [%rd3+-76]; + add.s32 %r317, %r315, %r316; + xor.b32 %r318, %r317, %r263; + shr.u32 %r319, %r317, 16; + shl.b32 %r320, %r318, 16; + or.b32 %r321, %r320, %r319; + add.s32 %r322, %r321, -1521486534; + xor.b32 %r323, %r322, %r316; + shf.l.wrap.b32 %r324, %r323, %r323, 20; + add.s32 %r325, %r317, %r203; + add.s32 %r326, %r325, %r324; + xor.b32 %r327, %r326, %r321; + shf.l.wrap.b32 %r328, %r327, %r327, 24; + add.s32 %r329, %r328, %r322; + xor.b32 
%r330, %r329, %r324; + shf.l.wrap.b32 %r331, %r330, %r330, 25; + add.s32 %r332, %r274, %r210; + add.s32 %r333, %r332, %r295; + xor.b32 %r334, %r333, %r328; + shf.l.wrap.b32 %r335, %r334, %r334, 16; + add.s32 %r336, %r335, %r311; + xor.b32 %r337, %r336, %r295; + shf.l.wrap.b32 %r338, %r337, %r337, 20; + add.s32 %r339, %r333, %r217; + add.s32 %r340, %r339, %r338; + xor.b32 %r341, %r340, %r335; + shf.l.wrap.b32 %r342, %r341, %r341, 24; + add.s32 %r343, %r342, %r336; + xor.b32 %r344, %r343, %r338; + shf.l.wrap.b32 %r345, %r344, %r344, 25; + add.s32 %r346, %r290, %r224; + add.s32 %r347, %r346, %r313; + xor.b32 %r348, %r347, %r276; + shf.l.wrap.b32 %r349, %r348, %r348, 16; + add.s32 %r350, %r349, %r329; + xor.b32 %r351, %r350, %r313; + shf.l.wrap.b32 %r352, %r351, %r351, 20; + add.s32 %r353, %r347, %r231; + add.s32 %r354, %r353, %r352; + xor.b32 %r355, %r354, %r349; + shf.l.wrap.b32 %r356, %r355, %r355, 24; + add.s32 %r357, %r356, %r350; + xor.b32 %r358, %r357, %r352; + shf.l.wrap.b32 %r359, %r358, %r358, 25; + add.s32 %r360, %r308, %r238; + add.s32 %r361, %r360, %r331; + xor.b32 %r362, %r361, %r292; + shf.l.wrap.b32 %r363, %r362, %r362, 16; + add.s32 %r364, %r363, %r277; + xor.b32 %r365, %r364, %r331; + shf.l.wrap.b32 %r366, %r365, %r365, 20; + add.s32 %r367, %r361, %r245; + add.s32 %r368, %r367, %r366; + xor.b32 %r369, %r368, %r363; + shf.l.wrap.b32 %r370, %r369, %r369, 24; + add.s32 %r371, %r370, %r364; + xor.b32 %r372, %r371, %r366; + shf.l.wrap.b32 %r373, %r372, %r372, 25; + add.s32 %r374, %r326, %r252; + add.s32 %r375, %r374, %r279; + xor.b32 %r376, %r375, %r310; + shf.l.wrap.b32 %r377, %r376, %r376, 16; + add.s32 %r378, %r377, %r293; + xor.b32 %r379, %r378, %r279; + shf.l.wrap.b32 %r380, %r379, %r379, 20; + add.s32 %r381, %r375, %r259; + add.s32 %r382, %r381, %r380; + xor.b32 %r383, %r382, %r377; + shf.l.wrap.b32 %r384, %r383, %r383, 24; + add.s32 %r385, %r384, %r378; + xor.b32 %r386, %r385, %r380; + shf.l.wrap.b32 %r387, %r386, %r386, 25; + add.s32 %r388, %r340, %r168; + add.s32 %r389, %r388, %r387; + xor.b32 %r390, %r389, %r356; + shf.l.wrap.b32 %r391, %r390, %r390, 16; + add.s32 %r392, %r391, %r371; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 20; + add.s32 %r395, %r389, %r196; + add.s32 %r396, %r395, %r394; + xor.b32 %r397, %r396, %r391; + shf.l.wrap.b32 %r398, %r397, %r397, 24; + add.s32 %r399, %r398, %r392; + xor.b32 %r400, %r399, %r394; + shf.l.wrap.b32 %r401, %r400, %r400, 25; + add.s32 %r402, %r354, %r175; + add.s32 %r403, %r402, %r345; + xor.b32 %r404, %r403, %r370; + shf.l.wrap.b32 %r405, %r404, %r404, 16; + add.s32 %r406, %r405, %r385; + xor.b32 %r407, %r406, %r345; + shf.l.wrap.b32 %r408, %r407, %r407, 20; + add.s32 %r409, %r403, %r224; + add.s32 %r410, %r409, %r408; + xor.b32 %r411, %r410, %r405; + shf.l.wrap.b32 %r412, %r411, %r411, 24; + add.s32 %r413, %r412, %r406; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 25; + add.s32 %r416, %r368, %r203; + add.s32 %r417, %r416, %r359; + xor.b32 %r418, %r417, %r384; + shf.l.wrap.b32 %r419, %r418, %r418, 16; + add.s32 %r420, %r419, %r343; + xor.b32 %r421, %r420, %r359; + shf.l.wrap.b32 %r422, %r421, %r421, 20; + add.s32 %r423, %r417, %r154; + add.s32 %r424, %r423, %r422; + xor.b32 %r425, %r424, %r419; + shf.l.wrap.b32 %r426, %r425, %r425, 24; + add.s32 %r427, %r426, %r420; + xor.b32 %r428, %r427, %r422; + shf.l.wrap.b32 %r429, %r428, %r428, 25; + add.s32 %r430, %r382, %r182; + add.s32 %r431, %r430, %r373; + xor.b32 %r432, %r431, %r342; + shf.l.wrap.b32 %r433, %r432, %r432, 16; + add.s32 
%r434, %r433, %r357; + xor.b32 %r435, %r434, %r373; + shf.l.wrap.b32 %r436, %r435, %r435, 20; + add.s32 %r437, %r431, %r245; + add.s32 %r438, %r437, %r436; + xor.b32 %r439, %r438, %r433; + shf.l.wrap.b32 %r440, %r439, %r439, 24; + add.s32 %r441, %r440, %r434; + xor.b32 %r442, %r441, %r436; + shf.l.wrap.b32 %r443, %r442, %r442, 25; + add.s32 %r444, %r396, %r161; + add.s32 %r445, %r444, %r415; + xor.b32 %r446, %r445, %r440; + shf.l.wrap.b32 %r447, %r446, %r446, 16; + add.s32 %r448, %r447, %r427; + xor.b32 %r449, %r448, %r415; + shf.l.wrap.b32 %r450, %r449, %r449, 20; + add.s32 %r451, %r445, %r231; + add.s32 %r452, %r451, %r450; + xor.b32 %r453, %r452, %r447; + shf.l.wrap.b32 %r454, %r453, %r453, 24; + add.s32 %r455, %r454, %r448; + xor.b32 %r456, %r455, %r450; + shf.l.wrap.b32 %r457, %r456, %r456, 25; + add.s32 %r458, %r410, %r238; + add.s32 %r459, %r458, %r429; + xor.b32 %r460, %r459, %r398; + shf.l.wrap.b32 %r461, %r460, %r460, 16; + add.s32 %r462, %r461, %r441; + xor.b32 %r463, %r462, %r429; + shf.l.wrap.b32 %r464, %r463, %r463, 20; + add.s32 %r465, %r459, %r189; + add.s32 %r466, %r465, %r464; + xor.b32 %r467, %r466, %r461; + shf.l.wrap.b32 %r468, %r467, %r467, 24; + add.s32 %r469, %r468, %r462; + xor.b32 %r470, %r469, %r464; + shf.l.wrap.b32 %r471, %r470, %r470, 25; + add.s32 %r472, %r424, %r217; + add.s32 %r473, %r472, %r443; + xor.b32 %r474, %r473, %r412; + shf.l.wrap.b32 %r475, %r474, %r474, 16; + add.s32 %r476, %r475, %r399; + xor.b32 %r477, %r476, %r443; + shf.l.wrap.b32 %r478, %r477, %r477, 20; + add.s32 %r479, %r473, %r252; + add.s32 %r480, %r479, %r478; + xor.b32 %r481, %r480, %r475; + shf.l.wrap.b32 %r482, %r481, %r481, 24; + add.s32 %r483, %r482, %r476; + xor.b32 %r484, %r483, %r478; + shf.l.wrap.b32 %r485, %r484, %r484, 25; + add.s32 %r486, %r438, %r259; + add.s32 %r487, %r486, %r401; + xor.b32 %r488, %r487, %r426; + shf.l.wrap.b32 %r489, %r488, %r488, 16; + add.s32 %r490, %r489, %r413; + xor.b32 %r491, %r490, %r401; + shf.l.wrap.b32 %r492, %r491, %r491, 20; + add.s32 %r493, %r487, %r210; + add.s32 %r494, %r493, %r492; + xor.b32 %r495, %r494, %r489; + shf.l.wrap.b32 %r496, %r495, %r495, 24; + add.s32 %r497, %r496, %r490; + xor.b32 %r498, %r497, %r492; + shf.l.wrap.b32 %r499, %r498, %r498, 25; + add.s32 %r500, %r452, %r175; + add.s32 %r501, %r500, %r499; + xor.b32 %r502, %r501, %r468; + shf.l.wrap.b32 %r503, %r502, %r502, 16; + add.s32 %r504, %r503, %r483; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 20; + add.s32 %r507, %r501, %r182; + add.s32 %r508, %r507, %r506; + xor.b32 %r509, %r508, %r503; + shf.l.wrap.b32 %r510, %r509, %r509, 24; + add.s32 %r511, %r510, %r504; + xor.b32 %r512, %r511, %r506; + shf.l.wrap.b32 %r513, %r512, %r512, 25; + add.s32 %r514, %r466, %r224; + add.s32 %r515, %r514, %r457; + xor.b32 %r516, %r515, %r482; + shf.l.wrap.b32 %r517, %r516, %r516, 16; + add.s32 %r518, %r517, %r497; + xor.b32 %r519, %r518, %r457; + shf.l.wrap.b32 %r520, %r519, %r519, 20; + add.s32 %r521, %r515, %r238; + add.s32 %r522, %r521, %r520; + xor.b32 %r523, %r522, %r517; + shf.l.wrap.b32 %r524, %r523, %r523, 24; + add.s32 %r525, %r524, %r518; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 25; + add.s32 %r528, %r480, %r245; + add.s32 %r529, %r528, %r471; + xor.b32 %r530, %r529, %r496; + shf.l.wrap.b32 %r531, %r530, %r530, 16; + add.s32 %r532, %r531, %r455; + xor.b32 %r533, %r532, %r471; + shf.l.wrap.b32 %r534, %r533, %r533, 20; + add.s32 %r535, %r529, %r168; + add.s32 %r536, %r535, %r534; + xor.b32 %r537, %r536, %r531; + shf.l.wrap.b32 
%r538, %r537, %r537, 24; + add.s32 %r539, %r538, %r532; + xor.b32 %r540, %r539, %r534; + shf.l.wrap.b32 %r541, %r540, %r540, 25; + add.s32 %r542, %r494, %r203; + add.s32 %r543, %r542, %r485; + xor.b32 %r544, %r543, %r454; + shf.l.wrap.b32 %r545, %r544, %r544, 16; + add.s32 %r546, %r545, %r469; + xor.b32 %r547, %r546, %r485; + shf.l.wrap.b32 %r548, %r547, %r547, 20; + add.s32 %r549, %r543, %r252; + add.s32 %r550, %r549, %r548; + xor.b32 %r551, %r550, %r545; + shf.l.wrap.b32 %r552, %r551, %r551, 24; + add.s32 %r553, %r552, %r546; + xor.b32 %r554, %r553, %r548; + shf.l.wrap.b32 %r555, %r554, %r554, 25; + add.s32 %r556, %r508, %r196; + add.s32 %r557, %r556, %r527; + xor.b32 %r558, %r557, %r552; + shf.l.wrap.b32 %r559, %r558, %r558, 16; + add.s32 %r560, %r559, %r539; + xor.b32 %r561, %r560, %r527; + shf.l.wrap.b32 %r562, %r561, %r561, 20; + add.s32 %r563, %r557, %r189; + add.s32 %r564, %r563, %r562; + xor.b32 %r565, %r564, %r559; + shf.l.wrap.b32 %r566, %r565, %r565, 24; + add.s32 %r567, %r566, %r560; + xor.b32 %r568, %r567, %r562; + shf.l.wrap.b32 %r569, %r568, %r568, 25; + add.s32 %r570, %r522, %r217; + add.s32 %r571, %r570, %r541; + xor.b32 %r572, %r571, %r510; + shf.l.wrap.b32 %r573, %r572, %r572, 16; + add.s32 %r574, %r573, %r553; + xor.b32 %r575, %r574, %r541; + shf.l.wrap.b32 %r576, %r575, %r575, 20; + add.s32 %r577, %r571, %r154; + add.s32 %r578, %r577, %r576; + xor.b32 %r579, %r578, %r573; + shf.l.wrap.b32 %r580, %r579, %r579, 24; + add.s32 %r581, %r580, %r574; + xor.b32 %r582, %r581, %r576; + shf.l.wrap.b32 %r583, %r582, %r582, 25; + add.s32 %r584, %r536, %r231; + add.s32 %r585, %r584, %r555; + xor.b32 %r586, %r585, %r524; + shf.l.wrap.b32 %r587, %r586, %r586, 16; + add.s32 %r588, %r587, %r511; + xor.b32 %r589, %r588, %r555; + shf.l.wrap.b32 %r590, %r589, %r589, 20; + add.s32 %r591, %r585, %r259; + add.s32 %r592, %r591, %r590; + xor.b32 %r593, %r592, %r587; + shf.l.wrap.b32 %r594, %r593, %r593, 24; + add.s32 %r595, %r594, %r588; + xor.b32 %r596, %r595, %r590; + shf.l.wrap.b32 %r597, %r596, %r596, 25; + add.s32 %r598, %r550, %r210; + add.s32 %r599, %r598, %r513; + xor.b32 %r600, %r599, %r538; + shf.l.wrap.b32 %r601, %r600, %r600, 16; + add.s32 %r602, %r601, %r525; + xor.b32 %r603, %r602, %r513; + shf.l.wrap.b32 %r604, %r603, %r603, 20; + add.s32 %r605, %r599, %r161; + add.s32 %r606, %r605, %r604; + xor.b32 %r607, %r606, %r601; + shf.l.wrap.b32 %r608, %r607, %r607, 24; + add.s32 %r609, %r608, %r602; + xor.b32 %r610, %r609, %r604; + shf.l.wrap.b32 %r611, %r610, %r610, 25; + add.s32 %r612, %r564, %r224; + add.s32 %r613, %r612, %r611; + xor.b32 %r614, %r613, %r580; + shf.l.wrap.b32 %r615, %r614, %r614, 16; + add.s32 %r616, %r615, %r595; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 20; + add.s32 %r619, %r613, %r203; + add.s32 %r620, %r619, %r618; + xor.b32 %r621, %r620, %r615; + shf.l.wrap.b32 %r622, %r621, %r621, 24; + add.s32 %r623, %r622, %r616; + xor.b32 %r624, %r623, %r618; + shf.l.wrap.b32 %r625, %r624, %r624, 25; + add.s32 %r626, %r578, %r238; + add.s32 %r627, %r626, %r569; + xor.b32 %r628, %r627, %r594; + shf.l.wrap.b32 %r629, %r628, %r628, 16; + add.s32 %r630, %r629, %r609; + xor.b32 %r631, %r630, %r569; + shf.l.wrap.b32 %r632, %r631, %r631, 20; + add.s32 %r633, %r627, %r217; + add.s32 %r634, %r633, %r632; + xor.b32 %r635, %r634, %r629; + shf.l.wrap.b32 %r636, %r635, %r635, 24; + add.s32 %r637, %r636, %r630; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 25; + add.s32 %r640, %r592, %r252; + add.s32 %r641, %r640, %r583; + xor.b32 %r642, 
%r641, %r608; + shf.l.wrap.b32 %r643, %r642, %r642, 16; + add.s32 %r644, %r643, %r567; + xor.b32 %r645, %r644, %r583; + shf.l.wrap.b32 %r646, %r645, %r645, 20; + add.s32 %r647, %r641, %r175; + add.s32 %r648, %r647, %r646; + xor.b32 %r649, %r648, %r643; + shf.l.wrap.b32 %r650, %r649, %r649, 24; + add.s32 %r651, %r650, %r644; + xor.b32 %r652, %r651, %r646; + shf.l.wrap.b32 %r653, %r652, %r652, 25; + add.s32 %r654, %r606, %r245; + add.s32 %r655, %r654, %r597; + xor.b32 %r656, %r655, %r566; + shf.l.wrap.b32 %r657, %r656, %r656, 16; + add.s32 %r658, %r657, %r581; + xor.b32 %r659, %r658, %r597; + shf.l.wrap.b32 %r660, %r659, %r659, 20; + add.s32 %r661, %r655, %r259; + add.s32 %r662, %r661, %r660; + xor.b32 %r663, %r662, %r657; + shf.l.wrap.b32 %r664, %r663, %r663, 24; + add.s32 %r665, %r664, %r658; + xor.b32 %r666, %r665, %r660; + shf.l.wrap.b32 %r667, %r666, %r666, 25; + add.s32 %r668, %r620, %r182; + add.s32 %r669, %r668, %r639; + xor.b32 %r670, %r669, %r664; + shf.l.wrap.b32 %r671, %r670, %r670, 16; + add.s32 %r672, %r671, %r651; + xor.b32 %r673, %r672, %r639; + shf.l.wrap.b32 %r674, %r673, %r673, 20; + add.s32 %r675, %r669, %r154; + add.s32 %r676, %r675, %r674; + xor.b32 %r677, %r676, %r671; + shf.l.wrap.b32 %r678, %r677, %r677, 24; + add.s32 %r679, %r678, %r672; + xor.b32 %r680, %r679, %r674; + shf.l.wrap.b32 %r681, %r680, %r680, 25; + add.s32 %r682, %r634, %r231; + add.s32 %r683, %r682, %r653; + xor.b32 %r684, %r683, %r622; + shf.l.wrap.b32 %r685, %r684, %r684, 16; + add.s32 %r686, %r685, %r665; + xor.b32 %r687, %r686, %r653; + shf.l.wrap.b32 %r688, %r687, %r687, 20; + add.s32 %r689, %r683, %r168; + add.s32 %r690, %r689, %r688; + xor.b32 %r691, %r690, %r685; + shf.l.wrap.b32 %r692, %r691, %r691, 24; + add.s32 %r693, %r692, %r686; + xor.b32 %r694, %r693, %r688; + shf.l.wrap.b32 %r695, %r694, %r694, 25; + add.s32 %r696, %r648, %r189; + add.s32 %r697, %r696, %r667; + xor.b32 %r698, %r697, %r636; + shf.l.wrap.b32 %r699, %r698, %r698, 16; + add.s32 %r700, %r699, %r623; + xor.b32 %r701, %r700, %r667; + shf.l.wrap.b32 %r702, %r701, %r701, 20; + add.s32 %r703, %r697, %r210; + add.s32 %r704, %r703, %r702; + xor.b32 %r705, %r704, %r699; + shf.l.wrap.b32 %r706, %r705, %r705, 24; + add.s32 %r707, %r706, %r700; + xor.b32 %r708, %r707, %r702; + shf.l.wrap.b32 %r709, %r708, %r708, 25; + add.s32 %r710, %r662, %r161; + add.s32 %r711, %r710, %r625; + xor.b32 %r712, %r711, %r650; + shf.l.wrap.b32 %r713, %r712, %r712, 16; + add.s32 %r714, %r713, %r637; + xor.b32 %r715, %r714, %r625; + shf.l.wrap.b32 %r716, %r715, %r715, 20; + add.s32 %r717, %r711, %r196; + add.s32 %r718, %r717, %r716; + xor.b32 %r719, %r718, %r713; + shf.l.wrap.b32 %r720, %r719, %r719, 24; + add.s32 %r721, %r720, %r714; + xor.b32 %r722, %r721, %r716; + shf.l.wrap.b32 %r723, %r722, %r722, 25; + add.s32 %r724, %r676, %r238; + add.s32 %r725, %r724, %r723; + xor.b32 %r726, %r725, %r692; + shf.l.wrap.b32 %r727, %r726, %r726, 16; + add.s32 %r728, %r727, %r707; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 20; + add.s32 %r731, %r725, %r245; + add.s32 %r732, %r731, %r730; + xor.b32 %r733, %r732, %r727; + shf.l.wrap.b32 %r734, %r733, %r733, 24; + add.s32 %r735, %r734, %r728; + xor.b32 %r736, %r735, %r730; + shf.l.wrap.b32 %r737, %r736, %r736, 25; + add.s32 %r738, %r690, %r217; + add.s32 %r739, %r738, %r681; + xor.b32 %r740, %r739, %r706; + shf.l.wrap.b32 %r741, %r740, %r740, 16; + add.s32 %r742, %r741, %r721; + xor.b32 %r743, %r742, %r681; + shf.l.wrap.b32 %r744, %r743, %r743, 20; + add.s32 %r745, %r739, %r231; + add.s32 %r746, 
%r745, %r744; + xor.b32 %r747, %r746, %r741; + shf.l.wrap.b32 %r748, %r747, %r747, 24; + add.s32 %r749, %r748, %r742; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 25; + add.s32 %r752, %r704, %r259; + add.s32 %r753, %r752, %r695; + xor.b32 %r754, %r753, %r720; + shf.l.wrap.b32 %r755, %r754, %r754, 16; + add.s32 %r756, %r755, %r679; + xor.b32 %r757, %r756, %r695; + shf.l.wrap.b32 %r758, %r757, %r757, 20; + add.s32 %r759, %r753, %r224; + add.s32 %r760, %r759, %r758; + xor.b32 %r761, %r760, %r755; + shf.l.wrap.b32 %r762, %r761, %r761, 24; + add.s32 %r763, %r762, %r756; + xor.b32 %r764, %r763, %r758; + shf.l.wrap.b32 %r765, %r764, %r764, 25; + add.s32 %r766, %r718, %r252; + add.s32 %r767, %r766, %r709; + xor.b32 %r768, %r767, %r678; + shf.l.wrap.b32 %r769, %r768, %r768, 16; + add.s32 %r770, %r769, %r693; + xor.b32 %r771, %r770, %r709; + shf.l.wrap.b32 %r772, %r771, %r771, 20; + add.s32 %r773, %r767, %r210; + add.s32 %r774, %r773, %r772; + xor.b32 %r775, %r774, %r769; + shf.l.wrap.b32 %r776, %r775, %r775, 24; + add.s32 %r777, %r776, %r770; + xor.b32 %r778, %r777, %r772; + shf.l.wrap.b32 %r779, %r778, %r778, 25; + add.s32 %r780, %r732, %r203; + add.s32 %r781, %r780, %r751; + xor.b32 %r782, %r781, %r776; + shf.l.wrap.b32 %r783, %r782, %r782, 16; + add.s32 %r784, %r783, %r763; + xor.b32 %r785, %r784, %r751; + shf.l.wrap.b32 %r786, %r785, %r785, 20; + add.s32 %r787, %r781, %r168; + add.s32 %r788, %r787, %r786; + xor.b32 %r789, %r788, %r783; + shf.l.wrap.b32 %r790, %r789, %r789, 24; + add.s32 %r791, %r790, %r784; + xor.b32 %r792, %r791, %r786; + shf.l.wrap.b32 %r793, %r792, %r792, 25; + add.s32 %r794, %r746, %r189; + add.s32 %r795, %r794, %r765; + xor.b32 %r796, %r795, %r734; + shf.l.wrap.b32 %r797, %r796, %r796, 16; + add.s32 %r798, %r797, %r777; + xor.b32 %r799, %r798, %r765; + shf.l.wrap.b32 %r800, %r799, %r799, 20; + add.s32 %r801, %r795, %r175; + add.s32 %r802, %r801, %r800; + xor.b32 %r803, %r802, %r797; + shf.l.wrap.b32 %r804, %r803, %r803, 24; + add.s32 %r805, %r804, %r798; + xor.b32 %r806, %r805, %r800; + shf.l.wrap.b32 %r807, %r806, %r806, 25; + add.s32 %r808, %r760, %r154; + add.s32 %r809, %r808, %r779; + xor.b32 %r810, %r809, %r748; + shf.l.wrap.b32 %r811, %r810, %r810, 16; + add.s32 %r812, %r811, %r735; + xor.b32 %r813, %r812, %r779; + shf.l.wrap.b32 %r814, %r813, %r813, 20; + add.s32 %r815, %r809, %r161; + add.s32 %r816, %r815, %r814; + xor.b32 %r817, %r816, %r811; + shf.l.wrap.b32 %r818, %r817, %r817, 24; + add.s32 %r819, %r818, %r812; + xor.b32 %r820, %r819, %r814; + shf.l.wrap.b32 %r821, %r820, %r820, 25; + add.s32 %r822, %r774, %r196; + add.s32 %r823, %r822, %r737; + xor.b32 %r824, %r823, %r762; + shf.l.wrap.b32 %r825, %r824, %r824, 16; + add.s32 %r826, %r825, %r749; + xor.b32 %r827, %r826, %r737; + shf.l.wrap.b32 %r828, %r827, %r827, 20; + add.s32 %r829, %r823, %r182; + add.s32 %r830, %r829, %r828; + xor.b32 %r831, %r830, %r825; + shf.l.wrap.b32 %r832, %r831, %r831, 24; + add.s32 %r833, %r832, %r826; + xor.b32 %r834, %r833, %r828; + shf.l.wrap.b32 %r835, %r834, %r834, 25; + add.s32 %r836, %r788, %r217; + add.s32 %r837, %r836, %r835; + xor.b32 %r838, %r837, %r804; + shf.l.wrap.b32 %r839, %r838, %r838, 16; + add.s32 %r840, %r839, %r819; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 20; + add.s32 %r843, %r837, %r252; + add.s32 %r844, %r843, %r842; + xor.b32 %r845, %r844, %r839; + shf.l.wrap.b32 %r846, %r845, %r845, 24; + add.s32 %r847, %r846, %r840; + xor.b32 %r848, %r847, %r842; + shf.l.wrap.b32 %r849, %r848, %r848, 25; + add.s32 %r850, 
%r802, %r231; + add.s32 %r851, %r850, %r793; + xor.b32 %r852, %r851, %r818; + shf.l.wrap.b32 %r853, %r852, %r852, 16; + add.s32 %r854, %r853, %r833; + xor.b32 %r855, %r854, %r793; + shf.l.wrap.b32 %r856, %r855, %r855, 20; + add.s32 %r857, %r851, %r189; + add.s32 %r858, %r857, %r856; + xor.b32 %r859, %r858, %r853; + shf.l.wrap.b32 %r860, %r859, %r859, 24; + add.s32 %r861, %r860, %r854; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 25; + add.s32 %r864, %r816, %r210; + add.s32 %r865, %r864, %r807; + xor.b32 %r866, %r865, %r832; + shf.l.wrap.b32 %r867, %r866, %r866, 16; + add.s32 %r868, %r867, %r791; + xor.b32 %r869, %r868, %r807; + shf.l.wrap.b32 %r870, %r869, %r869, 20; + add.s32 %r871, %r865, %r238; + add.s32 %r872, %r871, %r870; + xor.b32 %r873, %r872, %r867; + shf.l.wrap.b32 %r874, %r873, %r873, 24; + add.s32 %r875, %r874, %r868; + xor.b32 %r876, %r875, %r870; + shf.l.wrap.b32 %r877, %r876, %r876, 25; + add.s32 %r878, %r830, %r259; + add.s32 %r879, %r878, %r821; + xor.b32 %r880, %r879, %r790; + shf.l.wrap.b32 %r881, %r880, %r880, 16; + add.s32 %r882, %r881, %r805; + xor.b32 %r883, %r882, %r821; + shf.l.wrap.b32 %r884, %r883, %r883, 20; + add.s32 %r885, %r879, %r161; + add.s32 %r886, %r885, %r884; + xor.b32 %r887, %r886, %r881; + shf.l.wrap.b32 %r888, %r887, %r887, 24; + add.s32 %r889, %r888, %r882; + xor.b32 %r890, %r889, %r884; + shf.l.wrap.b32 %r891, %r890, %r890, 25; + add.s32 %r892, %r844, %r245; + add.s32 %r893, %r892, %r863; + xor.b32 %r894, %r893, %r888; + shf.l.wrap.b32 %r895, %r894, %r894, 16; + add.s32 %r896, %r895, %r875; + xor.b32 %r897, %r896, %r863; + shf.l.wrap.b32 %r898, %r897, %r897, 20; + add.s32 %r899, %r893, %r175; + add.s32 %r900, %r899, %r898; + xor.b32 %r901, %r900, %r895; + shf.l.wrap.b32 %r902, %r901, %r901, 24; + add.s32 %r903, %r902, %r896; + xor.b32 %r904, %r903, %r898; + shf.l.wrap.b32 %r905, %r904, %r904, 25; + add.s32 %r906, %r858, %r154; + add.s32 %r907, %r906, %r877; + xor.b32 %r908, %r907, %r846; + shf.l.wrap.b32 %r909, %r908, %r908, 16; + add.s32 %r910, %r909, %r889; + xor.b32 %r911, %r910, %r877; + shf.l.wrap.b32 %r912, %r911, %r911, 20; + add.s32 %r913, %r907, %r224; + add.s32 %r914, %r913, %r912; + xor.b32 %r915, %r914, %r909; + shf.l.wrap.b32 %r916, %r915, %r915, 24; + add.s32 %r917, %r916, %r910; + xor.b32 %r918, %r917, %r912; + shf.l.wrap.b32 %r919, %r918, %r918, 25; + add.s32 %r920, %r872, %r168; + add.s32 %r921, %r920, %r891; + xor.b32 %r922, %r921, %r860; + shf.l.wrap.b32 %r923, %r922, %r922, 16; + add.s32 %r924, %r923, %r847; + xor.b32 %r925, %r924, %r891; + shf.l.wrap.b32 %r926, %r925, %r925, 20; + add.s32 %r927, %r921, %r196; + add.s32 %r928, %r927, %r926; + xor.b32 %r929, %r928, %r923; + shf.l.wrap.b32 %r930, %r929, %r929, 24; + add.s32 %r931, %r930, %r924; + xor.b32 %r932, %r931, %r926; + shf.l.wrap.b32 %r933, %r932, %r932, 25; + add.s32 %r934, %r886, %r182; + add.s32 %r935, %r934, %r849; + xor.b32 %r936, %r935, %r874; + shf.l.wrap.b32 %r937, %r936, %r936, 16; + add.s32 %r938, %r937, %r861; + xor.b32 %r939, %r938, %r849; + shf.l.wrap.b32 %r940, %r939, %r939, 20; + add.s32 %r941, %r935, %r203; + add.s32 %r942, %r941, %r940; + xor.b32 %r943, %r942, %r937; + shf.l.wrap.b32 %r944, %r943, %r943, 24; + add.s32 %r945, %r944, %r938; + xor.b32 %r946, %r945, %r940; + shf.l.wrap.b32 %r947, %r946, %r946, 25; + add.s32 %r948, %r900, %r231; + add.s32 %r949, %r948, %r947; + xor.b32 %r950, %r949, %r916; + shf.l.wrap.b32 %r951, %r950, %r950, 16; + add.s32 %r952, %r951, %r931; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, 
%r953, %r953, 20; + add.s32 %r955, %r949, %r259; + add.s32 %r956, %r955, %r954; + xor.b32 %r957, %r956, %r951; + shf.l.wrap.b32 %r958, %r957, %r957, 24; + add.s32 %r959, %r958, %r952; + xor.b32 %r960, %r959, %r954; + shf.l.wrap.b32 %r961, %r960, %r960, 25; + add.s32 %r962, %r914, %r189; + add.s32 %r963, %r962, %r905; + xor.b32 %r964, %r963, %r930; + shf.l.wrap.b32 %r965, %r964, %r964, 16; + add.s32 %r966, %r965, %r945; + xor.b32 %r967, %r966, %r905; + shf.l.wrap.b32 %r968, %r967, %r967, 20; + add.s32 %r969, %r963, %r154; + add.s32 %r970, %r969, %r968; + xor.b32 %r971, %r970, %r965; + shf.l.wrap.b32 %r972, %r971, %r971, 24; + add.s32 %r973, %r972, %r966; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 25; + add.s32 %r976, %r928, %r161; + add.s32 %r977, %r976, %r919; + xor.b32 %r978, %r977, %r944; + shf.l.wrap.b32 %r979, %r978, %r978, 16; + add.s32 %r980, %r979, %r903; + xor.b32 %r981, %r980, %r919; + shf.l.wrap.b32 %r982, %r981, %r981, 20; + add.s32 %r983, %r977, %r217; + add.s32 %r984, %r983, %r982; + xor.b32 %r985, %r984, %r979; + shf.l.wrap.b32 %r986, %r985, %r985, 24; + add.s32 %r987, %r986, %r980; + xor.b32 %r988, %r987, %r982; + shf.l.wrap.b32 %r989, %r988, %r988, 25; + add.s32 %r990, %r942, %r210; + add.s32 %r991, %r990, %r933; + xor.b32 %r992, %r991, %r902; + shf.l.wrap.b32 %r993, %r992, %r992, 16; + add.s32 %r994, %r993, %r917; + xor.b32 %r995, %r994, %r933; + shf.l.wrap.b32 %r996, %r995, %r995, 20; + add.s32 %r997, %r991, %r196; + add.s32 %r998, %r997, %r996; + xor.b32 %r999, %r998, %r993; + shf.l.wrap.b32 %r1000, %r999, %r999, 24; + add.s32 %r1001, %r1000, %r994; + xor.b32 %r1002, %r1001, %r996; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 25; + add.s32 %r1004, %r956, %r252; + add.s32 %r1005, %r1004, %r975; + xor.b32 %r1006, %r1005, %r1000; + shf.l.wrap.b32 %r1007, %r1006, %r1006, 16; + add.s32 %r1008, %r1007, %r987; + xor.b32 %r1009, %r1008, %r975; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 20; + add.s32 %r1011, %r1005, %r224; + add.s32 %r1012, %r1011, %r1010; + xor.b32 %r1013, %r1012, %r1007; + shf.l.wrap.b32 %r1014, %r1013, %r1013, 24; + add.s32 %r1015, %r1014, %r1008; + xor.b32 %r1016, %r1015, %r1010; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 25; + add.s32 %r1018, %r970, %r168; + add.s32 %r1019, %r1018, %r989; + xor.b32 %r1020, %r1019, %r958; + shf.l.wrap.b32 %r1021, %r1020, %r1020, 16; + add.s32 %r1022, %r1021, %r1001; + xor.b32 %r1023, %r1022, %r989; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 20; + add.s32 %r1025, %r1019, %r238; + add.s32 %r1026, %r1025, %r1024; + xor.b32 %r1027, %r1026, %r1021; + shf.l.wrap.b32 %r1028, %r1027, %r1027, 24; + add.s32 %r1029, %r1028, %r1022; + xor.b32 %r1030, %r1029, %r1024; + shf.l.wrap.b32 %r1031, %r1030, %r1030, 25; + add.s32 %r1032, %r984, %r175; + add.s32 %r1033, %r1032, %r1003; + xor.b32 %r1034, %r1033, %r972; + shf.l.wrap.b32 %r1035, %r1034, %r1034, 16; + add.s32 %r1036, %r1035, %r959; + xor.b32 %r1037, %r1036, %r1003; + shf.l.wrap.b32 %r1038, %r1037, %r1037, 20; + add.s32 %r1039, %r1033, %r182; + add.s32 %r1040, %r1039, %r1038; + xor.b32 %r1041, %r1040, %r1035; + shf.l.wrap.b32 %r1042, %r1041, %r1041, 24; + add.s32 %r1043, %r1042, %r1036; + xor.b32 %r1044, %r1043, %r1038; + shf.l.wrap.b32 %r1045, %r1044, %r1044, 25; + add.s32 %r1046, %r998, %r203; + add.s32 %r1047, %r1046, %r961; + xor.b32 %r1048, %r1047, %r986; + shf.l.wrap.b32 %r1049, %r1048, %r1048, 16; + add.s32 %r1050, %r1049, %r973; + xor.b32 %r1051, %r1050, %r961; + shf.l.wrap.b32 %r1052, %r1051, %r1051, 20; + add.s32 %r1053, %r1047, %r245; + add.s32 %r1054, %r1053, 
%r1052; + xor.b32 %r1055, %r1054, %r1049; + shf.l.wrap.b32 %r1056, %r1055, %r1055, 24; + add.s32 %r1057, %r1056, %r1050; + xor.b32 %r1058, %r1057, %r1052; + shf.l.wrap.b32 %r1059, %r1058, %r1058, 25; + xor.b32 %r1060, %r1043, %r1012; + st.local.u32 [%rd3+-104], %r1060; + xor.b32 %r1061, %r1057, %r1026; + st.local.u32 [%rd3+-100], %r1061; + xor.b32 %r1062, %r1015, %r1040; + st.local.u32 [%rd3+-96], %r1062; + xor.b32 %r1063, %r1029, %r1054; + st.local.u32 [%rd3+-92], %r1063; + xor.b32 %r1064, %r1059, %r1028; + st.local.u32 [%rd3+-88], %r1064; + xor.b32 %r1065, %r1017, %r1042; + st.local.u32 [%rd3+-84], %r1065; + xor.b32 %r1066, %r1031, %r1056; + st.local.u32 [%rd3+-80], %r1066; + xor.b32 %r1067, %r1045, %r1014; + st.local.u32 [%rd3+-76], %r1067; + add.s16 %rs114, %rs109, 1; + st.local.v2.u8 [%rd3], {%rs351, %rs114}; + +$L__BB1_8: + add.s64 %rd117, %rd13, %rd237; + st.local.u8 [%rd117], %rs351; + add.s64 %rd237, %rd237, 1; + setp.lt.u64 %p7, %rd237, 64; + mov.u64 %rd243, %rd12; + @%p7 bra $L__BB1_8; + +$L__BB1_9: + setp.lt.u64 %p8, %rd243, 65; + @%p8 bra $L__BB1_12; + + ld.local.u8 %rs9, [%rd3+2]; + ld.local.u8 %rs352, [%rd3+1]; + ld.local.u32 %r11657, [%rd3+-104]; + ld.local.u32 %r11656, [%rd3+-100]; + ld.local.u32 %r11655, [%rd3+-96]; + ld.local.u32 %r11654, [%rd3+-92]; + ld.local.u32 %r11653, [%rd3+-88]; + ld.local.u32 %r11652, [%rd3+-84]; + ld.local.u32 %r11651, [%rd3+-80]; + ld.local.u32 %r11650, [%rd3+-76]; + ld.local.u64 %rd118, [%rd3+-72]; + cvt.u32.u64 %r9, %rd118; + shr.u64 %rd119, %rd118, 32; + cvt.u32.u64 %r10, %rd119; + +$L__BB1_11: + and.b16 %rs116, %rs352, 255; + setp.eq.s16 %p9, %rs116, 0; + selp.u16 %rs117, 1, 0, %p9; + or.b16 %rs118, %rs9, %rs117; + ld.local.u8 %r1068, [%rd260]; + ld.local.u8 %r1069, [%rd260+1]; + prmt.b32 %r1070, %r1069, %r1068, 30212; + ld.local.u8 %r1071, [%rd260+2]; + prmt.b32 %r1072, %r1071, %r1070, 28756; + ld.local.u8 %r1073, [%rd260+3]; + prmt.b32 %r1074, %r1073, %r1072, 1620; + ld.local.u8 %r1075, [%rd260+4]; + ld.local.u8 %r1076, [%rd260+5]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + ld.local.u8 %r1078, [%rd260+6]; + prmt.b32 %r1079, %r1078, %r1077, 28756; + ld.local.u8 %r1080, [%rd260+7]; + prmt.b32 %r1081, %r1080, %r1079, 1620; + ld.local.u8 %r1082, [%rd260+8]; + ld.local.u8 %r1083, [%rd260+9]; + prmt.b32 %r1084, %r1083, %r1082, 30212; + ld.local.u8 %r1085, [%rd260+10]; + prmt.b32 %r1086, %r1085, %r1084, 28756; + ld.local.u8 %r1087, [%rd260+11]; + prmt.b32 %r1088, %r1087, %r1086, 1620; + ld.local.u8 %r1089, [%rd260+12]; + ld.local.u8 %r1090, [%rd260+13]; + prmt.b32 %r1091, %r1090, %r1089, 30212; + ld.local.u8 %r1092, [%rd260+14]; + prmt.b32 %r1093, %r1092, %r1091, 28756; + ld.local.u8 %r1094, [%rd260+15]; + prmt.b32 %r1095, %r1094, %r1093, 1620; + ld.local.u8 %r1096, [%rd260+16]; + ld.local.u8 %r1097, [%rd260+17]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd260+18]; + prmt.b32 %r1100, %r1099, %r1098, 28756; + ld.local.u8 %r1101, [%rd260+19]; + prmt.b32 %r1102, %r1101, %r1100, 1620; + ld.local.u8 %r1103, [%rd260+20]; + ld.local.u8 %r1104, [%rd260+21]; + prmt.b32 %r1105, %r1104, %r1103, 30212; + ld.local.u8 %r1106, [%rd260+22]; + prmt.b32 %r1107, %r1106, %r1105, 28756; + ld.local.u8 %r1108, [%rd260+23]; + prmt.b32 %r1109, %r1108, %r1107, 1620; + ld.local.u8 %r1110, [%rd260+24]; + ld.local.u8 %r1111, [%rd260+25]; + prmt.b32 %r1112, %r1111, %r1110, 30212; + ld.local.u8 %r1113, [%rd260+26]; + prmt.b32 %r1114, %r1113, %r1112, 28756; + ld.local.u8 %r1115, [%rd260+27]; + prmt.b32 %r1116, %r1115, %r1114, 1620; + ld.local.u8 
%r1117, [%rd260+28]; + ld.local.u8 %r1118, [%rd260+29]; + prmt.b32 %r1119, %r1118, %r1117, 30212; + ld.local.u8 %r1120, [%rd260+30]; + prmt.b32 %r1121, %r1120, %r1119, 28756; + ld.local.u8 %r1122, [%rd260+31]; + prmt.b32 %r1123, %r1122, %r1121, 1620; + ld.local.u8 %r1124, [%rd260+32]; + ld.local.u8 %r1125, [%rd260+33]; + prmt.b32 %r1126, %r1125, %r1124, 30212; + ld.local.u8 %r1127, [%rd260+34]; + prmt.b32 %r1128, %r1127, %r1126, 28756; + ld.local.u8 %r1129, [%rd260+35]; + prmt.b32 %r1130, %r1129, %r1128, 1620; + ld.local.u8 %r1131, [%rd260+36]; + ld.local.u8 %r1132, [%rd260+37]; + prmt.b32 %r1133, %r1132, %r1131, 30212; + ld.local.u8 %r1134, [%rd260+38]; + prmt.b32 %r1135, %r1134, %r1133, 28756; + ld.local.u8 %r1136, [%rd260+39]; + prmt.b32 %r1137, %r1136, %r1135, 1620; + ld.local.u8 %r1138, [%rd260+40]; + ld.local.u8 %r1139, [%rd260+41]; + prmt.b32 %r1140, %r1139, %r1138, 30212; + ld.local.u8 %r1141, [%rd260+42]; + prmt.b32 %r1142, %r1141, %r1140, 28756; + ld.local.u8 %r1143, [%rd260+43]; + prmt.b32 %r1144, %r1143, %r1142, 1620; + ld.local.u8 %r1145, [%rd260+44]; + ld.local.u8 %r1146, [%rd260+45]; + prmt.b32 %r1147, %r1146, %r1145, 30212; + ld.local.u8 %r1148, [%rd260+46]; + prmt.b32 %r1149, %r1148, %r1147, 28756; + ld.local.u8 %r1150, [%rd260+47]; + prmt.b32 %r1151, %r1150, %r1149, 1620; + ld.local.u8 %r1152, [%rd260+48]; + ld.local.u8 %r1153, [%rd260+49]; + prmt.b32 %r1154, %r1153, %r1152, 30212; + ld.local.u8 %r1155, [%rd260+50]; + prmt.b32 %r1156, %r1155, %r1154, 28756; + ld.local.u8 %r1157, [%rd260+51]; + prmt.b32 %r1158, %r1157, %r1156, 1620; + ld.local.u8 %r1159, [%rd260+52]; + ld.local.u8 %r1160, [%rd260+53]; + prmt.b32 %r1161, %r1160, %r1159, 30212; + ld.local.u8 %r1162, [%rd260+54]; + prmt.b32 %r1163, %r1162, %r1161, 28756; + ld.local.u8 %r1164, [%rd260+55]; + prmt.b32 %r1165, %r1164, %r1163, 1620; + ld.local.u8 %r1166, [%rd260+56]; + ld.local.u8 %r1167, [%rd260+57]; + prmt.b32 %r1168, %r1167, %r1166, 30212; + ld.local.u8 %r1169, [%rd260+58]; + prmt.b32 %r1170, %r1169, %r1168, 28756; + ld.local.u8 %r1171, [%rd260+59]; + prmt.b32 %r1172, %r1171, %r1170, 1620; + ld.local.u8 %r1173, [%rd260+60]; + ld.local.u8 %r1174, [%rd260+61]; + prmt.b32 %r1175, %r1174, %r1173, 30212; + ld.local.u8 %r1176, [%rd260+62]; + prmt.b32 %r1177, %r1176, %r1175, 28756; + ld.local.u8 %r1178, [%rd260+63]; + prmt.b32 %r1179, %r1178, %r1177, 1620; + cvt.u32.u16 %r1180, %rs118; + and.b32 %r1181, %r1180, 255; + add.s32 %r1182, %r11657, %r1074; + add.s32 %r1183, %r1182, %r11653; + xor.b32 %r1184, %r1183, %r9; + shf.l.wrap.b32 %r1185, %r1184, %r1184, 16; + add.s32 %r1186, %r1185, 1779033703; + xor.b32 %r1187, %r1186, %r11653; + shf.l.wrap.b32 %r1188, %r1187, %r1187, 20; + add.s32 %r1189, %r1183, %r1081; + add.s32 %r1190, %r1189, %r1188; + xor.b32 %r1191, %r1190, %r1185; + shf.l.wrap.b32 %r1192, %r1191, %r1191, 24; + add.s32 %r1193, %r1192, %r1186; + xor.b32 %r1194, %r1193, %r1188; + shf.l.wrap.b32 %r1195, %r1194, %r1194, 25; + add.s32 %r1196, %r11656, %r1088; + add.s32 %r1197, %r1196, %r11652; + xor.b32 %r1198, %r1197, %r10; + shf.l.wrap.b32 %r1199, %r1198, %r1198, 16; + add.s32 %r1200, %r1199, -1150833019; + xor.b32 %r1201, %r1200, %r11652; + shf.l.wrap.b32 %r1202, %r1201, %r1201, 20; + add.s32 %r1203, %r1197, %r1095; + add.s32 %r1204, %r1203, %r1202; + xor.b32 %r1205, %r1204, %r1199; + shf.l.wrap.b32 %r1206, %r1205, %r1205, 24; + add.s32 %r1207, %r1206, %r1200; + xor.b32 %r1208, %r1207, %r1202; + shf.l.wrap.b32 %r1209, %r1208, %r1208, 25; + add.s32 %r1210, %r11655, %r1102; + add.s32 %r1211, %r1210, %r11651; + 
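+	// Assumption, not compiler output: 4194304 == 64 << 16, i.e. the constant
+	// block-length word (v14 == 64 for a full block) has been folded into the
+	// shr/shl/xor/or expansion below, which is the first rotr-16 of the
+	// column mixing v2, v6, v10, v14.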
shr.u32 %r1212, %r1211, 16; + shl.b32 %r1213, %r1211, 16; + xor.b32 %r1214, %r1213, 4194304; + or.b32 %r1215, %r1214, %r1212; + add.s32 %r1216, %r1215, 1013904242; + xor.b32 %r1217, %r1216, %r11651; + shf.l.wrap.b32 %r1218, %r1217, %r1217, 20; + add.s32 %r1219, %r1211, %r1109; + add.s32 %r1220, %r1219, %r1218; + xor.b32 %r1221, %r1220, %r1215; + shf.l.wrap.b32 %r1222, %r1221, %r1221, 24; + add.s32 %r1223, %r1222, %r1216; + xor.b32 %r1224, %r1223, %r1218; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 25; + add.s32 %r1226, %r11654, %r1116; + add.s32 %r1227, %r1226, %r11650; + xor.b32 %r1228, %r1227, %r1181; + shr.u32 %r1229, %r1227, 16; + shl.b32 %r1230, %r1228, 16; + or.b32 %r1231, %r1230, %r1229; + add.s32 %r1232, %r1231, -1521486534; + xor.b32 %r1233, %r1232, %r11650; + shf.l.wrap.b32 %r1234, %r1233, %r1233, 20; + add.s32 %r1235, %r1227, %r1123; + add.s32 %r1236, %r1235, %r1234; + xor.b32 %r1237, %r1236, %r1231; + shf.l.wrap.b32 %r1238, %r1237, %r1237, 24; + add.s32 %r1239, %r1238, %r1232; + xor.b32 %r1240, %r1239, %r1234; + shf.l.wrap.b32 %r1241, %r1240, %r1240, 25; + add.s32 %r1242, %r1190, %r1130; + add.s32 %r1243, %r1242, %r1209; + xor.b32 %r1244, %r1243, %r1238; + shf.l.wrap.b32 %r1245, %r1244, %r1244, 16; + add.s32 %r1246, %r1245, %r1223; + xor.b32 %r1247, %r1246, %r1209; + shf.l.wrap.b32 %r1248, %r1247, %r1247, 20; + add.s32 %r1249, %r1243, %r1137; + add.s32 %r1250, %r1249, %r1248; + xor.b32 %r1251, %r1250, %r1245; + shf.l.wrap.b32 %r1252, %r1251, %r1251, 24; + add.s32 %r1253, %r1252, %r1246; + xor.b32 %r1254, %r1253, %r1248; + shf.l.wrap.b32 %r1255, %r1254, %r1254, 25; + add.s32 %r1256, %r1204, %r1144; + add.s32 %r1257, %r1256, %r1225; + xor.b32 %r1258, %r1257, %r1192; + shf.l.wrap.b32 %r1259, %r1258, %r1258, 16; + add.s32 %r1260, %r1259, %r1239; + xor.b32 %r1261, %r1260, %r1225; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 20; + add.s32 %r1263, %r1257, %r1151; + add.s32 %r1264, %r1263, %r1262; + xor.b32 %r1265, %r1264, %r1259; + shf.l.wrap.b32 %r1266, %r1265, %r1265, 24; + add.s32 %r1267, %r1266, %r1260; + xor.b32 %r1268, %r1267, %r1262; + shf.l.wrap.b32 %r1269, %r1268, %r1268, 25; + add.s32 %r1270, %r1220, %r1158; + add.s32 %r1271, %r1270, %r1241; + xor.b32 %r1272, %r1271, %r1206; + shf.l.wrap.b32 %r1273, %r1272, %r1272, 16; + add.s32 %r1274, %r1273, %r1193; + xor.b32 %r1275, %r1274, %r1241; + shf.l.wrap.b32 %r1276, %r1275, %r1275, 20; + add.s32 %r1277, %r1271, %r1165; + add.s32 %r1278, %r1277, %r1276; + xor.b32 %r1279, %r1278, %r1273; + shf.l.wrap.b32 %r1280, %r1279, %r1279, 24; + add.s32 %r1281, %r1280, %r1274; + xor.b32 %r1282, %r1281, %r1276; + shf.l.wrap.b32 %r1283, %r1282, %r1282, 25; + add.s32 %r1284, %r1236, %r1172; + add.s32 %r1285, %r1284, %r1195; + xor.b32 %r1286, %r1285, %r1222; + shf.l.wrap.b32 %r1287, %r1286, %r1286, 16; + add.s32 %r1288, %r1287, %r1207; + xor.b32 %r1289, %r1288, %r1195; + shf.l.wrap.b32 %r1290, %r1289, %r1289, 20; + add.s32 %r1291, %r1285, %r1179; + add.s32 %r1292, %r1291, %r1290; + xor.b32 %r1293, %r1292, %r1287; + shf.l.wrap.b32 %r1294, %r1293, %r1293, 24; + add.s32 %r1295, %r1294, %r1288; + xor.b32 %r1296, %r1295, %r1290; + shf.l.wrap.b32 %r1297, %r1296, %r1296, 25; + add.s32 %r1298, %r1250, %r1088; + add.s32 %r1299, %r1298, %r1297; + xor.b32 %r1300, %r1299, %r1266; + shf.l.wrap.b32 %r1301, %r1300, %r1300, 16; + add.s32 %r1302, %r1301, %r1281; + xor.b32 %r1303, %r1302, %r1297; + shf.l.wrap.b32 %r1304, %r1303, %r1303, 20; + add.s32 %r1305, %r1299, %r1116; + add.s32 %r1306, %r1305, %r1304; + xor.b32 %r1307, %r1306, %r1301; + shf.l.wrap.b32 %r1308, %r1307, 
%r1307, 24; + add.s32 %r1309, %r1308, %r1302; + xor.b32 %r1310, %r1309, %r1304; + shf.l.wrap.b32 %r1311, %r1310, %r1310, 25; + add.s32 %r1312, %r1264, %r1095; + add.s32 %r1313, %r1312, %r1255; + xor.b32 %r1314, %r1313, %r1280; + shf.l.wrap.b32 %r1315, %r1314, %r1314, 16; + add.s32 %r1316, %r1315, %r1295; + xor.b32 %r1317, %r1316, %r1255; + shf.l.wrap.b32 %r1318, %r1317, %r1317, 20; + add.s32 %r1319, %r1313, %r1144; + add.s32 %r1320, %r1319, %r1318; + xor.b32 %r1321, %r1320, %r1315; + shf.l.wrap.b32 %r1322, %r1321, %r1321, 24; + add.s32 %r1323, %r1322, %r1316; + xor.b32 %r1324, %r1323, %r1318; + shf.l.wrap.b32 %r1325, %r1324, %r1324, 25; + add.s32 %r1326, %r1278, %r1123; + add.s32 %r1327, %r1326, %r1269; + xor.b32 %r1328, %r1327, %r1294; + shf.l.wrap.b32 %r1329, %r1328, %r1328, 16; + add.s32 %r1330, %r1329, %r1253; + xor.b32 %r1331, %r1330, %r1269; + shf.l.wrap.b32 %r1332, %r1331, %r1331, 20; + add.s32 %r1333, %r1327, %r1074; + add.s32 %r1334, %r1333, %r1332; + xor.b32 %r1335, %r1334, %r1329; + shf.l.wrap.b32 %r1336, %r1335, %r1335, 24; + add.s32 %r1337, %r1336, %r1330; + xor.b32 %r1338, %r1337, %r1332; + shf.l.wrap.b32 %r1339, %r1338, %r1338, 25; + add.s32 %r1340, %r1292, %r1102; + add.s32 %r1341, %r1340, %r1283; + xor.b32 %r1342, %r1341, %r1252; + shf.l.wrap.b32 %r1343, %r1342, %r1342, 16; + add.s32 %r1344, %r1343, %r1267; + xor.b32 %r1345, %r1344, %r1283; + shf.l.wrap.b32 %r1346, %r1345, %r1345, 20; + add.s32 %r1347, %r1341, %r1165; + add.s32 %r1348, %r1347, %r1346; + xor.b32 %r1349, %r1348, %r1343; + shf.l.wrap.b32 %r1350, %r1349, %r1349, 24; + add.s32 %r1351, %r1350, %r1344; + xor.b32 %r1352, %r1351, %r1346; + shf.l.wrap.b32 %r1353, %r1352, %r1352, 25; + add.s32 %r1354, %r1306, %r1081; + add.s32 %r1355, %r1354, %r1325; + xor.b32 %r1356, %r1355, %r1350; + shf.l.wrap.b32 %r1357, %r1356, %r1356, 16; + add.s32 %r1358, %r1357, %r1337; + xor.b32 %r1359, %r1358, %r1325; + shf.l.wrap.b32 %r1360, %r1359, %r1359, 20; + add.s32 %r1361, %r1355, %r1151; + add.s32 %r1362, %r1361, %r1360; + xor.b32 %r1363, %r1362, %r1357; + shf.l.wrap.b32 %r1364, %r1363, %r1363, 24; + add.s32 %r1365, %r1364, %r1358; + xor.b32 %r1366, %r1365, %r1360; + shf.l.wrap.b32 %r1367, %r1366, %r1366, 25; + add.s32 %r1368, %r1320, %r1158; + add.s32 %r1369, %r1368, %r1339; + xor.b32 %r1370, %r1369, %r1308; + shf.l.wrap.b32 %r1371, %r1370, %r1370, 16; + add.s32 %r1372, %r1371, %r1351; + xor.b32 %r1373, %r1372, %r1339; + shf.l.wrap.b32 %r1374, %r1373, %r1373, 20; + add.s32 %r1375, %r1369, %r1109; + add.s32 %r1376, %r1375, %r1374; + xor.b32 %r1377, %r1376, %r1371; + shf.l.wrap.b32 %r1378, %r1377, %r1377, 24; + add.s32 %r1379, %r1378, %r1372; + xor.b32 %r1380, %r1379, %r1374; + shf.l.wrap.b32 %r1381, %r1380, %r1380, 25; + add.s32 %r1382, %r1334, %r1137; + add.s32 %r1383, %r1382, %r1353; + xor.b32 %r1384, %r1383, %r1322; + shf.l.wrap.b32 %r1385, %r1384, %r1384, 16; + add.s32 %r1386, %r1385, %r1309; + xor.b32 %r1387, %r1386, %r1353; + shf.l.wrap.b32 %r1388, %r1387, %r1387, 20; + add.s32 %r1389, %r1383, %r1172; + add.s32 %r1390, %r1389, %r1388; + xor.b32 %r1391, %r1390, %r1385; + shf.l.wrap.b32 %r1392, %r1391, %r1391, 24; + add.s32 %r1393, %r1392, %r1386; + xor.b32 %r1394, %r1393, %r1388; + shf.l.wrap.b32 %r1395, %r1394, %r1394, 25; + add.s32 %r1396, %r1348, %r1179; + add.s32 %r1397, %r1396, %r1311; + xor.b32 %r1398, %r1397, %r1336; + shf.l.wrap.b32 %r1399, %r1398, %r1398, 16; + add.s32 %r1400, %r1399, %r1323; + xor.b32 %r1401, %r1400, %r1311; + shf.l.wrap.b32 %r1402, %r1401, %r1401, 20; + add.s32 %r1403, %r1397, %r1130; + add.s32 
%r1404, %r1403, %r1402; + xor.b32 %r1405, %r1404, %r1399; + shf.l.wrap.b32 %r1406, %r1405, %r1405, 24; + add.s32 %r1407, %r1406, %r1400; + xor.b32 %r1408, %r1407, %r1402; + shf.l.wrap.b32 %r1409, %r1408, %r1408, 25; + add.s32 %r1410, %r1362, %r1095; + add.s32 %r1411, %r1410, %r1409; + xor.b32 %r1412, %r1411, %r1378; + shf.l.wrap.b32 %r1413, %r1412, %r1412, 16; + add.s32 %r1414, %r1413, %r1393; + xor.b32 %r1415, %r1414, %r1409; + shf.l.wrap.b32 %r1416, %r1415, %r1415, 20; + add.s32 %r1417, %r1411, %r1102; + add.s32 %r1418, %r1417, %r1416; + xor.b32 %r1419, %r1418, %r1413; + shf.l.wrap.b32 %r1420, %r1419, %r1419, 24; + add.s32 %r1421, %r1420, %r1414; + xor.b32 %r1422, %r1421, %r1416; + shf.l.wrap.b32 %r1423, %r1422, %r1422, 25; + add.s32 %r1424, %r1376, %r1144; + add.s32 %r1425, %r1424, %r1367; + xor.b32 %r1426, %r1425, %r1392; + shf.l.wrap.b32 %r1427, %r1426, %r1426, 16; + add.s32 %r1428, %r1427, %r1407; + xor.b32 %r1429, %r1428, %r1367; + shf.l.wrap.b32 %r1430, %r1429, %r1429, 20; + add.s32 %r1431, %r1425, %r1158; + add.s32 %r1432, %r1431, %r1430; + xor.b32 %r1433, %r1432, %r1427; + shf.l.wrap.b32 %r1434, %r1433, %r1433, 24; + add.s32 %r1435, %r1434, %r1428; + xor.b32 %r1436, %r1435, %r1430; + shf.l.wrap.b32 %r1437, %r1436, %r1436, 25; + add.s32 %r1438, %r1390, %r1165; + add.s32 %r1439, %r1438, %r1381; + xor.b32 %r1440, %r1439, %r1406; + shf.l.wrap.b32 %r1441, %r1440, %r1440, 16; + add.s32 %r1442, %r1441, %r1365; + xor.b32 %r1443, %r1442, %r1381; + shf.l.wrap.b32 %r1444, %r1443, %r1443, 20; + add.s32 %r1445, %r1439, %r1088; + add.s32 %r1446, %r1445, %r1444; + xor.b32 %r1447, %r1446, %r1441; + shf.l.wrap.b32 %r1448, %r1447, %r1447, 24; + add.s32 %r1449, %r1448, %r1442; + xor.b32 %r1450, %r1449, %r1444; + shf.l.wrap.b32 %r1451, %r1450, %r1450, 25; + add.s32 %r1452, %r1404, %r1123; + add.s32 %r1453, %r1452, %r1395; + xor.b32 %r1454, %r1453, %r1364; + shf.l.wrap.b32 %r1455, %r1454, %r1454, 16; + add.s32 %r1456, %r1455, %r1379; + xor.b32 %r1457, %r1456, %r1395; + shf.l.wrap.b32 %r1458, %r1457, %r1457, 20; + add.s32 %r1459, %r1453, %r1172; + add.s32 %r1460, %r1459, %r1458; + xor.b32 %r1461, %r1460, %r1455; + shf.l.wrap.b32 %r1462, %r1461, %r1461, 24; + add.s32 %r1463, %r1462, %r1456; + xor.b32 %r1464, %r1463, %r1458; + shf.l.wrap.b32 %r1465, %r1464, %r1464, 25; + add.s32 %r1466, %r1418, %r1116; + add.s32 %r1467, %r1466, %r1437; + xor.b32 %r1468, %r1467, %r1462; + shf.l.wrap.b32 %r1469, %r1468, %r1468, 16; + add.s32 %r1470, %r1469, %r1449; + xor.b32 %r1471, %r1470, %r1437; + shf.l.wrap.b32 %r1472, %r1471, %r1471, 20; + add.s32 %r1473, %r1467, %r1109; + add.s32 %r1474, %r1473, %r1472; + xor.b32 %r1475, %r1474, %r1469; + shf.l.wrap.b32 %r1476, %r1475, %r1475, 24; + add.s32 %r1477, %r1476, %r1470; + xor.b32 %r1478, %r1477, %r1472; + shf.l.wrap.b32 %r1479, %r1478, %r1478, 25; + add.s32 %r1480, %r1432, %r1137; + add.s32 %r1481, %r1480, %r1451; + xor.b32 %r1482, %r1481, %r1420; + shf.l.wrap.b32 %r1483, %r1482, %r1482, 16; + add.s32 %r1484, %r1483, %r1463; + xor.b32 %r1485, %r1484, %r1451; + shf.l.wrap.b32 %r1486, %r1485, %r1485, 20; + add.s32 %r1487, %r1481, %r1074; + add.s32 %r1488, %r1487, %r1486; + xor.b32 %r1489, %r1488, %r1483; + shf.l.wrap.b32 %r1490, %r1489, %r1489, 24; + add.s32 %r1491, %r1490, %r1484; + xor.b32 %r1492, %r1491, %r1486; + shf.l.wrap.b32 %r1493, %r1492, %r1492, 25; + add.s32 %r1494, %r1446, %r1151; + add.s32 %r1495, %r1494, %r1465; + xor.b32 %r1496, %r1495, %r1434; + shf.l.wrap.b32 %r1497, %r1496, %r1496, 16; + add.s32 %r1498, %r1497, %r1421; + xor.b32 %r1499, %r1498, %r1465; + 
shf.l.wrap.b32 %r1500, %r1499, %r1499, 20; + add.s32 %r1501, %r1495, %r1179; + add.s32 %r1502, %r1501, %r1500; + xor.b32 %r1503, %r1502, %r1497; + shf.l.wrap.b32 %r1504, %r1503, %r1503, 24; + add.s32 %r1505, %r1504, %r1498; + xor.b32 %r1506, %r1505, %r1500; + shf.l.wrap.b32 %r1507, %r1506, %r1506, 25; + add.s32 %r1508, %r1460, %r1130; + add.s32 %r1509, %r1508, %r1423; + xor.b32 %r1510, %r1509, %r1448; + shf.l.wrap.b32 %r1511, %r1510, %r1510, 16; + add.s32 %r1512, %r1511, %r1435; + xor.b32 %r1513, %r1512, %r1423; + shf.l.wrap.b32 %r1514, %r1513, %r1513, 20; + add.s32 %r1515, %r1509, %r1081; + add.s32 %r1516, %r1515, %r1514; + xor.b32 %r1517, %r1516, %r1511; + shf.l.wrap.b32 %r1518, %r1517, %r1517, 24; + add.s32 %r1519, %r1518, %r1512; + xor.b32 %r1520, %r1519, %r1514; + shf.l.wrap.b32 %r1521, %r1520, %r1520, 25; + add.s32 %r1522, %r1474, %r1144; + add.s32 %r1523, %r1522, %r1521; + xor.b32 %r1524, %r1523, %r1490; + shf.l.wrap.b32 %r1525, %r1524, %r1524, 16; + add.s32 %r1526, %r1525, %r1505; + xor.b32 %r1527, %r1526, %r1521; + shf.l.wrap.b32 %r1528, %r1527, %r1527, 20; + add.s32 %r1529, %r1523, %r1123; + add.s32 %r1530, %r1529, %r1528; + xor.b32 %r1531, %r1530, %r1525; + shf.l.wrap.b32 %r1532, %r1531, %r1531, 24; + add.s32 %r1533, %r1532, %r1526; + xor.b32 %r1534, %r1533, %r1528; + shf.l.wrap.b32 %r1535, %r1534, %r1534, 25; + add.s32 %r1536, %r1488, %r1158; + add.s32 %r1537, %r1536, %r1479; + xor.b32 %r1538, %r1537, %r1504; + shf.l.wrap.b32 %r1539, %r1538, %r1538, 16; + add.s32 %r1540, %r1539, %r1519; + xor.b32 %r1541, %r1540, %r1479; + shf.l.wrap.b32 %r1542, %r1541, %r1541, 20; + add.s32 %r1543, %r1537, %r1137; + add.s32 %r1544, %r1543, %r1542; + xor.b32 %r1545, %r1544, %r1539; + shf.l.wrap.b32 %r1546, %r1545, %r1545, 24; + add.s32 %r1547, %r1546, %r1540; + xor.b32 %r1548, %r1547, %r1542; + shf.l.wrap.b32 %r1549, %r1548, %r1548, 25; + add.s32 %r1550, %r1502, %r1172; + add.s32 %r1551, %r1550, %r1493; + xor.b32 %r1552, %r1551, %r1518; + shf.l.wrap.b32 %r1553, %r1552, %r1552, 16; + add.s32 %r1554, %r1553, %r1477; + xor.b32 %r1555, %r1554, %r1493; + shf.l.wrap.b32 %r1556, %r1555, %r1555, 20; + add.s32 %r1557, %r1551, %r1095; + add.s32 %r1558, %r1557, %r1556; + xor.b32 %r1559, %r1558, %r1553; + shf.l.wrap.b32 %r1560, %r1559, %r1559, 24; + add.s32 %r1561, %r1560, %r1554; + xor.b32 %r1562, %r1561, %r1556; + shf.l.wrap.b32 %r1563, %r1562, %r1562, 25; + add.s32 %r1564, %r1516, %r1165; + add.s32 %r1565, %r1564, %r1507; + xor.b32 %r1566, %r1565, %r1476; + shf.l.wrap.b32 %r1567, %r1566, %r1566, 16; + add.s32 %r1568, %r1567, %r1491; + xor.b32 %r1569, %r1568, %r1507; + shf.l.wrap.b32 %r1570, %r1569, %r1569, 20; + add.s32 %r1571, %r1565, %r1179; + add.s32 %r1572, %r1571, %r1570; + xor.b32 %r1573, %r1572, %r1567; + shf.l.wrap.b32 %r1574, %r1573, %r1573, 24; + add.s32 %r1575, %r1574, %r1568; + xor.b32 %r1576, %r1575, %r1570; + shf.l.wrap.b32 %r1577, %r1576, %r1576, 25; + add.s32 %r1578, %r1530, %r1102; + add.s32 %r1579, %r1578, %r1549; + xor.b32 %r1580, %r1579, %r1574; + shf.l.wrap.b32 %r1581, %r1580, %r1580, 16; + add.s32 %r1582, %r1581, %r1561; + xor.b32 %r1583, %r1582, %r1549; + shf.l.wrap.b32 %r1584, %r1583, %r1583, 20; + add.s32 %r1585, %r1579, %r1074; + add.s32 %r1586, %r1585, %r1584; + xor.b32 %r1587, %r1586, %r1581; + shf.l.wrap.b32 %r1588, %r1587, %r1587, 24; + add.s32 %r1589, %r1588, %r1582; + xor.b32 %r1590, %r1589, %r1584; + shf.l.wrap.b32 %r1591, %r1590, %r1590, 25; + add.s32 %r1592, %r1544, %r1151; + add.s32 %r1593, %r1592, %r1563; + xor.b32 %r1594, %r1593, %r1532; + shf.l.wrap.b32 %r1595, 
%r1594, %r1594, 16; + add.s32 %r1596, %r1595, %r1575; + xor.b32 %r1597, %r1596, %r1563; + shf.l.wrap.b32 %r1598, %r1597, %r1597, 20; + add.s32 %r1599, %r1593, %r1088; + add.s32 %r1600, %r1599, %r1598; + xor.b32 %r1601, %r1600, %r1595; + shf.l.wrap.b32 %r1602, %r1601, %r1601, 24; + add.s32 %r1603, %r1602, %r1596; + xor.b32 %r1604, %r1603, %r1598; + shf.l.wrap.b32 %r1605, %r1604, %r1604, 25; + add.s32 %r1606, %r1558, %r1109; + add.s32 %r1607, %r1606, %r1577; + xor.b32 %r1608, %r1607, %r1546; + shf.l.wrap.b32 %r1609, %r1608, %r1608, 16; + add.s32 %r1610, %r1609, %r1533; + xor.b32 %r1611, %r1610, %r1577; + shf.l.wrap.b32 %r1612, %r1611, %r1611, 20; + add.s32 %r1613, %r1607, %r1130; + add.s32 %r1614, %r1613, %r1612; + xor.b32 %r1615, %r1614, %r1609; + shf.l.wrap.b32 %r1616, %r1615, %r1615, 24; + add.s32 %r1617, %r1616, %r1610; + xor.b32 %r1618, %r1617, %r1612; + shf.l.wrap.b32 %r1619, %r1618, %r1618, 25; + add.s32 %r1620, %r1572, %r1081; + add.s32 %r1621, %r1620, %r1535; + xor.b32 %r1622, %r1621, %r1560; + shf.l.wrap.b32 %r1623, %r1622, %r1622, 16; + add.s32 %r1624, %r1623, %r1547; + xor.b32 %r1625, %r1624, %r1535; + shf.l.wrap.b32 %r1626, %r1625, %r1625, 20; + add.s32 %r1627, %r1621, %r1116; + add.s32 %r1628, %r1627, %r1626; + xor.b32 %r1629, %r1628, %r1623; + shf.l.wrap.b32 %r1630, %r1629, %r1629, 24; + add.s32 %r1631, %r1630, %r1624; + xor.b32 %r1632, %r1631, %r1626; + shf.l.wrap.b32 %r1633, %r1632, %r1632, 25; + add.s32 %r1634, %r1586, %r1158; + add.s32 %r1635, %r1634, %r1633; + xor.b32 %r1636, %r1635, %r1602; + shf.l.wrap.b32 %r1637, %r1636, %r1636, 16; + add.s32 %r1638, %r1637, %r1617; + xor.b32 %r1639, %r1638, %r1633; + shf.l.wrap.b32 %r1640, %r1639, %r1639, 20; + add.s32 %r1641, %r1635, %r1165; + add.s32 %r1642, %r1641, %r1640; + xor.b32 %r1643, %r1642, %r1637; + shf.l.wrap.b32 %r1644, %r1643, %r1643, 24; + add.s32 %r1645, %r1644, %r1638; + xor.b32 %r1646, %r1645, %r1640; + shf.l.wrap.b32 %r1647, %r1646, %r1646, 25; + add.s32 %r1648, %r1600, %r1137; + add.s32 %r1649, %r1648, %r1591; + xor.b32 %r1650, %r1649, %r1616; + shf.l.wrap.b32 %r1651, %r1650, %r1650, 16; + add.s32 %r1652, %r1651, %r1631; + xor.b32 %r1653, %r1652, %r1591; + shf.l.wrap.b32 %r1654, %r1653, %r1653, 20; + add.s32 %r1655, %r1649, %r1151; + add.s32 %r1656, %r1655, %r1654; + xor.b32 %r1657, %r1656, %r1651; + shf.l.wrap.b32 %r1658, %r1657, %r1657, 24; + add.s32 %r1659, %r1658, %r1652; + xor.b32 %r1660, %r1659, %r1654; + shf.l.wrap.b32 %r1661, %r1660, %r1660, 25; + add.s32 %r1662, %r1614, %r1179; + add.s32 %r1663, %r1662, %r1605; + xor.b32 %r1664, %r1663, %r1630; + shf.l.wrap.b32 %r1665, %r1664, %r1664, 16; + add.s32 %r1666, %r1665, %r1589; + xor.b32 %r1667, %r1666, %r1605; + shf.l.wrap.b32 %r1668, %r1667, %r1667, 20; + add.s32 %r1669, %r1663, %r1144; + add.s32 %r1670, %r1669, %r1668; + xor.b32 %r1671, %r1670, %r1665; + shf.l.wrap.b32 %r1672, %r1671, %r1671, 24; + add.s32 %r1673, %r1672, %r1666; + xor.b32 %r1674, %r1673, %r1668; + shf.l.wrap.b32 %r1675, %r1674, %r1674, 25; + add.s32 %r1676, %r1628, %r1172; + add.s32 %r1677, %r1676, %r1619; + xor.b32 %r1678, %r1677, %r1588; + shf.l.wrap.b32 %r1679, %r1678, %r1678, 16; + add.s32 %r1680, %r1679, %r1603; + xor.b32 %r1681, %r1680, %r1619; + shf.l.wrap.b32 %r1682, %r1681, %r1681, 20; + add.s32 %r1683, %r1677, %r1130; + add.s32 %r1684, %r1683, %r1682; + xor.b32 %r1685, %r1684, %r1679; + shf.l.wrap.b32 %r1686, %r1685, %r1685, 24; + add.s32 %r1687, %r1686, %r1680; + xor.b32 %r1688, %r1687, %r1682; + shf.l.wrap.b32 %r1689, %r1688, %r1688, 25; + add.s32 %r1690, %r1642, %r1123; + 
add.s32 %r1691, %r1690, %r1661; + xor.b32 %r1692, %r1691, %r1686; + shf.l.wrap.b32 %r1693, %r1692, %r1692, 16; + add.s32 %r1694, %r1693, %r1673; + xor.b32 %r1695, %r1694, %r1661; + shf.l.wrap.b32 %r1696, %r1695, %r1695, 20; + add.s32 %r1697, %r1691, %r1088; + add.s32 %r1698, %r1697, %r1696; + xor.b32 %r1699, %r1698, %r1693; + shf.l.wrap.b32 %r1700, %r1699, %r1699, 24; + add.s32 %r1701, %r1700, %r1694; + xor.b32 %r1702, %r1701, %r1696; + shf.l.wrap.b32 %r1703, %r1702, %r1702, 25; + add.s32 %r1704, %r1656, %r1109; + add.s32 %r1705, %r1704, %r1675; + xor.b32 %r1706, %r1705, %r1644; + shf.l.wrap.b32 %r1707, %r1706, %r1706, 16; + add.s32 %r1708, %r1707, %r1687; + xor.b32 %r1709, %r1708, %r1675; + shf.l.wrap.b32 %r1710, %r1709, %r1709, 20; + add.s32 %r1711, %r1705, %r1095; + add.s32 %r1712, %r1711, %r1710; + xor.b32 %r1713, %r1712, %r1707; + shf.l.wrap.b32 %r1714, %r1713, %r1713, 24; + add.s32 %r1715, %r1714, %r1708; + xor.b32 %r1716, %r1715, %r1710; + shf.l.wrap.b32 %r1717, %r1716, %r1716, 25; + add.s32 %r1718, %r1670, %r1074; + add.s32 %r1719, %r1718, %r1689; + xor.b32 %r1720, %r1719, %r1658; + shf.l.wrap.b32 %r1721, %r1720, %r1720, 16; + add.s32 %r1722, %r1721, %r1645; + xor.b32 %r1723, %r1722, %r1689; + shf.l.wrap.b32 %r1724, %r1723, %r1723, 20; + add.s32 %r1725, %r1719, %r1081; + add.s32 %r1726, %r1725, %r1724; + xor.b32 %r1727, %r1726, %r1721; + shf.l.wrap.b32 %r1728, %r1727, %r1727, 24; + add.s32 %r1729, %r1728, %r1722; + xor.b32 %r1730, %r1729, %r1724; + shf.l.wrap.b32 %r1731, %r1730, %r1730, 25; + add.s32 %r1732, %r1684, %r1116; + add.s32 %r1733, %r1732, %r1647; + xor.b32 %r1734, %r1733, %r1672; + shf.l.wrap.b32 %r1735, %r1734, %r1734, 16; + add.s32 %r1736, %r1735, %r1659; + xor.b32 %r1737, %r1736, %r1647; + shf.l.wrap.b32 %r1738, %r1737, %r1737, 20; + add.s32 %r1739, %r1733, %r1102; + add.s32 %r1740, %r1739, %r1738; + xor.b32 %r1741, %r1740, %r1735; + shf.l.wrap.b32 %r1742, %r1741, %r1741, 24; + add.s32 %r1743, %r1742, %r1736; + xor.b32 %r1744, %r1743, %r1738; + shf.l.wrap.b32 %r1745, %r1744, %r1744, 25; + add.s32 %r1746, %r1698, %r1137; + add.s32 %r1747, %r1746, %r1745; + xor.b32 %r1748, %r1747, %r1714; + shf.l.wrap.b32 %r1749, %r1748, %r1748, 16; + add.s32 %r1750, %r1749, %r1729; + xor.b32 %r1751, %r1750, %r1745; + shf.l.wrap.b32 %r1752, %r1751, %r1751, 20; + add.s32 %r1753, %r1747, %r1172; + add.s32 %r1754, %r1753, %r1752; + xor.b32 %r1755, %r1754, %r1749; + shf.l.wrap.b32 %r1756, %r1755, %r1755, 24; + add.s32 %r1757, %r1756, %r1750; + xor.b32 %r1758, %r1757, %r1752; + shf.l.wrap.b32 %r1759, %r1758, %r1758, 25; + add.s32 %r1760, %r1712, %r1151; + add.s32 %r1761, %r1760, %r1703; + xor.b32 %r1762, %r1761, %r1728; + shf.l.wrap.b32 %r1763, %r1762, %r1762, 16; + add.s32 %r1764, %r1763, %r1743; + xor.b32 %r1765, %r1764, %r1703; + shf.l.wrap.b32 %r1766, %r1765, %r1765, 20; + add.s32 %r1767, %r1761, %r1109; + add.s32 %r1768, %r1767, %r1766; + xor.b32 %r1769, %r1768, %r1763; + shf.l.wrap.b32 %r1770, %r1769, %r1769, 24; + add.s32 %r1771, %r1770, %r1764; + xor.b32 %r1772, %r1771, %r1766; + shf.l.wrap.b32 %r1773, %r1772, %r1772, 25; + add.s32 %r1774, %r1726, %r1130; + add.s32 %r1775, %r1774, %r1717; + xor.b32 %r1776, %r1775, %r1742; + shf.l.wrap.b32 %r1777, %r1776, %r1776, 16; + add.s32 %r1778, %r1777, %r1701; + xor.b32 %r1779, %r1778, %r1717; + shf.l.wrap.b32 %r1780, %r1779, %r1779, 20; + add.s32 %r1781, %r1775, %r1158; + add.s32 %r1782, %r1781, %r1780; + xor.b32 %r1783, %r1782, %r1777; + shf.l.wrap.b32 %r1784, %r1783, %r1783, 24; + add.s32 %r1785, %r1784, %r1778; + xor.b32 %r1786, %r1785, 
%r1780; + shf.l.wrap.b32 %r1787, %r1786, %r1786, 25; + add.s32 %r1788, %r1740, %r1179; + add.s32 %r1789, %r1788, %r1731; + xor.b32 %r1790, %r1789, %r1700; + shf.l.wrap.b32 %r1791, %r1790, %r1790, 16; + add.s32 %r1792, %r1791, %r1715; + xor.b32 %r1793, %r1792, %r1731; + shf.l.wrap.b32 %r1794, %r1793, %r1793, 20; + add.s32 %r1795, %r1789, %r1081; + add.s32 %r1796, %r1795, %r1794; + xor.b32 %r1797, %r1796, %r1791; + shf.l.wrap.b32 %r1798, %r1797, %r1797, 24; + add.s32 %r1799, %r1798, %r1792; + xor.b32 %r1800, %r1799, %r1794; + shf.l.wrap.b32 %r1801, %r1800, %r1800, 25; + add.s32 %r1802, %r1754, %r1165; + add.s32 %r1803, %r1802, %r1773; + xor.b32 %r1804, %r1803, %r1798; + shf.l.wrap.b32 %r1805, %r1804, %r1804, 16; + add.s32 %r1806, %r1805, %r1785; + xor.b32 %r1807, %r1806, %r1773; + shf.l.wrap.b32 %r1808, %r1807, %r1807, 20; + add.s32 %r1809, %r1803, %r1095; + add.s32 %r1810, %r1809, %r1808; + xor.b32 %r1811, %r1810, %r1805; + shf.l.wrap.b32 %r1812, %r1811, %r1811, 24; + add.s32 %r1813, %r1812, %r1806; + xor.b32 %r1814, %r1813, %r1808; + shf.l.wrap.b32 %r1815, %r1814, %r1814, 25; + add.s32 %r1816, %r1768, %r1074; + add.s32 %r1817, %r1816, %r1787; + xor.b32 %r1818, %r1817, %r1756; + shf.l.wrap.b32 %r1819, %r1818, %r1818, 16; + add.s32 %r1820, %r1819, %r1799; + xor.b32 %r1821, %r1820, %r1787; + shf.l.wrap.b32 %r1822, %r1821, %r1821, 20; + add.s32 %r1823, %r1817, %r1144; + add.s32 %r1824, %r1823, %r1822; + xor.b32 %r1825, %r1824, %r1819; + shf.l.wrap.b32 %r1826, %r1825, %r1825, 24; + add.s32 %r1827, %r1826, %r1820; + xor.b32 %r1828, %r1827, %r1822; + shf.l.wrap.b32 %r1829, %r1828, %r1828, 25; + add.s32 %r1830, %r1782, %r1088; + add.s32 %r1831, %r1830, %r1801; + xor.b32 %r1832, %r1831, %r1770; + shf.l.wrap.b32 %r1833, %r1832, %r1832, 16; + add.s32 %r1834, %r1833, %r1757; + xor.b32 %r1835, %r1834, %r1801; + shf.l.wrap.b32 %r1836, %r1835, %r1835, 20; + add.s32 %r1837, %r1831, %r1116; + add.s32 %r1838, %r1837, %r1836; + xor.b32 %r1839, %r1838, %r1833; + shf.l.wrap.b32 %r1840, %r1839, %r1839, 24; + add.s32 %r1841, %r1840, %r1834; + xor.b32 %r1842, %r1841, %r1836; + shf.l.wrap.b32 %r1843, %r1842, %r1842, 25; + add.s32 %r1844, %r1796, %r1102; + add.s32 %r1845, %r1844, %r1759; + xor.b32 %r1846, %r1845, %r1784; + shf.l.wrap.b32 %r1847, %r1846, %r1846, 16; + add.s32 %r1848, %r1847, %r1771; + xor.b32 %r1849, %r1848, %r1759; + shf.l.wrap.b32 %r1850, %r1849, %r1849, 20; + add.s32 %r1851, %r1845, %r1123; + add.s32 %r1852, %r1851, %r1850; + xor.b32 %r1853, %r1852, %r1847; + shf.l.wrap.b32 %r1854, %r1853, %r1853, 24; + add.s32 %r1855, %r1854, %r1848; + xor.b32 %r1856, %r1855, %r1850; + shf.l.wrap.b32 %r1857, %r1856, %r1856, 25; + add.s32 %r1858, %r1810, %r1151; + add.s32 %r1859, %r1858, %r1857; + xor.b32 %r1860, %r1859, %r1826; + shf.l.wrap.b32 %r1861, %r1860, %r1860, 16; + add.s32 %r1862, %r1861, %r1841; + xor.b32 %r1863, %r1862, %r1857; + shf.l.wrap.b32 %r1864, %r1863, %r1863, 20; + add.s32 %r1865, %r1859, %r1179; + add.s32 %r1866, %r1865, %r1864; + xor.b32 %r1867, %r1866, %r1861; + shf.l.wrap.b32 %r1868, %r1867, %r1867, 24; + add.s32 %r1869, %r1868, %r1862; + xor.b32 %r1870, %r1869, %r1864; + shf.l.wrap.b32 %r1871, %r1870, %r1870, 25; + add.s32 %r1872, %r1824, %r1109; + add.s32 %r1873, %r1872, %r1815; + xor.b32 %r1874, %r1873, %r1840; + shf.l.wrap.b32 %r1875, %r1874, %r1874, 16; + add.s32 %r1876, %r1875, %r1855; + xor.b32 %r1877, %r1876, %r1815; + shf.l.wrap.b32 %r1878, %r1877, %r1877, 20; + add.s32 %r1879, %r1873, %r1074; + add.s32 %r1880, %r1879, %r1878; + xor.b32 %r1881, %r1880, %r1875; + shf.l.wrap.b32 
%r1882, %r1881, %r1881, 24; + add.s32 %r1883, %r1882, %r1876; + xor.b32 %r1884, %r1883, %r1878; + shf.l.wrap.b32 %r1885, %r1884, %r1884, 25; + add.s32 %r1886, %r1838, %r1081; + add.s32 %r1887, %r1886, %r1829; + xor.b32 %r1888, %r1887, %r1854; + shf.l.wrap.b32 %r1889, %r1888, %r1888, 16; + add.s32 %r1890, %r1889, %r1813; + xor.b32 %r1891, %r1890, %r1829; + shf.l.wrap.b32 %r1892, %r1891, %r1891, 20; + add.s32 %r1893, %r1887, %r1137; + add.s32 %r1894, %r1893, %r1892; + xor.b32 %r1895, %r1894, %r1889; + shf.l.wrap.b32 %r1896, %r1895, %r1895, 24; + add.s32 %r1897, %r1896, %r1890; + xor.b32 %r1898, %r1897, %r1892; + shf.l.wrap.b32 %r1899, %r1898, %r1898, 25; + add.s32 %r1900, %r1852, %r1130; + add.s32 %r1901, %r1900, %r1843; + xor.b32 %r1902, %r1901, %r1812; + shf.l.wrap.b32 %r1903, %r1902, %r1902, 16; + add.s32 %r1904, %r1903, %r1827; + xor.b32 %r1905, %r1904, %r1843; + shf.l.wrap.b32 %r1906, %r1905, %r1905, 20; + add.s32 %r1907, %r1901, %r1116; + add.s32 %r1908, %r1907, %r1906; + xor.b32 %r1909, %r1908, %r1903; + shf.l.wrap.b32 %r1910, %r1909, %r1909, 24; + add.s32 %r1911, %r1910, %r1904; + xor.b32 %r1912, %r1911, %r1906; + shf.l.wrap.b32 %r1913, %r1912, %r1912, 25; + add.s32 %r1914, %r1866, %r1172; + add.s32 %r1915, %r1914, %r1885; + xor.b32 %r1916, %r1915, %r1910; + shf.l.wrap.b32 %r1917, %r1916, %r1916, 16; + add.s32 %r1918, %r1917, %r1897; + xor.b32 %r1919, %r1918, %r1885; + shf.l.wrap.b32 %r1920, %r1919, %r1919, 20; + add.s32 %r1921, %r1915, %r1144; + add.s32 %r1922, %r1921, %r1920; + xor.b32 %r1923, %r1922, %r1917; + shf.l.wrap.b32 %r1924, %r1923, %r1923, 24; + add.s32 %r1925, %r1924, %r1918; + xor.b32 %r1926, %r1925, %r1920; + shf.l.wrap.b32 %r1927, %r1926, %r1926, 25; + add.s32 %r1928, %r1880, %r1088; + add.s32 %r1929, %r1928, %r1899; + xor.b32 %r1930, %r1929, %r1868; + shf.l.wrap.b32 %r1931, %r1930, %r1930, 16; + add.s32 %r1932, %r1931, %r1911; + xor.b32 %r1933, %r1932, %r1899; + shf.l.wrap.b32 %r1934, %r1933, %r1933, 20; + add.s32 %r1935, %r1929, %r1158; + add.s32 %r1936, %r1935, %r1934; + xor.b32 %r1937, %r1936, %r1931; + shf.l.wrap.b32 %r1938, %r1937, %r1937, 24; + add.s32 %r1939, %r1938, %r1932; + xor.b32 %r1940, %r1939, %r1934; + shf.l.wrap.b32 %r1941, %r1940, %r1940, 25; + add.s32 %r1942, %r1894, %r1095; + add.s32 %r1943, %r1942, %r1913; + xor.b32 %r1944, %r1943, %r1882; + shf.l.wrap.b32 %r1945, %r1944, %r1944, 16; + add.s32 %r1946, %r1945, %r1869; + xor.b32 %r1947, %r1946, %r1913; + shf.l.wrap.b32 %r1948, %r1947, %r1947, 20; + add.s32 %r1949, %r1943, %r1102; + add.s32 %r1950, %r1949, %r1948; + xor.b32 %r1951, %r1950, %r1945; + shf.l.wrap.b32 %r1952, %r1951, %r1951, 24; + add.s32 %r1953, %r1952, %r1946; + xor.b32 %r1954, %r1953, %r1948; + shf.l.wrap.b32 %r1955, %r1954, %r1954, 25; + add.s32 %r1956, %r1908, %r1123; + add.s32 %r1957, %r1956, %r1871; + xor.b32 %r1958, %r1957, %r1896; + shf.l.wrap.b32 %r1959, %r1958, %r1958, 16; + add.s32 %r1960, %r1959, %r1883; + xor.b32 %r1961, %r1960, %r1871; + shf.l.wrap.b32 %r1962, %r1961, %r1961, 20; + add.s32 %r1963, %r1957, %r1165; + add.s32 %r1964, %r1963, %r1962; + xor.b32 %r1965, %r1964, %r1959; + shf.l.wrap.b32 %r1966, %r1965, %r1965, 24; + add.s32 %r1967, %r1966, %r1960; + xor.b32 %r1968, %r1967, %r1962; + shf.l.wrap.b32 %r1969, %r1968, %r1968, 25; + xor.b32 %r11657, %r1953, %r1922; + st.local.u32 [%rd3+-104], %r11657; + xor.b32 %r11656, %r1967, %r1936; + st.local.u32 [%rd3+-100], %r11656; + xor.b32 %r11655, %r1925, %r1950; + st.local.u32 [%rd3+-96], %r11655; + xor.b32 %r11654, %r1939, %r1964; + st.local.u32 [%rd3+-92], %r11654; + 
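+ // NOTE: the xor.b32/st.local.u32 pairs here write the compressed output: each stored word is the xor of two halves of the 16-word state, i.e. the BLAKE-style fold into the 8-word chaining value at [%rd3-104 .. %rd3-76].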
xor.b32 %r11653, %r1969, %r1938;
+ st.local.u32 [%rd3+-88], %r11653;
+ xor.b32 %r11652, %r1927, %r1952;
+ st.local.u32 [%rd3+-84], %r11652;
+ xor.b32 %r11651, %r1941, %r1966;
+ st.local.u32 [%rd3+-80], %r11651;
+ xor.b32 %r11650, %r1955, %r1924;
+ st.local.u32 [%rd3+-76], %r11650;
+ add.s16 %rs352, %rs352, 1;
+ st.local.u8 [%rd3+1], %rs352;
+ add.s64 %rd260, %rd260, 64;
+ add.s64 %rd243, %rd243, -64;
+ setp.gt.u64 %p10, %rd243, 64;
+ @%p10 bra $L__BB1_11;
+
+$L__BB1_12:
+ cvt.u64.u16 %rd120, %rs351;
+ and.b64 %rd24, %rd120, 255;
+ mov.u64 %rd121, 64;
+ sub.s64 %rd122, %rd121, %rd24;
+ min.u64 %rd25, %rd122, %rd243;
+ setp.eq.s64 %p11, %rd25, 0;
+ @%p11 bra $L__BB1_15;
+
+ add.s64 %rd124, %rd2, %rd24;
+ add.s64 %rd26, %rd124, 72;
+ mov.u64 %rd244, 0;
+
+$L__BB1_14:
+ add.s64 %rd125, %rd260, %rd244;
+ ld.local.u8 %rs119, [%rd125];
+ add.s64 %rd126, %rd26, %rd244;
+ st.local.u8 [%rd126], %rs119;
+ add.s64 %rd244, %rd244, 1;
+ setp.lt.u64 %p12, %rd244, %rd25;
+ @%p12 bra $L__BB1_14;
+
+$L__BB1_15:
+ cvt.u16.u64 %rs120, %rd25;
+ ld.local.u8 %rs121, [%rd3];
+ add.s16 %rs13, %rs121, %rs120;
+ st.local.u8 [%rd3], %rs13;
+ mov.u64 %rd127, 32;
+ sub.s64 %rd29, %rd127, %rd6;
+ setp.eq.s64 %p13, %rd29, 0;
+ @%p13 bra $L__BB1_68;
+
+ ld.local.u8 %rs122, [%rd3+1];
+ setp.eq.s16 %p14, %rs122, 0;
+ selp.u16 %rs123, 1, 0, %p14;
+ ld.local.u8 %rs124, [%rd3+2];
+ or.b16 %rs125, %rs124, %rs123;
+ or.b16 %rs126, %rs125, 2;
+ ld.local.u8 %r1970, [%rd3+-64];
+ ld.local.u8 %r1971, [%rd3+-63];
+ prmt.b32 %r1972, %r1971, %r1970, 30212;
+ ld.local.u8 %r1973, [%rd3+-62];
+ prmt.b32 %r1974, %r1973, %r1972, 28756;
+ ld.local.u8 %r1975, [%rd3+-61];
+ prmt.b32 %r1976, %r1975, %r1974, 1620;
+ ld.local.u8 %r1977, [%rd3+-60];
+ ld.local.u8 %r1978, [%rd3+-59];
+ prmt.b32 %r1979, %r1978, %r1977, 30212;
+ ld.local.u8 %r1980, [%rd3+-58];
+ prmt.b32 %r1981, %r1980, %r1979, 28756;
+ ld.local.u8 %r1982, [%rd3+-57];
+ prmt.b32 %r1983, %r1982, %r1981, 1620;
+ ld.local.u8 %r1984, [%rd3+-56];
+ ld.local.u8 %r1985, [%rd3+-55];
+ prmt.b32 %r1986, %r1985, %r1984, 30212;
+ ld.local.u8 %r1987, [%rd3+-54];
+ prmt.b32 %r1988, %r1987, %r1986, 28756;
+ ld.local.u8 %r1989, [%rd3+-53];
+ prmt.b32 %r1990, %r1989, %r1988, 1620;
+ ld.local.u8 %r1991, [%rd3+-52];
+ ld.local.u8 %r1992, [%rd3+-51];
+ prmt.b32 %r1993, %r1992, %r1991, 30212;
+ ld.local.u8 %r1994, [%rd3+-50];
+ prmt.b32 %r1995, %r1994, %r1993, 28756;
+ ld.local.u8 %r1996, [%rd3+-49];
+ prmt.b32 %r1997, %r1996, %r1995, 1620;
+ ld.local.u8 %r1998, [%rd3+-48];
+ ld.local.u8 %r1999, [%rd3+-47];
+ prmt.b32 %r2000, %r1999, %r1998, 30212;
+ ld.local.u8 %r2001, [%rd3+-46];
+ prmt.b32 %r2002, %r2001, %r2000, 28756;
+ ld.local.u8 %r2003, [%rd3+-45];
+ prmt.b32 %r2004, %r2003, %r2002, 1620;
+ ld.local.u8 %r2005, [%rd3+-44];
+ ld.local.u8 %r2006, [%rd3+-43];
+ prmt.b32 %r2007, %r2006, %r2005, 30212;
+ ld.local.u8 %r2008, [%rd3+-42];
+ prmt.b32 %r2009, %r2008, %r2007, 28756;
+ ld.local.u8 %r2010, [%rd3+-41];
+ prmt.b32 %r2011, %r2010, %r2009, 1620;
+ ld.local.u8 %r2012, [%rd3+-40];
+ ld.local.u8 %r2013, [%rd3+-39];
+ prmt.b32 %r2014, %r2013, %r2012, 30212;
+ ld.local.u8 %r2015, [%rd3+-38];
+ prmt.b32 %r2016, %r2015, %r2014, 28756;
+ ld.local.u8 %r2017, [%rd3+-37];
+ prmt.b32 %r2018, %r2017, %r2016, 1620;
+ ld.local.u8 %r2019, [%rd3+-36];
+ ld.local.u8 %r2020, [%rd3+-35];
+ prmt.b32 %r2021, %r2020, %r2019, 30212;
+ ld.local.u8 %r2022, [%rd3+-34];
+ prmt.b32 %r2023, %r2022, %r2021, 28756;
+ ld.local.u8 %r2024, [%rd3+-33];
+ prmt.b32 %r2025, %r2024, %r2023, 1620;
+ ld.local.u8 %r2026, [%rd3+-32];
+
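+ // NOTE: each ld.local.u8/prmt.b32 cluster packs four buffer bytes into one little-endian 32-bit word; sixteen such words (%r1976, %r1983, ..., %r2081) cover the 64-byte block at [%rd3-64 .. %rd3-1].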
ld.local.u8 %r2027, [%rd3+-31]; + prmt.b32 %r2028, %r2027, %r2026, 30212; + ld.local.u8 %r2029, [%rd3+-30]; + prmt.b32 %r2030, %r2029, %r2028, 28756; + ld.local.u8 %r2031, [%rd3+-29]; + prmt.b32 %r2032, %r2031, %r2030, 1620; + ld.local.u8 %r2033, [%rd3+-28]; + ld.local.u8 %r2034, [%rd3+-27]; + prmt.b32 %r2035, %r2034, %r2033, 30212; + ld.local.u8 %r2036, [%rd3+-26]; + prmt.b32 %r2037, %r2036, %r2035, 28756; + ld.local.u8 %r2038, [%rd3+-25]; + prmt.b32 %r2039, %r2038, %r2037, 1620; + ld.local.u8 %r2040, [%rd3+-24]; + ld.local.u8 %r2041, [%rd3+-23]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd3+-22]; + prmt.b32 %r2044, %r2043, %r2042, 28756; + ld.local.u8 %r2045, [%rd3+-21]; + prmt.b32 %r2046, %r2045, %r2044, 1620; + ld.local.u8 %r2047, [%rd3+-20]; + ld.local.u8 %r2048, [%rd3+-19]; + prmt.b32 %r2049, %r2048, %r2047, 30212; + ld.local.u8 %r2050, [%rd3+-18]; + prmt.b32 %r2051, %r2050, %r2049, 28756; + ld.local.u8 %r2052, [%rd3+-17]; + prmt.b32 %r2053, %r2052, %r2051, 1620; + ld.local.u8 %r2054, [%rd3+-16]; + ld.local.u8 %r2055, [%rd3+-15]; + prmt.b32 %r2056, %r2055, %r2054, 30212; + ld.local.u8 %r2057, [%rd3+-14]; + prmt.b32 %r2058, %r2057, %r2056, 28756; + ld.local.u8 %r2059, [%rd3+-13]; + prmt.b32 %r2060, %r2059, %r2058, 1620; + ld.local.u8 %r2061, [%rd3+-12]; + ld.local.u8 %r2062, [%rd3+-11]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + ld.local.u8 %r2064, [%rd3+-10]; + prmt.b32 %r2065, %r2064, %r2063, 28756; + ld.local.u8 %r2066, [%rd3+-9]; + prmt.b32 %r2067, %r2066, %r2065, 1620; + ld.local.u8 %r2068, [%rd3+-8]; + ld.local.u8 %r2069, [%rd3+-7]; + prmt.b32 %r2070, %r2069, %r2068, 30212; + ld.local.u8 %r2071, [%rd3+-6]; + prmt.b32 %r2072, %r2071, %r2070, 28756; + ld.local.u8 %r2073, [%rd3+-5]; + prmt.b32 %r2074, %r2073, %r2072, 1620; + ld.local.u8 %r2075, [%rd3+-4]; + ld.local.u8 %r2076, [%rd3+-3]; + prmt.b32 %r2077, %r2076, %r2075, 30212; + ld.local.u8 %r2078, [%rd3+-2]; + prmt.b32 %r2079, %r2078, %r2077, 28756; + ld.local.u8 %r2080, [%rd3+-1]; + prmt.b32 %r2081, %r2080, %r2079, 1620; + ld.local.u64 %rd128, [%rd3+-72]; + cvt.u32.u64 %r2082, %rd128; + shr.u64 %rd129, %rd128, 32; + cvt.u32.u64 %r2083, %rd129; + cvt.u32.u16 %r2084, %rs126; + and.b32 %r2085, %r2084, 255; + ld.local.u8 %r2086, [%rd3+-88]; + ld.local.u8 %r2087, [%rd3+-87]; + prmt.b32 %r2088, %r2087, %r2086, 30212; + ld.local.u8 %r2089, [%rd3+-86]; + ld.local.u8 %r2090, [%rd3+-85]; + prmt.b32 %r2091, %r2090, %r2089, 30212; + prmt.b32 %r2092, %r2091, %r2088, 4180; + ld.local.u8 %r2093, [%rd3+-104]; + ld.local.u8 %r2094, [%rd3+-103]; + prmt.b32 %r2095, %r2094, %r2093, 30212; + ld.local.u8 %r2096, [%rd3+-102]; + ld.local.u8 %r2097, [%rd3+-101]; + prmt.b32 %r2098, %r2097, %r2096, 30212; + prmt.b32 %r2099, %r2098, %r2095, 4180; + add.s32 %r2100, %r2092, %r2099; + add.s32 %r2101, %r2100, %r1976; + xor.b32 %r2102, %r2101, %r2082; + shf.l.wrap.b32 %r2103, %r2102, %r2102, 16; + add.s32 %r2104, %r2103, 1779033703; + xor.b32 %r2105, %r2104, %r2092; + shf.l.wrap.b32 %r2106, %r2105, %r2105, 20; + add.s32 %r2107, %r1983, %r2101; + add.s32 %r2108, %r2107, %r2106; + xor.b32 %r2109, %r2108, %r2103; + shf.l.wrap.b32 %r2110, %r2109, %r2109, 24; + add.s32 %r2111, %r2110, %r2104; + xor.b32 %r2112, %r2111, %r2106; + shf.l.wrap.b32 %r2113, %r2112, %r2112, 25; + ld.local.u8 %r2114, [%rd3+-84]; + ld.local.u8 %r2115, [%rd3+-83]; + prmt.b32 %r2116, %r2115, %r2114, 30212; + ld.local.u8 %r2117, [%rd3+-82]; + ld.local.u8 %r2118, [%rd3+-81]; + prmt.b32 %r2119, %r2118, %r2117, 30212; + prmt.b32 %r2120, %r2119, %r2116, 4180; + ld.local.u8 
%r2121, [%rd3+-100]; + ld.local.u8 %r2122, [%rd3+-99]; + prmt.b32 %r2123, %r2122, %r2121, 30212; + ld.local.u8 %r2124, [%rd3+-98]; + ld.local.u8 %r2125, [%rd3+-97]; + prmt.b32 %r2126, %r2125, %r2124, 30212; + prmt.b32 %r2127, %r2126, %r2123, 4180; + add.s32 %r2128, %r2120, %r2127; + add.s32 %r2129, %r2128, %r1990; + xor.b32 %r2130, %r2129, %r2083; + shf.l.wrap.b32 %r2131, %r2130, %r2130, 16; + add.s32 %r2132, %r2131, -1150833019; + xor.b32 %r2133, %r2132, %r2120; + shf.l.wrap.b32 %r2134, %r2133, %r2133, 20; + add.s32 %r2135, %r1997, %r2129; + add.s32 %r2136, %r2135, %r2134; + xor.b32 %r2137, %r2136, %r2131; + shf.l.wrap.b32 %r2138, %r2137, %r2137, 24; + add.s32 %r2139, %r2138, %r2132; + xor.b32 %r2140, %r2139, %r2134; + shf.l.wrap.b32 %r2141, %r2140, %r2140, 25; + ld.local.u8 %r2142, [%rd3+-80]; + ld.local.u8 %r2143, [%rd3+-79]; + prmt.b32 %r2144, %r2143, %r2142, 30212; + ld.local.u8 %r2145, [%rd3+-78]; + ld.local.u8 %r2146, [%rd3+-77]; + prmt.b32 %r2147, %r2146, %r2145, 30212; + prmt.b32 %r2148, %r2147, %r2144, 4180; + ld.local.u8 %r2149, [%rd3+-96]; + ld.local.u8 %r2150, [%rd3+-95]; + prmt.b32 %r2151, %r2150, %r2149, 30212; + ld.local.u8 %r2152, [%rd3+-94]; + ld.local.u8 %r2153, [%rd3+-93]; + prmt.b32 %r2154, %r2153, %r2152, 30212; + prmt.b32 %r2155, %r2154, %r2151, 4180; + add.s32 %r2156, %r2148, %r2155; + add.s32 %r2157, %r2156, %r2004; + cvt.u32.u16 %r2158, %rs13; + and.b32 %r2159, %r2158, 255; + xor.b32 %r2160, %r2157, %r2159; + shr.u32 %r2161, %r2157, 16; + shl.b32 %r2162, %r2160, 16; + or.b32 %r2163, %r2162, %r2161; + add.s32 %r2164, %r2163, 1013904242; + xor.b32 %r2165, %r2164, %r2148; + shf.l.wrap.b32 %r2166, %r2165, %r2165, 20; + add.s32 %r2167, %r2011, %r2157; + add.s32 %r2168, %r2167, %r2166; + xor.b32 %r2169, %r2168, %r2163; + shf.l.wrap.b32 %r2170, %r2169, %r2169, 24; + add.s32 %r2171, %r2170, %r2164; + xor.b32 %r2172, %r2171, %r2166; + shf.l.wrap.b32 %r2173, %r2172, %r2172, 25; + ld.local.u8 %r2174, [%rd3+-76]; + ld.local.u8 %r2175, [%rd3+-75]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.local.u8 %r2177, [%rd3+-74]; + ld.local.u8 %r2178, [%rd3+-73]; + prmt.b32 %r2179, %r2178, %r2177, 30212; + prmt.b32 %r2180, %r2179, %r2176, 4180; + ld.local.u8 %r2181, [%rd3+-92]; + ld.local.u8 %r2182, [%rd3+-91]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.local.u8 %r2184, [%rd3+-90]; + ld.local.u8 %r2185, [%rd3+-89]; + prmt.b32 %r2186, %r2185, %r2184, 30212; + prmt.b32 %r2187, %r2186, %r2183, 4180; + add.s32 %r2188, %r2180, %r2187; + add.s32 %r2189, %r2188, %r2018; + xor.b32 %r2190, %r2189, %r2085; + shr.u32 %r2191, %r2189, 16; + shl.b32 %r2192, %r2190, 16; + or.b32 %r2193, %r2192, %r2191; + add.s32 %r2194, %r2193, -1521486534; + xor.b32 %r2195, %r2194, %r2180; + shf.l.wrap.b32 %r2196, %r2195, %r2195, 20; + add.s32 %r2197, %r2025, %r2189; + add.s32 %r2198, %r2197, %r2196; + xor.b32 %r2199, %r2198, %r2193; + shf.l.wrap.b32 %r2200, %r2199, %r2199, 24; + add.s32 %r2201, %r2200, %r2194; + xor.b32 %r2202, %r2201, %r2196; + shf.l.wrap.b32 %r2203, %r2202, %r2202, 25; + add.s32 %r2204, %r2141, %r2108; + add.s32 %r2205, %r2204, %r2032; + xor.b32 %r2206, %r2200, %r2205; + shf.l.wrap.b32 %r2207, %r2206, %r2206, 16; + add.s32 %r2208, %r2207, %r2171; + xor.b32 %r2209, %r2208, %r2141; + shf.l.wrap.b32 %r2210, %r2209, %r2209, 20; + add.s32 %r2211, %r2039, %r2205; + add.s32 %r2212, %r2211, %r2210; + xor.b32 %r2213, %r2212, %r2207; + shf.l.wrap.b32 %r2214, %r2213, %r2213, 24; + add.s32 %r2215, %r2214, %r2208; + xor.b32 %r2216, %r2215, %r2210; + shf.l.wrap.b32 %r2217, %r2216, %r2216, 25; + add.s32 
%r2218, %r2173, %r2136; + add.s32 %r2219, %r2218, %r2046; + xor.b32 %r2220, %r2219, %r2110; + shf.l.wrap.b32 %r2221, %r2220, %r2220, 16; + add.s32 %r2222, %r2221, %r2201; + xor.b32 %r2223, %r2222, %r2173; + shf.l.wrap.b32 %r2224, %r2223, %r2223, 20; + add.s32 %r2225, %r2053, %r2219; + add.s32 %r2226, %r2225, %r2224; + xor.b32 %r2227, %r2226, %r2221; + shf.l.wrap.b32 %r2228, %r2227, %r2227, 24; + add.s32 %r2229, %r2228, %r2222; + xor.b32 %r2230, %r2229, %r2224; + shf.l.wrap.b32 %r2231, %r2230, %r2230, 25; + add.s32 %r2232, %r2203, %r2168; + add.s32 %r2233, %r2232, %r2060; + xor.b32 %r2234, %r2233, %r2138; + shf.l.wrap.b32 %r2235, %r2234, %r2234, 16; + add.s32 %r2236, %r2235, %r2111; + xor.b32 %r2237, %r2236, %r2203; + shf.l.wrap.b32 %r2238, %r2237, %r2237, 20; + add.s32 %r2239, %r2067, %r2233; + add.s32 %r2240, %r2239, %r2238; + xor.b32 %r2241, %r2240, %r2235; + shf.l.wrap.b32 %r2242, %r2241, %r2241, 24; + add.s32 %r2243, %r2242, %r2236; + xor.b32 %r2244, %r2243, %r2238; + shf.l.wrap.b32 %r2245, %r2244, %r2244, 25; + add.s32 %r2246, %r2198, %r2113; + add.s32 %r2247, %r2246, %r2074; + xor.b32 %r2248, %r2247, %r2170; + shf.l.wrap.b32 %r2249, %r2248, %r2248, 16; + add.s32 %r2250, %r2249, %r2139; + xor.b32 %r2251, %r2250, %r2113; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 20; + add.s32 %r2253, %r2081, %r2247; + add.s32 %r2254, %r2253, %r2252; + xor.b32 %r2255, %r2254, %r2249; + shf.l.wrap.b32 %r2256, %r2255, %r2255, 24; + add.s32 %r2257, %r2256, %r2250; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 25; + add.s32 %r2260, %r2212, %r1990; + add.s32 %r2261, %r2260, %r2259; + xor.b32 %r2262, %r2261, %r2228; + shf.l.wrap.b32 %r2263, %r2262, %r2262, 16; + add.s32 %r2264, %r2263, %r2243; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 20; + add.s32 %r2267, %r2261, %r2018; + add.s32 %r2268, %r2267, %r2266; + xor.b32 %r2269, %r2268, %r2263; + shf.l.wrap.b32 %r2270, %r2269, %r2269, 24; + add.s32 %r2271, %r2270, %r2264; + xor.b32 %r2272, %r2271, %r2266; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 25; + add.s32 %r2274, %r2226, %r1997; + add.s32 %r2275, %r2274, %r2217; + xor.b32 %r2276, %r2242, %r2275; + shf.l.wrap.b32 %r2277, %r2276, %r2276, 16; + add.s32 %r2278, %r2257, %r2277; + xor.b32 %r2279, %r2278, %r2217; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 20; + add.s32 %r2281, %r2275, %r2046; + add.s32 %r2282, %r2281, %r2280; + xor.b32 %r2283, %r2282, %r2277; + shf.l.wrap.b32 %r2284, %r2283, %r2283, 24; + add.s32 %r2285, %r2284, %r2278; + xor.b32 %r2286, %r2285, %r2280; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 25; + add.s32 %r2288, %r2231, %r2025; + add.s32 %r2289, %r2288, %r2240; + xor.b32 %r2290, %r2256, %r2289; + shf.l.wrap.b32 %r2291, %r2290, %r2290, 16; + add.s32 %r2292, %r2291, %r2215; + xor.b32 %r2293, %r2292, %r2231; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 20; + add.s32 %r2295, %r2289, %r1976; + add.s32 %r2296, %r2295, %r2294; + xor.b32 %r2297, %r2296, %r2291; + shf.l.wrap.b32 %r2298, %r2297, %r2297, 24; + add.s32 %r2299, %r2298, %r2292; + xor.b32 %r2300, %r2299, %r2294; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 25; + add.s32 %r2302, %r2245, %r2004; + add.s32 %r2303, %r2302, %r2254; + xor.b32 %r2304, %r2303, %r2214; + shf.l.wrap.b32 %r2305, %r2304, %r2304, 16; + add.s32 %r2306, %r2305, %r2229; + xor.b32 %r2307, %r2306, %r2245; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 20; + add.s32 %r2309, %r2303, %r2067; + add.s32 %r2310, %r2309, %r2308; + xor.b32 %r2311, %r2310, %r2305; + shf.l.wrap.b32 %r2312, %r2311, %r2311, 24; + add.s32 %r2313, %r2312, %r2306; + 
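+ // NOTE: the recurring add/xor/shf.l.wrap.b32 sequence with left-rotations 16, 20, 24, 25 (equivalently right-rotations 16, 12, 8, 7) is the BLAKE2s/BLAKE3 G quarter-round, fully unrolled by the compiler.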
xor.b32 %r2314, %r2313, %r2308; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 25; + add.s32 %r2316, %r2287, %r1983; + add.s32 %r2317, %r2316, %r2268; + xor.b32 %r2318, %r2317, %r2312; + shf.l.wrap.b32 %r2319, %r2318, %r2318, 16; + add.s32 %r2320, %r2319, %r2299; + xor.b32 %r2321, %r2320, %r2287; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 20; + add.s32 %r2323, %r2317, %r2053; + add.s32 %r2324, %r2323, %r2322; + xor.b32 %r2325, %r2324, %r2319; + shf.l.wrap.b32 %r2326, %r2325, %r2325, 24; + add.s32 %r2327, %r2326, %r2320; + xor.b32 %r2328, %r2327, %r2322; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 25; + add.s32 %r2330, %r2282, %r2060; + add.s32 %r2331, %r2330, %r2301; + xor.b32 %r2332, %r2270, %r2331; + shf.l.wrap.b32 %r2333, %r2332, %r2332, 16; + add.s32 %r2334, %r2333, %r2313; + xor.b32 %r2335, %r2334, %r2301; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 20; + add.s32 %r2337, %r2331, %r2011; + add.s32 %r2338, %r2337, %r2336; + xor.b32 %r2339, %r2338, %r2333; + shf.l.wrap.b32 %r2340, %r2339, %r2339, 24; + add.s32 %r2341, %r2340, %r2334; + xor.b32 %r2342, %r2341, %r2336; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 25; + add.s32 %r2344, %r2296, %r2039; + add.s32 %r2345, %r2344, %r2315; + xor.b32 %r2346, %r2345, %r2284; + shf.l.wrap.b32 %r2347, %r2346, %r2346, 16; + add.s32 %r2348, %r2347, %r2271; + xor.b32 %r2349, %r2348, %r2315; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 20; + add.s32 %r2351, %r2345, %r2074; + add.s32 %r2352, %r2351, %r2350; + xor.b32 %r2353, %r2352, %r2347; + shf.l.wrap.b32 %r2354, %r2353, %r2353, 24; + add.s32 %r2355, %r2354, %r2348; + xor.b32 %r2356, %r2355, %r2350; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 25; + add.s32 %r2358, %r2310, %r2081; + add.s32 %r2359, %r2358, %r2273; + xor.b32 %r2360, %r2359, %r2298; + shf.l.wrap.b32 %r2361, %r2360, %r2360, 16; + add.s32 %r2362, %r2361, %r2285; + xor.b32 %r2363, %r2362, %r2273; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 20; + add.s32 %r2365, %r2359, %r2032; + add.s32 %r2366, %r2365, %r2364; + xor.b32 %r2367, %r2366, %r2361; + shf.l.wrap.b32 %r2368, %r2367, %r2367, 24; + add.s32 %r2369, %r2368, %r2362; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 25; + add.s32 %r2372, %r2324, %r1997; + add.s32 %r2373, %r2372, %r2371; + xor.b32 %r2374, %r2373, %r2340; + shf.l.wrap.b32 %r2375, %r2374, %r2374, 16; + add.s32 %r2376, %r2375, %r2355; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 20; + add.s32 %r2379, %r2373, %r2004; + add.s32 %r2380, %r2379, %r2378; + xor.b32 %r2381, %r2380, %r2375; + shf.l.wrap.b32 %r2382, %r2381, %r2381, 24; + add.s32 %r2383, %r2382, %r2376; + xor.b32 %r2384, %r2383, %r2378; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 25; + add.s32 %r2386, %r2338, %r2046; + add.s32 %r2387, %r2386, %r2329; + xor.b32 %r2388, %r2387, %r2354; + shf.l.wrap.b32 %r2389, %r2388, %r2388, 16; + add.s32 %r2390, %r2389, %r2369; + xor.b32 %r2391, %r2390, %r2329; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 20; + add.s32 %r2393, %r2387, %r2060; + add.s32 %r2394, %r2393, %r2392; + xor.b32 %r2395, %r2394, %r2389; + shf.l.wrap.b32 %r2396, %r2395, %r2395, 24; + add.s32 %r2397, %r2396, %r2390; + xor.b32 %r2398, %r2397, %r2392; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 25; + add.s32 %r2400, %r2352, %r2067; + add.s32 %r2401, %r2400, %r2343; + xor.b32 %r2402, %r2368, %r2401; + shf.l.wrap.b32 %r2403, %r2402, %r2402, 16; + add.s32 %r2404, %r2403, %r2327; + xor.b32 %r2405, %r2404, %r2343; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 20; + add.s32 %r2407, %r2401, %r1990; + add.s32 %r2408, %r2407, %r2406; + xor.b32 %r2409, %r2408, 
%r2403; + shf.l.wrap.b32 %r2410, %r2409, %r2409, 24; + add.s32 %r2411, %r2410, %r2404; + xor.b32 %r2412, %r2411, %r2406; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 25; + add.s32 %r2414, %r2357, %r2025; + add.s32 %r2415, %r2414, %r2366; + xor.b32 %r2416, %r2415, %r2326; + shf.l.wrap.b32 %r2417, %r2416, %r2416, 16; + add.s32 %r2418, %r2417, %r2341; + xor.b32 %r2419, %r2418, %r2357; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 20; + add.s32 %r2421, %r2415, %r2074; + add.s32 %r2422, %r2421, %r2420; + xor.b32 %r2423, %r2422, %r2417; + shf.l.wrap.b32 %r2424, %r2423, %r2423, 24; + add.s32 %r2425, %r2424, %r2418; + xor.b32 %r2426, %r2425, %r2420; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 25; + add.s32 %r2428, %r2399, %r2018; + add.s32 %r2429, %r2428, %r2380; + xor.b32 %r2430, %r2429, %r2424; + shf.l.wrap.b32 %r2431, %r2430, %r2430, 16; + add.s32 %r2432, %r2431, %r2411; + xor.b32 %r2433, %r2432, %r2399; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 20; + add.s32 %r2435, %r2429, %r2011; + add.s32 %r2436, %r2435, %r2434; + xor.b32 %r2437, %r2436, %r2431; + shf.l.wrap.b32 %r2438, %r2437, %r2437, 24; + add.s32 %r2439, %r2438, %r2432; + xor.b32 %r2440, %r2439, %r2434; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 25; + add.s32 %r2442, %r2394, %r2039; + add.s32 %r2443, %r2442, %r2413; + xor.b32 %r2444, %r2382, %r2443; + shf.l.wrap.b32 %r2445, %r2444, %r2444, 16; + add.s32 %r2446, %r2445, %r2425; + xor.b32 %r2447, %r2446, %r2413; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 20; + add.s32 %r2449, %r2443, %r1976; + add.s32 %r2450, %r2449, %r2448; + xor.b32 %r2451, %r2450, %r2445; + shf.l.wrap.b32 %r2452, %r2451, %r2451, 24; + add.s32 %r2453, %r2452, %r2446; + xor.b32 %r2454, %r2453, %r2448; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 25; + add.s32 %r2456, %r2408, %r2053; + add.s32 %r2457, %r2456, %r2427; + xor.b32 %r2458, %r2457, %r2396; + shf.l.wrap.b32 %r2459, %r2458, %r2458, 16; + add.s32 %r2460, %r2459, %r2383; + xor.b32 %r2461, %r2460, %r2427; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 20; + add.s32 %r2463, %r2457, %r2081; + add.s32 %r2464, %r2463, %r2462; + xor.b32 %r2465, %r2464, %r2459; + shf.l.wrap.b32 %r2466, %r2465, %r2465, 24; + add.s32 %r2467, %r2466, %r2460; + xor.b32 %r2468, %r2467, %r2462; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 25; + add.s32 %r2470, %r2422, %r2032; + add.s32 %r2471, %r2470, %r2385; + xor.b32 %r2472, %r2471, %r2410; + shf.l.wrap.b32 %r2473, %r2472, %r2472, 16; + add.s32 %r2474, %r2473, %r2397; + xor.b32 %r2475, %r2474, %r2385; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 20; + add.s32 %r2477, %r2471, %r1983; + add.s32 %r2478, %r2477, %r2476; + xor.b32 %r2479, %r2478, %r2473; + shf.l.wrap.b32 %r2480, %r2479, %r2479, 24; + add.s32 %r2481, %r2480, %r2474; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 25; + add.s32 %r2484, %r2436, %r2046; + add.s32 %r2485, %r2484, %r2483; + xor.b32 %r2486, %r2485, %r2452; + shf.l.wrap.b32 %r2487, %r2486, %r2486, 16; + add.s32 %r2488, %r2487, %r2467; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 20; + add.s32 %r2491, %r2485, %r2025; + add.s32 %r2492, %r2491, %r2490; + xor.b32 %r2493, %r2492, %r2487; + shf.l.wrap.b32 %r2494, %r2493, %r2493, 24; + add.s32 %r2495, %r2494, %r2488; + xor.b32 %r2496, %r2495, %r2490; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 25; + add.s32 %r2498, %r2450, %r2060; + add.s32 %r2499, %r2498, %r2441; + xor.b32 %r2500, %r2499, %r2466; + shf.l.wrap.b32 %r2501, %r2500, %r2500, 16; + add.s32 %r2502, %r2501, %r2481; + xor.b32 %r2503, %r2502, %r2441; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 20; + 
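+ // NOTE: successive rounds appear to consume the same sixteen message registers in a fixed per-round permutation, so no message-schedule table is materialized in memory.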
add.s32 %r2505, %r2499, %r2039; + add.s32 %r2506, %r2505, %r2504; + xor.b32 %r2507, %r2506, %r2501; + shf.l.wrap.b32 %r2508, %r2507, %r2507, 24; + add.s32 %r2509, %r2508, %r2502; + xor.b32 %r2510, %r2509, %r2504; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 25; + add.s32 %r2512, %r2464, %r2074; + add.s32 %r2513, %r2512, %r2455; + xor.b32 %r2514, %r2480, %r2513; + shf.l.wrap.b32 %r2515, %r2514, %r2514, 16; + add.s32 %r2516, %r2515, %r2439; + xor.b32 %r2517, %r2516, %r2455; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 20; + add.s32 %r2519, %r2513, %r1997; + add.s32 %r2520, %r2519, %r2518; + xor.b32 %r2521, %r2520, %r2515; + shf.l.wrap.b32 %r2522, %r2521, %r2521, 24; + add.s32 %r2523, %r2522, %r2516; + xor.b32 %r2524, %r2523, %r2518; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 25; + add.s32 %r2526, %r2469, %r2067; + add.s32 %r2527, %r2526, %r2478; + xor.b32 %r2528, %r2527, %r2438; + shf.l.wrap.b32 %r2529, %r2528, %r2528, 16; + add.s32 %r2530, %r2529, %r2453; + xor.b32 %r2531, %r2530, %r2469; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 20; + add.s32 %r2533, %r2527, %r2081; + add.s32 %r2534, %r2533, %r2532; + xor.b32 %r2535, %r2534, %r2529; + shf.l.wrap.b32 %r2536, %r2535, %r2535, 24; + add.s32 %r2537, %r2536, %r2530; + xor.b32 %r2538, %r2537, %r2532; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 25; + add.s32 %r2540, %r2511, %r2004; + add.s32 %r2541, %r2540, %r2492; + xor.b32 %r2542, %r2541, %r2536; + shf.l.wrap.b32 %r2543, %r2542, %r2542, 16; + add.s32 %r2544, %r2543, %r2523; + xor.b32 %r2545, %r2544, %r2511; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 20; + add.s32 %r2547, %r2541, %r1976; + add.s32 %r2548, %r2547, %r2546; + xor.b32 %r2549, %r2548, %r2543; + shf.l.wrap.b32 %r2550, %r2549, %r2549, 24; + add.s32 %r2551, %r2550, %r2544; + xor.b32 %r2552, %r2551, %r2546; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 25; + add.s32 %r2554, %r2506, %r2053; + add.s32 %r2555, %r2554, %r2525; + xor.b32 %r2556, %r2494, %r2555; + shf.l.wrap.b32 %r2557, %r2556, %r2556, 16; + add.s32 %r2558, %r2557, %r2537; + xor.b32 %r2559, %r2558, %r2525; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 20; + add.s32 %r2561, %r2555, %r1990; + add.s32 %r2562, %r2561, %r2560; + xor.b32 %r2563, %r2562, %r2557; + shf.l.wrap.b32 %r2564, %r2563, %r2563, 24; + add.s32 %r2565, %r2564, %r2558; + xor.b32 %r2566, %r2565, %r2560; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 25; + add.s32 %r2568, %r2520, %r2011; + add.s32 %r2569, %r2568, %r2539; + xor.b32 %r2570, %r2569, %r2508; + shf.l.wrap.b32 %r2571, %r2570, %r2570, 16; + add.s32 %r2572, %r2571, %r2495; + xor.b32 %r2573, %r2572, %r2539; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 20; + add.s32 %r2575, %r2569, %r2032; + add.s32 %r2576, %r2575, %r2574; + xor.b32 %r2577, %r2576, %r2571; + shf.l.wrap.b32 %r2578, %r2577, %r2577, 24; + add.s32 %r2579, %r2578, %r2572; + xor.b32 %r2580, %r2579, %r2574; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 25; + add.s32 %r2582, %r2534, %r1983; + add.s32 %r2583, %r2582, %r2497; + xor.b32 %r2584, %r2583, %r2522; + shf.l.wrap.b32 %r2585, %r2584, %r2584, 16; + add.s32 %r2586, %r2585, %r2509; + xor.b32 %r2587, %r2586, %r2497; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 20; + add.s32 %r2589, %r2583, %r2018; + add.s32 %r2590, %r2589, %r2588; + xor.b32 %r2591, %r2590, %r2585; + shf.l.wrap.b32 %r2592, %r2591, %r2591, 24; + add.s32 %r2593, %r2592, %r2586; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 25; + add.s32 %r2596, %r2548, %r2060; + add.s32 %r2597, %r2596, %r2595; + xor.b32 %r2598, %r2597, %r2564; + shf.l.wrap.b32 %r2599, %r2598, %r2598, 16; + add.s32 %r2600, %r2599, 
%r2579; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 20; + add.s32 %r2603, %r2597, %r2067; + add.s32 %r2604, %r2603, %r2602; + xor.b32 %r2605, %r2604, %r2599; + shf.l.wrap.b32 %r2606, %r2605, %r2605, 24; + add.s32 %r2607, %r2606, %r2600; + xor.b32 %r2608, %r2607, %r2602; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 25; + add.s32 %r2610, %r2562, %r2039; + add.s32 %r2611, %r2610, %r2553; + xor.b32 %r2612, %r2611, %r2578; + shf.l.wrap.b32 %r2613, %r2612, %r2612, 16; + add.s32 %r2614, %r2613, %r2593; + xor.b32 %r2615, %r2614, %r2553; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 20; + add.s32 %r2617, %r2611, %r2053; + add.s32 %r2618, %r2617, %r2616; + xor.b32 %r2619, %r2618, %r2613; + shf.l.wrap.b32 %r2620, %r2619, %r2619, 24; + add.s32 %r2621, %r2620, %r2614; + xor.b32 %r2622, %r2621, %r2616; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 25; + add.s32 %r2624, %r2576, %r2081; + add.s32 %r2625, %r2624, %r2567; + xor.b32 %r2626, %r2592, %r2625; + shf.l.wrap.b32 %r2627, %r2626, %r2626, 16; + add.s32 %r2628, %r2627, %r2551; + xor.b32 %r2629, %r2628, %r2567; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 20; + add.s32 %r2631, %r2625, %r2046; + add.s32 %r2632, %r2631, %r2630; + xor.b32 %r2633, %r2632, %r2627; + shf.l.wrap.b32 %r2634, %r2633, %r2633, 24; + add.s32 %r2635, %r2634, %r2628; + xor.b32 %r2636, %r2635, %r2630; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 25; + add.s32 %r2638, %r2581, %r2074; + add.s32 %r2639, %r2638, %r2590; + xor.b32 %r2640, %r2639, %r2550; + shf.l.wrap.b32 %r2641, %r2640, %r2640, 16; + add.s32 %r2642, %r2641, %r2565; + xor.b32 %r2643, %r2642, %r2581; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 20; + add.s32 %r2645, %r2639, %r2032; + add.s32 %r2646, %r2645, %r2644; + xor.b32 %r2647, %r2646, %r2641; + shf.l.wrap.b32 %r2648, %r2647, %r2647, 24; + add.s32 %r2649, %r2648, %r2642; + xor.b32 %r2650, %r2649, %r2644; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 25; + add.s32 %r2652, %r2623, %r2025; + add.s32 %r2653, %r2652, %r2604; + xor.b32 %r2654, %r2653, %r2648; + shf.l.wrap.b32 %r2655, %r2654, %r2654, 16; + add.s32 %r2656, %r2655, %r2635; + xor.b32 %r2657, %r2656, %r2623; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 20; + add.s32 %r2659, %r2653, %r1990; + add.s32 %r2660, %r2659, %r2658; + xor.b32 %r2661, %r2660, %r2655; + shf.l.wrap.b32 %r2662, %r2661, %r2661, 24; + add.s32 %r2663, %r2662, %r2656; + xor.b32 %r2664, %r2663, %r2658; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 25; + add.s32 %r2666, %r2618, %r2011; + add.s32 %r2667, %r2666, %r2637; + xor.b32 %r2668, %r2606, %r2667; + shf.l.wrap.b32 %r2669, %r2668, %r2668, 16; + add.s32 %r2670, %r2669, %r2649; + xor.b32 %r2671, %r2670, %r2637; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 20; + add.s32 %r2673, %r2667, %r1997; + add.s32 %r2674, %r2673, %r2672; + xor.b32 %r2675, %r2674, %r2669; + shf.l.wrap.b32 %r2676, %r2675, %r2675, 24; + add.s32 %r2677, %r2676, %r2670; + xor.b32 %r2678, %r2677, %r2672; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 25; + add.s32 %r2680, %r2632, %r1976; + add.s32 %r2681, %r2680, %r2651; + xor.b32 %r2682, %r2681, %r2620; + shf.l.wrap.b32 %r2683, %r2682, %r2682, 16; + add.s32 %r2684, %r2683, %r2607; + xor.b32 %r2685, %r2684, %r2651; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 20; + add.s32 %r2687, %r2681, %r1983; + add.s32 %r2688, %r2687, %r2686; + xor.b32 %r2689, %r2688, %r2683; + shf.l.wrap.b32 %r2690, %r2689, %r2689, 24; + add.s32 %r2691, %r2690, %r2684; + xor.b32 %r2692, %r2691, %r2686; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 25; + add.s32 %r2694, %r2646, %r2018; + add.s32 %r2695, %r2694, %r2609; + xor.b32 %r2696, 
%r2695, %r2634; + shf.l.wrap.b32 %r2697, %r2696, %r2696, 16; + add.s32 %r2698, %r2697, %r2621; + xor.b32 %r2699, %r2698, %r2609; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 20; + add.s32 %r2701, %r2695, %r2004; + add.s32 %r2702, %r2701, %r2700; + xor.b32 %r2703, %r2702, %r2697; + shf.l.wrap.b32 %r2704, %r2703, %r2703, 24; + add.s32 %r2705, %r2704, %r2698; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 25; + add.s32 %r2708, %r2660, %r2039; + add.s32 %r2709, %r2708, %r2707; + xor.b32 %r2710, %r2709, %r2676; + shf.l.wrap.b32 %r2711, %r2710, %r2710, 16; + add.s32 %r2712, %r2711, %r2691; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 20; + add.s32 %r2715, %r2709, %r2074; + add.s32 %r2716, %r2715, %r2714; + xor.b32 %r2717, %r2716, %r2711; + shf.l.wrap.b32 %r2718, %r2717, %r2717, 24; + add.s32 %r2719, %r2718, %r2712; + xor.b32 %r2720, %r2719, %r2714; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 25; + add.s32 %r2722, %r2674, %r2053; + add.s32 %r2723, %r2722, %r2665; + xor.b32 %r2724, %r2723, %r2690; + shf.l.wrap.b32 %r2725, %r2724, %r2724, 16; + add.s32 %r2726, %r2725, %r2705; + xor.b32 %r2727, %r2726, %r2665; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 20; + add.s32 %r2729, %r2723, %r2011; + add.s32 %r2730, %r2729, %r2728; + xor.b32 %r2731, %r2730, %r2725; + shf.l.wrap.b32 %r2732, %r2731, %r2731, 24; + add.s32 %r2733, %r2732, %r2726; + xor.b32 %r2734, %r2733, %r2728; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 25; + add.s32 %r2736, %r2688, %r2032; + add.s32 %r2737, %r2736, %r2679; + xor.b32 %r2738, %r2704, %r2737; + shf.l.wrap.b32 %r2739, %r2738, %r2738, 16; + add.s32 %r2740, %r2739, %r2663; + xor.b32 %r2741, %r2740, %r2679; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 20; + add.s32 %r2743, %r2737, %r2060; + add.s32 %r2744, %r2743, %r2742; + xor.b32 %r2745, %r2744, %r2739; + shf.l.wrap.b32 %r2746, %r2745, %r2745, 24; + add.s32 %r2747, %r2746, %r2740; + xor.b32 %r2748, %r2747, %r2742; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 25; + add.s32 %r2750, %r2693, %r2081; + add.s32 %r2751, %r2750, %r2702; + xor.b32 %r2752, %r2751, %r2662; + shf.l.wrap.b32 %r2753, %r2752, %r2752, 16; + add.s32 %r2754, %r2753, %r2677; + xor.b32 %r2755, %r2754, %r2693; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 20; + add.s32 %r2757, %r2751, %r1983; + add.s32 %r2758, %r2757, %r2756; + xor.b32 %r2759, %r2758, %r2753; + shf.l.wrap.b32 %r2760, %r2759, %r2759, 24; + add.s32 %r2761, %r2760, %r2754; + xor.b32 %r2762, %r2761, %r2756; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 25; + add.s32 %r2764, %r2735, %r2067; + add.s32 %r2765, %r2764, %r2716; + xor.b32 %r2766, %r2765, %r2760; + shf.l.wrap.b32 %r2767, %r2766, %r2766, 16; + add.s32 %r2768, %r2767, %r2747; + xor.b32 %r2769, %r2768, %r2735; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 20; + add.s32 %r2771, %r2765, %r1997; + add.s32 %r2772, %r2771, %r2770; + xor.b32 %r2773, %r2772, %r2767; + shf.l.wrap.b32 %r2774, %r2773, %r2773, 24; + add.s32 %r2775, %r2774, %r2768; + xor.b32 %r2776, %r2775, %r2770; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 25; + add.s32 %r2778, %r2730, %r1976; + add.s32 %r2779, %r2778, %r2749; + xor.b32 %r2780, %r2718, %r2779; + shf.l.wrap.b32 %r2781, %r2780, %r2780, 16; + add.s32 %r2782, %r2781, %r2761; + xor.b32 %r2783, %r2782, %r2749; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 20; + add.s32 %r2785, %r2779, %r2046; + add.s32 %r2786, %r2785, %r2784; + xor.b32 %r2787, %r2786, %r2781; + shf.l.wrap.b32 %r2788, %r2787, %r2787, 24; + add.s32 %r2789, %r2788, %r2782; + xor.b32 %r2790, %r2789, %r2784; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 25; 
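+ // NOTE: one full round costs 112 virtual registers (8 G applications x 14 instructions); compare the round starts at %r2260, %r2372, %r2484, %r2596, %r2708 and %r2820 -- seven rounds in total, as in BLAKE3.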
+ add.s32 %r2792, %r2744, %r1990; + add.s32 %r2793, %r2792, %r2763; + xor.b32 %r2794, %r2793, %r2732; + shf.l.wrap.b32 %r2795, %r2794, %r2794, 16; + add.s32 %r2796, %r2795, %r2719; + xor.b32 %r2797, %r2796, %r2763; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 20; + add.s32 %r2799, %r2793, %r2018; + add.s32 %r2800, %r2799, %r2798; + xor.b32 %r2801, %r2800, %r2795; + shf.l.wrap.b32 %r2802, %r2801, %r2801, 24; + add.s32 %r2803, %r2802, %r2796; + xor.b32 %r2804, %r2803, %r2798; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 25; + add.s32 %r2806, %r2758, %r2004; + add.s32 %r2807, %r2806, %r2721; + xor.b32 %r2808, %r2807, %r2746; + shf.l.wrap.b32 %r2809, %r2808, %r2808, 16; + add.s32 %r2810, %r2809, %r2733; + xor.b32 %r2811, %r2810, %r2721; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 20; + add.s32 %r2813, %r2807, %r2025; + add.s32 %r2814, %r2813, %r2812; + xor.b32 %r2815, %r2814, %r2809; + shf.l.wrap.b32 %r2816, %r2815, %r2815, 24; + add.s32 %r2817, %r2816, %r2810; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 25; + add.s32 %r2820, %r2772, %r2053; + add.s32 %r2821, %r2820, %r2819; + xor.b32 %r2822, %r2821, %r2788; + shf.l.wrap.b32 %r2823, %r2822, %r2822, 16; + add.s32 %r2824, %r2823, %r2803; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 20; + add.s32 %r2827, %r2821, %r2081; + add.s32 %r2828, %r2827, %r2826; + xor.b32 %r2829, %r2828, %r2823; + shf.l.wrap.b32 %r2830, %r2829, %r2829, 24; + add.s32 %r2831, %r2830, %r2824; + xor.b32 %r2832, %r2831, %r2826; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 25; + add.s32 %r2834, %r2786, %r2011; + add.s32 %r2835, %r2834, %r2777; + xor.b32 %r2836, %r2835, %r2802; + shf.l.wrap.b32 %r2837, %r2836, %r2836, 16; + add.s32 %r2838, %r2837, %r2817; + xor.b32 %r2839, %r2838, %r2777; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 20; + add.s32 %r2841, %r2835, %r1976; + add.s32 %r2842, %r2841, %r2840; + xor.b32 %r2843, %r2842, %r2837; + shf.l.wrap.b32 %r2844, %r2843, %r2843, 24; + add.s32 %r2845, %r2844, %r2838; + xor.b32 %r2846, %r2845, %r2840; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 25; + add.s32 %r2848, %r2800, %r1983; + add.s32 %r2849, %r2848, %r2791; + xor.b32 %r2850, %r2816, %r2849; + shf.l.wrap.b32 %r2851, %r2850, %r2850, 16; + add.s32 %r2852, %r2851, %r2775; + xor.b32 %r2853, %r2852, %r2791; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 20; + add.s32 %r2855, %r2849, %r2039; + add.s32 %r2856, %r2855, %r2854; + xor.b32 %r2857, %r2856, %r2851; + shf.l.wrap.b32 %r2858, %r2857, %r2857, 24; + add.s32 %r2859, %r2858, %r2852; + xor.b32 %r2860, %r2859, %r2854; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 25; + add.s32 %r2862, %r2805, %r2032; + add.s32 %r2863, %r2862, %r2814; + xor.b32 %r2864, %r2863, %r2774; + shf.l.wrap.b32 %r2865, %r2864, %r2864, 16; + add.s32 %r2866, %r2865, %r2789; + xor.b32 %r2867, %r2866, %r2805; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 20; + add.s32 %r2869, %r2863, %r2018; + add.s32 %r2870, %r2869, %r2868; + xor.b32 %r2871, %r2870, %r2865; + shf.l.wrap.b32 %r2872, %r2871, %r2871, 24; + add.s32 %r2873, %r2872, %r2866; + xor.b32 %r2874, %r2873, %r2868; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 25; + add.s32 %r2876, %r2847, %r2074; + add.s32 %r2877, %r2876, %r2828; + xor.b32 %r2878, %r2877, %r2872; + shf.l.wrap.b32 %r2879, %r2878, %r2878, 16; + add.s32 %r2880, %r2879, %r2859; + xor.b32 %r2881, %r2880, %r2847; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 20; + add.s32 %r2883, %r2877, %r2046; + add.s32 %r2884, %r2883, %r2882; + xor.b32 %r2885, %r2884, %r2879; + shf.l.wrap.b32 %r2886, %r2885, %r2885, 24; + add.s32 %r2887, %r2886, 
%r2880;
+ xor.b32 %r2888, %r2887, %r2882;
+ shf.l.wrap.b32 %r2889, %r2888, %r2888, 25;
+ add.s32 %r2890, %r2842, %r1990;
+ add.s32 %r2891, %r2890, %r2861;
+ xor.b32 %r2892, %r2830, %r2891;
+ shf.l.wrap.b32 %r2893, %r2892, %r2892, 16;
+ add.s32 %r2894, %r2893, %r2873;
+ xor.b32 %r2895, %r2894, %r2861;
+ shf.l.wrap.b32 %r2896, %r2895, %r2895, 20;
+ add.s32 %r2897, %r2891, %r2060;
+ add.s32 %r2898, %r2897, %r2896;
+ xor.b32 %r2899, %r2898, %r2893;
+ shf.l.wrap.b32 %r2900, %r2899, %r2899, 24;
+ add.s32 %r2901, %r2900, %r2894;
+ xor.b32 %r2902, %r2901, %r2896;
+ shf.l.wrap.b32 %r2903, %r2902, %r2902, 25;
+ add.s32 %r2904, %r2856, %r1997;
+ add.s32 %r2905, %r2904, %r2875;
+ xor.b32 %r2906, %r2905, %r2844;
+ shf.l.wrap.b32 %r2907, %r2906, %r2906, 16;
+ add.s32 %r2908, %r2907, %r2831;
+ xor.b32 %r2909, %r2908, %r2875;
+ shf.l.wrap.b32 %r2910, %r2909, %r2909, 20;
+ add.s32 %r2911, %r2905, %r2004;
+ add.s32 %r2912, %r2911, %r2910;
+ xor.b32 %r2913, %r2912, %r2907;
+ shf.l.wrap.b32 %r2914, %r2913, %r2913, 24;
+ add.s32 %r2915, %r2914, %r2908;
+ xor.b32 %r2916, %r2915, %r2910;
+ shf.l.wrap.b32 %r2917, %r2916, %r2916, 25;
+ add.s32 %r2918, %r2870, %r2025;
+ add.s32 %r2919, %r2918, %r2833;
+ xor.b32 %r2920, %r2919, %r2858;
+ shf.l.wrap.b32 %r2921, %r2920, %r2920, 16;
+ add.s32 %r2922, %r2921, %r2845;
+ xor.b32 %r2923, %r2922, %r2833;
+ shf.l.wrap.b32 %r2924, %r2923, %r2923, 20;
+ add.s32 %r2925, %r2919, %r2067;
+ add.s32 %r2926, %r2925, %r2924;
+ xor.b32 %r2927, %r2926, %r2921;
+ shf.l.wrap.b32 %r2928, %r2927, %r2927, 24;
+ add.s32 %r2929, %r2928, %r2922;
+ xor.b32 %r2930, %r2929, %r2924;
+ shf.l.wrap.b32 %r2931, %r2930, %r2930, 25;
+ xor.b32 %r27, %r2915, %r2884;
+ xor.b32 %r28, %r2929, %r2898;
+ xor.b32 %r29, %r2887, %r2912;
+ xor.b32 %r30, %r2926, %r2901;
+ xor.b32 %r31, %r2931, %r2900;
+ xor.b32 %r32, %r2889, %r2914;
+ xor.b32 %r33, %r2928, %r2903;
+ xor.b32 %r34, %r2917, %r2886;
+ popc.b64 %r2932, %rd128;
+ cvt.u64.u32 %rd30, %r2932;
+ ld.local.u8 %rs127, [%rd3+8];
+ cvt.u64.u16 %rd130, %rs127;
+ setp.ge.u64 %p15, %rd30, %rd130;
+ mul.wide.u16 %r11659, %rs127, 32;
+ @%p15 bra $L__BB1_19;
+
+$L__BB1_18:
+ add.s32 %r2933, %r11659, -64;
+ cvt.s64.s32 %rd131, %r2933;
+ add.s64 %rd132, %rd2, %rd131;
+ ld.local.u8 %r2934, [%rd3+2];
+ ld.local.u8 %r2935, [%rd132+145];
+ ld.local.u8 %r2936, [%rd132+146];
+ prmt.b32 %r2937, %r2936, %r2935, 30212;
+ ld.local.u8 %r2938, [%rd132+147];
+ prmt.b32 %r2939, %r2938, %r2937, 28756;
+ ld.local.u8 %r2940, [%rd132+148];
+ prmt.b32 %r2941, %r2940, %r2939, 1620;
+ ld.local.u8 %r2942, [%rd132+149];
+ ld.local.u8 %r2943, [%rd132+150];
+ prmt.b32 %r2944, %r2943, %r2942, 30212;
+ ld.local.u8 %r2945, [%rd132+151];
+ prmt.b32 %r2946, %r2945, %r2944, 28756;
+ ld.local.u8 %r2947, [%rd132+152];
+ prmt.b32 %r2948, %r2947, %r2946, 1620;
+ ld.local.u8 %r2949, [%rd132+153];
+ ld.local.u8 %r2950, [%rd132+154];
+ prmt.b32 %r2951, %r2950, %r2949, 30212;
+ ld.local.u8 %r2952, [%rd132+155];
+ prmt.b32 %r2953, %r2952, %r2951, 28756;
+ ld.local.u8 %r2954, [%rd132+156];
+ prmt.b32 %r2955, %r2954, %r2953, 1620;
+ ld.local.u8 %r2956, [%rd132+157];
+ ld.local.u8 %r2957, [%rd132+158];
+ prmt.b32 %r2958, %r2957, %r2956, 30212;
+ ld.local.u8 %r2959, [%rd132+159];
+ prmt.b32 %r2960, %r2959, %r2958, 28756;
+ ld.local.u8 %r2961, [%rd132+160];
+ prmt.b32 %r2962, %r2961, %r2960, 1620;
+ ld.local.u8 %r2963, [%rd132+161];
+ ld.local.u8 %r2964, [%rd132+162];
+ prmt.b32 %r2965, %r2964, %r2963, 30212;
+ ld.local.u8 %r2966, [%rd132+163];
+ prmt.b32 %r2967, %r2966, %r2965, 28756;
+ ld.local.u8 %r2968,
[%rd132+164]; + prmt.b32 %r2969, %r2968, %r2967, 1620; + ld.local.u8 %r2970, [%rd132+165]; + ld.local.u8 %r2971, [%rd132+166]; + prmt.b32 %r2972, %r2971, %r2970, 30212; + ld.local.u8 %r2973, [%rd132+167]; + prmt.b32 %r2974, %r2973, %r2972, 28756; + ld.local.u8 %r2975, [%rd132+168]; + prmt.b32 %r2976, %r2975, %r2974, 1620; + ld.local.u8 %r2977, [%rd132+169]; + ld.local.u8 %r2978, [%rd132+170]; + prmt.b32 %r2979, %r2978, %r2977, 30212; + ld.local.u8 %r2980, [%rd132+171]; + prmt.b32 %r2981, %r2980, %r2979, 28756; + ld.local.u8 %r2982, [%rd132+172]; + prmt.b32 %r2983, %r2982, %r2981, 1620; + ld.local.u8 %r2984, [%rd132+173]; + ld.local.u8 %r2985, [%rd132+174]; + prmt.b32 %r2986, %r2985, %r2984, 30212; + ld.local.u8 %r2987, [%rd132+175]; + prmt.b32 %r2988, %r2987, %r2986, 28756; + ld.local.u8 %r2989, [%rd132+176]; + prmt.b32 %r2990, %r2989, %r2988, 1620; + ld.local.u8 %r2991, [%rd132+177]; + ld.local.u8 %r2992, [%rd132+178]; + prmt.b32 %r2993, %r2992, %r2991, 30212; + ld.local.u8 %r2994, [%rd132+179]; + prmt.b32 %r2995, %r2994, %r2993, 28756; + ld.local.u8 %r2996, [%rd132+180]; + prmt.b32 %r2997, %r2996, %r2995, 1620; + ld.local.u8 %r2998, [%rd132+181]; + ld.local.u8 %r2999, [%rd132+182]; + prmt.b32 %r3000, %r2999, %r2998, 30212; + ld.local.u8 %r3001, [%rd132+183]; + prmt.b32 %r3002, %r3001, %r3000, 28756; + ld.local.u8 %r3003, [%rd132+184]; + prmt.b32 %r3004, %r3003, %r3002, 1620; + ld.local.u8 %r3005, [%rd132+185]; + ld.local.u8 %r3006, [%rd132+186]; + prmt.b32 %r3007, %r3006, %r3005, 30212; + ld.local.u8 %r3008, [%rd132+187]; + prmt.b32 %r3009, %r3008, %r3007, 28756; + ld.local.u8 %r3010, [%rd132+188]; + prmt.b32 %r3011, %r3010, %r3009, 1620; + ld.local.u8 %r3012, [%rd132+189]; + ld.local.u8 %r3013, [%rd132+190]; + prmt.b32 %r3014, %r3013, %r3012, 30212; + ld.local.u8 %r3015, [%rd132+191]; + prmt.b32 %r3016, %r3015, %r3014, 28756; + ld.local.u8 %r3017, [%rd132+192]; + prmt.b32 %r3018, %r3017, %r3016, 1620; + ld.local.u8 %r3019, [%rd132+193]; + ld.local.u8 %r3020, [%rd132+194]; + prmt.b32 %r3021, %r3020, %r3019, 30212; + ld.local.u8 %r3022, [%rd132+195]; + prmt.b32 %r3023, %r3022, %r3021, 28756; + ld.local.u8 %r3024, [%rd132+196]; + prmt.b32 %r3025, %r3024, %r3023, 1620; + ld.local.u8 %r3026, [%rd132+197]; + ld.local.u8 %r3027, [%rd132+198]; + prmt.b32 %r3028, %r3027, %r3026, 30212; + ld.local.u8 %r3029, [%rd132+199]; + prmt.b32 %r3030, %r3029, %r3028, 28756; + ld.local.u8 %r3031, [%rd132+200]; + prmt.b32 %r3032, %r3031, %r3030, 1620; + ld.local.u8 %r3033, [%rd132+201]; + ld.local.u8 %r3034, [%rd132+202]; + prmt.b32 %r3035, %r3034, %r3033, 30212; + ld.local.u8 %r3036, [%rd132+203]; + prmt.b32 %r3037, %r3036, %r3035, 28756; + ld.local.u8 %r3038, [%rd132+204]; + prmt.b32 %r3039, %r3038, %r3037, 1620; + ld.local.u8 %r3040, [%rd132+205]; + ld.local.u8 %r3041, [%rd132+206]; + prmt.b32 %r3042, %r3041, %r3040, 30212; + ld.local.u8 %r3043, [%rd132+207]; + prmt.b32 %r3044, %r3043, %r3042, 28756; + ld.local.u8 %r3045, [%rd132+208]; + prmt.b32 %r3046, %r3045, %r3044, 1620; + or.b32 %r3047, %r2934, 4; + ld.local.u8 %r3048, [%rd3+-120]; + ld.local.u8 %r3049, [%rd3+-119]; + prmt.b32 %r3050, %r3049, %r3048, 30212; + ld.local.u8 %r3051, [%rd3+-118]; + ld.local.u8 %r3052, [%rd3+-117]; + prmt.b32 %r3053, %r3052, %r3051, 30212; + prmt.b32 %r3054, %r3053, %r3050, 4180; + ld.local.u8 %r3055, [%rd3+-136]; + ld.local.u8 %r3056, [%rd3+-135]; + prmt.b32 %r3057, %r3056, %r3055, 30212; + ld.local.u8 %r3058, [%rd3+-134]; + ld.local.u8 %r3059, [%rd3+-133]; + prmt.b32 %r3060, %r3059, %r3058, 30212; + prmt.b32 %r3061, 
%r3060, %r3057, 4180; + add.s32 %r3062, %r3054, %r3061; + add.s32 %r3063, %r3062, %r2941; + shf.l.wrap.b32 %r3064, %r3063, %r3063, 16; + add.s32 %r3065, %r3064, 1779033703; + xor.b32 %r3066, %r3065, %r3054; + shf.l.wrap.b32 %r3067, %r3066, %r3066, 20; + add.s32 %r3068, %r2948, %r3063; + add.s32 %r3069, %r3068, %r3067; + xor.b32 %r3070, %r3069, %r3064; + shf.l.wrap.b32 %r3071, %r3070, %r3070, 24; + add.s32 %r3072, %r3071, %r3065; + xor.b32 %r3073, %r3072, %r3067; + shf.l.wrap.b32 %r3074, %r3073, %r3073, 25; + ld.local.u8 %r3075, [%rd3+-116]; + ld.local.u8 %r3076, [%rd3+-115]; + prmt.b32 %r3077, %r3076, %r3075, 30212; + ld.local.u8 %r3078, [%rd3+-114]; + ld.local.u8 %r3079, [%rd3+-113]; + prmt.b32 %r3080, %r3079, %r3078, 30212; + prmt.b32 %r3081, %r3080, %r3077, 4180; + ld.local.u8 %r3082, [%rd3+-132]; + ld.local.u8 %r3083, [%rd3+-131]; + prmt.b32 %r3084, %r3083, %r3082, 30212; + ld.local.u8 %r3085, [%rd3+-130]; + ld.local.u8 %r3086, [%rd3+-129]; + prmt.b32 %r3087, %r3086, %r3085, 30212; + prmt.b32 %r3088, %r3087, %r3084, 4180; + add.s32 %r3089, %r3081, %r3088; + add.s32 %r3090, %r3089, %r2955; + shf.l.wrap.b32 %r3091, %r3090, %r3090, 16; + add.s32 %r3092, %r3091, -1150833019; + xor.b32 %r3093, %r3092, %r3081; + shf.l.wrap.b32 %r3094, %r3093, %r3093, 20; + add.s32 %r3095, %r2962, %r3090; + add.s32 %r3096, %r3095, %r3094; + xor.b32 %r3097, %r3096, %r3091; + shf.l.wrap.b32 %r3098, %r3097, %r3097, 24; + add.s32 %r3099, %r3098, %r3092; + xor.b32 %r3100, %r3099, %r3094; + shf.l.wrap.b32 %r3101, %r3100, %r3100, 25; + ld.local.u8 %r3102, [%rd3+-112]; + ld.local.u8 %r3103, [%rd3+-111]; + prmt.b32 %r3104, %r3103, %r3102, 30212; + ld.local.u8 %r3105, [%rd3+-110]; + ld.local.u8 %r3106, [%rd3+-109]; + prmt.b32 %r3107, %r3106, %r3105, 30212; + prmt.b32 %r3108, %r3107, %r3104, 4180; + ld.local.u8 %r3109, [%rd3+-128]; + ld.local.u8 %r3110, [%rd3+-127]; + prmt.b32 %r3111, %r3110, %r3109, 30212; + ld.local.u8 %r3112, [%rd3+-126]; + ld.local.u8 %r3113, [%rd3+-125]; + prmt.b32 %r3114, %r3113, %r3112, 30212; + prmt.b32 %r3115, %r3114, %r3111, 4180; + add.s32 %r3116, %r3108, %r3115; + add.s32 %r3117, %r3116, %r2969; + shr.u32 %r3118, %r3117, 16; + shl.b32 %r3119, %r3117, 16; + xor.b32 %r3120, %r3119, 4194304; + or.b32 %r3121, %r3120, %r3118; + add.s32 %r3122, %r3121, 1013904242; + xor.b32 %r3123, %r3122, %r3108; + shf.l.wrap.b32 %r3124, %r3123, %r3123, 20; + add.s32 %r3125, %r2976, %r3117; + add.s32 %r3126, %r3125, %r3124; + xor.b32 %r3127, %r3126, %r3121; + shf.l.wrap.b32 %r3128, %r3127, %r3127, 24; + add.s32 %r3129, %r3128, %r3122; + xor.b32 %r3130, %r3129, %r3124; + shf.l.wrap.b32 %r3131, %r3130, %r3130, 25; + ld.local.u8 %r3132, [%rd3+-108]; + ld.local.u8 %r3133, [%rd3+-107]; + prmt.b32 %r3134, %r3133, %r3132, 30212; + ld.local.u8 %r3135, [%rd3+-106]; + ld.local.u8 %r3136, [%rd3+-105]; + prmt.b32 %r3137, %r3136, %r3135, 30212; + prmt.b32 %r3138, %r3137, %r3134, 4180; + ld.local.u8 %r3139, [%rd3+-124]; + ld.local.u8 %r3140, [%rd3+-123]; + prmt.b32 %r3141, %r3140, %r3139, 30212; + ld.local.u8 %r3142, [%rd3+-122]; + ld.local.u8 %r3143, [%rd3+-121]; + prmt.b32 %r3144, %r3143, %r3142, 30212; + prmt.b32 %r3145, %r3144, %r3141, 4180; + add.s32 %r3146, %r3138, %r3145; + add.s32 %r3147, %r3146, %r2983; + xor.b32 %r3148, %r3147, %r3047; + shr.u32 %r3149, %r3147, 16; + shl.b32 %r3150, %r3148, 16; + or.b32 %r3151, %r3150, %r3149; + add.s32 %r3152, %r3151, -1521486534; + xor.b32 %r3153, %r3152, %r3138; + shf.l.wrap.b32 %r3154, %r3153, %r3153, 20; + add.s32 %r3155, %r2990, %r3147; + add.s32 %r3156, %r3155, %r3154; + 
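+ // NOTE: 1779033703, -1150833019, 1013904242 and -1521486534 are 0x6A09E667, 0xBB67AE85, 0x3C6EF372 and 0xA54FF53A (the BLAKE3 IV, shared with SHA-256), constant-folded into round one; 4194304 is 64 << 16, folding the xor with the 64-byte block length into the rotate-by-16 (shl/shr/or).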
xor.b32 %r3157, %r3156, %r3151; + shf.l.wrap.b32 %r3158, %r3157, %r3157, 24; + add.s32 %r3159, %r3158, %r3152; + xor.b32 %r3160, %r3159, %r3154; + shf.l.wrap.b32 %r3161, %r3160, %r3160, 25; + add.s32 %r3162, %r3101, %r3069; + add.s32 %r3163, %r3162, %r2997; + xor.b32 %r3164, %r3158, %r3163; + shf.l.wrap.b32 %r3165, %r3164, %r3164, 16; + add.s32 %r3166, %r3165, %r3129; + xor.b32 %r3167, %r3166, %r3101; + shf.l.wrap.b32 %r3168, %r3167, %r3167, 20; + add.s32 %r3169, %r3004, %r3163; + add.s32 %r3170, %r3169, %r3168; + xor.b32 %r3171, %r3170, %r3165; + shf.l.wrap.b32 %r3172, %r3171, %r3171, 24; + add.s32 %r3173, %r3172, %r3166; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 25; + add.s32 %r3176, %r3131, %r3096; + add.s32 %r3177, %r3176, %r3011; + xor.b32 %r3178, %r3177, %r3071; + shf.l.wrap.b32 %r3179, %r3178, %r3178, 16; + add.s32 %r3180, %r3179, %r3159; + xor.b32 %r3181, %r3180, %r3131; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 20; + add.s32 %r3183, %r3018, %r3177; + add.s32 %r3184, %r3183, %r3182; + xor.b32 %r3185, %r3184, %r3179; + shf.l.wrap.b32 %r3186, %r3185, %r3185, 24; + add.s32 %r3187, %r3186, %r3180; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 25; + add.s32 %r3190, %r3161, %r3126; + add.s32 %r3191, %r3190, %r3025; + xor.b32 %r3192, %r3191, %r3098; + shf.l.wrap.b32 %r3193, %r3192, %r3192, 16; + add.s32 %r3194, %r3193, %r3072; + xor.b32 %r3195, %r3194, %r3161; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 20; + add.s32 %r3197, %r3032, %r3191; + add.s32 %r3198, %r3197, %r3196; + xor.b32 %r3199, %r3198, %r3193; + shf.l.wrap.b32 %r3200, %r3199, %r3199, 24; + add.s32 %r3201, %r3200, %r3194; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 25; + add.s32 %r3204, %r3156, %r3074; + add.s32 %r3205, %r3204, %r3039; + xor.b32 %r3206, %r3205, %r3128; + shf.l.wrap.b32 %r3207, %r3206, %r3206, 16; + add.s32 %r3208, %r3207, %r3099; + xor.b32 %r3209, %r3208, %r3074; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 20; + add.s32 %r3211, %r3046, %r3205; + add.s32 %r3212, %r3211, %r3210; + xor.b32 %r3213, %r3212, %r3207; + shf.l.wrap.b32 %r3214, %r3213, %r3213, 24; + add.s32 %r3215, %r3214, %r3208; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 25; + add.s32 %r3218, %r3170, %r2955; + add.s32 %r3219, %r3218, %r3217; + xor.b32 %r3220, %r3219, %r3186; + shf.l.wrap.b32 %r3221, %r3220, %r3220, 16; + add.s32 %r3222, %r3221, %r3201; + xor.b32 %r3223, %r3222, %r3217; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 20; + add.s32 %r3225, %r3219, %r2983; + add.s32 %r3226, %r3225, %r3224; + xor.b32 %r3227, %r3226, %r3221; + shf.l.wrap.b32 %r3228, %r3227, %r3227, 24; + add.s32 %r3229, %r3228, %r3222; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 25; + add.s32 %r3232, %r3184, %r2962; + add.s32 %r3233, %r3232, %r3175; + xor.b32 %r3234, %r3200, %r3233; + shf.l.wrap.b32 %r3235, %r3234, %r3234, 16; + add.s32 %r3236, %r3215, %r3235; + xor.b32 %r3237, %r3236, %r3175; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 20; + add.s32 %r3239, %r3233, %r3011; + add.s32 %r3240, %r3239, %r3238; + xor.b32 %r3241, %r3240, %r3235; + shf.l.wrap.b32 %r3242, %r3241, %r3241, 24; + add.s32 %r3243, %r3242, %r3236; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 25; + add.s32 %r3246, %r3189, %r2990; + add.s32 %r3247, %r3246, %r3198; + xor.b32 %r3248, %r3214, %r3247; + shf.l.wrap.b32 %r3249, %r3248, %r3248, 16; + add.s32 %r3250, %r3249, %r3173; + xor.b32 %r3251, %r3250, %r3189; + shf.l.wrap.b32 %r3252, 
%r3251, %r3251, 20; + add.s32 %r3253, %r3247, %r2941; + add.s32 %r3254, %r3253, %r3252; + xor.b32 %r3255, %r3254, %r3249; + shf.l.wrap.b32 %r3256, %r3255, %r3255, 24; + add.s32 %r3257, %r3256, %r3250; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 25; + add.s32 %r3260, %r3203, %r2969; + add.s32 %r3261, %r3260, %r3212; + xor.b32 %r3262, %r3261, %r3172; + shf.l.wrap.b32 %r3263, %r3262, %r3262, 16; + add.s32 %r3264, %r3263, %r3187; + xor.b32 %r3265, %r3264, %r3203; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 20; + add.s32 %r3267, %r3261, %r3032; + add.s32 %r3268, %r3267, %r3266; + xor.b32 %r3269, %r3268, %r3263; + shf.l.wrap.b32 %r3270, %r3269, %r3269, 24; + add.s32 %r3271, %r3270, %r3264; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 25; + add.s32 %r3274, %r3245, %r2948; + add.s32 %r3275, %r3274, %r3226; + xor.b32 %r3276, %r3275, %r3270; + shf.l.wrap.b32 %r3277, %r3276, %r3276, 16; + add.s32 %r3278, %r3277, %r3257; + xor.b32 %r3279, %r3278, %r3245; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 20; + add.s32 %r3281, %r3275, %r3018; + add.s32 %r3282, %r3281, %r3280; + xor.b32 %r3283, %r3282, %r3277; + shf.l.wrap.b32 %r3284, %r3283, %r3283, 24; + add.s32 %r3285, %r3284, %r3278; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 25; + add.s32 %r3288, %r3240, %r3025; + add.s32 %r3289, %r3288, %r3259; + xor.b32 %r3290, %r3228, %r3289; + shf.l.wrap.b32 %r3291, %r3290, %r3290, 16; + add.s32 %r3292, %r3291, %r3271; + xor.b32 %r3293, %r3292, %r3259; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 20; + add.s32 %r3295, %r3289, %r2976; + add.s32 %r3296, %r3295, %r3294; + xor.b32 %r3297, %r3296, %r3291; + shf.l.wrap.b32 %r3298, %r3297, %r3297, 24; + add.s32 %r3299, %r3298, %r3292; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 25; + add.s32 %r3302, %r3254, %r3004; + add.s32 %r3303, %r3302, %r3273; + xor.b32 %r3304, %r3303, %r3242; + shf.l.wrap.b32 %r3305, %r3304, %r3304, 16; + add.s32 %r3306, %r3305, %r3229; + xor.b32 %r3307, %r3306, %r3273; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 20; + add.s32 %r3309, %r3303, %r3039; + add.s32 %r3310, %r3309, %r3308; + xor.b32 %r3311, %r3310, %r3305; + shf.l.wrap.b32 %r3312, %r3311, %r3311, 24; + add.s32 %r3313, %r3312, %r3306; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 25; + add.s32 %r3316, %r3268, %r3046; + add.s32 %r3317, %r3316, %r3231; + xor.b32 %r3318, %r3317, %r3256; + shf.l.wrap.b32 %r3319, %r3318, %r3318, 16; + add.s32 %r3320, %r3319, %r3243; + xor.b32 %r3321, %r3320, %r3231; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 20; + add.s32 %r3323, %r3317, %r2997; + add.s32 %r3324, %r3323, %r3322; + xor.b32 %r3325, %r3324, %r3319; + shf.l.wrap.b32 %r3326, %r3325, %r3325, 24; + add.s32 %r3327, %r3326, %r3320; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 25; + add.s32 %r3330, %r3282, %r2962; + add.s32 %r3331, %r3330, %r3329; + xor.b32 %r3332, %r3331, %r3298; + shf.l.wrap.b32 %r3333, %r3332, %r3332, 16; + add.s32 %r3334, %r3333, %r3313; + xor.b32 %r3335, %r3334, %r3329; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 20; + add.s32 %r3337, %r3331, %r2969; + add.s32 %r3338, %r3337, %r3336; + xor.b32 %r3339, %r3338, %r3333; + shf.l.wrap.b32 %r3340, %r3339, %r3339, 24; + add.s32 %r3341, %r3340, %r3334; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 25; + add.s32 %r3344, %r3296, %r3011; + add.s32 %r3345, %r3344, %r3287; + xor.b32 %r3346, %r3345, %r3312; + shf.l.wrap.b32 %r3347, %r3346, %r3346, 16; + 
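+ // NOTE: this second unrolled compression reads its 64-byte block from [%rd132+145 .. %rd132+208] and sets flag bit 4 (or.b32 %r3047, %r2934, 4), consistent with a BLAKE3 parent-node merge bounded by the popc.b64 of the chunk counter above.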
add.s32 %r3348, %r3347, %r3327; + xor.b32 %r3349, %r3348, %r3287; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 20; + add.s32 %r3351, %r3345, %r3025; + add.s32 %r3352, %r3351, %r3350; + xor.b32 %r3353, %r3352, %r3347; + shf.l.wrap.b32 %r3354, %r3353, %r3353, 24; + add.s32 %r3355, %r3354, %r3348; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 25; + add.s32 %r3358, %r3310, %r3032; + add.s32 %r3359, %r3358, %r3301; + xor.b32 %r3360, %r3326, %r3359; + shf.l.wrap.b32 %r3361, %r3360, %r3360, 16; + add.s32 %r3362, %r3361, %r3285; + xor.b32 %r3363, %r3362, %r3301; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 20; + add.s32 %r3365, %r3359, %r2955; + add.s32 %r3366, %r3365, %r3364; + xor.b32 %r3367, %r3366, %r3361; + shf.l.wrap.b32 %r3368, %r3367, %r3367, 24; + add.s32 %r3369, %r3368, %r3362; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 25; + add.s32 %r3372, %r3315, %r2990; + add.s32 %r3373, %r3372, %r3324; + xor.b32 %r3374, %r3373, %r3284; + shf.l.wrap.b32 %r3375, %r3374, %r3374, 16; + add.s32 %r3376, %r3375, %r3299; + xor.b32 %r3377, %r3376, %r3315; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 20; + add.s32 %r3379, %r3373, %r3039; + add.s32 %r3380, %r3379, %r3378; + xor.b32 %r3381, %r3380, %r3375; + shf.l.wrap.b32 %r3382, %r3381, %r3381, 24; + add.s32 %r3383, %r3382, %r3376; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 25; + add.s32 %r3386, %r3357, %r2983; + add.s32 %r3387, %r3386, %r3338; + xor.b32 %r3388, %r3387, %r3382; + shf.l.wrap.b32 %r3389, %r3388, %r3388, 16; + add.s32 %r3390, %r3389, %r3369; + xor.b32 %r3391, %r3390, %r3357; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 20; + add.s32 %r3393, %r3387, %r2976; + add.s32 %r3394, %r3393, %r3392; + xor.b32 %r3395, %r3394, %r3389; + shf.l.wrap.b32 %r3396, %r3395, %r3395, 24; + add.s32 %r3397, %r3396, %r3390; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 25; + add.s32 %r3400, %r3352, %r3004; + add.s32 %r3401, %r3400, %r3371; + xor.b32 %r3402, %r3340, %r3401; + shf.l.wrap.b32 %r3403, %r3402, %r3402, 16; + add.s32 %r3404, %r3403, %r3383; + xor.b32 %r3405, %r3404, %r3371; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 20; + add.s32 %r3407, %r3401, %r2941; + add.s32 %r3408, %r3407, %r3406; + xor.b32 %r3409, %r3408, %r3403; + shf.l.wrap.b32 %r3410, %r3409, %r3409, 24; + add.s32 %r3411, %r3410, %r3404; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 25; + add.s32 %r3414, %r3366, %r3018; + add.s32 %r3415, %r3414, %r3385; + xor.b32 %r3416, %r3415, %r3354; + shf.l.wrap.b32 %r3417, %r3416, %r3416, 16; + add.s32 %r3418, %r3417, %r3341; + xor.b32 %r3419, %r3418, %r3385; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 20; + add.s32 %r3421, %r3415, %r3046; + add.s32 %r3422, %r3421, %r3420; + xor.b32 %r3423, %r3422, %r3417; + shf.l.wrap.b32 %r3424, %r3423, %r3423, 24; + add.s32 %r3425, %r3424, %r3418; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 25; + add.s32 %r3428, %r3380, %r2997; + add.s32 %r3429, %r3428, %r3343; + xor.b32 %r3430, %r3429, %r3368; + shf.l.wrap.b32 %r3431, %r3430, %r3430, 16; + add.s32 %r3432, %r3431, %r3355; + xor.b32 %r3433, %r3432, %r3343; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 20; + add.s32 %r3435, %r3429, %r2948; + add.s32 %r3436, %r3435, %r3434; + xor.b32 %r3437, %r3436, %r3431; + shf.l.wrap.b32 %r3438, %r3437, %r3437, 24; + add.s32 %r3439, %r3438, %r3432; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 25; + add.s32 %r3442, %r3394, %r3011; + add.s32 %r3443, %r3442, 
%r3441; + xor.b32 %r3444, %r3443, %r3410; + shf.l.wrap.b32 %r3445, %r3444, %r3444, 16; + add.s32 %r3446, %r3445, %r3425; + xor.b32 %r3447, %r3446, %r3441; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 20; + add.s32 %r3449, %r3443, %r2990; + add.s32 %r3450, %r3449, %r3448; + xor.b32 %r3451, %r3450, %r3445; + shf.l.wrap.b32 %r3452, %r3451, %r3451, 24; + add.s32 %r3453, %r3452, %r3446; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 25; + add.s32 %r3456, %r3408, %r3025; + add.s32 %r3457, %r3456, %r3399; + xor.b32 %r3458, %r3457, %r3424; + shf.l.wrap.b32 %r3459, %r3458, %r3458, 16; + add.s32 %r3460, %r3459, %r3439; + xor.b32 %r3461, %r3460, %r3399; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 20; + add.s32 %r3463, %r3457, %r3004; + add.s32 %r3464, %r3463, %r3462; + xor.b32 %r3465, %r3464, %r3459; + shf.l.wrap.b32 %r3466, %r3465, %r3465, 24; + add.s32 %r3467, %r3466, %r3460; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 25; + add.s32 %r3470, %r3422, %r3039; + add.s32 %r3471, %r3470, %r3413; + xor.b32 %r3472, %r3438, %r3471; + shf.l.wrap.b32 %r3473, %r3472, %r3472, 16; + add.s32 %r3474, %r3473, %r3397; + xor.b32 %r3475, %r3474, %r3413; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 20; + add.s32 %r3477, %r3471, %r2962; + add.s32 %r3478, %r3477, %r3476; + xor.b32 %r3479, %r3478, %r3473; + shf.l.wrap.b32 %r3480, %r3479, %r3479, 24; + add.s32 %r3481, %r3480, %r3474; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 25; + add.s32 %r3484, %r3427, %r3032; + add.s32 %r3485, %r3484, %r3436; + xor.b32 %r3486, %r3485, %r3396; + shf.l.wrap.b32 %r3487, %r3486, %r3486, 16; + add.s32 %r3488, %r3487, %r3411; + xor.b32 %r3489, %r3488, %r3427; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 20; + add.s32 %r3491, %r3485, %r3046; + add.s32 %r3492, %r3491, %r3490; + xor.b32 %r3493, %r3492, %r3487; + shf.l.wrap.b32 %r3494, %r3493, %r3493, 24; + add.s32 %r3495, %r3494, %r3488; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 25; + add.s32 %r3498, %r3469, %r2969; + add.s32 %r3499, %r3498, %r3450; + xor.b32 %r3500, %r3499, %r3494; + shf.l.wrap.b32 %r3501, %r3500, %r3500, 16; + add.s32 %r3502, %r3501, %r3481; + xor.b32 %r3503, %r3502, %r3469; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 20; + add.s32 %r3505, %r3499, %r2941; + add.s32 %r3506, %r3505, %r3504; + xor.b32 %r3507, %r3506, %r3501; + shf.l.wrap.b32 %r3508, %r3507, %r3507, 24; + add.s32 %r3509, %r3508, %r3502; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 25; + add.s32 %r3512, %r3464, %r3018; + add.s32 %r3513, %r3512, %r3483; + xor.b32 %r3514, %r3452, %r3513; + shf.l.wrap.b32 %r3515, %r3514, %r3514, 16; + add.s32 %r3516, %r3515, %r3495; + xor.b32 %r3517, %r3516, %r3483; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 20; + add.s32 %r3519, %r3513, %r2955; + add.s32 %r3520, %r3519, %r3518; + xor.b32 %r3521, %r3520, %r3515; + shf.l.wrap.b32 %r3522, %r3521, %r3521, 24; + add.s32 %r3523, %r3522, %r3516; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 25; + add.s32 %r3526, %r3478, %r2976; + add.s32 %r3527, %r3526, %r3497; + xor.b32 %r3528, %r3527, %r3466; + shf.l.wrap.b32 %r3529, %r3528, %r3528, 16; + add.s32 %r3530, %r3529, %r3453; + xor.b32 %r3531, %r3530, %r3497; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 20; + add.s32 %r3533, %r3527, %r2997; + add.s32 %r3534, %r3533, %r3532; + xor.b32 %r3535, %r3534, %r3529; + shf.l.wrap.b32 %r3536, %r3535, %r3535, 24; + add.s32 %r3537, %r3536, %r3530; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 
%r3539, %r3538, %r3538, 25; + add.s32 %r3540, %r3492, %r2948; + add.s32 %r3541, %r3540, %r3455; + xor.b32 %r3542, %r3541, %r3480; + shf.l.wrap.b32 %r3543, %r3542, %r3542, 16; + add.s32 %r3544, %r3543, %r3467; + xor.b32 %r3545, %r3544, %r3455; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 20; + add.s32 %r3547, %r3541, %r2983; + add.s32 %r3548, %r3547, %r3546; + xor.b32 %r3549, %r3548, %r3543; + shf.l.wrap.b32 %r3550, %r3549, %r3549, 24; + add.s32 %r3551, %r3550, %r3544; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 25; + add.s32 %r3554, %r3506, %r3025; + add.s32 %r3555, %r3554, %r3553; + xor.b32 %r3556, %r3555, %r3522; + shf.l.wrap.b32 %r3557, %r3556, %r3556, 16; + add.s32 %r3558, %r3557, %r3537; + xor.b32 %r3559, %r3558, %r3553; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 20; + add.s32 %r3561, %r3555, %r3032; + add.s32 %r3562, %r3561, %r3560; + xor.b32 %r3563, %r3562, %r3557; + shf.l.wrap.b32 %r3564, %r3563, %r3563, 24; + add.s32 %r3565, %r3564, %r3558; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 25; + add.s32 %r3568, %r3520, %r3004; + add.s32 %r3569, %r3568, %r3511; + xor.b32 %r3570, %r3569, %r3536; + shf.l.wrap.b32 %r3571, %r3570, %r3570, 16; + add.s32 %r3572, %r3571, %r3551; + xor.b32 %r3573, %r3572, %r3511; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 20; + add.s32 %r3575, %r3569, %r3018; + add.s32 %r3576, %r3575, %r3574; + xor.b32 %r3577, %r3576, %r3571; + shf.l.wrap.b32 %r3578, %r3577, %r3577, 24; + add.s32 %r3579, %r3578, %r3572; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 25; + add.s32 %r3582, %r3534, %r3046; + add.s32 %r3583, %r3582, %r3525; + xor.b32 %r3584, %r3550, %r3583; + shf.l.wrap.b32 %r3585, %r3584, %r3584, 16; + add.s32 %r3586, %r3585, %r3509; + xor.b32 %r3587, %r3586, %r3525; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 20; + add.s32 %r3589, %r3583, %r3011; + add.s32 %r3590, %r3589, %r3588; + xor.b32 %r3591, %r3590, %r3585; + shf.l.wrap.b32 %r3592, %r3591, %r3591, 24; + add.s32 %r3593, %r3592, %r3586; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 25; + add.s32 %r3596, %r3539, %r3039; + add.s32 %r3597, %r3596, %r3548; + xor.b32 %r3598, %r3597, %r3508; + shf.l.wrap.b32 %r3599, %r3598, %r3598, 16; + add.s32 %r3600, %r3599, %r3523; + xor.b32 %r3601, %r3600, %r3539; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 20; + add.s32 %r3603, %r3597, %r2997; + add.s32 %r3604, %r3603, %r3602; + xor.b32 %r3605, %r3604, %r3599; + shf.l.wrap.b32 %r3606, %r3605, %r3605, 24; + add.s32 %r3607, %r3606, %r3600; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 25; + add.s32 %r3610, %r3581, %r2990; + add.s32 %r3611, %r3610, %r3562; + xor.b32 %r3612, %r3611, %r3606; + shf.l.wrap.b32 %r3613, %r3612, %r3612, 16; + add.s32 %r3614, %r3613, %r3593; + xor.b32 %r3615, %r3614, %r3581; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 20; + add.s32 %r3617, %r3611, %r2955; + add.s32 %r3618, %r3617, %r3616; + xor.b32 %r3619, %r3618, %r3613; + shf.l.wrap.b32 %r3620, %r3619, %r3619, 24; + add.s32 %r3621, %r3620, %r3614; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 25; + add.s32 %r3624, %r3576, %r2976; + add.s32 %r3625, %r3624, %r3595; + xor.b32 %r3626, %r3564, %r3625; + shf.l.wrap.b32 %r3627, %r3626, %r3626, 16; + add.s32 %r3628, %r3627, %r3607; + xor.b32 %r3629, %r3628, %r3595; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 20; + add.s32 %r3631, %r3625, %r2962; + add.s32 %r3632, %r3631, %r3630; + xor.b32 %r3633, %r3632, %r3627; + shf.l.wrap.b32 %r3634, %r3633, %r3633, 24; 
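+ // Each unrolled round consumes the sixteen message-word registers in a
+ // different order, reflecting the fixed message-word permutation BLAKE3
+ // applies between its seven compression rounds.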
+ add.s32 %r3635, %r3634, %r3628; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 25; + add.s32 %r3638, %r3590, %r2941; + add.s32 %r3639, %r3638, %r3609; + xor.b32 %r3640, %r3639, %r3578; + shf.l.wrap.b32 %r3641, %r3640, %r3640, 16; + add.s32 %r3642, %r3641, %r3565; + xor.b32 %r3643, %r3642, %r3609; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 20; + add.s32 %r3645, %r3639, %r2948; + add.s32 %r3646, %r3645, %r3644; + xor.b32 %r3647, %r3646, %r3641; + shf.l.wrap.b32 %r3648, %r3647, %r3647, 24; + add.s32 %r3649, %r3648, %r3642; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 25; + add.s32 %r3652, %r3604, %r2983; + add.s32 %r3653, %r3652, %r3567; + xor.b32 %r3654, %r3653, %r3592; + shf.l.wrap.b32 %r3655, %r3654, %r3654, 16; + add.s32 %r3656, %r3655, %r3579; + xor.b32 %r3657, %r3656, %r3567; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 20; + add.s32 %r3659, %r3653, %r2969; + add.s32 %r3660, %r3659, %r3658; + xor.b32 %r3661, %r3660, %r3655; + shf.l.wrap.b32 %r3662, %r3661, %r3661, 24; + add.s32 %r3663, %r3662, %r3656; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 25; + add.s32 %r3666, %r3618, %r3004; + add.s32 %r3667, %r3666, %r3665; + xor.b32 %r3668, %r3667, %r3634; + shf.l.wrap.b32 %r3669, %r3668, %r3668, 16; + add.s32 %r3670, %r3669, %r3649; + xor.b32 %r3671, %r3670, %r3665; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 20; + add.s32 %r3673, %r3667, %r3039; + add.s32 %r3674, %r3673, %r3672; + xor.b32 %r3675, %r3674, %r3669; + shf.l.wrap.b32 %r3676, %r3675, %r3675, 24; + add.s32 %r3677, %r3676, %r3670; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 25; + add.s32 %r3680, %r3632, %r3018; + add.s32 %r3681, %r3680, %r3623; + xor.b32 %r3682, %r3681, %r3648; + shf.l.wrap.b32 %r3683, %r3682, %r3682, 16; + add.s32 %r3684, %r3683, %r3663; + xor.b32 %r3685, %r3684, %r3623; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 20; + add.s32 %r3687, %r3681, %r2976; + add.s32 %r3688, %r3687, %r3686; + xor.b32 %r3689, %r3688, %r3683; + shf.l.wrap.b32 %r3690, %r3689, %r3689, 24; + add.s32 %r3691, %r3690, %r3684; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 25; + add.s32 %r3694, %r3646, %r2997; + add.s32 %r3695, %r3694, %r3637; + xor.b32 %r3696, %r3662, %r3695; + shf.l.wrap.b32 %r3697, %r3696, %r3696, 16; + add.s32 %r3698, %r3697, %r3621; + xor.b32 %r3699, %r3698, %r3637; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 20; + add.s32 %r3701, %r3695, %r3025; + add.s32 %r3702, %r3701, %r3700; + xor.b32 %r3703, %r3702, %r3697; + shf.l.wrap.b32 %r3704, %r3703, %r3703, 24; + add.s32 %r3705, %r3704, %r3698; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 25; + add.s32 %r3708, %r3651, %r3046; + add.s32 %r3709, %r3708, %r3660; + xor.b32 %r3710, %r3709, %r3620; + shf.l.wrap.b32 %r3711, %r3710, %r3710, 16; + add.s32 %r3712, %r3711, %r3635; + xor.b32 %r3713, %r3712, %r3651; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 20; + add.s32 %r3715, %r3709, %r2948; + add.s32 %r3716, %r3715, %r3714; + xor.b32 %r3717, %r3716, %r3711; + shf.l.wrap.b32 %r3718, %r3717, %r3717, 24; + add.s32 %r3719, %r3718, %r3712; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 25; + add.s32 %r3722, %r3693, %r3032; + add.s32 %r3723, %r3722, %r3674; + xor.b32 %r3724, %r3723, %r3718; + shf.l.wrap.b32 %r3725, %r3724, %r3724, 16; + add.s32 %r3726, %r3725, %r3705; + xor.b32 %r3727, %r3726, %r3693; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 20; + add.s32 %r3729, %r3723, %r2962; + add.s32 %r3730, %r3729, 
%r3728; + xor.b32 %r3731, %r3730, %r3725; + shf.l.wrap.b32 %r3732, %r3731, %r3731, 24; + add.s32 %r3733, %r3732, %r3726; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 25; + add.s32 %r3736, %r3688, %r2941; + add.s32 %r3737, %r3736, %r3707; + xor.b32 %r3738, %r3676, %r3737; + shf.l.wrap.b32 %r3739, %r3738, %r3738, 16; + add.s32 %r3740, %r3739, %r3719; + xor.b32 %r3741, %r3740, %r3707; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 20; + add.s32 %r3743, %r3737, %r3011; + add.s32 %r3744, %r3743, %r3742; + xor.b32 %r3745, %r3744, %r3739; + shf.l.wrap.b32 %r3746, %r3745, %r3745, 24; + add.s32 %r3747, %r3746, %r3740; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 25; + add.s32 %r3750, %r3702, %r2955; + add.s32 %r3751, %r3750, %r3721; + xor.b32 %r3752, %r3751, %r3690; + shf.l.wrap.b32 %r3753, %r3752, %r3752, 16; + add.s32 %r3754, %r3753, %r3677; + xor.b32 %r3755, %r3754, %r3721; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 20; + add.s32 %r3757, %r3751, %r2983; + add.s32 %r3758, %r3757, %r3756; + xor.b32 %r3759, %r3758, %r3753; + shf.l.wrap.b32 %r3760, %r3759, %r3759, 24; + add.s32 %r3761, %r3760, %r3754; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 25; + add.s32 %r3764, %r3716, %r2969; + add.s32 %r3765, %r3764, %r3679; + xor.b32 %r3766, %r3765, %r3704; + shf.l.wrap.b32 %r3767, %r3766, %r3766, 16; + add.s32 %r3768, %r3767, %r3691; + xor.b32 %r3769, %r3768, %r3679; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 20; + add.s32 %r3771, %r3765, %r2990; + add.s32 %r3772, %r3771, %r3770; + xor.b32 %r3773, %r3772, %r3767; + shf.l.wrap.b32 %r3774, %r3773, %r3773, 24; + add.s32 %r3775, %r3774, %r3768; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 25; + add.s32 %r3778, %r3730, %r3018; + add.s32 %r3779, %r3778, %r3777; + xor.b32 %r3780, %r3779, %r3746; + shf.l.wrap.b32 %r3781, %r3780, %r3780, 16; + add.s32 %r3782, %r3781, %r3761; + xor.b32 %r3783, %r3782, %r3777; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 20; + add.s32 %r3785, %r3779, %r3046; + add.s32 %r3786, %r3785, %r3784; + xor.b32 %r3787, %r3786, %r3781; + shf.l.wrap.b32 %r3788, %r3787, %r3787, 24; + add.s32 %r3789, %r3788, %r3782; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 25; + add.s32 %r3792, %r3744, %r2976; + add.s32 %r3793, %r3792, %r3735; + xor.b32 %r3794, %r3793, %r3760; + shf.l.wrap.b32 %r3795, %r3794, %r3794, 16; + add.s32 %r3796, %r3795, %r3775; + xor.b32 %r3797, %r3796, %r3735; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 20; + add.s32 %r3799, %r3793, %r2941; + add.s32 %r3800, %r3799, %r3798; + xor.b32 %r3801, %r3800, %r3795; + shf.l.wrap.b32 %r3802, %r3801, %r3801, 24; + add.s32 %r3803, %r3802, %r3796; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 25; + add.s32 %r3806, %r3758, %r2948; + add.s32 %r3807, %r3806, %r3749; + xor.b32 %r3808, %r3774, %r3807; + shf.l.wrap.b32 %r3809, %r3808, %r3808, 16; + add.s32 %r3810, %r3809, %r3733; + xor.b32 %r3811, %r3810, %r3749; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 20; + add.s32 %r3813, %r3807, %r3004; + add.s32 %r3814, %r3813, %r3812; + xor.b32 %r3815, %r3814, %r3809; + shf.l.wrap.b32 %r3816, %r3815, %r3815, 24; + add.s32 %r3817, %r3816, %r3810; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 25; + add.s32 %r3820, %r3763, %r2997; + add.s32 %r3821, %r3820, %r3772; + xor.b32 %r3822, %r3821, %r3732; + shf.l.wrap.b32 %r3823, %r3822, %r3822, 16; + add.s32 %r3824, %r3823, %r3747; + xor.b32 %r3825, %r3824, %r3763; + shf.l.wrap.b32 
%r3826, %r3825, %r3825, 20; + add.s32 %r3827, %r3821, %r2983; + add.s32 %r3828, %r3827, %r3826; + xor.b32 %r3829, %r3828, %r3823; + shf.l.wrap.b32 %r3830, %r3829, %r3829, 24; + add.s32 %r3831, %r3830, %r3824; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 25; + add.s32 %r3834, %r3805, %r3039; + add.s32 %r3835, %r3834, %r3786; + xor.b32 %r3836, %r3835, %r3830; + shf.l.wrap.b32 %r3837, %r3836, %r3836, 16; + add.s32 %r3838, %r3837, %r3817; + xor.b32 %r3839, %r3838, %r3805; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 20; + add.s32 %r3841, %r3835, %r3011; + add.s32 %r3842, %r3841, %r3840; + xor.b32 %r3843, %r3842, %r3837; + shf.l.wrap.b32 %r3844, %r3843, %r3843, 24; + add.s32 %r3845, %r3844, %r3838; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 25; + add.s32 %r3848, %r3800, %r2955; + add.s32 %r3849, %r3848, %r3819; + xor.b32 %r3850, %r3788, %r3849; + shf.l.wrap.b32 %r3851, %r3850, %r3850, 16; + add.s32 %r3852, %r3851, %r3831; + xor.b32 %r3853, %r3852, %r3819; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 20; + add.s32 %r3855, %r3849, %r3025; + add.s32 %r3856, %r3855, %r3854; + xor.b32 %r3857, %r3856, %r3851; + shf.l.wrap.b32 %r3858, %r3857, %r3857, 24; + add.s32 %r3859, %r3858, %r3852; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 25; + add.s32 %r3862, %r3814, %r2962; + add.s32 %r3863, %r3862, %r3833; + xor.b32 %r3864, %r3863, %r3802; + shf.l.wrap.b32 %r3865, %r3864, %r3864, 16; + add.s32 %r3866, %r3865, %r3789; + xor.b32 %r3867, %r3866, %r3833; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 20; + add.s32 %r3869, %r3863, %r2969; + add.s32 %r3870, %r3869, %r3868; + xor.b32 %r3871, %r3870, %r3865; + shf.l.wrap.b32 %r3872, %r3871, %r3871, 24; + add.s32 %r3873, %r3872, %r3866; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 25; + add.s32 %r3876, %r3828, %r2990; + add.s32 %r3877, %r3876, %r3791; + xor.b32 %r3878, %r3877, %r3816; + shf.l.wrap.b32 %r3879, %r3878, %r3878, 16; + add.s32 %r3880, %r3879, %r3803; + xor.b32 %r3881, %r3880, %r3791; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 20; + add.s32 %r3883, %r3877, %r3032; + add.s32 %r3884, %r3883, %r3882; + xor.b32 %r3885, %r3884, %r3879; + shf.l.wrap.b32 %r3886, %r3885, %r3885, 24; + add.s32 %r3887, %r3886, %r3880; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 25; + xor.b32 %r3890, %r3873, %r3842; + xor.b32 %r3891, %r3887, %r3856; + xor.b32 %r3892, %r3845, %r3870; + xor.b32 %r3893, %r3884, %r3859; + xor.b32 %r3894, %r3889, %r3858; + xor.b32 %r3895, %r3847, %r3872; + xor.b32 %r3896, %r3886, %r3861; + xor.b32 %r3897, %r3875, %r3844; + st.local.u8 [%rd132+145], %r3890; + shr.u32 %r3898, %r3890, 8; + st.local.u8 [%rd132+146], %r3898; + shr.u32 %r3899, %r3890, 16; + st.local.u8 [%rd132+147], %r3899; + shr.u32 %r3900, %r3890, 24; + st.local.u8 [%rd132+148], %r3900; + st.local.u8 [%rd132+149], %r3891; + shr.u32 %r3901, %r3891, 8; + st.local.u8 [%rd132+150], %r3901; + shr.u32 %r3902, %r3891, 16; + st.local.u8 [%rd132+151], %r3902; + shr.u32 %r3903, %r3891, 24; + st.local.u8 [%rd132+152], %r3903; + st.local.u8 [%rd132+153], %r3892; + shr.u32 %r3904, %r3892, 8; + st.local.u8 [%rd132+154], %r3904; + shr.u32 %r3905, %r3892, 16; + st.local.u8 [%rd132+155], %r3905; + shr.u32 %r3906, %r3892, 24; + st.local.u8 [%rd132+156], %r3906; + st.local.u8 [%rd132+157], %r3893; + shr.u32 %r3907, %r3893, 8; + st.local.u8 [%rd132+158], %r3907; + shr.u32 %r3908, %r3893, 16; + st.local.u8 [%rd132+159], %r3908; + shr.u32 %r3909, %r3893, 24; + st.local.u8 
[%rd132+160], %r3909; + st.local.u8 [%rd132+161], %r3894; + shr.u32 %r3910, %r3894, 8; + st.local.u8 [%rd132+162], %r3910; + shr.u32 %r3911, %r3894, 16; + st.local.u8 [%rd132+163], %r3911; + shr.u32 %r3912, %r3894, 24; + st.local.u8 [%rd132+164], %r3912; + st.local.u8 [%rd132+165], %r3895; + shr.u32 %r3913, %r3895, 8; + st.local.u8 [%rd132+166], %r3913; + shr.u32 %r3914, %r3895, 16; + st.local.u8 [%rd132+167], %r3914; + shr.u32 %r3915, %r3895, 24; + st.local.u8 [%rd132+168], %r3915; + st.local.u8 [%rd132+169], %r3896; + shr.u32 %r3916, %r3896, 8; + st.local.u8 [%rd132+170], %r3916; + shr.u32 %r3917, %r3896, 16; + st.local.u8 [%rd132+171], %r3917; + shr.u32 %r3918, %r3896, 24; + st.local.u8 [%rd132+172], %r3918; + st.local.u8 [%rd132+173], %r3897; + shr.u32 %r3919, %r3897, 8; + st.local.u8 [%rd132+174], %r3919; + shr.u32 %r3920, %r3897, 16; + st.local.u8 [%rd132+175], %r3920; + shr.u32 %r3921, %r3897, 24; + st.local.u8 [%rd132+176], %r3921; + ld.local.u8 %rs128, [%rd3+8]; + add.s16 %rs129, %rs128, -1; + st.local.u8 [%rd3+8], %rs129; + cvt.u64.u16 %rd133, %rs129; + and.b64 %rd134, %rd133, 255; + setp.lt.u64 %p16, %rd30, %rd134; + and.b16 %rs130, %rs129, 255; + mul.wide.u16 %r11659, %rs130, 32; + @%p16 bra $L__BB1_18; + +$L__BB1_19: + ld.param.u64 %rd222, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvt.s64.s32 %rd136, %r11659; + add.s64 %rd137, %rd2, %rd136; + mov.u64 %rd245, 0; + st.local.u8 [%rd137+145], %r27; + shr.u32 %r3922, %r27, 8; + st.local.u8 [%rd137+146], %r3922; + shr.u32 %r3923, %r27, 16; + st.local.u8 [%rd137+147], %r3923; + shr.u32 %r3924, %r27, 24; + st.local.u8 [%rd137+148], %r3924; + st.local.u8 [%rd137+149], %r28; + shr.u32 %r3925, %r28, 8; + st.local.u8 [%rd137+150], %r3925; + shr.u32 %r3926, %r28, 16; + st.local.u8 [%rd137+151], %r3926; + shr.u32 %r3927, %r28, 24; + st.local.u8 [%rd137+152], %r3927; + st.local.u8 [%rd137+153], %r29; + shr.u32 %r3928, %r29, 8; + st.local.u8 [%rd137+154], %r3928; + shr.u32 %r3929, %r29, 16; + st.local.u8 [%rd137+155], %r3929; + shr.u32 %r3930, %r29, 24; + st.local.u8 [%rd137+156], %r3930; + st.local.u8 [%rd137+157], %r30; + shr.u32 %r3931, %r30, 8; + st.local.u8 [%rd137+158], %r3931; + shr.u32 %r3932, %r30, 16; + st.local.u8 [%rd137+159], %r3932; + shr.u32 %r3933, %r30, 24; + st.local.u8 [%rd137+160], %r3933; + st.local.u8 [%rd137+161], %r31; + shr.u32 %r3934, %r31, 8; + st.local.u8 [%rd137+162], %r3934; + shr.u32 %r3935, %r31, 16; + st.local.u8 [%rd137+163], %r3935; + shr.u32 %r3936, %r31, 24; + st.local.u8 [%rd137+164], %r3936; + st.local.u8 [%rd137+165], %r32; + shr.u32 %r3937, %r32, 8; + st.local.u8 [%rd137+166], %r3937; + shr.u32 %r3938, %r32, 16; + st.local.u8 [%rd137+167], %r3938; + shr.u32 %r3939, %r32, 24; + st.local.u8 [%rd137+168], %r3939; + st.local.u8 [%rd137+169], %r33; + shr.u32 %r3940, %r33, 8; + st.local.u8 [%rd137+170], %r3940; + shr.u32 %r3941, %r33, 16; + st.local.u8 [%rd137+171], %r3941; + shr.u32 %r3942, %r33, 24; + st.local.u8 [%rd137+172], %r3942; + st.local.u8 [%rd137+173], %r34; + shr.u32 %r3943, %r34, 8; + st.local.u8 [%rd137+174], %r3943; + shr.u32 %r3944, %r34, 16; + st.local.u8 [%rd137+175], %r3944; + shr.u32 %r3945, %r34, 24; + st.local.u8 [%rd137+176], %r3945; + ld.local.u8 %rs131, [%rd3+8]; + add.s16 %rs132, %rs131, 1; + st.local.u8 [%rd3+8], %rs132; + ld.local.u64 %rd138, [%rd3+-72]; + add.s64 %rd32, %rd138, 1; + add.s64 %rd253, %rd222, %rd6; + +$L__BB1_20: + add.s64 %rd139, %rd2, %rd245; + ld.local.u8 %rs133, [%rd139]; + st.local.u8 [%rd139+32], %rs133; + add.s64 %rd245, %rd245, 1; + 
setp.lt.u64 %p17, %rd245, 32; + @%p17 bra $L__BB1_20; + + mov.u64 %rd246, 0; + st.local.u64 [%rd3+-72], %rd32; + mov.u16 %rs134, 0; + st.local.u8 [%rd3+1], %rs134; + +$L__BB1_22: + add.s64 %rd141, %rd2, %rd246; + st.local.u8 [%rd141+72], %rs134; + add.s64 %rd246, %rd246, 1; + setp.lt.u64 %p18, %rd246, 64; + @%p18 bra $L__BB1_22; + + ld.param.u64 %rd235, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd234, %rd235; + add.s64 %rd260, %rd234, %rd6; + mov.u64 %rd224, 32; + sub.s64 %rd261, %rd224, %rd6; + mov.u16 %rs136, 0; + st.local.u8 [%rd3], %rs136; + +$L__BB1_24: + setp.lt.u64 %p19, %rd261, 1025; + @%p19 bra $L__BB1_48; + + ld.local.u64 %rd250, [%rd3+-72]; + add.u64 %rd142, %SP, 0; + add.u64 %rd42, %SPL, 0; + +$L__BB1_26: + or.b64 %rd143, %rd261, 1; + mov.u64 %rd144, 1; + setp.gt.u64 %p20, %rd143, 4294967295; + shr.u64 %rd145, %rd261, 32; + selp.b64 %rd146, %rd145, %rd143, %p20; + selp.b32 %r3946, 32, 0, %p20; + and.b64 %rd147, %rd146, 4294901760; + setp.ne.s64 %p21, %rd147, 0; + shr.u64 %rd148, %rd146, 16; + or.b32 %r3947, %r3946, 16; + selp.b64 %rd149, %rd148, %rd146, %p21; + selp.b32 %r3948, %r3947, %r3946, %p21; + and.b64 %rd150, %rd149, 65280; + setp.ne.s64 %p22, %rd150, 0; + shr.u64 %rd151, %rd149, 8; + or.b32 %r3949, %r3948, 8; + selp.b64 %rd152, %rd151, %rd149, %p22; + selp.b32 %r3950, %r3949, %r3948, %p22; + and.b64 %rd153, %rd152, 240; + setp.ne.s64 %p23, %rd153, 0; + shr.u64 %rd154, %rd152, 4; + or.b32 %r3951, %r3950, 4; + selp.b64 %rd155, %rd154, %rd152, %p23; + selp.b32 %r3952, %r3951, %r3950, %p23; + and.b64 %rd156, %rd155, 12; + setp.ne.s64 %p24, %rd156, 0; + shr.u64 %rd157, %rd155, 2; + add.s32 %r3953, %r3952, 2; + selp.b64 %rd158, %rd157, %rd155, %p24; + selp.b32 %r3954, %r3953, %r3952, %p24; + bfe.u64 %rd159, %rd158, 1, 1; + cvt.u32.u64 %r3955, %rd159; + add.s32 %r3956, %r3954, %r3955; + shl.b64 %rd254, %rd144, %r3956; + shl.b64 %rd48, %rd250, 10; + +$L__BB1_27: + mov.u64 %rd49, %rd254; + add.s64 %rd160, %rd49, -1; + and.b64 %rd161, %rd160, %rd48; + setp.ne.s64 %p25, %rd161, 0; + shr.u64 %rd254, %rd49, 1; + @%p25 bra $L__BB1_27; + + ld.local.u8 %rs14, [%rd3+2]; + setp.lt.u64 %p26, %rd49, 1025; + @%p26 bra $L__BB1_36; + bra.uni $L__BB1_29; + +$L__BB1_36: + ld.local.u8 %r5955, [%rd3+-136]; + ld.local.u8 %r5956, [%rd3+-135]; + prmt.b32 %r5957, %r5956, %r5955, 30212; + ld.local.u8 %r5958, [%rd3+-134]; + ld.local.u8 %r5959, [%rd3+-133]; + prmt.b32 %r5960, %r5959, %r5958, 30212; + prmt.b32 %r11679, %r5960, %r5957, 4180; + ld.local.u8 %r5961, [%rd3+-132]; + ld.local.u8 %r5962, [%rd3+-131]; + prmt.b32 %r5963, %r5962, %r5961, 30212; + ld.local.u8 %r5964, [%rd3+-130]; + ld.local.u8 %r5965, [%rd3+-129]; + prmt.b32 %r5966, %r5965, %r5964, 30212; + prmt.b32 %r11678, %r5966, %r5963, 4180; + ld.local.u8 %r5967, [%rd3+-128]; + ld.local.u8 %r5968, [%rd3+-127]; + prmt.b32 %r5969, %r5968, %r5967, 30212; + ld.local.u8 %r5970, [%rd3+-126]; + ld.local.u8 %r5971, [%rd3+-125]; + prmt.b32 %r5972, %r5971, %r5970, 30212; + prmt.b32 %r11677, %r5972, %r5969, 4180; + ld.local.u8 %r5973, [%rd3+-124]; + ld.local.u8 %r5974, [%rd3+-123]; + prmt.b32 %r5975, %r5974, %r5973, 30212; + ld.local.u8 %r5976, [%rd3+-122]; + ld.local.u8 %r5977, [%rd3+-121]; + prmt.b32 %r5978, %r5977, %r5976, 30212; + prmt.b32 %r11676, %r5978, %r5975, 4180; + ld.local.u8 %r5979, [%rd3+-120]; + ld.local.u8 %r5980, [%rd3+-119]; + prmt.b32 %r5981, %r5980, %r5979, 30212; + ld.local.u8 %r5982, [%rd3+-118]; + ld.local.u8 %r5983, [%rd3+-117]; + prmt.b32 %r5984, %r5983, %r5982, 30212; + prmt.b32 %r11675, 
%r5984, %r5981, 4180; + ld.local.u8 %r5985, [%rd3+-116]; + ld.local.u8 %r5986, [%rd3+-115]; + prmt.b32 %r5987, %r5986, %r5985, 30212; + ld.local.u8 %r5988, [%rd3+-114]; + ld.local.u8 %r5989, [%rd3+-113]; + prmt.b32 %r5990, %r5989, %r5988, 30212; + prmt.b32 %r11674, %r5990, %r5987, 4180; + ld.local.u8 %r5991, [%rd3+-112]; + ld.local.u8 %r5992, [%rd3+-111]; + prmt.b32 %r5993, %r5992, %r5991, 30212; + ld.local.u8 %r5994, [%rd3+-110]; + ld.local.u8 %r5995, [%rd3+-109]; + prmt.b32 %r5996, %r5995, %r5994, 30212; + prmt.b32 %r11673, %r5996, %r5993, 4180; + ld.local.u8 %r5997, [%rd3+-108]; + ld.local.u8 %r5998, [%rd3+-107]; + prmt.b32 %r5999, %r5998, %r5997, 30212; + ld.local.u8 %r6000, [%rd3+-106]; + ld.local.u8 %r6001, [%rd3+-105]; + prmt.b32 %r6002, %r6001, %r6000, 30212; + prmt.b32 %r11672, %r6002, %r5999, 4180; + add.u64 %rd53, %SPL, 64; + mov.u32 %r6003, 0; + st.local.v2.u32 [%rd53], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+8], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+16], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+24], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+32], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+40], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+48], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+56], {%r6003, %r6003}; + mov.u16 %rs354, 0; + st.local.v2.u8 [%rd53+64], {%rs354, %rs354}; + st.local.u8 [%rd53+66], %rs14; + cvt.u32.u64 %r71, %rd250; + shr.u64 %rd184, %rd250, 32; + cvt.u32.u64 %r72, %rd184; + setp.lt.u64 %p31, %rd49, 65; + mov.u64 %rd257, %rd260; + mov.u64 %rd258, %rd49; + @%p31 bra $L__BB1_39; + + add.s64 %rd54, %rd53, 64; + mov.u16 %rs353, 0; + mov.u64 %rd258, %rd49; + mov.u64 %rd257, %rd260; + +$L__BB1_38: + and.b16 %rs213, %rs353, 255; + setp.eq.s16 %p32, %rs213, 0; + selp.u16 %rs214, 1, 0, %p32; + or.b16 %rs215, %rs14, %rs214; + ld.local.u8 %r6004, [%rd257]; + ld.local.u8 %r6005, [%rd257+1]; + prmt.b32 %r6006, %r6005, %r6004, 30212; + ld.local.u8 %r6007, [%rd257+2]; + prmt.b32 %r6008, %r6007, %r6006, 28756; + ld.local.u8 %r6009, [%rd257+3]; + prmt.b32 %r6010, %r6009, %r6008, 1620; + ld.local.u8 %r6011, [%rd257+4]; + ld.local.u8 %r6012, [%rd257+5]; + prmt.b32 %r6013, %r6012, %r6011, 30212; + ld.local.u8 %r6014, [%rd257+6]; + prmt.b32 %r6015, %r6014, %r6013, 28756; + ld.local.u8 %r6016, [%rd257+7]; + prmt.b32 %r6017, %r6016, %r6015, 1620; + ld.local.u8 %r6018, [%rd257+8]; + ld.local.u8 %r6019, [%rd257+9]; + prmt.b32 %r6020, %r6019, %r6018, 30212; + ld.local.u8 %r6021, [%rd257+10]; + prmt.b32 %r6022, %r6021, %r6020, 28756; + ld.local.u8 %r6023, [%rd257+11]; + prmt.b32 %r6024, %r6023, %r6022, 1620; + ld.local.u8 %r6025, [%rd257+12]; + ld.local.u8 %r6026, [%rd257+13]; + prmt.b32 %r6027, %r6026, %r6025, 30212; + ld.local.u8 %r6028, [%rd257+14]; + prmt.b32 %r6029, %r6028, %r6027, 28756; + ld.local.u8 %r6030, [%rd257+15]; + prmt.b32 %r6031, %r6030, %r6029, 1620; + ld.local.u8 %r6032, [%rd257+16]; + ld.local.u8 %r6033, [%rd257+17]; + prmt.b32 %r6034, %r6033, %r6032, 30212; + ld.local.u8 %r6035, [%rd257+18]; + prmt.b32 %r6036, %r6035, %r6034, 28756; + ld.local.u8 %r6037, [%rd257+19]; + prmt.b32 %r6038, %r6037, %r6036, 1620; + ld.local.u8 %r6039, [%rd257+20]; + ld.local.u8 %r6040, [%rd257+21]; + prmt.b32 %r6041, %r6040, %r6039, 30212; + ld.local.u8 %r6042, [%rd257+22]; + prmt.b32 %r6043, %r6042, %r6041, 28756; + ld.local.u8 %r6044, [%rd257+23]; + prmt.b32 %r6045, %r6044, %r6043, 1620; + ld.local.u8 %r6046, [%rd257+24]; + ld.local.u8 %r6047, [%rd257+25]; + prmt.b32 %r6048, %r6047, %r6046, 30212; + ld.local.u8 %r6049, [%rd257+26]; + prmt.b32 %r6050, %r6049, %r6048, 28756; + 
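+ // These ld.local.u8 / prmt.b32 sequences gather four consecutive input
+ // bytes into one little-endian 32-bit word; sixteen such words form the
+ // 64-byte BLAKE3 block that the round code below compresses.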
ld.local.u8 %r6051, [%rd257+27]; + prmt.b32 %r6052, %r6051, %r6050, 1620; + ld.local.u8 %r6053, [%rd257+28]; + ld.local.u8 %r6054, [%rd257+29]; + prmt.b32 %r6055, %r6054, %r6053, 30212; + ld.local.u8 %r6056, [%rd257+30]; + prmt.b32 %r6057, %r6056, %r6055, 28756; + ld.local.u8 %r6058, [%rd257+31]; + prmt.b32 %r6059, %r6058, %r6057, 1620; + ld.local.u8 %r6060, [%rd257+32]; + ld.local.u8 %r6061, [%rd257+33]; + prmt.b32 %r6062, %r6061, %r6060, 30212; + ld.local.u8 %r6063, [%rd257+34]; + prmt.b32 %r6064, %r6063, %r6062, 28756; + ld.local.u8 %r6065, [%rd257+35]; + prmt.b32 %r6066, %r6065, %r6064, 1620; + ld.local.u8 %r6067, [%rd257+36]; + ld.local.u8 %r6068, [%rd257+37]; + prmt.b32 %r6069, %r6068, %r6067, 30212; + ld.local.u8 %r6070, [%rd257+38]; + prmt.b32 %r6071, %r6070, %r6069, 28756; + ld.local.u8 %r6072, [%rd257+39]; + prmt.b32 %r6073, %r6072, %r6071, 1620; + ld.local.u8 %r6074, [%rd257+40]; + ld.local.u8 %r6075, [%rd257+41]; + prmt.b32 %r6076, %r6075, %r6074, 30212; + ld.local.u8 %r6077, [%rd257+42]; + prmt.b32 %r6078, %r6077, %r6076, 28756; + ld.local.u8 %r6079, [%rd257+43]; + prmt.b32 %r6080, %r6079, %r6078, 1620; + ld.local.u8 %r6081, [%rd257+44]; + ld.local.u8 %r6082, [%rd257+45]; + prmt.b32 %r6083, %r6082, %r6081, 30212; + ld.local.u8 %r6084, [%rd257+46]; + prmt.b32 %r6085, %r6084, %r6083, 28756; + ld.local.u8 %r6086, [%rd257+47]; + prmt.b32 %r6087, %r6086, %r6085, 1620; + ld.local.u8 %r6088, [%rd257+48]; + ld.local.u8 %r6089, [%rd257+49]; + prmt.b32 %r6090, %r6089, %r6088, 30212; + ld.local.u8 %r6091, [%rd257+50]; + prmt.b32 %r6092, %r6091, %r6090, 28756; + ld.local.u8 %r6093, [%rd257+51]; + prmt.b32 %r6094, %r6093, %r6092, 1620; + ld.local.u8 %r6095, [%rd257+52]; + ld.local.u8 %r6096, [%rd257+53]; + prmt.b32 %r6097, %r6096, %r6095, 30212; + ld.local.u8 %r6098, [%rd257+54]; + prmt.b32 %r6099, %r6098, %r6097, 28756; + ld.local.u8 %r6100, [%rd257+55]; + prmt.b32 %r6101, %r6100, %r6099, 1620; + ld.local.u8 %r6102, [%rd257+56]; + ld.local.u8 %r6103, [%rd257+57]; + prmt.b32 %r6104, %r6103, %r6102, 30212; + ld.local.u8 %r6105, [%rd257+58]; + prmt.b32 %r6106, %r6105, %r6104, 28756; + ld.local.u8 %r6107, [%rd257+59]; + prmt.b32 %r6108, %r6107, %r6106, 1620; + ld.local.u8 %r6109, [%rd257+60]; + ld.local.u8 %r6110, [%rd257+61]; + prmt.b32 %r6111, %r6110, %r6109, 30212; + ld.local.u8 %r6112, [%rd257+62]; + prmt.b32 %r6113, %r6112, %r6111, 28756; + ld.local.u8 %r6114, [%rd257+63]; + prmt.b32 %r6115, %r6114, %r6113, 1620; + cvt.u32.u16 %r6116, %rs215; + and.b32 %r6117, %r6116, 255; + add.s32 %r6118, %r11679, %r11675; + add.s32 %r6119, %r6118, %r6010; + xor.b32 %r6120, %r6119, %r71; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 16; + add.s32 %r6122, %r6121, 1779033703; + xor.b32 %r6123, %r6122, %r11675; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 20; + add.s32 %r6125, %r6017, %r6119; + add.s32 %r6126, %r6125, %r6124; + xor.b32 %r6127, %r6126, %r6121; + shf.l.wrap.b32 %r6128, %r6127, %r6127, 24; + add.s32 %r6129, %r6128, %r6122; + xor.b32 %r6130, %r6129, %r6124; + shf.l.wrap.b32 %r6131, %r6130, %r6130, 25; + add.s32 %r6132, %r11678, %r11674; + add.s32 %r6133, %r6132, %r6024; + xor.b32 %r6134, %r6133, %r72; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 16; + add.s32 %r6136, %r6135, -1150833019; + xor.b32 %r6137, %r6136, %r11674; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 20; + add.s32 %r6139, %r6031, %r6133; + add.s32 %r6140, %r6139, %r6138; + xor.b32 %r6141, %r6140, %r6135; + shf.l.wrap.b32 %r6142, %r6141, %r6141, 24; + add.s32 %r6143, %r6142, %r6136; + xor.b32 %r6144, %r6143, %r6138; + shf.l.wrap.b32 %r6145, 
%r6144, %r6144, 25; + add.s32 %r6146, %r11677, %r11673; + add.s32 %r6147, %r6146, %r6038; + shr.u32 %r6148, %r6147, 16; + shl.b32 %r6149, %r6147, 16; + xor.b32 %r6150, %r6149, 4194304; + or.b32 %r6151, %r6150, %r6148; + add.s32 %r6152, %r6151, 1013904242; + xor.b32 %r6153, %r6152, %r11673; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6045, %r6147; + add.s32 %r6156, %r6155, %r6154; + xor.b32 %r6157, %r6156, %r6151; + shf.l.wrap.b32 %r6158, %r6157, %r6157, 24; + add.s32 %r6159, %r6158, %r6152; + xor.b32 %r6160, %r6159, %r6154; + shf.l.wrap.b32 %r6161, %r6160, %r6160, 25; + add.s32 %r6162, %r11676, %r11672; + add.s32 %r6163, %r6162, %r6052; + xor.b32 %r6164, %r6163, %r6117; + shr.u32 %r6165, %r6163, 16; + shl.b32 %r6166, %r6164, 16; + or.b32 %r6167, %r6166, %r6165; + add.s32 %r6168, %r6167, -1521486534; + xor.b32 %r6169, %r6168, %r11672; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6059, %r6163; + add.s32 %r6172, %r6171, %r6170; + xor.b32 %r6173, %r6172, %r6167; + shf.l.wrap.b32 %r6174, %r6173, %r6173, 24; + add.s32 %r6175, %r6174, %r6168; + xor.b32 %r6176, %r6175, %r6170; + shf.l.wrap.b32 %r6177, %r6176, %r6176, 25; + add.s32 %r6178, %r6145, %r6126; + add.s32 %r6179, %r6178, %r6066; + xor.b32 %r6180, %r6174, %r6179; + shf.l.wrap.b32 %r6181, %r6180, %r6180, 16; + add.s32 %r6182, %r6181, %r6159; + xor.b32 %r6183, %r6182, %r6145; + shf.l.wrap.b32 %r6184, %r6183, %r6183, 20; + add.s32 %r6185, %r6073, %r6179; + add.s32 %r6186, %r6185, %r6184; + xor.b32 %r6187, %r6186, %r6181; + shf.l.wrap.b32 %r6188, %r6187, %r6187, 24; + add.s32 %r6189, %r6188, %r6182; + xor.b32 %r6190, %r6189, %r6184; + shf.l.wrap.b32 %r6191, %r6190, %r6190, 25; + add.s32 %r6192, %r6161, %r6140; + add.s32 %r6193, %r6192, %r6080; + xor.b32 %r6194, %r6193, %r6128; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 16; + add.s32 %r6196, %r6195, %r6175; + xor.b32 %r6197, %r6196, %r6161; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 20; + add.s32 %r6199, %r6087, %r6193; + add.s32 %r6200, %r6199, %r6198; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 24; + add.s32 %r6203, %r6202, %r6196; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 25; + add.s32 %r6206, %r6177, %r6156; + add.s32 %r6207, %r6206, %r6094; + xor.b32 %r6208, %r6207, %r6142; + shf.l.wrap.b32 %r6209, %r6208, %r6208, 16; + add.s32 %r6210, %r6209, %r6129; + xor.b32 %r6211, %r6210, %r6177; + shf.l.wrap.b32 %r6212, %r6211, %r6211, 20; + add.s32 %r6213, %r6101, %r6207; + add.s32 %r6214, %r6213, %r6212; + xor.b32 %r6215, %r6214, %r6209; + shf.l.wrap.b32 %r6216, %r6215, %r6215, 24; + add.s32 %r6217, %r6216, %r6210; + xor.b32 %r6218, %r6217, %r6212; + shf.l.wrap.b32 %r6219, %r6218, %r6218, 25; + add.s32 %r6220, %r6172, %r6131; + add.s32 %r6221, %r6220, %r6108; + xor.b32 %r6222, %r6221, %r6158; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 16; + add.s32 %r6224, %r6223, %r6143; + xor.b32 %r6225, %r6224, %r6131; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 20; + add.s32 %r6227, %r6115, %r6221; + add.s32 %r6228, %r6227, %r6226; + xor.b32 %r6229, %r6228, %r6223; + shf.l.wrap.b32 %r6230, %r6229, %r6229, 24; + add.s32 %r6231, %r6230, %r6224; + xor.b32 %r6232, %r6231, %r6226; + shf.l.wrap.b32 %r6233, %r6232, %r6232, 25; + add.s32 %r6234, %r6186, %r6024; + add.s32 %r6235, %r6234, %r6233; + xor.b32 %r6236, %r6235, %r6202; + shf.l.wrap.b32 %r6237, %r6236, %r6236, 16; + add.s32 %r6238, %r6237, %r6217; + xor.b32 %r6239, %r6238, %r6233; + shf.l.wrap.b32 %r6240, %r6239, %r6239, 20; + add.s32 %r6241, %r6235, %r6052; + 
add.s32 %r6242, %r6241, %r6240; + xor.b32 %r6243, %r6242, %r6237; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6238; + xor.b32 %r6246, %r6245, %r6240; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6200, %r6031; + add.s32 %r6249, %r6248, %r6191; + xor.b32 %r6250, %r6216, %r6249; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6231, %r6251; + xor.b32 %r6253, %r6252, %r6191; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6080; + add.s32 %r6256, %r6255, %r6254; + xor.b32 %r6257, %r6256, %r6251; + shf.l.wrap.b32 %r6258, %r6257, %r6257, 24; + add.s32 %r6259, %r6258, %r6252; + xor.b32 %r6260, %r6259, %r6254; + shf.l.wrap.b32 %r6261, %r6260, %r6260, 25; + add.s32 %r6262, %r6205, %r6059; + add.s32 %r6263, %r6262, %r6214; + xor.b32 %r6264, %r6230, %r6263; + shf.l.wrap.b32 %r6265, %r6264, %r6264, 16; + add.s32 %r6266, %r6265, %r6189; + xor.b32 %r6267, %r6266, %r6205; + shf.l.wrap.b32 %r6268, %r6267, %r6267, 20; + add.s32 %r6269, %r6263, %r6010; + add.s32 %r6270, %r6269, %r6268; + xor.b32 %r6271, %r6270, %r6265; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6266; + xor.b32 %r6274, %r6273, %r6268; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6219, %r6038; + add.s32 %r6277, %r6276, %r6228; + xor.b32 %r6278, %r6277, %r6188; + shf.l.wrap.b32 %r6279, %r6278, %r6278, 16; + add.s32 %r6280, %r6279, %r6203; + xor.b32 %r6281, %r6280, %r6219; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 20; + add.s32 %r6283, %r6277, %r6101; + add.s32 %r6284, %r6283, %r6282; + xor.b32 %r6285, %r6284, %r6279; + shf.l.wrap.b32 %r6286, %r6285, %r6285, 24; + add.s32 %r6287, %r6286, %r6280; + xor.b32 %r6288, %r6287, %r6282; + shf.l.wrap.b32 %r6289, %r6288, %r6288, 25; + add.s32 %r6290, %r6242, %r6017; + add.s32 %r6291, %r6290, %r6261; + xor.b32 %r6292, %r6291, %r6286; + shf.l.wrap.b32 %r6293, %r6292, %r6292, 16; + add.s32 %r6294, %r6293, %r6273; + xor.b32 %r6295, %r6294, %r6261; + shf.l.wrap.b32 %r6296, %r6295, %r6295, 20; + add.s32 %r6297, %r6291, %r6087; + add.s32 %r6298, %r6297, %r6296; + xor.b32 %r6299, %r6298, %r6293; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 24; + add.s32 %r6301, %r6300, %r6294; + xor.b32 %r6302, %r6301, %r6296; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 25; + add.s32 %r6304, %r6256, %r6094; + add.s32 %r6305, %r6304, %r6275; + xor.b32 %r6306, %r6305, %r6244; + shf.l.wrap.b32 %r6307, %r6306, %r6306, 16; + add.s32 %r6308, %r6307, %r6287; + xor.b32 %r6309, %r6308, %r6275; + shf.l.wrap.b32 %r6310, %r6309, %r6309, 20; + add.s32 %r6311, %r6305, %r6045; + add.s32 %r6312, %r6311, %r6310; + xor.b32 %r6313, %r6312, %r6307; + shf.l.wrap.b32 %r6314, %r6313, %r6313, 24; + add.s32 %r6315, %r6314, %r6308; + xor.b32 %r6316, %r6315, %r6310; + shf.l.wrap.b32 %r6317, %r6316, %r6316, 25; + add.s32 %r6318, %r6270, %r6073; + add.s32 %r6319, %r6318, %r6289; + xor.b32 %r6320, %r6319, %r6258; + shf.l.wrap.b32 %r6321, %r6320, %r6320, 16; + add.s32 %r6322, %r6321, %r6245; + xor.b32 %r6323, %r6322, %r6289; + shf.l.wrap.b32 %r6324, %r6323, %r6323, 20; + add.s32 %r6325, %r6319, %r6108; + add.s32 %r6326, %r6325, %r6324; + xor.b32 %r6327, %r6326, %r6321; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 24; + add.s32 %r6329, %r6328, %r6322; + xor.b32 %r6330, %r6329, %r6324; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 25; + add.s32 %r6332, %r6284, %r6115; + add.s32 %r6333, %r6332, %r6247; + xor.b32 %r6334, %r6333, %r6272; + shf.l.wrap.b32 %r6335, %r6334, %r6334, 16; + add.s32 %r6336, %r6335, %r6259; + xor.b32 %r6337, %r6336, 
%r6247; + shf.l.wrap.b32 %r6338, %r6337, %r6337, 20; + add.s32 %r6339, %r6333, %r6066; + add.s32 %r6340, %r6339, %r6338; + xor.b32 %r6341, %r6340, %r6335; + shf.l.wrap.b32 %r6342, %r6341, %r6341, 24; + add.s32 %r6343, %r6342, %r6336; + xor.b32 %r6344, %r6343, %r6338; + shf.l.wrap.b32 %r6345, %r6344, %r6344, 25; + add.s32 %r6346, %r6298, %r6031; + add.s32 %r6347, %r6346, %r6345; + xor.b32 %r6348, %r6347, %r6314; + shf.l.wrap.b32 %r6349, %r6348, %r6348, 16; + add.s32 %r6350, %r6349, %r6329; + xor.b32 %r6351, %r6350, %r6345; + shf.l.wrap.b32 %r6352, %r6351, %r6351, 20; + add.s32 %r6353, %r6347, %r6038; + add.s32 %r6354, %r6353, %r6352; + xor.b32 %r6355, %r6354, %r6349; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6350; + xor.b32 %r6358, %r6357, %r6352; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6312, %r6080; + add.s32 %r6361, %r6360, %r6303; + xor.b32 %r6362, %r6361, %r6328; + shf.l.wrap.b32 %r6363, %r6362, %r6362, 16; + add.s32 %r6364, %r6363, %r6343; + xor.b32 %r6365, %r6364, %r6303; + shf.l.wrap.b32 %r6366, %r6365, %r6365, 20; + add.s32 %r6367, %r6361, %r6094; + add.s32 %r6368, %r6367, %r6366; + xor.b32 %r6369, %r6368, %r6363; + shf.l.wrap.b32 %r6370, %r6369, %r6369, 24; + add.s32 %r6371, %r6370, %r6364; + xor.b32 %r6372, %r6371, %r6366; + shf.l.wrap.b32 %r6373, %r6372, %r6372, 25; + add.s32 %r6374, %r6326, %r6101; + add.s32 %r6375, %r6374, %r6317; + xor.b32 %r6376, %r6375, %r6342; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6301; + xor.b32 %r6379, %r6378, %r6317; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6024; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6340, %r6059; + add.s32 %r6389, %r6388, %r6331; + xor.b32 %r6390, %r6389, %r6300; + shf.l.wrap.b32 %r6391, %r6390, %r6390, 16; + add.s32 %r6392, %r6391, %r6315; + xor.b32 %r6393, %r6392, %r6331; + shf.l.wrap.b32 %r6394, %r6393, %r6393, 20; + add.s32 %r6395, %r6389, %r6108; + add.s32 %r6396, %r6395, %r6394; + xor.b32 %r6397, %r6396, %r6391; + shf.l.wrap.b32 %r6398, %r6397, %r6397, 24; + add.s32 %r6399, %r6398, %r6392; + xor.b32 %r6400, %r6399, %r6394; + shf.l.wrap.b32 %r6401, %r6400, %r6400, 25; + add.s32 %r6402, %r6354, %r6052; + add.s32 %r6403, %r6402, %r6373; + xor.b32 %r6404, %r6403, %r6398; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 16; + add.s32 %r6406, %r6405, %r6385; + xor.b32 %r6407, %r6406, %r6373; + shf.l.wrap.b32 %r6408, %r6407, %r6407, 20; + add.s32 %r6409, %r6403, %r6045; + add.s32 %r6410, %r6409, %r6408; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 24; + add.s32 %r6413, %r6412, %r6406; + xor.b32 %r6414, %r6413, %r6408; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 25; + add.s32 %r6416, %r6368, %r6073; + add.s32 %r6417, %r6416, %r6387; + xor.b32 %r6418, %r6417, %r6356; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 16; + add.s32 %r6420, %r6419, %r6399; + xor.b32 %r6421, %r6420, %r6387; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 20; + add.s32 %r6423, %r6417, %r6010; + add.s32 %r6424, %r6423, %r6422; + xor.b32 %r6425, %r6424, %r6419; + shf.l.wrap.b32 %r6426, %r6425, %r6425, 24; + add.s32 %r6427, %r6426, %r6420; + xor.b32 %r6428, %r6427, %r6422; + shf.l.wrap.b32 %r6429, %r6428, %r6428, 25; + add.s32 %r6430, %r6382, %r6087; + add.s32 %r6431, %r6430, %r6401; + xor.b32 %r6432, %r6431, %r6370; + shf.l.wrap.b32 
%r6433, %r6432, %r6432, 16; + add.s32 %r6434, %r6433, %r6357; + xor.b32 %r6435, %r6434, %r6401; + shf.l.wrap.b32 %r6436, %r6435, %r6435, 20; + add.s32 %r6437, %r6431, %r6115; + add.s32 %r6438, %r6437, %r6436; + xor.b32 %r6439, %r6438, %r6433; + shf.l.wrap.b32 %r6440, %r6439, %r6439, 24; + add.s32 %r6441, %r6440, %r6434; + xor.b32 %r6442, %r6441, %r6436; + shf.l.wrap.b32 %r6443, %r6442, %r6442, 25; + add.s32 %r6444, %r6396, %r6066; + add.s32 %r6445, %r6444, %r6359; + xor.b32 %r6446, %r6445, %r6384; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 16; + add.s32 %r6448, %r6447, %r6371; + xor.b32 %r6449, %r6448, %r6359; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 20; + add.s32 %r6451, %r6445, %r6017; + add.s32 %r6452, %r6451, %r6450; + xor.b32 %r6453, %r6452, %r6447; + shf.l.wrap.b32 %r6454, %r6453, %r6453, 24; + add.s32 %r6455, %r6454, %r6448; + xor.b32 %r6456, %r6455, %r6450; + shf.l.wrap.b32 %r6457, %r6456, %r6456, 25; + add.s32 %r6458, %r6410, %r6080; + add.s32 %r6459, %r6458, %r6457; + xor.b32 %r6460, %r6459, %r6426; + shf.l.wrap.b32 %r6461, %r6460, %r6460, 16; + add.s32 %r6462, %r6461, %r6441; + xor.b32 %r6463, %r6462, %r6457; + shf.l.wrap.b32 %r6464, %r6463, %r6463, 20; + add.s32 %r6465, %r6459, %r6059; + add.s32 %r6466, %r6465, %r6464; + xor.b32 %r6467, %r6466, %r6461; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6462; + xor.b32 %r6470, %r6469, %r6464; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6424, %r6094; + add.s32 %r6473, %r6472, %r6415; + xor.b32 %r6474, %r6473, %r6440; + shf.l.wrap.b32 %r6475, %r6474, %r6474, 16; + add.s32 %r6476, %r6475, %r6455; + xor.b32 %r6477, %r6476, %r6415; + shf.l.wrap.b32 %r6478, %r6477, %r6477, 20; + add.s32 %r6479, %r6473, %r6073; + add.s32 %r6480, %r6479, %r6478; + xor.b32 %r6481, %r6480, %r6475; + shf.l.wrap.b32 %r6482, %r6481, %r6481, 24; + add.s32 %r6483, %r6482, %r6476; + xor.b32 %r6484, %r6483, %r6478; + shf.l.wrap.b32 %r6485, %r6484, %r6484, 25; + add.s32 %r6486, %r6438, %r6108; + add.s32 %r6487, %r6486, %r6429; + xor.b32 %r6488, %r6487, %r6454; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6413; + xor.b32 %r6491, %r6490, %r6429; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6031; + add.s32 %r6494, %r6493, %r6492; + xor.b32 %r6495, %r6494, %r6489; + shf.l.wrap.b32 %r6496, %r6495, %r6495, 24; + add.s32 %r6497, %r6496, %r6490; + xor.b32 %r6498, %r6497, %r6492; + shf.l.wrap.b32 %r6499, %r6498, %r6498, 25; + add.s32 %r6500, %r6452, %r6101; + add.s32 %r6501, %r6500, %r6443; + xor.b32 %r6502, %r6501, %r6412; + shf.l.wrap.b32 %r6503, %r6502, %r6502, 16; + add.s32 %r6504, %r6503, %r6427; + xor.b32 %r6505, %r6504, %r6443; + shf.l.wrap.b32 %r6506, %r6505, %r6505, 20; + add.s32 %r6507, %r6501, %r6115; + add.s32 %r6508, %r6507, %r6506; + xor.b32 %r6509, %r6508, %r6503; + shf.l.wrap.b32 %r6510, %r6509, %r6509, 24; + add.s32 %r6511, %r6510, %r6504; + xor.b32 %r6512, %r6511, %r6506; + shf.l.wrap.b32 %r6513, %r6512, %r6512, 25; + add.s32 %r6514, %r6466, %r6038; + add.s32 %r6515, %r6514, %r6485; + xor.b32 %r6516, %r6515, %r6510; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 16; + add.s32 %r6518, %r6517, %r6497; + xor.b32 %r6519, %r6518, %r6485; + shf.l.wrap.b32 %r6520, %r6519, %r6519, 20; + add.s32 %r6521, %r6515, %r6010; + add.s32 %r6522, %r6521, %r6520; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 24; + add.s32 %r6525, %r6524, %r6518; + xor.b32 %r6526, %r6525, %r6520; + shf.l.wrap.b32 %r6527, %r6526, %r6526, 25; + add.s32 %r6528, %r6480, %r6087; 
+ add.s32 %r6529, %r6528, %r6499; + xor.b32 %r6530, %r6529, %r6468; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 16; + add.s32 %r6532, %r6531, %r6511; + xor.b32 %r6533, %r6532, %r6499; + shf.l.wrap.b32 %r6534, %r6533, %r6533, 20; + add.s32 %r6535, %r6529, %r6024; + add.s32 %r6536, %r6535, %r6534; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 24; + add.s32 %r6539, %r6538, %r6532; + xor.b32 %r6540, %r6539, %r6534; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 25; + add.s32 %r6542, %r6494, %r6045; + add.s32 %r6543, %r6542, %r6513; + xor.b32 %r6544, %r6543, %r6482; + shf.l.wrap.b32 %r6545, %r6544, %r6544, 16; + add.s32 %r6546, %r6545, %r6469; + xor.b32 %r6547, %r6546, %r6513; + shf.l.wrap.b32 %r6548, %r6547, %r6547, 20; + add.s32 %r6549, %r6543, %r6066; + add.s32 %r6550, %r6549, %r6548; + xor.b32 %r6551, %r6550, %r6545; + shf.l.wrap.b32 %r6552, %r6551, %r6551, 24; + add.s32 %r6553, %r6552, %r6546; + xor.b32 %r6554, %r6553, %r6548; + shf.l.wrap.b32 %r6555, %r6554, %r6554, 25; + add.s32 %r6556, %r6508, %r6017; + add.s32 %r6557, %r6556, %r6471; + xor.b32 %r6558, %r6557, %r6496; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 16; + add.s32 %r6560, %r6559, %r6483; + xor.b32 %r6561, %r6560, %r6471; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 20; + add.s32 %r6563, %r6557, %r6052; + add.s32 %r6564, %r6563, %r6562; + xor.b32 %r6565, %r6564, %r6559; + shf.l.wrap.b32 %r6566, %r6565, %r6565, 24; + add.s32 %r6567, %r6566, %r6560; + xor.b32 %r6568, %r6567, %r6562; + shf.l.wrap.b32 %r6569, %r6568, %r6568, 25; + add.s32 %r6570, %r6522, %r6094; + add.s32 %r6571, %r6570, %r6569; + xor.b32 %r6572, %r6571, %r6538; + shf.l.wrap.b32 %r6573, %r6572, %r6572, 16; + add.s32 %r6574, %r6573, %r6553; + xor.b32 %r6575, %r6574, %r6569; + shf.l.wrap.b32 %r6576, %r6575, %r6575, 20; + add.s32 %r6577, %r6571, %r6101; + add.s32 %r6578, %r6577, %r6576; + xor.b32 %r6579, %r6578, %r6573; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6574; + xor.b32 %r6582, %r6581, %r6576; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6536, %r6073; + add.s32 %r6585, %r6584, %r6527; + xor.b32 %r6586, %r6585, %r6552; + shf.l.wrap.b32 %r6587, %r6586, %r6586, 16; + add.s32 %r6588, %r6587, %r6567; + xor.b32 %r6589, %r6588, %r6527; + shf.l.wrap.b32 %r6590, %r6589, %r6589, 20; + add.s32 %r6591, %r6585, %r6087; + add.s32 %r6592, %r6591, %r6590; + xor.b32 %r6593, %r6592, %r6587; + shf.l.wrap.b32 %r6594, %r6593, %r6593, 24; + add.s32 %r6595, %r6594, %r6588; + xor.b32 %r6596, %r6595, %r6590; + shf.l.wrap.b32 %r6597, %r6596, %r6596, 25; + add.s32 %r6598, %r6550, %r6115; + add.s32 %r6599, %r6598, %r6541; + xor.b32 %r6600, %r6599, %r6566; + shf.l.wrap.b32 %r6601, %r6600, %r6600, 16; + add.s32 %r6602, %r6601, %r6525; + xor.b32 %r6603, %r6602, %r6541; + shf.l.wrap.b32 %r6604, %r6603, %r6603, 20; + add.s32 %r6605, %r6599, %r6080; + add.s32 %r6606, %r6605, %r6604; + xor.b32 %r6607, %r6606, %r6601; + shf.l.wrap.b32 %r6608, %r6607, %r6607, 24; + add.s32 %r6609, %r6608, %r6602; + xor.b32 %r6610, %r6609, %r6604; + shf.l.wrap.b32 %r6611, %r6610, %r6610, 25; + add.s32 %r6612, %r6564, %r6108; + add.s32 %r6613, %r6612, %r6555; + xor.b32 %r6614, %r6613, %r6524; + shf.l.wrap.b32 %r6615, %r6614, %r6614, 16; + add.s32 %r6616, %r6615, %r6539; + xor.b32 %r6617, %r6616, %r6555; + shf.l.wrap.b32 %r6618, %r6617, %r6617, 20; + add.s32 %r6619, %r6613, %r6066; + add.s32 %r6620, %r6619, %r6618; + xor.b32 %r6621, %r6620, %r6615; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6616; + xor.b32 %r6624, %r6623, 
%r6618; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6578, %r6059; + add.s32 %r6627, %r6626, %r6597; + xor.b32 %r6628, %r6627, %r6622; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6609; + xor.b32 %r6631, %r6630, %r6597; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6024; + add.s32 %r6634, %r6633, %r6632; + xor.b32 %r6635, %r6634, %r6629; + shf.l.wrap.b32 %r6636, %r6635, %r6635, 24; + add.s32 %r6637, %r6636, %r6630; + xor.b32 %r6638, %r6637, %r6632; + shf.l.wrap.b32 %r6639, %r6638, %r6638, 25; + add.s32 %r6640, %r6592, %r6045; + add.s32 %r6641, %r6640, %r6611; + xor.b32 %r6642, %r6641, %r6580; + shf.l.wrap.b32 %r6643, %r6642, %r6642, 16; + add.s32 %r6644, %r6643, %r6623; + xor.b32 %r6645, %r6644, %r6611; + shf.l.wrap.b32 %r6646, %r6645, %r6645, 20; + add.s32 %r6647, %r6641, %r6031; + add.s32 %r6648, %r6647, %r6646; + xor.b32 %r6649, %r6648, %r6643; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 24; + add.s32 %r6651, %r6650, %r6644; + xor.b32 %r6652, %r6651, %r6646; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 25; + add.s32 %r6654, %r6606, %r6010; + add.s32 %r6655, %r6654, %r6625; + xor.b32 %r6656, %r6655, %r6594; + shf.l.wrap.b32 %r6657, %r6656, %r6656, 16; + add.s32 %r6658, %r6657, %r6581; + xor.b32 %r6659, %r6658, %r6625; + shf.l.wrap.b32 %r6660, %r6659, %r6659, 20; + add.s32 %r6661, %r6655, %r6017; + add.s32 %r6662, %r6661, %r6660; + xor.b32 %r6663, %r6662, %r6657; + shf.l.wrap.b32 %r6664, %r6663, %r6663, 24; + add.s32 %r6665, %r6664, %r6658; + xor.b32 %r6666, %r6665, %r6660; + shf.l.wrap.b32 %r6667, %r6666, %r6666, 25; + add.s32 %r6668, %r6620, %r6052; + add.s32 %r6669, %r6668, %r6583; + xor.b32 %r6670, %r6669, %r6608; + shf.l.wrap.b32 %r6671, %r6670, %r6670, 16; + add.s32 %r6672, %r6671, %r6595; + xor.b32 %r6673, %r6672, %r6583; + shf.l.wrap.b32 %r6674, %r6673, %r6673, 20; + add.s32 %r6675, %r6669, %r6038; + add.s32 %r6676, %r6675, %r6674; + xor.b32 %r6677, %r6676, %r6671; + shf.l.wrap.b32 %r6678, %r6677, %r6677, 24; + add.s32 %r6679, %r6678, %r6672; + xor.b32 %r6680, %r6679, %r6674; + shf.l.wrap.b32 %r6681, %r6680, %r6680, 25; + add.s32 %r6682, %r6634, %r6073; + add.s32 %r6683, %r6682, %r6681; + xor.b32 %r6684, %r6683, %r6650; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 16; + add.s32 %r6686, %r6685, %r6665; + xor.b32 %r6687, %r6686, %r6681; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 20; + add.s32 %r6689, %r6683, %r6108; + add.s32 %r6690, %r6689, %r6688; + xor.b32 %r6691, %r6690, %r6685; + shf.l.wrap.b32 %r6692, %r6691, %r6691, 24; + add.s32 %r6693, %r6692, %r6686; + xor.b32 %r6694, %r6693, %r6688; + shf.l.wrap.b32 %r6695, %r6694, %r6694, 25; + add.s32 %r6696, %r6648, %r6087; + add.s32 %r6697, %r6696, %r6639; + xor.b32 %r6698, %r6697, %r6664; + shf.l.wrap.b32 %r6699, %r6698, %r6698, 16; + add.s32 %r6700, %r6699, %r6679; + xor.b32 %r6701, %r6700, %r6639; + shf.l.wrap.b32 %r6702, %r6701, %r6701, 20; + add.s32 %r6703, %r6697, %r6045; + add.s32 %r6704, %r6703, %r6702; + xor.b32 %r6705, %r6704, %r6699; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6700; + xor.b32 %r6708, %r6707, %r6702; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6662, %r6066; + add.s32 %r6711, %r6710, %r6653; + xor.b32 %r6712, %r6711, %r6678; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6637; + xor.b32 %r6715, %r6714, %r6653; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6094; + add.s32 %r6718, %r6717, %r6716; + xor.b32 %r6719, %r6718, %r6713; + shf.l.wrap.b32 
[compiler-generated PTX elided for readability: a few thousand added lines continue here, unchanged in substance. The stretch consists of straight-line BLAKE3-style compression rounds — add.s32 / xor.b32 pairs followed by shf.l.wrap rotations of 16, 20, 24 and 25 (i.e. right-rotations of 16, 12, 8 and 7) over registers %r6720 through %r8628 — the 64-byte block loop closed by "@%p33 bra $L__BB1_38", the tail path ($L__BB1_39–$L__BB1_43) that copies the final partial block into a local buffer and sets the block length plus the CHUNK_START/CHUNK_END flag bits, prmt.b32 byte repacking of the message words into 32-bit lanes, and the parent-node loop ($L__BB1_45) that reloads chaining values and ORs in the PARENT flag, all mixing against the IV constants 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A.]
%r8627, %r8622; + shf.l.wrap.b32 %r8629, %r8628, %r8628, 24; + add.s32 %r8630, %r8629, %r8623; + xor.b32 %r8631, %r8630, %r8625; + shf.l.wrap.b32 %r8632, %r8631, %r8631, 25; + add.s32 %r8633, %r8585, %r7838; + add.s32 %r8634, %r8633, %r8604; + xor.b32 %r8635, %r8573, %r8634; + shf.l.wrap.b32 %r8636, %r8635, %r8635, 16; + add.s32 %r8637, %r8636, %r8616; + xor.b32 %r8638, %r8637, %r8604; + shf.l.wrap.b32 %r8639, %r8638, %r8638, 20; + add.s32 %r8640, %r8634, %r7908; + add.s32 %r8641, %r8640, %r8639; + xor.b32 %r8642, %r8641, %r8636; + shf.l.wrap.b32 %r8643, %r8642, %r8642, 24; + add.s32 %r8644, %r8643, %r8637; + xor.b32 %r8645, %r8644, %r8639; + shf.l.wrap.b32 %r8646, %r8645, %r8645, 25; + add.s32 %r8647, %r8599, %r7852; + add.s32 %r8648, %r8647, %r8618; + xor.b32 %r8649, %r8648, %r8587; + shf.l.wrap.b32 %r8650, %r8649, %r8649, 16; + add.s32 %r8651, %r8650, %r8574; + xor.b32 %r8652, %r8651, %r8618; + shf.l.wrap.b32 %r8653, %r8652, %r8652, 20; + add.s32 %r8654, %r8648, %r7880; + add.s32 %r8655, %r8654, %r8653; + xor.b32 %r8656, %r8655, %r8650; + shf.l.wrap.b32 %r8657, %r8656, %r8656, 24; + add.s32 %r8658, %r8657, %r8651; + xor.b32 %r8659, %r8658, %r8653; + shf.l.wrap.b32 %r8660, %r8659, %r8659, 25; + add.s32 %r8661, %r8613, %r7866; + add.s32 %r8662, %r8661, %r8576; + xor.b32 %r8663, %r8662, %r8601; + shf.l.wrap.b32 %r8664, %r8663, %r8663, 16; + add.s32 %r8665, %r8664, %r8588; + xor.b32 %r8666, %r8665, %r8576; + shf.l.wrap.b32 %r8667, %r8666, %r8666, 20; + add.s32 %r8668, %r8662, %r7887; + add.s32 %r8669, %r8668, %r8667; + xor.b32 %r8670, %r8669, %r8664; + shf.l.wrap.b32 %r8671, %r8670, %r8670, 24; + add.s32 %r8672, %r8671, %r8665; + xor.b32 %r8673, %r8672, %r8667; + shf.l.wrap.b32 %r8674, %r8673, %r8673, 25; + add.s32 %r8675, %r8627, %r7915; + add.s32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r8676, %r8643; + shf.l.wrap.b32 %r8678, %r8677, %r8677, 16; + add.s32 %r8679, %r8678, %r8658; + xor.b32 %r8680, %r8679, %r8674; + shf.l.wrap.b32 %r8681, %r8680, %r8680, 20; + add.s32 %r8682, %r8676, %r7943; + add.s32 %r8683, %r8682, %r8681; + xor.b32 %r8684, %r8683, %r8678; + shf.l.wrap.b32 %r8685, %r8684, %r8684, 24; + add.s32 %r8686, %r8685, %r8679; + xor.b32 %r8687, %r8686, %r8681; + shf.l.wrap.b32 %r8688, %r8687, %r8687, 25; + add.s32 %r8689, %r8641, %r7873; + add.s32 %r8690, %r8689, %r8632; + xor.b32 %r8691, %r8690, %r8657; + shf.l.wrap.b32 %r8692, %r8691, %r8691, 16; + add.s32 %r8693, %r8692, %r8672; + xor.b32 %r8694, %r8693, %r8632; + shf.l.wrap.b32 %r8695, %r8694, %r8694, 20; + add.s32 %r8696, %r8690, %r7838; + add.s32 %r8697, %r8696, %r8695; + xor.b32 %r8698, %r8697, %r8692; + shf.l.wrap.b32 %r8699, %r8698, %r8698, 24; + add.s32 %r8700, %r8699, %r8693; + xor.b32 %r8701, %r8700, %r8695; + shf.l.wrap.b32 %r8702, %r8701, %r8701, 25; + add.s32 %r8703, %r8655, %r7845; + add.s32 %r8704, %r8703, %r8646; + xor.b32 %r8705, %r8671, %r8704; + shf.l.wrap.b32 %r8706, %r8705, %r8705, 16; + add.s32 %r8707, %r8706, %r8630; + xor.b32 %r8708, %r8707, %r8646; + shf.l.wrap.b32 %r8709, %r8708, %r8708, 20; + add.s32 %r8710, %r8704, %r7901; + add.s32 %r8711, %r8710, %r8709; + xor.b32 %r8712, %r8711, %r8706; + shf.l.wrap.b32 %r8713, %r8712, %r8712, 24; + add.s32 %r8714, %r8713, %r8707; + xor.b32 %r8715, %r8714, %r8709; + shf.l.wrap.b32 %r8716, %r8715, %r8715, 25; + add.s32 %r8717, %r8669, %r7894; + add.s32 %r8718, %r8717, %r8660; + xor.b32 %r8719, %r8718, %r8629; + shf.l.wrap.b32 %r8720, %r8719, %r8719, 16; + add.s32 %r8721, %r8720, %r8644; + xor.b32 %r8722, %r8721, %r8660; + shf.l.wrap.b32 %r8723, %r8722, %r8722, 20; 
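// ---- editorial annotation; not part of the machine-generated PTX ----
// The quarter-rounds run in groups of eight (four column steps, then four
// diagonal steps) for each of the seven rounds of one BLAKE3 compression;
// rather than permuting the message block in place, the unrolled code simply
// reads the message registers (%r78xx here) in a different order each round.
// After round seven, the xor.b32 fold below computes h[i] = v[i] ^ v[i + 8],
// yielding the eight 32-bit chaining-value words, which st.local.u8 then
// spills byte by byte into the chaining-value stack that the surrounding
// merge loop (branching back to $L__BB1_45) uses to combine subtree hashes.
// The 32-bit constants seen in the first round of each fresh compression
// further down, 1779033703 (0x6A09E667), -1150833019 (0xBB67AE85),
// 1013904242 (0x3C6EF372) and -1521486534 (0xA54FF53A), are the first four
// BLAKE3 IV words (shared with SHA-256).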
+ add.s32 %r8724, %r8718, %r7880; + add.s32 %r8725, %r8724, %r8723; + xor.b32 %r8726, %r8725, %r8720; + shf.l.wrap.b32 %r8727, %r8726, %r8726, 24; + add.s32 %r8728, %r8727, %r8721; + xor.b32 %r8729, %r8728, %r8723; + shf.l.wrap.b32 %r8730, %r8729, %r8729, 25; + add.s32 %r8731, %r8683, %r7936; + add.s32 %r8732, %r8731, %r8702; + xor.b32 %r8733, %r8732, %r8727; + shf.l.wrap.b32 %r8734, %r8733, %r8733, 16; + add.s32 %r8735, %r8734, %r8714; + xor.b32 %r8736, %r8735, %r8702; + shf.l.wrap.b32 %r8737, %r8736, %r8736, 20; + add.s32 %r8738, %r8732, %r7908; + add.s32 %r8739, %r8738, %r8737; + xor.b32 %r8740, %r8739, %r8734; + shf.l.wrap.b32 %r8741, %r8740, %r8740, 24; + add.s32 %r8742, %r8741, %r8735; + xor.b32 %r8743, %r8742, %r8737; + shf.l.wrap.b32 %r8744, %r8743, %r8743, 25; + add.s32 %r8745, %r8697, %r7852; + add.s32 %r8746, %r8745, %r8716; + xor.b32 %r8747, %r8685, %r8746; + shf.l.wrap.b32 %r8748, %r8747, %r8747, 16; + add.s32 %r8749, %r8748, %r8728; + xor.b32 %r8750, %r8749, %r8716; + shf.l.wrap.b32 %r8751, %r8750, %r8750, 20; + add.s32 %r8752, %r8746, %r7922; + add.s32 %r8753, %r8752, %r8751; + xor.b32 %r8754, %r8753, %r8748; + shf.l.wrap.b32 %r8755, %r8754, %r8754, 24; + add.s32 %r8756, %r8755, %r8749; + xor.b32 %r8757, %r8756, %r8751; + shf.l.wrap.b32 %r8758, %r8757, %r8757, 25; + add.s32 %r8759, %r8711, %r7859; + add.s32 %r8760, %r8759, %r8730; + xor.b32 %r8761, %r8760, %r8699; + shf.l.wrap.b32 %r8762, %r8761, %r8761, 16; + add.s32 %r8763, %r8762, %r8686; + xor.b32 %r8764, %r8763, %r8730; + shf.l.wrap.b32 %r8765, %r8764, %r8764, 20; + add.s32 %r8766, %r8760, %r7866; + add.s32 %r8767, %r8766, %r8765; + xor.b32 %r8768, %r8767, %r8762; + shf.l.wrap.b32 %r8769, %r8768, %r8768, 24; + add.s32 %r8770, %r8769, %r8763; + xor.b32 %r8771, %r8770, %r8765; + shf.l.wrap.b32 %r8772, %r8771, %r8771, 25; + add.s32 %r8773, %r8725, %r7887; + add.s32 %r8774, %r8773, %r8688; + xor.b32 %r8775, %r8774, %r8713; + shf.l.wrap.b32 %r8776, %r8775, %r8775, 16; + add.s32 %r8777, %r8776, %r8700; + xor.b32 %r8778, %r8777, %r8688; + shf.l.wrap.b32 %r8779, %r8778, %r8778, 20; + add.s32 %r8780, %r8774, %r7929; + add.s32 %r8781, %r8780, %r8779; + xor.b32 %r8782, %r8781, %r8776; + shf.l.wrap.b32 %r8783, %r8782, %r8782, 24; + add.s32 %r8784, %r8783, %r8777; + xor.b32 %r8785, %r8784, %r8779; + shf.l.wrap.b32 %r8786, %r8785, %r8785, 25; + xor.b32 %r8787, %r8770, %r8739; + xor.b32 %r8788, %r8784, %r8753; + xor.b32 %r8789, %r8742, %r8767; + xor.b32 %r8790, %r8781, %r8756; + xor.b32 %r8791, %r8786, %r8755; + xor.b32 %r8792, %r8744, %r8769; + xor.b32 %r8793, %r8783, %r8758; + xor.b32 %r8794, %r8772, %r8741; + st.local.u8 [%rd190+145], %r8787; + shr.u32 %r8795, %r8787, 8; + st.local.u8 [%rd190+146], %r8795; + shr.u32 %r8796, %r8787, 16; + st.local.u8 [%rd190+147], %r8796; + shr.u32 %r8797, %r8787, 24; + st.local.u8 [%rd190+148], %r8797; + st.local.u8 [%rd190+149], %r8788; + shr.u32 %r8798, %r8788, 8; + st.local.u8 [%rd190+150], %r8798; + shr.u32 %r8799, %r8788, 16; + st.local.u8 [%rd190+151], %r8799; + shr.u32 %r8800, %r8788, 24; + st.local.u8 [%rd190+152], %r8800; + st.local.u8 [%rd190+153], %r8789; + shr.u32 %r8801, %r8789, 8; + st.local.u8 [%rd190+154], %r8801; + shr.u32 %r8802, %r8789, 16; + st.local.u8 [%rd190+155], %r8802; + shr.u32 %r8803, %r8789, 24; + st.local.u8 [%rd190+156], %r8803; + st.local.u8 [%rd190+157], %r8790; + shr.u32 %r8804, %r8790, 8; + st.local.u8 [%rd190+158], %r8804; + shr.u32 %r8805, %r8790, 16; + st.local.u8 [%rd190+159], %r8805; + shr.u32 %r8806, %r8790, 24; + st.local.u8 [%rd190+160], %r8806; + 
st.local.u8 [%rd190+161], %r8791; + shr.u32 %r8807, %r8791, 8; + st.local.u8 [%rd190+162], %r8807; + shr.u32 %r8808, %r8791, 16; + st.local.u8 [%rd190+163], %r8808; + shr.u32 %r8809, %r8791, 24; + st.local.u8 [%rd190+164], %r8809; + st.local.u8 [%rd190+165], %r8792; + shr.u32 %r8810, %r8792, 8; + st.local.u8 [%rd190+166], %r8810; + shr.u32 %r8811, %r8792, 16; + st.local.u8 [%rd190+167], %r8811; + shr.u32 %r8812, %r8792, 24; + st.local.u8 [%rd190+168], %r8812; + st.local.u8 [%rd190+169], %r8793; + shr.u32 %r8813, %r8793, 8; + st.local.u8 [%rd190+170], %r8813; + shr.u32 %r8814, %r8793, 16; + st.local.u8 [%rd190+171], %r8814; + shr.u32 %r8815, %r8793, 24; + st.local.u8 [%rd190+172], %r8815; + st.local.u8 [%rd190+173], %r8794; + shr.u32 %r8816, %r8794, 8; + st.local.u8 [%rd190+174], %r8816; + shr.u32 %r8817, %r8794, 16; + st.local.u8 [%rd190+175], %r8817; + shr.u32 %r8818, %r8794, 24; + st.local.u8 [%rd190+176], %r8818; + ld.local.u8 %rs328, [%rd3+8]; + add.s16 %rs329, %rs328, -1; + st.local.u8 [%rd3+8], %rs329; + cvt.u64.u16 %rd191, %rs329; + and.b64 %rd192, %rd191, 255; + setp.lt.u64 %p38, %rd229, %rd192; + and.b16 %rs330, %rs329, 255; + mul.wide.u16 %r11681, %rs330, 32; + @%p38 bra $L__BB1_45; + +$L__BB1_46: + cvt.s64.s32 %rd193, %r11681; + add.s64 %rd194, %rd2, %rd193; + st.local.u8 [%rd194+145], %r97; + shr.u32 %r8819, %r97, 8; + st.local.u8 [%rd194+146], %r8819; + shr.u32 %r8820, %r97, 16; + st.local.u8 [%rd194+147], %r8820; + shr.u32 %r8821, %r97, 24; + st.local.u8 [%rd194+148], %r8821; + st.local.u8 [%rd194+149], %r98; + shr.u32 %r8822, %r98, 8; + st.local.u8 [%rd194+150], %r8822; + shr.u32 %r8823, %r98, 16; + st.local.u8 [%rd194+151], %r8823; + shr.u32 %r8824, %r98, 24; + st.local.u8 [%rd194+152], %r8824; + st.local.u8 [%rd194+153], %r99; + shr.u32 %r8825, %r99, 8; + st.local.u8 [%rd194+154], %r8825; + shr.u32 %r8826, %r99, 16; + st.local.u8 [%rd194+155], %r8826; + shr.u32 %r8827, %r99, 24; + st.local.u8 [%rd194+156], %r8827; + st.local.u8 [%rd194+157], %r100; + shr.u32 %r8828, %r100, 8; + st.local.u8 [%rd194+158], %r8828; + shr.u32 %r8829, %r100, 16; + st.local.u8 [%rd194+159], %r8829; + shr.u32 %r8830, %r100, 24; + st.local.u8 [%rd194+160], %r8830; + st.local.u8 [%rd194+161], %r101; + shr.u32 %r8831, %r101, 8; + st.local.u8 [%rd194+162], %r8831; + shr.u32 %r8832, %r101, 16; + st.local.u8 [%rd194+163], %r8832; + shr.u32 %r8833, %r101, 24; + st.local.u8 [%rd194+164], %r8833; + st.local.u8 [%rd194+165], %r102; + shr.u32 %r8834, %r102, 8; + st.local.u8 [%rd194+166], %r8834; + shr.u32 %r8835, %r102, 16; + st.local.u8 [%rd194+167], %r8835; + shr.u32 %r8836, %r102, 24; + st.local.u8 [%rd194+168], %r8836; + st.local.u8 [%rd194+169], %r103; + shr.u32 %r8837, %r103, 8; + st.local.u8 [%rd194+170], %r8837; + shr.u32 %r8838, %r103, 16; + st.local.u8 [%rd194+171], %r8838; + shr.u32 %r8839, %r103, 24; + st.local.u8 [%rd194+172], %r8839; + st.local.u8 [%rd194+173], %r104; + shr.u32 %r8840, %r104, 8; + st.local.u8 [%rd194+174], %r8840; + shr.u32 %r8841, %r104, 16; + st.local.u8 [%rd194+175], %r8841; + shr.u32 %r8842, %r104, 24; + st.local.u8 [%rd194+176], %r8842; + ld.local.u8 %rs388, [%rd3+8]; + bra.uni $L__BB1_47; + +$L__BB1_29: + cvt.u32.u16 %r3957, %rs14; + and.b32 %r3958, %r3957, 255; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd253; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b64 param2; + st.param.b64 [param2+0], %rd98; + .param .b64 param3; + st.param.b64 [param3+0], %rd250; + .param .b32 param4; + st.param.b32 
[param4+0], %r3958; + .param .b64 param5; + st.param.b64 [param5+0], %rd142; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd163, [retval0+0]; + } // callseq 2 + ld.local.v4.u32 {%r3959, %r3960, %r3961, %r3962}, [%rd42]; + ld.local.v4.u32 {%r3963, %r3964, %r3965, %r3966}, [%rd42+16]; + ld.local.v4.u32 {%r3967, %r3968, %r3969, %r3970}, [%rd42+32]; + ld.local.v4.u32 {%r3971, %r3972, %r3973, %r3974}, [%rd42+48]; + ld.local.u64 %rd164, [%rd3+-72]; + popc.b64 %r3975, %rd164; + cvt.u64.u32 %rd51, %r3975; + ld.local.u8 %rs137, [%rd3+8]; + cvt.u64.u16 %rd165, %rs137; + setp.ge.u64 %p27, %rd51, %rd165; + mul.wide.u16 %r11661, %rs137, 32; + @%p27 bra $L__BB1_32; + +$L__BB1_31: + popc.b64 %r11647, %rd164; + cvt.u64.u32 %rd225, %r11647; + add.s32 %r3976, %r11661, -64; + cvt.s64.s32 %rd166, %r3976; + add.s64 %rd167, %rd2, %rd166; + ld.local.u8 %r3977, [%rd3+2]; + ld.local.u8 %r3978, [%rd167+145]; + ld.local.u8 %r3979, [%rd167+146]; + prmt.b32 %r3980, %r3979, %r3978, 30212; + ld.local.u8 %r3981, [%rd167+147]; + prmt.b32 %r3982, %r3981, %r3980, 28756; + ld.local.u8 %r3983, [%rd167+148]; + prmt.b32 %r3984, %r3983, %r3982, 1620; + ld.local.u8 %r3985, [%rd167+149]; + ld.local.u8 %r3986, [%rd167+150]; + prmt.b32 %r3987, %r3986, %r3985, 30212; + ld.local.u8 %r3988, [%rd167+151]; + prmt.b32 %r3989, %r3988, %r3987, 28756; + ld.local.u8 %r3990, [%rd167+152]; + prmt.b32 %r3991, %r3990, %r3989, 1620; + ld.local.u8 %r3992, [%rd167+153]; + ld.local.u8 %r3993, [%rd167+154]; + prmt.b32 %r3994, %r3993, %r3992, 30212; + ld.local.u8 %r3995, [%rd167+155]; + prmt.b32 %r3996, %r3995, %r3994, 28756; + ld.local.u8 %r3997, [%rd167+156]; + prmt.b32 %r3998, %r3997, %r3996, 1620; + ld.local.u8 %r3999, [%rd167+157]; + ld.local.u8 %r4000, [%rd167+158]; + prmt.b32 %r4001, %r4000, %r3999, 30212; + ld.local.u8 %r4002, [%rd167+159]; + prmt.b32 %r4003, %r4002, %r4001, 28756; + ld.local.u8 %r4004, [%rd167+160]; + prmt.b32 %r4005, %r4004, %r4003, 1620; + ld.local.u8 %r4006, [%rd167+161]; + ld.local.u8 %r4007, [%rd167+162]; + prmt.b32 %r4008, %r4007, %r4006, 30212; + ld.local.u8 %r4009, [%rd167+163]; + prmt.b32 %r4010, %r4009, %r4008, 28756; + ld.local.u8 %r4011, [%rd167+164]; + prmt.b32 %r4012, %r4011, %r4010, 1620; + ld.local.u8 %r4013, [%rd167+165]; + ld.local.u8 %r4014, [%rd167+166]; + prmt.b32 %r4015, %r4014, %r4013, 30212; + ld.local.u8 %r4016, [%rd167+167]; + prmt.b32 %r4017, %r4016, %r4015, 28756; + ld.local.u8 %r4018, [%rd167+168]; + prmt.b32 %r4019, %r4018, %r4017, 1620; + ld.local.u8 %r4020, [%rd167+169]; + ld.local.u8 %r4021, [%rd167+170]; + prmt.b32 %r4022, %r4021, %r4020, 30212; + ld.local.u8 %r4023, [%rd167+171]; + prmt.b32 %r4024, %r4023, %r4022, 28756; + ld.local.u8 %r4025, [%rd167+172]; + prmt.b32 %r4026, %r4025, %r4024, 1620; + ld.local.u8 %r4027, [%rd167+173]; + ld.local.u8 %r4028, [%rd167+174]; + prmt.b32 %r4029, %r4028, %r4027, 30212; + ld.local.u8 %r4030, [%rd167+175]; + prmt.b32 %r4031, %r4030, %r4029, 28756; + ld.local.u8 %r4032, [%rd167+176]; + prmt.b32 %r4033, %r4032, %r4031, 1620; + ld.local.u8 %r4034, [%rd167+177]; + ld.local.u8 %r4035, [%rd167+178]; + prmt.b32 %r4036, %r4035, %r4034, 30212; + ld.local.u8 %r4037, [%rd167+179]; + prmt.b32 %r4038, %r4037, %r4036, 28756; + ld.local.u8 %r4039, [%rd167+180]; + prmt.b32 %r4040, %r4039, %r4038, 1620; + ld.local.u8 %r4041, [%rd167+181]; + ld.local.u8 %r4042, [%rd167+182]; + prmt.b32 %r4043, 
%r4042, %r4041, 30212; + ld.local.u8 %r4044, [%rd167+183]; + prmt.b32 %r4045, %r4044, %r4043, 28756; + ld.local.u8 %r4046, [%rd167+184]; + prmt.b32 %r4047, %r4046, %r4045, 1620; + ld.local.u8 %r4048, [%rd167+185]; + ld.local.u8 %r4049, [%rd167+186]; + prmt.b32 %r4050, %r4049, %r4048, 30212; + ld.local.u8 %r4051, [%rd167+187]; + prmt.b32 %r4052, %r4051, %r4050, 28756; + ld.local.u8 %r4053, [%rd167+188]; + prmt.b32 %r4054, %r4053, %r4052, 1620; + ld.local.u8 %r4055, [%rd167+189]; + ld.local.u8 %r4056, [%rd167+190]; + prmt.b32 %r4057, %r4056, %r4055, 30212; + ld.local.u8 %r4058, [%rd167+191]; + prmt.b32 %r4059, %r4058, %r4057, 28756; + ld.local.u8 %r4060, [%rd167+192]; + prmt.b32 %r4061, %r4060, %r4059, 1620; + ld.local.u8 %r4062, [%rd167+193]; + ld.local.u8 %r4063, [%rd167+194]; + prmt.b32 %r4064, %r4063, %r4062, 30212; + ld.local.u8 %r4065, [%rd167+195]; + prmt.b32 %r4066, %r4065, %r4064, 28756; + ld.local.u8 %r4067, [%rd167+196]; + prmt.b32 %r4068, %r4067, %r4066, 1620; + ld.local.u8 %r4069, [%rd167+197]; + ld.local.u8 %r4070, [%rd167+198]; + prmt.b32 %r4071, %r4070, %r4069, 30212; + ld.local.u8 %r4072, [%rd167+199]; + prmt.b32 %r4073, %r4072, %r4071, 28756; + ld.local.u8 %r4074, [%rd167+200]; + prmt.b32 %r4075, %r4074, %r4073, 1620; + ld.local.u8 %r4076, [%rd167+201]; + ld.local.u8 %r4077, [%rd167+202]; + prmt.b32 %r4078, %r4077, %r4076, 30212; + ld.local.u8 %r4079, [%rd167+203]; + prmt.b32 %r4080, %r4079, %r4078, 28756; + ld.local.u8 %r4081, [%rd167+204]; + prmt.b32 %r4082, %r4081, %r4080, 1620; + ld.local.u8 %r4083, [%rd167+205]; + ld.local.u8 %r4084, [%rd167+206]; + prmt.b32 %r4085, %r4084, %r4083, 30212; + ld.local.u8 %r4086, [%rd167+207]; + prmt.b32 %r4087, %r4086, %r4085, 28756; + ld.local.u8 %r4088, [%rd167+208]; + prmt.b32 %r4089, %r4088, %r4087, 1620; + or.b32 %r4090, %r3977, 4; + ld.local.u8 %r4091, [%rd3+-120]; + ld.local.u8 %r4092, [%rd3+-119]; + prmt.b32 %r4093, %r4092, %r4091, 30212; + ld.local.u8 %r4094, [%rd3+-118]; + ld.local.u8 %r4095, [%rd3+-117]; + prmt.b32 %r4096, %r4095, %r4094, 30212; + prmt.b32 %r4097, %r4096, %r4093, 4180; + ld.local.u8 %r4098, [%rd3+-136]; + ld.local.u8 %r4099, [%rd3+-135]; + prmt.b32 %r4100, %r4099, %r4098, 30212; + ld.local.u8 %r4101, [%rd3+-134]; + ld.local.u8 %r4102, [%rd3+-133]; + prmt.b32 %r4103, %r4102, %r4101, 30212; + prmt.b32 %r4104, %r4103, %r4100, 4180; + add.s32 %r4105, %r4097, %r4104; + add.s32 %r4106, %r4105, %r3984; + shf.l.wrap.b32 %r4107, %r4106, %r4106, 16; + add.s32 %r4108, %r4107, 1779033703; + xor.b32 %r4109, %r4108, %r4097; + shf.l.wrap.b32 %r4110, %r4109, %r4109, 20; + add.s32 %r4111, %r3991, %r4106; + add.s32 %r4112, %r4111, %r4110; + xor.b32 %r4113, %r4112, %r4107; + shf.l.wrap.b32 %r4114, %r4113, %r4113, 24; + add.s32 %r4115, %r4114, %r4108; + xor.b32 %r4116, %r4115, %r4110; + shf.l.wrap.b32 %r4117, %r4116, %r4116, 25; + ld.local.u8 %r4118, [%rd3+-116]; + ld.local.u8 %r4119, [%rd3+-115]; + prmt.b32 %r4120, %r4119, %r4118, 30212; + ld.local.u8 %r4121, [%rd3+-114]; + ld.local.u8 %r4122, [%rd3+-113]; + prmt.b32 %r4123, %r4122, %r4121, 30212; + prmt.b32 %r4124, %r4123, %r4120, 4180; + ld.local.u8 %r4125, [%rd3+-132]; + ld.local.u8 %r4126, [%rd3+-131]; + prmt.b32 %r4127, %r4126, %r4125, 30212; + ld.local.u8 %r4128, [%rd3+-130]; + ld.local.u8 %r4129, [%rd3+-129]; + prmt.b32 %r4130, %r4129, %r4128, 30212; + prmt.b32 %r4131, %r4130, %r4127, 4180; + add.s32 %r4132, %r4124, %r4131; + add.s32 %r4133, %r4132, %r3998; + shf.l.wrap.b32 %r4134, %r4133, %r4133, 16; + add.s32 %r4135, %r4134, -1150833019; + xor.b32 %r4136, %r4135, 
%r4124; + shf.l.wrap.b32 %r4137, %r4136, %r4136, 20; + add.s32 %r4138, %r4005, %r4133; + add.s32 %r4139, %r4138, %r4137; + xor.b32 %r4140, %r4139, %r4134; + shf.l.wrap.b32 %r4141, %r4140, %r4140, 24; + add.s32 %r4142, %r4141, %r4135; + xor.b32 %r4143, %r4142, %r4137; + shf.l.wrap.b32 %r4144, %r4143, %r4143, 25; + ld.local.u8 %r4145, [%rd3+-112]; + ld.local.u8 %r4146, [%rd3+-111]; + prmt.b32 %r4147, %r4146, %r4145, 30212; + ld.local.u8 %r4148, [%rd3+-110]; + ld.local.u8 %r4149, [%rd3+-109]; + prmt.b32 %r4150, %r4149, %r4148, 30212; + prmt.b32 %r4151, %r4150, %r4147, 4180; + ld.local.u8 %r4152, [%rd3+-128]; + ld.local.u8 %r4153, [%rd3+-127]; + prmt.b32 %r4154, %r4153, %r4152, 30212; + ld.local.u8 %r4155, [%rd3+-126]; + ld.local.u8 %r4156, [%rd3+-125]; + prmt.b32 %r4157, %r4156, %r4155, 30212; + prmt.b32 %r4158, %r4157, %r4154, 4180; + add.s32 %r4159, %r4151, %r4158; + add.s32 %r4160, %r4159, %r4012; + shr.u32 %r4161, %r4160, 16; + shl.b32 %r4162, %r4160, 16; + xor.b32 %r4163, %r4162, 4194304; + or.b32 %r4164, %r4163, %r4161; + add.s32 %r4165, %r4164, 1013904242; + xor.b32 %r4166, %r4165, %r4151; + shf.l.wrap.b32 %r4167, %r4166, %r4166, 20; + add.s32 %r4168, %r4019, %r4160; + add.s32 %r4169, %r4168, %r4167; + xor.b32 %r4170, %r4169, %r4164; + shf.l.wrap.b32 %r4171, %r4170, %r4170, 24; + add.s32 %r4172, %r4171, %r4165; + xor.b32 %r4173, %r4172, %r4167; + shf.l.wrap.b32 %r4174, %r4173, %r4173, 25; + ld.local.u8 %r4175, [%rd3+-108]; + ld.local.u8 %r4176, [%rd3+-107]; + prmt.b32 %r4177, %r4176, %r4175, 30212; + ld.local.u8 %r4178, [%rd3+-106]; + ld.local.u8 %r4179, [%rd3+-105]; + prmt.b32 %r4180, %r4179, %r4178, 30212; + prmt.b32 %r4181, %r4180, %r4177, 4180; + ld.local.u8 %r4182, [%rd3+-124]; + ld.local.u8 %r4183, [%rd3+-123]; + prmt.b32 %r4184, %r4183, %r4182, 30212; + ld.local.u8 %r4185, [%rd3+-122]; + ld.local.u8 %r4186, [%rd3+-121]; + prmt.b32 %r4187, %r4186, %r4185, 30212; + prmt.b32 %r4188, %r4187, %r4184, 4180; + add.s32 %r4189, %r4181, %r4188; + add.s32 %r4190, %r4189, %r4026; + xor.b32 %r4191, %r4190, %r4090; + shr.u32 %r4192, %r4190, 16; + shl.b32 %r4193, %r4191, 16; + or.b32 %r4194, %r4193, %r4192; + add.s32 %r4195, %r4194, -1521486534; + xor.b32 %r4196, %r4195, %r4181; + shf.l.wrap.b32 %r4197, %r4196, %r4196, 20; + add.s32 %r4198, %r4033, %r4190; + add.s32 %r4199, %r4198, %r4197; + xor.b32 %r4200, %r4199, %r4194; + shf.l.wrap.b32 %r4201, %r4200, %r4200, 24; + add.s32 %r4202, %r4201, %r4195; + xor.b32 %r4203, %r4202, %r4197; + shf.l.wrap.b32 %r4204, %r4203, %r4203, 25; + add.s32 %r4205, %r4144, %r4112; + add.s32 %r4206, %r4205, %r4040; + xor.b32 %r4207, %r4201, %r4206; + shf.l.wrap.b32 %r4208, %r4207, %r4207, 16; + add.s32 %r4209, %r4208, %r4172; + xor.b32 %r4210, %r4209, %r4144; + shf.l.wrap.b32 %r4211, %r4210, %r4210, 20; + add.s32 %r4212, %r4047, %r4206; + add.s32 %r4213, %r4212, %r4211; + xor.b32 %r4214, %r4213, %r4208; + shf.l.wrap.b32 %r4215, %r4214, %r4214, 24; + add.s32 %r4216, %r4215, %r4209; + xor.b32 %r4217, %r4216, %r4211; + shf.l.wrap.b32 %r4218, %r4217, %r4217, 25; + add.s32 %r4219, %r4174, %r4139; + add.s32 %r4220, %r4219, %r4054; + xor.b32 %r4221, %r4220, %r4114; + shf.l.wrap.b32 %r4222, %r4221, %r4221, 16; + add.s32 %r4223, %r4222, %r4202; + xor.b32 %r4224, %r4223, %r4174; + shf.l.wrap.b32 %r4225, %r4224, %r4224, 20; + add.s32 %r4226, %r4061, %r4220; + add.s32 %r4227, %r4226, %r4225; + xor.b32 %r4228, %r4227, %r4222; + shf.l.wrap.b32 %r4229, %r4228, %r4228, 24; + add.s32 %r4230, %r4229, %r4223; + xor.b32 %r4231, %r4230, %r4225; + shf.l.wrap.b32 %r4232, %r4231, %r4231, 
25; + add.s32 %r4233, %r4204, %r4169; + add.s32 %r4234, %r4233, %r4068; + xor.b32 %r4235, %r4234, %r4141; + shf.l.wrap.b32 %r4236, %r4235, %r4235, 16; + add.s32 %r4237, %r4236, %r4115; + xor.b32 %r4238, %r4237, %r4204; + shf.l.wrap.b32 %r4239, %r4238, %r4238, 20; + add.s32 %r4240, %r4075, %r4234; + add.s32 %r4241, %r4240, %r4239; + xor.b32 %r4242, %r4241, %r4236; + shf.l.wrap.b32 %r4243, %r4242, %r4242, 24; + add.s32 %r4244, %r4243, %r4237; + xor.b32 %r4245, %r4244, %r4239; + shf.l.wrap.b32 %r4246, %r4245, %r4245, 25; + add.s32 %r4247, %r4199, %r4117; + add.s32 %r4248, %r4247, %r4082; + xor.b32 %r4249, %r4248, %r4171; + shf.l.wrap.b32 %r4250, %r4249, %r4249, 16; + add.s32 %r4251, %r4250, %r4142; + xor.b32 %r4252, %r4251, %r4117; + shf.l.wrap.b32 %r4253, %r4252, %r4252, 20; + add.s32 %r4254, %r4089, %r4248; + add.s32 %r4255, %r4254, %r4253; + xor.b32 %r4256, %r4255, %r4250; + shf.l.wrap.b32 %r4257, %r4256, %r4256, 24; + add.s32 %r4258, %r4257, %r4251; + xor.b32 %r4259, %r4258, %r4253; + shf.l.wrap.b32 %r4260, %r4259, %r4259, 25; + add.s32 %r4261, %r4213, %r3998; + add.s32 %r4262, %r4261, %r4260; + xor.b32 %r4263, %r4262, %r4229; + shf.l.wrap.b32 %r4264, %r4263, %r4263, 16; + add.s32 %r4265, %r4264, %r4244; + xor.b32 %r4266, %r4265, %r4260; + shf.l.wrap.b32 %r4267, %r4266, %r4266, 20; + add.s32 %r4268, %r4262, %r4026; + add.s32 %r4269, %r4268, %r4267; + xor.b32 %r4270, %r4269, %r4264; + shf.l.wrap.b32 %r4271, %r4270, %r4270, 24; + add.s32 %r4272, %r4271, %r4265; + xor.b32 %r4273, %r4272, %r4267; + shf.l.wrap.b32 %r4274, %r4273, %r4273, 25; + add.s32 %r4275, %r4227, %r4005; + add.s32 %r4276, %r4275, %r4218; + xor.b32 %r4277, %r4243, %r4276; + shf.l.wrap.b32 %r4278, %r4277, %r4277, 16; + add.s32 %r4279, %r4258, %r4278; + xor.b32 %r4280, %r4279, %r4218; + shf.l.wrap.b32 %r4281, %r4280, %r4280, 20; + add.s32 %r4282, %r4276, %r4054; + add.s32 %r4283, %r4282, %r4281; + xor.b32 %r4284, %r4283, %r4278; + shf.l.wrap.b32 %r4285, %r4284, %r4284, 24; + add.s32 %r4286, %r4285, %r4279; + xor.b32 %r4287, %r4286, %r4281; + shf.l.wrap.b32 %r4288, %r4287, %r4287, 25; + add.s32 %r4289, %r4232, %r4033; + add.s32 %r4290, %r4289, %r4241; + xor.b32 %r4291, %r4257, %r4290; + shf.l.wrap.b32 %r4292, %r4291, %r4291, 16; + add.s32 %r4293, %r4292, %r4216; + xor.b32 %r4294, %r4293, %r4232; + shf.l.wrap.b32 %r4295, %r4294, %r4294, 20; + add.s32 %r4296, %r4290, %r3984; + add.s32 %r4297, %r4296, %r4295; + xor.b32 %r4298, %r4297, %r4292; + shf.l.wrap.b32 %r4299, %r4298, %r4298, 24; + add.s32 %r4300, %r4299, %r4293; + xor.b32 %r4301, %r4300, %r4295; + shf.l.wrap.b32 %r4302, %r4301, %r4301, 25; + add.s32 %r4303, %r4246, %r4012; + add.s32 %r4304, %r4303, %r4255; + xor.b32 %r4305, %r4304, %r4215; + shf.l.wrap.b32 %r4306, %r4305, %r4305, 16; + add.s32 %r4307, %r4306, %r4230; + xor.b32 %r4308, %r4307, %r4246; + shf.l.wrap.b32 %r4309, %r4308, %r4308, 20; + add.s32 %r4310, %r4304, %r4075; + add.s32 %r4311, %r4310, %r4309; + xor.b32 %r4312, %r4311, %r4306; + shf.l.wrap.b32 %r4313, %r4312, %r4312, 24; + add.s32 %r4314, %r4313, %r4307; + xor.b32 %r4315, %r4314, %r4309; + shf.l.wrap.b32 %r4316, %r4315, %r4315, 25; + add.s32 %r4317, %r4288, %r3991; + add.s32 %r4318, %r4317, %r4269; + xor.b32 %r4319, %r4318, %r4313; + shf.l.wrap.b32 %r4320, %r4319, %r4319, 16; + add.s32 %r4321, %r4320, %r4300; + xor.b32 %r4322, %r4321, %r4288; + shf.l.wrap.b32 %r4323, %r4322, %r4322, 20; + add.s32 %r4324, %r4318, %r4061; + add.s32 %r4325, %r4324, %r4323; + xor.b32 %r4326, %r4325, %r4320; + shf.l.wrap.b32 %r4327, %r4326, %r4326, 24; + add.s32 %r4328, 
%r4327, %r4321; + xor.b32 %r4329, %r4328, %r4323; + shf.l.wrap.b32 %r4330, %r4329, %r4329, 25; + add.s32 %r4331, %r4283, %r4068; + add.s32 %r4332, %r4331, %r4302; + xor.b32 %r4333, %r4271, %r4332; + shf.l.wrap.b32 %r4334, %r4333, %r4333, 16; + add.s32 %r4335, %r4334, %r4314; + xor.b32 %r4336, %r4335, %r4302; + shf.l.wrap.b32 %r4337, %r4336, %r4336, 20; + add.s32 %r4338, %r4332, %r4019; + add.s32 %r4339, %r4338, %r4337; + xor.b32 %r4340, %r4339, %r4334; + shf.l.wrap.b32 %r4341, %r4340, %r4340, 24; + add.s32 %r4342, %r4341, %r4335; + xor.b32 %r4343, %r4342, %r4337; + shf.l.wrap.b32 %r4344, %r4343, %r4343, 25; + add.s32 %r4345, %r4297, %r4047; + add.s32 %r4346, %r4345, %r4316; + xor.b32 %r4347, %r4346, %r4285; + shf.l.wrap.b32 %r4348, %r4347, %r4347, 16; + add.s32 %r4349, %r4348, %r4272; + xor.b32 %r4350, %r4349, %r4316; + shf.l.wrap.b32 %r4351, %r4350, %r4350, 20; + add.s32 %r4352, %r4346, %r4082; + add.s32 %r4353, %r4352, %r4351; + xor.b32 %r4354, %r4353, %r4348; + shf.l.wrap.b32 %r4355, %r4354, %r4354, 24; + add.s32 %r4356, %r4355, %r4349; + xor.b32 %r4357, %r4356, %r4351; + shf.l.wrap.b32 %r4358, %r4357, %r4357, 25; + add.s32 %r4359, %r4311, %r4089; + add.s32 %r4360, %r4359, %r4274; + xor.b32 %r4361, %r4360, %r4299; + shf.l.wrap.b32 %r4362, %r4361, %r4361, 16; + add.s32 %r4363, %r4362, %r4286; + xor.b32 %r4364, %r4363, %r4274; + shf.l.wrap.b32 %r4365, %r4364, %r4364, 20; + add.s32 %r4366, %r4360, %r4040; + add.s32 %r4367, %r4366, %r4365; + xor.b32 %r4368, %r4367, %r4362; + shf.l.wrap.b32 %r4369, %r4368, %r4368, 24; + add.s32 %r4370, %r4369, %r4363; + xor.b32 %r4371, %r4370, %r4365; + shf.l.wrap.b32 %r4372, %r4371, %r4371, 25; + add.s32 %r4373, %r4325, %r4005; + add.s32 %r4374, %r4373, %r4372; + xor.b32 %r4375, %r4374, %r4341; + shf.l.wrap.b32 %r4376, %r4375, %r4375, 16; + add.s32 %r4377, %r4376, %r4356; + xor.b32 %r4378, %r4377, %r4372; + shf.l.wrap.b32 %r4379, %r4378, %r4378, 20; + add.s32 %r4380, %r4374, %r4012; + add.s32 %r4381, %r4380, %r4379; + xor.b32 %r4382, %r4381, %r4376; + shf.l.wrap.b32 %r4383, %r4382, %r4382, 24; + add.s32 %r4384, %r4383, %r4377; + xor.b32 %r4385, %r4384, %r4379; + shf.l.wrap.b32 %r4386, %r4385, %r4385, 25; + add.s32 %r4387, %r4339, %r4054; + add.s32 %r4388, %r4387, %r4330; + xor.b32 %r4389, %r4388, %r4355; + shf.l.wrap.b32 %r4390, %r4389, %r4389, 16; + add.s32 %r4391, %r4390, %r4370; + xor.b32 %r4392, %r4391, %r4330; + shf.l.wrap.b32 %r4393, %r4392, %r4392, 20; + add.s32 %r4394, %r4388, %r4068; + add.s32 %r4395, %r4394, %r4393; + xor.b32 %r4396, %r4395, %r4390; + shf.l.wrap.b32 %r4397, %r4396, %r4396, 24; + add.s32 %r4398, %r4397, %r4391; + xor.b32 %r4399, %r4398, %r4393; + shf.l.wrap.b32 %r4400, %r4399, %r4399, 25; + add.s32 %r4401, %r4353, %r4075; + add.s32 %r4402, %r4401, %r4344; + xor.b32 %r4403, %r4369, %r4402; + shf.l.wrap.b32 %r4404, %r4403, %r4403, 16; + add.s32 %r4405, %r4404, %r4328; + xor.b32 %r4406, %r4405, %r4344; + shf.l.wrap.b32 %r4407, %r4406, %r4406, 20; + add.s32 %r4408, %r4402, %r3998; + add.s32 %r4409, %r4408, %r4407; + xor.b32 %r4410, %r4409, %r4404; + shf.l.wrap.b32 %r4411, %r4410, %r4410, 24; + add.s32 %r4412, %r4411, %r4405; + xor.b32 %r4413, %r4412, %r4407; + shf.l.wrap.b32 %r4414, %r4413, %r4413, 25; + add.s32 %r4415, %r4358, %r4033; + add.s32 %r4416, %r4415, %r4367; + xor.b32 %r4417, %r4416, %r4327; + shf.l.wrap.b32 %r4418, %r4417, %r4417, 16; + add.s32 %r4419, %r4418, %r4342; + xor.b32 %r4420, %r4419, %r4358; + shf.l.wrap.b32 %r4421, %r4420, %r4420, 20; + add.s32 %r4422, %r4416, %r4082; + add.s32 %r4423, %r4422, %r4421; + xor.b32 
%r4424, %r4423, %r4418; + shf.l.wrap.b32 %r4425, %r4424, %r4424, 24; + add.s32 %r4426, %r4425, %r4419; + xor.b32 %r4427, %r4426, %r4421; + shf.l.wrap.b32 %r4428, %r4427, %r4427, 25; + add.s32 %r4429, %r4381, %r4026; + add.s32 %r4430, %r4429, %r4400; + xor.b32 %r4431, %r4430, %r4425; + shf.l.wrap.b32 %r4432, %r4431, %r4431, 16; + add.s32 %r4433, %r4432, %r4412; + xor.b32 %r4434, %r4433, %r4400; + shf.l.wrap.b32 %r4435, %r4434, %r4434, 20; + add.s32 %r4436, %r4430, %r4019; + add.s32 %r4437, %r4436, %r4435; + xor.b32 %r4438, %r4437, %r4432; + shf.l.wrap.b32 %r4439, %r4438, %r4438, 24; + add.s32 %r4440, %r4439, %r4433; + xor.b32 %r4441, %r4440, %r4435; + shf.l.wrap.b32 %r4442, %r4441, %r4441, 25; + add.s32 %r4443, %r4395, %r4047; + add.s32 %r4444, %r4443, %r4414; + xor.b32 %r4445, %r4383, %r4444; + shf.l.wrap.b32 %r4446, %r4445, %r4445, 16; + add.s32 %r4447, %r4446, %r4426; + xor.b32 %r4448, %r4447, %r4414; + shf.l.wrap.b32 %r4449, %r4448, %r4448, 20; + add.s32 %r4450, %r4444, %r3984; + add.s32 %r4451, %r4450, %r4449; + xor.b32 %r4452, %r4451, %r4446; + shf.l.wrap.b32 %r4453, %r4452, %r4452, 24; + add.s32 %r4454, %r4453, %r4447; + xor.b32 %r4455, %r4454, %r4449; + shf.l.wrap.b32 %r4456, %r4455, %r4455, 25; + add.s32 %r4457, %r4409, %r4061; + add.s32 %r4458, %r4457, %r4428; + xor.b32 %r4459, %r4458, %r4397; + shf.l.wrap.b32 %r4460, %r4459, %r4459, 16; + add.s32 %r4461, %r4460, %r4384; + xor.b32 %r4462, %r4461, %r4428; + shf.l.wrap.b32 %r4463, %r4462, %r4462, 20; + add.s32 %r4464, %r4458, %r4089; + add.s32 %r4465, %r4464, %r4463; + xor.b32 %r4466, %r4465, %r4460; + shf.l.wrap.b32 %r4467, %r4466, %r4466, 24; + add.s32 %r4468, %r4467, %r4461; + xor.b32 %r4469, %r4468, %r4463; + shf.l.wrap.b32 %r4470, %r4469, %r4469, 25; + add.s32 %r4471, %r4423, %r4040; + add.s32 %r4472, %r4471, %r4386; + xor.b32 %r4473, %r4472, %r4411; + shf.l.wrap.b32 %r4474, %r4473, %r4473, 16; + add.s32 %r4475, %r4474, %r4398; + xor.b32 %r4476, %r4475, %r4386; + shf.l.wrap.b32 %r4477, %r4476, %r4476, 20; + add.s32 %r4478, %r4472, %r3991; + add.s32 %r4479, %r4478, %r4477; + xor.b32 %r4480, %r4479, %r4474; + shf.l.wrap.b32 %r4481, %r4480, %r4480, 24; + add.s32 %r4482, %r4481, %r4475; + xor.b32 %r4483, %r4482, %r4477; + shf.l.wrap.b32 %r4484, %r4483, %r4483, 25; + add.s32 %r4485, %r4437, %r4054; + add.s32 %r4486, %r4485, %r4484; + xor.b32 %r4487, %r4486, %r4453; + shf.l.wrap.b32 %r4488, %r4487, %r4487, 16; + add.s32 %r4489, %r4488, %r4468; + xor.b32 %r4490, %r4489, %r4484; + shf.l.wrap.b32 %r4491, %r4490, %r4490, 20; + add.s32 %r4492, %r4486, %r4033; + add.s32 %r4493, %r4492, %r4491; + xor.b32 %r4494, %r4493, %r4488; + shf.l.wrap.b32 %r4495, %r4494, %r4494, 24; + add.s32 %r4496, %r4495, %r4489; + xor.b32 %r4497, %r4496, %r4491; + shf.l.wrap.b32 %r4498, %r4497, %r4497, 25; + add.s32 %r4499, %r4451, %r4068; + add.s32 %r4500, %r4499, %r4442; + xor.b32 %r4501, %r4500, %r4467; + shf.l.wrap.b32 %r4502, %r4501, %r4501, 16; + add.s32 %r4503, %r4502, %r4482; + xor.b32 %r4504, %r4503, %r4442; + shf.l.wrap.b32 %r4505, %r4504, %r4504, 20; + add.s32 %r4506, %r4500, %r4047; + add.s32 %r4507, %r4506, %r4505; + xor.b32 %r4508, %r4507, %r4502; + shf.l.wrap.b32 %r4509, %r4508, %r4508, 24; + add.s32 %r4510, %r4509, %r4503; + xor.b32 %r4511, %r4510, %r4505; + shf.l.wrap.b32 %r4512, %r4511, %r4511, 25; + add.s32 %r4513, %r4465, %r4082; + add.s32 %r4514, %r4513, %r4456; + xor.b32 %r4515, %r4481, %r4514; + shf.l.wrap.b32 %r4516, %r4515, %r4515, 16; + add.s32 %r4517, %r4516, %r4440; + xor.b32 %r4518, %r4517, %r4456; + shf.l.wrap.b32 %r4519, %r4518, 
%r4518, 20; + add.s32 %r4520, %r4514, %r4005; + add.s32 %r4521, %r4520, %r4519; + xor.b32 %r4522, %r4521, %r4516; + shf.l.wrap.b32 %r4523, %r4522, %r4522, 24; + add.s32 %r4524, %r4523, %r4517; + xor.b32 %r4525, %r4524, %r4519; + shf.l.wrap.b32 %r4526, %r4525, %r4525, 25; + add.s32 %r4527, %r4479, %r4075; + add.s32 %r4528, %r4527, %r4470; + xor.b32 %r4529, %r4528, %r4439; + shf.l.wrap.b32 %r4530, %r4529, %r4529, 16; + add.s32 %r4531, %r4530, %r4454; + xor.b32 %r4532, %r4531, %r4470; + shf.l.wrap.b32 %r4533, %r4532, %r4532, 20; + add.s32 %r4534, %r4528, %r4089; + add.s32 %r4535, %r4534, %r4533; + xor.b32 %r4536, %r4535, %r4530; + shf.l.wrap.b32 %r4537, %r4536, %r4536, 24; + add.s32 %r4538, %r4537, %r4531; + xor.b32 %r4539, %r4538, %r4533; + shf.l.wrap.b32 %r4540, %r4539, %r4539, 25; + add.s32 %r4541, %r4493, %r4012; + add.s32 %r4542, %r4541, %r4512; + xor.b32 %r4543, %r4542, %r4537; + shf.l.wrap.b32 %r4544, %r4543, %r4543, 16; + add.s32 %r4545, %r4544, %r4524; + xor.b32 %r4546, %r4545, %r4512; + shf.l.wrap.b32 %r4547, %r4546, %r4546, 20; + add.s32 %r4548, %r4542, %r3984; + add.s32 %r4549, %r4548, %r4547; + xor.b32 %r4550, %r4549, %r4544; + shf.l.wrap.b32 %r4551, %r4550, %r4550, 24; + add.s32 %r4552, %r4551, %r4545; + xor.b32 %r4553, %r4552, %r4547; + shf.l.wrap.b32 %r4554, %r4553, %r4553, 25; + add.s32 %r4555, %r4507, %r4061; + add.s32 %r4556, %r4555, %r4526; + xor.b32 %r4557, %r4495, %r4556; + shf.l.wrap.b32 %r4558, %r4557, %r4557, 16; + add.s32 %r4559, %r4558, %r4538; + xor.b32 %r4560, %r4559, %r4526; + shf.l.wrap.b32 %r4561, %r4560, %r4560, 20; + add.s32 %r4562, %r4556, %r3998; + add.s32 %r4563, %r4562, %r4561; + xor.b32 %r4564, %r4563, %r4558; + shf.l.wrap.b32 %r4565, %r4564, %r4564, 24; + add.s32 %r4566, %r4565, %r4559; + xor.b32 %r4567, %r4566, %r4561; + shf.l.wrap.b32 %r4568, %r4567, %r4567, 25; + add.s32 %r4569, %r4521, %r4019; + add.s32 %r4570, %r4569, %r4540; + xor.b32 %r4571, %r4570, %r4509; + shf.l.wrap.b32 %r4572, %r4571, %r4571, 16; + add.s32 %r4573, %r4572, %r4496; + xor.b32 %r4574, %r4573, %r4540; + shf.l.wrap.b32 %r4575, %r4574, %r4574, 20; + add.s32 %r4576, %r4570, %r4040; + add.s32 %r4577, %r4576, %r4575; + xor.b32 %r4578, %r4577, %r4572; + shf.l.wrap.b32 %r4579, %r4578, %r4578, 24; + add.s32 %r4580, %r4579, %r4573; + xor.b32 %r4581, %r4580, %r4575; + shf.l.wrap.b32 %r4582, %r4581, %r4581, 25; + add.s32 %r4583, %r4535, %r3991; + add.s32 %r4584, %r4583, %r4498; + xor.b32 %r4585, %r4584, %r4523; + shf.l.wrap.b32 %r4586, %r4585, %r4585, 16; + add.s32 %r4587, %r4586, %r4510; + xor.b32 %r4588, %r4587, %r4498; + shf.l.wrap.b32 %r4589, %r4588, %r4588, 20; + add.s32 %r4590, %r4584, %r4026; + add.s32 %r4591, %r4590, %r4589; + xor.b32 %r4592, %r4591, %r4586; + shf.l.wrap.b32 %r4593, %r4592, %r4592, 24; + add.s32 %r4594, %r4593, %r4587; + xor.b32 %r4595, %r4594, %r4589; + shf.l.wrap.b32 %r4596, %r4595, %r4595, 25; + add.s32 %r4597, %r4549, %r4068; + add.s32 %r4598, %r4597, %r4596; + xor.b32 %r4599, %r4598, %r4565; + shf.l.wrap.b32 %r4600, %r4599, %r4599, 16; + add.s32 %r4601, %r4600, %r4580; + xor.b32 %r4602, %r4601, %r4596; + shf.l.wrap.b32 %r4603, %r4602, %r4602, 20; + add.s32 %r4604, %r4598, %r4075; + add.s32 %r4605, %r4604, %r4603; + xor.b32 %r4606, %r4605, %r4600; + shf.l.wrap.b32 %r4607, %r4606, %r4606, 24; + add.s32 %r4608, %r4607, %r4601; + xor.b32 %r4609, %r4608, %r4603; + shf.l.wrap.b32 %r4610, %r4609, %r4609, 25; + add.s32 %r4611, %r4563, %r4047; + add.s32 %r4612, %r4611, %r4554; + xor.b32 %r4613, %r4612, %r4579; + shf.l.wrap.b32 %r4614, %r4613, %r4613, 16; + add.s32 
%r4615, %r4614, %r4594; + xor.b32 %r4616, %r4615, %r4554; + shf.l.wrap.b32 %r4617, %r4616, %r4616, 20; + add.s32 %r4618, %r4612, %r4061; + add.s32 %r4619, %r4618, %r4617; + xor.b32 %r4620, %r4619, %r4614; + shf.l.wrap.b32 %r4621, %r4620, %r4620, 24; + add.s32 %r4622, %r4621, %r4615; + xor.b32 %r4623, %r4622, %r4617; + shf.l.wrap.b32 %r4624, %r4623, %r4623, 25; + add.s32 %r4625, %r4577, %r4089; + add.s32 %r4626, %r4625, %r4568; + xor.b32 %r4627, %r4593, %r4626; + shf.l.wrap.b32 %r4628, %r4627, %r4627, 16; + add.s32 %r4629, %r4628, %r4552; + xor.b32 %r4630, %r4629, %r4568; + shf.l.wrap.b32 %r4631, %r4630, %r4630, 20; + add.s32 %r4632, %r4626, %r4054; + add.s32 %r4633, %r4632, %r4631; + xor.b32 %r4634, %r4633, %r4628; + shf.l.wrap.b32 %r4635, %r4634, %r4634, 24; + add.s32 %r4636, %r4635, %r4629; + xor.b32 %r4637, %r4636, %r4631; + shf.l.wrap.b32 %r4638, %r4637, %r4637, 25; + add.s32 %r4639, %r4591, %r4082; + add.s32 %r4640, %r4639, %r4582; + xor.b32 %r4641, %r4640, %r4551; + shf.l.wrap.b32 %r4642, %r4641, %r4641, 16; + add.s32 %r4643, %r4642, %r4566; + xor.b32 %r4644, %r4643, %r4582; + shf.l.wrap.b32 %r4645, %r4644, %r4644, 20; + add.s32 %r4646, %r4640, %r4040; + add.s32 %r4647, %r4646, %r4645; + xor.b32 %r4648, %r4647, %r4642; + shf.l.wrap.b32 %r4649, %r4648, %r4648, 24; + add.s32 %r4650, %r4649, %r4643; + xor.b32 %r4651, %r4650, %r4645; + shf.l.wrap.b32 %r4652, %r4651, %r4651, 25; + add.s32 %r4653, %r4605, %r4033; + add.s32 %r4654, %r4653, %r4624; + xor.b32 %r4655, %r4654, %r4649; + shf.l.wrap.b32 %r4656, %r4655, %r4655, 16; + add.s32 %r4657, %r4656, %r4636; + xor.b32 %r4658, %r4657, %r4624; + shf.l.wrap.b32 %r4659, %r4658, %r4658, 20; + add.s32 %r4660, %r4654, %r3998; + add.s32 %r4661, %r4660, %r4659; + xor.b32 %r4662, %r4661, %r4656; + shf.l.wrap.b32 %r4663, %r4662, %r4662, 24; + add.s32 %r4664, %r4663, %r4657; + xor.b32 %r4665, %r4664, %r4659; + shf.l.wrap.b32 %r4666, %r4665, %r4665, 25; + add.s32 %r4667, %r4619, %r4019; + add.s32 %r4668, %r4667, %r4638; + xor.b32 %r4669, %r4607, %r4668; + shf.l.wrap.b32 %r4670, %r4669, %r4669, 16; + add.s32 %r4671, %r4670, %r4650; + xor.b32 %r4672, %r4671, %r4638; + shf.l.wrap.b32 %r4673, %r4672, %r4672, 20; + add.s32 %r4674, %r4668, %r4005; + add.s32 %r4675, %r4674, %r4673; + xor.b32 %r4676, %r4675, %r4670; + shf.l.wrap.b32 %r4677, %r4676, %r4676, 24; + add.s32 %r4678, %r4677, %r4671; + xor.b32 %r4679, %r4678, %r4673; + shf.l.wrap.b32 %r4680, %r4679, %r4679, 25; + add.s32 %r4681, %r4633, %r3984; + add.s32 %r4682, %r4681, %r4652; + xor.b32 %r4683, %r4682, %r4621; + shf.l.wrap.b32 %r4684, %r4683, %r4683, 16; + add.s32 %r4685, %r4684, %r4608; + xor.b32 %r4686, %r4685, %r4652; + shf.l.wrap.b32 %r4687, %r4686, %r4686, 20; + add.s32 %r4688, %r4682, %r3991; + add.s32 %r4689, %r4688, %r4687; + xor.b32 %r4690, %r4689, %r4684; + shf.l.wrap.b32 %r4691, %r4690, %r4690, 24; + add.s32 %r4692, %r4691, %r4685; + xor.b32 %r4693, %r4692, %r4687; + shf.l.wrap.b32 %r4694, %r4693, %r4693, 25; + add.s32 %r4695, %r4647, %r4026; + add.s32 %r4696, %r4695, %r4610; + xor.b32 %r4697, %r4696, %r4635; + shf.l.wrap.b32 %r4698, %r4697, %r4697, 16; + add.s32 %r4699, %r4698, %r4622; + xor.b32 %r4700, %r4699, %r4610; + shf.l.wrap.b32 %r4701, %r4700, %r4700, 20; + add.s32 %r4702, %r4696, %r4012; + add.s32 %r4703, %r4702, %r4701; + xor.b32 %r4704, %r4703, %r4698; + shf.l.wrap.b32 %r4705, %r4704, %r4704, 24; + add.s32 %r4706, %r4705, %r4699; + xor.b32 %r4707, %r4706, %r4701; + shf.l.wrap.b32 %r4708, %r4707, %r4707, 25; + add.s32 %r4709, %r4661, %r4047; + add.s32 %r4710, %r4709, %r4708; + 
xor.b32 %r4711, %r4710, %r4677; + shf.l.wrap.b32 %r4712, %r4711, %r4711, 16; + add.s32 %r4713, %r4712, %r4692; + xor.b32 %r4714, %r4713, %r4708; + shf.l.wrap.b32 %r4715, %r4714, %r4714, 20; + add.s32 %r4716, %r4710, %r4082; + add.s32 %r4717, %r4716, %r4715; + xor.b32 %r4718, %r4717, %r4712; + shf.l.wrap.b32 %r4719, %r4718, %r4718, 24; + add.s32 %r4720, %r4719, %r4713; + xor.b32 %r4721, %r4720, %r4715; + shf.l.wrap.b32 %r4722, %r4721, %r4721, 25; + add.s32 %r4723, %r4675, %r4061; + add.s32 %r4724, %r4723, %r4666; + xor.b32 %r4725, %r4724, %r4691; + shf.l.wrap.b32 %r4726, %r4725, %r4725, 16; + add.s32 %r4727, %r4726, %r4706; + xor.b32 %r4728, %r4727, %r4666; + shf.l.wrap.b32 %r4729, %r4728, %r4728, 20; + add.s32 %r4730, %r4724, %r4019; + add.s32 %r4731, %r4730, %r4729; + xor.b32 %r4732, %r4731, %r4726; + shf.l.wrap.b32 %r4733, %r4732, %r4732, 24; + add.s32 %r4734, %r4733, %r4727; + xor.b32 %r4735, %r4734, %r4729; + shf.l.wrap.b32 %r4736, %r4735, %r4735, 25; + add.s32 %r4737, %r4689, %r4040; + add.s32 %r4738, %r4737, %r4680; + xor.b32 %r4739, %r4705, %r4738; + shf.l.wrap.b32 %r4740, %r4739, %r4739, 16; + add.s32 %r4741, %r4740, %r4664; + xor.b32 %r4742, %r4741, %r4680; + shf.l.wrap.b32 %r4743, %r4742, %r4742, 20; + add.s32 %r4744, %r4738, %r4068; + add.s32 %r4745, %r4744, %r4743; + xor.b32 %r4746, %r4745, %r4740; + shf.l.wrap.b32 %r4747, %r4746, %r4746, 24; + add.s32 %r4748, %r4747, %r4741; + xor.b32 %r4749, %r4748, %r4743; + shf.l.wrap.b32 %r4750, %r4749, %r4749, 25; + add.s32 %r4751, %r4703, %r4089; + add.s32 %r4752, %r4751, %r4694; + xor.b32 %r4753, %r4752, %r4663; + shf.l.wrap.b32 %r4754, %r4753, %r4753, 16; + add.s32 %r4755, %r4754, %r4678; + xor.b32 %r4756, %r4755, %r4694; + shf.l.wrap.b32 %r4757, %r4756, %r4756, 20; + add.s32 %r4758, %r4752, %r3991; + add.s32 %r4759, %r4758, %r4757; + xor.b32 %r4760, %r4759, %r4754; + shf.l.wrap.b32 %r4761, %r4760, %r4760, 24; + add.s32 %r4762, %r4761, %r4755; + xor.b32 %r4763, %r4762, %r4757; + shf.l.wrap.b32 %r4764, %r4763, %r4763, 25; + add.s32 %r4765, %r4717, %r4075; + add.s32 %r4766, %r4765, %r4736; + xor.b32 %r4767, %r4766, %r4761; + shf.l.wrap.b32 %r4768, %r4767, %r4767, 16; + add.s32 %r4769, %r4768, %r4748; + xor.b32 %r4770, %r4769, %r4736; + shf.l.wrap.b32 %r4771, %r4770, %r4770, 20; + add.s32 %r4772, %r4766, %r4005; + add.s32 %r4773, %r4772, %r4771; + xor.b32 %r4774, %r4773, %r4768; + shf.l.wrap.b32 %r4775, %r4774, %r4774, 24; + add.s32 %r4776, %r4775, %r4769; + xor.b32 %r4777, %r4776, %r4771; + shf.l.wrap.b32 %r4778, %r4777, %r4777, 25; + add.s32 %r4779, %r4731, %r3984; + add.s32 %r4780, %r4779, %r4750; + xor.b32 %r4781, %r4719, %r4780; + shf.l.wrap.b32 %r4782, %r4781, %r4781, 16; + add.s32 %r4783, %r4782, %r4762; + xor.b32 %r4784, %r4783, %r4750; + shf.l.wrap.b32 %r4785, %r4784, %r4784, 20; + add.s32 %r4786, %r4780, %r4054; + add.s32 %r4787, %r4786, %r4785; + xor.b32 %r4788, %r4787, %r4782; + shf.l.wrap.b32 %r4789, %r4788, %r4788, 24; + add.s32 %r4790, %r4789, %r4783; + xor.b32 %r4791, %r4790, %r4785; + shf.l.wrap.b32 %r4792, %r4791, %r4791, 25; + add.s32 %r4793, %r4745, %r3998; + add.s32 %r4794, %r4793, %r4764; + xor.b32 %r4795, %r4794, %r4733; + shf.l.wrap.b32 %r4796, %r4795, %r4795, 16; + add.s32 %r4797, %r4796, %r4720; + xor.b32 %r4798, %r4797, %r4764; + shf.l.wrap.b32 %r4799, %r4798, %r4798, 20; + add.s32 %r4800, %r4794, %r4026; + add.s32 %r4801, %r4800, %r4799; + xor.b32 %r4802, %r4801, %r4796; + shf.l.wrap.b32 %r4803, %r4802, %r4802, 24; + add.s32 %r4804, %r4803, %r4797; + xor.b32 %r4805, %r4804, %r4799; + shf.l.wrap.b32 %r4806, 
%r4805, %r4805, 25; + add.s32 %r4807, %r4759, %r4012; + add.s32 %r4808, %r4807, %r4722; + xor.b32 %r4809, %r4808, %r4747; + shf.l.wrap.b32 %r4810, %r4809, %r4809, 16; + add.s32 %r4811, %r4810, %r4734; + xor.b32 %r4812, %r4811, %r4722; + shf.l.wrap.b32 %r4813, %r4812, %r4812, 20; + add.s32 %r4814, %r4808, %r4033; + add.s32 %r4815, %r4814, %r4813; + xor.b32 %r4816, %r4815, %r4810; + shf.l.wrap.b32 %r4817, %r4816, %r4816, 24; + add.s32 %r4818, %r4817, %r4811; + xor.b32 %r4819, %r4818, %r4813; + shf.l.wrap.b32 %r4820, %r4819, %r4819, 25; + add.s32 %r4821, %r4773, %r4061; + add.s32 %r4822, %r4821, %r4820; + xor.b32 %r4823, %r4822, %r4789; + shf.l.wrap.b32 %r4824, %r4823, %r4823, 16; + add.s32 %r4825, %r4824, %r4804; + xor.b32 %r4826, %r4825, %r4820; + shf.l.wrap.b32 %r4827, %r4826, %r4826, 20; + add.s32 %r4828, %r4822, %r4089; + add.s32 %r4829, %r4828, %r4827; + xor.b32 %r4830, %r4829, %r4824; + shf.l.wrap.b32 %r4831, %r4830, %r4830, 24; + add.s32 %r4832, %r4831, %r4825; + xor.b32 %r4833, %r4832, %r4827; + shf.l.wrap.b32 %r4834, %r4833, %r4833, 25; + add.s32 %r4835, %r4787, %r4019; + add.s32 %r4836, %r4835, %r4778; + xor.b32 %r4837, %r4836, %r4803; + shf.l.wrap.b32 %r4838, %r4837, %r4837, 16; + add.s32 %r4839, %r4838, %r4818; + xor.b32 %r4840, %r4839, %r4778; + shf.l.wrap.b32 %r4841, %r4840, %r4840, 20; + add.s32 %r4842, %r4836, %r3984; + add.s32 %r4843, %r4842, %r4841; + xor.b32 %r4844, %r4843, %r4838; + shf.l.wrap.b32 %r4845, %r4844, %r4844, 24; + add.s32 %r4846, %r4845, %r4839; + xor.b32 %r4847, %r4846, %r4841; + shf.l.wrap.b32 %r4848, %r4847, %r4847, 25; + add.s32 %r4849, %r4801, %r3991; + add.s32 %r4850, %r4849, %r4792; + xor.b32 %r4851, %r4817, %r4850; + shf.l.wrap.b32 %r4852, %r4851, %r4851, 16; + add.s32 %r4853, %r4852, %r4776; + xor.b32 %r4854, %r4853, %r4792; + shf.l.wrap.b32 %r4855, %r4854, %r4854, 20; + add.s32 %r4856, %r4850, %r4047; + add.s32 %r4857, %r4856, %r4855; + xor.b32 %r4858, %r4857, %r4852; + shf.l.wrap.b32 %r4859, %r4858, %r4858, 24; + add.s32 %r4860, %r4859, %r4853; + xor.b32 %r4861, %r4860, %r4855; + shf.l.wrap.b32 %r4862, %r4861, %r4861, 25; + add.s32 %r4863, %r4815, %r4040; + add.s32 %r4864, %r4863, %r4806; + xor.b32 %r4865, %r4864, %r4775; + shf.l.wrap.b32 %r4866, %r4865, %r4865, 16; + add.s32 %r4867, %r4866, %r4790; + xor.b32 %r4868, %r4867, %r4806; + shf.l.wrap.b32 %r4869, %r4868, %r4868, 20; + add.s32 %r4870, %r4864, %r4026; + add.s32 %r4871, %r4870, %r4869; + xor.b32 %r4872, %r4871, %r4866; + shf.l.wrap.b32 %r4873, %r4872, %r4872, 24; + add.s32 %r4874, %r4873, %r4867; + xor.b32 %r4875, %r4874, %r4869; + shf.l.wrap.b32 %r4876, %r4875, %r4875, 25; + add.s32 %r4877, %r4829, %r4082; + add.s32 %r4878, %r4877, %r4848; + xor.b32 %r4879, %r4878, %r4873; + shf.l.wrap.b32 %r4880, %r4879, %r4879, 16; + add.s32 %r4881, %r4880, %r4860; + xor.b32 %r4882, %r4881, %r4848; + shf.l.wrap.b32 %r4883, %r4882, %r4882, 20; + add.s32 %r4884, %r4878, %r4054; + add.s32 %r4885, %r4884, %r4883; + xor.b32 %r4886, %r4885, %r4880; + shf.l.wrap.b32 %r4887, %r4886, %r4886, 24; + add.s32 %r4888, %r4887, %r4881; + xor.b32 %r4889, %r4888, %r4883; + shf.l.wrap.b32 %r4890, %r4889, %r4889, 25; + add.s32 %r4891, %r4843, %r3998; + add.s32 %r4892, %r4891, %r4862; + xor.b32 %r4893, %r4831, %r4892; + shf.l.wrap.b32 %r4894, %r4893, %r4893, 16; + add.s32 %r4895, %r4894, %r4874; + xor.b32 %r4896, %r4895, %r4862; + shf.l.wrap.b32 %r4897, %r4896, %r4896, 20; + add.s32 %r4898, %r4892, %r4068; + add.s32 %r4899, %r4898, %r4897; + xor.b32 %r4900, %r4899, %r4894; + shf.l.wrap.b32 %r4901, %r4900, %r4900, 24; + 
add.s32 %r4902, %r4901, %r4895; + xor.b32 %r4903, %r4902, %r4897; + shf.l.wrap.b32 %r4904, %r4903, %r4903, 25; + add.s32 %r4905, %r4857, %r4005; + add.s32 %r4906, %r4905, %r4876; + xor.b32 %r4907, %r4906, %r4845; + shf.l.wrap.b32 %r4908, %r4907, %r4907, 16; + add.s32 %r4909, %r4908, %r4832; + xor.b32 %r4910, %r4909, %r4876; + shf.l.wrap.b32 %r4911, %r4910, %r4910, 20; + add.s32 %r4912, %r4906, %r4012; + add.s32 %r4913, %r4912, %r4911; + xor.b32 %r4914, %r4913, %r4908; + shf.l.wrap.b32 %r4915, %r4914, %r4914, 24; + add.s32 %r4916, %r4915, %r4909; + xor.b32 %r4917, %r4916, %r4911; + shf.l.wrap.b32 %r4918, %r4917, %r4917, 25; + add.s32 %r4919, %r4871, %r4033; + add.s32 %r4920, %r4919, %r4834; + xor.b32 %r4921, %r4920, %r4859; + shf.l.wrap.b32 %r4922, %r4921, %r4921, 16; + add.s32 %r4923, %r4922, %r4846; + xor.b32 %r4924, %r4923, %r4834; + shf.l.wrap.b32 %r4925, %r4924, %r4924, 20; + add.s32 %r4926, %r4920, %r4075; + add.s32 %r4927, %r4926, %r4925; + xor.b32 %r4928, %r4927, %r4922; + shf.l.wrap.b32 %r4929, %r4928, %r4928, 24; + add.s32 %r4930, %r4929, %r4923; + xor.b32 %r4931, %r4930, %r4925; + shf.l.wrap.b32 %r4932, %r4931, %r4931, 25; + xor.b32 %r4933, %r4916, %r4885; + xor.b32 %r4934, %r4930, %r4899; + xor.b32 %r4935, %r4888, %r4913; + xor.b32 %r4936, %r4927, %r4902; + xor.b32 %r4937, %r4932, %r4901; + xor.b32 %r4938, %r4890, %r4915; + xor.b32 %r4939, %r4929, %r4904; + xor.b32 %r4940, %r4918, %r4887; + st.local.u8 [%rd167+145], %r4933; + shr.u32 %r4941, %r4933, 8; + st.local.u8 [%rd167+146], %r4941; + shr.u32 %r4942, %r4933, 16; + st.local.u8 [%rd167+147], %r4942; + shr.u32 %r4943, %r4933, 24; + st.local.u8 [%rd167+148], %r4943; + st.local.u8 [%rd167+149], %r4934; + shr.u32 %r4944, %r4934, 8; + st.local.u8 [%rd167+150], %r4944; + shr.u32 %r4945, %r4934, 16; + st.local.u8 [%rd167+151], %r4945; + shr.u32 %r4946, %r4934, 24; + st.local.u8 [%rd167+152], %r4946; + st.local.u8 [%rd167+153], %r4935; + shr.u32 %r4947, %r4935, 8; + st.local.u8 [%rd167+154], %r4947; + shr.u32 %r4948, %r4935, 16; + st.local.u8 [%rd167+155], %r4948; + shr.u32 %r4949, %r4935, 24; + st.local.u8 [%rd167+156], %r4949; + st.local.u8 [%rd167+157], %r4936; + shr.u32 %r4950, %r4936, 8; + st.local.u8 [%rd167+158], %r4950; + shr.u32 %r4951, %r4936, 16; + st.local.u8 [%rd167+159], %r4951; + shr.u32 %r4952, %r4936, 24; + st.local.u8 [%rd167+160], %r4952; + st.local.u8 [%rd167+161], %r4937; + shr.u32 %r4953, %r4937, 8; + st.local.u8 [%rd167+162], %r4953; + shr.u32 %r4954, %r4937, 16; + st.local.u8 [%rd167+163], %r4954; + shr.u32 %r4955, %r4937, 24; + st.local.u8 [%rd167+164], %r4955; + st.local.u8 [%rd167+165], %r4938; + shr.u32 %r4956, %r4938, 8; + st.local.u8 [%rd167+166], %r4956; + shr.u32 %r4957, %r4938, 16; + st.local.u8 [%rd167+167], %r4957; + shr.u32 %r4958, %r4938, 24; + st.local.u8 [%rd167+168], %r4958; + st.local.u8 [%rd167+169], %r4939; + shr.u32 %r4959, %r4939, 8; + st.local.u8 [%rd167+170], %r4959; + shr.u32 %r4960, %r4939, 16; + st.local.u8 [%rd167+171], %r4960; + shr.u32 %r4961, %r4939, 24; + st.local.u8 [%rd167+172], %r4961; + st.local.u8 [%rd167+173], %r4940; + shr.u32 %r4962, %r4940, 8; + st.local.u8 [%rd167+174], %r4962; + shr.u32 %r4963, %r4940, 16; + st.local.u8 [%rd167+175], %r4963; + shr.u32 %r4964, %r4940, 24; + st.local.u8 [%rd167+176], %r4964; + ld.local.u8 %rs138, [%rd3+8]; + add.s16 %rs139, %rs138, -1; + st.local.u8 [%rd3+8], %rs139; + cvt.u64.u16 %rd168, %rs139; + and.b64 %rd169, %rd168, 255; + setp.lt.u64 %p28, %rd225, %rd169; + and.b16 %rs140, %rs139, 255; + mul.wide.u16 %r11661, %rs140, 32; + @%p28 
bra $L__BB1_31; + +$L__BB1_32: + cvt.s64.s32 %rd170, %r11661; + add.s64 %rd171, %rd2, %rd170; + mov.b32 {%rs141, %rs142}, %r3959; + st.local.u8 [%rd171+145], %rs141; + shr.u16 %rs143, %rs141, 8; + st.local.u8 [%rd171+146], %rs143; + st.local.u8 [%rd171+147], %rs142; + shr.u16 %rs144, %rs142, 8; + st.local.u8 [%rd171+148], %rs144; + mov.b32 {%rs145, %rs146}, %r3960; + st.local.u8 [%rd171+149], %rs145; + shr.u16 %rs147, %rs145, 8; + st.local.u8 [%rd171+150], %rs147; + st.local.u8 [%rd171+151], %rs146; + shr.u16 %rs148, %rs146, 8; + st.local.u8 [%rd171+152], %rs148; + mov.b32 {%rs149, %rs150}, %r3961; + st.local.u8 [%rd171+153], %rs149; + shr.u16 %rs151, %rs149, 8; + st.local.u8 [%rd171+154], %rs151; + st.local.u8 [%rd171+155], %rs150; + shr.u16 %rs152, %rs150, 8; + st.local.u8 [%rd171+156], %rs152; + mov.b32 {%rs153, %rs154}, %r3962; + st.local.u8 [%rd171+157], %rs153; + shr.u16 %rs155, %rs153, 8; + st.local.u8 [%rd171+158], %rs155; + st.local.u8 [%rd171+159], %rs154; + shr.u16 %rs156, %rs154, 8; + st.local.u8 [%rd171+160], %rs156; + mov.b32 {%rs157, %rs158}, %r3963; + st.local.u8 [%rd171+161], %rs157; + shr.u16 %rs159, %rs157, 8; + st.local.u8 [%rd171+162], %rs159; + st.local.u8 [%rd171+163], %rs158; + shr.u16 %rs160, %rs158, 8; + st.local.u8 [%rd171+164], %rs160; + mov.b32 {%rs161, %rs162}, %r3964; + st.local.u8 [%rd171+165], %rs161; + shr.u16 %rs163, %rs161, 8; + st.local.u8 [%rd171+166], %rs163; + st.local.u8 [%rd171+167], %rs162; + shr.u16 %rs164, %rs162, 8; + st.local.u8 [%rd171+168], %rs164; + mov.b32 {%rs165, %rs166}, %r3965; + st.local.u8 [%rd171+169], %rs165; + shr.u16 %rs167, %rs165, 8; + st.local.u8 [%rd171+170], %rs167; + st.local.u8 [%rd171+171], %rs166; + shr.u16 %rs168, %rs166, 8; + st.local.u8 [%rd171+172], %rs168; + mov.b32 {%rs169, %rs170}, %r3966; + st.local.u8 [%rd171+173], %rs169; + shr.u16 %rs171, %rs169, 8; + st.local.u8 [%rd171+174], %rs171; + st.local.u8 [%rd171+175], %rs170; + shr.u16 %rs172, %rs170, 8; + st.local.u8 [%rd171+176], %rs172; + ld.local.u8 %rs173, [%rd3+8]; + add.s16 %rs174, %rs173, 1; + st.local.u8 [%rd3+8], %rs174; + shr.u64 %rd172, %rd49, 11; + ld.local.u64 %rd173, [%rd3+-72]; + add.s64 %rd174, %rd173, %rd172; + popc.b64 %r4965, %rd174; + cvt.u64.u32 %rd52, %r4965; + cvt.u64.u16 %rd175, %rs174; + and.b64 %rd176, %rd175, 255; + setp.ge.u64 %p29, %rd52, %rd176; + and.b16 %rs175, %rs174, 255; + mul.wide.u16 %r11663, %rs175, 32; + @%p29 bra $L__BB1_35; + +$L__BB1_34: + shr.u64 %rd228, %rd49, 11; + add.s64 %rd227, %rd173, %rd228; + popc.b64 %r11648, %rd227; + cvt.u64.u32 %rd226, %r11648; + add.s32 %r4966, %r11663, -64; + cvt.s64.s32 %rd177, %r4966; + add.s64 %rd178, %rd2, %rd177; + ld.local.u8 %r4967, [%rd3+2]; + ld.local.u8 %r4968, [%rd178+145]; + ld.local.u8 %r4969, [%rd178+146]; + prmt.b32 %r4970, %r4969, %r4968, 30212; + ld.local.u8 %r4971, [%rd178+147]; + prmt.b32 %r4972, %r4971, %r4970, 28756; + ld.local.u8 %r4973, [%rd178+148]; + prmt.b32 %r4974, %r4973, %r4972, 1620; + ld.local.u8 %r4975, [%rd178+149]; + ld.local.u8 %r4976, [%rd178+150]; + prmt.b32 %r4977, %r4976, %r4975, 30212; + ld.local.u8 %r4978, [%rd178+151]; + prmt.b32 %r4979, %r4978, %r4977, 28756; + ld.local.u8 %r4980, [%rd178+152]; + prmt.b32 %r4981, %r4980, %r4979, 1620; + ld.local.u8 %r4982, [%rd178+153]; + ld.local.u8 %r4983, [%rd178+154]; + prmt.b32 %r4984, %r4983, %r4982, 30212; + ld.local.u8 %r4985, [%rd178+155]; + prmt.b32 %r4986, %r4985, %r4984, 28756; + ld.local.u8 %r4987, [%rd178+156]; + prmt.b32 %r4988, %r4987, %r4986, 1620; + ld.local.u8 %r4989, [%rd178+157]; + ld.local.u8 
%r4990, [%rd178+158]; + prmt.b32 %r4991, %r4990, %r4989, 30212; + ld.local.u8 %r4992, [%rd178+159]; + prmt.b32 %r4993, %r4992, %r4991, 28756; + ld.local.u8 %r4994, [%rd178+160]; + prmt.b32 %r4995, %r4994, %r4993, 1620; + ld.local.u8 %r4996, [%rd178+161]; + ld.local.u8 %r4997, [%rd178+162]; + prmt.b32 %r4998, %r4997, %r4996, 30212; + ld.local.u8 %r4999, [%rd178+163]; + prmt.b32 %r5000, %r4999, %r4998, 28756; + ld.local.u8 %r5001, [%rd178+164]; + prmt.b32 %r5002, %r5001, %r5000, 1620; + ld.local.u8 %r5003, [%rd178+165]; + ld.local.u8 %r5004, [%rd178+166]; + prmt.b32 %r5005, %r5004, %r5003, 30212; + ld.local.u8 %r5006, [%rd178+167]; + prmt.b32 %r5007, %r5006, %r5005, 28756; + ld.local.u8 %r5008, [%rd178+168]; + prmt.b32 %r5009, %r5008, %r5007, 1620; + ld.local.u8 %r5010, [%rd178+169]; + ld.local.u8 %r5011, [%rd178+170]; + prmt.b32 %r5012, %r5011, %r5010, 30212; + ld.local.u8 %r5013, [%rd178+171]; + prmt.b32 %r5014, %r5013, %r5012, 28756; + ld.local.u8 %r5015, [%rd178+172]; + prmt.b32 %r5016, %r5015, %r5014, 1620; + ld.local.u8 %r5017, [%rd178+173]; + ld.local.u8 %r5018, [%rd178+174]; + prmt.b32 %r5019, %r5018, %r5017, 30212; + ld.local.u8 %r5020, [%rd178+175]; + prmt.b32 %r5021, %r5020, %r5019, 28756; + ld.local.u8 %r5022, [%rd178+176]; + prmt.b32 %r5023, %r5022, %r5021, 1620; + ld.local.u8 %r5024, [%rd178+177]; + ld.local.u8 %r5025, [%rd178+178]; + prmt.b32 %r5026, %r5025, %r5024, 30212; + ld.local.u8 %r5027, [%rd178+179]; + prmt.b32 %r5028, %r5027, %r5026, 28756; + ld.local.u8 %r5029, [%rd178+180]; + prmt.b32 %r5030, %r5029, %r5028, 1620; + ld.local.u8 %r5031, [%rd178+181]; + ld.local.u8 %r5032, [%rd178+182]; + prmt.b32 %r5033, %r5032, %r5031, 30212; + ld.local.u8 %r5034, [%rd178+183]; + prmt.b32 %r5035, %r5034, %r5033, 28756; + ld.local.u8 %r5036, [%rd178+184]; + prmt.b32 %r5037, %r5036, %r5035, 1620; + ld.local.u8 %r5038, [%rd178+185]; + ld.local.u8 %r5039, [%rd178+186]; + prmt.b32 %r5040, %r5039, %r5038, 30212; + ld.local.u8 %r5041, [%rd178+187]; + prmt.b32 %r5042, %r5041, %r5040, 28756; + ld.local.u8 %r5043, [%rd178+188]; + prmt.b32 %r5044, %r5043, %r5042, 1620; + ld.local.u8 %r5045, [%rd178+189]; + ld.local.u8 %r5046, [%rd178+190]; + prmt.b32 %r5047, %r5046, %r5045, 30212; + ld.local.u8 %r5048, [%rd178+191]; + prmt.b32 %r5049, %r5048, %r5047, 28756; + ld.local.u8 %r5050, [%rd178+192]; + prmt.b32 %r5051, %r5050, %r5049, 1620; + ld.local.u8 %r5052, [%rd178+193]; + ld.local.u8 %r5053, [%rd178+194]; + prmt.b32 %r5054, %r5053, %r5052, 30212; + ld.local.u8 %r5055, [%rd178+195]; + prmt.b32 %r5056, %r5055, %r5054, 28756; + ld.local.u8 %r5057, [%rd178+196]; + prmt.b32 %r5058, %r5057, %r5056, 1620; + ld.local.u8 %r5059, [%rd178+197]; + ld.local.u8 %r5060, [%rd178+198]; + prmt.b32 %r5061, %r5060, %r5059, 30212; + ld.local.u8 %r5062, [%rd178+199]; + prmt.b32 %r5063, %r5062, %r5061, 28756; + ld.local.u8 %r5064, [%rd178+200]; + prmt.b32 %r5065, %r5064, %r5063, 1620; + ld.local.u8 %r5066, [%rd178+201]; + ld.local.u8 %r5067, [%rd178+202]; + prmt.b32 %r5068, %r5067, %r5066, 30212; + ld.local.u8 %r5069, [%rd178+203]; + prmt.b32 %r5070, %r5069, %r5068, 28756; + ld.local.u8 %r5071, [%rd178+204]; + prmt.b32 %r5072, %r5071, %r5070, 1620; + ld.local.u8 %r5073, [%rd178+205]; + ld.local.u8 %r5074, [%rd178+206]; + prmt.b32 %r5075, %r5074, %r5073, 30212; + ld.local.u8 %r5076, [%rd178+207]; + prmt.b32 %r5077, %r5076, %r5075, 28756; + ld.local.u8 %r5078, [%rd178+208]; + prmt.b32 %r5079, %r5078, %r5077, 1620; + or.b32 %r5080, %r4967, 4; + ld.local.u8 %r5081, [%rd3+-120]; + ld.local.u8 %r5082, [%rd3+-119]; + 
prmt.b32 %r5083, %r5082, %r5081, 30212; + ld.local.u8 %r5084, [%rd3+-118]; + ld.local.u8 %r5085, [%rd3+-117]; + prmt.b32 %r5086, %r5085, %r5084, 30212; + prmt.b32 %r5087, %r5086, %r5083, 4180; + ld.local.u8 %r5088, [%rd3+-136]; + ld.local.u8 %r5089, [%rd3+-135]; + prmt.b32 %r5090, %r5089, %r5088, 30212; + ld.local.u8 %r5091, [%rd3+-134]; + ld.local.u8 %r5092, [%rd3+-133]; + prmt.b32 %r5093, %r5092, %r5091, 30212; + prmt.b32 %r5094, %r5093, %r5090, 4180; + add.s32 %r5095, %r5087, %r5094; + add.s32 %r5096, %r5095, %r4974; + shf.l.wrap.b32 %r5097, %r5096, %r5096, 16; + add.s32 %r5098, %r5097, 1779033703; + xor.b32 %r5099, %r5098, %r5087; + shf.l.wrap.b32 %r5100, %r5099, %r5099, 20; + add.s32 %r5101, %r4981, %r5096; + add.s32 %r5102, %r5101, %r5100; + xor.b32 %r5103, %r5102, %r5097; + shf.l.wrap.b32 %r5104, %r5103, %r5103, 24; + add.s32 %r5105, %r5104, %r5098; + xor.b32 %r5106, %r5105, %r5100; + shf.l.wrap.b32 %r5107, %r5106, %r5106, 25; + ld.local.u8 %r5108, [%rd3+-116]; + ld.local.u8 %r5109, [%rd3+-115]; + prmt.b32 %r5110, %r5109, %r5108, 30212; + ld.local.u8 %r5111, [%rd3+-114]; + ld.local.u8 %r5112, [%rd3+-113]; + prmt.b32 %r5113, %r5112, %r5111, 30212; + prmt.b32 %r5114, %r5113, %r5110, 4180; + ld.local.u8 %r5115, [%rd3+-132]; + ld.local.u8 %r5116, [%rd3+-131]; + prmt.b32 %r5117, %r5116, %r5115, 30212; + ld.local.u8 %r5118, [%rd3+-130]; + ld.local.u8 %r5119, [%rd3+-129]; + prmt.b32 %r5120, %r5119, %r5118, 30212; + prmt.b32 %r5121, %r5120, %r5117, 4180; + add.s32 %r5122, %r5114, %r5121; + add.s32 %r5123, %r5122, %r4988; + shf.l.wrap.b32 %r5124, %r5123, %r5123, 16; + add.s32 %r5125, %r5124, -1150833019; + xor.b32 %r5126, %r5125, %r5114; + shf.l.wrap.b32 %r5127, %r5126, %r5126, 20; + add.s32 %r5128, %r4995, %r5123; + add.s32 %r5129, %r5128, %r5127; + xor.b32 %r5130, %r5129, %r5124; + shf.l.wrap.b32 %r5131, %r5130, %r5130, 24; + add.s32 %r5132, %r5131, %r5125; + xor.b32 %r5133, %r5132, %r5127; + shf.l.wrap.b32 %r5134, %r5133, %r5133, 25; + ld.local.u8 %r5135, [%rd3+-112]; + ld.local.u8 %r5136, [%rd3+-111]; + prmt.b32 %r5137, %r5136, %r5135, 30212; + ld.local.u8 %r5138, [%rd3+-110]; + ld.local.u8 %r5139, [%rd3+-109]; + prmt.b32 %r5140, %r5139, %r5138, 30212; + prmt.b32 %r5141, %r5140, %r5137, 4180; + ld.local.u8 %r5142, [%rd3+-128]; + ld.local.u8 %r5143, [%rd3+-127]; + prmt.b32 %r5144, %r5143, %r5142, 30212; + ld.local.u8 %r5145, [%rd3+-126]; + ld.local.u8 %r5146, [%rd3+-125]; + prmt.b32 %r5147, %r5146, %r5145, 30212; + prmt.b32 %r5148, %r5147, %r5144, 4180; + add.s32 %r5149, %r5141, %r5148; + add.s32 %r5150, %r5149, %r5002; + shr.u32 %r5151, %r5150, 16; + shl.b32 %r5152, %r5150, 16; + xor.b32 %r5153, %r5152, 4194304; + or.b32 %r5154, %r5153, %r5151; + add.s32 %r5155, %r5154, 1013904242; + xor.b32 %r5156, %r5155, %r5141; + shf.l.wrap.b32 %r5157, %r5156, %r5156, 20; + add.s32 %r5158, %r5009, %r5150; + add.s32 %r5159, %r5158, %r5157; + xor.b32 %r5160, %r5159, %r5154; + shf.l.wrap.b32 %r5161, %r5160, %r5160, 24; + add.s32 %r5162, %r5161, %r5155; + xor.b32 %r5163, %r5162, %r5157; + shf.l.wrap.b32 %r5164, %r5163, %r5163, 25; + ld.local.u8 %r5165, [%rd3+-108]; + ld.local.u8 %r5166, [%rd3+-107]; + prmt.b32 %r5167, %r5166, %r5165, 30212; + ld.local.u8 %r5168, [%rd3+-106]; + ld.local.u8 %r5169, [%rd3+-105]; + prmt.b32 %r5170, %r5169, %r5168, 30212; + prmt.b32 %r5171, %r5170, %r5167, 4180; + ld.local.u8 %r5172, [%rd3+-124]; + ld.local.u8 %r5173, [%rd3+-123]; + prmt.b32 %r5174, %r5173, %r5172, 30212; + ld.local.u8 %r5175, [%rd3+-122]; + ld.local.u8 %r5176, [%rd3+-121]; + prmt.b32 %r5177, %r5176, %r5175, 
30212; + prmt.b32 %r5178, %r5177, %r5174, 4180; + add.s32 %r5179, %r5171, %r5178; + add.s32 %r5180, %r5179, %r5016; + xor.b32 %r5181, %r5180, %r5080; + shr.u32 %r5182, %r5180, 16; + shl.b32 %r5183, %r5181, 16; + or.b32 %r5184, %r5183, %r5182; + add.s32 %r5185, %r5184, -1521486534; + xor.b32 %r5186, %r5185, %r5171; + shf.l.wrap.b32 %r5187, %r5186, %r5186, 20; + add.s32 %r5188, %r5023, %r5180; + add.s32 %r5189, %r5188, %r5187; + xor.b32 %r5190, %r5189, %r5184; + shf.l.wrap.b32 %r5191, %r5190, %r5190, 24; + add.s32 %r5192, %r5191, %r5185; + xor.b32 %r5193, %r5192, %r5187; + shf.l.wrap.b32 %r5194, %r5193, %r5193, 25; + add.s32 %r5195, %r5134, %r5102; + add.s32 %r5196, %r5195, %r5030; + xor.b32 %r5197, %r5191, %r5196; + shf.l.wrap.b32 %r5198, %r5197, %r5197, 16; + add.s32 %r5199, %r5198, %r5162; + xor.b32 %r5200, %r5199, %r5134; + shf.l.wrap.b32 %r5201, %r5200, %r5200, 20; + add.s32 %r5202, %r5037, %r5196; + add.s32 %r5203, %r5202, %r5201; + xor.b32 %r5204, %r5203, %r5198; + shf.l.wrap.b32 %r5205, %r5204, %r5204, 24; + add.s32 %r5206, %r5205, %r5199; + xor.b32 %r5207, %r5206, %r5201; + shf.l.wrap.b32 %r5208, %r5207, %r5207, 25; + add.s32 %r5209, %r5164, %r5129; + add.s32 %r5210, %r5209, %r5044; + xor.b32 %r5211, %r5210, %r5104; + shf.l.wrap.b32 %r5212, %r5211, %r5211, 16; + add.s32 %r5213, %r5212, %r5192; + xor.b32 %r5214, %r5213, %r5164; + shf.l.wrap.b32 %r5215, %r5214, %r5214, 20; + add.s32 %r5216, %r5051, %r5210; + add.s32 %r5217, %r5216, %r5215; + xor.b32 %r5218, %r5217, %r5212; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 24; + add.s32 %r5220, %r5219, %r5213; + xor.b32 %r5221, %r5220, %r5215; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 25; + add.s32 %r5223, %r5194, %r5159; + add.s32 %r5224, %r5223, %r5058; + xor.b32 %r5225, %r5224, %r5131; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 16; + add.s32 %r5227, %r5226, %r5105; + xor.b32 %r5228, %r5227, %r5194; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 20; + add.s32 %r5230, %r5065, %r5224; + add.s32 %r5231, %r5230, %r5229; + xor.b32 %r5232, %r5231, %r5226; + shf.l.wrap.b32 %r5233, %r5232, %r5232, 24; + add.s32 %r5234, %r5233, %r5227; + xor.b32 %r5235, %r5234, %r5229; + shf.l.wrap.b32 %r5236, %r5235, %r5235, 25; + add.s32 %r5237, %r5189, %r5107; + add.s32 %r5238, %r5237, %r5072; + xor.b32 %r5239, %r5238, %r5161; + shf.l.wrap.b32 %r5240, %r5239, %r5239, 16; + add.s32 %r5241, %r5240, %r5132; + xor.b32 %r5242, %r5241, %r5107; + shf.l.wrap.b32 %r5243, %r5242, %r5242, 20; + add.s32 %r5244, %r5079, %r5238; + add.s32 %r5245, %r5244, %r5243; + xor.b32 %r5246, %r5245, %r5240; + shf.l.wrap.b32 %r5247, %r5246, %r5246, 24; + add.s32 %r5248, %r5247, %r5241; + xor.b32 %r5249, %r5248, %r5243; + shf.l.wrap.b32 %r5250, %r5249, %r5249, 25; + add.s32 %r5251, %r5203, %r4988; + add.s32 %r5252, %r5251, %r5250; + xor.b32 %r5253, %r5252, %r5219; + shf.l.wrap.b32 %r5254, %r5253, %r5253, 16; + add.s32 %r5255, %r5254, %r5234; + xor.b32 %r5256, %r5255, %r5250; + shf.l.wrap.b32 %r5257, %r5256, %r5256, 20; + add.s32 %r5258, %r5252, %r5016; + add.s32 %r5259, %r5258, %r5257; + xor.b32 %r5260, %r5259, %r5254; + shf.l.wrap.b32 %r5261, %r5260, %r5260, 24; + add.s32 %r5262, %r5261, %r5255; + xor.b32 %r5263, %r5262, %r5257; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 25; + add.s32 %r5265, %r5217, %r4995; + add.s32 %r5266, %r5265, %r5208; + xor.b32 %r5267, %r5233, %r5266; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 16; + add.s32 %r5269, %r5248, %r5268; + xor.b32 %r5270, %r5269, %r5208; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 20; + add.s32 %r5272, %r5266, %r5044; + add.s32 %r5273, %r5272, %r5271; + 
xor.b32 %r5274, %r5273, %r5268; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 24; + add.s32 %r5276, %r5275, %r5269; + xor.b32 %r5277, %r5276, %r5271; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 25; + add.s32 %r5279, %r5222, %r5023; + add.s32 %r5280, %r5279, %r5231; + xor.b32 %r5281, %r5247, %r5280; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 16; + add.s32 %r5283, %r5282, %r5206; + xor.b32 %r5284, %r5283, %r5222; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 20; + add.s32 %r5286, %r5280, %r4974; + add.s32 %r5287, %r5286, %r5285; + xor.b32 %r5288, %r5287, %r5282; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 24; + add.s32 %r5290, %r5289, %r5283; + xor.b32 %r5291, %r5290, %r5285; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 25; + add.s32 %r5293, %r5236, %r5002; + add.s32 %r5294, %r5293, %r5245; + xor.b32 %r5295, %r5294, %r5205; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 16; + add.s32 %r5297, %r5296, %r5220; + xor.b32 %r5298, %r5297, %r5236; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 20; + add.s32 %r5300, %r5294, %r5065; + add.s32 %r5301, %r5300, %r5299; + xor.b32 %r5302, %r5301, %r5296; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 24; + add.s32 %r5304, %r5303, %r5297; + xor.b32 %r5305, %r5304, %r5299; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 25; + add.s32 %r5307, %r5278, %r4981; + add.s32 %r5308, %r5307, %r5259; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 16; + add.s32 %r5311, %r5310, %r5290; + xor.b32 %r5312, %r5311, %r5278; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 20; + add.s32 %r5314, %r5308, %r5051; + add.s32 %r5315, %r5314, %r5313; + xor.b32 %r5316, %r5315, %r5310; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 24; + add.s32 %r5318, %r5317, %r5311; + xor.b32 %r5319, %r5318, %r5313; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 25; + add.s32 %r5321, %r5273, %r5058; + add.s32 %r5322, %r5321, %r5292; + xor.b32 %r5323, %r5261, %r5322; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 16; + add.s32 %r5325, %r5324, %r5304; + xor.b32 %r5326, %r5325, %r5292; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 20; + add.s32 %r5328, %r5322, %r5009; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5324; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 24; + add.s32 %r5332, %r5331, %r5325; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 25; + add.s32 %r5335, %r5287, %r5037; + add.s32 %r5336, %r5335, %r5306; + xor.b32 %r5337, %r5336, %r5275; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 16; + add.s32 %r5339, %r5338, %r5262; + xor.b32 %r5340, %r5339, %r5306; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 20; + add.s32 %r5342, %r5336, %r5072; + add.s32 %r5343, %r5342, %r5341; + xor.b32 %r5344, %r5343, %r5338; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 24; + add.s32 %r5346, %r5345, %r5339; + xor.b32 %r5347, %r5346, %r5341; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 25; + add.s32 %r5349, %r5301, %r5079; + add.s32 %r5350, %r5349, %r5264; + xor.b32 %r5351, %r5350, %r5289; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 16; + add.s32 %r5353, %r5352, %r5276; + xor.b32 %r5354, %r5353, %r5264; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 20; + add.s32 %r5356, %r5350, %r5030; + add.s32 %r5357, %r5356, %r5355; + xor.b32 %r5358, %r5357, %r5352; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 24; + add.s32 %r5360, %r5359, %r5353; + xor.b32 %r5361, %r5360, %r5355; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 25; + add.s32 %r5363, %r5315, %r4995; + add.s32 %r5364, %r5363, %r5362; + xor.b32 %r5365, %r5364, %r5331; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 16; + add.s32 %r5367, %r5366, %r5346; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, 
%r5368, %r5368, 20; + add.s32 %r5370, %r5364, %r5002; + add.s32 %r5371, %r5370, %r5369; + xor.b32 %r5372, %r5371, %r5366; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 24; + add.s32 %r5374, %r5373, %r5367; + xor.b32 %r5375, %r5374, %r5369; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 25; + add.s32 %r5377, %r5329, %r5044; + add.s32 %r5378, %r5377, %r5320; + xor.b32 %r5379, %r5378, %r5345; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 16; + add.s32 %r5381, %r5380, %r5360; + xor.b32 %r5382, %r5381, %r5320; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 20; + add.s32 %r5384, %r5378, %r5058; + add.s32 %r5385, %r5384, %r5383; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 24; + add.s32 %r5388, %r5387, %r5381; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 25; + add.s32 %r5391, %r5343, %r5065; + add.s32 %r5392, %r5391, %r5334; + xor.b32 %r5393, %r5359, %r5392; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 16; + add.s32 %r5395, %r5394, %r5318; + xor.b32 %r5396, %r5395, %r5334; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 20; + add.s32 %r5398, %r5392, %r4988; + add.s32 %r5399, %r5398, %r5397; + xor.b32 %r5400, %r5399, %r5394; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 24; + add.s32 %r5402, %r5401, %r5395; + xor.b32 %r5403, %r5402, %r5397; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 25; + add.s32 %r5405, %r5348, %r5023; + add.s32 %r5406, %r5405, %r5357; + xor.b32 %r5407, %r5406, %r5317; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 16; + add.s32 %r5409, %r5408, %r5332; + xor.b32 %r5410, %r5409, %r5348; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 20; + add.s32 %r5412, %r5406, %r5072; + add.s32 %r5413, %r5412, %r5411; + xor.b32 %r5414, %r5413, %r5408; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 24; + add.s32 %r5416, %r5415, %r5409; + xor.b32 %r5417, %r5416, %r5411; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 25; + add.s32 %r5419, %r5371, %r5016; + add.s32 %r5420, %r5419, %r5390; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 16; + add.s32 %r5423, %r5422, %r5402; + xor.b32 %r5424, %r5423, %r5390; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 20; + add.s32 %r5426, %r5420, %r5009; + add.s32 %r5427, %r5426, %r5425; + xor.b32 %r5428, %r5427, %r5422; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 24; + add.s32 %r5430, %r5429, %r5423; + xor.b32 %r5431, %r5430, %r5425; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 25; + add.s32 %r5433, %r5385, %r5037; + add.s32 %r5434, %r5433, %r5404; + xor.b32 %r5435, %r5373, %r5434; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 16; + add.s32 %r5437, %r5436, %r5416; + xor.b32 %r5438, %r5437, %r5404; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 20; + add.s32 %r5440, %r5434, %r4974; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5436; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 24; + add.s32 %r5444, %r5443, %r5437; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 25; + add.s32 %r5447, %r5399, %r5051; + add.s32 %r5448, %r5447, %r5418; + xor.b32 %r5449, %r5448, %r5387; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 16; + add.s32 %r5451, %r5450, %r5374; + xor.b32 %r5452, %r5451, %r5418; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 20; + add.s32 %r5454, %r5448, %r5079; + add.s32 %r5455, %r5454, %r5453; + xor.b32 %r5456, %r5455, %r5450; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 24; + add.s32 %r5458, %r5457, %r5451; + xor.b32 %r5459, %r5458, %r5453; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 25; + add.s32 %r5461, %r5413, %r5030; + add.s32 %r5462, %r5461, %r5376; + xor.b32 %r5463, %r5462, %r5401; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 16; + 
add.s32 %r5465, %r5464, %r5388; + xor.b32 %r5466, %r5465, %r5376; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 20; + add.s32 %r5468, %r5462, %r4981; + add.s32 %r5469, %r5468, %r5467; + xor.b32 %r5470, %r5469, %r5464; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 24; + add.s32 %r5472, %r5471, %r5465; + xor.b32 %r5473, %r5472, %r5467; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 25; + add.s32 %r5475, %r5427, %r5044; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5443; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 16; + add.s32 %r5479, %r5478, %r5458; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 20; + add.s32 %r5482, %r5476, %r5023; + add.s32 %r5483, %r5482, %r5481; + xor.b32 %r5484, %r5483, %r5478; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 24; + add.s32 %r5486, %r5485, %r5479; + xor.b32 %r5487, %r5486, %r5481; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 25; + add.s32 %r5489, %r5441, %r5058; + add.s32 %r5490, %r5489, %r5432; + xor.b32 %r5491, %r5490, %r5457; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 16; + add.s32 %r5493, %r5492, %r5472; + xor.b32 %r5494, %r5493, %r5432; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 20; + add.s32 %r5496, %r5490, %r5037; + add.s32 %r5497, %r5496, %r5495; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 24; + add.s32 %r5500, %r5499, %r5493; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 25; + add.s32 %r5503, %r5455, %r5072; + add.s32 %r5504, %r5503, %r5446; + xor.b32 %r5505, %r5471, %r5504; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 16; + add.s32 %r5507, %r5506, %r5430; + xor.b32 %r5508, %r5507, %r5446; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 20; + add.s32 %r5510, %r5504, %r4995; + add.s32 %r5511, %r5510, %r5509; + xor.b32 %r5512, %r5511, %r5506; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 24; + add.s32 %r5514, %r5513, %r5507; + xor.b32 %r5515, %r5514, %r5509; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 25; + add.s32 %r5517, %r5469, %r5065; + add.s32 %r5518, %r5517, %r5460; + xor.b32 %r5519, %r5518, %r5429; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 16; + add.s32 %r5521, %r5520, %r5444; + xor.b32 %r5522, %r5521, %r5460; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 20; + add.s32 %r5524, %r5518, %r5079; + add.s32 %r5525, %r5524, %r5523; + xor.b32 %r5526, %r5525, %r5520; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 24; + add.s32 %r5528, %r5527, %r5521; + xor.b32 %r5529, %r5528, %r5523; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 25; + add.s32 %r5531, %r5483, %r5002; + add.s32 %r5532, %r5531, %r5502; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 16; + add.s32 %r5535, %r5534, %r5514; + xor.b32 %r5536, %r5535, %r5502; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 20; + add.s32 %r5538, %r5532, %r4974; + add.s32 %r5539, %r5538, %r5537; + xor.b32 %r5540, %r5539, %r5534; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 24; + add.s32 %r5542, %r5541, %r5535; + xor.b32 %r5543, %r5542, %r5537; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 25; + add.s32 %r5545, %r5497, %r5051; + add.s32 %r5546, %r5545, %r5516; + xor.b32 %r5547, %r5485, %r5546; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 16; + add.s32 %r5549, %r5548, %r5528; + xor.b32 %r5550, %r5549, %r5516; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 20; + add.s32 %r5552, %r5546, %r4988; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5548; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 24; + add.s32 %r5556, %r5555, %r5549; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 25; + add.s32 %r5559, %r5511, %r5009; + add.s32 %r5560, %r5559, 
%r5530; + xor.b32 %r5561, %r5560, %r5499; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 16; + add.s32 %r5563, %r5562, %r5486; + xor.b32 %r5564, %r5563, %r5530; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 20; + add.s32 %r5566, %r5560, %r5030; + add.s32 %r5567, %r5566, %r5565; + xor.b32 %r5568, %r5567, %r5562; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 24; + add.s32 %r5570, %r5569, %r5563; + xor.b32 %r5571, %r5570, %r5565; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 25; + add.s32 %r5573, %r5525, %r4981; + add.s32 %r5574, %r5573, %r5488; + xor.b32 %r5575, %r5574, %r5513; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 16; + add.s32 %r5577, %r5576, %r5500; + xor.b32 %r5578, %r5577, %r5488; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 20; + add.s32 %r5580, %r5574, %r5016; + add.s32 %r5581, %r5580, %r5579; + xor.b32 %r5582, %r5581, %r5576; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 24; + add.s32 %r5584, %r5583, %r5577; + xor.b32 %r5585, %r5584, %r5579; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 25; + add.s32 %r5587, %r5539, %r5058; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5555; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 16; + add.s32 %r5591, %r5590, %r5570; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 20; + add.s32 %r5594, %r5588, %r5065; + add.s32 %r5595, %r5594, %r5593; + xor.b32 %r5596, %r5595, %r5590; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 24; + add.s32 %r5598, %r5597, %r5591; + xor.b32 %r5599, %r5598, %r5593; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 25; + add.s32 %r5601, %r5553, %r5037; + add.s32 %r5602, %r5601, %r5544; + xor.b32 %r5603, %r5602, %r5569; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 16; + add.s32 %r5605, %r5604, %r5584; + xor.b32 %r5606, %r5605, %r5544; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 20; + add.s32 %r5608, %r5602, %r5051; + add.s32 %r5609, %r5608, %r5607; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 24; + add.s32 %r5612, %r5611, %r5605; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 25; + add.s32 %r5615, %r5567, %r5079; + add.s32 %r5616, %r5615, %r5558; + xor.b32 %r5617, %r5583, %r5616; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 16; + add.s32 %r5619, %r5618, %r5542; + xor.b32 %r5620, %r5619, %r5558; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 20; + add.s32 %r5622, %r5616, %r5044; + add.s32 %r5623, %r5622, %r5621; + xor.b32 %r5624, %r5623, %r5618; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 24; + add.s32 %r5626, %r5625, %r5619; + xor.b32 %r5627, %r5626, %r5621; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 25; + add.s32 %r5629, %r5581, %r5072; + add.s32 %r5630, %r5629, %r5572; + xor.b32 %r5631, %r5630, %r5541; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 16; + add.s32 %r5633, %r5632, %r5556; + xor.b32 %r5634, %r5633, %r5572; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 20; + add.s32 %r5636, %r5630, %r5030; + add.s32 %r5637, %r5636, %r5635; + xor.b32 %r5638, %r5637, %r5632; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 24; + add.s32 %r5640, %r5639, %r5633; + xor.b32 %r5641, %r5640, %r5635; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 25; + add.s32 %r5643, %r5595, %r5023; + add.s32 %r5644, %r5643, %r5614; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 16; + add.s32 %r5647, %r5646, %r5626; + xor.b32 %r5648, %r5647, %r5614; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 20; + add.s32 %r5650, %r5644, %r4988; + add.s32 %r5651, %r5650, %r5649; + xor.b32 %r5652, %r5651, %r5646; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 24; + add.s32 %r5654, %r5653, %r5647; + xor.b32 %r5655, %r5654, %r5649; + shf.l.wrap.b32 
%r5656, %r5655, %r5655, 25; + add.s32 %r5657, %r5609, %r5009; + add.s32 %r5658, %r5657, %r5628; + xor.b32 %r5659, %r5597, %r5658; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 16; + add.s32 %r5661, %r5660, %r5640; + xor.b32 %r5662, %r5661, %r5628; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 20; + add.s32 %r5664, %r5658, %r4995; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5660; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 24; + add.s32 %r5668, %r5667, %r5661; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 25; + add.s32 %r5671, %r5623, %r4974; + add.s32 %r5672, %r5671, %r5642; + xor.b32 %r5673, %r5672, %r5611; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 16; + add.s32 %r5675, %r5674, %r5598; + xor.b32 %r5676, %r5675, %r5642; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 20; + add.s32 %r5678, %r5672, %r4981; + add.s32 %r5679, %r5678, %r5677; + xor.b32 %r5680, %r5679, %r5674; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 24; + add.s32 %r5682, %r5681, %r5675; + xor.b32 %r5683, %r5682, %r5677; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 25; + add.s32 %r5685, %r5637, %r5016; + add.s32 %r5686, %r5685, %r5600; + xor.b32 %r5687, %r5686, %r5625; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 16; + add.s32 %r5689, %r5688, %r5612; + xor.b32 %r5690, %r5689, %r5600; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 20; + add.s32 %r5692, %r5686, %r5002; + add.s32 %r5693, %r5692, %r5691; + xor.b32 %r5694, %r5693, %r5688; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 24; + add.s32 %r5696, %r5695, %r5689; + xor.b32 %r5697, %r5696, %r5691; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 25; + add.s32 %r5699, %r5651, %r5037; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5667; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 16; + add.s32 %r5703, %r5702, %r5682; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 20; + add.s32 %r5706, %r5700, %r5072; + add.s32 %r5707, %r5706, %r5705; + xor.b32 %r5708, %r5707, %r5702; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 24; + add.s32 %r5710, %r5709, %r5703; + xor.b32 %r5711, %r5710, %r5705; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 25; + add.s32 %r5713, %r5665, %r5051; + add.s32 %r5714, %r5713, %r5656; + xor.b32 %r5715, %r5714, %r5681; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 16; + add.s32 %r5717, %r5716, %r5696; + xor.b32 %r5718, %r5717, %r5656; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 20; + add.s32 %r5720, %r5714, %r5009; + add.s32 %r5721, %r5720, %r5719; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 24; + add.s32 %r5724, %r5723, %r5717; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 25; + add.s32 %r5727, %r5679, %r5030; + add.s32 %r5728, %r5727, %r5670; + xor.b32 %r5729, %r5695, %r5728; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 16; + add.s32 %r5731, %r5730, %r5654; + xor.b32 %r5732, %r5731, %r5670; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 20; + add.s32 %r5734, %r5728, %r5058; + add.s32 %r5735, %r5734, %r5733; + xor.b32 %r5736, %r5735, %r5730; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 24; + add.s32 %r5738, %r5737, %r5731; + xor.b32 %r5739, %r5738, %r5733; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 25; + add.s32 %r5741, %r5693, %r5079; + add.s32 %r5742, %r5741, %r5684; + xor.b32 %r5743, %r5742, %r5653; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 16; + add.s32 %r5745, %r5744, %r5668; + xor.b32 %r5746, %r5745, %r5684; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 20; + add.s32 %r5748, %r5742, %r4981; + add.s32 %r5749, %r5748, %r5747; + xor.b32 %r5750, %r5749, %r5744; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 24; 
+ add.s32 %r5752, %r5751, %r5745; + xor.b32 %r5753, %r5752, %r5747; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 25; + add.s32 %r5755, %r5707, %r5065; + add.s32 %r5756, %r5755, %r5726; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 16; + add.s32 %r5759, %r5758, %r5738; + xor.b32 %r5760, %r5759, %r5726; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 20; + add.s32 %r5762, %r5756, %r4995; + add.s32 %r5763, %r5762, %r5761; + xor.b32 %r5764, %r5763, %r5758; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 24; + add.s32 %r5766, %r5765, %r5759; + xor.b32 %r5767, %r5766, %r5761; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 25; + add.s32 %r5769, %r5721, %r4974; + add.s32 %r5770, %r5769, %r5740; + xor.b32 %r5771, %r5709, %r5770; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 16; + add.s32 %r5773, %r5772, %r5752; + xor.b32 %r5774, %r5773, %r5740; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 20; + add.s32 %r5776, %r5770, %r5044; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5772; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 24; + add.s32 %r5780, %r5779, %r5773; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 25; + add.s32 %r5783, %r5735, %r4988; + add.s32 %r5784, %r5783, %r5754; + xor.b32 %r5785, %r5784, %r5723; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 16; + add.s32 %r5787, %r5786, %r5710; + xor.b32 %r5788, %r5787, %r5754; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 20; + add.s32 %r5790, %r5784, %r5016; + add.s32 %r5791, %r5790, %r5789; + xor.b32 %r5792, %r5791, %r5786; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 24; + add.s32 %r5794, %r5793, %r5787; + xor.b32 %r5795, %r5794, %r5789; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 25; + add.s32 %r5797, %r5749, %r5002; + add.s32 %r5798, %r5797, %r5712; + xor.b32 %r5799, %r5798, %r5737; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 16; + add.s32 %r5801, %r5800, %r5724; + xor.b32 %r5802, %r5801, %r5712; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 20; + add.s32 %r5804, %r5798, %r5023; + add.s32 %r5805, %r5804, %r5803; + xor.b32 %r5806, %r5805, %r5800; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 24; + add.s32 %r5808, %r5807, %r5801; + xor.b32 %r5809, %r5808, %r5803; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 25; + add.s32 %r5811, %r5763, %r5051; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5779; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 16; + add.s32 %r5815, %r5814, %r5794; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 20; + add.s32 %r5818, %r5812, %r5079; + add.s32 %r5819, %r5818, %r5817; + xor.b32 %r5820, %r5819, %r5814; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 24; + add.s32 %r5822, %r5821, %r5815; + xor.b32 %r5823, %r5822, %r5817; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 25; + add.s32 %r5825, %r5777, %r5009; + add.s32 %r5826, %r5825, %r5768; + xor.b32 %r5827, %r5826, %r5793; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 16; + add.s32 %r5829, %r5828, %r5808; + xor.b32 %r5830, %r5829, %r5768; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 20; + add.s32 %r5832, %r5826, %r4974; + add.s32 %r5833, %r5832, %r5831; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 24; + add.s32 %r5836, %r5835, %r5829; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 25; + add.s32 %r5839, %r5791, %r4981; + add.s32 %r5840, %r5839, %r5782; + xor.b32 %r5841, %r5807, %r5840; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 16; + add.s32 %r5843, %r5842, %r5766; + xor.b32 %r5844, %r5843, %r5782; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 20; + add.s32 %r5846, %r5840, %r5037; + add.s32 %r5847, %r5846, 
%r5845; + xor.b32 %r5848, %r5847, %r5842; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 24; + add.s32 %r5850, %r5849, %r5843; + xor.b32 %r5851, %r5850, %r5845; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 25; + add.s32 %r5853, %r5805, %r5030; + add.s32 %r5854, %r5853, %r5796; + xor.b32 %r5855, %r5854, %r5765; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 16; + add.s32 %r5857, %r5856, %r5780; + xor.b32 %r5858, %r5857, %r5796; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 20; + add.s32 %r5860, %r5854, %r5016; + add.s32 %r5861, %r5860, %r5859; + xor.b32 %r5862, %r5861, %r5856; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 24; + add.s32 %r5864, %r5863, %r5857; + xor.b32 %r5865, %r5864, %r5859; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 25; + add.s32 %r5867, %r5819, %r5072; + add.s32 %r5868, %r5867, %r5838; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 16; + add.s32 %r5871, %r5870, %r5850; + xor.b32 %r5872, %r5871, %r5838; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 20; + add.s32 %r5874, %r5868, %r5044; + add.s32 %r5875, %r5874, %r5873; + xor.b32 %r5876, %r5875, %r5870; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 24; + add.s32 %r5878, %r5877, %r5871; + xor.b32 %r5879, %r5878, %r5873; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 25; + add.s32 %r5881, %r5833, %r4988; + add.s32 %r5882, %r5881, %r5852; + xor.b32 %r5883, %r5821, %r5882; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 16; + add.s32 %r5885, %r5884, %r5864; + xor.b32 %r5886, %r5885, %r5852; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 20; + add.s32 %r5888, %r5882, %r5058; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5884; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 24; + add.s32 %r5892, %r5891, %r5885; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 25; + add.s32 %r5895, %r5847, %r4995; + add.s32 %r5896, %r5895, %r5866; + xor.b32 %r5897, %r5896, %r5835; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 16; + add.s32 %r5899, %r5898, %r5822; + xor.b32 %r5900, %r5899, %r5866; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 20; + add.s32 %r5902, %r5896, %r5002; + add.s32 %r5903, %r5902, %r5901; + xor.b32 %r5904, %r5903, %r5898; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 24; + add.s32 %r5906, %r5905, %r5899; + xor.b32 %r5907, %r5906, %r5901; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 25; + add.s32 %r5909, %r5861, %r5023; + add.s32 %r5910, %r5909, %r5824; + xor.b32 %r5911, %r5910, %r5849; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 16; + add.s32 %r5913, %r5912, %r5836; + xor.b32 %r5914, %r5913, %r5824; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 20; + add.s32 %r5916, %r5910, %r5065; + add.s32 %r5917, %r5916, %r5915; + xor.b32 %r5918, %r5917, %r5912; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 24; + add.s32 %r5920, %r5919, %r5913; + xor.b32 %r5921, %r5920, %r5915; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 25; + xor.b32 %r5923, %r5906, %r5875; + xor.b32 %r5924, %r5920, %r5889; + xor.b32 %r5925, %r5878, %r5903; + xor.b32 %r5926, %r5917, %r5892; + xor.b32 %r5927, %r5922, %r5891; + xor.b32 %r5928, %r5880, %r5905; + xor.b32 %r5929, %r5919, %r5894; + xor.b32 %r5930, %r5908, %r5877; + st.local.u8 [%rd178+145], %r5923; + shr.u32 %r5931, %r5923, 8; + st.local.u8 [%rd178+146], %r5931; + shr.u32 %r5932, %r5923, 16; + st.local.u8 [%rd178+147], %r5932; + shr.u32 %r5933, %r5923, 24; + st.local.u8 [%rd178+148], %r5933; + st.local.u8 [%rd178+149], %r5924; + shr.u32 %r5934, %r5924, 8; + st.local.u8 [%rd178+150], %r5934; + shr.u32 %r5935, %r5924, 16; + st.local.u8 [%rd178+151], %r5935; + shr.u32 %r5936, %r5924, 24; + st.local.u8 [%rd178+152], %r5936; + st.local.u8 
[%rd178+153], %r5925; + shr.u32 %r5937, %r5925, 8; + st.local.u8 [%rd178+154], %r5937; + shr.u32 %r5938, %r5925, 16; + st.local.u8 [%rd178+155], %r5938; + shr.u32 %r5939, %r5925, 24; + st.local.u8 [%rd178+156], %r5939; + st.local.u8 [%rd178+157], %r5926; + shr.u32 %r5940, %r5926, 8; + st.local.u8 [%rd178+158], %r5940; + shr.u32 %r5941, %r5926, 16; + st.local.u8 [%rd178+159], %r5941; + shr.u32 %r5942, %r5926, 24; + st.local.u8 [%rd178+160], %r5942; + st.local.u8 [%rd178+161], %r5927; + shr.u32 %r5943, %r5927, 8; + st.local.u8 [%rd178+162], %r5943; + shr.u32 %r5944, %r5927, 16; + st.local.u8 [%rd178+163], %r5944; + shr.u32 %r5945, %r5927, 24; + st.local.u8 [%rd178+164], %r5945; + st.local.u8 [%rd178+165], %r5928; + shr.u32 %r5946, %r5928, 8; + st.local.u8 [%rd178+166], %r5946; + shr.u32 %r5947, %r5928, 16; + st.local.u8 [%rd178+167], %r5947; + shr.u32 %r5948, %r5928, 24; + st.local.u8 [%rd178+168], %r5948; + st.local.u8 [%rd178+169], %r5929; + shr.u32 %r5949, %r5929, 8; + st.local.u8 [%rd178+170], %r5949; + shr.u32 %r5950, %r5929, 16; + st.local.u8 [%rd178+171], %r5950; + shr.u32 %r5951, %r5929, 24; + st.local.u8 [%rd178+172], %r5951; + st.local.u8 [%rd178+173], %r5930; + shr.u32 %r5952, %r5930, 8; + st.local.u8 [%rd178+174], %r5952; + shr.u32 %r5953, %r5930, 16; + st.local.u8 [%rd178+175], %r5953; + shr.u32 %r5954, %r5930, 24; + st.local.u8 [%rd178+176], %r5954; + ld.local.u8 %rs176, [%rd3+8]; + add.s16 %rs177, %rs176, -1; + st.local.u8 [%rd3+8], %rs177; + cvt.u64.u16 %rd179, %rs177; + and.b64 %rd180, %rd179, 255; + setp.lt.u64 %p30, %rd226, %rd180; + and.b16 %rs178, %rs177, 255; + mul.wide.u16 %r11663, %rs178, 32; + @%p30 bra $L__BB1_34; + +$L__BB1_35: + cvt.s64.s32 %rd181, %r11663; + add.s64 %rd182, %rd2, %rd181; + mov.b32 {%rs179, %rs180}, %r3967; + st.local.u8 [%rd182+145], %rs179; + shr.u16 %rs181, %rs179, 8; + st.local.u8 [%rd182+146], %rs181; + st.local.u8 [%rd182+147], %rs180; + shr.u16 %rs182, %rs180, 8; + st.local.u8 [%rd182+148], %rs182; + mov.b32 {%rs183, %rs184}, %r3968; + st.local.u8 [%rd182+149], %rs183; + shr.u16 %rs185, %rs183, 8; + st.local.u8 [%rd182+150], %rs185; + st.local.u8 [%rd182+151], %rs184; + shr.u16 %rs186, %rs184, 8; + st.local.u8 [%rd182+152], %rs186; + mov.b32 {%rs187, %rs188}, %r3969; + st.local.u8 [%rd182+153], %rs187; + shr.u16 %rs189, %rs187, 8; + st.local.u8 [%rd182+154], %rs189; + st.local.u8 [%rd182+155], %rs188; + shr.u16 %rs190, %rs188, 8; + st.local.u8 [%rd182+156], %rs190; + mov.b32 {%rs191, %rs192}, %r3970; + st.local.u8 [%rd182+157], %rs191; + shr.u16 %rs193, %rs191, 8; + st.local.u8 [%rd182+158], %rs193; + st.local.u8 [%rd182+159], %rs192; + shr.u16 %rs194, %rs192, 8; + st.local.u8 [%rd182+160], %rs194; + mov.b32 {%rs195, %rs196}, %r3971; + st.local.u8 [%rd182+161], %rs195; + shr.u16 %rs197, %rs195, 8; + st.local.u8 [%rd182+162], %rs197; + st.local.u8 [%rd182+163], %rs196; + shr.u16 %rs198, %rs196, 8; + st.local.u8 [%rd182+164], %rs198; + mov.b32 {%rs199, %rs200}, %r3972; + st.local.u8 [%rd182+165], %rs199; + shr.u16 %rs201, %rs199, 8; + st.local.u8 [%rd182+166], %rs201; + st.local.u8 [%rd182+167], %rs200; + shr.u16 %rs202, %rs200, 8; + st.local.u8 [%rd182+168], %rs202; + mov.b32 {%rs203, %rs204}, %r3973; + st.local.u8 [%rd182+169], %rs203; + shr.u16 %rs205, %rs203, 8; + st.local.u8 [%rd182+170], %rs205; + st.local.u8 [%rd182+171], %rs204; + shr.u16 %rs206, %rs204, 8; + st.local.u8 [%rd182+172], %rs206; + mov.b32 {%rs207, %rs208}, %r3974; + st.local.u8 [%rd182+173], %rs207; + shr.u16 %rs209, %rs207, 8; + st.local.u8 [%rd182+174], %rs209; + 
st.local.u8 [%rd182+175], %rs208; + shr.u16 %rs210, %rs208, 8; + st.local.u8 [%rd182+176], %rs210; + ld.local.u8 %rs388, [%rd3+8]; + +$L__BB1_47: + add.s16 %rs331, %rs388, 1; + st.local.u8 [%rd3+8], %rs331; + ld.local.u64 %rd195, [%rd3+-72]; + shr.u64 %rd196, %rd49, 10; + add.s64 %rd250, %rd195, %rd196; + st.local.u64 [%rd3+-72], %rd250; + add.s64 %rd260, %rd260, %rd49; + add.s64 %rd253, %rd253, %rd49; + sub.s64 %rd261, %rd261, %rd49; + setp.gt.u64 %p39, %rd261, 1024; + @%p39 bra $L__BB1_26; + +$L__BB1_48: + setp.eq.s64 %p40, %rd261, 0; + @%p40 bra $L__BB1_68; + + ld.local.u8 %rs389, [%rd3]; + cvt.u64.u16 %rd71, %rs389; + setp.eq.s16 %p41, %rs389, 0; + mov.u16 %rs390, 0; + mov.u64 %rd270, %rd261; + @%p41 bra $L__BB1_57; + + mov.u64 %rd197, 64; + sub.s64 %rd198, %rd197, %rd71; + min.u64 %rd72, %rd198, %rd261; + setp.eq.s64 %p42, %rd72, 0; + @%p42 bra $L__BB1_54; + + add.s64 %rd200, %rd2, %rd71; + add.s64 %rd73, %rd200, 72; + mov.u64 %rd262, 0; + +$L__BB1_52: + add.s64 %rd201, %rd260, %rd262; + ld.local.u8 %rs333, [%rd201]; + add.s64 %rd202, %rd73, %rd262; + st.local.u8 [%rd202], %rs333; + add.s64 %rd262, %rd262, 1; + setp.lt.u64 %p43, %rd262, %rd72; + @%p43 bra $L__BB1_52; + + ld.local.u8 %rs389, [%rd3]; + +$L__BB1_54: + cvt.u16.u64 %rs334, %rd72; + add.s16 %rs390, %rs389, %rs334; + mov.u64 %rd270, 0; + st.local.u8 [%rd3], %rs390; + add.s64 %rd260, %rd260, %rd72; + sub.s64 %rd77, %rd261, %rd72; + setp.eq.s64 %p44, %rd77, 0; + @%p44 bra $L__BB1_57; + + add.s64 %rd78, %rd2, 72; + ld.local.u8 %rs335, [%rd3+1]; + mov.u64 %rd263, 0; + setp.eq.s16 %p45, %rs335, 0; + mov.u16 %rs390, 0; + selp.u16 %rs337, 1, 0, %p45; + ld.local.u8 %rs338, [%rd3+2]; + or.b16 %rs339, %rs338, %rs337; + ld.local.u8 %r8843, [%rd3+-64]; + ld.local.u8 %r8844, [%rd3+-63]; + prmt.b32 %r8845, %r8844, %r8843, 30212; + ld.local.u8 %r8846, [%rd3+-62]; + prmt.b32 %r8847, %r8846, %r8845, 28756; + ld.local.u8 %r8848, [%rd3+-61]; + prmt.b32 %r8849, %r8848, %r8847, 1620; + ld.local.u8 %r8850, [%rd3+-60]; + ld.local.u8 %r8851, [%rd3+-59]; + prmt.b32 %r8852, %r8851, %r8850, 30212; + ld.local.u8 %r8853, [%rd3+-58]; + prmt.b32 %r8854, %r8853, %r8852, 28756; + ld.local.u8 %r8855, [%rd3+-57]; + prmt.b32 %r8856, %r8855, %r8854, 1620; + ld.local.u8 %r8857, [%rd3+-56]; + ld.local.u8 %r8858, [%rd3+-55]; + prmt.b32 %r8859, %r8858, %r8857, 30212; + ld.local.u8 %r8860, [%rd3+-54]; + prmt.b32 %r8861, %r8860, %r8859, 28756; + ld.local.u8 %r8862, [%rd3+-53]; + prmt.b32 %r8863, %r8862, %r8861, 1620; + ld.local.u8 %r8864, [%rd3+-52]; + ld.local.u8 %r8865, [%rd3+-51]; + prmt.b32 %r8866, %r8865, %r8864, 30212; + ld.local.u8 %r8867, [%rd3+-50]; + prmt.b32 %r8868, %r8867, %r8866, 28756; + ld.local.u8 %r8869, [%rd3+-49]; + prmt.b32 %r8870, %r8869, %r8868, 1620; + ld.local.u8 %r8871, [%rd3+-48]; + ld.local.u8 %r8872, [%rd3+-47]; + prmt.b32 %r8873, %r8872, %r8871, 30212; + ld.local.u8 %r8874, [%rd3+-46]; + prmt.b32 %r8875, %r8874, %r8873, 28756; + ld.local.u8 %r8876, [%rd3+-45]; + prmt.b32 %r8877, %r8876, %r8875, 1620; + ld.local.u8 %r8878, [%rd3+-44]; + ld.local.u8 %r8879, [%rd3+-43]; + prmt.b32 %r8880, %r8879, %r8878, 30212; + ld.local.u8 %r8881, [%rd3+-42]; + prmt.b32 %r8882, %r8881, %r8880, 28756; + ld.local.u8 %r8883, [%rd3+-41]; + prmt.b32 %r8884, %r8883, %r8882, 1620; + ld.local.u8 %r8885, [%rd3+-40]; + ld.local.u8 %r8886, [%rd3+-39]; + prmt.b32 %r8887, %r8886, %r8885, 30212; + ld.local.u8 %r8888, [%rd3+-38]; + prmt.b32 %r8889, %r8888, %r8887, 28756; + ld.local.u8 %r8890, [%rd3+-37]; + prmt.b32 %r8891, %r8890, %r8889, 1620; + ld.local.u8 %r8892, 
[%rd3+-36]; + ld.local.u8 %r8893, [%rd3+-35]; + prmt.b32 %r8894, %r8893, %r8892, 30212; + ld.local.u8 %r8895, [%rd3+-34]; + prmt.b32 %r8896, %r8895, %r8894, 28756; + ld.local.u8 %r8897, [%rd3+-33]; + prmt.b32 %r8898, %r8897, %r8896, 1620; + ld.local.u8 %r8899, [%rd3+-32]; + ld.local.u8 %r8900, [%rd3+-31]; + prmt.b32 %r8901, %r8900, %r8899, 30212; + ld.local.u8 %r8902, [%rd3+-30]; + prmt.b32 %r8903, %r8902, %r8901, 28756; + ld.local.u8 %r8904, [%rd3+-29]; + prmt.b32 %r8905, %r8904, %r8903, 1620; + ld.local.u8 %r8906, [%rd3+-28]; + ld.local.u8 %r8907, [%rd3+-27]; + prmt.b32 %r8908, %r8907, %r8906, 30212; + ld.local.u8 %r8909, [%rd3+-26]; + prmt.b32 %r8910, %r8909, %r8908, 28756; + ld.local.u8 %r8911, [%rd3+-25]; + prmt.b32 %r8912, %r8911, %r8910, 1620; + ld.local.u8 %r8913, [%rd3+-24]; + ld.local.u8 %r8914, [%rd3+-23]; + prmt.b32 %r8915, %r8914, %r8913, 30212; + ld.local.u8 %r8916, [%rd3+-22]; + prmt.b32 %r8917, %r8916, %r8915, 28756; + ld.local.u8 %r8918, [%rd3+-21]; + prmt.b32 %r8919, %r8918, %r8917, 1620; + ld.local.u8 %r8920, [%rd3+-20]; + ld.local.u8 %r8921, [%rd3+-19]; + prmt.b32 %r8922, %r8921, %r8920, 30212; + ld.local.u8 %r8923, [%rd3+-18]; + prmt.b32 %r8924, %r8923, %r8922, 28756; + ld.local.u8 %r8925, [%rd3+-17]; + prmt.b32 %r8926, %r8925, %r8924, 1620; + ld.local.u8 %r8927, [%rd3+-16]; + ld.local.u8 %r8928, [%rd3+-15]; + prmt.b32 %r8929, %r8928, %r8927, 30212; + ld.local.u8 %r8930, [%rd3+-14]; + prmt.b32 %r8931, %r8930, %r8929, 28756; + ld.local.u8 %r8932, [%rd3+-13]; + prmt.b32 %r8933, %r8932, %r8931, 1620; + ld.local.u8 %r8934, [%rd3+-12]; + ld.local.u8 %r8935, [%rd3+-11]; + prmt.b32 %r8936, %r8935, %r8934, 30212; + ld.local.u8 %r8937, [%rd3+-10]; + prmt.b32 %r8938, %r8937, %r8936, 28756; + ld.local.u8 %r8939, [%rd3+-9]; + prmt.b32 %r8940, %r8939, %r8938, 1620; + ld.local.u8 %r8941, [%rd3+-8]; + ld.local.u8 %r8942, [%rd3+-7]; + prmt.b32 %r8943, %r8942, %r8941, 30212; + ld.local.u8 %r8944, [%rd3+-6]; + prmt.b32 %r8945, %r8944, %r8943, 28756; + ld.local.u8 %r8946, [%rd3+-5]; + prmt.b32 %r8947, %r8946, %r8945, 1620; + ld.local.u8 %r8948, [%rd3+-4]; + ld.local.u8 %r8949, [%rd3+-3]; + prmt.b32 %r8950, %r8949, %r8948, 30212; + ld.local.u8 %r8951, [%rd3+-2]; + prmt.b32 %r8952, %r8951, %r8950, 28756; + ld.local.u8 %r8953, [%rd3+-1]; + prmt.b32 %r8954, %r8953, %r8952, 1620; + ld.local.u64 %rd205, [%rd3+-72]; + cvt.u32.u64 %r8955, %rd205; + shr.u64 %rd206, %rd205, 32; + cvt.u32.u64 %r8956, %rd206; + cvt.u32.u16 %r8957, %rs339; + and.b32 %r8958, %r8957, 255; + ld.local.u32 %r8959, [%rd3+-104]; + add.s32 %r8960, %r8959, %r8849; + ld.local.u32 %r8961, [%rd3+-88]; + add.s32 %r8962, %r8960, %r8961; + xor.b32 %r8963, %r8962, %r8955; + shf.l.wrap.b32 %r8964, %r8963, %r8963, 16; + add.s32 %r8965, %r8964, 1779033703; + xor.b32 %r8966, %r8965, %r8961; + shf.l.wrap.b32 %r8967, %r8966, %r8966, 20; + add.s32 %r8968, %r8962, %r8856; + add.s32 %r8969, %r8968, %r8967; + xor.b32 %r8970, %r8969, %r8964; + shf.l.wrap.b32 %r8971, %r8970, %r8970, 24; + add.s32 %r8972, %r8971, %r8965; + xor.b32 %r8973, %r8972, %r8967; + shf.l.wrap.b32 %r8974, %r8973, %r8973, 25; + ld.local.u32 %r8975, [%rd3+-100]; + add.s32 %r8976, %r8975, %r8863; + ld.local.u32 %r8977, [%rd3+-84]; + add.s32 %r8978, %r8976, %r8977; + xor.b32 %r8979, %r8978, %r8956; + shf.l.wrap.b32 %r8980, %r8979, %r8979, 16; + add.s32 %r8981, %r8980, -1150833019; + xor.b32 %r8982, %r8981, %r8977; + shf.l.wrap.b32 %r8983, %r8982, %r8982, 20; + add.s32 %r8984, %r8978, %r8870; + add.s32 %r8985, %r8984, %r8983; + xor.b32 %r8986, %r8985, %r8980; + shf.l.wrap.b32 
%r8987, %r8986, %r8986, 24; + add.s32 %r8988, %r8987, %r8981; + xor.b32 %r8989, %r8988, %r8983; + shf.l.wrap.b32 %r8990, %r8989, %r8989, 25; + ld.local.u32 %r8991, [%rd3+-96]; + add.s32 %r8992, %r8991, %r8877; + ld.local.u32 %r8993, [%rd3+-80]; + add.s32 %r8994, %r8992, %r8993; + shr.u32 %r8995, %r8994, 16; + shl.b32 %r8996, %r8994, 16; + xor.b32 %r8997, %r8996, 4194304; + or.b32 %r8998, %r8997, %r8995; + add.s32 %r8999, %r8998, 1013904242; + xor.b32 %r9000, %r8999, %r8993; + shf.l.wrap.b32 %r9001, %r9000, %r9000, 20; + add.s32 %r9002, %r8994, %r8884; + add.s32 %r9003, %r9002, %r9001; + xor.b32 %r9004, %r9003, %r8998; + shf.l.wrap.b32 %r9005, %r9004, %r9004, 24; + add.s32 %r9006, %r9005, %r8999; + xor.b32 %r9007, %r9006, %r9001; + shf.l.wrap.b32 %r9008, %r9007, %r9007, 25; + ld.local.u32 %r9009, [%rd3+-92]; + add.s32 %r9010, %r9009, %r8891; + ld.local.u32 %r9011, [%rd3+-76]; + add.s32 %r9012, %r9010, %r9011; + xor.b32 %r9013, %r9012, %r8958; + shr.u32 %r9014, %r9012, 16; + shl.b32 %r9015, %r9013, 16; + or.b32 %r9016, %r9015, %r9014; + add.s32 %r9017, %r9016, -1521486534; + xor.b32 %r9018, %r9017, %r9011; + shf.l.wrap.b32 %r9019, %r9018, %r9018, 20; + add.s32 %r9020, %r9012, %r8898; + add.s32 %r9021, %r9020, %r9019; + xor.b32 %r9022, %r9021, %r9016; + shf.l.wrap.b32 %r9023, %r9022, %r9022, 24; + add.s32 %r9024, %r9023, %r9017; + xor.b32 %r9025, %r9024, %r9019; + shf.l.wrap.b32 %r9026, %r9025, %r9025, 25; + add.s32 %r9027, %r8969, %r8905; + add.s32 %r9028, %r9027, %r8990; + xor.b32 %r9029, %r9028, %r9023; + shf.l.wrap.b32 %r9030, %r9029, %r9029, 16; + add.s32 %r9031, %r9030, %r9006; + xor.b32 %r9032, %r9031, %r8990; + shf.l.wrap.b32 %r9033, %r9032, %r9032, 20; + add.s32 %r9034, %r9028, %r8912; + add.s32 %r9035, %r9034, %r9033; + xor.b32 %r9036, %r9035, %r9030; + shf.l.wrap.b32 %r9037, %r9036, %r9036, 24; + add.s32 %r9038, %r9037, %r9031; + xor.b32 %r9039, %r9038, %r9033; + shf.l.wrap.b32 %r9040, %r9039, %r9039, 25; + add.s32 %r9041, %r8985, %r8919; + add.s32 %r9042, %r9041, %r9008; + xor.b32 %r9043, %r9042, %r8971; + shf.l.wrap.b32 %r9044, %r9043, %r9043, 16; + add.s32 %r9045, %r9044, %r9024; + xor.b32 %r9046, %r9045, %r9008; + shf.l.wrap.b32 %r9047, %r9046, %r9046, 20; + add.s32 %r9048, %r9042, %r8926; + add.s32 %r9049, %r9048, %r9047; + xor.b32 %r9050, %r9049, %r9044; + shf.l.wrap.b32 %r9051, %r9050, %r9050, 24; + add.s32 %r9052, %r9051, %r9045; + xor.b32 %r9053, %r9052, %r9047; + shf.l.wrap.b32 %r9054, %r9053, %r9053, 25; + add.s32 %r9055, %r9003, %r8933; + add.s32 %r9056, %r9055, %r9026; + xor.b32 %r9057, %r9056, %r8987; + shf.l.wrap.b32 %r9058, %r9057, %r9057, 16; + add.s32 %r9059, %r9058, %r8972; + xor.b32 %r9060, %r9059, %r9026; + shf.l.wrap.b32 %r9061, %r9060, %r9060, 20; + add.s32 %r9062, %r9056, %r8940; + add.s32 %r9063, %r9062, %r9061; + xor.b32 %r9064, %r9063, %r9058; + shf.l.wrap.b32 %r9065, %r9064, %r9064, 24; + add.s32 %r9066, %r9065, %r9059; + xor.b32 %r9067, %r9066, %r9061; + shf.l.wrap.b32 %r9068, %r9067, %r9067, 25; + add.s32 %r9069, %r9021, %r8947; + add.s32 %r9070, %r9069, %r8974; + xor.b32 %r9071, %r9070, %r9005; + shf.l.wrap.b32 %r9072, %r9071, %r9071, 16; + add.s32 %r9073, %r9072, %r8988; + xor.b32 %r9074, %r9073, %r8974; + shf.l.wrap.b32 %r9075, %r9074, %r9074, 20; + add.s32 %r9076, %r9070, %r8954; + add.s32 %r9077, %r9076, %r9075; + xor.b32 %r9078, %r9077, %r9072; + shf.l.wrap.b32 %r9079, %r9078, %r9078, 24; + add.s32 %r9080, %r9079, %r9073; + xor.b32 %r9081, %r9080, %r9075; + shf.l.wrap.b32 %r9082, %r9081, %r9081, 25; + add.s32 %r9083, %r9035, %r8863; + add.s32 
%r9084, %r9083, %r9082; + xor.b32 %r9085, %r9084, %r9051; + shf.l.wrap.b32 %r9086, %r9085, %r9085, 16; + add.s32 %r9087, %r9086, %r9066; + xor.b32 %r9088, %r9087, %r9082; + shf.l.wrap.b32 %r9089, %r9088, %r9088, 20; + add.s32 %r9090, %r9084, %r8891; + add.s32 %r9091, %r9090, %r9089; + xor.b32 %r9092, %r9091, %r9086; + shf.l.wrap.b32 %r9093, %r9092, %r9092, 24; + add.s32 %r9094, %r9093, %r9087; + xor.b32 %r9095, %r9094, %r9089; + shf.l.wrap.b32 %r9096, %r9095, %r9095, 25; + add.s32 %r9097, %r9049, %r8870; + add.s32 %r9098, %r9097, %r9040; + xor.b32 %r9099, %r9098, %r9065; + shf.l.wrap.b32 %r9100, %r9099, %r9099, 16; + add.s32 %r9101, %r9100, %r9080; + xor.b32 %r9102, %r9101, %r9040; + shf.l.wrap.b32 %r9103, %r9102, %r9102, 20; + add.s32 %r9104, %r9098, %r8919; + add.s32 %r9105, %r9104, %r9103; + xor.b32 %r9106, %r9105, %r9100; + shf.l.wrap.b32 %r9107, %r9106, %r9106, 24; + add.s32 %r9108, %r9107, %r9101; + xor.b32 %r9109, %r9108, %r9103; + shf.l.wrap.b32 %r9110, %r9109, %r9109, 25; + add.s32 %r9111, %r9063, %r8898; + add.s32 %r9112, %r9111, %r9054; + xor.b32 %r9113, %r9112, %r9079; + shf.l.wrap.b32 %r9114, %r9113, %r9113, 16; + add.s32 %r9115, %r9114, %r9038; + xor.b32 %r9116, %r9115, %r9054; + shf.l.wrap.b32 %r9117, %r9116, %r9116, 20; + add.s32 %r9118, %r9112, %r8849; + add.s32 %r9119, %r9118, %r9117; + xor.b32 %r9120, %r9119, %r9114; + shf.l.wrap.b32 %r9121, %r9120, %r9120, 24; + add.s32 %r9122, %r9121, %r9115; + xor.b32 %r9123, %r9122, %r9117; + shf.l.wrap.b32 %r9124, %r9123, %r9123, 25; + add.s32 %r9125, %r9077, %r8877; + add.s32 %r9126, %r9125, %r9068; + xor.b32 %r9127, %r9126, %r9037; + shf.l.wrap.b32 %r9128, %r9127, %r9127, 16; + add.s32 %r9129, %r9128, %r9052; + xor.b32 %r9130, %r9129, %r9068; + shf.l.wrap.b32 %r9131, %r9130, %r9130, 20; + add.s32 %r9132, %r9126, %r8940; + add.s32 %r9133, %r9132, %r9131; + xor.b32 %r9134, %r9133, %r9128; + shf.l.wrap.b32 %r9135, %r9134, %r9134, 24; + add.s32 %r9136, %r9135, %r9129; + xor.b32 %r9137, %r9136, %r9131; + shf.l.wrap.b32 %r9138, %r9137, %r9137, 25; + add.s32 %r9139, %r9091, %r8856; + add.s32 %r9140, %r9139, %r9110; + xor.b32 %r9141, %r9140, %r9135; + shf.l.wrap.b32 %r9142, %r9141, %r9141, 16; + add.s32 %r9143, %r9142, %r9122; + xor.b32 %r9144, %r9143, %r9110; + shf.l.wrap.b32 %r9145, %r9144, %r9144, 20; + add.s32 %r9146, %r9140, %r8926; + add.s32 %r9147, %r9146, %r9145; + xor.b32 %r9148, %r9147, %r9142; + shf.l.wrap.b32 %r9149, %r9148, %r9148, 24; + add.s32 %r9150, %r9149, %r9143; + xor.b32 %r9151, %r9150, %r9145; + shf.l.wrap.b32 %r9152, %r9151, %r9151, 25; + add.s32 %r9153, %r9105, %r8933; + add.s32 %r9154, %r9153, %r9124; + xor.b32 %r9155, %r9154, %r9093; + shf.l.wrap.b32 %r9156, %r9155, %r9155, 16; + add.s32 %r9157, %r9156, %r9136; + xor.b32 %r9158, %r9157, %r9124; + shf.l.wrap.b32 %r9159, %r9158, %r9158, 20; + add.s32 %r9160, %r9154, %r8884; + add.s32 %r9161, %r9160, %r9159; + xor.b32 %r9162, %r9161, %r9156; + shf.l.wrap.b32 %r9163, %r9162, %r9162, 24; + add.s32 %r9164, %r9163, %r9157; + xor.b32 %r9165, %r9164, %r9159; + shf.l.wrap.b32 %r9166, %r9165, %r9165, 25; + add.s32 %r9167, %r9119, %r8912; + add.s32 %r9168, %r9167, %r9138; + xor.b32 %r9169, %r9168, %r9107; + shf.l.wrap.b32 %r9170, %r9169, %r9169, 16; + add.s32 %r9171, %r9170, %r9094; + xor.b32 %r9172, %r9171, %r9138; + shf.l.wrap.b32 %r9173, %r9172, %r9172, 20; + add.s32 %r9174, %r9168, %r8947; + add.s32 %r9175, %r9174, %r9173; + xor.b32 %r9176, %r9175, %r9170; + shf.l.wrap.b32 %r9177, %r9176, %r9176, 24; + add.s32 %r9178, %r9177, %r9171; + xor.b32 %r9179, %r9178, %r9173; + 
shf.l.wrap.b32 %r9180, %r9179, %r9179, 25; + add.s32 %r9181, %r9133, %r8954; + add.s32 %r9182, %r9181, %r9096; + xor.b32 %r9183, %r9182, %r9121; + shf.l.wrap.b32 %r9184, %r9183, %r9183, 16; + add.s32 %r9185, %r9184, %r9108; + xor.b32 %r9186, %r9185, %r9096; + shf.l.wrap.b32 %r9187, %r9186, %r9186, 20; + add.s32 %r9188, %r9182, %r8905; + add.s32 %r9189, %r9188, %r9187; + xor.b32 %r9190, %r9189, %r9184; + shf.l.wrap.b32 %r9191, %r9190, %r9190, 24; + add.s32 %r9192, %r9191, %r9185; + xor.b32 %r9193, %r9192, %r9187; + shf.l.wrap.b32 %r9194, %r9193, %r9193, 25; + add.s32 %r9195, %r9147, %r8870; + add.s32 %r9196, %r9195, %r9194; + xor.b32 %r9197, %r9196, %r9163; + shf.l.wrap.b32 %r9198, %r9197, %r9197, 16; + add.s32 %r9199, %r9198, %r9178; + xor.b32 %r9200, %r9199, %r9194; + shf.l.wrap.b32 %r9201, %r9200, %r9200, 20; + add.s32 %r9202, %r9196, %r8877; + add.s32 %r9203, %r9202, %r9201; + xor.b32 %r9204, %r9203, %r9198; + shf.l.wrap.b32 %r9205, %r9204, %r9204, 24; + add.s32 %r9206, %r9205, %r9199; + xor.b32 %r9207, %r9206, %r9201; + shf.l.wrap.b32 %r9208, %r9207, %r9207, 25; + add.s32 %r9209, %r9161, %r8919; + add.s32 %r9210, %r9209, %r9152; + xor.b32 %r9211, %r9210, %r9177; + shf.l.wrap.b32 %r9212, %r9211, %r9211, 16; + add.s32 %r9213, %r9212, %r9192; + xor.b32 %r9214, %r9213, %r9152; + shf.l.wrap.b32 %r9215, %r9214, %r9214, 20; + add.s32 %r9216, %r9210, %r8933; + add.s32 %r9217, %r9216, %r9215; + xor.b32 %r9218, %r9217, %r9212; + shf.l.wrap.b32 %r9219, %r9218, %r9218, 24; + add.s32 %r9220, %r9219, %r9213; + xor.b32 %r9221, %r9220, %r9215; + shf.l.wrap.b32 %r9222, %r9221, %r9221, 25; + add.s32 %r9223, %r9175, %r8940; + add.s32 %r9224, %r9223, %r9166; + xor.b32 %r9225, %r9224, %r9191; + shf.l.wrap.b32 %r9226, %r9225, %r9225, 16; + add.s32 %r9227, %r9226, %r9150; + xor.b32 %r9228, %r9227, %r9166; + shf.l.wrap.b32 %r9229, %r9228, %r9228, 20; + add.s32 %r9230, %r9224, %r8863; + add.s32 %r9231, %r9230, %r9229; + xor.b32 %r9232, %r9231, %r9226; + shf.l.wrap.b32 %r9233, %r9232, %r9232, 24; + add.s32 %r9234, %r9233, %r9227; + xor.b32 %r9235, %r9234, %r9229; + shf.l.wrap.b32 %r9236, %r9235, %r9235, 25; + add.s32 %r9237, %r9189, %r8898; + add.s32 %r9238, %r9237, %r9180; + xor.b32 %r9239, %r9238, %r9149; + shf.l.wrap.b32 %r9240, %r9239, %r9239, 16; + add.s32 %r9241, %r9240, %r9164; + xor.b32 %r9242, %r9241, %r9180; + shf.l.wrap.b32 %r9243, %r9242, %r9242, 20; + add.s32 %r9244, %r9238, %r8947; + add.s32 %r9245, %r9244, %r9243; + xor.b32 %r9246, %r9245, %r9240; + shf.l.wrap.b32 %r9247, %r9246, %r9246, 24; + add.s32 %r9248, %r9247, %r9241; + xor.b32 %r9249, %r9248, %r9243; + shf.l.wrap.b32 %r9250, %r9249, %r9249, 25; + add.s32 %r9251, %r9203, %r8891; + add.s32 %r9252, %r9251, %r9222; + xor.b32 %r9253, %r9252, %r9247; + shf.l.wrap.b32 %r9254, %r9253, %r9253, 16; + add.s32 %r9255, %r9254, %r9234; + xor.b32 %r9256, %r9255, %r9222; + shf.l.wrap.b32 %r9257, %r9256, %r9256, 20; + add.s32 %r9258, %r9252, %r8884; + add.s32 %r9259, %r9258, %r9257; + xor.b32 %r9260, %r9259, %r9254; + shf.l.wrap.b32 %r9261, %r9260, %r9260, 24; + add.s32 %r9262, %r9261, %r9255; + xor.b32 %r9263, %r9262, %r9257; + shf.l.wrap.b32 %r9264, %r9263, %r9263, 25; + add.s32 %r9265, %r9217, %r8912; + add.s32 %r9266, %r9265, %r9236; + xor.b32 %r9267, %r9266, %r9205; + shf.l.wrap.b32 %r9268, %r9267, %r9267, 16; + add.s32 %r9269, %r9268, %r9248; + xor.b32 %r9270, %r9269, %r9236; + shf.l.wrap.b32 %r9271, %r9270, %r9270, 20; + add.s32 %r9272, %r9266, %r8849; + add.s32 %r9273, %r9272, %r9271; + xor.b32 %r9274, %r9273, %r9268; + shf.l.wrap.b32 %r9275, 
%r9274, %r9274, 24; + add.s32 %r9276, %r9275, %r9269; + xor.b32 %r9277, %r9276, %r9271; + shf.l.wrap.b32 %r9278, %r9277, %r9277, 25; + add.s32 %r9279, %r9231, %r8926; + add.s32 %r9280, %r9279, %r9250; + xor.b32 %r9281, %r9280, %r9219; + shf.l.wrap.b32 %r9282, %r9281, %r9281, 16; + add.s32 %r9283, %r9282, %r9206; + xor.b32 %r9284, %r9283, %r9250; + shf.l.wrap.b32 %r9285, %r9284, %r9284, 20; + add.s32 %r9286, %r9280, %r8954; + add.s32 %r9287, %r9286, %r9285; + xor.b32 %r9288, %r9287, %r9282; + shf.l.wrap.b32 %r9289, %r9288, %r9288, 24; + add.s32 %r9290, %r9289, %r9283; + xor.b32 %r9291, %r9290, %r9285; + shf.l.wrap.b32 %r9292, %r9291, %r9291, 25; + add.s32 %r9293, %r9245, %r8905; + add.s32 %r9294, %r9293, %r9208; + xor.b32 %r9295, %r9294, %r9233; + shf.l.wrap.b32 %r9296, %r9295, %r9295, 16; + add.s32 %r9297, %r9296, %r9220; + xor.b32 %r9298, %r9297, %r9208; + shf.l.wrap.b32 %r9299, %r9298, %r9298, 20; + add.s32 %r9300, %r9294, %r8856; + add.s32 %r9301, %r9300, %r9299; + xor.b32 %r9302, %r9301, %r9296; + shf.l.wrap.b32 %r9303, %r9302, %r9302, 24; + add.s32 %r9304, %r9303, %r9297; + xor.b32 %r9305, %r9304, %r9299; + shf.l.wrap.b32 %r9306, %r9305, %r9305, 25; + add.s32 %r9307, %r9259, %r8919; + add.s32 %r9308, %r9307, %r9306; + xor.b32 %r9309, %r9308, %r9275; + shf.l.wrap.b32 %r9310, %r9309, %r9309, 16; + add.s32 %r9311, %r9310, %r9290; + xor.b32 %r9312, %r9311, %r9306; + shf.l.wrap.b32 %r9313, %r9312, %r9312, 20; + add.s32 %r9314, %r9308, %r8898; + add.s32 %r9315, %r9314, %r9313; + xor.b32 %r9316, %r9315, %r9310; + shf.l.wrap.b32 %r9317, %r9316, %r9316, 24; + add.s32 %r9318, %r9317, %r9311; + xor.b32 %r9319, %r9318, %r9313; + shf.l.wrap.b32 %r9320, %r9319, %r9319, 25; + add.s32 %r9321, %r9273, %r8933; + add.s32 %r9322, %r9321, %r9264; + xor.b32 %r9323, %r9322, %r9289; + shf.l.wrap.b32 %r9324, %r9323, %r9323, 16; + add.s32 %r9325, %r9324, %r9304; + xor.b32 %r9326, %r9325, %r9264; + shf.l.wrap.b32 %r9327, %r9326, %r9326, 20; + add.s32 %r9328, %r9322, %r8912; + add.s32 %r9329, %r9328, %r9327; + xor.b32 %r9330, %r9329, %r9324; + shf.l.wrap.b32 %r9331, %r9330, %r9330, 24; + add.s32 %r9332, %r9331, %r9325; + xor.b32 %r9333, %r9332, %r9327; + shf.l.wrap.b32 %r9334, %r9333, %r9333, 25; + add.s32 %r9335, %r9287, %r8947; + add.s32 %r9336, %r9335, %r9278; + xor.b32 %r9337, %r9336, %r9303; + shf.l.wrap.b32 %r9338, %r9337, %r9337, 16; + add.s32 %r9339, %r9338, %r9262; + xor.b32 %r9340, %r9339, %r9278; + shf.l.wrap.b32 %r9341, %r9340, %r9340, 20; + add.s32 %r9342, %r9336, %r8870; + add.s32 %r9343, %r9342, %r9341; + xor.b32 %r9344, %r9343, %r9338; + shf.l.wrap.b32 %r9345, %r9344, %r9344, 24; + add.s32 %r9346, %r9345, %r9339; + xor.b32 %r9347, %r9346, %r9341; + shf.l.wrap.b32 %r9348, %r9347, %r9347, 25; + add.s32 %r9349, %r9301, %r8940; + add.s32 %r9350, %r9349, %r9292; + xor.b32 %r9351, %r9350, %r9261; + shf.l.wrap.b32 %r9352, %r9351, %r9351, 16; + add.s32 %r9353, %r9352, %r9276; + xor.b32 %r9354, %r9353, %r9292; + shf.l.wrap.b32 %r9355, %r9354, %r9354, 20; + add.s32 %r9356, %r9350, %r8954; + add.s32 %r9357, %r9356, %r9355; + xor.b32 %r9358, %r9357, %r9352; + shf.l.wrap.b32 %r9359, %r9358, %r9358, 24; + add.s32 %r9360, %r9359, %r9353; + xor.b32 %r9361, %r9360, %r9355; + shf.l.wrap.b32 %r9362, %r9361, %r9361, 25; + add.s32 %r9363, %r9315, %r8877; + add.s32 %r9364, %r9363, %r9334; + xor.b32 %r9365, %r9364, %r9359; + shf.l.wrap.b32 %r9366, %r9365, %r9365, 16; + add.s32 %r9367, %r9366, %r9346; + xor.b32 %r9368, %r9367, %r9334; + shf.l.wrap.b32 %r9369, %r9368, %r9368, 20; + add.s32 %r9370, %r9364, %r8849; + 
add.s32 %r9371, %r9370, %r9369; + xor.b32 %r9372, %r9371, %r9366; + shf.l.wrap.b32 %r9373, %r9372, %r9372, 24; + add.s32 %r9374, %r9373, %r9367; + xor.b32 %r9375, %r9374, %r9369; + shf.l.wrap.b32 %r9376, %r9375, %r9375, 25; + add.s32 %r9377, %r9329, %r8926; + add.s32 %r9378, %r9377, %r9348; + xor.b32 %r9379, %r9378, %r9317; + shf.l.wrap.b32 %r9380, %r9379, %r9379, 16; + add.s32 %r9381, %r9380, %r9360; + xor.b32 %r9382, %r9381, %r9348; + shf.l.wrap.b32 %r9383, %r9382, %r9382, 20; + add.s32 %r9384, %r9378, %r8863; + add.s32 %r9385, %r9384, %r9383; + xor.b32 %r9386, %r9385, %r9380; + shf.l.wrap.b32 %r9387, %r9386, %r9386, 24; + add.s32 %r9388, %r9387, %r9381; + xor.b32 %r9389, %r9388, %r9383; + shf.l.wrap.b32 %r9390, %r9389, %r9389, 25; + add.s32 %r9391, %r9343, %r8884; + add.s32 %r9392, %r9391, %r9362; + xor.b32 %r9393, %r9392, %r9331; + shf.l.wrap.b32 %r9394, %r9393, %r9393, 16; + add.s32 %r9395, %r9394, %r9318; + xor.b32 %r9396, %r9395, %r9362; + shf.l.wrap.b32 %r9397, %r9396, %r9396, 20; + add.s32 %r9398, %r9392, %r8905; + add.s32 %r9399, %r9398, %r9397; + xor.b32 %r9400, %r9399, %r9394; + shf.l.wrap.b32 %r9401, %r9400, %r9400, 24; + add.s32 %r9402, %r9401, %r9395; + xor.b32 %r9403, %r9402, %r9397; + shf.l.wrap.b32 %r9404, %r9403, %r9403, 25; + add.s32 %r9405, %r9357, %r8856; + add.s32 %r9406, %r9405, %r9320; + xor.b32 %r9407, %r9406, %r9345; + shf.l.wrap.b32 %r9408, %r9407, %r9407, 16; + add.s32 %r9409, %r9408, %r9332; + xor.b32 %r9410, %r9409, %r9320; + shf.l.wrap.b32 %r9411, %r9410, %r9410, 20; + add.s32 %r9412, %r9406, %r8891; + add.s32 %r9413, %r9412, %r9411; + xor.b32 %r9414, %r9413, %r9408; + shf.l.wrap.b32 %r9415, %r9414, %r9414, 24; + add.s32 %r9416, %r9415, %r9409; + xor.b32 %r9417, %r9416, %r9411; + shf.l.wrap.b32 %r9418, %r9417, %r9417, 25; + add.s32 %r9419, %r9371, %r8933; + add.s32 %r9420, %r9419, %r9418; + xor.b32 %r9421, %r9420, %r9387; + shf.l.wrap.b32 %r9422, %r9421, %r9421, 16; + add.s32 %r9423, %r9422, %r9402; + xor.b32 %r9424, %r9423, %r9418; + shf.l.wrap.b32 %r9425, %r9424, %r9424, 20; + add.s32 %r9426, %r9420, %r8940; + add.s32 %r9427, %r9426, %r9425; + xor.b32 %r9428, %r9427, %r9422; + shf.l.wrap.b32 %r9429, %r9428, %r9428, 24; + add.s32 %r9430, %r9429, %r9423; + xor.b32 %r9431, %r9430, %r9425; + shf.l.wrap.b32 %r9432, %r9431, %r9431, 25; + add.s32 %r9433, %r9385, %r8912; + add.s32 %r9434, %r9433, %r9376; + xor.b32 %r9435, %r9434, %r9401; + shf.l.wrap.b32 %r9436, %r9435, %r9435, 16; + add.s32 %r9437, %r9436, %r9416; + xor.b32 %r9438, %r9437, %r9376; + shf.l.wrap.b32 %r9439, %r9438, %r9438, 20; + add.s32 %r9440, %r9434, %r8926; + add.s32 %r9441, %r9440, %r9439; + xor.b32 %r9442, %r9441, %r9436; + shf.l.wrap.b32 %r9443, %r9442, %r9442, 24; + add.s32 %r9444, %r9443, %r9437; + xor.b32 %r9445, %r9444, %r9439; + shf.l.wrap.b32 %r9446, %r9445, %r9445, 25; + add.s32 %r9447, %r9399, %r8954; + add.s32 %r9448, %r9447, %r9390; + xor.b32 %r9449, %r9448, %r9415; + shf.l.wrap.b32 %r9450, %r9449, %r9449, 16; + add.s32 %r9451, %r9450, %r9374; + xor.b32 %r9452, %r9451, %r9390; + shf.l.wrap.b32 %r9453, %r9452, %r9452, 20; + add.s32 %r9454, %r9448, %r8919; + add.s32 %r9455, %r9454, %r9453; + xor.b32 %r9456, %r9455, %r9450; + shf.l.wrap.b32 %r9457, %r9456, %r9456, 24; + add.s32 %r9458, %r9457, %r9451; + xor.b32 %r9459, %r9458, %r9453; + shf.l.wrap.b32 %r9460, %r9459, %r9459, 25; + add.s32 %r9461, %r9413, %r8947; + add.s32 %r9462, %r9461, %r9404; + xor.b32 %r9463, %r9462, %r9373; + shf.l.wrap.b32 %r9464, %r9463, %r9463, 16; + add.s32 %r9465, %r9464, %r9388; + xor.b32 %r9466, %r9465, 
%r9404; + shf.l.wrap.b32 %r9467, %r9466, %r9466, 20; + add.s32 %r9468, %r9462, %r8905; + add.s32 %r9469, %r9468, %r9467; + xor.b32 %r9470, %r9469, %r9464; + shf.l.wrap.b32 %r9471, %r9470, %r9470, 24; + add.s32 %r9472, %r9471, %r9465; + xor.b32 %r9473, %r9472, %r9467; + shf.l.wrap.b32 %r9474, %r9473, %r9473, 25; + add.s32 %r9475, %r9427, %r8898; + add.s32 %r9476, %r9475, %r9446; + xor.b32 %r9477, %r9476, %r9471; + shf.l.wrap.b32 %r9478, %r9477, %r9477, 16; + add.s32 %r9479, %r9478, %r9458; + xor.b32 %r9480, %r9479, %r9446; + shf.l.wrap.b32 %r9481, %r9480, %r9480, 20; + add.s32 %r9482, %r9476, %r8863; + add.s32 %r9483, %r9482, %r9481; + xor.b32 %r9484, %r9483, %r9478; + shf.l.wrap.b32 %r9485, %r9484, %r9484, 24; + add.s32 %r9486, %r9485, %r9479; + xor.b32 %r9487, %r9486, %r9481; + shf.l.wrap.b32 %r9488, %r9487, %r9487, 25; + add.s32 %r9489, %r9441, %r8884; + add.s32 %r9490, %r9489, %r9460; + xor.b32 %r9491, %r9490, %r9429; + shf.l.wrap.b32 %r9492, %r9491, %r9491, 16; + add.s32 %r9493, %r9492, %r9472; + xor.b32 %r9494, %r9493, %r9460; + shf.l.wrap.b32 %r9495, %r9494, %r9494, 20; + add.s32 %r9496, %r9490, %r8870; + add.s32 %r9497, %r9496, %r9495; + xor.b32 %r9498, %r9497, %r9492; + shf.l.wrap.b32 %r9499, %r9498, %r9498, 24; + add.s32 %r9500, %r9499, %r9493; + xor.b32 %r9501, %r9500, %r9495; + shf.l.wrap.b32 %r9502, %r9501, %r9501, 25; + add.s32 %r9503, %r9455, %r8849; + add.s32 %r9504, %r9503, %r9474; + xor.b32 %r9505, %r9504, %r9443; + shf.l.wrap.b32 %r9506, %r9505, %r9505, 16; + add.s32 %r9507, %r9506, %r9430; + xor.b32 %r9508, %r9507, %r9474; + shf.l.wrap.b32 %r9509, %r9508, %r9508, 20; + add.s32 %r9510, %r9504, %r8856; + add.s32 %r9511, %r9510, %r9509; + xor.b32 %r9512, %r9511, %r9506; + shf.l.wrap.b32 %r9513, %r9512, %r9512, 24; + add.s32 %r9514, %r9513, %r9507; + xor.b32 %r9515, %r9514, %r9509; + shf.l.wrap.b32 %r9516, %r9515, %r9515, 25; + add.s32 %r9517, %r9469, %r8891; + add.s32 %r9518, %r9517, %r9432; + xor.b32 %r9519, %r9518, %r9457; + shf.l.wrap.b32 %r9520, %r9519, %r9519, 16; + add.s32 %r9521, %r9520, %r9444; + xor.b32 %r9522, %r9521, %r9432; + shf.l.wrap.b32 %r9523, %r9522, %r9522, 20; + add.s32 %r9524, %r9518, %r8877; + add.s32 %r9525, %r9524, %r9523; + xor.b32 %r9526, %r9525, %r9520; + shf.l.wrap.b32 %r9527, %r9526, %r9526, 24; + add.s32 %r9528, %r9527, %r9521; + xor.b32 %r9529, %r9528, %r9523; + shf.l.wrap.b32 %r9530, %r9529, %r9529, 25; + add.s32 %r9531, %r9483, %r8912; + add.s32 %r9532, %r9531, %r9530; + xor.b32 %r9533, %r9532, %r9499; + shf.l.wrap.b32 %r9534, %r9533, %r9533, 16; + add.s32 %r9535, %r9534, %r9514; + xor.b32 %r9536, %r9535, %r9530; + shf.l.wrap.b32 %r9537, %r9536, %r9536, 20; + add.s32 %r9538, %r9532, %r8947; + add.s32 %r9539, %r9538, %r9537; + xor.b32 %r9540, %r9539, %r9534; + shf.l.wrap.b32 %r9541, %r9540, %r9540, 24; + add.s32 %r9542, %r9541, %r9535; + xor.b32 %r9543, %r9542, %r9537; + shf.l.wrap.b32 %r9544, %r9543, %r9543, 25; + add.s32 %r9545, %r9497, %r8926; + add.s32 %r9546, %r9545, %r9488; + xor.b32 %r9547, %r9546, %r9513; + shf.l.wrap.b32 %r9548, %r9547, %r9547, 16; + add.s32 %r9549, %r9548, %r9528; + xor.b32 %r9550, %r9549, %r9488; + shf.l.wrap.b32 %r9551, %r9550, %r9550, 20; + add.s32 %r9552, %r9546, %r8884; + add.s32 %r9553, %r9552, %r9551; + xor.b32 %r9554, %r9553, %r9548; + shf.l.wrap.b32 %r9555, %r9554, %r9554, 24; + add.s32 %r9556, %r9555, %r9549; + xor.b32 %r9557, %r9556, %r9551; + shf.l.wrap.b32 %r9558, %r9557, %r9557, 25; + add.s32 %r9559, %r9511, %r8905; + add.s32 %r9560, %r9559, %r9502; + xor.b32 %r9561, %r9560, %r9527; + shf.l.wrap.b32 
%r9562, %r9561, %r9561, 16; + add.s32 %r9563, %r9562, %r9486; + xor.b32 %r9564, %r9563, %r9502; + shf.l.wrap.b32 %r9565, %r9564, %r9564, 20; + add.s32 %r9566, %r9560, %r8933; + add.s32 %r9567, %r9566, %r9565; + xor.b32 %r9568, %r9567, %r9562; + shf.l.wrap.b32 %r9569, %r9568, %r9568, 24; + add.s32 %r9570, %r9569, %r9563; + xor.b32 %r9571, %r9570, %r9565; + shf.l.wrap.b32 %r9572, %r9571, %r9571, 25; + add.s32 %r9573, %r9525, %r8954; + add.s32 %r9574, %r9573, %r9516; + xor.b32 %r9575, %r9574, %r9485; + shf.l.wrap.b32 %r9576, %r9575, %r9575, 16; + add.s32 %r9577, %r9576, %r9500; + xor.b32 %r9578, %r9577, %r9516; + shf.l.wrap.b32 %r9579, %r9578, %r9578, 20; + add.s32 %r9580, %r9574, %r8856; + add.s32 %r9581, %r9580, %r9579; + xor.b32 %r9582, %r9581, %r9576; + shf.l.wrap.b32 %r9583, %r9582, %r9582, 24; + add.s32 %r9584, %r9583, %r9577; + xor.b32 %r9585, %r9584, %r9579; + shf.l.wrap.b32 %r9586, %r9585, %r9585, 25; + add.s32 %r9587, %r9539, %r8940; + add.s32 %r9588, %r9587, %r9558; + xor.b32 %r9589, %r9588, %r9583; + shf.l.wrap.b32 %r9590, %r9589, %r9589, 16; + add.s32 %r9591, %r9590, %r9570; + xor.b32 %r9592, %r9591, %r9558; + shf.l.wrap.b32 %r9593, %r9592, %r9592, 20; + add.s32 %r9594, %r9588, %r8870; + add.s32 %r9595, %r9594, %r9593; + xor.b32 %r9596, %r9595, %r9590; + shf.l.wrap.b32 %r9597, %r9596, %r9596, 24; + add.s32 %r9598, %r9597, %r9591; + xor.b32 %r9599, %r9598, %r9593; + shf.l.wrap.b32 %r9600, %r9599, %r9599, 25; + add.s32 %r9601, %r9553, %r8849; + add.s32 %r9602, %r9601, %r9572; + xor.b32 %r9603, %r9602, %r9541; + shf.l.wrap.b32 %r9604, %r9603, %r9603, 16; + add.s32 %r9605, %r9604, %r9584; + xor.b32 %r9606, %r9605, %r9572; + shf.l.wrap.b32 %r9607, %r9606, %r9606, 20; + add.s32 %r9608, %r9602, %r8919; + add.s32 %r9609, %r9608, %r9607; + xor.b32 %r9610, %r9609, %r9604; + shf.l.wrap.b32 %r9611, %r9610, %r9610, 24; + add.s32 %r9612, %r9611, %r9605; + xor.b32 %r9613, %r9612, %r9607; + shf.l.wrap.b32 %r9614, %r9613, %r9613, 25; + add.s32 %r9615, %r9567, %r8863; + add.s32 %r9616, %r9615, %r9586; + xor.b32 %r9617, %r9616, %r9555; + shf.l.wrap.b32 %r9618, %r9617, %r9617, 16; + add.s32 %r9619, %r9618, %r9542; + xor.b32 %r9620, %r9619, %r9586; + shf.l.wrap.b32 %r9621, %r9620, %r9620, 20; + add.s32 %r9622, %r9616, %r8891; + add.s32 %r9623, %r9622, %r9621; + xor.b32 %r9624, %r9623, %r9618; + shf.l.wrap.b32 %r9625, %r9624, %r9624, 24; + add.s32 %r9626, %r9625, %r9619; + xor.b32 %r9627, %r9626, %r9621; + shf.l.wrap.b32 %r9628, %r9627, %r9627, 25; + add.s32 %r9629, %r9581, %r8877; + add.s32 %r9630, %r9629, %r9544; + xor.b32 %r9631, %r9630, %r9569; + shf.l.wrap.b32 %r9632, %r9631, %r9631, 16; + add.s32 %r9633, %r9632, %r9556; + xor.b32 %r9634, %r9633, %r9544; + shf.l.wrap.b32 %r9635, %r9634, %r9634, 20; + add.s32 %r9636, %r9630, %r8898; + add.s32 %r9637, %r9636, %r9635; + xor.b32 %r9638, %r9637, %r9632; + shf.l.wrap.b32 %r9639, %r9638, %r9638, 24; + add.s32 %r9640, %r9639, %r9633; + xor.b32 %r9641, %r9640, %r9635; + shf.l.wrap.b32 %r9642, %r9641, %r9641, 25; + add.s32 %r9643, %r9595, %r8926; + add.s32 %r9644, %r9643, %r9642; + xor.b32 %r9645, %r9644, %r9611; + shf.l.wrap.b32 %r9646, %r9645, %r9645, 16; + add.s32 %r9647, %r9646, %r9626; + xor.b32 %r9648, %r9647, %r9642; + shf.l.wrap.b32 %r9649, %r9648, %r9648, 20; + add.s32 %r9650, %r9644, %r8954; + add.s32 %r9651, %r9650, %r9649; + xor.b32 %r9652, %r9651, %r9646; + shf.l.wrap.b32 %r9653, %r9652, %r9652, 24; + add.s32 %r9654, %r9653, %r9647; + xor.b32 %r9655, %r9654, %r9649; + shf.l.wrap.b32 %r9656, %r9655, %r9655, 25; + add.s32 %r9657, %r9609, %r8884; 
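+ // annotation: each run of eight G applications is one full round
+ // (four column mixes, then four diagonal mixes), and BLAKE3 performs
+ // 7 such rounds per 64-byte block. The %r88xx registers appearing as
+ // second addends appear to hold the sixteen message words, re-ordered
+ // between rounds per the fixed BLAKE3 message schedule.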
+ add.s32 %r9658, %r9657, %r9600; + xor.b32 %r9659, %r9658, %r9625; + shf.l.wrap.b32 %r9660, %r9659, %r9659, 16; + add.s32 %r9661, %r9660, %r9640; + xor.b32 %r9662, %r9661, %r9600; + shf.l.wrap.b32 %r9663, %r9662, %r9662, 20; + add.s32 %r9664, %r9658, %r8849; + add.s32 %r9665, %r9664, %r9663; + xor.b32 %r9666, %r9665, %r9660; + shf.l.wrap.b32 %r9667, %r9666, %r9666, 24; + add.s32 %r9668, %r9667, %r9661; + xor.b32 %r9669, %r9668, %r9663; + shf.l.wrap.b32 %r9670, %r9669, %r9669, 25; + add.s32 %r9671, %r9623, %r8856; + add.s32 %r9672, %r9671, %r9614; + xor.b32 %r9673, %r9672, %r9639; + shf.l.wrap.b32 %r9674, %r9673, %r9673, 16; + add.s32 %r9675, %r9674, %r9598; + xor.b32 %r9676, %r9675, %r9614; + shf.l.wrap.b32 %r9677, %r9676, %r9676, 20; + add.s32 %r9678, %r9672, %r8912; + add.s32 %r9679, %r9678, %r9677; + xor.b32 %r9680, %r9679, %r9674; + shf.l.wrap.b32 %r9681, %r9680, %r9680, 24; + add.s32 %r9682, %r9681, %r9675; + xor.b32 %r9683, %r9682, %r9677; + shf.l.wrap.b32 %r9684, %r9683, %r9683, 25; + add.s32 %r9685, %r9637, %r8905; + add.s32 %r9686, %r9685, %r9628; + xor.b32 %r9687, %r9686, %r9597; + shf.l.wrap.b32 %r9688, %r9687, %r9687, 16; + add.s32 %r9689, %r9688, %r9612; + xor.b32 %r9690, %r9689, %r9628; + shf.l.wrap.b32 %r9691, %r9690, %r9690, 20; + add.s32 %r9692, %r9686, %r8891; + add.s32 %r9693, %r9692, %r9691; + xor.b32 %r9694, %r9693, %r9688; + shf.l.wrap.b32 %r9695, %r9694, %r9694, 24; + add.s32 %r9696, %r9695, %r9689; + xor.b32 %r9697, %r9696, %r9691; + shf.l.wrap.b32 %r9698, %r9697, %r9697, 25; + add.s32 %r9699, %r9651, %r8947; + add.s32 %r9700, %r9699, %r9670; + xor.b32 %r9701, %r9700, %r9695; + shf.l.wrap.b32 %r9702, %r9701, %r9701, 16; + add.s32 %r9703, %r9702, %r9682; + xor.b32 %r9704, %r9703, %r9670; + shf.l.wrap.b32 %r9705, %r9704, %r9704, 20; + add.s32 %r9706, %r9700, %r8919; + add.s32 %r9707, %r9706, %r9705; + xor.b32 %r9708, %r9707, %r9702; + shf.l.wrap.b32 %r9709, %r9708, %r9708, 24; + add.s32 %r9710, %r9709, %r9703; + xor.b32 %r9711, %r9710, %r9705; + shf.l.wrap.b32 %r9712, %r9711, %r9711, 25; + add.s32 %r9713, %r9665, %r8863; + add.s32 %r9714, %r9713, %r9684; + xor.b32 %r9715, %r9714, %r9653; + shf.l.wrap.b32 %r9716, %r9715, %r9715, 16; + add.s32 %r9717, %r9716, %r9696; + xor.b32 %r9718, %r9717, %r9684; + shf.l.wrap.b32 %r9719, %r9718, %r9718, 20; + add.s32 %r9720, %r9714, %r8933; + add.s32 %r9721, %r9720, %r9719; + xor.b32 %r9722, %r9721, %r9716; + shf.l.wrap.b32 %r9723, %r9722, %r9722, 24; + add.s32 %r9724, %r9723, %r9717; + xor.b32 %r9725, %r9724, %r9719; + shf.l.wrap.b32 %r9726, %r9725, %r9725, 25; + add.s32 %r9727, %r9679, %r8870; + add.s32 %r9728, %r9727, %r9698; + xor.b32 %r9729, %r9728, %r9667; + shf.l.wrap.b32 %r9730, %r9729, %r9729, 16; + add.s32 %r9731, %r9730, %r9654; + xor.b32 %r9732, %r9731, %r9698; + shf.l.wrap.b32 %r9733, %r9732, %r9732, 20; + add.s32 %r9734, %r9728, %r8877; + add.s32 %r9735, %r9734, %r9733; + xor.b32 %r9736, %r9735, %r9730; + shf.l.wrap.b32 %r9737, %r9736, %r9736, 24; + add.s32 %r9738, %r9737, %r9731; + xor.b32 %r9739, %r9738, %r9733; + shf.l.wrap.b32 %r9740, %r9739, %r9739, 25; + add.s32 %r9741, %r9693, %r8898; + add.s32 %r9742, %r9741, %r9656; + xor.b32 %r9743, %r9742, %r9681; + shf.l.wrap.b32 %r9744, %r9743, %r9743, 16; + add.s32 %r9745, %r9744, %r9668; + xor.b32 %r9746, %r9745, %r9656; + shf.l.wrap.b32 %r9747, %r9746, %r9746, 20; + add.s32 %r9748, %r9742, %r8940; + add.s32 %r9749, %r9748, %r9747; + xor.b32 %r9750, %r9749, %r9744; + shf.l.wrap.b32 %r9751, %r9750, %r9750, 24; + add.s32 %r9752, %r9751, %r9745; + xor.b32 %r9753, %r9752, 
%r9747; + shf.l.wrap.b32 %r9754, %r9753, %r9753, 25; + xor.b32 %r9755, %r9738, %r9707; + st.local.u32 [%rd3+-104], %r9755; + xor.b32 %r9756, %r9752, %r9721; + st.local.u32 [%rd3+-100], %r9756; + xor.b32 %r9757, %r9710, %r9735; + st.local.u32 [%rd3+-96], %r9757; + xor.b32 %r9758, %r9724, %r9749; + st.local.u32 [%rd3+-92], %r9758; + xor.b32 %r9759, %r9754, %r9723; + st.local.u32 [%rd3+-88], %r9759; + xor.b32 %r9760, %r9712, %r9737; + st.local.u32 [%rd3+-84], %r9760; + xor.b32 %r9761, %r9726, %r9751; + st.local.u32 [%rd3+-80], %r9761; + xor.b32 %r9762, %r9740, %r9709; + st.local.u32 [%rd3+-76], %r9762; + add.s16 %rs340, %rs335, 1; + st.local.v2.u8 [%rd3], {%rs390, %rs340}; + +$L__BB1_56: + add.s64 %rd207, %rd78, %rd263; + st.local.u8 [%rd207], %rs390; + add.s64 %rd263, %rd263, 1; + setp.lt.u64 %p46, %rd263, 64; + mov.u64 %rd270, %rd77; + @%p46 bra $L__BB1_56; + +$L__BB1_57: + setp.gt.u64 %p47, %rd270, 64; + @%p47 bra $L__BB1_59; + bra.uni $L__BB1_58; + +$L__BB1_59: + ld.local.u8 %rs95, [%rd3+2]; + ld.local.u8 %rs391, [%rd3+1]; + ld.local.u32 %r11689, [%rd3+-104]; + ld.local.u32 %r11688, [%rd3+-100]; + ld.local.u32 %r11687, [%rd3+-96]; + ld.local.u32 %r11686, [%rd3+-92]; + ld.local.u32 %r11685, [%rd3+-88]; + ld.local.u32 %r11684, [%rd3+-84]; + ld.local.u32 %r11683, [%rd3+-80]; + ld.local.u32 %r11682, [%rd3+-76]; + ld.local.u64 %rd268, [%rd3+-72]; + cvt.u32.u64 %r117, %rd268; + shr.u64 %rd208, %rd268, 32; + cvt.u32.u64 %r118, %rd208; + +$L__BB1_60: + and.b16 %rs342, %rs391, 255; + setp.eq.s16 %p48, %rs342, 0; + selp.u16 %rs343, 1, 0, %p48; + or.b16 %rs344, %rs95, %rs343; + ld.local.u8 %r9763, [%rd260]; + ld.local.u8 %r9764, [%rd260+1]; + prmt.b32 %r9765, %r9764, %r9763, 30212; + ld.local.u8 %r9766, [%rd260+2]; + prmt.b32 %r9767, %r9766, %r9765, 28756; + ld.local.u8 %r9768, [%rd260+3]; + prmt.b32 %r9769, %r9768, %r9767, 1620; + ld.local.u8 %r9770, [%rd260+4]; + ld.local.u8 %r9771, [%rd260+5]; + prmt.b32 %r9772, %r9771, %r9770, 30212; + ld.local.u8 %r9773, [%rd260+6]; + prmt.b32 %r9774, %r9773, %r9772, 28756; + ld.local.u8 %r9775, [%rd260+7]; + prmt.b32 %r9776, %r9775, %r9774, 1620; + ld.local.u8 %r9777, [%rd260+8]; + ld.local.u8 %r9778, [%rd260+9]; + prmt.b32 %r9779, %r9778, %r9777, 30212; + ld.local.u8 %r9780, [%rd260+10]; + prmt.b32 %r9781, %r9780, %r9779, 28756; + ld.local.u8 %r9782, [%rd260+11]; + prmt.b32 %r9783, %r9782, %r9781, 1620; + ld.local.u8 %r9784, [%rd260+12]; + ld.local.u8 %r9785, [%rd260+13]; + prmt.b32 %r9786, %r9785, %r9784, 30212; + ld.local.u8 %r9787, [%rd260+14]; + prmt.b32 %r9788, %r9787, %r9786, 28756; + ld.local.u8 %r9789, [%rd260+15]; + prmt.b32 %r9790, %r9789, %r9788, 1620; + ld.local.u8 %r9791, [%rd260+16]; + ld.local.u8 %r9792, [%rd260+17]; + prmt.b32 %r9793, %r9792, %r9791, 30212; + ld.local.u8 %r9794, [%rd260+18]; + prmt.b32 %r9795, %r9794, %r9793, 28756; + ld.local.u8 %r9796, [%rd260+19]; + prmt.b32 %r9797, %r9796, %r9795, 1620; + ld.local.u8 %r9798, [%rd260+20]; + ld.local.u8 %r9799, [%rd260+21]; + prmt.b32 %r9800, %r9799, %r9798, 30212; + ld.local.u8 %r9801, [%rd260+22]; + prmt.b32 %r9802, %r9801, %r9800, 28756; + ld.local.u8 %r9803, [%rd260+23]; + prmt.b32 %r9804, %r9803, %r9802, 1620; + ld.local.u8 %r9805, [%rd260+24]; + ld.local.u8 %r9806, [%rd260+25]; + prmt.b32 %r9807, %r9806, %r9805, 30212; + ld.local.u8 %r9808, [%rd260+26]; + prmt.b32 %r9809, %r9808, %r9807, 28756; + ld.local.u8 %r9810, [%rd260+27]; + prmt.b32 %r9811, %r9810, %r9809, 1620; + ld.local.u8 %r9812, [%rd260+28]; + ld.local.u8 %r9813, [%rd260+29]; + prmt.b32 %r9814, %r9813, %r9812, 30212; + 
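+ // annotation: $L__BB1_60 consumes full 64-byte blocks. Each
+ // ld.local.u8 / prmt.b32 sequence here gathers four consecutive bytes
+ // into one little-endian u32 message word; the selectors 30212, 28756
+ // and 1620 (0x7604, 0x7054, 0x0654) splice in one byte per step. Just
+ // above, the eight xor.b32 + st.local.u32 pairs are the feed-forward
+ // that folds the two state halves into the new chaining value.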
ld.local.u8 %r9815, [%rd260+30]; + prmt.b32 %r9816, %r9815, %r9814, 28756; + ld.local.u8 %r9817, [%rd260+31]; + prmt.b32 %r9818, %r9817, %r9816, 1620; + ld.local.u8 %r9819, [%rd260+32]; + ld.local.u8 %r9820, [%rd260+33]; + prmt.b32 %r9821, %r9820, %r9819, 30212; + ld.local.u8 %r9822, [%rd260+34]; + prmt.b32 %r9823, %r9822, %r9821, 28756; + ld.local.u8 %r9824, [%rd260+35]; + prmt.b32 %r9825, %r9824, %r9823, 1620; + ld.local.u8 %r9826, [%rd260+36]; + ld.local.u8 %r9827, [%rd260+37]; + prmt.b32 %r9828, %r9827, %r9826, 30212; + ld.local.u8 %r9829, [%rd260+38]; + prmt.b32 %r9830, %r9829, %r9828, 28756; + ld.local.u8 %r9831, [%rd260+39]; + prmt.b32 %r9832, %r9831, %r9830, 1620; + ld.local.u8 %r9833, [%rd260+40]; + ld.local.u8 %r9834, [%rd260+41]; + prmt.b32 %r9835, %r9834, %r9833, 30212; + ld.local.u8 %r9836, [%rd260+42]; + prmt.b32 %r9837, %r9836, %r9835, 28756; + ld.local.u8 %r9838, [%rd260+43]; + prmt.b32 %r9839, %r9838, %r9837, 1620; + ld.local.u8 %r9840, [%rd260+44]; + ld.local.u8 %r9841, [%rd260+45]; + prmt.b32 %r9842, %r9841, %r9840, 30212; + ld.local.u8 %r9843, [%rd260+46]; + prmt.b32 %r9844, %r9843, %r9842, 28756; + ld.local.u8 %r9845, [%rd260+47]; + prmt.b32 %r9846, %r9845, %r9844, 1620; + ld.local.u8 %r9847, [%rd260+48]; + ld.local.u8 %r9848, [%rd260+49]; + prmt.b32 %r9849, %r9848, %r9847, 30212; + ld.local.u8 %r9850, [%rd260+50]; + prmt.b32 %r9851, %r9850, %r9849, 28756; + ld.local.u8 %r9852, [%rd260+51]; + prmt.b32 %r9853, %r9852, %r9851, 1620; + ld.local.u8 %r9854, [%rd260+52]; + ld.local.u8 %r9855, [%rd260+53]; + prmt.b32 %r9856, %r9855, %r9854, 30212; + ld.local.u8 %r9857, [%rd260+54]; + prmt.b32 %r9858, %r9857, %r9856, 28756; + ld.local.u8 %r9859, [%rd260+55]; + prmt.b32 %r9860, %r9859, %r9858, 1620; + ld.local.u8 %r9861, [%rd260+56]; + ld.local.u8 %r9862, [%rd260+57]; + prmt.b32 %r9863, %r9862, %r9861, 30212; + ld.local.u8 %r9864, [%rd260+58]; + prmt.b32 %r9865, %r9864, %r9863, 28756; + ld.local.u8 %r9866, [%rd260+59]; + prmt.b32 %r9867, %r9866, %r9865, 1620; + ld.local.u8 %r9868, [%rd260+60]; + ld.local.u8 %r9869, [%rd260+61]; + prmt.b32 %r9870, %r9869, %r9868, 30212; + ld.local.u8 %r9871, [%rd260+62]; + prmt.b32 %r9872, %r9871, %r9870, 28756; + ld.local.u8 %r9873, [%rd260+63]; + prmt.b32 %r9874, %r9873, %r9872, 1620; + cvt.u32.u16 %r9875, %rs344; + and.b32 %r9876, %r9875, 255; + add.s32 %r9877, %r11689, %r11685; + add.s32 %r9878, %r9877, %r9769; + xor.b32 %r9879, %r9878, %r117; + shf.l.wrap.b32 %r9880, %r9879, %r9879, 16; + add.s32 %r9881, %r9880, 1779033703; + xor.b32 %r9882, %r9881, %r11685; + shf.l.wrap.b32 %r9883, %r9882, %r9882, 20; + add.s32 %r9884, %r9776, %r9878; + add.s32 %r9885, %r9884, %r9883; + xor.b32 %r9886, %r9885, %r9880; + shf.l.wrap.b32 %r9887, %r9886, %r9886, 24; + add.s32 %r9888, %r9887, %r9881; + xor.b32 %r9889, %r9888, %r9883; + shf.l.wrap.b32 %r9890, %r9889, %r9889, 25; + add.s32 %r9891, %r11688, %r11684; + add.s32 %r9892, %r9891, %r9783; + xor.b32 %r9893, %r9892, %r118; + shf.l.wrap.b32 %r9894, %r9893, %r9893, 16; + add.s32 %r9895, %r9894, -1150833019; + xor.b32 %r9896, %r9895, %r11684; + shf.l.wrap.b32 %r9897, %r9896, %r9896, 20; + add.s32 %r9898, %r9790, %r9892; + add.s32 %r9899, %r9898, %r9897; + xor.b32 %r9900, %r9899, %r9894; + shf.l.wrap.b32 %r9901, %r9900, %r9900, 24; + add.s32 %r9902, %r9901, %r9895; + xor.b32 %r9903, %r9902, %r9897; + shf.l.wrap.b32 %r9904, %r9903, %r9903, 25; + add.s32 %r9905, %r11687, %r11683; + add.s32 %r9906, %r9905, %r9797; + shr.u32 %r9907, %r9906, 16; + shl.b32 %r9908, %r9906, 16; + xor.b32 %r9909, %r9908, 4194304; + 
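+ // annotation: 1779033703, -1150833019, 1013904242 and -1521486534 are
+ // the BLAKE3 IV words IV[0..3] (0x6A09E667, 0xBB67AE85, 0x3C6EF372,
+ // 0xA54FF53A) loaded into state rows 8..11; %r117/%r118 appear to be
+ // the low/high halves of the 64-bit chunk counter t. The constant
+ // 4194304 is 64 << 16: the 64-byte block length XORed into its lane
+ // with the rotate-by-16 already folded in.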
or.b32 %r9910, %r9909, %r9907; + add.s32 %r9911, %r9910, 1013904242; + xor.b32 %r9912, %r9911, %r11683; + shf.l.wrap.b32 %r9913, %r9912, %r9912, 20; + add.s32 %r9914, %r9804, %r9906; + add.s32 %r9915, %r9914, %r9913; + xor.b32 %r9916, %r9915, %r9910; + shf.l.wrap.b32 %r9917, %r9916, %r9916, 24; + add.s32 %r9918, %r9917, %r9911; + xor.b32 %r9919, %r9918, %r9913; + shf.l.wrap.b32 %r9920, %r9919, %r9919, 25; + add.s32 %r9921, %r11686, %r11682; + add.s32 %r9922, %r9921, %r9811; + xor.b32 %r9923, %r9922, %r9876; + shr.u32 %r9924, %r9922, 16; + shl.b32 %r9925, %r9923, 16; + or.b32 %r9926, %r9925, %r9924; + add.s32 %r9927, %r9926, -1521486534; + xor.b32 %r9928, %r9927, %r11682; + shf.l.wrap.b32 %r9929, %r9928, %r9928, 20; + add.s32 %r9930, %r9818, %r9922; + add.s32 %r9931, %r9930, %r9929; + xor.b32 %r9932, %r9931, %r9926; + shf.l.wrap.b32 %r9933, %r9932, %r9932, 24; + add.s32 %r9934, %r9933, %r9927; + xor.b32 %r9935, %r9934, %r9929; + shf.l.wrap.b32 %r9936, %r9935, %r9935, 25; + add.s32 %r9937, %r9904, %r9885; + add.s32 %r9938, %r9937, %r9825; + xor.b32 %r9939, %r9933, %r9938; + shf.l.wrap.b32 %r9940, %r9939, %r9939, 16; + add.s32 %r9941, %r9940, %r9918; + xor.b32 %r9942, %r9941, %r9904; + shf.l.wrap.b32 %r9943, %r9942, %r9942, 20; + add.s32 %r9944, %r9832, %r9938; + add.s32 %r9945, %r9944, %r9943; + xor.b32 %r9946, %r9945, %r9940; + shf.l.wrap.b32 %r9947, %r9946, %r9946, 24; + add.s32 %r9948, %r9947, %r9941; + xor.b32 %r9949, %r9948, %r9943; + shf.l.wrap.b32 %r9950, %r9949, %r9949, 25; + add.s32 %r9951, %r9920, %r9899; + add.s32 %r9952, %r9951, %r9839; + xor.b32 %r9953, %r9952, %r9887; + shf.l.wrap.b32 %r9954, %r9953, %r9953, 16; + add.s32 %r9955, %r9954, %r9934; + xor.b32 %r9956, %r9955, %r9920; + shf.l.wrap.b32 %r9957, %r9956, %r9956, 20; + add.s32 %r9958, %r9846, %r9952; + add.s32 %r9959, %r9958, %r9957; + xor.b32 %r9960, %r9959, %r9954; + shf.l.wrap.b32 %r9961, %r9960, %r9960, 24; + add.s32 %r9962, %r9961, %r9955; + xor.b32 %r9963, %r9962, %r9957; + shf.l.wrap.b32 %r9964, %r9963, %r9963, 25; + add.s32 %r9965, %r9936, %r9915; + add.s32 %r9966, %r9965, %r9853; + xor.b32 %r9967, %r9966, %r9901; + shf.l.wrap.b32 %r9968, %r9967, %r9967, 16; + add.s32 %r9969, %r9968, %r9888; + xor.b32 %r9970, %r9969, %r9936; + shf.l.wrap.b32 %r9971, %r9970, %r9970, 20; + add.s32 %r9972, %r9860, %r9966; + add.s32 %r9973, %r9972, %r9971; + xor.b32 %r9974, %r9973, %r9968; + shf.l.wrap.b32 %r9975, %r9974, %r9974, 24; + add.s32 %r9976, %r9975, %r9969; + xor.b32 %r9977, %r9976, %r9971; + shf.l.wrap.b32 %r9978, %r9977, %r9977, 25; + add.s32 %r9979, %r9931, %r9890; + add.s32 %r9980, %r9979, %r9867; + xor.b32 %r9981, %r9980, %r9917; + shf.l.wrap.b32 %r9982, %r9981, %r9981, 16; + add.s32 %r9983, %r9982, %r9902; + xor.b32 %r9984, %r9983, %r9890; + shf.l.wrap.b32 %r9985, %r9984, %r9984, 20; + add.s32 %r9986, %r9874, %r9980; + add.s32 %r9987, %r9986, %r9985; + xor.b32 %r9988, %r9987, %r9982; + shf.l.wrap.b32 %r9989, %r9988, %r9988, 24; + add.s32 %r9990, %r9989, %r9983; + xor.b32 %r9991, %r9990, %r9985; + shf.l.wrap.b32 %r9992, %r9991, %r9991, 25; + add.s32 %r9993, %r9945, %r9783; + add.s32 %r9994, %r9993, %r9992; + xor.b32 %r9995, %r9994, %r9961; + shf.l.wrap.b32 %r9996, %r9995, %r9995, 16; + add.s32 %r9997, %r9996, %r9976; + xor.b32 %r9998, %r9997, %r9992; + shf.l.wrap.b32 %r9999, %r9998, %r9998, 20; + add.s32 %r10000, %r9994, %r9811; + add.s32 %r10001, %r10000, %r9999; + xor.b32 %r10002, %r10001, %r9996; + shf.l.wrap.b32 %r10003, %r10002, %r10002, 24; + add.s32 %r10004, %r10003, %r9997; + xor.b32 %r10005, %r10004, %r9999; + 
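+ // annotation: the flags lane %r9876 XORed in above is computed at the
+ // top of $L__BB1_60: the stored flags byte is OR'd with 1 (which
+ // matches BLAKE3's CHUNK_START flag) while the per-chunk block
+ // counter %rs391 is still zero.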
shf.l.wrap.b32 %r10006, %r10005, %r10005, 25; + add.s32 %r10007, %r9959, %r9790; + add.s32 %r10008, %r10007, %r9950; + xor.b32 %r10009, %r9975, %r10008; + shf.l.wrap.b32 %r10010, %r10009, %r10009, 16; + add.s32 %r10011, %r9990, %r10010; + xor.b32 %r10012, %r10011, %r9950; + shf.l.wrap.b32 %r10013, %r10012, %r10012, 20; + add.s32 %r10014, %r10008, %r9839; + add.s32 %r10015, %r10014, %r10013; + xor.b32 %r10016, %r10015, %r10010; + shf.l.wrap.b32 %r10017, %r10016, %r10016, 24; + add.s32 %r10018, %r10017, %r10011; + xor.b32 %r10019, %r10018, %r10013; + shf.l.wrap.b32 %r10020, %r10019, %r10019, 25; + add.s32 %r10021, %r9964, %r9818; + add.s32 %r10022, %r10021, %r9973; + xor.b32 %r10023, %r9989, %r10022; + shf.l.wrap.b32 %r10024, %r10023, %r10023, 16; + add.s32 %r10025, %r10024, %r9948; + xor.b32 %r10026, %r10025, %r9964; + shf.l.wrap.b32 %r10027, %r10026, %r10026, 20; + add.s32 %r10028, %r10022, %r9769; + add.s32 %r10029, %r10028, %r10027; + xor.b32 %r10030, %r10029, %r10024; + shf.l.wrap.b32 %r10031, %r10030, %r10030, 24; + add.s32 %r10032, %r10031, %r10025; + xor.b32 %r10033, %r10032, %r10027; + shf.l.wrap.b32 %r10034, %r10033, %r10033, 25; + add.s32 %r10035, %r9978, %r9797; + add.s32 %r10036, %r10035, %r9987; + xor.b32 %r10037, %r10036, %r9947; + shf.l.wrap.b32 %r10038, %r10037, %r10037, 16; + add.s32 %r10039, %r10038, %r9962; + xor.b32 %r10040, %r10039, %r9978; + shf.l.wrap.b32 %r10041, %r10040, %r10040, 20; + add.s32 %r10042, %r10036, %r9860; + add.s32 %r10043, %r10042, %r10041; + xor.b32 %r10044, %r10043, %r10038; + shf.l.wrap.b32 %r10045, %r10044, %r10044, 24; + add.s32 %r10046, %r10045, %r10039; + xor.b32 %r10047, %r10046, %r10041; + shf.l.wrap.b32 %r10048, %r10047, %r10047, 25; + add.s32 %r10049, %r10001, %r9776; + add.s32 %r10050, %r10049, %r10020; + xor.b32 %r10051, %r10050, %r10045; + shf.l.wrap.b32 %r10052, %r10051, %r10051, 16; + add.s32 %r10053, %r10052, %r10032; + xor.b32 %r10054, %r10053, %r10020; + shf.l.wrap.b32 %r10055, %r10054, %r10054, 20; + add.s32 %r10056, %r10050, %r9846; + add.s32 %r10057, %r10056, %r10055; + xor.b32 %r10058, %r10057, %r10052; + shf.l.wrap.b32 %r10059, %r10058, %r10058, 24; + add.s32 %r10060, %r10059, %r10053; + xor.b32 %r10061, %r10060, %r10055; + shf.l.wrap.b32 %r10062, %r10061, %r10061, 25; + add.s32 %r10063, %r10015, %r9853; + add.s32 %r10064, %r10063, %r10034; + xor.b32 %r10065, %r10064, %r10003; + shf.l.wrap.b32 %r10066, %r10065, %r10065, 16; + add.s32 %r10067, %r10066, %r10046; + xor.b32 %r10068, %r10067, %r10034; + shf.l.wrap.b32 %r10069, %r10068, %r10068, 20; + add.s32 %r10070, %r10064, %r9804; + add.s32 %r10071, %r10070, %r10069; + xor.b32 %r10072, %r10071, %r10066; + shf.l.wrap.b32 %r10073, %r10072, %r10072, 24; + add.s32 %r10074, %r10073, %r10067; + xor.b32 %r10075, %r10074, %r10069; + shf.l.wrap.b32 %r10076, %r10075, %r10075, 25; + add.s32 %r10077, %r10029, %r9832; + add.s32 %r10078, %r10077, %r10048; + xor.b32 %r10079, %r10078, %r10017; + shf.l.wrap.b32 %r10080, %r10079, %r10079, 16; + add.s32 %r10081, %r10080, %r10004; + xor.b32 %r10082, %r10081, %r10048; + shf.l.wrap.b32 %r10083, %r10082, %r10082, 20; + add.s32 %r10084, %r10078, %r9867; + add.s32 %r10085, %r10084, %r10083; + xor.b32 %r10086, %r10085, %r10080; + shf.l.wrap.b32 %r10087, %r10086, %r10086, 24; + add.s32 %r10088, %r10087, %r10081; + xor.b32 %r10089, %r10088, %r10083; + shf.l.wrap.b32 %r10090, %r10089, %r10089, 25; + add.s32 %r10091, %r10043, %r9874; + add.s32 %r10092, %r10091, %r10006; + xor.b32 %r10093, %r10092, %r10031; + shf.l.wrap.b32 %r10094, %r10093, %r10093, 16; + 
add.s32 %r10095, %r10094, %r10018; + xor.b32 %r10096, %r10095, %r10006; + shf.l.wrap.b32 %r10097, %r10096, %r10096, 20; + add.s32 %r10098, %r10092, %r9825; + add.s32 %r10099, %r10098, %r10097; + xor.b32 %r10100, %r10099, %r10094; + shf.l.wrap.b32 %r10101, %r10100, %r10100, 24; + add.s32 %r10102, %r10101, %r10095; + xor.b32 %r10103, %r10102, %r10097; + shf.l.wrap.b32 %r10104, %r10103, %r10103, 25; + add.s32 %r10105, %r10057, %r9790; + add.s32 %r10106, %r10105, %r10104; + xor.b32 %r10107, %r10106, %r10073; + shf.l.wrap.b32 %r10108, %r10107, %r10107, 16; + add.s32 %r10109, %r10108, %r10088; + xor.b32 %r10110, %r10109, %r10104; + shf.l.wrap.b32 %r10111, %r10110, %r10110, 20; + add.s32 %r10112, %r10106, %r9797; + add.s32 %r10113, %r10112, %r10111; + xor.b32 %r10114, %r10113, %r10108; + shf.l.wrap.b32 %r10115, %r10114, %r10114, 24; + add.s32 %r10116, %r10115, %r10109; + xor.b32 %r10117, %r10116, %r10111; + shf.l.wrap.b32 %r10118, %r10117, %r10117, 25; + add.s32 %r10119, %r10071, %r9839; + add.s32 %r10120, %r10119, %r10062; + xor.b32 %r10121, %r10120, %r10087; + shf.l.wrap.b32 %r10122, %r10121, %r10121, 16; + add.s32 %r10123, %r10122, %r10102; + xor.b32 %r10124, %r10123, %r10062; + shf.l.wrap.b32 %r10125, %r10124, %r10124, 20; + add.s32 %r10126, %r10120, %r9853; + add.s32 %r10127, %r10126, %r10125; + xor.b32 %r10128, %r10127, %r10122; + shf.l.wrap.b32 %r10129, %r10128, %r10128, 24; + add.s32 %r10130, %r10129, %r10123; + xor.b32 %r10131, %r10130, %r10125; + shf.l.wrap.b32 %r10132, %r10131, %r10131, 25; + add.s32 %r10133, %r10085, %r9860; + add.s32 %r10134, %r10133, %r10076; + xor.b32 %r10135, %r10134, %r10101; + shf.l.wrap.b32 %r10136, %r10135, %r10135, 16; + add.s32 %r10137, %r10136, %r10060; + xor.b32 %r10138, %r10137, %r10076; + shf.l.wrap.b32 %r10139, %r10138, %r10138, 20; + add.s32 %r10140, %r10134, %r9783; + add.s32 %r10141, %r10140, %r10139; + xor.b32 %r10142, %r10141, %r10136; + shf.l.wrap.b32 %r10143, %r10142, %r10142, 24; + add.s32 %r10144, %r10143, %r10137; + xor.b32 %r10145, %r10144, %r10139; + shf.l.wrap.b32 %r10146, %r10145, %r10145, 25; + add.s32 %r10147, %r10099, %r9818; + add.s32 %r10148, %r10147, %r10090; + xor.b32 %r10149, %r10148, %r10059; + shf.l.wrap.b32 %r10150, %r10149, %r10149, 16; + add.s32 %r10151, %r10150, %r10074; + xor.b32 %r10152, %r10151, %r10090; + shf.l.wrap.b32 %r10153, %r10152, %r10152, 20; + add.s32 %r10154, %r10148, %r9867; + add.s32 %r10155, %r10154, %r10153; + xor.b32 %r10156, %r10155, %r10150; + shf.l.wrap.b32 %r10157, %r10156, %r10156, 24; + add.s32 %r10158, %r10157, %r10151; + xor.b32 %r10159, %r10158, %r10153; + shf.l.wrap.b32 %r10160, %r10159, %r10159, 25; + add.s32 %r10161, %r10113, %r9811; + add.s32 %r10162, %r10161, %r10132; + xor.b32 %r10163, %r10162, %r10157; + shf.l.wrap.b32 %r10164, %r10163, %r10163, 16; + add.s32 %r10165, %r10164, %r10144; + xor.b32 %r10166, %r10165, %r10132; + shf.l.wrap.b32 %r10167, %r10166, %r10166, 20; + add.s32 %r10168, %r10162, %r9804; + add.s32 %r10169, %r10168, %r10167; + xor.b32 %r10170, %r10169, %r10164; + shf.l.wrap.b32 %r10171, %r10170, %r10170, 24; + add.s32 %r10172, %r10171, %r10165; + xor.b32 %r10173, %r10172, %r10167; + shf.l.wrap.b32 %r10174, %r10173, %r10173, 25; + add.s32 %r10175, %r10127, %r9832; + add.s32 %r10176, %r10175, %r10146; + xor.b32 %r10177, %r10176, %r10115; + shf.l.wrap.b32 %r10178, %r10177, %r10177, 16; + add.s32 %r10179, %r10178, %r10158; + xor.b32 %r10180, %r10179, %r10146; + shf.l.wrap.b32 %r10181, %r10180, %r10180, 20; + add.s32 %r10182, %r10176, %r9769; + add.s32 %r10183, %r10182, %r10181; + 
xor.b32 %r10184, %r10183, %r10178; + shf.l.wrap.b32 %r10185, %r10184, %r10184, 24; + add.s32 %r10186, %r10185, %r10179; + xor.b32 %r10187, %r10186, %r10181; + shf.l.wrap.b32 %r10188, %r10187, %r10187, 25; + add.s32 %r10189, %r10141, %r9846; + add.s32 %r10190, %r10189, %r10160; + xor.b32 %r10191, %r10190, %r10129; + shf.l.wrap.b32 %r10192, %r10191, %r10191, 16; + add.s32 %r10193, %r10192, %r10116; + xor.b32 %r10194, %r10193, %r10160; + shf.l.wrap.b32 %r10195, %r10194, %r10194, 20; + add.s32 %r10196, %r10190, %r9874; + add.s32 %r10197, %r10196, %r10195; + xor.b32 %r10198, %r10197, %r10192; + shf.l.wrap.b32 %r10199, %r10198, %r10198, 24; + add.s32 %r10200, %r10199, %r10193; + xor.b32 %r10201, %r10200, %r10195; + shf.l.wrap.b32 %r10202, %r10201, %r10201, 25; + add.s32 %r10203, %r10155, %r9825; + add.s32 %r10204, %r10203, %r10118; + xor.b32 %r10205, %r10204, %r10143; + shf.l.wrap.b32 %r10206, %r10205, %r10205, 16; + add.s32 %r10207, %r10206, %r10130; + xor.b32 %r10208, %r10207, %r10118; + shf.l.wrap.b32 %r10209, %r10208, %r10208, 20; + add.s32 %r10210, %r10204, %r9776; + add.s32 %r10211, %r10210, %r10209; + xor.b32 %r10212, %r10211, %r10206; + shf.l.wrap.b32 %r10213, %r10212, %r10212, 24; + add.s32 %r10214, %r10213, %r10207; + xor.b32 %r10215, %r10214, %r10209; + shf.l.wrap.b32 %r10216, %r10215, %r10215, 25; + add.s32 %r10217, %r10169, %r9839; + add.s32 %r10218, %r10217, %r10216; + xor.b32 %r10219, %r10218, %r10185; + shf.l.wrap.b32 %r10220, %r10219, %r10219, 16; + add.s32 %r10221, %r10220, %r10200; + xor.b32 %r10222, %r10221, %r10216; + shf.l.wrap.b32 %r10223, %r10222, %r10222, 20; + add.s32 %r10224, %r10218, %r9818; + add.s32 %r10225, %r10224, %r10223; + xor.b32 %r10226, %r10225, %r10220; + shf.l.wrap.b32 %r10227, %r10226, %r10226, 24; + add.s32 %r10228, %r10227, %r10221; + xor.b32 %r10229, %r10228, %r10223; + shf.l.wrap.b32 %r10230, %r10229, %r10229, 25; + add.s32 %r10231, %r10183, %r9853; + add.s32 %r10232, %r10231, %r10174; + xor.b32 %r10233, %r10232, %r10199; + shf.l.wrap.b32 %r10234, %r10233, %r10233, 16; + add.s32 %r10235, %r10234, %r10214; + xor.b32 %r10236, %r10235, %r10174; + shf.l.wrap.b32 %r10237, %r10236, %r10236, 20; + add.s32 %r10238, %r10232, %r9832; + add.s32 %r10239, %r10238, %r10237; + xor.b32 %r10240, %r10239, %r10234; + shf.l.wrap.b32 %r10241, %r10240, %r10240, 24; + add.s32 %r10242, %r10241, %r10235; + xor.b32 %r10243, %r10242, %r10237; + shf.l.wrap.b32 %r10244, %r10243, %r10243, 25; + add.s32 %r10245, %r10197, %r9867; + add.s32 %r10246, %r10245, %r10188; + xor.b32 %r10247, %r10246, %r10213; + shf.l.wrap.b32 %r10248, %r10247, %r10247, 16; + add.s32 %r10249, %r10248, %r10172; + xor.b32 %r10250, %r10249, %r10188; + shf.l.wrap.b32 %r10251, %r10250, %r10250, 20; + add.s32 %r10252, %r10246, %r9790; + add.s32 %r10253, %r10252, %r10251; + xor.b32 %r10254, %r10253, %r10248; + shf.l.wrap.b32 %r10255, %r10254, %r10254, 24; + add.s32 %r10256, %r10255, %r10249; + xor.b32 %r10257, %r10256, %r10251; + shf.l.wrap.b32 %r10258, %r10257, %r10257, 25; + add.s32 %r10259, %r10211, %r9860; + add.s32 %r10260, %r10259, %r10202; + xor.b32 %r10261, %r10260, %r10171; + shf.l.wrap.b32 %r10262, %r10261, %r10261, 16; + add.s32 %r10263, %r10262, %r10186; + xor.b32 %r10264, %r10263, %r10202; + shf.l.wrap.b32 %r10265, %r10264, %r10264, 20; + add.s32 %r10266, %r10260, %r9874; + add.s32 %r10267, %r10266, %r10265; + xor.b32 %r10268, %r10267, %r10262; + shf.l.wrap.b32 %r10269, %r10268, %r10268, 24; + add.s32 %r10270, %r10269, %r10263; + xor.b32 %r10271, %r10270, %r10265; + shf.l.wrap.b32 %r10272, %r10271, 
%r10271, 25; + add.s32 %r10273, %r10225, %r9797; + add.s32 %r10274, %r10273, %r10244; + xor.b32 %r10275, %r10274, %r10269; + shf.l.wrap.b32 %r10276, %r10275, %r10275, 16; + add.s32 %r10277, %r10276, %r10256; + xor.b32 %r10278, %r10277, %r10244; + shf.l.wrap.b32 %r10279, %r10278, %r10278, 20; + add.s32 %r10280, %r10274, %r9769; + add.s32 %r10281, %r10280, %r10279; + xor.b32 %r10282, %r10281, %r10276; + shf.l.wrap.b32 %r10283, %r10282, %r10282, 24; + add.s32 %r10284, %r10283, %r10277; + xor.b32 %r10285, %r10284, %r10279; + shf.l.wrap.b32 %r10286, %r10285, %r10285, 25; + add.s32 %r10287, %r10239, %r9846; + add.s32 %r10288, %r10287, %r10258; + xor.b32 %r10289, %r10288, %r10227; + shf.l.wrap.b32 %r10290, %r10289, %r10289, 16; + add.s32 %r10291, %r10290, %r10270; + xor.b32 %r10292, %r10291, %r10258; + shf.l.wrap.b32 %r10293, %r10292, %r10292, 20; + add.s32 %r10294, %r10288, %r9783; + add.s32 %r10295, %r10294, %r10293; + xor.b32 %r10296, %r10295, %r10290; + shf.l.wrap.b32 %r10297, %r10296, %r10296, 24; + add.s32 %r10298, %r10297, %r10291; + xor.b32 %r10299, %r10298, %r10293; + shf.l.wrap.b32 %r10300, %r10299, %r10299, 25; + add.s32 %r10301, %r10253, %r9804; + add.s32 %r10302, %r10301, %r10272; + xor.b32 %r10303, %r10302, %r10241; + shf.l.wrap.b32 %r10304, %r10303, %r10303, 16; + add.s32 %r10305, %r10304, %r10228; + xor.b32 %r10306, %r10305, %r10272; + shf.l.wrap.b32 %r10307, %r10306, %r10306, 20; + add.s32 %r10308, %r10302, %r9825; + add.s32 %r10309, %r10308, %r10307; + xor.b32 %r10310, %r10309, %r10304; + shf.l.wrap.b32 %r10311, %r10310, %r10310, 24; + add.s32 %r10312, %r10311, %r10305; + xor.b32 %r10313, %r10312, %r10307; + shf.l.wrap.b32 %r10314, %r10313, %r10313, 25; + add.s32 %r10315, %r10267, %r9776; + add.s32 %r10316, %r10315, %r10230; + xor.b32 %r10317, %r10316, %r10255; + shf.l.wrap.b32 %r10318, %r10317, %r10317, 16; + add.s32 %r10319, %r10318, %r10242; + xor.b32 %r10320, %r10319, %r10230; + shf.l.wrap.b32 %r10321, %r10320, %r10320, 20; + add.s32 %r10322, %r10316, %r9811; + add.s32 %r10323, %r10322, %r10321; + xor.b32 %r10324, %r10323, %r10318; + shf.l.wrap.b32 %r10325, %r10324, %r10324, 24; + add.s32 %r10326, %r10325, %r10319; + xor.b32 %r10327, %r10326, %r10321; + shf.l.wrap.b32 %r10328, %r10327, %r10327, 25; + add.s32 %r10329, %r10281, %r9853; + add.s32 %r10330, %r10329, %r10328; + xor.b32 %r10331, %r10330, %r10297; + shf.l.wrap.b32 %r10332, %r10331, %r10331, 16; + add.s32 %r10333, %r10332, %r10312; + xor.b32 %r10334, %r10333, %r10328; + shf.l.wrap.b32 %r10335, %r10334, %r10334, 20; + add.s32 %r10336, %r10330, %r9860; + add.s32 %r10337, %r10336, %r10335; + xor.b32 %r10338, %r10337, %r10332; + shf.l.wrap.b32 %r10339, %r10338, %r10338, 24; + add.s32 %r10340, %r10339, %r10333; + xor.b32 %r10341, %r10340, %r10335; + shf.l.wrap.b32 %r10342, %r10341, %r10341, 25; + add.s32 %r10343, %r10295, %r9832; + add.s32 %r10344, %r10343, %r10286; + xor.b32 %r10345, %r10344, %r10311; + shf.l.wrap.b32 %r10346, %r10345, %r10345, 16; + add.s32 %r10347, %r10346, %r10326; + xor.b32 %r10348, %r10347, %r10286; + shf.l.wrap.b32 %r10349, %r10348, %r10348, 20; + add.s32 %r10350, %r10344, %r9846; + add.s32 %r10351, %r10350, %r10349; + xor.b32 %r10352, %r10351, %r10346; + shf.l.wrap.b32 %r10353, %r10352, %r10352, 24; + add.s32 %r10354, %r10353, %r10347; + xor.b32 %r10355, %r10354, %r10349; + shf.l.wrap.b32 %r10356, %r10355, %r10355, 25; + add.s32 %r10357, %r10309, %r9874; + add.s32 %r10358, %r10357, %r10300; + xor.b32 %r10359, %r10358, %r10325; + shf.l.wrap.b32 %r10360, %r10359, %r10359, 16; + add.s32 %r10361, 
%r10360, %r10284; + xor.b32 %r10362, %r10361, %r10300; + shf.l.wrap.b32 %r10363, %r10362, %r10362, 20; + add.s32 %r10364, %r10358, %r9839; + add.s32 %r10365, %r10364, %r10363; + xor.b32 %r10366, %r10365, %r10360; + shf.l.wrap.b32 %r10367, %r10366, %r10366, 24; + add.s32 %r10368, %r10367, %r10361; + xor.b32 %r10369, %r10368, %r10363; + shf.l.wrap.b32 %r10370, %r10369, %r10369, 25; + add.s32 %r10371, %r10323, %r9867; + add.s32 %r10372, %r10371, %r10314; + xor.b32 %r10373, %r10372, %r10283; + shf.l.wrap.b32 %r10374, %r10373, %r10373, 16; + add.s32 %r10375, %r10374, %r10298; + xor.b32 %r10376, %r10375, %r10314; + shf.l.wrap.b32 %r10377, %r10376, %r10376, 20; + add.s32 %r10378, %r10372, %r9825; + add.s32 %r10379, %r10378, %r10377; + xor.b32 %r10380, %r10379, %r10374; + shf.l.wrap.b32 %r10381, %r10380, %r10380, 24; + add.s32 %r10382, %r10381, %r10375; + xor.b32 %r10383, %r10382, %r10377; + shf.l.wrap.b32 %r10384, %r10383, %r10383, 25; + add.s32 %r10385, %r10337, %r9818; + add.s32 %r10386, %r10385, %r10356; + xor.b32 %r10387, %r10386, %r10381; + shf.l.wrap.b32 %r10388, %r10387, %r10387, 16; + add.s32 %r10389, %r10388, %r10368; + xor.b32 %r10390, %r10389, %r10356; + shf.l.wrap.b32 %r10391, %r10390, %r10390, 20; + add.s32 %r10392, %r10386, %r9783; + add.s32 %r10393, %r10392, %r10391; + xor.b32 %r10394, %r10393, %r10388; + shf.l.wrap.b32 %r10395, %r10394, %r10394, 24; + add.s32 %r10396, %r10395, %r10389; + xor.b32 %r10397, %r10396, %r10391; + shf.l.wrap.b32 %r10398, %r10397, %r10397, 25; + add.s32 %r10399, %r10351, %r9804; + add.s32 %r10400, %r10399, %r10370; + xor.b32 %r10401, %r10400, %r10339; + shf.l.wrap.b32 %r10402, %r10401, %r10401, 16; + add.s32 %r10403, %r10402, %r10382; + xor.b32 %r10404, %r10403, %r10370; + shf.l.wrap.b32 %r10405, %r10404, %r10404, 20; + add.s32 %r10406, %r10400, %r9790; + add.s32 %r10407, %r10406, %r10405; + xor.b32 %r10408, %r10407, %r10402; + shf.l.wrap.b32 %r10409, %r10408, %r10408, 24; + add.s32 %r10410, %r10409, %r10403; + xor.b32 %r10411, %r10410, %r10405; + shf.l.wrap.b32 %r10412, %r10411, %r10411, 25; + add.s32 %r10413, %r10365, %r9769; + add.s32 %r10414, %r10413, %r10384; + xor.b32 %r10415, %r10414, %r10353; + shf.l.wrap.b32 %r10416, %r10415, %r10415, 16; + add.s32 %r10417, %r10416, %r10340; + xor.b32 %r10418, %r10417, %r10384; + shf.l.wrap.b32 %r10419, %r10418, %r10418, 20; + add.s32 %r10420, %r10414, %r9776; + add.s32 %r10421, %r10420, %r10419; + xor.b32 %r10422, %r10421, %r10416; + shf.l.wrap.b32 %r10423, %r10422, %r10422, 24; + add.s32 %r10424, %r10423, %r10417; + xor.b32 %r10425, %r10424, %r10419; + shf.l.wrap.b32 %r10426, %r10425, %r10425, 25; + add.s32 %r10427, %r10379, %r9811; + add.s32 %r10428, %r10427, %r10342; + xor.b32 %r10429, %r10428, %r10367; + shf.l.wrap.b32 %r10430, %r10429, %r10429, 16; + add.s32 %r10431, %r10430, %r10354; + xor.b32 %r10432, %r10431, %r10342; + shf.l.wrap.b32 %r10433, %r10432, %r10432, 20; + add.s32 %r10434, %r10428, %r9797; + add.s32 %r10435, %r10434, %r10433; + xor.b32 %r10436, %r10435, %r10430; + shf.l.wrap.b32 %r10437, %r10436, %r10436, 24; + add.s32 %r10438, %r10437, %r10431; + xor.b32 %r10439, %r10438, %r10433; + shf.l.wrap.b32 %r10440, %r10439, %r10439, 25; + add.s32 %r10441, %r10393, %r9832; + add.s32 %r10442, %r10441, %r10440; + xor.b32 %r10443, %r10442, %r10409; + shf.l.wrap.b32 %r10444, %r10443, %r10443, 16; + add.s32 %r10445, %r10444, %r10424; + xor.b32 %r10446, %r10445, %r10440; + shf.l.wrap.b32 %r10447, %r10446, %r10446, 20; + add.s32 %r10448, %r10442, %r9867; + add.s32 %r10449, %r10448, %r10447; + xor.b32 %r10450, 
%r10449, %r10444; + shf.l.wrap.b32 %r10451, %r10450, %r10450, 24; + add.s32 %r10452, %r10451, %r10445; + xor.b32 %r10453, %r10452, %r10447; + shf.l.wrap.b32 %r10454, %r10453, %r10453, 25; + add.s32 %r10455, %r10407, %r9846; + add.s32 %r10456, %r10455, %r10398; + xor.b32 %r10457, %r10456, %r10423; + shf.l.wrap.b32 %r10458, %r10457, %r10457, 16; + add.s32 %r10459, %r10458, %r10438; + xor.b32 %r10460, %r10459, %r10398; + shf.l.wrap.b32 %r10461, %r10460, %r10460, 20; + add.s32 %r10462, %r10456, %r9804; + add.s32 %r10463, %r10462, %r10461; + xor.b32 %r10464, %r10463, %r10458; + shf.l.wrap.b32 %r10465, %r10464, %r10464, 24; + add.s32 %r10466, %r10465, %r10459; + xor.b32 %r10467, %r10466, %r10461; + shf.l.wrap.b32 %r10468, %r10467, %r10467, 25; + add.s32 %r10469, %r10421, %r9825; + add.s32 %r10470, %r10469, %r10412; + xor.b32 %r10471, %r10470, %r10437; + shf.l.wrap.b32 %r10472, %r10471, %r10471, 16; + add.s32 %r10473, %r10472, %r10396; + xor.b32 %r10474, %r10473, %r10412; + shf.l.wrap.b32 %r10475, %r10474, %r10474, 20; + add.s32 %r10476, %r10470, %r9853; + add.s32 %r10477, %r10476, %r10475; + xor.b32 %r10478, %r10477, %r10472; + shf.l.wrap.b32 %r10479, %r10478, %r10478, 24; + add.s32 %r10480, %r10479, %r10473; + xor.b32 %r10481, %r10480, %r10475; + shf.l.wrap.b32 %r10482, %r10481, %r10481, 25; + add.s32 %r10483, %r10435, %r9874; + add.s32 %r10484, %r10483, %r10426; + xor.b32 %r10485, %r10484, %r10395; + shf.l.wrap.b32 %r10486, %r10485, %r10485, 16; + add.s32 %r10487, %r10486, %r10410; + xor.b32 %r10488, %r10487, %r10426; + shf.l.wrap.b32 %r10489, %r10488, %r10488, 20; + add.s32 %r10490, %r10484, %r9776; + add.s32 %r10491, %r10490, %r10489; + xor.b32 %r10492, %r10491, %r10486; + shf.l.wrap.b32 %r10493, %r10492, %r10492, 24; + add.s32 %r10494, %r10493, %r10487; + xor.b32 %r10495, %r10494, %r10489; + shf.l.wrap.b32 %r10496, %r10495, %r10495, 25; + add.s32 %r10497, %r10449, %r9860; + add.s32 %r10498, %r10497, %r10468; + xor.b32 %r10499, %r10498, %r10493; + shf.l.wrap.b32 %r10500, %r10499, %r10499, 16; + add.s32 %r10501, %r10500, %r10480; + xor.b32 %r10502, %r10501, %r10468; + shf.l.wrap.b32 %r10503, %r10502, %r10502, 20; + add.s32 %r10504, %r10498, %r9790; + add.s32 %r10505, %r10504, %r10503; + xor.b32 %r10506, %r10505, %r10500; + shf.l.wrap.b32 %r10507, %r10506, %r10506, 24; + add.s32 %r10508, %r10507, %r10501; + xor.b32 %r10509, %r10508, %r10503; + shf.l.wrap.b32 %r10510, %r10509, %r10509, 25; + add.s32 %r10511, %r10463, %r9769; + add.s32 %r10512, %r10511, %r10482; + xor.b32 %r10513, %r10512, %r10451; + shf.l.wrap.b32 %r10514, %r10513, %r10513, 16; + add.s32 %r10515, %r10514, %r10494; + xor.b32 %r10516, %r10515, %r10482; + shf.l.wrap.b32 %r10517, %r10516, %r10516, 20; + add.s32 %r10518, %r10512, %r9839; + add.s32 %r10519, %r10518, %r10517; + xor.b32 %r10520, %r10519, %r10514; + shf.l.wrap.b32 %r10521, %r10520, %r10520, 24; + add.s32 %r10522, %r10521, %r10515; + xor.b32 %r10523, %r10522, %r10517; + shf.l.wrap.b32 %r10524, %r10523, %r10523, 25; + add.s32 %r10525, %r10477, %r9783; + add.s32 %r10526, %r10525, %r10496; + xor.b32 %r10527, %r10526, %r10465; + shf.l.wrap.b32 %r10528, %r10527, %r10527, 16; + add.s32 %r10529, %r10528, %r10452; + xor.b32 %r10530, %r10529, %r10496; + shf.l.wrap.b32 %r10531, %r10530, %r10530, 20; + add.s32 %r10532, %r10526, %r9811; + add.s32 %r10533, %r10532, %r10531; + xor.b32 %r10534, %r10533, %r10528; + shf.l.wrap.b32 %r10535, %r10534, %r10534, 24; + add.s32 %r10536, %r10535, %r10529; + xor.b32 %r10537, %r10536, %r10531; + shf.l.wrap.b32 %r10538, %r10537, %r10537, 25; + 
add.s32 %r10539, %r10491, %r9797; + add.s32 %r10540, %r10539, %r10454; + xor.b32 %r10541, %r10540, %r10479; + shf.l.wrap.b32 %r10542, %r10541, %r10541, 16; + add.s32 %r10543, %r10542, %r10466; + xor.b32 %r10544, %r10543, %r10454; + shf.l.wrap.b32 %r10545, %r10544, %r10544, 20; + add.s32 %r10546, %r10540, %r9818; + add.s32 %r10547, %r10546, %r10545; + xor.b32 %r10548, %r10547, %r10542; + shf.l.wrap.b32 %r10549, %r10548, %r10548, 24; + add.s32 %r10550, %r10549, %r10543; + xor.b32 %r10551, %r10550, %r10545; + shf.l.wrap.b32 %r10552, %r10551, %r10551, 25; + add.s32 %r10553, %r10505, %r9846; + add.s32 %r10554, %r10553, %r10552; + xor.b32 %r10555, %r10554, %r10521; + shf.l.wrap.b32 %r10556, %r10555, %r10555, 16; + add.s32 %r10557, %r10556, %r10536; + xor.b32 %r10558, %r10557, %r10552; + shf.l.wrap.b32 %r10559, %r10558, %r10558, 20; + add.s32 %r10560, %r10554, %r9874; + add.s32 %r10561, %r10560, %r10559; + xor.b32 %r10562, %r10561, %r10556; + shf.l.wrap.b32 %r10563, %r10562, %r10562, 24; + add.s32 %r10564, %r10563, %r10557; + xor.b32 %r10565, %r10564, %r10559; + shf.l.wrap.b32 %r10566, %r10565, %r10565, 25; + add.s32 %r10567, %r10519, %r9804; + add.s32 %r10568, %r10567, %r10510; + xor.b32 %r10569, %r10568, %r10535; + shf.l.wrap.b32 %r10570, %r10569, %r10569, 16; + add.s32 %r10571, %r10570, %r10550; + xor.b32 %r10572, %r10571, %r10510; + shf.l.wrap.b32 %r10573, %r10572, %r10572, 20; + add.s32 %r10574, %r10568, %r9769; + add.s32 %r10575, %r10574, %r10573; + xor.b32 %r10576, %r10575, %r10570; + shf.l.wrap.b32 %r10577, %r10576, %r10576, 24; + add.s32 %r10578, %r10577, %r10571; + xor.b32 %r10579, %r10578, %r10573; + shf.l.wrap.b32 %r10580, %r10579, %r10579, 25; + add.s32 %r10581, %r10533, %r9776; + add.s32 %r10582, %r10581, %r10524; + xor.b32 %r10583, %r10582, %r10549; + shf.l.wrap.b32 %r10584, %r10583, %r10583, 16; + add.s32 %r10585, %r10584, %r10508; + xor.b32 %r10586, %r10585, %r10524; + shf.l.wrap.b32 %r10587, %r10586, %r10586, 20; + add.s32 %r10588, %r10582, %r9832; + add.s32 %r10589, %r10588, %r10587; + xor.b32 %r10590, %r10589, %r10584; + shf.l.wrap.b32 %r10591, %r10590, %r10590, 24; + add.s32 %r10592, %r10591, %r10585; + xor.b32 %r10593, %r10592, %r10587; + shf.l.wrap.b32 %r10594, %r10593, %r10593, 25; + add.s32 %r10595, %r10547, %r9825; + add.s32 %r10596, %r10595, %r10538; + xor.b32 %r10597, %r10596, %r10507; + shf.l.wrap.b32 %r10598, %r10597, %r10597, 16; + add.s32 %r10599, %r10598, %r10522; + xor.b32 %r10600, %r10599, %r10538; + shf.l.wrap.b32 %r10601, %r10600, %r10600, 20; + add.s32 %r10602, %r10596, %r9811; + add.s32 %r10603, %r10602, %r10601; + xor.b32 %r10604, %r10603, %r10598; + shf.l.wrap.b32 %r10605, %r10604, %r10604, 24; + add.s32 %r10606, %r10605, %r10599; + xor.b32 %r10607, %r10606, %r10601; + shf.l.wrap.b32 %r10608, %r10607, %r10607, 25; + add.s32 %r10609, %r10561, %r9867; + add.s32 %r10610, %r10609, %r10580; + xor.b32 %r10611, %r10610, %r10605; + shf.l.wrap.b32 %r10612, %r10611, %r10611, 16; + add.s32 %r10613, %r10612, %r10592; + xor.b32 %r10614, %r10613, %r10580; + shf.l.wrap.b32 %r10615, %r10614, %r10614, 20; + add.s32 %r10616, %r10610, %r9839; + add.s32 %r10617, %r10616, %r10615; + xor.b32 %r10618, %r10617, %r10612; + shf.l.wrap.b32 %r10619, %r10618, %r10618, 24; + add.s32 %r10620, %r10619, %r10613; + xor.b32 %r10621, %r10620, %r10615; + shf.l.wrap.b32 %r10622, %r10621, %r10621, 25; + add.s32 %r10623, %r10575, %r9783; + add.s32 %r10624, %r10623, %r10594; + xor.b32 %r10625, %r10624, %r10563; + shf.l.wrap.b32 %r10626, %r10625, %r10625, 16; + add.s32 %r10627, %r10626, %r10606; + 
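+ // annotation: after the final round below, the state halves are XORed
+ // into %r11682..%r11689 and stored back, the block counter %rs391 and
+ // input pointer %rd260 advance, and $L__BB1_60 repeats while more
+ // than 64 bytes remain. The popc.b64 of the chunk counter that
+ // follows ($L__BB1_64 onward) is consistent with BLAKE3's tree logic:
+ // chaining values are popped and merged while the CV stack is taller
+ // than popcount(t), and the parent compressions in $L__BB1_66 run
+ // with flags | 4, matching BLAKE3's PARENT flag.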
xor.b32 %r10628, %r10627, %r10594; + shf.l.wrap.b32 %r10629, %r10628, %r10628, 20; + add.s32 %r10630, %r10624, %r9853; + add.s32 %r10631, %r10630, %r10629; + xor.b32 %r10632, %r10631, %r10626; + shf.l.wrap.b32 %r10633, %r10632, %r10632, 24; + add.s32 %r10634, %r10633, %r10627; + xor.b32 %r10635, %r10634, %r10629; + shf.l.wrap.b32 %r10636, %r10635, %r10635, 25; + add.s32 %r10637, %r10589, %r9790; + add.s32 %r10638, %r10637, %r10608; + xor.b32 %r10639, %r10638, %r10577; + shf.l.wrap.b32 %r10640, %r10639, %r10639, 16; + add.s32 %r10641, %r10640, %r10564; + xor.b32 %r10642, %r10641, %r10608; + shf.l.wrap.b32 %r10643, %r10642, %r10642, 20; + add.s32 %r10644, %r10638, %r9797; + add.s32 %r10645, %r10644, %r10643; + xor.b32 %r10646, %r10645, %r10640; + shf.l.wrap.b32 %r10647, %r10646, %r10646, 24; + add.s32 %r10648, %r10647, %r10641; + xor.b32 %r10649, %r10648, %r10643; + shf.l.wrap.b32 %r10650, %r10649, %r10649, 25; + add.s32 %r10651, %r10603, %r9818; + add.s32 %r10652, %r10651, %r10566; + xor.b32 %r10653, %r10652, %r10591; + shf.l.wrap.b32 %r10654, %r10653, %r10653, 16; + add.s32 %r10655, %r10654, %r10578; + xor.b32 %r10656, %r10655, %r10566; + shf.l.wrap.b32 %r10657, %r10656, %r10656, 20; + add.s32 %r10658, %r10652, %r9860; + add.s32 %r10659, %r10658, %r10657; + xor.b32 %r10660, %r10659, %r10654; + shf.l.wrap.b32 %r10661, %r10660, %r10660, 24; + add.s32 %r10662, %r10661, %r10655; + xor.b32 %r10663, %r10662, %r10657; + shf.l.wrap.b32 %r10664, %r10663, %r10663, 25; + xor.b32 %r11689, %r10648, %r10617; + st.local.u32 [%rd3+-104], %r11689; + xor.b32 %r11688, %r10662, %r10631; + st.local.u32 [%rd3+-100], %r11688; + xor.b32 %r11687, %r10620, %r10645; + st.local.u32 [%rd3+-96], %r11687; + xor.b32 %r11686, %r10659, %r10634; + st.local.u32 [%rd3+-92], %r11686; + xor.b32 %r11685, %r10664, %r10633; + st.local.u32 [%rd3+-88], %r11685; + xor.b32 %r11684, %r10622, %r10647; + st.local.u32 [%rd3+-84], %r11684; + xor.b32 %r11683, %r10661, %r10636; + st.local.u32 [%rd3+-80], %r11683; + xor.b32 %r11682, %r10650, %r10619; + st.local.u32 [%rd3+-76], %r11682; + add.s16 %rs391, %rs391, 1; + st.local.u8 [%rd3+1], %rs391; + add.s64 %rd260, %rd260, 64; + add.s64 %rd270, %rd270, -64; + setp.gt.u64 %p49, %rd270, 64; + @%p49 bra $L__BB1_60; + bra.uni $L__BB1_61; + +$L__BB1_58: + ld.local.u64 %rd268, [%rd3+-72]; + +$L__BB1_61: + cvt.u64.u16 %rd209, %rs390; + and.b64 %rd92, %rd209, 255; + mov.u64 %rd210, 64; + sub.s64 %rd211, %rd210, %rd92; + min.u64 %rd93, %rd211, %rd270; + setp.eq.s64 %p50, %rd93, 0; + @%p50 bra $L__BB1_64; + + add.s64 %rd213, %rd2, %rd92; + add.s64 %rd94, %rd213, 72; + mov.u64 %rd271, 0; + +$L__BB1_63: + add.s64 %rd214, %rd260, %rd271; + ld.local.u8 %rs345, [%rd214]; + add.s64 %rd215, %rd94, %rd271; + st.local.u8 [%rd215], %rs345; + add.s64 %rd271, %rd271, 1; + setp.lt.u64 %p51, %rd271, %rd93; + @%p51 bra $L__BB1_63; + +$L__BB1_64: + cvt.u16.u64 %rs346, %rd93; + ld.local.u8 %rs347, [%rd3]; + add.s16 %rs348, %rs347, %rs346; + st.local.u8 [%rd3], %rs348; + ld.local.u8 %rs392, [%rd3+8]; + cvt.u64.u16 %rd216, %rs392; + and.b64 %rd217, %rd216, 255; + popc.b64 %r10665, %rd268; + cvt.u64.u32 %rd97, %r10665; + setp.ge.u64 %p52, %rd97, %rd217; + @%p52 bra $L__BB1_68; + + ld.local.u8 %r10666, [%rd3+2]; + or.b32 %r135, %r10666, 4; + ld.local.u8 %r10667, [%rd3+-120]; + ld.local.u8 %r10668, [%rd3+-119]; + prmt.b32 %r10669, %r10668, %r10667, 30212; + ld.local.u8 %r10670, [%rd3+-118]; + ld.local.u8 %r10671, [%rd3+-117]; + prmt.b32 %r10672, %r10671, %r10670, 30212; + prmt.b32 %r136, %r10672, %r10669, 4180; + ld.local.u8 
%r10673, [%rd3+-136]; + ld.local.u8 %r10674, [%rd3+-135]; + prmt.b32 %r10675, %r10674, %r10673, 30212; + ld.local.u8 %r10676, [%rd3+-134]; + ld.local.u8 %r10677, [%rd3+-133]; + prmt.b32 %r10678, %r10677, %r10676, 30212; + prmt.b32 %r10679, %r10678, %r10675, 4180; + add.s32 %r137, %r136, %r10679; + ld.local.u8 %r10680, [%rd3+-116]; + ld.local.u8 %r10681, [%rd3+-115]; + prmt.b32 %r10682, %r10681, %r10680, 30212; + ld.local.u8 %r10683, [%rd3+-114]; + ld.local.u8 %r10684, [%rd3+-113]; + prmt.b32 %r10685, %r10684, %r10683, 30212; + prmt.b32 %r138, %r10685, %r10682, 4180; + ld.local.u8 %r10686, [%rd3+-132]; + ld.local.u8 %r10687, [%rd3+-131]; + prmt.b32 %r10688, %r10687, %r10686, 30212; + ld.local.u8 %r10689, [%rd3+-130]; + ld.local.u8 %r10690, [%rd3+-129]; + prmt.b32 %r10691, %r10690, %r10689, 30212; + prmt.b32 %r10692, %r10691, %r10688, 4180; + add.s32 %r139, %r138, %r10692; + ld.local.u8 %r10693, [%rd3+-112]; + ld.local.u8 %r10694, [%rd3+-111]; + prmt.b32 %r10695, %r10694, %r10693, 30212; + ld.local.u8 %r10696, [%rd3+-110]; + ld.local.u8 %r10697, [%rd3+-109]; + prmt.b32 %r10698, %r10697, %r10696, 30212; + prmt.b32 %r140, %r10698, %r10695, 4180; + ld.local.u8 %r10699, [%rd3+-128]; + ld.local.u8 %r10700, [%rd3+-127]; + prmt.b32 %r10701, %r10700, %r10699, 30212; + ld.local.u8 %r10702, [%rd3+-126]; + ld.local.u8 %r10703, [%rd3+-125]; + prmt.b32 %r10704, %r10703, %r10702, 30212; + prmt.b32 %r10705, %r10704, %r10701, 4180; + add.s32 %r141, %r140, %r10705; + ld.local.u8 %r10706, [%rd3+-108]; + ld.local.u8 %r10707, [%rd3+-107]; + prmt.b32 %r10708, %r10707, %r10706, 30212; + ld.local.u8 %r10709, [%rd3+-106]; + ld.local.u8 %r10710, [%rd3+-105]; + prmt.b32 %r10711, %r10710, %r10709, 30212; + prmt.b32 %r142, %r10711, %r10708, 4180; + ld.local.u8 %r10712, [%rd3+-124]; + ld.local.u8 %r10713, [%rd3+-123]; + prmt.b32 %r10714, %r10713, %r10712, 30212; + ld.local.u8 %r10715, [%rd3+-122]; + ld.local.u8 %r10716, [%rd3+-121]; + prmt.b32 %r10717, %r10716, %r10715, 30212; + prmt.b32 %r10718, %r10717, %r10714, 4180; + add.s32 %r143, %r142, %r10718; + +$L__BB1_66: + and.b16 %rs349, %rs392, 255; + mul.wide.u16 %r10719, %rs349, 32; + add.s32 %r10720, %r10719, -64; + cvt.s64.s32 %rd218, %r10720; + add.s64 %rd219, %rd2, %rd218; + ld.local.u8 %r10721, [%rd219+145]; + ld.local.u8 %r10722, [%rd219+146]; + prmt.b32 %r10723, %r10722, %r10721, 30212; + ld.local.u8 %r10724, [%rd219+147]; + prmt.b32 %r10725, %r10724, %r10723, 28756; + ld.local.u8 %r10726, [%rd219+148]; + prmt.b32 %r10727, %r10726, %r10725, 1620; + ld.local.u8 %r10728, [%rd219+149]; + ld.local.u8 %r10729, [%rd219+150]; + prmt.b32 %r10730, %r10729, %r10728, 30212; + ld.local.u8 %r10731, [%rd219+151]; + prmt.b32 %r10732, %r10731, %r10730, 28756; + ld.local.u8 %r10733, [%rd219+152]; + prmt.b32 %r10734, %r10733, %r10732, 1620; + ld.local.u8 %r10735, [%rd219+153]; + ld.local.u8 %r10736, [%rd219+154]; + prmt.b32 %r10737, %r10736, %r10735, 30212; + ld.local.u8 %r10738, [%rd219+155]; + prmt.b32 %r10739, %r10738, %r10737, 28756; + ld.local.u8 %r10740, [%rd219+156]; + prmt.b32 %r10741, %r10740, %r10739, 1620; + ld.local.u8 %r10742, [%rd219+157]; + ld.local.u8 %r10743, [%rd219+158]; + prmt.b32 %r10744, %r10743, %r10742, 30212; + ld.local.u8 %r10745, [%rd219+159]; + prmt.b32 %r10746, %r10745, %r10744, 28756; + ld.local.u8 %r10747, [%rd219+160]; + prmt.b32 %r10748, %r10747, %r10746, 1620; + ld.local.u8 %r10749, [%rd219+161]; + ld.local.u8 %r10750, [%rd219+162]; + prmt.b32 %r10751, %r10750, %r10749, 30212; + ld.local.u8 %r10752, [%rd219+163]; + prmt.b32 %r10753, %r10752, 
%r10751, 28756; + ld.local.u8 %r10754, [%rd219+164]; + prmt.b32 %r10755, %r10754, %r10753, 1620; + ld.local.u8 %r10756, [%rd219+165]; + ld.local.u8 %r10757, [%rd219+166]; + prmt.b32 %r10758, %r10757, %r10756, 30212; + ld.local.u8 %r10759, [%rd219+167]; + prmt.b32 %r10760, %r10759, %r10758, 28756; + ld.local.u8 %r10761, [%rd219+168]; + prmt.b32 %r10762, %r10761, %r10760, 1620; + ld.local.u8 %r10763, [%rd219+169]; + ld.local.u8 %r10764, [%rd219+170]; + prmt.b32 %r10765, %r10764, %r10763, 30212; + ld.local.u8 %r10766, [%rd219+171]; + prmt.b32 %r10767, %r10766, %r10765, 28756; + ld.local.u8 %r10768, [%rd219+172]; + prmt.b32 %r10769, %r10768, %r10767, 1620; + ld.local.u8 %r10770, [%rd219+173]; + ld.local.u8 %r10771, [%rd219+174]; + prmt.b32 %r10772, %r10771, %r10770, 30212; + ld.local.u8 %r10773, [%rd219+175]; + prmt.b32 %r10774, %r10773, %r10772, 28756; + ld.local.u8 %r10775, [%rd219+176]; + prmt.b32 %r10776, %r10775, %r10774, 1620; + ld.local.u8 %r10777, [%rd219+177]; + ld.local.u8 %r10778, [%rd219+178]; + prmt.b32 %r10779, %r10778, %r10777, 30212; + ld.local.u8 %r10780, [%rd219+179]; + prmt.b32 %r10781, %r10780, %r10779, 28756; + ld.local.u8 %r10782, [%rd219+180]; + prmt.b32 %r10783, %r10782, %r10781, 1620; + ld.local.u8 %r10784, [%rd219+181]; + ld.local.u8 %r10785, [%rd219+182]; + prmt.b32 %r10786, %r10785, %r10784, 30212; + ld.local.u8 %r10787, [%rd219+183]; + prmt.b32 %r10788, %r10787, %r10786, 28756; + ld.local.u8 %r10789, [%rd219+184]; + prmt.b32 %r10790, %r10789, %r10788, 1620; + ld.local.u8 %r10791, [%rd219+185]; + ld.local.u8 %r10792, [%rd219+186]; + prmt.b32 %r10793, %r10792, %r10791, 30212; + ld.local.u8 %r10794, [%rd219+187]; + prmt.b32 %r10795, %r10794, %r10793, 28756; + ld.local.u8 %r10796, [%rd219+188]; + prmt.b32 %r10797, %r10796, %r10795, 1620; + ld.local.u8 %r10798, [%rd219+189]; + ld.local.u8 %r10799, [%rd219+190]; + prmt.b32 %r10800, %r10799, %r10798, 30212; + ld.local.u8 %r10801, [%rd219+191]; + prmt.b32 %r10802, %r10801, %r10800, 28756; + ld.local.u8 %r10803, [%rd219+192]; + prmt.b32 %r10804, %r10803, %r10802, 1620; + ld.local.u8 %r10805, [%rd219+193]; + ld.local.u8 %r10806, [%rd219+194]; + prmt.b32 %r10807, %r10806, %r10805, 30212; + ld.local.u8 %r10808, [%rd219+195]; + prmt.b32 %r10809, %r10808, %r10807, 28756; + ld.local.u8 %r10810, [%rd219+196]; + prmt.b32 %r10811, %r10810, %r10809, 1620; + ld.local.u8 %r10812, [%rd219+197]; + ld.local.u8 %r10813, [%rd219+198]; + prmt.b32 %r10814, %r10813, %r10812, 30212; + ld.local.u8 %r10815, [%rd219+199]; + prmt.b32 %r10816, %r10815, %r10814, 28756; + ld.local.u8 %r10817, [%rd219+200]; + prmt.b32 %r10818, %r10817, %r10816, 1620; + ld.local.u8 %r10819, [%rd219+201]; + ld.local.u8 %r10820, [%rd219+202]; + prmt.b32 %r10821, %r10820, %r10819, 30212; + ld.local.u8 %r10822, [%rd219+203]; + prmt.b32 %r10823, %r10822, %r10821, 28756; + ld.local.u8 %r10824, [%rd219+204]; + prmt.b32 %r10825, %r10824, %r10823, 1620; + ld.local.u8 %r10826, [%rd219+205]; + ld.local.u8 %r10827, [%rd219+206]; + prmt.b32 %r10828, %r10827, %r10826, 30212; + ld.local.u8 %r10829, [%rd219+207]; + prmt.b32 %r10830, %r10829, %r10828, 28756; + ld.local.u8 %r10831, [%rd219+208]; + prmt.b32 %r10832, %r10831, %r10830, 1620; + add.s32 %r10833, %r137, %r10727; + shf.l.wrap.b32 %r10834, %r10833, %r10833, 16; + add.s32 %r10835, %r10834, 1779033703; + xor.b32 %r10836, %r10835, %r136; + shf.l.wrap.b32 %r10837, %r10836, %r10836, 20; + add.s32 %r10838, %r10734, %r10833; + add.s32 %r10839, %r10838, %r10837; + xor.b32 %r10840, %r10839, %r10834; + shf.l.wrap.b32 %r10841, %r10840, 
%r10840, 24; + add.s32 %r10842, %r10841, %r10835; + xor.b32 %r10843, %r10842, %r10837; + shf.l.wrap.b32 %r10844, %r10843, %r10843, 25; + add.s32 %r10845, %r139, %r10741; + shf.l.wrap.b32 %r10846, %r10845, %r10845, 16; + add.s32 %r10847, %r10846, -1150833019; + xor.b32 %r10848, %r10847, %r138; + shf.l.wrap.b32 %r10849, %r10848, %r10848, 20; + add.s32 %r10850, %r10748, %r10845; + add.s32 %r10851, %r10850, %r10849; + xor.b32 %r10852, %r10851, %r10846; + shf.l.wrap.b32 %r10853, %r10852, %r10852, 24; + add.s32 %r10854, %r10853, %r10847; + xor.b32 %r10855, %r10854, %r10849; + shf.l.wrap.b32 %r10856, %r10855, %r10855, 25; + add.s32 %r10857, %r141, %r10755; + shr.u32 %r10858, %r10857, 16; + shl.b32 %r10859, %r10857, 16; + xor.b32 %r10860, %r10859, 4194304; + or.b32 %r10861, %r10860, %r10858; + add.s32 %r10862, %r10861, 1013904242; + xor.b32 %r10863, %r10862, %r140; + shf.l.wrap.b32 %r10864, %r10863, %r10863, 20; + add.s32 %r10865, %r10762, %r10857; + add.s32 %r10866, %r10865, %r10864; + xor.b32 %r10867, %r10866, %r10861; + shf.l.wrap.b32 %r10868, %r10867, %r10867, 24; + add.s32 %r10869, %r10868, %r10862; + xor.b32 %r10870, %r10869, %r10864; + shf.l.wrap.b32 %r10871, %r10870, %r10870, 25; + add.s32 %r10872, %r143, %r10769; + xor.b32 %r10873, %r10872, %r135; + shr.u32 %r10874, %r10872, 16; + shl.b32 %r10875, %r10873, 16; + or.b32 %r10876, %r10875, %r10874; + add.s32 %r10877, %r10876, -1521486534; + xor.b32 %r10878, %r10877, %r142; + shf.l.wrap.b32 %r10879, %r10878, %r10878, 20; + add.s32 %r10880, %r10776, %r10872; + add.s32 %r10881, %r10880, %r10879; + xor.b32 %r10882, %r10881, %r10876; + shf.l.wrap.b32 %r10883, %r10882, %r10882, 24; + add.s32 %r10884, %r10883, %r10877; + xor.b32 %r10885, %r10884, %r10879; + shf.l.wrap.b32 %r10886, %r10885, %r10885, 25; + add.s32 %r10887, %r10856, %r10839; + add.s32 %r10888, %r10887, %r10783; + xor.b32 %r10889, %r10883, %r10888; + shf.l.wrap.b32 %r10890, %r10889, %r10889, 16; + add.s32 %r10891, %r10890, %r10869; + xor.b32 %r10892, %r10891, %r10856; + shf.l.wrap.b32 %r10893, %r10892, %r10892, 20; + add.s32 %r10894, %r10790, %r10888; + add.s32 %r10895, %r10894, %r10893; + xor.b32 %r10896, %r10895, %r10890; + shf.l.wrap.b32 %r10897, %r10896, %r10896, 24; + add.s32 %r10898, %r10897, %r10891; + xor.b32 %r10899, %r10898, %r10893; + shf.l.wrap.b32 %r10900, %r10899, %r10899, 25; + add.s32 %r10901, %r10871, %r10851; + add.s32 %r10902, %r10901, %r10797; + xor.b32 %r10903, %r10902, %r10841; + shf.l.wrap.b32 %r10904, %r10903, %r10903, 16; + add.s32 %r10905, %r10904, %r10884; + xor.b32 %r10906, %r10905, %r10871; + shf.l.wrap.b32 %r10907, %r10906, %r10906, 20; + add.s32 %r10908, %r10804, %r10902; + add.s32 %r10909, %r10908, %r10907; + xor.b32 %r10910, %r10909, %r10904; + shf.l.wrap.b32 %r10911, %r10910, %r10910, 24; + add.s32 %r10912, %r10911, %r10905; + xor.b32 %r10913, %r10912, %r10907; + shf.l.wrap.b32 %r10914, %r10913, %r10913, 25; + add.s32 %r10915, %r10886, %r10866; + add.s32 %r10916, %r10915, %r10811; + xor.b32 %r10917, %r10916, %r10853; + shf.l.wrap.b32 %r10918, %r10917, %r10917, 16; + add.s32 %r10919, %r10918, %r10842; + xor.b32 %r10920, %r10919, %r10886; + shf.l.wrap.b32 %r10921, %r10920, %r10920, 20; + add.s32 %r10922, %r10818, %r10916; + add.s32 %r10923, %r10922, %r10921; + xor.b32 %r10924, %r10923, %r10918; + shf.l.wrap.b32 %r10925, %r10924, %r10924, 24; + add.s32 %r10926, %r10925, %r10919; + xor.b32 %r10927, %r10926, %r10921; + shf.l.wrap.b32 %r10928, %r10927, %r10927, 25; + add.s32 %r10929, %r10881, %r10844; + add.s32 %r10930, %r10929, %r10825; + xor.b32 %r10931, 
%r10930, %r10868; + shf.l.wrap.b32 %r10932, %r10931, %r10931, 16; + add.s32 %r10933, %r10932, %r10854; + xor.b32 %r10934, %r10933, %r10844; + shf.l.wrap.b32 %r10935, %r10934, %r10934, 20; + add.s32 %r10936, %r10832, %r10930; + add.s32 %r10937, %r10936, %r10935; + xor.b32 %r10938, %r10937, %r10932; + shf.l.wrap.b32 %r10939, %r10938, %r10938, 24; + add.s32 %r10940, %r10939, %r10933; + xor.b32 %r10941, %r10940, %r10935; + shf.l.wrap.b32 %r10942, %r10941, %r10941, 25; + add.s32 %r10943, %r10895, %r10741; + add.s32 %r10944, %r10943, %r10942; + xor.b32 %r10945, %r10944, %r10911; + shf.l.wrap.b32 %r10946, %r10945, %r10945, 16; + add.s32 %r10947, %r10946, %r10926; + xor.b32 %r10948, %r10947, %r10942; + shf.l.wrap.b32 %r10949, %r10948, %r10948, 20; + add.s32 %r10950, %r10944, %r10769; + add.s32 %r10951, %r10950, %r10949; + xor.b32 %r10952, %r10951, %r10946; + shf.l.wrap.b32 %r10953, %r10952, %r10952, 24; + add.s32 %r10954, %r10953, %r10947; + xor.b32 %r10955, %r10954, %r10949; + shf.l.wrap.b32 %r10956, %r10955, %r10955, 25; + add.s32 %r10957, %r10909, %r10748; + add.s32 %r10958, %r10957, %r10900; + xor.b32 %r10959, %r10925, %r10958; + shf.l.wrap.b32 %r10960, %r10959, %r10959, 16; + add.s32 %r10961, %r10940, %r10960; + xor.b32 %r10962, %r10961, %r10900; + shf.l.wrap.b32 %r10963, %r10962, %r10962, 20; + add.s32 %r10964, %r10958, %r10797; + add.s32 %r10965, %r10964, %r10963; + xor.b32 %r10966, %r10965, %r10960; + shf.l.wrap.b32 %r10967, %r10966, %r10966, 24; + add.s32 %r10968, %r10967, %r10961; + xor.b32 %r10969, %r10968, %r10963; + shf.l.wrap.b32 %r10970, %r10969, %r10969, 25; + add.s32 %r10971, %r10914, %r10776; + add.s32 %r10972, %r10971, %r10923; + xor.b32 %r10973, %r10939, %r10972; + shf.l.wrap.b32 %r10974, %r10973, %r10973, 16; + add.s32 %r10975, %r10974, %r10898; + xor.b32 %r10976, %r10975, %r10914; + shf.l.wrap.b32 %r10977, %r10976, %r10976, 20; + add.s32 %r10978, %r10972, %r10727; + add.s32 %r10979, %r10978, %r10977; + xor.b32 %r10980, %r10979, %r10974; + shf.l.wrap.b32 %r10981, %r10980, %r10980, 24; + add.s32 %r10982, %r10981, %r10975; + xor.b32 %r10983, %r10982, %r10977; + shf.l.wrap.b32 %r10984, %r10983, %r10983, 25; + add.s32 %r10985, %r10928, %r10755; + add.s32 %r10986, %r10985, %r10937; + xor.b32 %r10987, %r10986, %r10897; + shf.l.wrap.b32 %r10988, %r10987, %r10987, 16; + add.s32 %r10989, %r10988, %r10912; + xor.b32 %r10990, %r10989, %r10928; + shf.l.wrap.b32 %r10991, %r10990, %r10990, 20; + add.s32 %r10992, %r10986, %r10818; + add.s32 %r10993, %r10992, %r10991; + xor.b32 %r10994, %r10993, %r10988; + shf.l.wrap.b32 %r10995, %r10994, %r10994, 24; + add.s32 %r10996, %r10995, %r10989; + xor.b32 %r10997, %r10996, %r10991; + shf.l.wrap.b32 %r10998, %r10997, %r10997, 25; + add.s32 %r10999, %r10970, %r10734; + add.s32 %r11000, %r10999, %r10951; + xor.b32 %r11001, %r11000, %r10995; + shf.l.wrap.b32 %r11002, %r11001, %r11001, 16; + add.s32 %r11003, %r11002, %r10982; + xor.b32 %r11004, %r11003, %r10970; + shf.l.wrap.b32 %r11005, %r11004, %r11004, 20; + add.s32 %r11006, %r11000, %r10804; + add.s32 %r11007, %r11006, %r11005; + xor.b32 %r11008, %r11007, %r11002; + shf.l.wrap.b32 %r11009, %r11008, %r11008, 24; + add.s32 %r11010, %r11009, %r11003; + xor.b32 %r11011, %r11010, %r11005; + shf.l.wrap.b32 %r11012, %r11011, %r11011, 25; + add.s32 %r11013, %r10965, %r10811; + add.s32 %r11014, %r11013, %r10984; + xor.b32 %r11015, %r10953, %r11014; + shf.l.wrap.b32 %r11016, %r11015, %r11015, 16; + add.s32 %r11017, %r11016, %r10996; + xor.b32 %r11018, %r11017, %r10984; + shf.l.wrap.b32 %r11019, %r11018, 
%r11018, 20; + add.s32 %r11020, %r11014, %r10762; + add.s32 %r11021, %r11020, %r11019; + xor.b32 %r11022, %r11021, %r11016; + shf.l.wrap.b32 %r11023, %r11022, %r11022, 24; + add.s32 %r11024, %r11023, %r11017; + xor.b32 %r11025, %r11024, %r11019; + shf.l.wrap.b32 %r11026, %r11025, %r11025, 25; + add.s32 %r11027, %r10979, %r10790; + add.s32 %r11028, %r11027, %r10998; + xor.b32 %r11029, %r11028, %r10967; + shf.l.wrap.b32 %r11030, %r11029, %r11029, 16; + add.s32 %r11031, %r11030, %r10954; + xor.b32 %r11032, %r11031, %r10998; + shf.l.wrap.b32 %r11033, %r11032, %r11032, 20; + add.s32 %r11034, %r11028, %r10825; + add.s32 %r11035, %r11034, %r11033; + xor.b32 %r11036, %r11035, %r11030; + shf.l.wrap.b32 %r11037, %r11036, %r11036, 24; + add.s32 %r11038, %r11037, %r11031; + xor.b32 %r11039, %r11038, %r11033; + shf.l.wrap.b32 %r11040, %r11039, %r11039, 25; + add.s32 %r11041, %r10993, %r10832; + add.s32 %r11042, %r11041, %r10956; + xor.b32 %r11043, %r11042, %r10981; + shf.l.wrap.b32 %r11044, %r11043, %r11043, 16; + add.s32 %r11045, %r11044, %r10968; + xor.b32 %r11046, %r11045, %r10956; + shf.l.wrap.b32 %r11047, %r11046, %r11046, 20; + add.s32 %r11048, %r11042, %r10783; + add.s32 %r11049, %r11048, %r11047; + xor.b32 %r11050, %r11049, %r11044; + shf.l.wrap.b32 %r11051, %r11050, %r11050, 24; + add.s32 %r11052, %r11051, %r11045; + xor.b32 %r11053, %r11052, %r11047; + shf.l.wrap.b32 %r11054, %r11053, %r11053, 25; + add.s32 %r11055, %r11007, %r10748; + add.s32 %r11056, %r11055, %r11054; + xor.b32 %r11057, %r11056, %r11023; + shf.l.wrap.b32 %r11058, %r11057, %r11057, 16; + add.s32 %r11059, %r11058, %r11038; + xor.b32 %r11060, %r11059, %r11054; + shf.l.wrap.b32 %r11061, %r11060, %r11060, 20; + add.s32 %r11062, %r11056, %r10755; + add.s32 %r11063, %r11062, %r11061; + xor.b32 %r11064, %r11063, %r11058; + shf.l.wrap.b32 %r11065, %r11064, %r11064, 24; + add.s32 %r11066, %r11065, %r11059; + xor.b32 %r11067, %r11066, %r11061; + shf.l.wrap.b32 %r11068, %r11067, %r11067, 25; + add.s32 %r11069, %r11021, %r10797; + add.s32 %r11070, %r11069, %r11012; + xor.b32 %r11071, %r11070, %r11037; + shf.l.wrap.b32 %r11072, %r11071, %r11071, 16; + add.s32 %r11073, %r11072, %r11052; + xor.b32 %r11074, %r11073, %r11012; + shf.l.wrap.b32 %r11075, %r11074, %r11074, 20; + add.s32 %r11076, %r11070, %r10811; + add.s32 %r11077, %r11076, %r11075; + xor.b32 %r11078, %r11077, %r11072; + shf.l.wrap.b32 %r11079, %r11078, %r11078, 24; + add.s32 %r11080, %r11079, %r11073; + xor.b32 %r11081, %r11080, %r11075; + shf.l.wrap.b32 %r11082, %r11081, %r11081, 25; + add.s32 %r11083, %r11035, %r10818; + add.s32 %r11084, %r11083, %r11026; + xor.b32 %r11085, %r11051, %r11084; + shf.l.wrap.b32 %r11086, %r11085, %r11085, 16; + add.s32 %r11087, %r11086, %r11010; + xor.b32 %r11088, %r11087, %r11026; + shf.l.wrap.b32 %r11089, %r11088, %r11088, 20; + add.s32 %r11090, %r11084, %r10741; + add.s32 %r11091, %r11090, %r11089; + xor.b32 %r11092, %r11091, %r11086; + shf.l.wrap.b32 %r11093, %r11092, %r11092, 24; + add.s32 %r11094, %r11093, %r11087; + xor.b32 %r11095, %r11094, %r11089; + shf.l.wrap.b32 %r11096, %r11095, %r11095, 25; + add.s32 %r11097, %r11040, %r10776; + add.s32 %r11098, %r11097, %r11049; + xor.b32 %r11099, %r11098, %r11009; + shf.l.wrap.b32 %r11100, %r11099, %r11099, 16; + add.s32 %r11101, %r11100, %r11024; + xor.b32 %r11102, %r11101, %r11040; + shf.l.wrap.b32 %r11103, %r11102, %r11102, 20; + add.s32 %r11104, %r11098, %r10825; + add.s32 %r11105, %r11104, %r11103; + xor.b32 %r11106, %r11105, %r11100; + shf.l.wrap.b32 %r11107, %r11106, %r11106, 24; + add.s32 
%r11108, %r11107, %r11101; + xor.b32 %r11109, %r11108, %r11103; + shf.l.wrap.b32 %r11110, %r11109, %r11109, 25; + add.s32 %r11111, %r11082, %r10769; + add.s32 %r11112, %r11111, %r11063; + xor.b32 %r11113, %r11112, %r11107; + shf.l.wrap.b32 %r11114, %r11113, %r11113, 16; + add.s32 %r11115, %r11114, %r11094; + xor.b32 %r11116, %r11115, %r11082; + shf.l.wrap.b32 %r11117, %r11116, %r11116, 20; + add.s32 %r11118, %r11112, %r10762; + add.s32 %r11119, %r11118, %r11117; + xor.b32 %r11120, %r11119, %r11114; + shf.l.wrap.b32 %r11121, %r11120, %r11120, 24; + add.s32 %r11122, %r11121, %r11115; + xor.b32 %r11123, %r11122, %r11117; + shf.l.wrap.b32 %r11124, %r11123, %r11123, 25; + add.s32 %r11125, %r11077, %r10790; + add.s32 %r11126, %r11125, %r11096; + xor.b32 %r11127, %r11065, %r11126; + shf.l.wrap.b32 %r11128, %r11127, %r11127, 16; + add.s32 %r11129, %r11128, %r11108; + xor.b32 %r11130, %r11129, %r11096; + shf.l.wrap.b32 %r11131, %r11130, %r11130, 20; + add.s32 %r11132, %r11126, %r10727; + add.s32 %r11133, %r11132, %r11131; + xor.b32 %r11134, %r11133, %r11128; + shf.l.wrap.b32 %r11135, %r11134, %r11134, 24; + add.s32 %r11136, %r11135, %r11129; + xor.b32 %r11137, %r11136, %r11131; + shf.l.wrap.b32 %r11138, %r11137, %r11137, 25; + add.s32 %r11139, %r11091, %r10804; + add.s32 %r11140, %r11139, %r11110; + xor.b32 %r11141, %r11140, %r11079; + shf.l.wrap.b32 %r11142, %r11141, %r11141, 16; + add.s32 %r11143, %r11142, %r11066; + xor.b32 %r11144, %r11143, %r11110; + shf.l.wrap.b32 %r11145, %r11144, %r11144, 20; + add.s32 %r11146, %r11140, %r10832; + add.s32 %r11147, %r11146, %r11145; + xor.b32 %r11148, %r11147, %r11142; + shf.l.wrap.b32 %r11149, %r11148, %r11148, 24; + add.s32 %r11150, %r11149, %r11143; + xor.b32 %r11151, %r11150, %r11145; + shf.l.wrap.b32 %r11152, %r11151, %r11151, 25; + add.s32 %r11153, %r11105, %r10783; + add.s32 %r11154, %r11153, %r11068; + xor.b32 %r11155, %r11154, %r11093; + shf.l.wrap.b32 %r11156, %r11155, %r11155, 16; + add.s32 %r11157, %r11156, %r11080; + xor.b32 %r11158, %r11157, %r11068; + shf.l.wrap.b32 %r11159, %r11158, %r11158, 20; + add.s32 %r11160, %r11154, %r10734; + add.s32 %r11161, %r11160, %r11159; + xor.b32 %r11162, %r11161, %r11156; + shf.l.wrap.b32 %r11163, %r11162, %r11162, 24; + add.s32 %r11164, %r11163, %r11157; + xor.b32 %r11165, %r11164, %r11159; + shf.l.wrap.b32 %r11166, %r11165, %r11165, 25; + add.s32 %r11167, %r11119, %r10797; + add.s32 %r11168, %r11167, %r11166; + xor.b32 %r11169, %r11168, %r11135; + shf.l.wrap.b32 %r11170, %r11169, %r11169, 16; + add.s32 %r11171, %r11170, %r11150; + xor.b32 %r11172, %r11171, %r11166; + shf.l.wrap.b32 %r11173, %r11172, %r11172, 20; + add.s32 %r11174, %r11168, %r10776; + add.s32 %r11175, %r11174, %r11173; + xor.b32 %r11176, %r11175, %r11170; + shf.l.wrap.b32 %r11177, %r11176, %r11176, 24; + add.s32 %r11178, %r11177, %r11171; + xor.b32 %r11179, %r11178, %r11173; + shf.l.wrap.b32 %r11180, %r11179, %r11179, 25; + add.s32 %r11181, %r11133, %r10811; + add.s32 %r11182, %r11181, %r11124; + xor.b32 %r11183, %r11182, %r11149; + shf.l.wrap.b32 %r11184, %r11183, %r11183, 16; + add.s32 %r11185, %r11184, %r11164; + xor.b32 %r11186, %r11185, %r11124; + shf.l.wrap.b32 %r11187, %r11186, %r11186, 20; + add.s32 %r11188, %r11182, %r10790; + add.s32 %r11189, %r11188, %r11187; + xor.b32 %r11190, %r11189, %r11184; + shf.l.wrap.b32 %r11191, %r11190, %r11190, 24; + add.s32 %r11192, %r11191, %r11185; + xor.b32 %r11193, %r11192, %r11187; + shf.l.wrap.b32 %r11194, %r11193, %r11193, 25; + add.s32 %r11195, %r11147, %r10825; + add.s32 %r11196, %r11195, 
%r11138; + xor.b32 %r11197, %r11163, %r11196; + shf.l.wrap.b32 %r11198, %r11197, %r11197, 16; + add.s32 %r11199, %r11198, %r11122; + xor.b32 %r11200, %r11199, %r11138; + shf.l.wrap.b32 %r11201, %r11200, %r11200, 20; + add.s32 %r11202, %r11196, %r10748; + add.s32 %r11203, %r11202, %r11201; + xor.b32 %r11204, %r11203, %r11198; + shf.l.wrap.b32 %r11205, %r11204, %r11204, 24; + add.s32 %r11206, %r11205, %r11199; + xor.b32 %r11207, %r11206, %r11201; + shf.l.wrap.b32 %r11208, %r11207, %r11207, 25; + add.s32 %r11209, %r11152, %r10818; + add.s32 %r11210, %r11209, %r11161; + xor.b32 %r11211, %r11210, %r11121; + shf.l.wrap.b32 %r11212, %r11211, %r11211, 16; + add.s32 %r11213, %r11212, %r11136; + xor.b32 %r11214, %r11213, %r11152; + shf.l.wrap.b32 %r11215, %r11214, %r11214, 20; + add.s32 %r11216, %r11210, %r10832; + add.s32 %r11217, %r11216, %r11215; + xor.b32 %r11218, %r11217, %r11212; + shf.l.wrap.b32 %r11219, %r11218, %r11218, 24; + add.s32 %r11220, %r11219, %r11213; + xor.b32 %r11221, %r11220, %r11215; + shf.l.wrap.b32 %r11222, %r11221, %r11221, 25; + add.s32 %r11223, %r11194, %r10755; + add.s32 %r11224, %r11223, %r11175; + xor.b32 %r11225, %r11224, %r11219; + shf.l.wrap.b32 %r11226, %r11225, %r11225, 16; + add.s32 %r11227, %r11226, %r11206; + xor.b32 %r11228, %r11227, %r11194; + shf.l.wrap.b32 %r11229, %r11228, %r11228, 20; + add.s32 %r11230, %r11224, %r10727; + add.s32 %r11231, %r11230, %r11229; + xor.b32 %r11232, %r11231, %r11226; + shf.l.wrap.b32 %r11233, %r11232, %r11232, 24; + add.s32 %r11234, %r11233, %r11227; + xor.b32 %r11235, %r11234, %r11229; + shf.l.wrap.b32 %r11236, %r11235, %r11235, 25; + add.s32 %r11237, %r11189, %r10804; + add.s32 %r11238, %r11237, %r11208; + xor.b32 %r11239, %r11177, %r11238; + shf.l.wrap.b32 %r11240, %r11239, %r11239, 16; + add.s32 %r11241, %r11240, %r11220; + xor.b32 %r11242, %r11241, %r11208; + shf.l.wrap.b32 %r11243, %r11242, %r11242, 20; + add.s32 %r11244, %r11238, %r10741; + add.s32 %r11245, %r11244, %r11243; + xor.b32 %r11246, %r11245, %r11240; + shf.l.wrap.b32 %r11247, %r11246, %r11246, 24; + add.s32 %r11248, %r11247, %r11241; + xor.b32 %r11249, %r11248, %r11243; + shf.l.wrap.b32 %r11250, %r11249, %r11249, 25; + add.s32 %r11251, %r11203, %r10762; + add.s32 %r11252, %r11251, %r11222; + xor.b32 %r11253, %r11252, %r11191; + shf.l.wrap.b32 %r11254, %r11253, %r11253, 16; + add.s32 %r11255, %r11254, %r11178; + xor.b32 %r11256, %r11255, %r11222; + shf.l.wrap.b32 %r11257, %r11256, %r11256, 20; + add.s32 %r11258, %r11252, %r10783; + add.s32 %r11259, %r11258, %r11257; + xor.b32 %r11260, %r11259, %r11254; + shf.l.wrap.b32 %r11261, %r11260, %r11260, 24; + add.s32 %r11262, %r11261, %r11255; + xor.b32 %r11263, %r11262, %r11257; + shf.l.wrap.b32 %r11264, %r11263, %r11263, 25; + add.s32 %r11265, %r11217, %r10734; + add.s32 %r11266, %r11265, %r11180; + xor.b32 %r11267, %r11266, %r11205; + shf.l.wrap.b32 %r11268, %r11267, %r11267, 16; + add.s32 %r11269, %r11268, %r11192; + xor.b32 %r11270, %r11269, %r11180; + shf.l.wrap.b32 %r11271, %r11270, %r11270, 20; + add.s32 %r11272, %r11266, %r10769; + add.s32 %r11273, %r11272, %r11271; + xor.b32 %r11274, %r11273, %r11268; + shf.l.wrap.b32 %r11275, %r11274, %r11274, 24; + add.s32 %r11276, %r11275, %r11269; + xor.b32 %r11277, %r11276, %r11271; + shf.l.wrap.b32 %r11278, %r11277, %r11277, 25; + add.s32 %r11279, %r11231, %r10811; + add.s32 %r11280, %r11279, %r11278; + xor.b32 %r11281, %r11280, %r11247; + shf.l.wrap.b32 %r11282, %r11281, %r11281, 16; + add.s32 %r11283, %r11282, %r11262; + xor.b32 %r11284, %r11283, %r11278; + 
shf.l.wrap.b32 %r11285, %r11284, %r11284, 20; + add.s32 %r11286, %r11280, %r10818; + add.s32 %r11287, %r11286, %r11285; + xor.b32 %r11288, %r11287, %r11282; + shf.l.wrap.b32 %r11289, %r11288, %r11288, 24; + add.s32 %r11290, %r11289, %r11283; + xor.b32 %r11291, %r11290, %r11285; + shf.l.wrap.b32 %r11292, %r11291, %r11291, 25; + add.s32 %r11293, %r11245, %r10790; + add.s32 %r11294, %r11293, %r11236; + xor.b32 %r11295, %r11294, %r11261; + shf.l.wrap.b32 %r11296, %r11295, %r11295, 16; + add.s32 %r11297, %r11296, %r11276; + xor.b32 %r11298, %r11297, %r11236; + shf.l.wrap.b32 %r11299, %r11298, %r11298, 20; + add.s32 %r11300, %r11294, %r10804; + add.s32 %r11301, %r11300, %r11299; + xor.b32 %r11302, %r11301, %r11296; + shf.l.wrap.b32 %r11303, %r11302, %r11302, 24; + add.s32 %r11304, %r11303, %r11297; + xor.b32 %r11305, %r11304, %r11299; + shf.l.wrap.b32 %r11306, %r11305, %r11305, 25; + add.s32 %r11307, %r11259, %r10832; + add.s32 %r11308, %r11307, %r11250; + xor.b32 %r11309, %r11275, %r11308; + shf.l.wrap.b32 %r11310, %r11309, %r11309, 16; + add.s32 %r11311, %r11310, %r11234; + xor.b32 %r11312, %r11311, %r11250; + shf.l.wrap.b32 %r11313, %r11312, %r11312, 20; + add.s32 %r11314, %r11308, %r10797; + add.s32 %r11315, %r11314, %r11313; + xor.b32 %r11316, %r11315, %r11310; + shf.l.wrap.b32 %r11317, %r11316, %r11316, 24; + add.s32 %r11318, %r11317, %r11311; + xor.b32 %r11319, %r11318, %r11313; + shf.l.wrap.b32 %r11320, %r11319, %r11319, 25; + add.s32 %r11321, %r11264, %r10825; + add.s32 %r11322, %r11321, %r11273; + xor.b32 %r11323, %r11322, %r11233; + shf.l.wrap.b32 %r11324, %r11323, %r11323, 16; + add.s32 %r11325, %r11324, %r11248; + xor.b32 %r11326, %r11325, %r11264; + shf.l.wrap.b32 %r11327, %r11326, %r11326, 20; + add.s32 %r11328, %r11322, %r10783; + add.s32 %r11329, %r11328, %r11327; + xor.b32 %r11330, %r11329, %r11324; + shf.l.wrap.b32 %r11331, %r11330, %r11330, 24; + add.s32 %r11332, %r11331, %r11325; + xor.b32 %r11333, %r11332, %r11327; + shf.l.wrap.b32 %r11334, %r11333, %r11333, 25; + add.s32 %r11335, %r11306, %r10776; + add.s32 %r11336, %r11335, %r11287; + xor.b32 %r11337, %r11336, %r11331; + shf.l.wrap.b32 %r11338, %r11337, %r11337, 16; + add.s32 %r11339, %r11338, %r11318; + xor.b32 %r11340, %r11339, %r11306; + shf.l.wrap.b32 %r11341, %r11340, %r11340, 20; + add.s32 %r11342, %r11336, %r10741; + add.s32 %r11343, %r11342, %r11341; + xor.b32 %r11344, %r11343, %r11338; + shf.l.wrap.b32 %r11345, %r11344, %r11344, 24; + add.s32 %r11346, %r11345, %r11339; + xor.b32 %r11347, %r11346, %r11341; + shf.l.wrap.b32 %r11348, %r11347, %r11347, 25; + add.s32 %r11349, %r11301, %r10762; + add.s32 %r11350, %r11349, %r11320; + xor.b32 %r11351, %r11289, %r11350; + shf.l.wrap.b32 %r11352, %r11351, %r11351, 16; + add.s32 %r11353, %r11352, %r11332; + xor.b32 %r11354, %r11353, %r11320; + shf.l.wrap.b32 %r11355, %r11354, %r11354, 20; + add.s32 %r11356, %r11350, %r10748; + add.s32 %r11357, %r11356, %r11355; + xor.b32 %r11358, %r11357, %r11352; + shf.l.wrap.b32 %r11359, %r11358, %r11358, 24; + add.s32 %r11360, %r11359, %r11353; + xor.b32 %r11361, %r11360, %r11355; + shf.l.wrap.b32 %r11362, %r11361, %r11361, 25; + add.s32 %r11363, %r11315, %r10727; + add.s32 %r11364, %r11363, %r11334; + xor.b32 %r11365, %r11364, %r11303; + shf.l.wrap.b32 %r11366, %r11365, %r11365, 16; + add.s32 %r11367, %r11366, %r11290; + xor.b32 %r11368, %r11367, %r11334; + shf.l.wrap.b32 %r11369, %r11368, %r11368, 20; + add.s32 %r11370, %r11364, %r10734; + add.s32 %r11371, %r11370, %r11369; + xor.b32 %r11372, %r11371, %r11366; + shf.l.wrap.b32 %r11373, 
%r11372, %r11372, 24; + add.s32 %r11374, %r11373, %r11367; + xor.b32 %r11375, %r11374, %r11369; + shf.l.wrap.b32 %r11376, %r11375, %r11375, 25; + add.s32 %r11377, %r11329, %r10769; + add.s32 %r11378, %r11377, %r11292; + xor.b32 %r11379, %r11378, %r11317; + shf.l.wrap.b32 %r11380, %r11379, %r11379, 16; + add.s32 %r11381, %r11380, %r11304; + xor.b32 %r11382, %r11381, %r11292; + shf.l.wrap.b32 %r11383, %r11382, %r11382, 20; + add.s32 %r11384, %r11378, %r10755; + add.s32 %r11385, %r11384, %r11383; + xor.b32 %r11386, %r11385, %r11380; + shf.l.wrap.b32 %r11387, %r11386, %r11386, 24; + add.s32 %r11388, %r11387, %r11381; + xor.b32 %r11389, %r11388, %r11383; + shf.l.wrap.b32 %r11390, %r11389, %r11389, 25; + add.s32 %r11391, %r11343, %r10790; + add.s32 %r11392, %r11391, %r11390; + xor.b32 %r11393, %r11392, %r11359; + shf.l.wrap.b32 %r11394, %r11393, %r11393, 16; + add.s32 %r11395, %r11394, %r11374; + xor.b32 %r11396, %r11395, %r11390; + shf.l.wrap.b32 %r11397, %r11396, %r11396, 20; + add.s32 %r11398, %r11392, %r10825; + add.s32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r11399, %r11394; + shf.l.wrap.b32 %r11401, %r11400, %r11400, 24; + add.s32 %r11402, %r11401, %r11395; + xor.b32 %r11403, %r11402, %r11397; + shf.l.wrap.b32 %r11404, %r11403, %r11403, 25; + add.s32 %r11405, %r11357, %r10804; + add.s32 %r11406, %r11405, %r11348; + xor.b32 %r11407, %r11406, %r11373; + shf.l.wrap.b32 %r11408, %r11407, %r11407, 16; + add.s32 %r11409, %r11408, %r11388; + xor.b32 %r11410, %r11409, %r11348; + shf.l.wrap.b32 %r11411, %r11410, %r11410, 20; + add.s32 %r11412, %r11406, %r10762; + add.s32 %r11413, %r11412, %r11411; + xor.b32 %r11414, %r11413, %r11408; + shf.l.wrap.b32 %r11415, %r11414, %r11414, 24; + add.s32 %r11416, %r11415, %r11409; + xor.b32 %r11417, %r11416, %r11411; + shf.l.wrap.b32 %r11418, %r11417, %r11417, 25; + add.s32 %r11419, %r11371, %r10783; + add.s32 %r11420, %r11419, %r11362; + xor.b32 %r11421, %r11387, %r11420; + shf.l.wrap.b32 %r11422, %r11421, %r11421, 16; + add.s32 %r11423, %r11422, %r11346; + xor.b32 %r11424, %r11423, %r11362; + shf.l.wrap.b32 %r11425, %r11424, %r11424, 20; + add.s32 %r11426, %r11420, %r10811; + add.s32 %r11427, %r11426, %r11425; + xor.b32 %r11428, %r11427, %r11422; + shf.l.wrap.b32 %r11429, %r11428, %r11428, 24; + add.s32 %r11430, %r11429, %r11423; + xor.b32 %r11431, %r11430, %r11425; + shf.l.wrap.b32 %r11432, %r11431, %r11431, 25; + add.s32 %r11433, %r11376, %r10832; + add.s32 %r11434, %r11433, %r11385; + xor.b32 %r11435, %r11434, %r11345; + shf.l.wrap.b32 %r11436, %r11435, %r11435, 16; + add.s32 %r11437, %r11436, %r11360; + xor.b32 %r11438, %r11437, %r11376; + shf.l.wrap.b32 %r11439, %r11438, %r11438, 20; + add.s32 %r11440, %r11434, %r10734; + add.s32 %r11441, %r11440, %r11439; + xor.b32 %r11442, %r11441, %r11436; + shf.l.wrap.b32 %r11443, %r11442, %r11442, 24; + add.s32 %r11444, %r11443, %r11437; + xor.b32 %r11445, %r11444, %r11439; + shf.l.wrap.b32 %r11446, %r11445, %r11445, 25; + add.s32 %r11447, %r11418, %r10818; + add.s32 %r11448, %r11447, %r11399; + xor.b32 %r11449, %r11448, %r11443; + shf.l.wrap.b32 %r11450, %r11449, %r11449, 16; + add.s32 %r11451, %r11450, %r11430; + xor.b32 %r11452, %r11451, %r11418; + shf.l.wrap.b32 %r11453, %r11452, %r11452, 20; + add.s32 %r11454, %r11448, %r10748; + add.s32 %r11455, %r11454, %r11453; + xor.b32 %r11456, %r11455, %r11450; + shf.l.wrap.b32 %r11457, %r11456, %r11456, 24; + add.s32 %r11458, %r11457, %r11451; + xor.b32 %r11459, %r11458, %r11453; + shf.l.wrap.b32 %r11460, %r11459, %r11459, 25; + add.s32 %r11461, %r11413, %r10727; + 
add.s32 %r11462, %r11461, %r11432; + xor.b32 %r11463, %r11401, %r11462; + shf.l.wrap.b32 %r11464, %r11463, %r11463, 16; + add.s32 %r11465, %r11464, %r11444; + xor.b32 %r11466, %r11465, %r11432; + shf.l.wrap.b32 %r11467, %r11466, %r11466, 20; + add.s32 %r11468, %r11462, %r10797; + add.s32 %r11469, %r11468, %r11467; + xor.b32 %r11470, %r11469, %r11464; + shf.l.wrap.b32 %r11471, %r11470, %r11470, 24; + add.s32 %r11472, %r11471, %r11465; + xor.b32 %r11473, %r11472, %r11467; + shf.l.wrap.b32 %r11474, %r11473, %r11473, 25; + add.s32 %r11475, %r11427, %r10741; + add.s32 %r11476, %r11475, %r11446; + xor.b32 %r11477, %r11476, %r11415; + shf.l.wrap.b32 %r11478, %r11477, %r11477, 16; + add.s32 %r11479, %r11478, %r11402; + xor.b32 %r11480, %r11479, %r11446; + shf.l.wrap.b32 %r11481, %r11480, %r11480, 20; + add.s32 %r11482, %r11476, %r10769; + add.s32 %r11483, %r11482, %r11481; + xor.b32 %r11484, %r11483, %r11478; + shf.l.wrap.b32 %r11485, %r11484, %r11484, 24; + add.s32 %r11486, %r11485, %r11479; + xor.b32 %r11487, %r11486, %r11481; + shf.l.wrap.b32 %r11488, %r11487, %r11487, 25; + add.s32 %r11489, %r11441, %r10755; + add.s32 %r11490, %r11489, %r11404; + xor.b32 %r11491, %r11490, %r11429; + shf.l.wrap.b32 %r11492, %r11491, %r11491, 16; + add.s32 %r11493, %r11492, %r11416; + xor.b32 %r11494, %r11493, %r11404; + shf.l.wrap.b32 %r11495, %r11494, %r11494, 20; + add.s32 %r11496, %r11490, %r10776; + add.s32 %r11497, %r11496, %r11495; + xor.b32 %r11498, %r11497, %r11492; + shf.l.wrap.b32 %r11499, %r11498, %r11498, 24; + add.s32 %r11500, %r11499, %r11493; + xor.b32 %r11501, %r11500, %r11495; + shf.l.wrap.b32 %r11502, %r11501, %r11501, 25; + add.s32 %r11503, %r11455, %r10804; + add.s32 %r11504, %r11503, %r11502; + xor.b32 %r11505, %r11504, %r11471; + shf.l.wrap.b32 %r11506, %r11505, %r11505, 16; + add.s32 %r11507, %r11506, %r11486; + xor.b32 %r11508, %r11507, %r11502; + shf.l.wrap.b32 %r11509, %r11508, %r11508, 20; + add.s32 %r11510, %r11504, %r10832; + add.s32 %r11511, %r11510, %r11509; + xor.b32 %r11512, %r11511, %r11506; + shf.l.wrap.b32 %r11513, %r11512, %r11512, 24; + add.s32 %r11514, %r11513, %r11507; + xor.b32 %r11515, %r11514, %r11509; + shf.l.wrap.b32 %r11516, %r11515, %r11515, 25; + add.s32 %r11517, %r11469, %r10762; + add.s32 %r11518, %r11517, %r11460; + xor.b32 %r11519, %r11518, %r11485; + shf.l.wrap.b32 %r11520, %r11519, %r11519, 16; + add.s32 %r11521, %r11520, %r11500; + xor.b32 %r11522, %r11521, %r11460; + shf.l.wrap.b32 %r11523, %r11522, %r11522, 20; + add.s32 %r11524, %r11518, %r10727; + add.s32 %r11525, %r11524, %r11523; + xor.b32 %r11526, %r11525, %r11520; + shf.l.wrap.b32 %r11527, %r11526, %r11526, 24; + add.s32 %r11528, %r11527, %r11521; + xor.b32 %r11529, %r11528, %r11523; + shf.l.wrap.b32 %r11530, %r11529, %r11529, 25; + add.s32 %r11531, %r11483, %r10734; + add.s32 %r11532, %r11531, %r11474; + xor.b32 %r11533, %r11499, %r11532; + shf.l.wrap.b32 %r11534, %r11533, %r11533, 16; + add.s32 %r11535, %r11534, %r11458; + xor.b32 %r11536, %r11535, %r11474; + shf.l.wrap.b32 %r11537, %r11536, %r11536, 20; + add.s32 %r11538, %r11532, %r10790; + add.s32 %r11539, %r11538, %r11537; + xor.b32 %r11540, %r11539, %r11534; + shf.l.wrap.b32 %r11541, %r11540, %r11540, 24; + add.s32 %r11542, %r11541, %r11535; + xor.b32 %r11543, %r11542, %r11537; + shf.l.wrap.b32 %r11544, %r11543, %r11543, 25; + add.s32 %r11545, %r11488, %r10783; + add.s32 %r11546, %r11545, %r11497; + xor.b32 %r11547, %r11546, %r11457; + shf.l.wrap.b32 %r11548, %r11547, %r11547, 16; + add.s32 %r11549, %r11548, %r11472; + xor.b32 %r11550, 
%r11549, %r11488; + shf.l.wrap.b32 %r11551, %r11550, %r11550, 20; + add.s32 %r11552, %r11546, %r10769; + add.s32 %r11553, %r11552, %r11551; + xor.b32 %r11554, %r11553, %r11548; + shf.l.wrap.b32 %r11555, %r11554, %r11554, 24; + add.s32 %r11556, %r11555, %r11549; + xor.b32 %r11557, %r11556, %r11551; + shf.l.wrap.b32 %r11558, %r11557, %r11557, 25; + add.s32 %r11559, %r11530, %r10825; + add.s32 %r11560, %r11559, %r11511; + xor.b32 %r11561, %r11560, %r11555; + shf.l.wrap.b32 %r11562, %r11561, %r11561, 16; + add.s32 %r11563, %r11562, %r11542; + xor.b32 %r11564, %r11563, %r11530; + shf.l.wrap.b32 %r11565, %r11564, %r11564, 20; + add.s32 %r11566, %r11560, %r10797; + add.s32 %r11567, %r11566, %r11565; + xor.b32 %r11568, %r11567, %r11562; + shf.l.wrap.b32 %r11569, %r11568, %r11568, 24; + add.s32 %r11570, %r11569, %r11563; + xor.b32 %r11571, %r11570, %r11565; + shf.l.wrap.b32 %r11572, %r11571, %r11571, 25; + add.s32 %r11573, %r11525, %r10741; + add.s32 %r11574, %r11573, %r11544; + xor.b32 %r11575, %r11513, %r11574; + shf.l.wrap.b32 %r11576, %r11575, %r11575, 16; + add.s32 %r11577, %r11576, %r11556; + xor.b32 %r11578, %r11577, %r11544; + shf.l.wrap.b32 %r11579, %r11578, %r11578, 20; + add.s32 %r11580, %r11574, %r10811; + add.s32 %r11581, %r11580, %r11579; + xor.b32 %r11582, %r11581, %r11576; + shf.l.wrap.b32 %r11583, %r11582, %r11582, 24; + add.s32 %r11584, %r11583, %r11577; + xor.b32 %r11585, %r11584, %r11579; + shf.l.wrap.b32 %r11586, %r11585, %r11585, 25; + add.s32 %r11587, %r11539, %r10748; + add.s32 %r11588, %r11587, %r11558; + xor.b32 %r11589, %r11588, %r11527; + shf.l.wrap.b32 %r11590, %r11589, %r11589, 16; + add.s32 %r11591, %r11590, %r11514; + xor.b32 %r11592, %r11591, %r11558; + shf.l.wrap.b32 %r11593, %r11592, %r11592, 20; + add.s32 %r11594, %r11588, %r10755; + add.s32 %r11595, %r11594, %r11593; + xor.b32 %r11596, %r11595, %r11590; + shf.l.wrap.b32 %r11597, %r11596, %r11596, 24; + add.s32 %r11598, %r11597, %r11591; + xor.b32 %r11599, %r11598, %r11593; + shf.l.wrap.b32 %r11600, %r11599, %r11599, 25; + add.s32 %r11601, %r11553, %r10776; + add.s32 %r11602, %r11601, %r11516; + xor.b32 %r11603, %r11602, %r11541; + shf.l.wrap.b32 %r11604, %r11603, %r11603, 16; + add.s32 %r11605, %r11604, %r11528; + xor.b32 %r11606, %r11605, %r11516; + shf.l.wrap.b32 %r11607, %r11606, %r11606, 20; + add.s32 %r11608, %r11602, %r10818; + add.s32 %r11609, %r11608, %r11607; + xor.b32 %r11610, %r11609, %r11604; + shf.l.wrap.b32 %r11611, %r11610, %r11610, 24; + add.s32 %r11612, %r11611, %r11605; + xor.b32 %r11613, %r11612, %r11607; + shf.l.wrap.b32 %r11614, %r11613, %r11613, 25; + xor.b32 %r11615, %r11598, %r11567; + xor.b32 %r11616, %r11612, %r11581; + xor.b32 %r11617, %r11570, %r11595; + xor.b32 %r11618, %r11609, %r11584; + xor.b32 %r11619, %r11614, %r11583; + xor.b32 %r11620, %r11572, %r11597; + xor.b32 %r11621, %r11611, %r11586; + xor.b32 %r11622, %r11600, %r11569; + st.local.u8 [%rd219+145], %r11615; + shr.u32 %r11623, %r11615, 8; + st.local.u8 [%rd219+146], %r11623; + shr.u32 %r11624, %r11615, 16; + st.local.u8 [%rd219+147], %r11624; + shr.u32 %r11625, %r11615, 24; + st.local.u8 [%rd219+148], %r11625; + st.local.u8 [%rd219+149], %r11616; + shr.u32 %r11626, %r11616, 8; + st.local.u8 [%rd219+150], %r11626; + shr.u32 %r11627, %r11616, 16; + st.local.u8 [%rd219+151], %r11627; + shr.u32 %r11628, %r11616, 24; + st.local.u8 [%rd219+152], %r11628; + st.local.u8 [%rd219+153], %r11617; + shr.u32 %r11629, %r11617, 8; + st.local.u8 [%rd219+154], %r11629; + shr.u32 %r11630, %r11617, 16; + st.local.u8 [%rd219+155], %r11630; + 
shr.u32 %r11631, %r11617, 24; + st.local.u8 [%rd219+156], %r11631; + st.local.u8 [%rd219+157], %r11618; + shr.u32 %r11632, %r11618, 8; + st.local.u8 [%rd219+158], %r11632; + shr.u32 %r11633, %r11618, 16; + st.local.u8 [%rd219+159], %r11633; + shr.u32 %r11634, %r11618, 24; + st.local.u8 [%rd219+160], %r11634; + st.local.u8 [%rd219+161], %r11619; + shr.u32 %r11635, %r11619, 8; + st.local.u8 [%rd219+162], %r11635; + shr.u32 %r11636, %r11619, 16; + st.local.u8 [%rd219+163], %r11636; + shr.u32 %r11637, %r11619, 24; + st.local.u8 [%rd219+164], %r11637; + st.local.u8 [%rd219+165], %r11620; + shr.u32 %r11638, %r11620, 8; + st.local.u8 [%rd219+166], %r11638; + shr.u32 %r11639, %r11620, 16; + st.local.u8 [%rd219+167], %r11639; + shr.u32 %r11640, %r11620, 24; + st.local.u8 [%rd219+168], %r11640; + st.local.u8 [%rd219+169], %r11621; + shr.u32 %r11641, %r11621, 8; + st.local.u8 [%rd219+170], %r11641; + shr.u32 %r11642, %r11621, 16; + st.local.u8 [%rd219+171], %r11642; + shr.u32 %r11643, %r11621, 24; + st.local.u8 [%rd219+172], %r11643; + st.local.u8 [%rd219+173], %r11622; + shr.u32 %r11644, %r11622, 8; + st.local.u8 [%rd219+174], %r11644; + shr.u32 %r11645, %r11622, 16; + st.local.u8 [%rd219+175], %r11645; + shr.u32 %r11646, %r11622, 24; + st.local.u8 [%rd219+176], %r11646; + add.s16 %rs392, %rs392, -1; + cvt.u64.u16 %rd220, %rs392; + and.b64 %rd221, %rd220, 255; + setp.lt.u64 %p53, %rd97, %rd221; + @%p53 bra $L__BB1_66; + + ld.param.u64 %rd232, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + cvta.to.local.u64 %rd231, %rd232; + add.s64 %rd230, %rd231, 136; + st.local.u8 [%rd230+8], %rs392; + +$L__BB1_68: + ret; + +} + // .globl heavy_hash +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5, + .param .u64 heavy_hash_param_6, + .param .u64 heavy_hash_param_7 +) +{ + .local .align 16 .b8 __local_depot2[2080]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<60>; + .reg .b16 %rs<864>; + .reg .b32 %r<31266>; + .reg .b64 %rd<1373>; + + + mov.u64 %SPL, __local_depot2; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs409, [heavy_hash_param_3]; + ld.param.u64 %rd357, [heavy_hash_param_0]; + ld.param.u64 %rd358, [heavy_hash_param_1]; + ld.param.u64 %rd362, [heavy_hash_param_2]; + ld.param.u64 %rd363, [heavy_hash_param_4]; + ld.param.u64 %rd359, [heavy_hash_param_5]; + ld.param.u64 %rd360, [heavy_hash_param_6]; + ld.param.u64 %rd361, [heavy_hash_param_7]; + cvta.to.global.u64 %rd1, %rd363; + add.u64 %rd2, %SPL, 0; + add.u64 %rd3, %SPL, 2000; + mov.u32 %r5040, %ntid.x; + mov.u32 %r5041, %ctaid.x; + mov.u32 %r5042, %tid.x; + mad.lo.s32 %r5043, %r5041, %r5040, %r5042; + cvt.s64.s32 %rd4, %r5043; + setp.ge.u64 %p6, %rd4, %rd362; + @%p6 bra $L__BB2_105; + + cvt.u32.u64 %r5044, %rd4; + setp.ne.s32 %p7, %r5044, 0; + @%p7 bra $L__BB2_3; + + cvta.to.global.u64 %rd366, %rd359; + mov.u64 %rd367, 0; + st.global.u64 [%rd366], %rd367; + +$L__BB2_3: + setp.eq.s16 %p8, %rs409, 0; + @%p8 bra $L__BB2_5; + + shl.b64 %rd368, %rd4, 5; + add.s64 %rd369, %rd1, %rd368; + ld.global.v2.u64 {%rd370, %rd371}, [%rd369]; + mul.lo.s64 %rd374, %rd371, 5; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd374, 7; + shr.b64 %rhs, %rd374, 57; + add.u64 %rd375, %lhs, %rhs; + } + mul.lo.s64 %rd1299, %rd375, 9; + shl.b64 %rd376, %rd371, 17; + ld.global.v2.u64 {%rd377, %rd378}, [%rd369+16]; + xor.b64 %rd381, %rd377, %rd370; + xor.b64 %rd382, %rd378, %rd371; + 
xor.b64 %rd383, %rd371, %rd381; + xor.b64 %rd384, %rd370, %rd382; + st.global.v2.u64 [%rd369], {%rd384, %rd383}; + { + .reg .b32 %dummy; + mov.b64 {%r5045,%dummy}, %rd382; + } + { + .reg .b32 %dummy; + mov.b64 {%dummy,%r5046}, %rd382; + } + shf.r.wrap.b32 %r5047, %r5046, %r5045, 19; + shf.r.wrap.b32 %r5048, %r5045, %r5046, 19; + mov.b64 %rd385, {%r5048, %r5047}; + xor.b64 %rd386, %rd381, %rd376; + st.global.v2.u64 [%rd369+16], {%rd386, %rd385}; + bra.uni $L__BB2_6; + +$L__BB2_5: + ld.global.u64 %rd387, [%rd1]; + xor.b64 %rd1299, %rd387, %rd4; + +$L__BB2_6: + and.b64 %rd389, %rd1299, %rd357; + or.b64 %rd8, %rd389, %rd358; + mov.u64 %rd1300, 0; + mov.u32 %r29818, 0; + mov.u64 %rd390, hash_header; + +$L__BB2_7: + add.s64 %rd391, %rd390, %rd1300; + ld.const.u8 %rs410, [%rd391]; + add.s64 %rd392, %rd3, %rd1300; + st.local.u8 [%rd392], %rs410; + add.s64 %rd1300, %rd1300, 1; + add.s32 %r29818, %r29818, 1; + setp.lt.u32 %p9, %r29818, 72; + @%p9 bra $L__BB2_7; + + ld.local.v4.u32 {%r5050, %r5051, %r5052, %r5053}, [%rd3]; + mov.u64 %rd393, 0; + ld.local.v4.u32 {%r5054, %r5055, %r5056, %r5057}, [%rd3+16]; + ld.local.v4.u32 {%r5058, %r5059, %r5060, %r5061}, [%rd3+32]; + ld.local.v4.u32 {%r5062, %r5063, %r5064, %r5065}, [%rd3+48]; + st.local.u64 [%rd3+72], %rd8; + mov.u32 %r5066, -1150833019; + mov.u32 %r5067, 1779033703; + st.local.v2.u32 [%rd2], {%r5067, %r5066}; + mov.u32 %r5068, -1521486534; + mov.u32 %r5069, 1013904242; + st.local.v2.u32 [%rd2+8], {%r5069, %r5068}; + mov.u32 %r5070, -1694144372; + mov.u32 %r5071, 1359893119; + st.local.v2.u32 [%rd2+16], {%r5071, %r5070}; + mov.u32 %r5072, 1541459225; + mov.u32 %r5073, 528734635; + st.local.v2.u32 [%rd2+24], {%r5073, %r5072}; + st.local.u64 [%rd2+64], %rd393; + mov.u32 %r5074, 0; + st.local.v2.u32 [%rd2+72], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+80], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+88], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+96], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+104], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+112], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+120], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+128], {%r5074, %r5074}; + mov.u16 %rs411, 0; + st.local.v2.u8 [%rd2+136], {%rs411, %rs411}; + st.local.u8 [%rd2+138], %rs411; + st.local.v2.u32 [%rd2+32], {%r5067, %r5066}; + st.local.v2.u32 [%rd2+40], {%r5069, %r5068}; + st.local.v2.u32 [%rd2+48], {%r5071, %r5070}; + st.local.v2.u32 [%rd2+56], {%r5073, %r5072}; + st.local.u8 [%rd2+144], %rs411; + ld.local.v4.u8 {%rs412, %rs413, %rs414, %rs415}, [%rd2+136]; + setp.eq.s16 %p10, %rs413, 0; + selp.u16 %rs419, 1, 0, %p10; + or.b16 %rs420, %rs414, %rs419; + mov.b32 {%rs421, %rs422}, %r5050; + shr.u16 %rs423, %rs421, 8; + shr.u16 %rs424, %rs422, 8; + mov.b32 {%rs425, %rs426}, %r5051; + shr.u16 %rs427, %rs425, 8; + shr.u16 %rs428, %rs426, 8; + mov.b32 {%rs429, %rs430}, %r5052; + shr.u16 %rs431, %rs429, 8; + shr.u16 %rs432, %rs430, 8; + mov.b32 {%rs433, %rs434}, %r5053; + shr.u16 %rs435, %rs433, 8; + shr.u16 %rs436, %rs434, 8; + cvt.u32.u16 %r5079, %rs421; + and.b32 %r5080, %r5079, 255; + cvt.u32.u16 %r5081, %rs423; + prmt.b32 %r5082, %r5081, %r5080, 30212; + cvt.u32.u16 %r5083, %rs422; + prmt.b32 %r5084, %r5083, %r5082, 28756; + cvt.u32.u16 %r5085, %rs424; + prmt.b32 %r5086, %r5085, %r5084, 1620; + cvt.u32.u16 %r5087, %rs425; + and.b32 %r5088, %r5087, 255; + cvt.u32.u16 %r5089, %rs427; + prmt.b32 %r5090, %r5089, %r5088, 30212; + cvt.u32.u16 %r5091, %rs426; + prmt.b32 %r5092, %r5091, %r5090, 28756; + cvt.u32.u16 %r5093, %rs428; + prmt.b32 %r5094, %r5093, %r5092, 1620; + cvt.u32.u16 
%r5095, %rs429; + and.b32 %r5096, %r5095, 255; + cvt.u32.u16 %r5097, %rs431; + prmt.b32 %r5098, %r5097, %r5096, 30212; + cvt.u32.u16 %r5099, %rs430; + prmt.b32 %r5100, %r5099, %r5098, 28756; + cvt.u32.u16 %r5101, %rs432; + prmt.b32 %r5102, %r5101, %r5100, 1620; + cvt.u32.u16 %r5103, %rs433; + and.b32 %r5104, %r5103, 255; + cvt.u32.u16 %r5105, %rs435; + prmt.b32 %r5106, %r5105, %r5104, 30212; + cvt.u32.u16 %r5107, %rs434; + prmt.b32 %r5108, %r5107, %r5106, 28756; + cvt.u32.u16 %r5109, %rs436; + prmt.b32 %r5110, %r5109, %r5108, 1620; + mov.b32 {%rs437, %rs438}, %r5054; + shr.u16 %rs439, %rs437, 8; + shr.u16 %rs440, %rs438, 8; + mov.b32 {%rs441, %rs442}, %r5055; + shr.u16 %rs443, %rs441, 8; + shr.u16 %rs444, %rs442, 8; + mov.b32 {%rs445, %rs446}, %r5056; + shr.u16 %rs447, %rs445, 8; + shr.u16 %rs448, %rs446, 8; + mov.b32 {%rs449, %rs450}, %r5057; + shr.u16 %rs451, %rs449, 8; + shr.u16 %rs452, %rs450, 8; + cvt.u32.u16 %r5115, %rs437; + and.b32 %r5116, %r5115, 255; + cvt.u32.u16 %r5117, %rs439; + prmt.b32 %r5118, %r5117, %r5116, 30212; + cvt.u32.u16 %r5119, %rs438; + prmt.b32 %r5120, %r5119, %r5118, 28756; + cvt.u32.u16 %r5121, %rs440; + prmt.b32 %r5122, %r5121, %r5120, 1620; + cvt.u32.u16 %r5123, %rs441; + and.b32 %r5124, %r5123, 255; + cvt.u32.u16 %r5125, %rs443; + prmt.b32 %r5126, %r5125, %r5124, 30212; + cvt.u32.u16 %r5127, %rs442; + prmt.b32 %r5128, %r5127, %r5126, 28756; + cvt.u32.u16 %r5129, %rs444; + prmt.b32 %r5130, %r5129, %r5128, 1620; + cvt.u32.u16 %r5131, %rs445; + and.b32 %r5132, %r5131, 255; + cvt.u32.u16 %r5133, %rs447; + prmt.b32 %r5134, %r5133, %r5132, 30212; + cvt.u32.u16 %r5135, %rs446; + prmt.b32 %r5136, %r5135, %r5134, 28756; + cvt.u32.u16 %r5137, %rs448; + prmt.b32 %r5138, %r5137, %r5136, 1620; + cvt.u32.u16 %r5139, %rs449; + and.b32 %r5140, %r5139, 255; + cvt.u32.u16 %r5141, %rs451; + prmt.b32 %r5142, %r5141, %r5140, 30212; + cvt.u32.u16 %r5143, %rs450; + prmt.b32 %r5144, %r5143, %r5142, 28756; + cvt.u32.u16 %r5145, %rs452; + prmt.b32 %r5146, %r5145, %r5144, 1620; + mov.b32 {%rs453, %rs454}, %r5058; + shr.u16 %rs455, %rs453, 8; + shr.u16 %rs456, %rs454, 8; + mov.b32 {%rs457, %rs458}, %r5059; + shr.u16 %rs459, %rs457, 8; + shr.u16 %rs460, %rs458, 8; + mov.b32 {%rs461, %rs462}, %r5060; + shr.u16 %rs463, %rs461, 8; + shr.u16 %rs464, %rs462, 8; + mov.b32 {%rs465, %rs466}, %r5061; + shr.u16 %rs467, %rs465, 8; + shr.u16 %rs468, %rs466, 8; + cvt.u32.u16 %r5151, %rs453; + and.b32 %r5152, %r5151, 255; + cvt.u32.u16 %r5153, %rs455; + prmt.b32 %r5154, %r5153, %r5152, 30212; + cvt.u32.u16 %r5155, %rs454; + prmt.b32 %r5156, %r5155, %r5154, 28756; + cvt.u32.u16 %r5157, %rs456; + prmt.b32 %r5158, %r5157, %r5156, 1620; + cvt.u32.u16 %r5159, %rs457; + and.b32 %r5160, %r5159, 255; + cvt.u32.u16 %r5161, %rs459; + prmt.b32 %r5162, %r5161, %r5160, 30212; + cvt.u32.u16 %r5163, %rs458; + prmt.b32 %r5164, %r5163, %r5162, 28756; + cvt.u32.u16 %r5165, %rs460; + prmt.b32 %r5166, %r5165, %r5164, 1620; + cvt.u32.u16 %r5167, %rs461; + and.b32 %r5168, %r5167, 255; + cvt.u32.u16 %r5169, %rs463; + prmt.b32 %r5170, %r5169, %r5168, 30212; + cvt.u32.u16 %r5171, %rs462; + prmt.b32 %r5172, %r5171, %r5170, 28756; + cvt.u32.u16 %r5173, %rs464; + prmt.b32 %r5174, %r5173, %r5172, 1620; + cvt.u32.u16 %r5175, %rs465; + and.b32 %r5176, %r5175, 255; + cvt.u32.u16 %r5177, %rs467; + prmt.b32 %r5178, %r5177, %r5176, 30212; + cvt.u32.u16 %r5179, %rs466; + prmt.b32 %r5180, %r5179, %r5178, 28756; + cvt.u32.u16 %r5181, %rs468; + prmt.b32 %r5182, %r5181, %r5180, 1620; + mov.b32 {%rs469, %rs470}, %r5062; + shr.u16 %rs471, 
%rs469, 8; + shr.u16 %rs472, %rs470, 8; + mov.b32 {%rs473, %rs474}, %r5063; + shr.u16 %rs475, %rs473, 8; + shr.u16 %rs476, %rs474, 8; + mov.b32 {%rs477, %rs478}, %r5064; + shr.u16 %rs479, %rs477, 8; + shr.u16 %rs480, %rs478, 8; + mov.b32 {%rs481, %rs482}, %r5065; + shr.u16 %rs483, %rs481, 8; + shr.u16 %rs484, %rs482, 8; + cvt.u32.u16 %r5187, %rs469; + and.b32 %r5188, %r5187, 255; + cvt.u32.u16 %r5189, %rs471; + prmt.b32 %r5190, %r5189, %r5188, 30212; + cvt.u32.u16 %r5191, %rs470; + prmt.b32 %r5192, %r5191, %r5190, 28756; + cvt.u32.u16 %r5193, %rs472; + prmt.b32 %r5194, %r5193, %r5192, 1620; + cvt.u32.u16 %r5195, %rs473; + and.b32 %r5196, %r5195, 255; + cvt.u32.u16 %r5197, %rs475; + prmt.b32 %r5198, %r5197, %r5196, 30212; + cvt.u32.u16 %r5199, %rs474; + prmt.b32 %r5200, %r5199, %r5198, 28756; + cvt.u32.u16 %r5201, %rs476; + prmt.b32 %r5202, %r5201, %r5200, 1620; + cvt.u32.u16 %r5203, %rs477; + and.b32 %r5204, %r5203, 255; + cvt.u32.u16 %r5205, %rs479; + prmt.b32 %r5206, %r5205, %r5204, 30212; + cvt.u32.u16 %r5207, %rs478; + prmt.b32 %r5208, %r5207, %r5206, 28756; + cvt.u32.u16 %r5209, %rs480; + prmt.b32 %r5210, %r5209, %r5208, 1620; + cvt.u32.u16 %r5211, %rs481; + and.b32 %r5212, %r5211, 255; + cvt.u32.u16 %r5213, %rs483; + prmt.b32 %r5214, %r5213, %r5212, 30212; + cvt.u32.u16 %r5215, %rs482; + prmt.b32 %r5216, %r5215, %r5214, 28756; + cvt.u32.u16 %r5217, %rs484; + prmt.b32 %r5218, %r5217, %r5216, 1620; + cvt.u32.u16 %r5219, %rs420; + and.b32 %r5220, %r5219, 255; + add.s32 %r5221, %r5086, -1156040474; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 16; + add.s32 %r5223, %r5222, 1779033703; + xor.b32 %r5224, %r5223, 1359893119; + shf.l.wrap.b32 %r5225, %r5224, %r5224, 20; + add.s32 %r5226, %r5094, %r5221; + add.s32 %r5227, %r5226, %r5225; + xor.b32 %r5228, %r5227, %r5222; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 24; + add.s32 %r5230, %r5229, %r5223; + xor.b32 %r5231, %r5230, %r5225; + shf.l.wrap.b32 %r5232, %r5231, %r5231, 25; + add.s32 %r5233, %r5102, 1449989905; + shf.l.wrap.b32 %r5234, %r5233, %r5233, 16; + add.s32 %r5235, %r5234, -1150833019; + xor.b32 %r5236, %r5235, -1694144372; + shf.l.wrap.b32 %r5237, %r5236, %r5236, 20; + add.s32 %r5238, %r5110, %r5233; + add.s32 %r5239, %r5238, %r5237; + xor.b32 %r5240, %r5239, %r5234; + shf.l.wrap.b32 %r5241, %r5240, %r5240, 24; + add.s32 %r5242, %r5241, %r5235; + xor.b32 %r5243, %r5242, %r5237; + shf.l.wrap.b32 %r5244, %r5243, %r5243, 25; + add.s32 %r5245, %r5122, 1542638877; + shr.u32 %r5246, %r5245, 16; + shl.b32 %r5247, %r5245, 16; + xor.b32 %r5248, %r5247, 4194304; + or.b32 %r5249, %r5248, %r5246; + add.s32 %r5250, %r5249, 1013904242; + xor.b32 %r5251, %r5250, 528734635; + shf.l.wrap.b32 %r5252, %r5251, %r5251, 20; + add.s32 %r5253, %r5130, %r5245; + add.s32 %r5254, %r5253, %r5252; + xor.b32 %r5255, %r5254, %r5249; + shf.l.wrap.b32 %r5256, %r5255, %r5255, 24; + add.s32 %r5257, %r5256, %r5250; + xor.b32 %r5258, %r5257, %r5252; + shf.l.wrap.b32 %r5259, %r5258, %r5258, 25; + add.s32 %r5260, %r5138, 19972691; + xor.b32 %r5261, %r5260, %r5220; + shr.u32 %r5262, %r5260, 16; + shl.b32 %r5263, %r5261, 16; + or.b32 %r5264, %r5263, %r5262; + add.s32 %r5265, %r5264, -1521486534; + xor.b32 %r5266, %r5265, 1541459225; + shf.l.wrap.b32 %r5267, %r5266, %r5266, 20; + add.s32 %r5268, %r5146, %r5260; + add.s32 %r5269, %r5268, %r5267; + xor.b32 %r5270, %r5269, %r5264; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 24; + add.s32 %r5272, %r5271, %r5265; + xor.b32 %r5273, %r5272, %r5267; + shf.l.wrap.b32 %r5274, %r5273, %r5273, 25; + add.s32 %r5275, %r5244, %r5227; + add.s32 
%r5276, %r5275, %r5158; + xor.b32 %r5277, %r5271, %r5276; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 16; + add.s32 %r5279, %r5278, %r5257; + xor.b32 %r5280, %r5279, %r5244; + shf.l.wrap.b32 %r5281, %r5280, %r5280, 20; + add.s32 %r5282, %r5166, %r5276; + add.s32 %r5283, %r5282, %r5281; + xor.b32 %r5284, %r5283, %r5278; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 24; + add.s32 %r5286, %r5285, %r5279; + xor.b32 %r5287, %r5286, %r5281; + shf.l.wrap.b32 %r5288, %r5287, %r5287, 25; + add.s32 %r5289, %r5259, %r5239; + add.s32 %r5290, %r5289, %r5174; + xor.b32 %r5291, %r5290, %r5229; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 16; + add.s32 %r5293, %r5292, %r5272; + xor.b32 %r5294, %r5293, %r5259; + shf.l.wrap.b32 %r5295, %r5294, %r5294, 20; + add.s32 %r5296, %r5182, %r5290; + add.s32 %r5297, %r5296, %r5295; + xor.b32 %r5298, %r5297, %r5292; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 24; + add.s32 %r5300, %r5299, %r5293; + xor.b32 %r5301, %r5300, %r5295; + shf.l.wrap.b32 %r5302, %r5301, %r5301, 25; + add.s32 %r5303, %r5274, %r5254; + add.s32 %r5304, %r5303, %r5194; + xor.b32 %r5305, %r5304, %r5241; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 16; + add.s32 %r5307, %r5306, %r5230; + xor.b32 %r5308, %r5307, %r5274; + shf.l.wrap.b32 %r5309, %r5308, %r5308, 20; + add.s32 %r5310, %r5202, %r5304; + add.s32 %r5311, %r5310, %r5309; + xor.b32 %r5312, %r5311, %r5306; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 24; + add.s32 %r5314, %r5313, %r5307; + xor.b32 %r5315, %r5314, %r5309; + shf.l.wrap.b32 %r5316, %r5315, %r5315, 25; + add.s32 %r5317, %r5269, %r5232; + add.s32 %r5318, %r5317, %r5210; + xor.b32 %r5319, %r5318, %r5256; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 16; + add.s32 %r5321, %r5320, %r5242; + xor.b32 %r5322, %r5321, %r5232; + shf.l.wrap.b32 %r5323, %r5322, %r5322, 20; + add.s32 %r5324, %r5218, %r5318; + add.s32 %r5325, %r5324, %r5323; + xor.b32 %r5326, %r5325, %r5320; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 24; + add.s32 %r5328, %r5327, %r5321; + xor.b32 %r5329, %r5328, %r5323; + shf.l.wrap.b32 %r5330, %r5329, %r5329, 25; + add.s32 %r5331, %r5283, %r5102; + add.s32 %r5332, %r5331, %r5330; + xor.b32 %r5333, %r5332, %r5299; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 16; + add.s32 %r5335, %r5334, %r5314; + xor.b32 %r5336, %r5335, %r5330; + shf.l.wrap.b32 %r5337, %r5336, %r5336, 20; + add.s32 %r5338, %r5332, %r5138; + add.s32 %r5339, %r5338, %r5337; + xor.b32 %r5340, %r5339, %r5334; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 24; + add.s32 %r5342, %r5341, %r5335; + xor.b32 %r5343, %r5342, %r5337; + shf.l.wrap.b32 %r5344, %r5343, %r5343, 25; + add.s32 %r5345, %r5297, %r5110; + add.s32 %r5346, %r5345, %r5288; + xor.b32 %r5347, %r5313, %r5346; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 16; + add.s32 %r5349, %r5328, %r5348; + xor.b32 %r5350, %r5349, %r5288; + shf.l.wrap.b32 %r5351, %r5350, %r5350, 20; + add.s32 %r5352, %r5346, %r5174; + add.s32 %r5353, %r5352, %r5351; + xor.b32 %r5354, %r5353, %r5348; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 24; + add.s32 %r5356, %r5355, %r5349; + xor.b32 %r5357, %r5356, %r5351; + shf.l.wrap.b32 %r5358, %r5357, %r5357, 25; + add.s32 %r5359, %r5302, %r5146; + add.s32 %r5360, %r5359, %r5311; + xor.b32 %r5361, %r5327, %r5360; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 16; + add.s32 %r5363, %r5362, %r5286; + xor.b32 %r5364, %r5363, %r5302; + shf.l.wrap.b32 %r5365, %r5364, %r5364, 20; + add.s32 %r5366, %r5360, %r5086; + add.s32 %r5367, %r5366, %r5365; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 24; + add.s32 %r5370, %r5369, %r5363; + xor.b32 %r5371, %r5370, %r5365; + 
shf.l.wrap.b32 %r5372, %r5371, %r5371, 25; + add.s32 %r5373, %r5316, %r5122; + add.s32 %r5374, %r5373, %r5325; + xor.b32 %r5375, %r5374, %r5285; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 16; + add.s32 %r5377, %r5376, %r5300; + xor.b32 %r5378, %r5377, %r5316; + shf.l.wrap.b32 %r5379, %r5378, %r5378, 20; + add.s32 %r5380, %r5374, %r5202; + add.s32 %r5381, %r5380, %r5379; + xor.b32 %r5382, %r5381, %r5376; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 24; + add.s32 %r5384, %r5383, %r5377; + xor.b32 %r5385, %r5384, %r5379; + shf.l.wrap.b32 %r5386, %r5385, %r5385, 25; + add.s32 %r5387, %r5339, %r5094; + add.s32 %r5388, %r5387, %r5358; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 16; + add.s32 %r5391, %r5390, %r5370; + xor.b32 %r5392, %r5391, %r5358; + shf.l.wrap.b32 %r5393, %r5392, %r5392, 20; + add.s32 %r5394, %r5388, %r5182; + add.s32 %r5395, %r5394, %r5393; + xor.b32 %r5396, %r5395, %r5390; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 24; + add.s32 %r5398, %r5397, %r5391; + xor.b32 %r5399, %r5398, %r5393; + shf.l.wrap.b32 %r5400, %r5399, %r5399, 25; + add.s32 %r5401, %r5353, %r5194; + add.s32 %r5402, %r5401, %r5372; + xor.b32 %r5403, %r5402, %r5341; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 16; + add.s32 %r5405, %r5404, %r5384; + xor.b32 %r5406, %r5405, %r5372; + shf.l.wrap.b32 %r5407, %r5406, %r5406, 20; + add.s32 %r5408, %r5402, %r5130; + add.s32 %r5409, %r5408, %r5407; + xor.b32 %r5410, %r5409, %r5404; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 24; + add.s32 %r5412, %r5411, %r5405; + xor.b32 %r5413, %r5412, %r5407; + shf.l.wrap.b32 %r5414, %r5413, %r5413, 25; + add.s32 %r5415, %r5367, %r5166; + add.s32 %r5416, %r5415, %r5386; + xor.b32 %r5417, %r5416, %r5355; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 16; + add.s32 %r5419, %r5418, %r5342; + xor.b32 %r5420, %r5419, %r5386; + shf.l.wrap.b32 %r5421, %r5420, %r5420, 20; + add.s32 %r5422, %r5416, %r5210; + add.s32 %r5423, %r5422, %r5421; + xor.b32 %r5424, %r5423, %r5418; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 24; + add.s32 %r5426, %r5425, %r5419; + xor.b32 %r5427, %r5426, %r5421; + shf.l.wrap.b32 %r5428, %r5427, %r5427, 25; + add.s32 %r5429, %r5381, %r5218; + add.s32 %r5430, %r5429, %r5344; + xor.b32 %r5431, %r5430, %r5369; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 16; + add.s32 %r5433, %r5432, %r5356; + xor.b32 %r5434, %r5433, %r5344; + shf.l.wrap.b32 %r5435, %r5434, %r5434, 20; + add.s32 %r5436, %r5430, %r5158; + add.s32 %r5437, %r5436, %r5435; + xor.b32 %r5438, %r5437, %r5432; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 24; + add.s32 %r5440, %r5439, %r5433; + xor.b32 %r5441, %r5440, %r5435; + shf.l.wrap.b32 %r5442, %r5441, %r5441, 25; + add.s32 %r5443, %r5395, %r5110; + add.s32 %r5444, %r5443, %r5442; + xor.b32 %r5445, %r5444, %r5411; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 16; + add.s32 %r5447, %r5446, %r5426; + xor.b32 %r5448, %r5447, %r5442; + shf.l.wrap.b32 %r5449, %r5448, %r5448, 20; + add.s32 %r5450, %r5444, %r5122; + add.s32 %r5451, %r5450, %r5449; + xor.b32 %r5452, %r5451, %r5446; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 24; + add.s32 %r5454, %r5453, %r5447; + xor.b32 %r5455, %r5454, %r5449; + shf.l.wrap.b32 %r5456, %r5455, %r5455, 25; + add.s32 %r5457, %r5409, %r5174; + add.s32 %r5458, %r5457, %r5400; + xor.b32 %r5459, %r5458, %r5425; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 16; + add.s32 %r5461, %r5460, %r5440; + xor.b32 %r5462, %r5461, %r5400; + shf.l.wrap.b32 %r5463, %r5462, %r5462, 20; + add.s32 %r5464, %r5458, %r5194; + add.s32 %r5465, %r5464, %r5463; + xor.b32 %r5466, %r5465, %r5460; + shf.l.wrap.b32 %r5467, 
%r5466, %r5466, 24; + add.s32 %r5468, %r5467, %r5461; + xor.b32 %r5469, %r5468, %r5463; + shf.l.wrap.b32 %r5470, %r5469, %r5469, 25; + add.s32 %r5471, %r5423, %r5202; + add.s32 %r5472, %r5471, %r5414; + xor.b32 %r5473, %r5472, %r5439; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 16; + add.s32 %r5475, %r5474, %r5398; + xor.b32 %r5476, %r5475, %r5414; + shf.l.wrap.b32 %r5477, %r5476, %r5476, 20; + add.s32 %r5478, %r5472, %r5102; + add.s32 %r5479, %r5478, %r5477; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 24; + add.s32 %r5482, %r5481, %r5475; + xor.b32 %r5483, %r5482, %r5477; + shf.l.wrap.b32 %r5484, %r5483, %r5483, 25; + add.s32 %r5485, %r5437, %r5146; + add.s32 %r5486, %r5485, %r5428; + xor.b32 %r5487, %r5486, %r5397; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 16; + add.s32 %r5489, %r5488, %r5412; + xor.b32 %r5490, %r5489, %r5428; + shf.l.wrap.b32 %r5491, %r5490, %r5490, 20; + add.s32 %r5492, %r5486, %r5210; + add.s32 %r5493, %r5492, %r5491; + xor.b32 %r5494, %r5493, %r5488; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 24; + add.s32 %r5496, %r5495, %r5489; + xor.b32 %r5497, %r5496, %r5491; + shf.l.wrap.b32 %r5498, %r5497, %r5497, 25; + add.s32 %r5499, %r5451, %r5138; + add.s32 %r5500, %r5499, %r5470; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 16; + add.s32 %r5503, %r5502, %r5482; + xor.b32 %r5504, %r5503, %r5470; + shf.l.wrap.b32 %r5505, %r5504, %r5504, 20; + add.s32 %r5506, %r5500, %r5130; + add.s32 %r5507, %r5506, %r5505; + xor.b32 %r5508, %r5507, %r5502; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 24; + add.s32 %r5510, %r5509, %r5503; + xor.b32 %r5511, %r5510, %r5505; + shf.l.wrap.b32 %r5512, %r5511, %r5511, 25; + add.s32 %r5513, %r5465, %r5166; + add.s32 %r5514, %r5513, %r5484; + xor.b32 %r5515, %r5514, %r5453; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 16; + add.s32 %r5517, %r5516, %r5496; + xor.b32 %r5518, %r5517, %r5484; + shf.l.wrap.b32 %r5519, %r5518, %r5518, 20; + add.s32 %r5520, %r5514, %r5086; + add.s32 %r5521, %r5520, %r5519; + xor.b32 %r5522, %r5521, %r5516; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 24; + add.s32 %r5524, %r5523, %r5517; + xor.b32 %r5525, %r5524, %r5519; + shf.l.wrap.b32 %r5526, %r5525, %r5525, 25; + add.s32 %r5527, %r5479, %r5182; + add.s32 %r5528, %r5527, %r5498; + xor.b32 %r5529, %r5528, %r5467; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 16; + add.s32 %r5531, %r5530, %r5454; + xor.b32 %r5532, %r5531, %r5498; + shf.l.wrap.b32 %r5533, %r5532, %r5532, 20; + add.s32 %r5534, %r5528, %r5218; + add.s32 %r5535, %r5534, %r5533; + xor.b32 %r5536, %r5535, %r5530; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 24; + add.s32 %r5538, %r5537, %r5531; + xor.b32 %r5539, %r5538, %r5533; + shf.l.wrap.b32 %r5540, %r5539, %r5539, 25; + add.s32 %r5541, %r5493, %r5158; + add.s32 %r5542, %r5541, %r5456; + xor.b32 %r5543, %r5542, %r5481; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 16; + add.s32 %r5545, %r5544, %r5468; + xor.b32 %r5546, %r5545, %r5456; + shf.l.wrap.b32 %r5547, %r5546, %r5546, 20; + add.s32 %r5548, %r5542, %r5094; + add.s32 %r5549, %r5548, %r5547; + xor.b32 %r5550, %r5549, %r5544; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 24; + add.s32 %r5552, %r5551, %r5545; + xor.b32 %r5553, %r5552, %r5547; + shf.l.wrap.b32 %r5554, %r5553, %r5553, 25; + add.s32 %r5555, %r5507, %r5174; + add.s32 %r5556, %r5555, %r5554; + xor.b32 %r5557, %r5556, %r5523; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 16; + add.s32 %r5559, %r5558, %r5538; + xor.b32 %r5560, %r5559, %r5554; + shf.l.wrap.b32 %r5561, %r5560, %r5560, 20; + add.s32 %r5562, %r5556, %r5146; + 
add.s32 %r5563, %r5562, %r5561; + xor.b32 %r5564, %r5563, %r5558; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 24; + add.s32 %r5566, %r5565, %r5559; + xor.b32 %r5567, %r5566, %r5561; + shf.l.wrap.b32 %r5568, %r5567, %r5567, 25; + add.s32 %r5569, %r5521, %r5194; + add.s32 %r5570, %r5569, %r5512; + xor.b32 %r5571, %r5570, %r5537; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 16; + add.s32 %r5573, %r5572, %r5552; + xor.b32 %r5574, %r5573, %r5512; + shf.l.wrap.b32 %r5575, %r5574, %r5574, 20; + add.s32 %r5576, %r5570, %r5166; + add.s32 %r5577, %r5576, %r5575; + xor.b32 %r5578, %r5577, %r5572; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 24; + add.s32 %r5580, %r5579, %r5573; + xor.b32 %r5581, %r5580, %r5575; + shf.l.wrap.b32 %r5582, %r5581, %r5581, 25; + add.s32 %r5583, %r5535, %r5210; + add.s32 %r5584, %r5583, %r5526; + xor.b32 %r5585, %r5584, %r5551; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 16; + add.s32 %r5587, %r5586, %r5510; + xor.b32 %r5588, %r5587, %r5526; + shf.l.wrap.b32 %r5589, %r5588, %r5588, 20; + add.s32 %r5590, %r5584, %r5110; + add.s32 %r5591, %r5590, %r5589; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 24; + add.s32 %r5594, %r5593, %r5587; + xor.b32 %r5595, %r5594, %r5589; + shf.l.wrap.b32 %r5596, %r5595, %r5595, 25; + add.s32 %r5597, %r5549, %r5202; + add.s32 %r5598, %r5597, %r5540; + xor.b32 %r5599, %r5598, %r5509; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 16; + add.s32 %r5601, %r5600, %r5524; + xor.b32 %r5602, %r5601, %r5540; + shf.l.wrap.b32 %r5603, %r5602, %r5602, 20; + add.s32 %r5604, %r5598, %r5218; + add.s32 %r5605, %r5604, %r5603; + xor.b32 %r5606, %r5605, %r5600; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 24; + add.s32 %r5608, %r5607, %r5601; + xor.b32 %r5609, %r5608, %r5603; + shf.l.wrap.b32 %r5610, %r5609, %r5609, 25; + add.s32 %r5611, %r5563, %r5122; + add.s32 %r5612, %r5611, %r5582; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 16; + add.s32 %r5615, %r5614, %r5594; + xor.b32 %r5616, %r5615, %r5582; + shf.l.wrap.b32 %r5617, %r5616, %r5616, 20; + add.s32 %r5618, %r5612, %r5086; + add.s32 %r5619, %r5618, %r5617; + xor.b32 %r5620, %r5619, %r5614; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 24; + add.s32 %r5622, %r5621, %r5615; + xor.b32 %r5623, %r5622, %r5617; + shf.l.wrap.b32 %r5624, %r5623, %r5623, 25; + add.s32 %r5625, %r5577, %r5182; + add.s32 %r5626, %r5625, %r5596; + xor.b32 %r5627, %r5626, %r5565; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 16; + add.s32 %r5629, %r5628, %r5608; + xor.b32 %r5630, %r5629, %r5596; + shf.l.wrap.b32 %r5631, %r5630, %r5630, 20; + add.s32 %r5632, %r5626, %r5102; + add.s32 %r5633, %r5632, %r5631; + xor.b32 %r5634, %r5633, %r5628; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 24; + add.s32 %r5636, %r5635, %r5629; + xor.b32 %r5637, %r5636, %r5631; + shf.l.wrap.b32 %r5638, %r5637, %r5637, 25; + add.s32 %r5639, %r5591, %r5130; + add.s32 %r5640, %r5639, %r5610; + xor.b32 %r5641, %r5640, %r5579; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 16; + add.s32 %r5643, %r5642, %r5566; + xor.b32 %r5644, %r5643, %r5610; + shf.l.wrap.b32 %r5645, %r5644, %r5644, 20; + add.s32 %r5646, %r5640, %r5158; + add.s32 %r5647, %r5646, %r5645; + xor.b32 %r5648, %r5647, %r5642; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 24; + add.s32 %r5650, %r5649, %r5643; + xor.b32 %r5651, %r5650, %r5645; + shf.l.wrap.b32 %r5652, %r5651, %r5651, 25; + add.s32 %r5653, %r5605, %r5094; + add.s32 %r5654, %r5653, %r5568; + xor.b32 %r5655, %r5654, %r5593; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 16; + add.s32 %r5657, %r5656, %r5580; + xor.b32 %r5658, %r5657, 
%r5568; + shf.l.wrap.b32 %r5659, %r5658, %r5658, 20; + add.s32 %r5660, %r5654, %r5138; + add.s32 %r5661, %r5660, %r5659; + xor.b32 %r5662, %r5661, %r5656; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 24; + add.s32 %r5664, %r5663, %r5657; + xor.b32 %r5665, %r5664, %r5659; + shf.l.wrap.b32 %r5666, %r5665, %r5665, 25; + add.s32 %r5667, %r5619, %r5194; + add.s32 %r5668, %r5667, %r5666; + xor.b32 %r5669, %r5668, %r5635; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 16; + add.s32 %r5671, %r5670, %r5650; + xor.b32 %r5672, %r5671, %r5666; + shf.l.wrap.b32 %r5673, %r5672, %r5672, 20; + add.s32 %r5674, %r5668, %r5202; + add.s32 %r5675, %r5674, %r5673; + xor.b32 %r5676, %r5675, %r5670; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 24; + add.s32 %r5678, %r5677, %r5671; + xor.b32 %r5679, %r5678, %r5673; + shf.l.wrap.b32 %r5680, %r5679, %r5679, 25; + add.s32 %r5681, %r5633, %r5166; + add.s32 %r5682, %r5681, %r5624; + xor.b32 %r5683, %r5682, %r5649; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 16; + add.s32 %r5685, %r5684, %r5664; + xor.b32 %r5686, %r5685, %r5624; + shf.l.wrap.b32 %r5687, %r5686, %r5686, 20; + add.s32 %r5688, %r5682, %r5182; + add.s32 %r5689, %r5688, %r5687; + xor.b32 %r5690, %r5689, %r5684; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 24; + add.s32 %r5692, %r5691, %r5685; + xor.b32 %r5693, %r5692, %r5687; + shf.l.wrap.b32 %r5694, %r5693, %r5693, 25; + add.s32 %r5695, %r5647, %r5218; + add.s32 %r5696, %r5695, %r5638; + xor.b32 %r5697, %r5696, %r5663; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 16; + add.s32 %r5699, %r5698, %r5622; + xor.b32 %r5700, %r5699, %r5638; + shf.l.wrap.b32 %r5701, %r5700, %r5700, 20; + add.s32 %r5702, %r5696, %r5174; + add.s32 %r5703, %r5702, %r5701; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 24; + add.s32 %r5706, %r5705, %r5699; + xor.b32 %r5707, %r5706, %r5701; + shf.l.wrap.b32 %r5708, %r5707, %r5707, 25; + add.s32 %r5709, %r5661, %r5210; + add.s32 %r5710, %r5709, %r5652; + xor.b32 %r5711, %r5710, %r5621; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 16; + add.s32 %r5713, %r5712, %r5636; + xor.b32 %r5714, %r5713, %r5652; + shf.l.wrap.b32 %r5715, %r5714, %r5714, 20; + add.s32 %r5716, %r5710, %r5158; + add.s32 %r5717, %r5716, %r5715; + xor.b32 %r5718, %r5717, %r5712; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 24; + add.s32 %r5720, %r5719, %r5713; + xor.b32 %r5721, %r5720, %r5715; + shf.l.wrap.b32 %r5722, %r5721, %r5721, 25; + add.s32 %r5723, %r5675, %r5146; + add.s32 %r5724, %r5723, %r5694; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 16; + add.s32 %r5727, %r5726, %r5706; + xor.b32 %r5728, %r5727, %r5694; + shf.l.wrap.b32 %r5729, %r5728, %r5728, 20; + add.s32 %r5730, %r5724, %r5102; + add.s32 %r5731, %r5730, %r5729; + xor.b32 %r5732, %r5731, %r5726; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 24; + add.s32 %r5734, %r5733, %r5727; + xor.b32 %r5735, %r5734, %r5729; + shf.l.wrap.b32 %r5736, %r5735, %r5735, 25; + add.s32 %r5737, %r5689, %r5130; + add.s32 %r5738, %r5737, %r5708; + xor.b32 %r5739, %r5738, %r5677; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 16; + add.s32 %r5741, %r5740, %r5720; + xor.b32 %r5742, %r5741, %r5708; + shf.l.wrap.b32 %r5743, %r5742, %r5742, 20; + add.s32 %r5744, %r5738, %r5110; + add.s32 %r5745, %r5744, %r5743; + xor.b32 %r5746, %r5745, %r5740; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 24; + add.s32 %r5748, %r5747, %r5741; + xor.b32 %r5749, %r5748, %r5743; + shf.l.wrap.b32 %r5750, %r5749, %r5749, 25; + add.s32 %r5751, %r5703, %r5086; + add.s32 %r5752, %r5751, %r5722; + xor.b32 %r5753, %r5752, %r5691; + shf.l.wrap.b32 
%r5754, %r5753, %r5753, 16; + add.s32 %r5755, %r5754, %r5678; + xor.b32 %r5756, %r5755, %r5722; + shf.l.wrap.b32 %r5757, %r5756, %r5756, 20; + add.s32 %r5758, %r5752, %r5094; + add.s32 %r5759, %r5758, %r5757; + xor.b32 %r5760, %r5759, %r5754; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 24; + add.s32 %r5762, %r5761, %r5755; + xor.b32 %r5763, %r5762, %r5757; + shf.l.wrap.b32 %r5764, %r5763, %r5763, 25; + add.s32 %r5765, %r5717, %r5138; + add.s32 %r5766, %r5765, %r5680; + xor.b32 %r5767, %r5766, %r5705; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 16; + add.s32 %r5769, %r5768, %r5692; + xor.b32 %r5770, %r5769, %r5680; + shf.l.wrap.b32 %r5771, %r5770, %r5770, 20; + add.s32 %r5772, %r5766, %r5122; + add.s32 %r5773, %r5772, %r5771; + xor.b32 %r5774, %r5773, %r5768; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 24; + add.s32 %r5776, %r5775, %r5769; + xor.b32 %r5777, %r5776, %r5771; + shf.l.wrap.b32 %r5778, %r5777, %r5777, 25; + add.s32 %r5779, %r5731, %r5166; + add.s32 %r5780, %r5779, %r5778; + xor.b32 %r5781, %r5780, %r5747; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 16; + add.s32 %r5783, %r5782, %r5762; + xor.b32 %r5784, %r5783, %r5778; + shf.l.wrap.b32 %r5785, %r5784, %r5784, 20; + add.s32 %r5786, %r5780, %r5210; + add.s32 %r5787, %r5786, %r5785; + xor.b32 %r5788, %r5787, %r5782; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 24; + add.s32 %r5790, %r5789, %r5783; + xor.b32 %r5791, %r5790, %r5785; + shf.l.wrap.b32 %r5792, %r5791, %r5791, 25; + add.s32 %r5793, %r5745, %r5182; + add.s32 %r5794, %r5793, %r5736; + xor.b32 %r5795, %r5794, %r5761; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 16; + add.s32 %r5797, %r5796, %r5776; + xor.b32 %r5798, %r5797, %r5736; + shf.l.wrap.b32 %r5799, %r5798, %r5798, 20; + add.s32 %r5800, %r5794, %r5130; + add.s32 %r5801, %r5800, %r5799; + xor.b32 %r5802, %r5801, %r5796; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 24; + add.s32 %r5804, %r5803, %r5797; + xor.b32 %r5805, %r5804, %r5799; + shf.l.wrap.b32 %r5806, %r5805, %r5805, 25; + add.s32 %r5807, %r5759, %r5158; + add.s32 %r5808, %r5807, %r5750; + xor.b32 %r5809, %r5808, %r5775; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 16; + add.s32 %r5811, %r5810, %r5734; + xor.b32 %r5812, %r5811, %r5750; + shf.l.wrap.b32 %r5813, %r5812, %r5812, 20; + add.s32 %r5814, %r5808, %r5194; + add.s32 %r5815, %r5814, %r5813; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 24; + add.s32 %r5818, %r5817, %r5811; + xor.b32 %r5819, %r5818, %r5813; + shf.l.wrap.b32 %r5820, %r5819, %r5819, 25; + add.s32 %r5821, %r5773, %r5218; + add.s32 %r5822, %r5821, %r5764; + xor.b32 %r5823, %r5822, %r5733; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 16; + add.s32 %r5825, %r5824, %r5748; + xor.b32 %r5826, %r5825, %r5764; + shf.l.wrap.b32 %r5827, %r5826, %r5826, 20; + add.s32 %r5828, %r5822, %r5094; + add.s32 %r5829, %r5828, %r5827; + xor.b32 %r5830, %r5829, %r5824; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 24; + add.s32 %r5832, %r5831, %r5825; + xor.b32 %r5833, %r5832, %r5827; + shf.l.wrap.b32 %r5834, %r5833, %r5833, 25; + add.s32 %r5835, %r5787, %r5202; + add.s32 %r5836, %r5835, %r5806; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 16; + add.s32 %r5839, %r5838, %r5818; + xor.b32 %r5840, %r5839, %r5806; + shf.l.wrap.b32 %r5841, %r5840, %r5840, 20; + add.s32 %r5842, %r5836, %r5110; + add.s32 %r5843, %r5842, %r5841; + xor.b32 %r5844, %r5843, %r5838; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 24; + add.s32 %r5846, %r5845, %r5839; + xor.b32 %r5847, %r5846, %r5841; + shf.l.wrap.b32 %r5848, %r5847, %r5847, 25; + add.s32 %r5849, %r5801, %r5086; 
+ add.s32 %r5850, %r5849, %r5820; + xor.b32 %r5851, %r5850, %r5789; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 16; + add.s32 %r5853, %r5852, %r5832; + xor.b32 %r5854, %r5853, %r5820; + shf.l.wrap.b32 %r5855, %r5854, %r5854, 20; + add.s32 %r5856, %r5850, %r5174; + add.s32 %r5857, %r5856, %r5855; + xor.b32 %r5858, %r5857, %r5852; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 24; + add.s32 %r5860, %r5859, %r5853; + xor.b32 %r5861, %r5860, %r5855; + shf.l.wrap.b32 %r5862, %r5861, %r5861, 25; + add.s32 %r5863, %r5815, %r5102; + add.s32 %r5864, %r5863, %r5834; + xor.b32 %r5865, %r5864, %r5803; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 16; + add.s32 %r5867, %r5866, %r5790; + xor.b32 %r5868, %r5867, %r5834; + shf.l.wrap.b32 %r5869, %r5868, %r5868, 20; + add.s32 %r5870, %r5864, %r5138; + add.s32 %r5871, %r5870, %r5869; + xor.b32 %r5872, %r5871, %r5866; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 24; + add.s32 %r5874, %r5873, %r5867; + xor.b32 %r5875, %r5874, %r5869; + shf.l.wrap.b32 %r5876, %r5875, %r5875, 25; + add.s32 %r5877, %r5829, %r5122; + add.s32 %r5878, %r5877, %r5792; + xor.b32 %r5879, %r5878, %r5817; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 16; + add.s32 %r5881, %r5880, %r5804; + xor.b32 %r5882, %r5881, %r5792; + shf.l.wrap.b32 %r5883, %r5882, %r5882, 20; + add.s32 %r5884, %r5878, %r5146; + add.s32 %r5885, %r5884, %r5883; + xor.b32 %r5886, %r5885, %r5880; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 24; + add.s32 %r5888, %r5887, %r5881; + xor.b32 %r5889, %r5888, %r5883; + shf.l.wrap.b32 %r5890, %r5889, %r5889, 25; + add.s32 %r5891, %r5843, %r5182; + add.s32 %r5892, %r5891, %r5890; + xor.b32 %r5893, %r5892, %r5859; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 16; + add.s32 %r5895, %r5894, %r5874; + xor.b32 %r5896, %r5895, %r5890; + shf.l.wrap.b32 %r5897, %r5896, %r5896, 20; + add.s32 %r5898, %r5892, %r5218; + add.s32 %r5899, %r5898, %r5897; + xor.b32 %r5900, %r5899, %r5894; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 24; + add.s32 %r5902, %r5901, %r5895; + xor.b32 %r5903, %r5902, %r5897; + shf.l.wrap.b32 %r5904, %r5903, %r5903, 25; + add.s32 %r5905, %r5857, %r5130; + add.s32 %r5906, %r5905, %r5848; + xor.b32 %r5907, %r5906, %r5873; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 16; + add.s32 %r5909, %r5908, %r5888; + xor.b32 %r5910, %r5909, %r5848; + shf.l.wrap.b32 %r5911, %r5910, %r5910, 20; + add.s32 %r5912, %r5906, %r5086; + add.s32 %r5913, %r5912, %r5911; + xor.b32 %r5914, %r5913, %r5908; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 24; + add.s32 %r5916, %r5915, %r5909; + xor.b32 %r5917, %r5916, %r5911; + shf.l.wrap.b32 %r5918, %r5917, %r5917, 25; + add.s32 %r5919, %r5871, %r5094; + add.s32 %r5920, %r5919, %r5862; + xor.b32 %r5921, %r5920, %r5887; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 16; + add.s32 %r5923, %r5922, %r5846; + xor.b32 %r5924, %r5923, %r5862; + shf.l.wrap.b32 %r5925, %r5924, %r5924, 20; + add.s32 %r5926, %r5920, %r5166; + add.s32 %r5927, %r5926, %r5925; + xor.b32 %r5928, %r5927, %r5922; + shf.l.wrap.b32 %r5929, %r5928, %r5928, 24; + add.s32 %r5930, %r5929, %r5923; + xor.b32 %r5931, %r5930, %r5925; + shf.l.wrap.b32 %r5932, %r5931, %r5931, 25; + add.s32 %r5933, %r5885, %r5158; + add.s32 %r5934, %r5933, %r5876; + xor.b32 %r5935, %r5934, %r5845; + shf.l.wrap.b32 %r5936, %r5935, %r5935, 16; + add.s32 %r5937, %r5936, %r5860; + xor.b32 %r5938, %r5937, %r5876; + shf.l.wrap.b32 %r5939, %r5938, %r5938, 20; + add.s32 %r5940, %r5934, %r5138; + add.s32 %r5941, %r5940, %r5939; + xor.b32 %r5942, %r5941, %r5936; + shf.l.wrap.b32 %r5943, %r5942, %r5942, 24; + add.s32 %r5944, %r5943, %r5937; + xor.b32 %r5945, %r5944, 
%r5939; + shf.l.wrap.b32 %r5946, %r5945, %r5945, 25; + add.s32 %r5947, %r5899, %r5210; + add.s32 %r5948, %r5947, %r5918; + xor.b32 %r5949, %r5948, %r5943; + shf.l.wrap.b32 %r5950, %r5949, %r5949, 16; + add.s32 %r5951, %r5950, %r5930; + xor.b32 %r5952, %r5951, %r5918; + shf.l.wrap.b32 %r5953, %r5952, %r5952, 20; + add.s32 %r5954, %r5948, %r5174; + add.s32 %r5955, %r5954, %r5953; + xor.b32 %r5956, %r5955, %r5950; + shf.l.wrap.b32 %r5957, %r5956, %r5956, 24; + add.s32 %r5958, %r5957, %r5951; + xor.b32 %r5959, %r5958, %r5953; + shf.l.wrap.b32 %r5960, %r5959, %r5959, 25; + add.s32 %r5961, %r5913, %r5102; + add.s32 %r5962, %r5961, %r5932; + xor.b32 %r5963, %r5962, %r5901; + shf.l.wrap.b32 %r5964, %r5963, %r5963, 16; + add.s32 %r5965, %r5964, %r5944; + xor.b32 %r5966, %r5965, %r5932; + shf.l.wrap.b32 %r5967, %r5966, %r5966, 20; + add.s32 %r5968, %r5962, %r5194; + add.s32 %r5969, %r5968, %r5967; + xor.b32 %r5970, %r5969, %r5964; + shf.l.wrap.b32 %r5971, %r5970, %r5970, 24; + add.s32 %r5972, %r5971, %r5965; + xor.b32 %r5973, %r5972, %r5967; + shf.l.wrap.b32 %r5974, %r5973, %r5973, 25; + add.s32 %r5975, %r5927, %r5110; + add.s32 %r5976, %r5975, %r5946; + xor.b32 %r5977, %r5976, %r5915; + shf.l.wrap.b32 %r5978, %r5977, %r5977, 16; + add.s32 %r5979, %r5978, %r5902; + xor.b32 %r5980, %r5979, %r5946; + shf.l.wrap.b32 %r5981, %r5980, %r5980, 20; + add.s32 %r5982, %r5976, %r5122; + add.s32 %r5983, %r5982, %r5981; + xor.b32 %r5984, %r5983, %r5978; + shf.l.wrap.b32 %r5985, %r5984, %r5984, 24; + add.s32 %r5986, %r5985, %r5979; + xor.b32 %r5987, %r5986, %r5981; + shf.l.wrap.b32 %r5988, %r5987, %r5987, 25; + add.s32 %r5989, %r5941, %r5146; + add.s32 %r5990, %r5989, %r5904; + xor.b32 %r5991, %r5990, %r5929; + shf.l.wrap.b32 %r5992, %r5991, %r5991, 16; + add.s32 %r5993, %r5992, %r5916; + xor.b32 %r5994, %r5993, %r5904; + shf.l.wrap.b32 %r5995, %r5994, %r5994, 20; + add.s32 %r5996, %r5990, %r5202; + add.s32 %r5997, %r5996, %r5995; + xor.b32 %r5998, %r5997, %r5992; + shf.l.wrap.b32 %r5999, %r5998, %r5998, 24; + add.s32 %r6000, %r5999, %r5993; + xor.b32 %r6001, %r6000, %r5995; + shf.l.wrap.b32 %r6002, %r6001, %r6001, 25; + xor.b32 %r3, %r5986, %r5955; + xor.b32 %r4, %r6000, %r5969; + st.local.v2.u32 [%rd2+32], {%r3, %r4}; + xor.b32 %r5, %r5958, %r5983; + xor.b32 %r6, %r5997, %r5972; + st.local.v2.u32 [%rd2+40], {%r5, %r6}; + xor.b32 %r7, %r6002, %r5971; + xor.b32 %r8, %r5960, %r5985; + st.local.v2.u32 [%rd2+48], {%r7, %r8}; + xor.b32 %r9, %r5999, %r5974; + xor.b32 %r10, %r5988, %r5957; + st.local.v2.u32 [%rd2+56], {%r9, %r10}; + ld.local.v4.u32 {%r6003, %r6004, %r6005, %r6006}, [%rd3+64]; + st.local.v2.u32 [%rd2+72], {%r6003, %r6004}; + st.local.v2.u32 [%rd2+80], {%r6005, %r6006}; + add.s16 %rs1, %rs412, 16; + and.b16 %rs485, %rs1, 255; + add.s16 %rs486, %rs413, 1; + st.local.v2.u8 [%rd2+136], {%rs1, %rs486}; + cvt.u32.u16 %r6011, %rs486; + cvt.u32.u16 %r6012, %rs485; + prmt.b32 %r6013, %r6011, %r6012, 30212; + cvt.u16.u32 %rs487, %r6013; + shr.u16 %rs2, %rs487, 8; + mov.b32 {%rs5, %rs6}, %r6004; + mov.b32 {%rs3, %rs4}, %r6003; + mov.b32 {%rs9, %rs10}, %r6006; + mov.b32 {%rs7, %rs8}, %r6005; + setp.eq.s16 %p11, %rs2, 0; + selp.u16 %rs488, 1, 0, %p11; + shr.u16 %rs489, %rs3, 8; + shr.u16 %rs490, %rs4, 8; + shr.u16 %rs491, %rs5, 8; + shr.u16 %rs492, %rs6, 8; + shr.u16 %rs493, %rs7, 8; + shr.u16 %rs494, %rs8, 8; + shr.u16 %rs495, %rs9, 8; + shr.u16 %rs496, %rs10, 8; + or.b16 %rs497, %rs488, 10; + cvt.u32.u16 %r6014, %rs3; + and.b32 %r6015, %r6014, 255; + cvt.u32.u16 %r6016, %rs489; + prmt.b32 %r6017, %r6016, %r6015, 
30212; + cvt.u32.u16 %r6018, %rs4; + prmt.b32 %r6019, %r6018, %r6017, 28756; + cvt.u32.u16 %r6020, %rs490; + prmt.b32 %r6021, %r6020, %r6019, 1620; + cvt.u32.u16 %r6022, %rs5; + and.b32 %r6023, %r6022, 255; + cvt.u32.u16 %r6024, %rs491; + prmt.b32 %r6025, %r6024, %r6023, 30212; + cvt.u32.u16 %r6026, %rs6; + prmt.b32 %r6027, %r6026, %r6025, 28756; + cvt.u32.u16 %r6028, %rs492; + prmt.b32 %r6029, %r6028, %r6027, 1620; + cvt.u32.u16 %r6030, %rs7; + and.b32 %r6031, %r6030, 255; + cvt.u32.u16 %r6032, %rs493; + prmt.b32 %r6033, %r6032, %r6031, 30212; + cvt.u32.u16 %r6034, %rs8; + prmt.b32 %r6035, %r6034, %r6033, 28756; + cvt.u32.u16 %r6036, %rs494; + prmt.b32 %r6037, %r6036, %r6035, 1620; + cvt.u32.u16 %r6038, %rs9; + and.b32 %r6039, %r6038, 255; + cvt.u32.u16 %r6040, %rs495; + prmt.b32 %r6041, %r6040, %r6039, 30212; + cvt.u32.u16 %r6042, %rs10; + prmt.b32 %r6043, %r6042, %r6041, 28756; + cvt.u32.u16 %r6044, %rs496; + prmt.b32 %r6045, %r6044, %r6043, 1620; + cvt.u32.u16 %r6046, %rs497; + add.s32 %r6047, %r7, %r3; + add.s32 %r6048, %r6047, %r6021; + add.s32 %r6049, %r6029, %r6048; + add.s32 %r6050, %r8, %r4; + add.s32 %r6051, %r6050, %r6037; + add.s32 %r6052, %r6045, %r6051; + add.s32 %r6053, %r9, %r5; + cvt.u32.u16 %r6054, %rs1; + and.b32 %r6055, %r6054, 255; + xor.b32 %r6056, %r6053, %r6055; + shr.u32 %r6057, %r6053, 16; + shl.b32 %r6058, %r6056, 16; + or.b32 %r6059, %r6058, %r6057; + add.s32 %r6060, %r6059, 1013904242; + xor.b32 %r6061, %r6060, %r9; + shf.l.wrap.b32 %r6062, %r6061, %r6061, 20; + add.s32 %r6063, %r6053, %r6062; + xor.b32 %r6064, %r6063, %r6059; + shf.l.wrap.b32 %r6065, %r6064, %r6064, 24; + add.s32 %r6066, %r6065, %r6060; + xor.b32 %r6067, %r6066, %r6062; + shf.l.wrap.b32 %r6068, %r6067, %r6067, 25; + add.s32 %r6069, %r10, %r6; + xor.b32 %r6070, %r6069, %r6046; + shr.u32 %r6071, %r6069, 16; + shl.b32 %r6072, %r6070, 16; + or.b32 %r6073, %r6072, %r6071; + add.s32 %r6074, %r6073, -1521486534; + xor.b32 %r6075, %r6074, %r10; + shf.l.wrap.b32 %r6076, %r6075, %r6075, 20; + add.s32 %r6077, %r6069, %r6076; + xor.b32 %r6078, %r6077, %r6073; + shf.l.wrap.b32 %r6079, %r6078, %r6078, 24; + add.s32 %r6080, %r6079, %r6074; + xor.b32 %r6081, %r6080, %r6076; + shf.l.wrap.b32 %r6082, %r6081, %r6081, 25; + add.s32 %r6083, %r6082, %r6063; + shf.l.wrap.b32 %r6084, %r6048, %r6048, 16; + add.s32 %r6085, %r6084, 1779033703; + xor.b32 %r6086, %r6085, %r7; + shf.l.wrap.b32 %r6087, %r6086, %r6086, 20; + add.s32 %r6088, %r6049, %r6087; + xor.b32 %r6089, %r6088, %r6084; + shf.l.wrap.b32 %r6090, %r6089, %r6089, 24; + add.s32 %r6091, %r6090, %r6085; + xor.b32 %r6092, %r6091, %r6087; + shf.l.wrap.b32 %r6093, %r6092, %r6092, 25; + shf.l.wrap.b32 %r6094, %r6051, %r6051, 16; + add.s32 %r6095, %r6094, -1150833019; + xor.b32 %r6096, %r6095, %r8; + shf.l.wrap.b32 %r6097, %r6096, %r6096, 20; + add.s32 %r6098, %r6052, %r6097; + xor.b32 %r6099, %r6098, %r6094; + shf.l.wrap.b32 %r6100, %r6099, %r6099, 24; + add.s32 %r6101, %r6100, %r6095; + xor.b32 %r6102, %r6101, %r6097; + shf.l.wrap.b32 %r6103, %r6102, %r6102, 25; + add.s32 %r6104, %r6088, %r6103; + xor.b32 %r6105, %r6104, %r6079; + shf.l.wrap.b32 %r6106, %r6105, %r6105, 16; + add.s32 %r6107, %r6106, %r6066; + xor.b32 %r6108, %r6107, %r6103; + shf.l.wrap.b32 %r6109, %r6108, %r6108, 20; + add.s32 %r6110, %r6104, %r6109; + xor.b32 %r6111, %r6110, %r6106; + shf.l.wrap.b32 %r6112, %r6111, %r6111, 24; + add.s32 %r6113, %r6112, %r6107; + xor.b32 %r6114, %r6113, %r6109; + shf.l.wrap.b32 %r6115, %r6114, %r6114, 25; + add.s32 %r6116, %r6068, %r6098; + xor.b32 %r6117, 
%r6090, %r6116; + shf.l.wrap.b32 %r6118, %r6117, %r6117, 16; + add.s32 %r6119, %r6118, %r6080; + xor.b32 %r6120, %r6119, %r6068; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 20; + add.s32 %r6122, %r6116, %r6121; + xor.b32 %r6123, %r6122, %r6118; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 24; + add.s32 %r6125, %r6124, %r6119; + xor.b32 %r6126, %r6125, %r6121; + shf.l.wrap.b32 %r6127, %r6126, %r6126, 25; + xor.b32 %r6128, %r6100, %r6083; + shf.l.wrap.b32 %r6129, %r6128, %r6128, 16; + add.s32 %r6130, %r6129, %r6091; + xor.b32 %r6131, %r6130, %r6082; + shf.l.wrap.b32 %r6132, %r6131, %r6131, 20; + add.s32 %r6133, %r6083, %r6132; + xor.b32 %r6134, %r6133, %r6129; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 24; + add.s32 %r6136, %r6135, %r6130; + xor.b32 %r6137, %r6136, %r6132; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 25; + add.s32 %r6139, %r6077, %r6093; + xor.b32 %r6140, %r6139, %r6065; + shf.l.wrap.b32 %r6141, %r6140, %r6140, 16; + add.s32 %r6142, %r6141, %r6101; + xor.b32 %r6143, %r6142, %r6093; + shf.l.wrap.b32 %r6144, %r6143, %r6143, 20; + add.s32 %r6145, %r6139, %r6144; + xor.b32 %r6146, %r6145, %r6141; + shf.l.wrap.b32 %r6147, %r6146, %r6146, 24; + add.s32 %r6148, %r6147, %r6142; + xor.b32 %r6149, %r6148, %r6144; + shf.l.wrap.b32 %r6150, %r6149, %r6149, 25; + add.s32 %r6151, %r6110, %r6037; + add.s32 %r6152, %r6151, %r6150; + xor.b32 %r6153, %r6152, %r6124; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 16; + add.s32 %r6155, %r6154, %r6136; + xor.b32 %r6156, %r6155, %r6150; + shf.l.wrap.b32 %r6157, %r6156, %r6156, 20; + add.s32 %r6158, %r6152, %r6157; + xor.b32 %r6159, %r6158, %r6154; + shf.l.wrap.b32 %r6160, %r6159, %r6159, 24; + add.s32 %r6161, %r6160, %r6155; + xor.b32 %r6162, %r6161, %r6157; + shf.l.wrap.b32 %r6163, %r6162, %r6162, 25; + add.s32 %r6164, %r6122, %r6045; + add.s32 %r6165, %r6164, %r6115; + xor.b32 %r6166, %r6165, %r6135; + shf.l.wrap.b32 %r6167, %r6166, %r6166, 16; + add.s32 %r6168, %r6167, %r6148; + xor.b32 %r6169, %r6168, %r6115; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6165, %r6170; + xor.b32 %r6172, %r6171, %r6167; + shf.l.wrap.b32 %r6173, %r6172, %r6172, 24; + add.s32 %r6174, %r6173, %r6168; + xor.b32 %r6175, %r6174, %r6170; + shf.l.wrap.b32 %r6176, %r6175, %r6175, 25; + add.s32 %r6177, %r6133, %r6127; + xor.b32 %r6178, %r6147, %r6177; + shf.l.wrap.b32 %r6179, %r6178, %r6178, 16; + add.s32 %r6180, %r6179, %r6113; + xor.b32 %r6181, %r6180, %r6127; + shf.l.wrap.b32 %r6182, %r6181, %r6181, 20; + add.s32 %r6183, %r6177, %r6021; + add.s32 %r6184, %r6183, %r6182; + xor.b32 %r6185, %r6184, %r6179; + shf.l.wrap.b32 %r6186, %r6185, %r6185, 24; + add.s32 %r6187, %r6186, %r6180; + xor.b32 %r6188, %r6187, %r6182; + shf.l.wrap.b32 %r6189, %r6188, %r6188, 25; + add.s32 %r6190, %r6145, %r6138; + xor.b32 %r6191, %r6112, %r6190; + shf.l.wrap.b32 %r6192, %r6191, %r6191, 16; + add.s32 %r6193, %r6192, %r6125; + xor.b32 %r6194, %r6193, %r6138; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 20; + add.s32 %r6196, %r6190, %r6195; + xor.b32 %r6197, %r6196, %r6192; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 24; + add.s32 %r6199, %r6198, %r6193; + xor.b32 %r6200, %r6199, %r6195; + shf.l.wrap.b32 %r6201, %r6200, %r6200, 25; + add.s32 %r6202, %r6158, %r6029; + add.s32 %r6203, %r6202, %r6176; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 16; + add.s32 %r6206, %r6205, %r6187; + xor.b32 %r6207, %r6206, %r6176; + shf.l.wrap.b32 %r6208, %r6207, %r6207, 20; + add.s32 %r6209, %r6203, %r6208; + xor.b32 %r6210, %r6209, %r6205; + shf.l.wrap.b32 %r6211, %r6210, %r6210, 24; + 
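// NOTE (annotation, not part of the patch or the generated PTX): the block
// above appears to start a fresh compression: 1779033703, -1150833019,
// 1013904242 and -1521486534 are 0x6A09E667, 0xBB67AE85, 0x3C6EF372 and
// 0xA54FF53A, the first four IV words shared by SHA-256 and BLAKE3. The
// prmt.b32 sequences repack the per-byte %rs registers into little-endian
// 32-bit message words, and add.s16 %rs1, %rs412, 16 / add.s16 %rs486,
// %rs413, 1 advance two per-block counters stored at [%rd2+136].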
add.s32 %r6212, %r6211, %r6206; + xor.b32 %r6213, %r6212, %r6208; + shf.l.wrap.b32 %r6214, %r6213, %r6213, 25; + add.s32 %r6215, %r6189, %r6171; + xor.b32 %r6216, %r6160, %r6215; + shf.l.wrap.b32 %r6217, %r6216, %r6216, 16; + add.s32 %r6218, %r6217, %r6199; + xor.b32 %r6219, %r6218, %r6189; + shf.l.wrap.b32 %r6220, %r6219, %r6219, 20; + add.s32 %r6221, %r6215, %r6220; + xor.b32 %r6222, %r6221, %r6217; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 24; + add.s32 %r6224, %r6223, %r6218; + xor.b32 %r6225, %r6224, %r6220; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 25; + add.s32 %r6227, %r6184, %r6201; + xor.b32 %r6228, %r6173, %r6227; + shf.l.wrap.b32 %r6229, %r6228, %r6228, 16; + add.s32 %r6230, %r6229, %r6161; + xor.b32 %r6231, %r6230, %r6201; + shf.l.wrap.b32 %r6232, %r6231, %r6231, 20; + add.s32 %r6233, %r6227, %r6232; + xor.b32 %r6234, %r6233, %r6229; + shf.l.wrap.b32 %r6235, %r6234, %r6234, 24; + add.s32 %r6236, %r6235, %r6230; + xor.b32 %r6237, %r6236, %r6232; + shf.l.wrap.b32 %r6238, %r6237, %r6237, 25; + add.s32 %r6239, %r6196, %r6163; + xor.b32 %r6240, %r6239, %r6186; + shf.l.wrap.b32 %r6241, %r6240, %r6240, 16; + add.s32 %r6242, %r6241, %r6174; + xor.b32 %r6243, %r6242, %r6163; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 20; + add.s32 %r6245, %r6239, %r6244; + xor.b32 %r6246, %r6245, %r6241; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 24; + add.s32 %r6248, %r6247, %r6242; + xor.b32 %r6249, %r6248, %r6244; + shf.l.wrap.b32 %r6250, %r6249, %r6249, 25; + add.s32 %r6251, %r6209, %r6045; + add.s32 %r6252, %r6251, %r6250; + xor.b32 %r6253, %r6252, %r6223; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 16; + add.s32 %r6255, %r6254, %r6236; + xor.b32 %r6256, %r6255, %r6250; + shf.l.wrap.b32 %r6257, %r6256, %r6256, 20; + add.s32 %r6258, %r6252, %r6257; + xor.b32 %r6259, %r6258, %r6254; + shf.l.wrap.b32 %r6260, %r6259, %r6259, 24; + add.s32 %r6261, %r6260, %r6255; + xor.b32 %r6262, %r6261, %r6257; + shf.l.wrap.b32 %r6263, %r6262, %r6262, 25; + add.s32 %r6264, %r6221, %r6214; + xor.b32 %r6265, %r6264, %r6235; + shf.l.wrap.b32 %r6266, %r6265, %r6265, 16; + add.s32 %r6267, %r6266, %r6248; + xor.b32 %r6268, %r6267, %r6214; + shf.l.wrap.b32 %r6269, %r6268, %r6268, 20; + add.s32 %r6270, %r6264, %r6269; + xor.b32 %r6271, %r6270, %r6266; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6267; + xor.b32 %r6274, %r6273, %r6269; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6233, %r6226; + xor.b32 %r6277, %r6247, %r6276; + shf.l.wrap.b32 %r6278, %r6277, %r6277, 16; + add.s32 %r6279, %r6278, %r6212; + xor.b32 %r6280, %r6279, %r6226; + shf.l.wrap.b32 %r6281, %r6280, %r6280, 20; + add.s32 %r6282, %r6276, %r6037; + add.s32 %r6283, %r6282, %r6281; + xor.b32 %r6284, %r6283, %r6278; + shf.l.wrap.b32 %r6285, %r6284, %r6284, 24; + add.s32 %r6286, %r6285, %r6279; + xor.b32 %r6287, %r6286, %r6281; + shf.l.wrap.b32 %r6288, %r6287, %r6287, 25; + add.s32 %r6289, %r6245, %r6238; + xor.b32 %r6290, %r6211, %r6289; + shf.l.wrap.b32 %r6291, %r6290, %r6290, 16; + add.s32 %r6292, %r6291, %r6224; + xor.b32 %r6293, %r6292, %r6238; + shf.l.wrap.b32 %r6294, %r6293, %r6293, 20; + add.s32 %r6295, %r6289, %r6294; + xor.b32 %r6296, %r6295, %r6291; + shf.l.wrap.b32 %r6297, %r6296, %r6296, 24; + add.s32 %r6298, %r6297, %r6292; + xor.b32 %r6299, %r6298, %r6294; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 25; + add.s32 %r6301, %r6258, %r6275; + xor.b32 %r6302, %r6301, %r6297; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 16; + add.s32 %r6304, %r6303, %r6286; + xor.b32 %r6305, %r6304, %r6275; + shf.l.wrap.b32 %r6306, 
%r6305, %r6305, 20; + add.s32 %r6307, %r6301, %r6306; + xor.b32 %r6308, %r6307, %r6303; + shf.l.wrap.b32 %r6309, %r6308, %r6308, 24; + add.s32 %r6310, %r6309, %r6304; + xor.b32 %r6311, %r6310, %r6306; + shf.l.wrap.b32 %r6312, %r6311, %r6311, 25; + add.s32 %r6313, %r6288, %r6270; + xor.b32 %r6314, %r6260, %r6313; + shf.l.wrap.b32 %r6315, %r6314, %r6314, 16; + add.s32 %r6316, %r6315, %r6298; + xor.b32 %r6317, %r6316, %r6288; + shf.l.wrap.b32 %r6318, %r6317, %r6317, 20; + add.s32 %r6319, %r6313, %r6021; + add.s32 %r6320, %r6319, %r6318; + xor.b32 %r6321, %r6320, %r6315; + shf.l.wrap.b32 %r6322, %r6321, %r6321, 24; + add.s32 %r6323, %r6322, %r6316; + xor.b32 %r6324, %r6323, %r6318; + shf.l.wrap.b32 %r6325, %r6324, %r6324, 25; + add.s32 %r6326, %r6283, %r6300; + xor.b32 %r6327, %r6272, %r6326; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 16; + add.s32 %r6329, %r6328, %r6261; + xor.b32 %r6330, %r6329, %r6300; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 20; + add.s32 %r6332, %r6326, %r6331; + xor.b32 %r6333, %r6332, %r6328; + shf.l.wrap.b32 %r6334, %r6333, %r6333, 24; + add.s32 %r6335, %r6334, %r6329; + xor.b32 %r6336, %r6335, %r6331; + shf.l.wrap.b32 %r6337, %r6336, %r6336, 25; + add.s32 %r6338, %r6295, %r6263; + xor.b32 %r6339, %r6338, %r6285; + shf.l.wrap.b32 %r6340, %r6339, %r6339, 16; + add.s32 %r6341, %r6340, %r6273; + xor.b32 %r6342, %r6341, %r6263; + shf.l.wrap.b32 %r6343, %r6342, %r6342, 20; + add.s32 %r6344, %r6338, %r6029; + add.s32 %r6345, %r6344, %r6343; + xor.b32 %r6346, %r6345, %r6340; + shf.l.wrap.b32 %r6347, %r6346, %r6346, 24; + add.s32 %r6348, %r6347, %r6341; + xor.b32 %r6349, %r6348, %r6343; + shf.l.wrap.b32 %r6350, %r6349, %r6349, 25; + add.s32 %r6351, %r6307, %r6350; + xor.b32 %r6352, %r6351, %r6322; + shf.l.wrap.b32 %r6353, %r6352, %r6352, 16; + add.s32 %r6354, %r6353, %r6335; + xor.b32 %r6355, %r6354, %r6350; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 20; + add.s32 %r6357, %r6351, %r6356; + xor.b32 %r6358, %r6357, %r6353; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 24; + add.s32 %r6360, %r6359, %r6354; + xor.b32 %r6361, %r6360, %r6356; + shf.l.wrap.b32 %r6362, %r6361, %r6361, 25; + add.s32 %r6363, %r6320, %r6312; + xor.b32 %r6364, %r6363, %r6334; + shf.l.wrap.b32 %r6365, %r6364, %r6364, 16; + add.s32 %r6366, %r6365, %r6348; + xor.b32 %r6367, %r6366, %r6312; + shf.l.wrap.b32 %r6368, %r6367, %r6367, 20; + add.s32 %r6369, %r6363, %r6368; + xor.b32 %r6370, %r6369, %r6365; + shf.l.wrap.b32 %r6371, %r6370, %r6370, 24; + add.s32 %r6372, %r6371, %r6366; + xor.b32 %r6373, %r6372, %r6368; + shf.l.wrap.b32 %r6374, %r6373, %r6373, 25; + add.s32 %r6375, %r6332, %r6325; + xor.b32 %r6376, %r6347, %r6375; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6310; + xor.b32 %r6379, %r6378, %r6325; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6045; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6345, %r6337; + xor.b32 %r6389, %r6309, %r6388; + shf.l.wrap.b32 %r6390, %r6389, %r6389, 16; + add.s32 %r6391, %r6390, %r6323; + xor.b32 %r6392, %r6391, %r6337; + shf.l.wrap.b32 %r6393, %r6392, %r6392, 20; + add.s32 %r6394, %r6388, %r6393; + xor.b32 %r6395, %r6394, %r6390; + shf.l.wrap.b32 %r6396, %r6395, %r6395, 24; + add.s32 %r6397, %r6396, %r6391; + xor.b32 %r6398, %r6397, %r6393; + shf.l.wrap.b32 %r6399, %r6398, %r6398, 25; + add.s32 %r6400, %r6357, %r6374; + 
xor.b32 %r6401, %r6400, %r6396; + shf.l.wrap.b32 %r6402, %r6401, %r6401, 16; + add.s32 %r6403, %r6402, %r6385; + xor.b32 %r6404, %r6403, %r6374; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 20; + add.s32 %r6406, %r6400, %r6021; + add.s32 %r6407, %r6406, %r6405; + xor.b32 %r6408, %r6407, %r6402; + shf.l.wrap.b32 %r6409, %r6408, %r6408, 24; + add.s32 %r6410, %r6409, %r6403; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 25; + add.s32 %r6413, %r6387, %r6369; + xor.b32 %r6414, %r6359, %r6413; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 16; + add.s32 %r6416, %r6415, %r6397; + xor.b32 %r6417, %r6416, %r6387; + shf.l.wrap.b32 %r6418, %r6417, %r6417, 20; + add.s32 %r6419, %r6413, %r6037; + add.s32 %r6420, %r6419, %r6418; + xor.b32 %r6421, %r6420, %r6415; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 24; + add.s32 %r6423, %r6422, %r6416; + xor.b32 %r6424, %r6423, %r6418; + shf.l.wrap.b32 %r6425, %r6424, %r6424, 25; + add.s32 %r6426, %r6382, %r6399; + xor.b32 %r6427, %r6371, %r6426; + shf.l.wrap.b32 %r6428, %r6427, %r6427, 16; + add.s32 %r6429, %r6428, %r6360; + xor.b32 %r6430, %r6429, %r6399; + shf.l.wrap.b32 %r6431, %r6430, %r6430, 20; + add.s32 %r6432, %r6426, %r6431; + xor.b32 %r6433, %r6432, %r6428; + shf.l.wrap.b32 %r6434, %r6433, %r6433, 24; + add.s32 %r6435, %r6434, %r6429; + xor.b32 %r6436, %r6435, %r6431; + shf.l.wrap.b32 %r6437, %r6436, %r6436, 25; + add.s32 %r6438, %r6394, %r6029; + add.s32 %r6439, %r6438, %r6362; + xor.b32 %r6440, %r6439, %r6384; + shf.l.wrap.b32 %r6441, %r6440, %r6440, 16; + add.s32 %r6442, %r6441, %r6372; + xor.b32 %r6443, %r6442, %r6362; + shf.l.wrap.b32 %r6444, %r6443, %r6443, 20; + add.s32 %r6445, %r6439, %r6444; + xor.b32 %r6446, %r6445, %r6441; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 24; + add.s32 %r6448, %r6447, %r6442; + xor.b32 %r6449, %r6448, %r6444; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 25; + add.s32 %r6451, %r6407, %r6450; + xor.b32 %r6452, %r6451, %r6422; + shf.l.wrap.b32 %r6453, %r6452, %r6452, 16; + add.s32 %r6454, %r6453, %r6435; + xor.b32 %r6455, %r6454, %r6450; + shf.l.wrap.b32 %r6456, %r6455, %r6455, 20; + add.s32 %r6457, %r6451, %r6456; + xor.b32 %r6458, %r6457, %r6453; + shf.l.wrap.b32 %r6459, %r6458, %r6458, 24; + add.s32 %r6460, %r6459, %r6454; + xor.b32 %r6461, %r6460, %r6456; + shf.l.wrap.b32 %r6462, %r6461, %r6461, 25; + add.s32 %r6463, %r6420, %r6412; + xor.b32 %r6464, %r6463, %r6434; + shf.l.wrap.b32 %r6465, %r6464, %r6464, 16; + add.s32 %r6466, %r6465, %r6448; + xor.b32 %r6467, %r6466, %r6412; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 20; + add.s32 %r6469, %r6463, %r6468; + xor.b32 %r6470, %r6469, %r6465; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 24; + add.s32 %r6472, %r6471, %r6466; + xor.b32 %r6473, %r6472, %r6468; + shf.l.wrap.b32 %r6474, %r6473, %r6473, 25; + add.s32 %r6475, %r6432, %r6425; + xor.b32 %r6476, %r6447, %r6475; + shf.l.wrap.b32 %r6477, %r6476, %r6476, 16; + add.s32 %r6478, %r6477, %r6410; + xor.b32 %r6479, %r6478, %r6425; + shf.l.wrap.b32 %r6480, %r6479, %r6479, 20; + add.s32 %r6481, %r6475, %r6480; + xor.b32 %r6482, %r6481, %r6477; + shf.l.wrap.b32 %r6483, %r6482, %r6482, 24; + add.s32 %r6484, %r6483, %r6478; + xor.b32 %r6485, %r6484, %r6480; + shf.l.wrap.b32 %r6486, %r6485, %r6485, 25; + add.s32 %r6487, %r6445, %r6437; + xor.b32 %r6488, %r6409, %r6487; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6423; + xor.b32 %r6491, %r6490, %r6437; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6492; + xor.b32 %r6494, %r6493, %r6489; + shf.l.wrap.b32 %r6495, 
%r6494, %r6494, 24; + add.s32 %r6496, %r6495, %r6490; + xor.b32 %r6497, %r6496, %r6492; + shf.l.wrap.b32 %r6498, %r6497, %r6497, 25; + add.s32 %r6499, %r6457, %r6474; + xor.b32 %r6500, %r6499, %r6495; + shf.l.wrap.b32 %r6501, %r6500, %r6500, 16; + add.s32 %r6502, %r6501, %r6484; + xor.b32 %r6503, %r6502, %r6474; + shf.l.wrap.b32 %r6504, %r6503, %r6503, 20; + add.s32 %r6505, %r6499, %r6037; + add.s32 %r6506, %r6505, %r6504; + xor.b32 %r6507, %r6506, %r6501; + shf.l.wrap.b32 %r6508, %r6507, %r6507, 24; + add.s32 %r6509, %r6508, %r6502; + xor.b32 %r6510, %r6509, %r6504; + shf.l.wrap.b32 %r6511, %r6510, %r6510, 25; + add.s32 %r6512, %r6486, %r6469; + xor.b32 %r6513, %r6459, %r6512; + shf.l.wrap.b32 %r6514, %r6513, %r6513, 16; + add.s32 %r6515, %r6514, %r6496; + xor.b32 %r6516, %r6515, %r6486; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 20; + add.s32 %r6518, %r6512, %r6045; + add.s32 %r6519, %r6518, %r6517; + xor.b32 %r6520, %r6519, %r6514; + shf.l.wrap.b32 %r6521, %r6520, %r6520, 24; + add.s32 %r6522, %r6521, %r6515; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 25; + add.s32 %r6525, %r6481, %r6021; + add.s32 %r6526, %r6525, %r6498; + xor.b32 %r6527, %r6471, %r6526; + shf.l.wrap.b32 %r6528, %r6527, %r6527, 16; + add.s32 %r6529, %r6528, %r6460; + xor.b32 %r6530, %r6529, %r6498; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 20; + add.s32 %r6532, %r6526, %r6029; + add.s32 %r6533, %r6532, %r6531; + xor.b32 %r6534, %r6533, %r6528; + shf.l.wrap.b32 %r6535, %r6534, %r6534, 24; + add.s32 %r6536, %r6535, %r6529; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 25; + add.s32 %r6539, %r6493, %r6462; + xor.b32 %r6540, %r6539, %r6483; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 16; + add.s32 %r6542, %r6541, %r6472; + xor.b32 %r6543, %r6542, %r6462; + shf.l.wrap.b32 %r6544, %r6543, %r6543, 20; + add.s32 %r6545, %r6539, %r6544; + xor.b32 %r6546, %r6545, %r6541; + shf.l.wrap.b32 %r6547, %r6546, %r6546, 24; + add.s32 %r6548, %r6547, %r6542; + xor.b32 %r6549, %r6548, %r6544; + shf.l.wrap.b32 %r6550, %r6549, %r6549, 25; + add.s32 %r6551, %r6506, %r6550; + xor.b32 %r6552, %r6551, %r6521; + shf.l.wrap.b32 %r6553, %r6552, %r6552, 16; + add.s32 %r6554, %r6553, %r6536; + xor.b32 %r6555, %r6554, %r6550; + shf.l.wrap.b32 %r6556, %r6555, %r6555, 20; + add.s32 %r6557, %r6551, %r6556; + xor.b32 %r6558, %r6557, %r6553; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 24; + add.s32 %r6560, %r6559, %r6554; + xor.b32 %r6561, %r6560, %r6556; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 25; + add.s32 %r6563, %r6519, %r6511; + xor.b32 %r6564, %r6563, %r6535; + shf.l.wrap.b32 %r6565, %r6564, %r6564, 16; + add.s32 %r6566, %r6565, %r6548; + xor.b32 %r6567, %r6566, %r6511; + shf.l.wrap.b32 %r6568, %r6567, %r6567, 20; + add.s32 %r6569, %r6563, %r6568; + xor.b32 %r6570, %r6569, %r6565; + shf.l.wrap.b32 %r6571, %r6570, %r6570, 24; + add.s32 %r6572, %r6571, %r6566; + xor.b32 %r6573, %r6572, %r6568; + shf.l.wrap.b32 %r6574, %r6573, %r6573, 25; + add.s32 %r6575, %r6533, %r6524; + xor.b32 %r6576, %r6547, %r6575; + shf.l.wrap.b32 %r6577, %r6576, %r6576, 16; + add.s32 %r6578, %r6577, %r6509; + xor.b32 %r6579, %r6578, %r6524; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 20; + add.s32 %r6581, %r6575, %r6580; + xor.b32 %r6582, %r6581, %r6577; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 24; + add.s32 %r6584, %r6583, %r6578; + xor.b32 %r6585, %r6584, %r6580; + shf.l.wrap.b32 %r6586, %r6585, %r6585, 25; + add.s32 %r6587, %r6545, %r6538; + xor.b32 %r6588, %r6508, %r6587; + shf.l.wrap.b32 %r6589, %r6588, %r6588, 16; + 
add.s32 %r6590, %r6589, %r6522; + xor.b32 %r6591, %r6590, %r6538; + shf.l.wrap.b32 %r6592, %r6591, %r6591, 20; + add.s32 %r6593, %r6587, %r6029; + add.s32 %r6594, %r6593, %r6592; + xor.b32 %r6595, %r6594, %r6589; + shf.l.wrap.b32 %r6596, %r6595, %r6595, 24; + add.s32 %r6597, %r6596, %r6590; + xor.b32 %r6598, %r6597, %r6592; + shf.l.wrap.b32 %r6599, %r6598, %r6598, 25; + add.s32 %r6600, %r6557, %r6574; + xor.b32 %r6601, %r6600, %r6596; + shf.l.wrap.b32 %r6602, %r6601, %r6601, 16; + add.s32 %r6603, %r6602, %r6584; + xor.b32 %r6604, %r6603, %r6574; + shf.l.wrap.b32 %r6605, %r6604, %r6604, 20; + add.s32 %r6606, %r6600, %r6045; + add.s32 %r6607, %r6606, %r6605; + xor.b32 %r6608, %r6607, %r6602; + shf.l.wrap.b32 %r6609, %r6608, %r6608, 24; + add.s32 %r6610, %r6609, %r6603; + xor.b32 %r6611, %r6610, %r6605; + shf.l.wrap.b32 %r6612, %r6611, %r6611, 25; + add.s32 %r6613, %r6586, %r6021; + add.s32 %r6614, %r6613, %r6569; + xor.b32 %r6615, %r6559, %r6614; + shf.l.wrap.b32 %r6616, %r6615, %r6615, 16; + add.s32 %r6617, %r6616, %r6597; + xor.b32 %r6618, %r6617, %r6586; + shf.l.wrap.b32 %r6619, %r6618, %r6618, 20; + add.s32 %r6620, %r6614, %r6619; + xor.b32 %r6621, %r6620, %r6616; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6617; + xor.b32 %r6624, %r6623, %r6619; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6581, %r6037; + add.s32 %r6627, %r6626, %r6599; + xor.b32 %r6628, %r6571, %r6627; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6560; + xor.b32 %r6631, %r6630, %r6599; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6632; + xor.b32 %r6634, %r6633, %r6629; + shf.l.wrap.b32 %r6635, %r6634, %r6634, 24; + add.s32 %r6636, %r6635, %r6630; + xor.b32 %r6637, %r6636, %r6632; + shf.l.wrap.b32 %r6638, %r6637, %r6637, 25; + add.s32 %r6639, %r6594, %r6562; + xor.b32 %r6640, %r6639, %r6583; + shf.l.wrap.b32 %r6641, %r6640, %r6640, 16; + add.s32 %r6642, %r6641, %r6572; + xor.b32 %r6643, %r6642, %r6562; + shf.l.wrap.b32 %r6644, %r6643, %r6643, 20; + add.s32 %r6645, %r6639, %r6644; + xor.b32 %r6646, %r6645, %r6641; + shf.l.wrap.b32 %r6647, %r6646, %r6646, 24; + add.s32 %r6648, %r6647, %r6642; + xor.b32 %r6649, %r6648, %r6644; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 25; + add.s32 %r6651, %r6607, %r6650; + xor.b32 %r6652, %r6651, %r6622; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 16; + add.s32 %r6654, %r6653, %r6636; + xor.b32 %r6655, %r6654, %r6650; + shf.l.wrap.b32 %r6656, %r6655, %r6655, 20; + add.s32 %r6657, %r6651, %r6656; + xor.b32 %r6658, %r6657, %r6653; + shf.l.wrap.b32 %r6659, %r6658, %r6658, 24; + add.s32 %r6660, %r6659, %r6654; + xor.b32 %r6661, %r6660, %r6656; + shf.l.wrap.b32 %r6662, %r6661, %r6661, 25; + add.s32 %r6663, %r6620, %r6612; + xor.b32 %r6664, %r6663, %r6635; + shf.l.wrap.b32 %r6665, %r6664, %r6664, 16; + add.s32 %r6666, %r6665, %r6648; + xor.b32 %r6667, %r6666, %r6612; + shf.l.wrap.b32 %r6668, %r6667, %r6667, 20; + add.s32 %r6669, %r6663, %r6021; + add.s32 %r6670, %r6669, %r6668; + xor.b32 %r6671, %r6670, %r6665; + shf.l.wrap.b32 %r6672, %r6671, %r6671, 24; + add.s32 %r6673, %r6672, %r6666; + xor.b32 %r6674, %r6673, %r6668; + shf.l.wrap.b32 %r6675, %r6674, %r6674, 25; + add.s32 %r6676, %r6633, %r6029; + add.s32 %r6677, %r6676, %r6625; + xor.b32 %r6678, %r6647, %r6677; + shf.l.wrap.b32 %r6679, %r6678, %r6678, 16; + add.s32 %r6680, %r6679, %r6610; + xor.b32 %r6681, %r6680, %r6625; + shf.l.wrap.b32 %r6682, %r6681, %r6681, 20; + add.s32 %r6683, %r6677, %r6682; + xor.b32 %r6684, %r6683, %r6679; + 
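// NOTE (annotation, not part of the patch or the generated PTX): after the
// final rounds below, the eight 32-bit outputs are XOR-folded and packed
// byte-wise (bfi.b64 / shl.b64 / and.b64 / or.b64) into four little-endian
// 64-bit words %rd1305..%rd1308, a local scratch block at %SPL+2000 is
// zero-filled, and loop $L__BB2_11 folds the state down to 32-bit values and
// reduces them modulo 37748717 and 1179641 using the usual multiply-high
// trick (1908875315 ~ 2^56 / 37748717, so q = mul.wide >> 56 and
// x - 37748717*q = x mod 37748717). The residues then index 64-byte and
// 128-byte records in global tables (%rd32, %rd33, %rd34).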
shf.l.wrap.b32 %r6685, %r6684, %r6684, 24; + add.s32 %r6686, %r6685, %r6680; + xor.b32 %r6687, %r6686, %r6682; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 25; + add.s32 %r6689, %r6645, %r6638; + xor.b32 %r6690, %r6609, %r6689; + shf.l.wrap.b32 %r6691, %r6690, %r6690, 16; + add.s32 %r6692, %r6691, %r6623; + xor.b32 %r6693, %r6692, %r6638; + shf.l.wrap.b32 %r6694, %r6693, %r6693, 20; + add.s32 %r6695, %r6689, %r6694; + xor.b32 %r6696, %r6695, %r6691; + shf.l.wrap.b32 %r6697, %r6696, %r6696, 24; + add.s32 %r6698, %r6697, %r6692; + xor.b32 %r6699, %r6698, %r6694; + shf.l.wrap.b32 %r6700, %r6699, %r6699, 25; + add.s32 %r6701, %r6657, %r6675; + xor.b32 %r6702, %r6701, %r6697; + shf.l.wrap.b32 %r6703, %r6702, %r6702, 16; + add.s32 %r6704, %r6703, %r6686; + xor.b32 %r6705, %r6704, %r6675; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 20; + add.s32 %r6707, %r6701, %r6706; + xor.b32 %r6708, %r6707, %r6703; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 24; + add.s32 %r6710, %r6709, %r6704; + xor.b32 %r6711, %r6710, %r6706; + shf.l.wrap.b32 %r6712, %r6711, %r6711, 25; + add.s32 %r6713, %r6688, %r6037; + add.s32 %r6714, %r6713, %r6670; + xor.b32 %r6715, %r6659, %r6714; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 16; + add.s32 %r6717, %r6716, %r6698; + xor.b32 %r6718, %r6717, %r6688; + shf.l.wrap.b32 %r6719, %r6718, %r6718, 20; + add.s32 %r6720, %r6714, %r6719; + xor.b32 %r6721, %r6720, %r6716; + shf.l.wrap.b32 %r6722, %r6721, %r6721, 24; + add.s32 %r6723, %r6722, %r6717; + xor.b32 %r6724, %r6723, %r6719; + shf.l.wrap.b32 %r6725, %r6724, %r6724, 25; + add.s32 %r6726, %r6683, %r6045; + add.s32 %r6727, %r6726, %r6700; + xor.b32 %r6728, %r6672, %r6727; + shf.l.wrap.b32 %r6729, %r6728, %r6728, 16; + add.s32 %r6730, %r6729, %r6660; + xor.b32 %r6731, %r6730, %r6700; + shf.l.wrap.b32 %r6732, %r6731, %r6731, 20; + add.s32 %r6733, %r6727, %r6732; + xor.b32 %r6734, %r6733, %r6729; + shf.l.wrap.b32 %r6735, %r6734, %r6734, 24; + add.s32 %r6736, %r6735, %r6730; + xor.b32 %r6737, %r6736, %r6732; + shf.l.wrap.b32 %r6738, %r6737, %r6737, 25; + add.s32 %r6739, %r6695, %r6662; + xor.b32 %r6740, %r6739, %r6685; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6673; + xor.b32 %r6743, %r6742, %r6662; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6744; + xor.b32 %r6746, %r6745, %r6741; + shf.l.wrap.b32 %r6747, %r6746, %r6746, 24; + add.s32 %r6748, %r6747, %r6742; + xor.b32 %r6749, %r6748, %r6744; + shf.l.wrap.b32 %r6750, %r6749, %r6749, 25; + xor.b32 %r6751, %r6707, %r6736; + cvt.u64.u32 %rd394, %r6751; + xor.b32 %r6752, %r6748, %r6720; + and.b32 %r6753, %r6752, 255; + cvt.u64.u32 %rd395, %r6753; + bfi.b64 %rd396, %rd395, %rd394, 32, 32; + cvt.u64.u32 %rd397, %r6752; + shl.b64 %rd398, %rd397, 32; + and.b64 %rd399, %rd398, 280375465082880; + or.b64 %rd400, %rd396, %rd399; + and.b64 %rd401, %rd398, 71776119061217280; + shr.u32 %r6754, %r6752, 24; + cvt.u64.u32 %rd402, %r6754; + shl.b64 %rd403, %rd402, 56; + or.b64 %rd404, %rd400, %rd401; + or.b64 %rd1308, %rd404, %rd403; + xor.b32 %r6755, %r6710, %r6733; + cvt.u64.u32 %rd405, %r6755; + xor.b32 %r6756, %r6745, %r6723; + and.b32 %r6757, %r6756, 255; + cvt.u64.u32 %rd406, %r6757; + bfi.b64 %rd407, %rd406, %rd405, 32, 32; + cvt.u64.u32 %rd408, %r6756; + shl.b64 %rd409, %rd408, 32; + and.b64 %rd410, %rd409, 280375465082880; + or.b64 %rd411, %rd407, %rd410; + and.b64 %rd412, %rd409, 71776119061217280; + shr.u32 %r6758, %r6756, 24; + cvt.u64.u32 %rd413, %r6758; + shl.b64 %rd414, %rd413, 56; + or.b64 %rd415, %rd411, %rd412; + or.b64 %rd1307, %rd415, 
%rd414; + xor.b32 %r6759, %r6750, %r6722; + cvt.u64.u32 %rd416, %r6759; + xor.b32 %r6760, %r6712, %r6735; + and.b32 %r6761, %r6760, 255; + cvt.u64.u32 %rd417, %r6761; + bfi.b64 %rd418, %rd417, %rd416, 32, 32; + cvt.u64.u32 %rd419, %r6760; + shl.b64 %rd420, %rd419, 32; + and.b64 %rd421, %rd420, 280375465082880; + or.b64 %rd422, %rd418, %rd421; + and.b64 %rd423, %rd420, 71776119061217280; + shr.u32 %r6762, %r6760, 24; + cvt.u64.u32 %rd424, %r6762; + shl.b64 %rd425, %rd424, 56; + or.b64 %rd426, %rd422, %rd423; + or.b64 %rd1306, %rd426, %rd425; + xor.b32 %r6763, %r6747, %r6725; + cvt.u64.u32 %rd427, %r6763; + xor.b32 %r6764, %r6709, %r6738; + and.b32 %r6765, %r6764, 255; + cvt.u64.u32 %rd428, %r6765; + bfi.b64 %rd429, %rd428, %rd427, 32, 32; + cvt.u64.u32 %rd430, %r6764; + shl.b64 %rd431, %rd430, 32; + and.b64 %rd432, %rd431, 280375465082880; + or.b64 %rd433, %rd429, %rd432; + and.b64 %rd434, %rd431, 71776119061217280; + shr.u32 %r6766, %r6764, 24; + cvt.u64.u32 %rd435, %r6766; + shl.b64 %rd436, %rd435, 56; + or.b64 %rd437, %rd433, %rd434; + or.b64 %rd1305, %rd437, %rd436; + add.u64 %rd1297, %SPL, 2000; + mov.u64 %rd1301, 0; + mov.u32 %r29819, 0; + st.local.v4.u32 [%rd1297+32], {%r29819, %r29819, %r29819, %r29819}; + st.local.v4.u32 [%rd1297+48], {%r29819, %r29819, %r29819, %r29819}; + st.local.v4.u32 [%rd1297+64], {%r29819, %r29819, %r29819, %r29819}; + st.local.v2.u64 [%rd1297], {%rd1308, %rd1307}; + st.local.v2.u64 [%rd1297+16], {%rd1306, %rd1305}; + mov.u64 %rd1302, %rd1301; + mov.u64 %rd1303, %rd1301; + mov.u64 %rd1304, %rd1301; + mov.u64 %rd1309, %rd1301; + mov.u64 %rd1310, %rd1301; + mov.u64 %rd1311, %rd1301; + mov.u64 %rd1312, %rd1301; + mov.u64 %rd1313, %rd1305; + mov.u64 %rd1314, %rd1306; + mov.u64 %rd1315, %rd1307; + mov.u64 %rd1316, %rd1308; + +$L__BB2_11: + mov.b64 {%r12, %r13}, %rd1316; + xor.b32 %r6768, %r13, %r12; + mov.b64 {%r14, %r15}, %rd1315; + xor.b32 %r6769, %r6768, %r14; + xor.b32 %r6770, %r6769, %r15; + mov.b64 {%r16, %r17}, %rd1314; + xor.b32 %r6771, %r17, %r16; + mov.b64 {%r18, %r19}, %rd1313; + xor.b32 %r6772, %r6771, %r18; + xor.b32 %r6773, %r6772, %r19; + mov.b64 {%r20, %r21}, %rd1312; + xor.b32 %r6774, %r21, %r20; + mov.b64 {%r22, %r23}, %rd1311; + xor.b32 %r6775, %r6774, %r22; + xor.b32 %r6776, %r6775, %r23; + mov.b64 {%r24, %r25}, %rd1310; + xor.b32 %r6777, %r25, %r24; + mov.b64 {%r26, %r27}, %rd1309; + xor.b32 %r6778, %r6777, %r26; + xor.b32 %r6779, %r6778, %r27; + mov.b64 {%r28, %r29}, %rd1308; + xor.b32 %r6780, %r29, %r28; + mov.b64 {%r30, %r31}, %rd1307; + xor.b32 %r6781, %r6780, %r30; + xor.b32 %r6782, %r6781, %r31; + mov.b64 {%r32, %r33}, %rd1306; + xor.b32 %r6783, %r33, %r32; + mov.b64 {%r34, %r35}, %rd1305; + xor.b32 %r6784, %r6783, %r34; + xor.b32 %r6785, %r6784, %r35; + mov.b64 {%r36, %r37}, %rd1304; + xor.b32 %r6786, %r37, %r36; + mov.b64 {%r38, %r39}, %rd1303; + xor.b32 %r6787, %r6786, %r38; + xor.b32 %r6788, %r6787, %r39; + mov.b64 {%r40, %r41}, %rd1302; + xor.b32 %r6789, %r41, %r40; + mov.b64 {%r42, %r43}, %rd1301; + xor.b32 %r6790, %r6789, %r42; + xor.b32 %r6791, %r6790, %r43; + xor.b32 %r6792, %r6779, %r6770; + xor.b32 %r6793, %r6792, %r6788; + mul.wide.u32 %rd446, %r6793, 1908875315; + shr.u64 %rd447, %rd446, 56; + cvt.u32.u64 %r6794, %rd447; + mul.lo.s32 %r6795, %r6794, 37748717; + sub.s32 %r44, %r6793, %r6795; + xor.b32 %r6796, %r6782, %r6773; + xor.b32 %r6797, %r6796, %r6791; + mul.wide.u32 %rd448, %r6797, 1908875315; + shr.u64 %rd449, %rd448, 56; + cvt.u32.u64 %r6798, %rd449; + mul.lo.s32 %r6799, %r6798, 37748717; + sub.s32 %r45, %r6797, 
%r6799; + xor.b32 %r6800, %r6776, %r29819; + xor.b32 %r6801, %r6800, %r6785; + mul.wide.u32 %rd450, %r6801, 1908875315; + shr.u64 %rd451, %rd450, 56; + cvt.u32.u64 %r6802, %rd451; + mul.lo.s32 %r6803, %r6802, 37748717; + sub.s32 %r46, %r6801, %r6803; + shl.b32 %r47, %r44, 1; + mul.wide.u32 %rd452, %r47, -954391867; + shr.u64 %rd453, %rd452, 32; + cvt.u32.u64 %r6804, %rd453; + sub.s32 %r6805, %r47, %r6804; + shr.u32 %r6806, %r6805, 1; + add.s32 %r6807, %r6806, %r6804; + shr.u32 %r6808, %r6807, 20; + mul.lo.s32 %r6809, %r6808, 1179641; + sub.s32 %r6810, %r47, %r6809; + cvta.to.global.u64 %rd454, %rd361; + mul.wide.u32 %rd455, %r6810, 64; + add.s64 %rd32, %rd454, %rd455; + or.b32 %r48, %r47, 1; + mul.wide.u32 %rd456, %r48, -954391867; + shr.u64 %rd457, %rd456, 32; + cvt.u32.u64 %r6811, %rd457; + sub.s32 %r6812, %r48, %r6811; + shr.u32 %r6813, %r6812, 1; + add.s32 %r6814, %r6813, %r6811; + shr.u32 %r6815, %r6814, 20; + mul.lo.s32 %r6816, %r6815, 1179641; + sub.s32 %r6817, %r48, %r6816; + mul.wide.u32 %rd458, %r6817, 64; + add.s64 %rd33, %rd454, %rd458; + setp.eq.s64 %p12, %rd360, 0; + @%p12 bra $L__BB2_25; + + cvta.to.global.u64 %rd459, %rd360; + mul.wide.u32 %rd460, %r44, 128; + add.s64 %rd34, %rd459, %rd460; + ld.global.u64 %rd1317, [%rd34]; + setp.eq.s64 %p13, %rd1317, 0; + @%p13 bra $L__BB2_14; + + ld.global.u64 %rd1332, [%rd34+120]; + ld.global.u64 %rd1331, [%rd34+112]; + ld.global.u64 %rd1330, [%rd34+104]; + ld.global.u64 %rd1329, [%rd34+96]; + ld.global.u64 %rd1328, [%rd34+88]; + ld.global.u64 %rd1327, [%rd34+80]; + ld.global.u64 %rd1326, [%rd34+72]; + ld.global.u64 %rd1325, [%rd34+64]; + ld.global.u64 %rd1324, [%rd34+56]; + ld.global.u64 %rd1323, [%rd34+48]; + ld.global.u64 %rd1322, [%rd34+40]; + ld.global.u64 %rd1321, [%rd34+32]; + ld.global.u64 %rd1320, [%rd34+24]; + ld.global.u64 %rd1319, [%rd34+16]; + ld.global.u64 %rd1318, [%rd34+8]; + bra.uni $L__BB2_36; + +$L__BB2_25: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd562, 1179641; + st.local.u64 [%rd2+8], %rd562; + st.local.u32 [%rd2+16], %r47; + ld.global.u64 %rd563, [%rd32]; + ld.global.u64 %rd564, [%rd32+8]; + ld.global.u64 %rd565, [%rd32+16]; + ld.global.u64 %rd566, [%rd32+24]; + ld.global.u64 %rd567, [%rd32+32]; + ld.global.u64 %rd568, [%rd32+40]; + ld.global.u64 %rd569, [%rd32+48]; + ld.global.u64 %rd570, [%rd32+56]; + st.local.u64 [%rd2+24], %rd563; + st.local.u64 [%rd2+32], %rd564; + st.local.u64 [%rd2+40], %rd565; + st.local.u64 [%rd2+48], %rd566; + st.local.u64 [%rd2+56], %rd567; + st.local.u64 [%rd2+64], %rd568; + st.local.u64 [%rd2+72], %rd569; + st.local.u64 [%rd2+80], %rd570; + cvt.u32.u64 %r10143, %rd563; + xor.b32 %r10144, %r47, %r10143; + st.local.u32 [%rd2+24], %r10144; + mov.u32 %r30057, 0; + st.local.v2.u32 [%rd2+96], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+104], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+112], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+120], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+128], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+136], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+144], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+152], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+160], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+168], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+176], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+184], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+192], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+200], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+208], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+216], {%r30057, %r30057}; + mov.u32 %r30072, -2147483648; + mov.u32 
%r10116, 1; + st.local.v2.u32 [%rd2+88], {%r10116, %r30072}; + ld.local.v2.u32 {%r30093, %r30094}, [%rd2+24]; + mov.b64 {%r30091, %r30092}, %rd568; + shr.u64 %rd571, %rd564, 32; + cvt.u32.u64 %r30105, %rd564; + cvt.u32.u64 %r30106, %rd571; + shr.u64 %rd572, %rd569, 32; + cvt.u32.u64 %r30103, %rd569; + cvt.u32.u64 %r30104, %rd572; + shr.u64 %rd573, %rd565, 32; + cvt.u32.u64 %r30101, %rd565; + cvt.u32.u64 %r30102, %rd573; + shr.u64 %rd574, %rd570, 32; + cvt.u32.u64 %r30099, %rd570; + cvt.u32.u64 %r30100, %rd574; + shr.u64 %rd575, %rd566, 32; + cvt.u32.u64 %r30097, %rd566; + cvt.u32.u64 %r30098, %rd575; + shr.u64 %rd576, %rd567, 32; + cvt.u32.u64 %r30095, %rd567; + cvt.u32.u64 %r30096, %rd576; + mov.u32 %r30058, %r30057; + mov.u32 %r30059, %r30057; + mov.u32 %r30060, %r30057; + mov.u32 %r30061, %r30057; + mov.u32 %r30062, %r30057; + mov.u32 %r30063, %r30057; + mov.u32 %r30064, %r30057; + mov.u32 %r30065, %r30057; + mov.u32 %r30066, %r30057; + mov.u32 %r30067, %r30057; + mov.u32 %r30068, %r30057; + mov.u32 %r30069, %r30057; + mov.u32 %r30070, %r30057; + mov.u32 %r30071, %r10116; + mov.u32 %r30073, %r30057; + mov.u32 %r30074, %r30057; + mov.u32 %r30075, %r30057; + mov.u32 %r30076, %r30057; + mov.u32 %r30077, %r30057; + mov.u32 %r30078, %r30057; + mov.u32 %r30079, %r30057; + mov.u32 %r30080, %r30057; + mov.u32 %r30081, %r30057; + mov.u32 %r30082, %r30057; + mov.u32 %r30083, %r30057; + mov.u32 %r30084, %r30057; + mov.u32 %r30085, %r30057; + mov.u32 %r30086, %r30057; + mov.u32 %r30087, %r30057; + mov.u32 %r30088, %r30057; + mov.u32 %r30089, %r30057; + mov.u32 %r30090, %r30057; + mov.u32 %r30107, %r30057; + +$L__BB2_26: + // begin inline asm + // xor5 + lop3.b32 %r10147, %r30093, %r30091, %r30089, 0x96; + lop3.b32 %r10147, %r10147, %r30087, %r30085, 0x96; + lop3.b32 %r10148, %r30094, %r30092, %r30090, 0x96; + lop3.b32 %r10148, %r10148, %r30088, %r30086, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10159, %r30105, %r30103, %r30083, 0x96; + lop3.b32 %r10159, %r10159, %r30081, %r30079, 0x96; + lop3.b32 %r10160, %r30106, %r30104, %r30084, 0x96; + lop3.b32 %r10160, %r10160, %r30082, %r30080, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10171, %r30101, %r30099, %r30077, 0x96; + lop3.b32 %r10171, %r10171, %r30075, %r30073, 0x96; + lop3.b32 %r10172, %r30102, %r30100, %r30078, 0x96; + lop3.b32 %r10172, %r10172, %r30076, %r30074, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10183, %r30097, %r30071, %r30069, 0x96; + lop3.b32 %r10183, %r10183, %r30067, %r30065, 0x96; + lop3.b32 %r10184, %r30098, %r30072, %r30070, 0x96; + lop3.b32 %r10184, %r10184, %r30068, %r30066, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10195, %r30095, %r30063, %r30061, 0x96; + lop3.b32 %r10195, %r10195, %r30059, %r30057, 0x96; + lop3.b32 %r10196, %r30096, %r30064, %r30062, 0x96; + lop3.b32 %r10196, %r10196, %r30060, %r30058, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10207, %r10160, %r10159, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10211, %r10159, %r10160, %r10116; + // end inline asm + xor.b32 %r10641, %r10207, %r10195; + xor.b32 %r10642, %r10211, %r10196; + xor.b32 %r10474, %r30093, %r10641; + xor.b32 %r10477, %r30094, %r10642; + xor.b32 %r10381, %r30091, %r10641; + xor.b32 %r10380, %r30092, %r10642; + xor.b32 %r10428, %r30089, %r10641; + xor.b32 %r10429, %r30090, %r10642; + xor.b32 %r10333, %r30087, %r10641; + xor.b32 %r10332, %r30088, %r10642; + xor.b32 %r10284, %r30085, 
%r10641; + xor.b32 %r10285, %r30086, %r10642; + // begin inline asm + shf.l.wrap.b32 %r10215, %r10172, %r10171, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10219, %r10171, %r10172, %r10116; + // end inline asm + xor.b32 %r10643, %r10215, %r10147; + xor.b32 %r10644, %r10219, %r10148; + xor.b32 %r10436, %r30105, %r10643; + xor.b32 %r10437, %r30106, %r10644; + xor.b32 %r10253, %r30103, %r10643; + xor.b32 %r10252, %r30104, %r10644; + xor.b32 %r10412, %r30083, %r10643; + xor.b32 %r10413, %r30084, %r10644; + xor.b32 %r10373, %r30081, %r10643; + xor.b32 %r10372, %r30082, %r10644; + xor.b32 %r10356, %r30079, %r10643; + xor.b32 %r10357, %r30080, %r10644; + // begin inline asm + shf.l.wrap.b32 %r10223, %r10184, %r10183, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10227, %r10183, %r10184, %r10116; + // end inline asm + xor.b32 %r10645, %r10223, %r10159; + xor.b32 %r10646, %r10227, %r10160; + xor.b32 %r10293, %r30101, %r10645; + xor.b32 %r10292, %r30102, %r10646; + xor.b32 %r10420, %r30099, %r10645; + xor.b32 %r10421, %r30100, %r10646; + xor.b32 %r10301, %r30077, %r10645; + xor.b32 %r10300, %r30078, %r10646; + xor.b32 %r10404, %r30075, %r10645; + xor.b32 %r10405, %r30076, %r10646; + xor.b32 %r10269, %r30073, %r10645; + xor.b32 %r10268, %r30074, %r10646; + // begin inline asm + shf.l.wrap.b32 %r10231, %r10196, %r10195, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10235, %r10195, %r10196, %r10116; + // end inline asm + xor.b32 %r10647, %r10231, %r10171; + xor.b32 %r10648, %r10235, %r10172; + xor.b32 %r10388, %r30097, %r10647; + xor.b32 %r10389, %r30098, %r10648; + xor.b32 %r10365, %r30071, %r10647; + xor.b32 %r10364, %r30072, %r10648; + xor.b32 %r10308, %r30069, %r10647; + xor.b32 %r10309, %r30070, %r10648; + xor.b32 %r10396, %r30067, %r10647; + xor.b32 %r10397, %r30068, %r10648; + xor.b32 %r10325, %r30065, %r10647; + xor.b32 %r10324, %r30066, %r10648; + // begin inline asm + shf.l.wrap.b32 %r10239, %r10148, %r10147, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10243, %r10147, %r10148, %r10116; + // end inline asm + xor.b32 %r10649, %r10239, %r10183; + xor.b32 %r10650, %r10243, %r10184; + xor.b32 %r10340, %r30095, %r10649; + xor.b32 %r10341, %r30096, %r10650; + xor.b32 %r10260, %r30063, %r10649; + xor.b32 %r10261, %r30064, %r10650; + xor.b32 %r10277, %r30061, %r10649; + xor.b32 %r10276, %r30062, %r10650; + xor.b32 %r10316, %r30059, %r10649; + xor.b32 %r10317, %r30060, %r10650; + xor.b32 %r10348, %r30057, %r10649; + xor.b32 %r10349, %r30058, %r10650; + mov.u32 %r10254, 44; + // begin inline asm + shf.l.wrap.b32 %r10247, %r10253, %r10252, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10251, %r10252, %r10253, %r10254; + // end inline asm + mov.u32 %r10262, 20; + // begin inline asm + shf.l.wrap.b32 %r10255, %r10261, %r10260, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10259, %r10260, %r10261, %r10262; + // end inline asm + mov.u32 %r10270, 61; + // begin inline asm + shf.l.wrap.b32 %r10263, %r10269, %r10268, %r10270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10267, %r10268, %r10269, %r10270; + // end inline asm + mov.u32 %r10278, 39; + // begin inline asm + shf.l.wrap.b32 %r10271, %r10277, %r10276, %r10278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10275, %r10276, %r10277, %r10278; + // end inline asm + mov.u32 %r10286, 18; + // begin inline asm + shf.l.wrap.b32 %r10279, %r10285, %r10284, %r10286; + // end inline asm + 
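// NOTE (annotation, not part of the patch or the generated PTX): from
// $L__BB2_25 onward this is Keccak-f[1600] on a 5x5 state of 64-bit lanes
// held as 32-bit register pairs; this path runs when the global table
// pointer %rd360 is null or the cached record at %rd34 is empty, otherwise
// the precomputed 128-byte record is loaded directly. The "xor5" lop3 blocks
// (immLut 0x96 = three-way XOR, applied twice per column) compute the theta
// column parities; the shf.l.wrap.b32 pairs with constants 44, 20, 61, 39,
// 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3
// and 1 are the rho offsets; the "chi" lop3 blocks (immLut 0xD2 =
// a ^ (~b & c)) are chi; and ld.global.nc from keccak_round_constants with
// the loop bound of 23, plus one round unrolled after the branch, gives the
// 24 iota rounds.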
// begin inline asm + shf.l.wrap.b32 %r10283, %r10284, %r10285, %r10286; + // end inline asm + mov.u32 %r10294, 62; + // begin inline asm + shf.l.wrap.b32 %r10287, %r10293, %r10292, %r10294; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10291, %r10292, %r10293, %r10294; + // end inline asm + mov.u32 %r10302, 43; + // begin inline asm + shf.l.wrap.b32 %r10295, %r10301, %r10300, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10299, %r10300, %r10301, %r10302; + // end inline asm + mov.u32 %r10310, 25; + // begin inline asm + shf.l.wrap.b32 %r10303, %r10309, %r10308, %r10310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10307, %r10308, %r10309, %r10310; + // end inline asm + mov.u32 %r10318, 8; + // begin inline asm + shf.l.wrap.b32 %r10311, %r10317, %r10316, %r10318; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10315, %r10316, %r10317, %r10318; + // end inline asm + mov.u32 %r10326, 56; + // begin inline asm + shf.l.wrap.b32 %r10319, %r10325, %r10324, %r10326; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10323, %r10324, %r10325, %r10326; + // end inline asm + mov.u32 %r10334, 41; + // begin inline asm + shf.l.wrap.b32 %r10327, %r10333, %r10332, %r10334; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10331, %r10332, %r10333, %r10334; + // end inline asm + mov.u32 %r10342, 27; + // begin inline asm + shf.l.wrap.b32 %r10335, %r10341, %r10340, %r10342; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10339, %r10340, %r10341, %r10342; + // end inline asm + mov.u32 %r10350, 14; + // begin inline asm + shf.l.wrap.b32 %r10343, %r10349, %r10348, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10347, %r10348, %r10349, %r10350; + // end inline asm + mov.u32 %r10358, 2; + // begin inline asm + shf.l.wrap.b32 %r10351, %r10357, %r10356, %r10358; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10355, %r10356, %r10357, %r10358; + // end inline asm + mov.u32 %r10366, 55; + // begin inline asm + shf.l.wrap.b32 %r10359, %r10365, %r10364, %r10366; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10363, %r10364, %r10365, %r10366; + // end inline asm + mov.u32 %r10374, 45; + // begin inline asm + shf.l.wrap.b32 %r10367, %r10373, %r10372, %r10374; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10371, %r10372, %r10373, %r10374; + // end inline asm + mov.u32 %r10382, 36; + // begin inline asm + shf.l.wrap.b32 %r10375, %r10381, %r10380, %r10382; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10379, %r10380, %r10381, %r10382; + // end inline asm + mov.u32 %r10390, 28; + // begin inline asm + shf.l.wrap.b32 %r10383, %r10389, %r10388, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10387, %r10388, %r10389, %r10390; + // end inline asm + mov.u32 %r10398, 21; + // begin inline asm + shf.l.wrap.b32 %r10391, %r10397, %r10396, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10395, %r10396, %r10397, %r10398; + // end inline asm + mov.u32 %r10406, 15; + // begin inline asm + shf.l.wrap.b32 %r10399, %r10405, %r10404, %r10406; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10403, %r10404, %r10405, %r10406; + // end inline asm + mov.u32 %r10414, 10; + // begin inline asm + shf.l.wrap.b32 %r10407, %r10413, %r10412, %r10414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10411, %r10412, %r10413, %r10414; + // end inline asm + mov.u32 %r10422, 6; + // begin inline asm + 
shf.l.wrap.b32 %r10415, %r10421, %r10420, %r10422; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10419, %r10420, %r10421, %r10422; + // end inline asm + mov.u32 %r10430, 3; + // begin inline asm + shf.l.wrap.b32 %r10423, %r10429, %r10428, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10427, %r10428, %r10429, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10431, %r10437, %r10436, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10435, %r10436, %r10437, %r10116; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10439, %r10474, %r10247, %r10295, 0xD2; + lop3.b32 %r10440, %r10477, %r10251, %r10299, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30105, %r10247, %r10295, %r10391, 0xD2; + lop3.b32 %r30106, %r10251, %r10299, %r10395, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30101, %r10295, %r10391, %r10343, 0xD2; + lop3.b32 %r30102, %r10299, %r10395, %r10347, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30097, %r10391, %r10343, %r10474, 0xD2; + lop3.b32 %r30098, %r10395, %r10347, %r10477, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30095, %r10343, %r10474, %r10247, 0xD2; + lop3.b32 %r30096, %r10347, %r10477, %r10251, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30091, %r10383, %r10255, %r10423, 0xD2; + lop3.b32 %r30092, %r10387, %r10259, %r10427, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30103, %r10255, %r10423, %r10367, 0xD2; + lop3.b32 %r30104, %r10259, %r10427, %r10371, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30099, %r10423, %r10367, %r10263, 0xD2; + lop3.b32 %r30100, %r10427, %r10371, %r10267, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30071, %r10367, %r10263, %r10383, 0xD2; + lop3.b32 %r30072, %r10371, %r10267, %r10387, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30071, %r30072}; + // begin inline asm + // chi + lop3.b32 %r30063, %r10263, %r10383, %r10255, 0xD2; + lop3.b32 %r30064, %r10267, %r10387, %r10259, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30063, %r30064}; + // begin inline asm + // chi + lop3.b32 %r30089, %r10431, %r10415, %r10303, 0xD2; + lop3.b32 %r30090, %r10435, %r10419, %r10307, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30089, %r30090}; + // begin inline asm + // chi + lop3.b32 %r30083, %r10415, %r10303, %r10311, 0xD2; + lop3.b32 %r30084, %r10419, %r10307, %r10315, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30083, %r30084}; + // begin inline asm + // chi + lop3.b32 %r30077, %r10303, %r10311, %r10279, 0xD2; + lop3.b32 %r30078, %r10307, %r10315, %r10283, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30077, %r30078}; + // begin inline asm + // chi + lop3.b32 %r30069, %r10311, %r10279, %r10431, 0xD2; + lop3.b32 %r30070, %r10315, %r10283, %r10435, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30069, %r30070}; + // begin inline asm + // chi + lop3.b32 %r30061, %r10279, %r10431, %r10415, 0xD2; + lop3.b32 %r30062, %r10283, %r10435, %r10419, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30061, %r30062}; + // begin inline asm + // chi + lop3.b32 %r30087, %r10335, %r10375, %r10407, 0xD2; + lop3.b32 %r30088, %r10339, %r10379, %r10411, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30087, %r30088}; + // begin inline asm + // chi + lop3.b32 %r30081, %r10375, %r10407, 
%r10399, 0xD2; + lop3.b32 %r30082, %r10379, %r10411, %r10403, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30081, %r30082}; + // begin inline asm + // chi + lop3.b32 %r30075, %r10407, %r10399, %r10319, 0xD2; + lop3.b32 %r30076, %r10411, %r10403, %r10323, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30075, %r30076}; + // begin inline asm + // chi + lop3.b32 %r30067, %r10399, %r10319, %r10335, 0xD2; + lop3.b32 %r30068, %r10403, %r10323, %r10339, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30067, %r30068}; + // begin inline asm + // chi + lop3.b32 %r30059, %r10319, %r10335, %r10375, 0xD2; + lop3.b32 %r30060, %r10323, %r10339, %r10379, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30059, %r30060}; + // begin inline asm + // chi + lop3.b32 %r30085, %r10287, %r10359, %r10271, 0xD2; + lop3.b32 %r30086, %r10291, %r10363, %r10275, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30085, %r30086}; + // begin inline asm + // chi + lop3.b32 %r30079, %r10359, %r10271, %r10327, 0xD2; + lop3.b32 %r30080, %r10363, %r10275, %r10331, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30079, %r30080}; + // begin inline asm + // chi + lop3.b32 %r30073, %r10271, %r10327, %r10351, 0xD2; + lop3.b32 %r30074, %r10275, %r10331, %r10355, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30073, %r30074}; + // begin inline asm + // chi + lop3.b32 %r30065, %r10327, %r10351, %r10287, 0xD2; + lop3.b32 %r30066, %r10331, %r10355, %r10291, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30065, %r30066}; + // begin inline asm + // chi + lop3.b32 %r30057, %r10351, %r10287, %r10359, 0xD2; + lop3.b32 %r30058, %r10355, %r10291, %r10363, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30057, %r30058}; + mul.wide.s32 %rd578, %r30107, 8; + mov.u64 %rd579, keccak_round_constants; + cvta.const.u64 %rd580, %rd579; + add.s64 %rd577, %rd580, %rd578; + // begin inline asm + ld.global.nc.v2.u32 {%r10639,%r10640}, [%rd577]; + // end inline asm + xor.b32 %r30093, %r10439, %r10639; + xor.b32 %r30094, %r10440, %r10640; + add.s32 %r30107, %r30107, 1; + setp.lt.u32 %p19, %r30107, 23; + @%p19 bra $L__BB2_26; + + add.u64 %rd82, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30105, %r30106}; + st.local.v2.u32 [%rd2+72], {%r30103, %r30104}; + st.local.v2.u32 [%rd2+40], {%r30101, %r30102}; + st.local.v2.u32 [%rd2+80], {%r30099, %r30100}; + st.local.v2.u32 [%rd2+48], {%r30097, %r30098}; + st.local.v2.u32 [%rd2+56], {%r30095, %r30096}; + st.local.v2.u32 [%rd2+24], {%r30093, %r30094}; + // begin inline asm + // xor5 + lop3.b32 %r10651, %r30093, %r30091, %r30089, 0x96; + lop3.b32 %r10651, %r10651, %r30087, %r30085, 0x96; + lop3.b32 %r10652, %r30094, %r30092, %r30090, 0x96; + lop3.b32 %r10652, %r10652, %r30088, %r30086, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10663, %r30105, %r30103, %r30083, 0x96; + lop3.b32 %r10663, %r10663, %r30081, %r30079, 0x96; + lop3.b32 %r10664, %r30106, %r30104, %r30084, 0x96; + lop3.b32 %r10664, %r10664, %r30082, %r30080, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10675, %r30101, %r30099, %r30077, 0x96; + lop3.b32 %r10675, %r10675, %r30075, %r30073, 0x96; + lop3.b32 %r10676, %r30102, %r30100, %r30078, 0x96; + lop3.b32 %r10676, %r10676, %r30076, %r30074, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10687, %r30097, %r30071, %r30069, 0x96; + lop3.b32 %r10687, %r10687, %r30067, %r30065, 0x96; + lop3.b32 %r10688, %r30098, %r30072, %r30070, 0x96; + 
lop3.b32 %r10688, %r10688, %r30068, %r30066, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10699, %r30095, %r30063, %r30061, 0x96; + lop3.b32 %r10699, %r10699, %r30059, %r30057, 0x96; + lop3.b32 %r10700, %r30096, %r30064, %r30062, 0x96; + lop3.b32 %r10700, %r10700, %r30060, %r30058, 0x96; + // end inline asm + mov.u32 %r10903, 1; + // begin inline asm + shf.l.wrap.b32 %r10711, %r10664, %r10663, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10715, %r10663, %r10664, %r10903; + // end inline asm + xor.b32 %r10930, %r10711, %r10699; + xor.b32 %r10931, %r10715, %r10700; + xor.b32 %r10858, %r30093, %r10930; + xor.b32 %r10861, %r30094, %r10931; + xor.b32 %r10821, %r30090, %r10931; + xor.b32 %r10820, %r30089, %r10930; + st.local.v2.u32 [%rd2+104], {%r10820, %r10821}; + // begin inline asm + shf.l.wrap.b32 %r10719, %r10676, %r10675, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10723, %r10675, %r10676, %r10903; + // end inline asm + xor.b32 %r10932, %r10719, %r10651; + xor.b32 %r10933, %r10723, %r10652; + xor.b32 %r10757, %r30103, %r10932; + xor.b32 %r10756, %r30104, %r10933; + xor.b32 %r10796, %r30082, %r10933; + xor.b32 %r10797, %r30081, %r10932; + st.local.v2.u32 [%rd2+152], {%r10797, %r10796}; + // begin inline asm + shf.l.wrap.b32 %r10727, %r10688, %r10687, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10731, %r10687, %r10688, %r10903; + // end inline asm + xor.b32 %r10934, %r10727, %r10663; + xor.b32 %r10935, %r10731, %r10664; + xor.b32 %r10780, %r30078, %r10935; + xor.b32 %r10781, %r30077, %r10934; + st.local.v2.u32 [%rd2+120], {%r10781, %r10780}; + xor.b32 %r10772, %r30074, %r10935; + xor.b32 %r10773, %r30073, %r10934; + st.local.v2.u32 [%rd2+200], {%r10773, %r10772}; + // begin inline asm + shf.l.wrap.b32 %r10735, %r10700, %r10699, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10739, %r10699, %r10700, %r10903; + // end inline asm + xor.b32 %r10936, %r10735, %r10675; + xor.b32 %r10937, %r10739, %r10676; + xor.b32 %r10804, %r30097, %r10936; + xor.b32 %r10805, %r30098, %r10937; + xor.b32 %r10813, %r30068, %r10937; + xor.b32 %r10812, %r30067, %r10936; + st.local.v2.u32 [%rd2+168], {%r10812, %r10813}; + // begin inline asm + shf.l.wrap.b32 %r10743, %r10652, %r10651, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10747, %r10651, %r10652, %r10903; + // end inline asm + xor.b32 %r10938, %r10743, %r10687; + xor.b32 %r10939, %r10747, %r10688; + xor.b32 %r10764, %r30063, %r10938; + xor.b32 %r10765, %r30064, %r10939; + xor.b32 %r10789, %r30058, %r10939; + xor.b32 %r10788, %r30057, %r10938; + st.local.v2.u32 [%rd2+216], {%r10788, %r10789}; + // begin inline asm + shf.l.wrap.b32 %r10751, %r10757, %r10756, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10755, %r10756, %r10757, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10759, %r10765, %r10764, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10763, %r10764, %r10765, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10771, %r10772, %r10773, %r10270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10767, %r10773, %r10772, %r10270; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r10767, %r10771}; + // begin inline asm + shf.l.wrap.b32 %r10775, %r10781, %r10780, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10779, %r10780, %r10781, %r10302; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r10783, %r10789, %r10788, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10787, %r10788, %r10789, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10795, %r10796, %r10797, %r10374; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10791, %r10797, %r10796, %r10374; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r10791, %r10795}; + // begin inline asm + shf.l.wrap.b32 %r10799, %r10805, %r10804, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10803, %r10804, %r10805, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10807, %r10813, %r10812, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10811, %r10812, %r10813, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10815, %r10821, %r10820, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10819, %r10820, %r10821, %r10430; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10823, %r10858, %r10751, %r10775, 0xD2; + lop3.b32 %r10824, %r10861, %r10755, %r10779, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30240, %r10751, %r10775, %r10807, 0xD2; + lop3.b32 %r30241, %r10755, %r10779, %r10811, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30240, %r30241}; + // begin inline asm + // chi + lop3.b32 %r30236, %r10775, %r10807, %r10783, 0xD2; + lop3.b32 %r30237, %r10779, %r10811, %r10787, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30236, %r30237}; + // begin inline asm + // chi + lop3.b32 %r30232, %r10807, %r10783, %r10858, 0xD2; + lop3.b32 %r30233, %r10811, %r10787, %r10861, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30232, %r30233}; + // begin inline asm + // chi + lop3.b32 %r30230, %r10783, %r10858, %r10751, 0xD2; + lop3.b32 %r30231, %r10787, %r10861, %r10755, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30230, %r30231}; + // begin inline asm + // chi + lop3.b32 %r30226, %r10799, %r10759, %r10815, 0xD2; + lop3.b32 %r30227, %r10803, %r10763, %r10819, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30226, %r30227}; + // begin inline asm + // chi + lop3.b32 %r30238, %r10759, %r10815, %r10791, 0xD2; + lop3.b32 %r30239, %r10763, %r10819, %r10795, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30238, %r30239}; + // begin inline asm + // chi + lop3.b32 %r30234, %r10815, %r10791, %r10767, 0xD2; + lop3.b32 %r30235, %r10819, %r10795, %r10771, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30234, %r30235}; + add.s64 %rd581, %rd580, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r10887,%r10888}, [%rd581]; + // end inline asm + xor.b32 %r30228, %r10823, %r10887; + xor.b32 %r30229, %r10824, %r10888; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + st.local.u64 [%rd82], %rd361; + mov.u64 %rd585, 1179641; + st.local.u64 [%rd82+8], %rd585; + st.local.u32 [%rd82+16], %r48; + ld.global.u64 %rd586, [%rd33]; + ld.global.u64 %rd587, [%rd33+8]; + ld.global.u64 %rd588, [%rd33+16]; + ld.global.u64 %rd589, [%rd33+24]; + ld.global.u64 %rd590, [%rd33+32]; + ld.global.u64 %rd591, [%rd33+40]; + ld.global.u64 %rd592, [%rd33+48]; + ld.global.u64 %rd593, [%rd33+56]; + st.local.u64 [%rd82+32], %rd587; + st.local.u64 [%rd82+40], %rd588; + st.local.u64 [%rd82+48], %rd589; + st.local.u64 [%rd82+56], %rd590; + st.local.u64 [%rd82+64], %rd591; + st.local.u64 [%rd82+72], %rd592; + st.local.u64 [%rd82+80], %rd593; + cvt.u32.u64 %r10940, %rd586; + xor.b32 %r10941, %r48, %r10940; + 
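+ // (editorial annotation, hedged): the code above appears to finish the
+ // 24-round permutation — 23 looped rounds plus one unrolled round whose iota
+ // constant is loaded from keccak_round_constants+184, i.e. round 23 — and
+ // then seed a second sponge state at %rd82: eight 64-bit words are absorbed
+ // from global memory at %rd33, and the low 32 bits of the first word are
+ // XORed against the per-thread value in %r48 (possibly a nonce or lane id)
+ // by the two overlapping stores that follow. The constant 1179641 stored at
+ // [%rd82+8] matches the modulus used later when indexing the
+ // 64-byte-per-entry scratchpad.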
st.local.u64 [%rd82+24], %rd586; + st.local.u32 [%rd82+24], %r10941; + mov.u32 %r30108, 0; + st.local.v2.u32 [%rd82+96], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+104], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+112], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+120], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+128], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+136], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+144], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+152], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+160], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+168], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+176], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+184], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+192], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+200], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+208], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+216], {%r30108, %r30108}; + mov.u32 %r30123, -2147483648; + st.local.v2.u32 [%rd82+88], {%r10903, %r30123}; + ld.local.v2.u32 {%r30144, %r30145}, [%rd82+24]; + mov.b64 {%r30142, %r30143}, %rd591; + shr.u64 %rd594, %rd587, 32; + cvt.u32.u64 %r30156, %rd587; + cvt.u32.u64 %r30157, %rd594; + shr.u64 %rd595, %rd592, 32; + cvt.u32.u64 %r30154, %rd592; + cvt.u32.u64 %r30155, %rd595; + shr.u64 %rd596, %rd588, 32; + cvt.u32.u64 %r30152, %rd588; + cvt.u32.u64 %r30153, %rd596; + shr.u64 %rd597, %rd593, 32; + cvt.u32.u64 %r30150, %rd593; + cvt.u32.u64 %r30151, %rd597; + shr.u64 %rd598, %rd589, 32; + cvt.u32.u64 %r30148, %rd589; + cvt.u32.u64 %r30149, %rd598; + shr.u64 %rd599, %rd590, 32; + cvt.u32.u64 %r30146, %rd590; + cvt.u32.u64 %r30147, %rd599; + mov.u32 %r30109, %r30108; + mov.u32 %r30110, %r30108; + mov.u32 %r30111, %r30108; + mov.u32 %r30112, %r30108; + mov.u32 %r30113, %r30108; + mov.u32 %r30114, %r30108; + mov.u32 %r30115, %r30108; + mov.u32 %r30116, %r30108; + mov.u32 %r30117, %r30108; + mov.u32 %r30118, %r30108; + mov.u32 %r30119, %r30108; + mov.u32 %r30120, %r30108; + mov.u32 %r30121, %r30108; + mov.u32 %r30122, %r10903; + mov.u32 %r30124, %r30108; + mov.u32 %r30125, %r30108; + mov.u32 %r30126, %r30108; + mov.u32 %r30127, %r30108; + mov.u32 %r30128, %r30108; + mov.u32 %r30129, %r30108; + mov.u32 %r30130, %r30108; + mov.u32 %r30131, %r30108; + mov.u32 %r30132, %r30108; + mov.u32 %r30133, %r30108; + mov.u32 %r30134, %r30108; + mov.u32 %r30135, %r30108; + mov.u32 %r30136, %r30108; + mov.u32 %r30137, %r30108; + mov.u32 %r30138, %r30108; + mov.u32 %r30139, %r30108; + mov.u32 %r30140, %r30108; + mov.u32 %r30141, %r30108; + mov.u32 %r30158, %r30108; + +$L__BB2_28: + // begin inline asm + // xor5 + lop3.b32 %r10944, %r30144, %r30142, %r30140, 0x96; + lop3.b32 %r10944, %r10944, %r30138, %r30136, 0x96; + lop3.b32 %r10945, %r30145, %r30143, %r30141, 0x96; + lop3.b32 %r10945, %r10945, %r30139, %r30137, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10956, %r30156, %r30154, %r30134, 0x96; + lop3.b32 %r10956, %r10956, %r30132, %r30130, 0x96; + lop3.b32 %r10957, %r30157, %r30155, %r30135, 0x96; + lop3.b32 %r10957, %r10957, %r30133, %r30131, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10968, %r30152, %r30150, %r30128, 0x96; + lop3.b32 %r10968, %r10968, %r30126, %r30124, 0x96; + lop3.b32 %r10969, %r30153, %r30151, %r30129, 0x96; + lop3.b32 %r10969, %r10969, %r30127, %r30125, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10980, %r30148, %r30122, %r30120, 0x96; + lop3.b32 %r10980, %r10980, %r30118, %r30116, 0x96; + lop3.b32 %r10981, %r30149, %r30123, %r30121, 0x96; + 
lop3.b32 %r10981, %r10981, %r30119, %r30117, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10992, %r30146, %r30114, %r30112, 0x96; + lop3.b32 %r10992, %r10992, %r30110, %r30108, 0x96; + lop3.b32 %r10993, %r30147, %r30115, %r30113, 0x96; + lop3.b32 %r10993, %r10993, %r30111, %r30109, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11004, %r10957, %r10956, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11008, %r10956, %r10957, %r10903; + // end inline asm + xor.b32 %r11438, %r11004, %r10992; + xor.b32 %r11439, %r11008, %r10993; + xor.b32 %r11271, %r30144, %r11438; + xor.b32 %r11274, %r30145, %r11439; + xor.b32 %r11178, %r30142, %r11438; + xor.b32 %r11177, %r30143, %r11439; + xor.b32 %r11225, %r30140, %r11438; + xor.b32 %r11226, %r30141, %r11439; + xor.b32 %r11130, %r30138, %r11438; + xor.b32 %r11129, %r30139, %r11439; + xor.b32 %r11081, %r30136, %r11438; + xor.b32 %r11082, %r30137, %r11439; + // begin inline asm + shf.l.wrap.b32 %r11012, %r10969, %r10968, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11016, %r10968, %r10969, %r10903; + // end inline asm + xor.b32 %r11440, %r11012, %r10944; + xor.b32 %r11441, %r11016, %r10945; + xor.b32 %r11233, %r30156, %r11440; + xor.b32 %r11234, %r30157, %r11441; + xor.b32 %r11050, %r30154, %r11440; + xor.b32 %r11049, %r30155, %r11441; + xor.b32 %r11209, %r30134, %r11440; + xor.b32 %r11210, %r30135, %r11441; + xor.b32 %r11170, %r30132, %r11440; + xor.b32 %r11169, %r30133, %r11441; + xor.b32 %r11153, %r30130, %r11440; + xor.b32 %r11154, %r30131, %r11441; + // begin inline asm + shf.l.wrap.b32 %r11020, %r10981, %r10980, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11024, %r10980, %r10981, %r10903; + // end inline asm + xor.b32 %r11442, %r11020, %r10956; + xor.b32 %r11443, %r11024, %r10957; + xor.b32 %r11090, %r30152, %r11442; + xor.b32 %r11089, %r30153, %r11443; + xor.b32 %r11217, %r30150, %r11442; + xor.b32 %r11218, %r30151, %r11443; + xor.b32 %r11098, %r30128, %r11442; + xor.b32 %r11097, %r30129, %r11443; + xor.b32 %r11201, %r30126, %r11442; + xor.b32 %r11202, %r30127, %r11443; + xor.b32 %r11066, %r30124, %r11442; + xor.b32 %r11065, %r30125, %r11443; + // begin inline asm + shf.l.wrap.b32 %r11028, %r10993, %r10992, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11032, %r10992, %r10993, %r10903; + // end inline asm + xor.b32 %r11444, %r11028, %r10968; + xor.b32 %r11445, %r11032, %r10969; + xor.b32 %r11185, %r30148, %r11444; + xor.b32 %r11186, %r30149, %r11445; + xor.b32 %r11162, %r30122, %r11444; + xor.b32 %r11161, %r30123, %r11445; + xor.b32 %r11105, %r30120, %r11444; + xor.b32 %r11106, %r30121, %r11445; + xor.b32 %r11193, %r30118, %r11444; + xor.b32 %r11194, %r30119, %r11445; + xor.b32 %r11122, %r30116, %r11444; + xor.b32 %r11121, %r30117, %r11445; + // begin inline asm + shf.l.wrap.b32 %r11036, %r10945, %r10944, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11040, %r10944, %r10945, %r10903; + // end inline asm + xor.b32 %r11446, %r11036, %r10980; + xor.b32 %r11447, %r11040, %r10981; + xor.b32 %r11137, %r30146, %r11446; + xor.b32 %r11138, %r30147, %r11447; + xor.b32 %r11057, %r30114, %r11446; + xor.b32 %r11058, %r30115, %r11447; + xor.b32 %r11074, %r30112, %r11446; + xor.b32 %r11073, %r30113, %r11447; + xor.b32 %r11113, %r30110, %r11446; + xor.b32 %r11114, %r30111, %r11447; + xor.b32 %r11145, %r30108, %r11446; + xor.b32 %r11146, %r30109, %r11447; + mov.u32 %r11051, 44; + // begin inline asm + 
shf.l.wrap.b32 %r11044, %r11050, %r11049, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11048, %r11049, %r11050, %r11051; + // end inline asm + mov.u32 %r11059, 20; + // begin inline asm + shf.l.wrap.b32 %r11052, %r11058, %r11057, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11056, %r11057, %r11058, %r11059; + // end inline asm + mov.u32 %r11067, 61; + // begin inline asm + shf.l.wrap.b32 %r11060, %r11066, %r11065, %r11067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11064, %r11065, %r11066, %r11067; + // end inline asm + mov.u32 %r11075, 39; + // begin inline asm + shf.l.wrap.b32 %r11068, %r11074, %r11073, %r11075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11072, %r11073, %r11074, %r11075; + // end inline asm + mov.u32 %r11083, 18; + // begin inline asm + shf.l.wrap.b32 %r11076, %r11082, %r11081, %r11083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11080, %r11081, %r11082, %r11083; + // end inline asm + mov.u32 %r11091, 62; + // begin inline asm + shf.l.wrap.b32 %r11084, %r11090, %r11089, %r11091; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11088, %r11089, %r11090, %r11091; + // end inline asm + mov.u32 %r11099, 43; + // begin inline asm + shf.l.wrap.b32 %r11092, %r11098, %r11097, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11096, %r11097, %r11098, %r11099; + // end inline asm + mov.u32 %r11107, 25; + // begin inline asm + shf.l.wrap.b32 %r11100, %r11106, %r11105, %r11107; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11104, %r11105, %r11106, %r11107; + // end inline asm + mov.u32 %r11115, 8; + // begin inline asm + shf.l.wrap.b32 %r11108, %r11114, %r11113, %r11115; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11112, %r11113, %r11114, %r11115; + // end inline asm + mov.u32 %r11123, 56; + // begin inline asm + shf.l.wrap.b32 %r11116, %r11122, %r11121, %r11123; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11120, %r11121, %r11122, %r11123; + // end inline asm + mov.u32 %r11131, 41; + // begin inline asm + shf.l.wrap.b32 %r11124, %r11130, %r11129, %r11131; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11128, %r11129, %r11130, %r11131; + // end inline asm + mov.u32 %r11139, 27; + // begin inline asm + shf.l.wrap.b32 %r11132, %r11138, %r11137, %r11139; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11136, %r11137, %r11138, %r11139; + // end inline asm + mov.u32 %r11147, 14; + // begin inline asm + shf.l.wrap.b32 %r11140, %r11146, %r11145, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11144, %r11145, %r11146, %r11147; + // end inline asm + mov.u32 %r11155, 2; + // begin inline asm + shf.l.wrap.b32 %r11148, %r11154, %r11153, %r11155; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11152, %r11153, %r11154, %r11155; + // end inline asm + mov.u32 %r11163, 55; + // begin inline asm + shf.l.wrap.b32 %r11156, %r11162, %r11161, %r11163; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11160, %r11161, %r11162, %r11163; + // end inline asm + mov.u32 %r11171, 45; + // begin inline asm + shf.l.wrap.b32 %r11164, %r11170, %r11169, %r11171; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11168, %r11169, %r11170, %r11171; + // end inline asm + mov.u32 %r11179, 36; + // begin inline asm + shf.l.wrap.b32 %r11172, %r11178, %r11177, %r11179; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11176, %r11177, %r11178, %r11179; + 
// end inline asm + mov.u32 %r11187, 28; + // begin inline asm + shf.l.wrap.b32 %r11180, %r11186, %r11185, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11184, %r11185, %r11186, %r11187; + // end inline asm + mov.u32 %r11195, 21; + // begin inline asm + shf.l.wrap.b32 %r11188, %r11194, %r11193, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11192, %r11193, %r11194, %r11195; + // end inline asm + mov.u32 %r11203, 15; + // begin inline asm + shf.l.wrap.b32 %r11196, %r11202, %r11201, %r11203; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11200, %r11201, %r11202, %r11203; + // end inline asm + mov.u32 %r11211, 10; + // begin inline asm + shf.l.wrap.b32 %r11204, %r11210, %r11209, %r11211; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11208, %r11209, %r11210, %r11211; + // end inline asm + mov.u32 %r11219, 6; + // begin inline asm + shf.l.wrap.b32 %r11212, %r11218, %r11217, %r11219; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11216, %r11217, %r11218, %r11219; + // end inline asm + mov.u32 %r11227, 3; + // begin inline asm + shf.l.wrap.b32 %r11220, %r11226, %r11225, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11224, %r11225, %r11226, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11228, %r11234, %r11233, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11232, %r11233, %r11234, %r10903; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r11236, %r11271, %r11044, %r11092, 0xD2; + lop3.b32 %r11237, %r11274, %r11048, %r11096, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30156, %r11044, %r11092, %r11188, 0xD2; + lop3.b32 %r30157, %r11048, %r11096, %r11192, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30152, %r11092, %r11188, %r11140, 0xD2; + lop3.b32 %r30153, %r11096, %r11192, %r11144, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30148, %r11188, %r11140, %r11271, 0xD2; + lop3.b32 %r30149, %r11192, %r11144, %r11274, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30146, %r11140, %r11271, %r11044, 0xD2; + lop3.b32 %r30147, %r11144, %r11274, %r11048, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30142, %r11180, %r11052, %r11220, 0xD2; + lop3.b32 %r30143, %r11184, %r11056, %r11224, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30154, %r11052, %r11220, %r11164, 0xD2; + lop3.b32 %r30155, %r11056, %r11224, %r11168, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30150, %r11220, %r11164, %r11060, 0xD2; + lop3.b32 %r30151, %r11224, %r11168, %r11064, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30122, %r11164, %r11060, %r11180, 0xD2; + lop3.b32 %r30123, %r11168, %r11064, %r11184, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r30122, %r30123}; + // begin inline asm + // chi + lop3.b32 %r30114, %r11060, %r11180, %r11052, 0xD2; + lop3.b32 %r30115, %r11064, %r11184, %r11056, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r30114, %r30115}; + // begin inline asm + // chi + lop3.b32 %r30140, %r11228, %r11212, %r11100, 0xD2; + lop3.b32 %r30141, %r11232, %r11216, %r11104, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+104], {%r30140, %r30141}; + // begin inline asm + // chi + lop3.b32 %r30134, %r11212, %r11100, %r11108, 0xD2; + lop3.b32 %r30135, %r11216, %r11104, %r11112, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+112], 
{%r30134, %r30135}; + // begin inline asm + // chi + lop3.b32 %r30128, %r11100, %r11108, %r11076, 0xD2; + lop3.b32 %r30129, %r11104, %r11112, %r11080, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+120], {%r30128, %r30129}; + // begin inline asm + // chi + lop3.b32 %r30120, %r11108, %r11076, %r11228, 0xD2; + lop3.b32 %r30121, %r11112, %r11080, %r11232, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+128], {%r30120, %r30121}; + // begin inline asm + // chi + lop3.b32 %r30112, %r11076, %r11228, %r11212, 0xD2; + lop3.b32 %r30113, %r11080, %r11232, %r11216, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+136], {%r30112, %r30113}; + // begin inline asm + // chi + lop3.b32 %r30138, %r11132, %r11172, %r11204, 0xD2; + lop3.b32 %r30139, %r11136, %r11176, %r11208, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+144], {%r30138, %r30139}; + // begin inline asm + // chi + lop3.b32 %r30132, %r11172, %r11204, %r11196, 0xD2; + lop3.b32 %r30133, %r11176, %r11208, %r11200, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+152], {%r30132, %r30133}; + // begin inline asm + // chi + lop3.b32 %r30126, %r11204, %r11196, %r11116, 0xD2; + lop3.b32 %r30127, %r11208, %r11200, %r11120, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+160], {%r30126, %r30127}; + // begin inline asm + // chi + lop3.b32 %r30118, %r11196, %r11116, %r11132, 0xD2; + lop3.b32 %r30119, %r11200, %r11120, %r11136, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+168], {%r30118, %r30119}; + // begin inline asm + // chi + lop3.b32 %r30110, %r11116, %r11132, %r11172, 0xD2; + lop3.b32 %r30111, %r11120, %r11136, %r11176, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+176], {%r30110, %r30111}; + // begin inline asm + // chi + lop3.b32 %r30136, %r11084, %r11156, %r11068, 0xD2; + lop3.b32 %r30137, %r11088, %r11160, %r11072, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+184], {%r30136, %r30137}; + // begin inline asm + // chi + lop3.b32 %r30130, %r11156, %r11068, %r11124, 0xD2; + lop3.b32 %r30131, %r11160, %r11072, %r11128, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+192], {%r30130, %r30131}; + // begin inline asm + // chi + lop3.b32 %r30124, %r11068, %r11124, %r11148, 0xD2; + lop3.b32 %r30125, %r11072, %r11128, %r11152, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+200], {%r30124, %r30125}; + // begin inline asm + // chi + lop3.b32 %r30116, %r11124, %r11148, %r11084, 0xD2; + lop3.b32 %r30117, %r11128, %r11152, %r11088, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+208], {%r30116, %r30117}; + // begin inline asm + // chi + lop3.b32 %r30108, %r11148, %r11084, %r11156, 0xD2; + lop3.b32 %r30109, %r11152, %r11088, %r11160, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+216], {%r30108, %r30109}; + mul.wide.s32 %rd601, %r30158, 8; + add.s64 %rd600, %rd580, %rd601; + // begin inline asm + ld.global.nc.v2.u32 {%r11436,%r11437}, [%rd600]; + // end inline asm + xor.b32 %r30144, %r11236, %r11436; + xor.b32 %r30145, %r11237, %r11437; + add.s32 %r30158, %r30158, 1; + setp.lt.u32 %p20, %r30158, 23; + @%p20 bra $L__BB2_28; + + mov.u32 %r30191, 0; + mov.u32 %r11547, 1; + st.local.v2.u32 [%rd82+32], {%r30156, %r30157}; + st.local.v2.u32 [%rd82+72], {%r30154, %r30155}; + st.local.v2.u32 [%rd82+40], {%r30152, %r30153}; + st.local.v2.u32 [%rd82+80], {%r30150, %r30151}; + st.local.v2.u32 [%rd82+48], {%r30148, %r30149}; + st.local.v2.u32 [%rd82+56], {%r30146, %r30147}; + st.local.v2.u32 [%rd82+24], {%r30144, %r30145}; + // begin inline asm + // xor5 + lop3.b32 %r11448, %r30144, %r30142, %r30140, 0x96; + lop3.b32 %r11448, 
%r11448, %r30138, %r30136, 0x96; + lop3.b32 %r11449, %r30145, %r30143, %r30141, 0x96; + lop3.b32 %r11449, %r11449, %r30139, %r30137, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11460, %r30156, %r30154, %r30134, 0x96; + lop3.b32 %r11460, %r11460, %r30132, %r30130, 0x96; + lop3.b32 %r11461, %r30157, %r30155, %r30135, 0x96; + lop3.b32 %r11461, %r11461, %r30133, %r30131, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11472, %r30152, %r30150, %r30128, 0x96; + lop3.b32 %r11472, %r11472, %r30126, %r30124, 0x96; + lop3.b32 %r11473, %r30153, %r30151, %r30129, 0x96; + lop3.b32 %r11473, %r11473, %r30127, %r30125, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11484, %r30148, %r30122, %r30120, 0x96; + lop3.b32 %r11484, %r11484, %r30118, %r30116, 0x96; + lop3.b32 %r11485, %r30149, %r30123, %r30121, 0x96; + lop3.b32 %r11485, %r11485, %r30119, %r30117, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11496, %r30146, %r30114, %r30112, 0x96; + lop3.b32 %r11496, %r11496, %r30110, %r30108, 0x96; + lop3.b32 %r11497, %r30147, %r30115, %r30113, 0x96; + lop3.b32 %r11497, %r11497, %r30111, %r30109, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11508, %r11461, %r11460, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11512, %r11460, %r11461, %r11547; + // end inline asm + xor.b32 %r11687, %r11508, %r11496; + xor.b32 %r11688, %r11512, %r11497; + xor.b32 %r11655, %r30144, %r11687; + xor.b32 %r11658, %r30145, %r11688; + xor.b32 %r11618, %r30141, %r11688; + xor.b32 %r11617, %r30140, %r11687; + st.local.v2.u32 [%rd82+104], {%r11617, %r11618}; + // begin inline asm + shf.l.wrap.b32 %r11516, %r11473, %r11472, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11520, %r11472, %r11473, %r11547; + // end inline asm + xor.b32 %r11689, %r11516, %r11448; + xor.b32 %r11690, %r11520, %r11449; + xor.b32 %r11554, %r30154, %r11689; + xor.b32 %r11553, %r30155, %r11690; + xor.b32 %r11593, %r30133, %r11690; + xor.b32 %r11594, %r30132, %r11689; + st.local.v2.u32 [%rd82+152], {%r11594, %r11593}; + // begin inline asm + shf.l.wrap.b32 %r11524, %r11485, %r11484, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11528, %r11484, %r11485, %r11547; + // end inline asm + xor.b32 %r11691, %r11524, %r11460; + xor.b32 %r11692, %r11528, %r11461; + xor.b32 %r11577, %r30129, %r11692; + xor.b32 %r11578, %r30128, %r11691; + st.local.v2.u32 [%rd82+120], {%r11578, %r11577}; + xor.b32 %r11569, %r30125, %r11692; + xor.b32 %r11570, %r30124, %r11691; + st.local.v2.u32 [%rd82+200], {%r11570, %r11569}; + // begin inline asm + shf.l.wrap.b32 %r11532, %r11497, %r11496, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11536, %r11496, %r11497, %r11547; + // end inline asm + xor.b32 %r11693, %r11532, %r11472; + xor.b32 %r11694, %r11536, %r11473; + xor.b32 %r11601, %r30148, %r11693; + xor.b32 %r11602, %r30149, %r11694; + xor.b32 %r11610, %r30119, %r11694; + xor.b32 %r11609, %r30118, %r11693; + st.local.v2.u32 [%rd82+168], {%r11609, %r11610}; + // begin inline asm + shf.l.wrap.b32 %r11540, %r11449, %r11448, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11544, %r11448, %r11449, %r11547; + // end inline asm + xor.b32 %r11695, %r11540, %r11484; + xor.b32 %r11696, %r11544, %r11485; + xor.b32 %r11561, %r30114, %r11695; + xor.b32 %r11562, %r30115, %r11696; + xor.b32 %r11586, %r30109, %r11696; + xor.b32 %r11585, %r30108, %r11695; + st.local.v2.u32 
[%rd82+216], {%r11585, %r11586}; + // begin inline asm + shf.l.wrap.b32 %r11548, %r11554, %r11553, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11552, %r11553, %r11554, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11556, %r11562, %r11561, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11560, %r11561, %r11562, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11568, %r11569, %r11570, %r11067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11564, %r11570, %r11569, %r11067; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r11564, %r11568}; + // begin inline asm + shf.l.wrap.b32 %r11572, %r11578, %r11577, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11576, %r11577, %r11578, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11580, %r11586, %r11585, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11584, %r11585, %r11586, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11592, %r11593, %r11594, %r11171; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11588, %r11594, %r11593, %r11171; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r11588, %r11592}; + // begin inline asm + shf.l.wrap.b32 %r11596, %r11602, %r11601, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11600, %r11601, %r11602, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11604, %r11610, %r11609, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11608, %r11609, %r11610, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11612, %r11618, %r11617, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11616, %r11617, %r11618, %r11227; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r11620, %r11655, %r11548, %r11572, 0xD2; + lop3.b32 %r11621, %r11658, %r11552, %r11576, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30291, %r11548, %r11572, %r11604, 0xD2; + lop3.b32 %r30292, %r11552, %r11576, %r11608, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+32], {%r30291, %r30292}; + // begin inline asm + // chi + lop3.b32 %r30287, %r11572, %r11604, %r11580, 0xD2; + lop3.b32 %r30288, %r11576, %r11608, %r11584, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+40], {%r30287, %r30288}; + // begin inline asm + // chi + lop3.b32 %r30283, %r11604, %r11580, %r11655, 0xD2; + lop3.b32 %r30284, %r11608, %r11584, %r11658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+48], {%r30283, %r30284}; + // begin inline asm + // chi + lop3.b32 %r30281, %r11580, %r11655, %r11548, 0xD2; + lop3.b32 %r30282, %r11584, %r11658, %r11552, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+56], {%r30281, %r30282}; + // begin inline asm + // chi + lop3.b32 %r30277, %r11596, %r11556, %r11612, 0xD2; + lop3.b32 %r30278, %r11600, %r11560, %r11616, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+64], {%r30277, %r30278}; + // begin inline asm + // chi + lop3.b32 %r30289, %r11556, %r11612, %r11588, 0xD2; + lop3.b32 %r30290, %r11560, %r11616, %r11592, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+72], {%r30289, %r30290}; + // begin inline asm + // chi + lop3.b32 %r30285, %r11612, %r11588, %r11564, 0xD2; + lop3.b32 %r30286, %r11616, %r11592, %r11568, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+80], {%r30285, %r30286}; + // begin inline asm + ld.global.nc.v2.u32 {%r11684,%r11685}, [%rd581]; + // end inline asm + xor.b32 
%r30279, %r11620, %r11684; + xor.b32 %r30280, %r11621, %r11685; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + add.s64 %rd84, %rd82, 24; + add.s64 %rd85, %rd2, 24; + +$L__BB2_30: + cvta.to.global.u64 %rd1271, %rd361; + shl.b32 %r11697, %r30191, 2; + cvt.u64.u32 %rd611, %r11697; + and.b64 %rd612, %rd611, 60; + add.s64 %rd613, %rd85, %rd612; + xor.b32 %r11698, %r47, %r30191; + mul.lo.s32 %r11699, %r11698, 16777619; + ld.local.u32 %r11700, [%rd613]; + xor.b32 %r11701, %r11699, %r11700; + mul.wide.u32 %rd614, %r11701, -954391867; + shr.u64 %rd615, %rd614, 32; + cvt.u32.u64 %r11702, %rd615; + sub.s32 %r11703, %r11701, %r11702; + shr.u32 %r11704, %r11703, 1; + add.s32 %r11705, %r11704, %r11702; + shr.u32 %r11706, %r11705, 20; + mul.lo.s32 %r11707, %r11706, 1179641; + sub.s32 %r11708, %r11701, %r11707; + mul.wide.u32 %rd616, %r11708, 64; + add.s64 %rd617, %rd1271, %rd616; + mul.lo.s32 %r11709, %r30228, 16777619; + ld.global.u32 %r11710, [%rd617]; + xor.b32 %r30228, %r11709, %r11710; + mul.lo.s32 %r11711, %r30229, 16777619; + ld.global.u32 %r11712, [%rd617+4]; + xor.b32 %r30229, %r11711, %r11712; + mul.lo.s32 %r11713, %r30240, 16777619; + ld.global.u32 %r11714, [%rd617+8]; + mul.lo.s32 %r11715, %r30241, 16777619; + ld.global.u32 %r11716, [%rd617+12]; + xor.b32 %r11717, %r11715, %r11716; + xor.b32 %r30240, %r11713, %r11714; + mov.b64 %rd618, {%r30240, %r11717}; + mul.lo.s32 %r11718, %r30236, 16777619; + ld.global.u32 %r11719, [%rd617+16]; + mul.lo.s32 %r11720, %r30237, 16777619; + ld.global.u32 %r11721, [%rd617+20]; + xor.b32 %r11722, %r11720, %r11721; + xor.b32 %r30236, %r11718, %r11719; + mov.b64 %rd619, {%r30236, %r11722}; + mul.lo.s32 %r11723, %r30232, 16777619; + ld.global.u32 %r11724, [%rd617+24]; + mul.lo.s32 %r11725, %r30233, 16777619; + ld.global.u32 %r11726, [%rd617+28]; + xor.b32 %r11727, %r11725, %r11726; + xor.b32 %r30232, %r11723, %r11724; + mov.b64 %rd620, {%r30232, %r11727}; + mul.lo.s32 %r11728, %r30230, 16777619; + ld.global.u32 %r11729, [%rd617+32]; + mul.lo.s32 %r11730, %r30231, 16777619; + ld.global.u32 %r11731, [%rd617+36]; + xor.b32 %r11732, %r11730, %r11731; + xor.b32 %r30230, %r11728, %r11729; + mov.b64 %rd621, {%r30230, %r11732}; + mul.lo.s32 %r11733, %r30226, 16777619; + ld.global.u32 %r11734, [%rd617+40]; + xor.b32 %r30226, %r11733, %r11734; + mul.lo.s32 %r11735, %r30227, 16777619; + ld.global.u32 %r11736, [%rd617+44]; + xor.b32 %r30227, %r11735, %r11736; + mul.lo.s32 %r11737, %r30238, 16777619; + ld.global.u32 %r11738, [%rd617+48]; + mul.lo.s32 %r11739, %r30239, 16777619; + ld.global.u32 %r11740, [%rd617+52]; + xor.b32 %r11741, %r11739, %r11740; + xor.b32 %r30238, %r11737, %r11738; + mov.b64 %rd622, {%r30238, %r11741}; + mul.lo.s32 %r11742, %r30234, 16777619; + ld.global.u32 %r11743, [%rd617+56]; + mul.lo.s32 %r11744, %r30235, 16777619; + ld.global.u32 %r11745, [%rd617+60]; + xor.b32 %r11746, %r11744, %r11745; + xor.b32 %r30234, %r11742, %r11743; + mov.b64 %rd623, {%r30234, %r11746}; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + st.local.v2.u32 [%rd2+32], {%r30240, %r11717}; + st.local.v2.u32 [%rd2+40], {%r30236, %r11722}; + st.local.v2.u32 [%rd2+48], {%r30232, %r11727}; + st.local.v2.u32 [%rd2+56], {%r30230, %r11732}; + st.local.v2.u32 [%rd2+64], {%r30226, %r30227}; + st.local.v2.u32 [%rd2+72], {%r30238, %r11741}; + st.local.v2.u32 [%rd2+80], {%r30234, %r11746}; + add.s64 %rd624, %rd84, %rd612; + xor.b32 %r11747, %r48, %r30191; + mul.lo.s32 %r11748, %r11747, 16777619; + ld.local.u32 %r11749, [%rd624]; + xor.b32 %r11750, %r11748, %r11749; + mul.wide.u32 
%rd625, %r11750, -954391867; + shr.u64 %rd626, %rd625, 32; + cvt.u32.u64 %r11751, %rd626; + sub.s32 %r11752, %r11750, %r11751; + shr.u32 %r11753, %r11752, 1; + add.s32 %r11754, %r11753, %r11751; + shr.u32 %r11755, %r11754, 20; + mul.lo.s32 %r11756, %r11755, 1179641; + sub.s32 %r11757, %r11750, %r11756; + mul.wide.u32 %rd627, %r11757, 64; + add.s64 %rd628, %rd1271, %rd627; + mul.lo.s32 %r11758, %r30279, 16777619; + ld.global.u32 %r11759, [%rd628]; + xor.b32 %r30279, %r11758, %r11759; + mul.lo.s32 %r11760, %r30280, 16777619; + ld.global.u32 %r11761, [%rd628+4]; + xor.b32 %r30280, %r11760, %r11761; + mul.lo.s32 %r11762, %r30291, 16777619; + ld.global.u32 %r11763, [%rd628+8]; + mul.lo.s32 %r11764, %r30292, 16777619; + ld.global.u32 %r11765, [%rd628+12]; + xor.b32 %r11766, %r11764, %r11765; + xor.b32 %r30291, %r11762, %r11763; + mov.b64 %rd629, {%r30291, %r11766}; + mul.lo.s32 %r11767, %r30287, 16777619; + ld.global.u32 %r11768, [%rd628+16]; + mul.lo.s32 %r11769, %r30288, 16777619; + ld.global.u32 %r11770, [%rd628+20]; + xor.b32 %r11771, %r11769, %r11770; + xor.b32 %r30287, %r11767, %r11768; + mov.b64 %rd630, {%r30287, %r11771}; + mul.lo.s32 %r11772, %r30283, 16777619; + ld.global.u32 %r11773, [%rd628+24]; + mul.lo.s32 %r11774, %r30284, 16777619; + ld.global.u32 %r11775, [%rd628+28]; + xor.b32 %r11776, %r11774, %r11775; + xor.b32 %r30283, %r11772, %r11773; + mov.b64 %rd631, {%r30283, %r11776}; + mul.lo.s32 %r11777, %r30281, 16777619; + ld.global.u32 %r11778, [%rd628+32]; + mul.lo.s32 %r11779, %r30282, 16777619; + ld.global.u32 %r11780, [%rd628+36]; + xor.b32 %r11781, %r11779, %r11780; + xor.b32 %r30281, %r11777, %r11778; + mov.b64 %rd632, {%r30281, %r11781}; + mul.lo.s32 %r11782, %r30277, 16777619; + ld.global.u32 %r11783, [%rd628+40]; + xor.b32 %r30277, %r11782, %r11783; + mul.lo.s32 %r11784, %r30278, 16777619; + ld.global.u32 %r11785, [%rd628+44]; + xor.b32 %r30278, %r11784, %r11785; + mul.lo.s32 %r11786, %r30289, 16777619; + ld.global.u32 %r11787, [%rd628+48]; + mul.lo.s32 %r11788, %r30290, 16777619; + ld.global.u32 %r11789, [%rd628+52]; + xor.b32 %r11790, %r11788, %r11789; + xor.b32 %r30289, %r11786, %r11787; + mov.b64 %rd633, {%r30289, %r11790}; + mul.lo.s32 %r11791, %r30285, 16777619; + ld.global.u32 %r11792, [%rd628+56]; + mul.lo.s32 %r11793, %r30286, 16777619; + ld.global.u32 %r11794, [%rd628+60]; + xor.b32 %r11795, %r11793, %r11794; + xor.b32 %r30285, %r11791, %r11792; + mov.b64 %rd634, {%r30285, %r11795}; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + st.local.v2.u32 [%rd82+32], {%r30291, %r11766}; + st.local.v2.u32 [%rd82+40], {%r30287, %r11771}; + st.local.v2.u32 [%rd82+48], {%r30283, %r11776}; + st.local.v2.u32 [%rd82+56], {%r30281, %r11781}; + st.local.v2.u32 [%rd82+64], {%r30277, %r30278}; + st.local.v2.u32 [%rd82+72], {%r30289, %r11790}; + st.local.v2.u32 [%rd82+80], {%r30285, %r11795}; + add.s32 %r30191, %r30191, 1; + setp.lt.u32 %p21, %r30191, 512; + shr.u64 %rd635, %rd618, 32; + cvt.u32.u64 %r30241, %rd635; + shr.u64 %rd636, %rd619, 32; + cvt.u32.u64 %r30237, %rd636; + shr.u64 %rd637, %rd620, 32; + cvt.u32.u64 %r30233, %rd637; + shr.u64 %rd638, %rd621, 32; + cvt.u32.u64 %r30231, %rd638; + shr.u64 %rd639, %rd622, 32; + cvt.u32.u64 %r30239, %rd639; + shr.u64 %rd640, %rd623, 32; + cvt.u32.u64 %r30235, %rd640; + shr.u64 %rd641, %rd629, 32; + cvt.u32.u64 %r30292, %rd641; + shr.u64 %rd642, %rd630, 32; + cvt.u32.u64 %r30288, %rd642; + shr.u64 %rd643, %rd631, 32; + cvt.u32.u64 %r30284, %rd643; + shr.u64 %rd644, %rd632, 32; + cvt.u32.u64 %r30282, %rd644; + shr.u64 %rd645, %rd633, 
32; + cvt.u32.u64 %r30290, %rd645; + shr.u64 %rd646, %rd634, 32; + cvt.u32.u64 %r30286, %rd646; + @%p21 bra $L__BB2_30; + + mov.u32 %r30192, 0; + st.local.v2.u32 [%rd2+96], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+104], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+112], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+120], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+128], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+136], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+144], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+152], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+160], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+168], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+176], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+184], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+192], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+200], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+208], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+216], {%r30192, %r30192}; + mov.u32 %r30207, -2147483648; + mov.u32 %r11810, 1; + st.local.v2.u32 [%rd2+88], {%r11810, %r30207}; + mov.u32 %r30193, %r30192; + mov.u32 %r30194, %r30192; + mov.u32 %r30195, %r30192; + mov.u32 %r30196, %r30192; + mov.u32 %r30197, %r30192; + mov.u32 %r30198, %r30192; + mov.u32 %r30199, %r30192; + mov.u32 %r30200, %r30192; + mov.u32 %r30201, %r30192; + mov.u32 %r30202, %r30192; + mov.u32 %r30203, %r30192; + mov.u32 %r30204, %r30192; + mov.u32 %r30205, %r30192; + mov.u32 %r30206, %r11810; + mov.u32 %r30208, %r30192; + mov.u32 %r30209, %r30192; + mov.u32 %r30210, %r30192; + mov.u32 %r30211, %r30192; + mov.u32 %r30212, %r30192; + mov.u32 %r30213, %r30192; + mov.u32 %r30214, %r30192; + mov.u32 %r30215, %r30192; + mov.u32 %r30216, %r30192; + mov.u32 %r30217, %r30192; + mov.u32 %r30218, %r30192; + mov.u32 %r30219, %r30192; + mov.u32 %r30220, %r30192; + mov.u32 %r30221, %r30192; + mov.u32 %r30222, %r30192; + mov.u32 %r30223, %r30192; + mov.u32 %r30224, %r30192; + mov.u32 %r30225, %r30192; + mov.u32 %r30242, %r30192; + +$L__BB2_32: + // begin inline asm + // xor5 + lop3.b32 %r11837, %r30228, %r30226, %r30224, 0x96; + lop3.b32 %r11837, %r11837, %r30222, %r30220, 0x96; + lop3.b32 %r11838, %r30229, %r30227, %r30225, 0x96; + lop3.b32 %r11838, %r11838, %r30223, %r30221, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11849, %r30240, %r30238, %r30218, 0x96; + lop3.b32 %r11849, %r11849, %r30216, %r30214, 0x96; + lop3.b32 %r11850, %r30241, %r30239, %r30219, 0x96; + lop3.b32 %r11850, %r11850, %r30217, %r30215, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11861, %r30236, %r30234, %r30212, 0x96; + lop3.b32 %r11861, %r11861, %r30210, %r30208, 0x96; + lop3.b32 %r11862, %r30237, %r30235, %r30213, 0x96; + lop3.b32 %r11862, %r11862, %r30211, %r30209, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11873, %r30232, %r30206, %r30204, 0x96; + lop3.b32 %r11873, %r11873, %r30202, %r30200, 0x96; + lop3.b32 %r11874, %r30233, %r30207, %r30205, 0x96; + lop3.b32 %r11874, %r11874, %r30203, %r30201, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11885, %r30230, %r30198, %r30196, 0x96; + lop3.b32 %r11885, %r11885, %r30194, %r30192, 0x96; + lop3.b32 %r11886, %r30231, %r30199, %r30197, 0x96; + lop3.b32 %r11886, %r11886, %r30195, %r30193, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11897, %r11850, %r11849, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11901, %r11849, %r11850, %r11810; + // end inline asm + xor.b32 %r12331, %r11897, %r11885; + xor.b32 %r12332, 
%r11901, %r11886; + xor.b32 %r12164, %r30228, %r12331; + xor.b32 %r12167, %r30229, %r12332; + xor.b32 %r12071, %r30226, %r12331; + xor.b32 %r12070, %r30227, %r12332; + xor.b32 %r12118, %r30224, %r12331; + xor.b32 %r12119, %r30225, %r12332; + xor.b32 %r12023, %r30222, %r12331; + xor.b32 %r12022, %r30223, %r12332; + xor.b32 %r11974, %r30220, %r12331; + xor.b32 %r11975, %r30221, %r12332; + // begin inline asm + shf.l.wrap.b32 %r11905, %r11862, %r11861, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11909, %r11861, %r11862, %r11810; + // end inline asm + xor.b32 %r12333, %r11905, %r11837; + xor.b32 %r12334, %r11909, %r11838; + xor.b32 %r12126, %r30240, %r12333; + xor.b32 %r12127, %r30241, %r12334; + xor.b32 %r11943, %r30238, %r12333; + xor.b32 %r11942, %r30239, %r12334; + xor.b32 %r12102, %r30218, %r12333; + xor.b32 %r12103, %r30219, %r12334; + xor.b32 %r12063, %r30216, %r12333; + xor.b32 %r12062, %r30217, %r12334; + xor.b32 %r12046, %r30214, %r12333; + xor.b32 %r12047, %r30215, %r12334; + // begin inline asm + shf.l.wrap.b32 %r11913, %r11874, %r11873, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11917, %r11873, %r11874, %r11810; + // end inline asm + xor.b32 %r12335, %r11913, %r11849; + xor.b32 %r12336, %r11917, %r11850; + xor.b32 %r11983, %r30236, %r12335; + xor.b32 %r11982, %r30237, %r12336; + xor.b32 %r12110, %r30234, %r12335; + xor.b32 %r12111, %r30235, %r12336; + xor.b32 %r11991, %r30212, %r12335; + xor.b32 %r11990, %r30213, %r12336; + xor.b32 %r12094, %r30210, %r12335; + xor.b32 %r12095, %r30211, %r12336; + xor.b32 %r11959, %r30208, %r12335; + xor.b32 %r11958, %r30209, %r12336; + // begin inline asm + shf.l.wrap.b32 %r11921, %r11886, %r11885, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11925, %r11885, %r11886, %r11810; + // end inline asm + xor.b32 %r12337, %r11921, %r11861; + xor.b32 %r12338, %r11925, %r11862; + xor.b32 %r12078, %r30232, %r12337; + xor.b32 %r12079, %r30233, %r12338; + xor.b32 %r12055, %r30206, %r12337; + xor.b32 %r12054, %r30207, %r12338; + xor.b32 %r11998, %r30204, %r12337; + xor.b32 %r11999, %r30205, %r12338; + xor.b32 %r12086, %r30202, %r12337; + xor.b32 %r12087, %r30203, %r12338; + xor.b32 %r12015, %r30200, %r12337; + xor.b32 %r12014, %r30201, %r12338; + // begin inline asm + shf.l.wrap.b32 %r11929, %r11838, %r11837, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11933, %r11837, %r11838, %r11810; + // end inline asm + xor.b32 %r12339, %r11929, %r11873; + xor.b32 %r12340, %r11933, %r11874; + xor.b32 %r12030, %r30230, %r12339; + xor.b32 %r12031, %r30231, %r12340; + xor.b32 %r11950, %r30198, %r12339; + xor.b32 %r11951, %r30199, %r12340; + xor.b32 %r11967, %r30196, %r12339; + xor.b32 %r11966, %r30197, %r12340; + xor.b32 %r12006, %r30194, %r12339; + xor.b32 %r12007, %r30195, %r12340; + xor.b32 %r12038, %r30192, %r12339; + xor.b32 %r12039, %r30193, %r12340; + mov.u32 %r11944, 44; + // begin inline asm + shf.l.wrap.b32 %r11937, %r11943, %r11942, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11941, %r11942, %r11943, %r11944; + // end inline asm + mov.u32 %r11952, 20; + // begin inline asm + shf.l.wrap.b32 %r11945, %r11951, %r11950, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11949, %r11950, %r11951, %r11952; + // end inline asm + mov.u32 %r11960, 61; + // begin inline asm + shf.l.wrap.b32 %r11953, %r11959, %r11958, %r11960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11957, %r11958, %r11959, %r11960; + // end 
inline asm + mov.u32 %r11968, 39; + // begin inline asm + shf.l.wrap.b32 %r11961, %r11967, %r11966, %r11968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11965, %r11966, %r11967, %r11968; + // end inline asm + mov.u32 %r11976, 18; + // begin inline asm + shf.l.wrap.b32 %r11969, %r11975, %r11974, %r11976; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11973, %r11974, %r11975, %r11976; + // end inline asm + mov.u32 %r11984, 62; + // begin inline asm + shf.l.wrap.b32 %r11977, %r11983, %r11982, %r11984; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11981, %r11982, %r11983, %r11984; + // end inline asm + mov.u32 %r11992, 43; + // begin inline asm + shf.l.wrap.b32 %r11985, %r11991, %r11990, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11989, %r11990, %r11991, %r11992; + // end inline asm + mov.u32 %r12000, 25; + // begin inline asm + shf.l.wrap.b32 %r11993, %r11999, %r11998, %r12000; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11997, %r11998, %r11999, %r12000; + // end inline asm + mov.u32 %r12008, 8; + // begin inline asm + shf.l.wrap.b32 %r12001, %r12007, %r12006, %r12008; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12005, %r12006, %r12007, %r12008; + // end inline asm + mov.u32 %r12016, 56; + // begin inline asm + shf.l.wrap.b32 %r12009, %r12015, %r12014, %r12016; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12013, %r12014, %r12015, %r12016; + // end inline asm + mov.u32 %r12024, 41; + // begin inline asm + shf.l.wrap.b32 %r12017, %r12023, %r12022, %r12024; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12021, %r12022, %r12023, %r12024; + // end inline asm + mov.u32 %r12032, 27; + // begin inline asm + shf.l.wrap.b32 %r12025, %r12031, %r12030, %r12032; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12029, %r12030, %r12031, %r12032; + // end inline asm + mov.u32 %r12040, 14; + // begin inline asm + shf.l.wrap.b32 %r12033, %r12039, %r12038, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12037, %r12038, %r12039, %r12040; + // end inline asm + mov.u32 %r12048, 2; + // begin inline asm + shf.l.wrap.b32 %r12041, %r12047, %r12046, %r12048; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12045, %r12046, %r12047, %r12048; + // end inline asm + mov.u32 %r12056, 55; + // begin inline asm + shf.l.wrap.b32 %r12049, %r12055, %r12054, %r12056; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12053, %r12054, %r12055, %r12056; + // end inline asm + mov.u32 %r12064, 45; + // begin inline asm + shf.l.wrap.b32 %r12057, %r12063, %r12062, %r12064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12061, %r12062, %r12063, %r12064; + // end inline asm + mov.u32 %r12072, 36; + // begin inline asm + shf.l.wrap.b32 %r12065, %r12071, %r12070, %r12072; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12069, %r12070, %r12071, %r12072; + // end inline asm + mov.u32 %r12080, 28; + // begin inline asm + shf.l.wrap.b32 %r12073, %r12079, %r12078, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12077, %r12078, %r12079, %r12080; + // end inline asm + mov.u32 %r12088, 21; + // begin inline asm + shf.l.wrap.b32 %r12081, %r12087, %r12086, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12085, %r12086, %r12087, %r12088; + // end inline asm + mov.u32 %r12096, 15; + // begin inline asm + shf.l.wrap.b32 %r12089, %r12095, %r12094, %r12096; + // end inline asm + // begin inline 
asm + shf.l.wrap.b32 %r12093, %r12094, %r12095, %r12096; + // end inline asm + mov.u32 %r12104, 10; + // begin inline asm + shf.l.wrap.b32 %r12097, %r12103, %r12102, %r12104; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12101, %r12102, %r12103, %r12104; + // end inline asm + mov.u32 %r12112, 6; + // begin inline asm + shf.l.wrap.b32 %r12105, %r12111, %r12110, %r12112; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12109, %r12110, %r12111, %r12112; + // end inline asm + mov.u32 %r12120, 3; + // begin inline asm + shf.l.wrap.b32 %r12113, %r12119, %r12118, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12117, %r12118, %r12119, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12121, %r12127, %r12126, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12125, %r12126, %r12127, %r11810; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12129, %r12164, %r11937, %r11985, 0xD2; + lop3.b32 %r12130, %r12167, %r11941, %r11989, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30240, %r11937, %r11985, %r12081, 0xD2; + lop3.b32 %r30241, %r11941, %r11989, %r12085, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30236, %r11985, %r12081, %r12033, 0xD2; + lop3.b32 %r30237, %r11989, %r12085, %r12037, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30232, %r12081, %r12033, %r12164, 0xD2; + lop3.b32 %r30233, %r12085, %r12037, %r12167, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30230, %r12033, %r12164, %r11937, 0xD2; + lop3.b32 %r30231, %r12037, %r12167, %r11941, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30226, %r12073, %r11945, %r12113, 0xD2; + lop3.b32 %r30227, %r12077, %r11949, %r12117, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30238, %r11945, %r12113, %r12057, 0xD2; + lop3.b32 %r30239, %r11949, %r12117, %r12061, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30234, %r12113, %r12057, %r11953, 0xD2; + lop3.b32 %r30235, %r12117, %r12061, %r11957, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30206, %r12057, %r11953, %r12073, 0xD2; + lop3.b32 %r30207, %r12061, %r11957, %r12077, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30206, %r30207}; + // begin inline asm + // chi + lop3.b32 %r30198, %r11953, %r12073, %r11945, 0xD2; + lop3.b32 %r30199, %r11957, %r12077, %r11949, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30198, %r30199}; + // begin inline asm + // chi + lop3.b32 %r30224, %r12121, %r12105, %r11993, 0xD2; + lop3.b32 %r30225, %r12125, %r12109, %r11997, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30224, %r30225}; + // begin inline asm + // chi + lop3.b32 %r30218, %r12105, %r11993, %r12001, 0xD2; + lop3.b32 %r30219, %r12109, %r11997, %r12005, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30218, %r30219}; + // begin inline asm + // chi + lop3.b32 %r30212, %r11993, %r12001, %r11969, 0xD2; + lop3.b32 %r30213, %r11997, %r12005, %r11973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30212, %r30213}; + // begin inline asm + // chi + lop3.b32 %r30204, %r12001, %r11969, %r12121, 0xD2; + lop3.b32 %r30205, %r12005, %r11973, %r12125, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30204, %r30205}; + // begin inline asm + // chi + lop3.b32 %r30196, %r11969, %r12121, %r12105, 0xD2; + lop3.b32 %r30197, %r11973, %r12125, %r12109, 0xD2; + // end inline 
asm + st.local.v2.u32 [%rd2+136], {%r30196, %r30197}; + // begin inline asm + // chi + lop3.b32 %r30222, %r12025, %r12065, %r12097, 0xD2; + lop3.b32 %r30223, %r12029, %r12069, %r12101, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30222, %r30223}; + // begin inline asm + // chi + lop3.b32 %r30216, %r12065, %r12097, %r12089, 0xD2; + lop3.b32 %r30217, %r12069, %r12101, %r12093, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30216, %r30217}; + // begin inline asm + // chi + lop3.b32 %r30210, %r12097, %r12089, %r12009, 0xD2; + lop3.b32 %r30211, %r12101, %r12093, %r12013, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30210, %r30211}; + // begin inline asm + // chi + lop3.b32 %r30202, %r12089, %r12009, %r12025, 0xD2; + lop3.b32 %r30203, %r12093, %r12013, %r12029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30202, %r30203}; + // begin inline asm + // chi + lop3.b32 %r30194, %r12009, %r12025, %r12065, 0xD2; + lop3.b32 %r30195, %r12013, %r12029, %r12069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30194, %r30195}; + // begin inline asm + // chi + lop3.b32 %r30220, %r11977, %r12049, %r11961, 0xD2; + lop3.b32 %r30221, %r11981, %r12053, %r11965, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30220, %r30221}; + // begin inline asm + // chi + lop3.b32 %r30214, %r12049, %r11961, %r12017, 0xD2; + lop3.b32 %r30215, %r12053, %r11965, %r12021, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30214, %r30215}; + // begin inline asm + // chi + lop3.b32 %r30208, %r11961, %r12017, %r12041, 0xD2; + lop3.b32 %r30209, %r11965, %r12021, %r12045, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30208, %r30209}; + // begin inline asm + // chi + lop3.b32 %r30200, %r12017, %r12041, %r11977, 0xD2; + lop3.b32 %r30201, %r12021, %r12045, %r11981, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30200, %r30201}; + // begin inline asm + // chi + lop3.b32 %r30192, %r12041, %r11977, %r12049, 0xD2; + lop3.b32 %r30193, %r12045, %r11981, %r12053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30192, %r30193}; + mul.wide.s32 %rd648, %r30242, 8; + add.s64 %rd647, %rd580, %rd648; + // begin inline asm + ld.global.nc.v2.u32 {%r12329,%r12330}, [%rd647]; + // end inline asm + xor.b32 %r30228, %r12129, %r12329; + xor.b32 %r30229, %r12130, %r12330; + add.s32 %r30242, %r30242, 1; + setp.lt.u32 %p22, %r30242, 23; + @%p22 bra $L__BB2_32; + + st.local.v2.u32 [%rd2+32], {%r30240, %r30241}; + st.local.v2.u32 [%rd2+72], {%r30238, %r30239}; + st.local.v2.u32 [%rd2+40], {%r30236, %r30237}; + st.local.v2.u32 [%rd2+80], {%r30234, %r30235}; + st.local.v2.u32 [%rd2+48], {%r30232, %r30233}; + st.local.v2.u32 [%rd2+56], {%r30230, %r30231}; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + // begin inline asm + // xor5 + lop3.b32 %r12341, %r30228, %r30226, %r30224, 0x96; + lop3.b32 %r12341, %r12341, %r30222, %r30220, 0x96; + lop3.b32 %r12342, %r30229, %r30227, %r30225, 0x96; + lop3.b32 %r12342, %r12342, %r30223, %r30221, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12353, %r30240, %r30238, %r30218, 0x96; + lop3.b32 %r12353, %r12353, %r30216, %r30214, 0x96; + lop3.b32 %r12354, %r30241, %r30239, %r30219, 0x96; + lop3.b32 %r12354, %r12354, %r30217, %r30215, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12365, %r30236, %r30234, %r30212, 0x96; + lop3.b32 %r12365, %r12365, %r30210, %r30208, 0x96; + lop3.b32 %r12366, %r30237, %r30235, %r30213, 0x96; + lop3.b32 %r12366, %r12366, %r30211, 
%r30209, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12377, %r30232, %r30206, %r30204, 0x96; + lop3.b32 %r12377, %r12377, %r30202, %r30200, 0x96; + lop3.b32 %r12378, %r30233, %r30207, %r30205, 0x96; + lop3.b32 %r12378, %r12378, %r30203, %r30201, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12389, %r30230, %r30198, %r30196, 0x96; + lop3.b32 %r12389, %r12389, %r30194, %r30192, 0x96; + lop3.b32 %r12390, %r30231, %r30199, %r30197, 0x96; + lop3.b32 %r12390, %r12390, %r30195, %r30193, 0x96; + // end inline asm + mov.u32 %r12593, 1; + // begin inline asm + shf.l.wrap.b32 %r12401, %r12354, %r12353, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12405, %r12353, %r12354, %r12593; + // end inline asm + xor.b32 %r12620, %r12401, %r12389; + xor.b32 %r12621, %r12405, %r12390; + xor.b32 %r12548, %r30228, %r12620; + xor.b32 %r12551, %r30229, %r12621; + xor.b32 %r12511, %r30225, %r12621; + xor.b32 %r12510, %r30224, %r12620; + st.local.v2.u32 [%rd2+104], {%r12510, %r12511}; + // begin inline asm + shf.l.wrap.b32 %r12409, %r12366, %r12365, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12413, %r12365, %r12366, %r12593; + // end inline asm + xor.b32 %r12622, %r12409, %r12341; + xor.b32 %r12623, %r12413, %r12342; + xor.b32 %r12447, %r30238, %r12622; + xor.b32 %r12446, %r30239, %r12623; + xor.b32 %r12486, %r30217, %r12623; + xor.b32 %r12487, %r30216, %r12622; + st.local.v2.u32 [%rd2+152], {%r12487, %r12486}; + // begin inline asm + shf.l.wrap.b32 %r12417, %r12378, %r12377, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12421, %r12377, %r12378, %r12593; + // end inline asm + xor.b32 %r12624, %r12417, %r12353; + xor.b32 %r12625, %r12421, %r12354; + xor.b32 %r12470, %r30213, %r12625; + xor.b32 %r12471, %r30212, %r12624; + st.local.v2.u32 [%rd2+120], {%r12471, %r12470}; + xor.b32 %r12462, %r30209, %r12625; + xor.b32 %r12463, %r30208, %r12624; + st.local.v2.u32 [%rd2+200], {%r12463, %r12462}; + // begin inline asm + shf.l.wrap.b32 %r12425, %r12390, %r12389, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12429, %r12389, %r12390, %r12593; + // end inline asm + xor.b32 %r12626, %r12425, %r12365; + xor.b32 %r12627, %r12429, %r12366; + xor.b32 %r12494, %r30232, %r12626; + xor.b32 %r12495, %r30233, %r12627; + xor.b32 %r12503, %r30203, %r12627; + xor.b32 %r12502, %r30202, %r12626; + st.local.v2.u32 [%rd2+168], {%r12502, %r12503}; + // begin inline asm + shf.l.wrap.b32 %r12433, %r12342, %r12341, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12437, %r12341, %r12342, %r12593; + // end inline asm + xor.b32 %r12628, %r12433, %r12377; + xor.b32 %r12629, %r12437, %r12378; + xor.b32 %r12454, %r30198, %r12628; + xor.b32 %r12455, %r30199, %r12629; + xor.b32 %r12479, %r30193, %r12629; + xor.b32 %r12478, %r30192, %r12628; + st.local.v2.u32 [%rd2+216], {%r12478, %r12479}; + // begin inline asm + shf.l.wrap.b32 %r12441, %r12447, %r12446, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12445, %r12446, %r12447, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12449, %r12455, %r12454, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12453, %r12454, %r12455, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12461, %r12462, %r12463, %r11960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12457, %r12463, %r12462, %r11960; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r12457, 
%r12461}; + // begin inline asm + shf.l.wrap.b32 %r12465, %r12471, %r12470, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12469, %r12470, %r12471, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12473, %r12479, %r12478, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12477, %r12478, %r12479, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12485, %r12486, %r12487, %r12064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12481, %r12487, %r12486, %r12064; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r12481, %r12485}; + // begin inline asm + shf.l.wrap.b32 %r12489, %r12495, %r12494, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12493, %r12494, %r12495, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12497, %r12503, %r12502, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12501, %r12502, %r12503, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12505, %r12511, %r12510, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12509, %r12510, %r12511, %r12120; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12513, %r12548, %r12441, %r12465, 0xD2; + lop3.b32 %r12514, %r12551, %r12445, %r12469, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12521, %r12441, %r12465, %r12497, 0xD2; + lop3.b32 %r12522, %r12445, %r12469, %r12501, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r12521, %r12522}; + // begin inline asm + // chi + lop3.b32 %r12529, %r12465, %r12497, %r12473, 0xD2; + lop3.b32 %r12530, %r12469, %r12501, %r12477, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r12529, %r12530}; + // begin inline asm + // chi + lop3.b32 %r12537, %r12497, %r12473, %r12548, 0xD2; + lop3.b32 %r12538, %r12501, %r12477, %r12551, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r12537, %r12538}; + // begin inline asm + // chi + lop3.b32 %r12545, %r12473, %r12548, %r12441, 0xD2; + lop3.b32 %r12546, %r12477, %r12551, %r12445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r12545, %r12546}; + // begin inline asm + // chi + lop3.b32 %r12553, %r12489, %r12449, %r12505, 0xD2; + lop3.b32 %r12554, %r12493, %r12453, %r12509, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r12553, %r12554}; + // begin inline asm + // chi + lop3.b32 %r12561, %r12449, %r12505, %r12481, 0xD2; + lop3.b32 %r12562, %r12453, %r12509, %r12485, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r12561, %r12562}; + // begin inline asm + // chi + lop3.b32 %r12569, %r12505, %r12481, %r12457, 0xD2; + lop3.b32 %r12570, %r12509, %r12485, %r12461, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r12569, %r12570}; + // begin inline asm + ld.global.nc.v2.u32 {%r12577,%r12578}, [%rd581]; + // end inline asm + xor.b32 %r12630, %r12514, %r12578; + xor.b32 %r12631, %r12513, %r12577; + mov.b64 %rd1317, {%r12631, %r12630}; + mov.b64 %rd1318, {%r12521, %r12522}; + mov.b64 %rd1319, {%r12529, %r12530}; + mov.b64 %rd1320, {%r12537, %r12538}; + mov.b64 %rd1321, {%r12545, %r12546}; + mov.b64 %rd1322, {%r12553, %r12554}; + mov.b64 %rd1323, {%r12561, %r12562}; + mov.b64 %rd1324, {%r12569, %r12570}; + mov.u32 %r30243, 0; + st.local.v2.u32 [%rd2+24], {%r12631, %r12630}; + st.local.v2.u32 [%rd82+96], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+104], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+112], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+120], {%r30243, %r30243}; + 
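+ // clearing lanes 9..24 of the fresh state at %rd82 (offsets +96..+216); together
+ // with the padding word {1, 0x80000000} written at +88 just below, this looks
+ // consistent with a 72-byte-rate sponge where these lanes form the capacity.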
st.local.v2.u32 [%rd82+128], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+136], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+144], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+152], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+160], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+168], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+176], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+184], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+192], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+200], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+208], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+216], {%r30243, %r30243}; + mov.u32 %r30258, -2147483648; + st.local.v2.u32 [%rd82+88], {%r12593, %r30258}; + mov.u32 %r30244, %r30243; + mov.u32 %r30245, %r30243; + mov.u32 %r30246, %r30243; + mov.u32 %r30247, %r30243; + mov.u32 %r30248, %r30243; + mov.u32 %r30249, %r30243; + mov.u32 %r30250, %r30243; + mov.u32 %r30251, %r30243; + mov.u32 %r30252, %r30243; + mov.u32 %r30253, %r30243; + mov.u32 %r30254, %r30243; + mov.u32 %r30255, %r30243; + mov.u32 %r30256, %r30243; + mov.u32 %r30257, %r12593; + mov.u32 %r30259, %r30243; + mov.u32 %r30260, %r30243; + mov.u32 %r30261, %r30243; + mov.u32 %r30262, %r30243; + mov.u32 %r30263, %r30243; + mov.u32 %r30264, %r30243; + mov.u32 %r30265, %r30243; + mov.u32 %r30266, %r30243; + mov.u32 %r30267, %r30243; + mov.u32 %r30268, %r30243; + mov.u32 %r30269, %r30243; + mov.u32 %r30270, %r30243; + mov.u32 %r30271, %r30243; + mov.u32 %r30272, %r30243; + mov.u32 %r30273, %r30243; + mov.u32 %r30274, %r30243; + mov.u32 %r30275, %r30243; + mov.u32 %r30276, %r30243; + mov.u32 %r30293, %r30243; + +$L__BB2_34: + // begin inline asm + // xor5 + lop3.b32 %r12632, %r30279, %r30277, %r30275, 0x96; + lop3.b32 %r12632, %r12632, %r30273, %r30271, 0x96; + lop3.b32 %r12633, %r30280, %r30278, %r30276, 0x96; + lop3.b32 %r12633, %r12633, %r30274, %r30272, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12644, %r30291, %r30289, %r30269, 0x96; + lop3.b32 %r12644, %r12644, %r30267, %r30265, 0x96; + lop3.b32 %r12645, %r30292, %r30290, %r30270, 0x96; + lop3.b32 %r12645, %r12645, %r30268, %r30266, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12656, %r30287, %r30285, %r30263, 0x96; + lop3.b32 %r12656, %r12656, %r30261, %r30259, 0x96; + lop3.b32 %r12657, %r30288, %r30286, %r30264, 0x96; + lop3.b32 %r12657, %r12657, %r30262, %r30260, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12668, %r30283, %r30257, %r30255, 0x96; + lop3.b32 %r12668, %r12668, %r30253, %r30251, 0x96; + lop3.b32 %r12669, %r30284, %r30258, %r30256, 0x96; + lop3.b32 %r12669, %r12669, %r30254, %r30252, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12680, %r30281, %r30249, %r30247, 0x96; + lop3.b32 %r12680, %r12680, %r30245, %r30243, 0x96; + lop3.b32 %r12681, %r30282, %r30250, %r30248, 0x96; + lop3.b32 %r12681, %r12681, %r30246, %r30244, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12692, %r12645, %r12644, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12696, %r12644, %r12645, %r12593; + // end inline asm + xor.b32 %r13126, %r12692, %r12680; + xor.b32 %r13127, %r12696, %r12681; + xor.b32 %r12959, %r30279, %r13126; + xor.b32 %r12962, %r30280, %r13127; + xor.b32 %r12866, %r30277, %r13126; + xor.b32 %r12865, %r30278, %r13127; + xor.b32 %r12913, %r30275, %r13126; + xor.b32 %r12914, %r30276, %r13127; + xor.b32 %r12818, %r30273, %r13126; + xor.b32 %r12817, %r30274, %r13127; + xor.b32 %r12769, %r30271, %r13126; + 
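+ // theta: lop3 with immLut 0x96 is a three-input XOR, chained pairwise to form
+ // the five-lane column parities; each parity is rotated one bit left via a
+ // shf.l.wrap pair (shift amount %r12593 = 1) and XORed into every lane of its column.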
xor.b32 %r12770, %r30272, %r13127; + // begin inline asm + shf.l.wrap.b32 %r12700, %r12657, %r12656, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12704, %r12656, %r12657, %r12593; + // end inline asm + xor.b32 %r13128, %r12700, %r12632; + xor.b32 %r13129, %r12704, %r12633; + xor.b32 %r12921, %r30291, %r13128; + xor.b32 %r12922, %r30292, %r13129; + xor.b32 %r12738, %r30289, %r13128; + xor.b32 %r12737, %r30290, %r13129; + xor.b32 %r12897, %r30269, %r13128; + xor.b32 %r12898, %r30270, %r13129; + xor.b32 %r12858, %r30267, %r13128; + xor.b32 %r12857, %r30268, %r13129; + xor.b32 %r12841, %r30265, %r13128; + xor.b32 %r12842, %r30266, %r13129; + // begin inline asm + shf.l.wrap.b32 %r12708, %r12669, %r12668, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12712, %r12668, %r12669, %r12593; + // end inline asm + xor.b32 %r13130, %r12708, %r12644; + xor.b32 %r13131, %r12712, %r12645; + xor.b32 %r12778, %r30287, %r13130; + xor.b32 %r12777, %r30288, %r13131; + xor.b32 %r12905, %r30285, %r13130; + xor.b32 %r12906, %r30286, %r13131; + xor.b32 %r12786, %r30263, %r13130; + xor.b32 %r12785, %r30264, %r13131; + xor.b32 %r12889, %r30261, %r13130; + xor.b32 %r12890, %r30262, %r13131; + xor.b32 %r12754, %r30259, %r13130; + xor.b32 %r12753, %r30260, %r13131; + // begin inline asm + shf.l.wrap.b32 %r12716, %r12681, %r12680, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12720, %r12680, %r12681, %r12593; + // end inline asm + xor.b32 %r13132, %r12716, %r12656; + xor.b32 %r13133, %r12720, %r12657; + xor.b32 %r12873, %r30283, %r13132; + xor.b32 %r12874, %r30284, %r13133; + xor.b32 %r12850, %r30257, %r13132; + xor.b32 %r12849, %r30258, %r13133; + xor.b32 %r12793, %r30255, %r13132; + xor.b32 %r12794, %r30256, %r13133; + xor.b32 %r12881, %r30253, %r13132; + xor.b32 %r12882, %r30254, %r13133; + xor.b32 %r12810, %r30251, %r13132; + xor.b32 %r12809, %r30252, %r13133; + // begin inline asm + shf.l.wrap.b32 %r12724, %r12633, %r12632, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12728, %r12632, %r12633, %r12593; + // end inline asm + xor.b32 %r13134, %r12724, %r12668; + xor.b32 %r13135, %r12728, %r12669; + xor.b32 %r12825, %r30281, %r13134; + xor.b32 %r12826, %r30282, %r13135; + xor.b32 %r12745, %r30249, %r13134; + xor.b32 %r12746, %r30250, %r13135; + xor.b32 %r12762, %r30247, %r13134; + xor.b32 %r12761, %r30248, %r13135; + xor.b32 %r12801, %r30245, %r13134; + xor.b32 %r12802, %r30246, %r13135; + xor.b32 %r12833, %r30243, %r13134; + xor.b32 %r12834, %r30244, %r13135; + mov.u32 %r12739, 44; + // begin inline asm + shf.l.wrap.b32 %r12732, %r12738, %r12737, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12736, %r12737, %r12738, %r12739; + // end inline asm + mov.u32 %r12747, 20; + // begin inline asm + shf.l.wrap.b32 %r12740, %r12746, %r12745, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12744, %r12745, %r12746, %r12747; + // end inline asm + mov.u32 %r12755, 61; + // begin inline asm + shf.l.wrap.b32 %r12748, %r12754, %r12753, %r12755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12752, %r12753, %r12754, %r12755; + // end inline asm + mov.u32 %r12763, 39; + // begin inline asm + shf.l.wrap.b32 %r12756, %r12762, %r12761, %r12763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12760, %r12761, %r12762, %r12763; + // end inline asm + mov.u32 %r12771, 18; + // begin inline asm + shf.l.wrap.b32 %r12764, %r12770, %r12769, %r12771; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r12768, %r12769, %r12770, %r12771; + // end inline asm + mov.u32 %r12779, 62; + // begin inline asm + shf.l.wrap.b32 %r12772, %r12778, %r12777, %r12779; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12776, %r12777, %r12778, %r12779; + // end inline asm + mov.u32 %r12787, 43; + // begin inline asm + shf.l.wrap.b32 %r12780, %r12786, %r12785, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12784, %r12785, %r12786, %r12787; + // end inline asm + mov.u32 %r12795, 25; + // begin inline asm + shf.l.wrap.b32 %r12788, %r12794, %r12793, %r12795; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12792, %r12793, %r12794, %r12795; + // end inline asm + mov.u32 %r12803, 8; + // begin inline asm + shf.l.wrap.b32 %r12796, %r12802, %r12801, %r12803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12800, %r12801, %r12802, %r12803; + // end inline asm + mov.u32 %r12811, 56; + // begin inline asm + shf.l.wrap.b32 %r12804, %r12810, %r12809, %r12811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12808, %r12809, %r12810, %r12811; + // end inline asm + mov.u32 %r12819, 41; + // begin inline asm + shf.l.wrap.b32 %r12812, %r12818, %r12817, %r12819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12816, %r12817, %r12818, %r12819; + // end inline asm + mov.u32 %r12827, 27; + // begin inline asm + shf.l.wrap.b32 %r12820, %r12826, %r12825, %r12827; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12824, %r12825, %r12826, %r12827; + // end inline asm + mov.u32 %r12835, 14; + // begin inline asm + shf.l.wrap.b32 %r12828, %r12834, %r12833, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12832, %r12833, %r12834, %r12835; + // end inline asm + mov.u32 %r12843, 2; + // begin inline asm + shf.l.wrap.b32 %r12836, %r12842, %r12841, %r12843; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12840, %r12841, %r12842, %r12843; + // end inline asm + mov.u32 %r12851, 55; + // begin inline asm + shf.l.wrap.b32 %r12844, %r12850, %r12849, %r12851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12848, %r12849, %r12850, %r12851; + // end inline asm + mov.u32 %r12859, 45; + // begin inline asm + shf.l.wrap.b32 %r12852, %r12858, %r12857, %r12859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12856, %r12857, %r12858, %r12859; + // end inline asm + mov.u32 %r12867, 36; + // begin inline asm + shf.l.wrap.b32 %r12860, %r12866, %r12865, %r12867; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12864, %r12865, %r12866, %r12867; + // end inline asm + mov.u32 %r12875, 28; + // begin inline asm + shf.l.wrap.b32 %r12868, %r12874, %r12873, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12872, %r12873, %r12874, %r12875; + // end inline asm + mov.u32 %r12883, 21; + // begin inline asm + shf.l.wrap.b32 %r12876, %r12882, %r12881, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12880, %r12881, %r12882, %r12883; + // end inline asm + mov.u32 %r12891, 15; + // begin inline asm + shf.l.wrap.b32 %r12884, %r12890, %r12889, %r12891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12888, %r12889, %r12890, %r12891; + // end inline asm + mov.u32 %r12899, 10; + // begin inline asm + shf.l.wrap.b32 %r12892, %r12898, %r12897, %r12899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12896, %r12897, %r12898, %r12899; + // end inline asm + mov.u32 %r12907, 6; + // begin inline asm + shf.l.wrap.b32 
%r12900, %r12906, %r12905, %r12907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12904, %r12905, %r12906, %r12907; + // end inline asm + mov.u32 %r12915, 3; + // begin inline asm + shf.l.wrap.b32 %r12908, %r12914, %r12913, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12912, %r12913, %r12914, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12916, %r12922, %r12921, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12920, %r12921, %r12922, %r12593; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12924, %r12959, %r12732, %r12780, 0xD2; + lop3.b32 %r12925, %r12962, %r12736, %r12784, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30291, %r12732, %r12780, %r12876, 0xD2; + lop3.b32 %r30292, %r12736, %r12784, %r12880, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30287, %r12780, %r12876, %r12828, 0xD2; + lop3.b32 %r30288, %r12784, %r12880, %r12832, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30283, %r12876, %r12828, %r12959, 0xD2; + lop3.b32 %r30284, %r12880, %r12832, %r12962, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30281, %r12828, %r12959, %r12732, 0xD2; + lop3.b32 %r30282, %r12832, %r12962, %r12736, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30277, %r12868, %r12740, %r12908, 0xD2; + lop3.b32 %r30278, %r12872, %r12744, %r12912, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30289, %r12740, %r12908, %r12852, 0xD2; + lop3.b32 %r30290, %r12744, %r12912, %r12856, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30285, %r12908, %r12852, %r12748, 0xD2; + lop3.b32 %r30286, %r12912, %r12856, %r12752, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30257, %r12852, %r12748, %r12868, 0xD2; + lop3.b32 %r30258, %r12856, %r12752, %r12872, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r30257, %r30258}; + // begin inline asm + // chi + lop3.b32 %r30249, %r12748, %r12868, %r12740, 0xD2; + lop3.b32 %r30250, %r12752, %r12872, %r12744, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r30249, %r30250}; + // begin inline asm + // chi + lop3.b32 %r30275, %r12916, %r12900, %r12788, 0xD2; + lop3.b32 %r30276, %r12920, %r12904, %r12792, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+104], {%r30275, %r30276}; + // begin inline asm + // chi + lop3.b32 %r30269, %r12900, %r12788, %r12796, 0xD2; + lop3.b32 %r30270, %r12904, %r12792, %r12800, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+112], {%r30269, %r30270}; + // begin inline asm + // chi + lop3.b32 %r30263, %r12788, %r12796, %r12764, 0xD2; + lop3.b32 %r30264, %r12792, %r12800, %r12768, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+120], {%r30263, %r30264}; + // begin inline asm + // chi + lop3.b32 %r30255, %r12796, %r12764, %r12916, 0xD2; + lop3.b32 %r30256, %r12800, %r12768, %r12920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+128], {%r30255, %r30256}; + // begin inline asm + // chi + lop3.b32 %r30247, %r12764, %r12916, %r12900, 0xD2; + lop3.b32 %r30248, %r12768, %r12920, %r12904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+136], {%r30247, %r30248}; + // begin inline asm + // chi + lop3.b32 %r30273, %r12820, %r12860, %r12892, 0xD2; + lop3.b32 %r30274, %r12824, %r12864, %r12896, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+144], {%r30273, %r30274}; + // begin inline asm + // chi + lop3.b32 %r30267, %r12860, %r12892, %r12884, 0xD2; 
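+ // chi: lop3 with immLut 0xD2 computes a ^ (~b & c); each lop3 pair handles the
+ // low and high 32-bit halves of one 64-bit lane.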
+ lop3.b32 %r30268, %r12864, %r12896, %r12888, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+152], {%r30267, %r30268}; + // begin inline asm + // chi + lop3.b32 %r30261, %r12892, %r12884, %r12804, 0xD2; + lop3.b32 %r30262, %r12896, %r12888, %r12808, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+160], {%r30261, %r30262}; + // begin inline asm + // chi + lop3.b32 %r30253, %r12884, %r12804, %r12820, 0xD2; + lop3.b32 %r30254, %r12888, %r12808, %r12824, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+168], {%r30253, %r30254}; + // begin inline asm + // chi + lop3.b32 %r30245, %r12804, %r12820, %r12860, 0xD2; + lop3.b32 %r30246, %r12808, %r12824, %r12864, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+176], {%r30245, %r30246}; + // begin inline asm + // chi + lop3.b32 %r30271, %r12772, %r12844, %r12756, 0xD2; + lop3.b32 %r30272, %r12776, %r12848, %r12760, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+184], {%r30271, %r30272}; + // begin inline asm + // chi + lop3.b32 %r30265, %r12844, %r12756, %r12812, 0xD2; + lop3.b32 %r30266, %r12848, %r12760, %r12816, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+192], {%r30265, %r30266}; + // begin inline asm + // chi + lop3.b32 %r30259, %r12756, %r12812, %r12836, 0xD2; + lop3.b32 %r30260, %r12760, %r12816, %r12840, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+200], {%r30259, %r30260}; + // begin inline asm + // chi + lop3.b32 %r30251, %r12812, %r12836, %r12772, 0xD2; + lop3.b32 %r30252, %r12816, %r12840, %r12776, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+208], {%r30251, %r30252}; + // begin inline asm + // chi + lop3.b32 %r30243, %r12836, %r12772, %r12844, 0xD2; + lop3.b32 %r30244, %r12840, %r12776, %r12848, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+216], {%r30243, %r30244}; + mul.wide.s32 %rd655, %r30293, 8; + add.s64 %rd654, %rd580, %rd655; + // begin inline asm + ld.global.nc.v2.u32 {%r13124,%r13125}, [%rd654]; + // end inline asm + xor.b32 %r30279, %r12924, %r13124; + xor.b32 %r30280, %r12925, %r13125; + add.s32 %r30293, %r30293, 1; + setp.lt.u32 %p23, %r30293, 23; + @%p23 bra $L__BB2_34; + + mov.u32 %r13235, 1; + st.local.v2.u32 [%rd82+32], {%r30291, %r30292}; + st.local.v2.u32 [%rd82+72], {%r30289, %r30290}; + st.local.v2.u32 [%rd82+40], {%r30287, %r30288}; + st.local.v2.u32 [%rd82+80], {%r30285, %r30286}; + st.local.v2.u32 [%rd82+48], {%r30283, %r30284}; + st.local.v2.u32 [%rd82+56], {%r30281, %r30282}; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + // begin inline asm + // xor5 + lop3.b32 %r13136, %r30279, %r30277, %r30275, 0x96; + lop3.b32 %r13136, %r13136, %r30273, %r30271, 0x96; + lop3.b32 %r13137, %r30280, %r30278, %r30276, 0x96; + lop3.b32 %r13137, %r13137, %r30274, %r30272, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13148, %r30291, %r30289, %r30269, 0x96; + lop3.b32 %r13148, %r13148, %r30267, %r30265, 0x96; + lop3.b32 %r13149, %r30292, %r30290, %r30270, 0x96; + lop3.b32 %r13149, %r13149, %r30268, %r30266, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13160, %r30287, %r30285, %r30263, 0x96; + lop3.b32 %r13160, %r13160, %r30261, %r30259, 0x96; + lop3.b32 %r13161, %r30288, %r30286, %r30264, 0x96; + lop3.b32 %r13161, %r13161, %r30262, %r30260, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13172, %r30283, %r30257, %r30255, 0x96; + lop3.b32 %r13172, %r13172, %r30253, %r30251, 0x96; + lop3.b32 %r13173, %r30284, %r30258, %r30256, 0x96; + lop3.b32 %r13173, %r13173, %r30254, %r30252, 0x96; + // end inline asm + // begin 
inline asm + // xor5 + lop3.b32 %r13184, %r30281, %r30249, %r30247, 0x96; + lop3.b32 %r13184, %r13184, %r30245, %r30243, 0x96; + lop3.b32 %r13185, %r30282, %r30250, %r30248, 0x96; + lop3.b32 %r13185, %r13185, %r30246, %r30244, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13196, %r13149, %r13148, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13200, %r13148, %r13149, %r13235; + // end inline asm + xor.b32 %r13374, %r13196, %r13184; + xor.b32 %r13375, %r13200, %r13185; + xor.b32 %r13343, %r30279, %r13374; + xor.b32 %r13346, %r30280, %r13375; + xor.b32 %r13306, %r30276, %r13375; + xor.b32 %r13305, %r30275, %r13374; + st.local.v2.u32 [%rd82+104], {%r13305, %r13306}; + // begin inline asm + shf.l.wrap.b32 %r13204, %r13161, %r13160, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13208, %r13160, %r13161, %r13235; + // end inline asm + xor.b32 %r13376, %r13204, %r13136; + xor.b32 %r13377, %r13208, %r13137; + xor.b32 %r13242, %r30289, %r13376; + xor.b32 %r13241, %r30290, %r13377; + xor.b32 %r13281, %r30268, %r13377; + xor.b32 %r13282, %r30267, %r13376; + st.local.v2.u32 [%rd82+152], {%r13282, %r13281}; + // begin inline asm + shf.l.wrap.b32 %r13212, %r13173, %r13172, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13216, %r13172, %r13173, %r13235; + // end inline asm + xor.b32 %r13378, %r13212, %r13148; + xor.b32 %r13379, %r13216, %r13149; + xor.b32 %r13265, %r30264, %r13379; + xor.b32 %r13266, %r30263, %r13378; + st.local.v2.u32 [%rd82+120], {%r13266, %r13265}; + xor.b32 %r13257, %r30260, %r13379; + xor.b32 %r13258, %r30259, %r13378; + st.local.v2.u32 [%rd82+200], {%r13258, %r13257}; + // begin inline asm + shf.l.wrap.b32 %r13220, %r13185, %r13184, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13224, %r13184, %r13185, %r13235; + // end inline asm + xor.b32 %r13380, %r13220, %r13160; + xor.b32 %r13381, %r13224, %r13161; + xor.b32 %r13289, %r30283, %r13380; + xor.b32 %r13290, %r30284, %r13381; + xor.b32 %r13298, %r30254, %r13381; + xor.b32 %r13297, %r30253, %r13380; + st.local.v2.u32 [%rd82+168], {%r13297, %r13298}; + // begin inline asm + shf.l.wrap.b32 %r13228, %r13137, %r13136, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13232, %r13136, %r13137, %r13235; + // end inline asm + xor.b32 %r13382, %r13228, %r13172; + xor.b32 %r13383, %r13232, %r13173; + xor.b32 %r13249, %r30249, %r13382; + xor.b32 %r13250, %r30250, %r13383; + xor.b32 %r13274, %r30244, %r13383; + xor.b32 %r13273, %r30243, %r13382; + st.local.v2.u32 [%rd82+216], {%r13273, %r13274}; + // begin inline asm + shf.l.wrap.b32 %r13236, %r13242, %r13241, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13240, %r13241, %r13242, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13244, %r13250, %r13249, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13248, %r13249, %r13250, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13256, %r13257, %r13258, %r12755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13252, %r13258, %r13257, %r12755; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r13252, %r13256}; + // begin inline asm + shf.l.wrap.b32 %r13260, %r13266, %r13265, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13264, %r13265, %r13266, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13268, %r13274, %r13273, %r12835; + // end inline asm + // begin inline asm + 
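+ // rho/pi of the 24th round, unrolled after the 23-iteration loop: the rotation
+ // counts (44, 20, 61, ...) are still live in registers from the loop body, so
+ // no fresh mov.u32 of the offsets is needed here.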
shf.l.wrap.b32 %r13272, %r13273, %r13274, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13280, %r13281, %r13282, %r12859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13276, %r13282, %r13281, %r12859; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r13276, %r13280}; + // begin inline asm + shf.l.wrap.b32 %r13284, %r13290, %r13289, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13288, %r13289, %r13290, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13292, %r13298, %r13297, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13296, %r13297, %r13298, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13300, %r13306, %r13305, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13304, %r13305, %r13306, %r12915; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13308, %r13343, %r13236, %r13260, 0xD2; + lop3.b32 %r13309, %r13346, %r13240, %r13264, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13316, %r13236, %r13260, %r13292, 0xD2; + lop3.b32 %r13317, %r13240, %r13264, %r13296, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+32], {%r13316, %r13317}; + // begin inline asm + // chi + lop3.b32 %r13324, %r13260, %r13292, %r13268, 0xD2; + lop3.b32 %r13325, %r13264, %r13296, %r13272, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+40], {%r13324, %r13325}; + // begin inline asm + // chi + lop3.b32 %r13332, %r13292, %r13268, %r13343, 0xD2; + lop3.b32 %r13333, %r13296, %r13272, %r13346, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+48], {%r13332, %r13333}; + // begin inline asm + // chi + lop3.b32 %r13340, %r13268, %r13343, %r13236, 0xD2; + lop3.b32 %r13341, %r13272, %r13346, %r13240, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+56], {%r13340, %r13341}; + // begin inline asm + // chi + lop3.b32 %r13348, %r13284, %r13244, %r13300, 0xD2; + lop3.b32 %r13349, %r13288, %r13248, %r13304, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+64], {%r13348, %r13349}; + // begin inline asm + // chi + lop3.b32 %r13356, %r13244, %r13300, %r13276, 0xD2; + lop3.b32 %r13357, %r13248, %r13304, %r13280, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+72], {%r13356, %r13357}; + // begin inline asm + // chi + lop3.b32 %r13364, %r13300, %r13276, %r13252, 0xD2; + lop3.b32 %r13365, %r13304, %r13280, %r13256, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+80], {%r13364, %r13365}; + // begin inline asm + ld.global.nc.v2.u32 {%r13372,%r13373}, [%rd581]; + // end inline asm + xor.b32 %r13384, %r13309, %r13373; + xor.b32 %r13385, %r13308, %r13372; + st.local.v2.u32 [%rd82+24], {%r13385, %r13384}; + mov.b64 %rd1326, {%r13316, %r13317}; + mov.b64 %rd1327, {%r13324, %r13325}; + mov.b64 %rd1330, {%r13348, %r13349}; + mov.b64 %rd1331, {%r13356, %r13357}; + mov.b64 %rd1332, {%r13364, %r13365}; + mov.b64 %rd1325, {%r13385, %r13384}; + mov.b64 %rd1328, {%r13332, %r13333}; + mov.b64 %rd1329, {%r13340, %r13341}; + bra.uni $L__BB2_36; + +$L__BB2_14: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd462, 1179641; + st.local.u64 [%rd2+8], %rd462; + st.local.u32 [%rd2+16], %r47; + ld.global.u64 %rd463, [%rd32]; + ld.global.u64 %rd464, [%rd32+8]; + ld.global.u64 %rd465, [%rd32+16]; + ld.global.u64 %rd466, [%rd32+24]; + ld.global.u64 %rd467, [%rd32+32]; + ld.global.u64 %rd468, [%rd32+40]; + ld.global.u64 %rd469, [%rd32+48]; + ld.global.u64 %rd470, [%rd32+56]; + st.local.u64 [%rd2+24], %rd463; + st.local.u64 [%rd2+32], %rd464; + st.local.u64 [%rd2+40], 
%rd465; + st.local.u64 [%rd2+48], %rd466; + st.local.u64 [%rd2+56], %rd467; + st.local.u64 [%rd2+64], %rd468; + st.local.u64 [%rd2+72], %rd469; + st.local.u64 [%rd2+80], %rd470; + cvt.u32.u64 %r6859, %rd463; + xor.b32 %r6860, %r47, %r6859; + st.local.u32 [%rd2+24], %r6860; + mov.u32 %r29820, 0; + st.local.v2.u32 [%rd2+96], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+104], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+112], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+120], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+128], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+136], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+144], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+152], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+160], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+168], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+176], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+184], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+192], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+200], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+208], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+216], {%r29820, %r29820}; + mov.u32 %r29835, -2147483648; + mov.u32 %r6832, 1; + st.local.v2.u32 [%rd2+88], {%r6832, %r29835}; + ld.local.v2.u32 {%r29856, %r29857}, [%rd2+24]; + mov.b64 {%r29854, %r29855}, %rd468; + shr.u64 %rd471, %rd464, 32; + cvt.u32.u64 %r29868, %rd464; + cvt.u32.u64 %r29869, %rd471; + shr.u64 %rd472, %rd469, 32; + cvt.u32.u64 %r29866, %rd469; + cvt.u32.u64 %r29867, %rd472; + shr.u64 %rd473, %rd465, 32; + cvt.u32.u64 %r29864, %rd465; + cvt.u32.u64 %r29865, %rd473; + shr.u64 %rd474, %rd470, 32; + cvt.u32.u64 %r29862, %rd470; + cvt.u32.u64 %r29863, %rd474; + shr.u64 %rd475, %rd466, 32; + cvt.u32.u64 %r29860, %rd466; + cvt.u32.u64 %r29861, %rd475; + shr.u64 %rd476, %rd467, 32; + cvt.u32.u64 %r29858, %rd467; + cvt.u32.u64 %r29859, %rd476; + mov.u32 %r29821, %r29820; + mov.u32 %r29822, %r29820; + mov.u32 %r29823, %r29820; + mov.u32 %r29824, %r29820; + mov.u32 %r29825, %r29820; + mov.u32 %r29826, %r29820; + mov.u32 %r29827, %r29820; + mov.u32 %r29828, %r29820; + mov.u32 %r29829, %r29820; + mov.u32 %r29830, %r29820; + mov.u32 %r29831, %r29820; + mov.u32 %r29832, %r29820; + mov.u32 %r29833, %r29820; + mov.u32 %r29834, %r6832; + mov.u32 %r29836, %r29820; + mov.u32 %r29837, %r29820; + mov.u32 %r29838, %r29820; + mov.u32 %r29839, %r29820; + mov.u32 %r29840, %r29820; + mov.u32 %r29841, %r29820; + mov.u32 %r29842, %r29820; + mov.u32 %r29843, %r29820; + mov.u32 %r29844, %r29820; + mov.u32 %r29845, %r29820; + mov.u32 %r29846, %r29820; + mov.u32 %r29847, %r29820; + mov.u32 %r29848, %r29820; + mov.u32 %r29849, %r29820; + mov.u32 %r29850, %r29820; + mov.u32 %r29851, %r29820; + mov.u32 %r29852, %r29820; + mov.u32 %r29853, %r29820; + mov.u32 %r29870, %r29820; + +$L__BB2_15: + // begin inline asm + // xor5 + lop3.b32 %r6863, %r29856, %r29854, %r29852, 0x96; + lop3.b32 %r6863, %r6863, %r29850, %r29848, 0x96; + lop3.b32 %r6864, %r29857, %r29855, %r29853, 0x96; + lop3.b32 %r6864, %r6864, %r29851, %r29849, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6875, %r29868, %r29866, %r29846, 0x96; + lop3.b32 %r6875, %r6875, %r29844, %r29842, 0x96; + lop3.b32 %r6876, %r29869, %r29867, %r29847, 0x96; + lop3.b32 %r6876, %r6876, %r29845, %r29843, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6887, %r29864, %r29862, %r29840, 0x96; + lop3.b32 %r6887, %r6887, %r29838, %r29836, 0x96; + lop3.b32 %r6888, %r29865, %r29863, %r29841, 0x96; + lop3.b32 %r6888, %r6888, %r29839, %r29837, 0x96; + // end inline asm + // 
begin inline asm + // xor5 + lop3.b32 %r6899, %r29860, %r29834, %r29832, 0x96; + lop3.b32 %r6899, %r6899, %r29830, %r29828, 0x96; + lop3.b32 %r6900, %r29861, %r29835, %r29833, 0x96; + lop3.b32 %r6900, %r6900, %r29831, %r29829, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6911, %r29858, %r29826, %r29824, 0x96; + lop3.b32 %r6911, %r6911, %r29822, %r29820, 0x96; + lop3.b32 %r6912, %r29859, %r29827, %r29825, 0x96; + lop3.b32 %r6912, %r6912, %r29823, %r29821, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6923, %r6876, %r6875, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6927, %r6875, %r6876, %r6832; + // end inline asm + xor.b32 %r7357, %r6923, %r6911; + xor.b32 %r7358, %r6927, %r6912; + xor.b32 %r7190, %r29856, %r7357; + xor.b32 %r7193, %r29857, %r7358; + xor.b32 %r7097, %r29854, %r7357; + xor.b32 %r7096, %r29855, %r7358; + xor.b32 %r7144, %r29852, %r7357; + xor.b32 %r7145, %r29853, %r7358; + xor.b32 %r7049, %r29850, %r7357; + xor.b32 %r7048, %r29851, %r7358; + xor.b32 %r7000, %r29848, %r7357; + xor.b32 %r7001, %r29849, %r7358; + // begin inline asm + shf.l.wrap.b32 %r6931, %r6888, %r6887, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6935, %r6887, %r6888, %r6832; + // end inline asm + xor.b32 %r7359, %r6931, %r6863; + xor.b32 %r7360, %r6935, %r6864; + xor.b32 %r7152, %r29868, %r7359; + xor.b32 %r7153, %r29869, %r7360; + xor.b32 %r6969, %r29866, %r7359; + xor.b32 %r6968, %r29867, %r7360; + xor.b32 %r7128, %r29846, %r7359; + xor.b32 %r7129, %r29847, %r7360; + xor.b32 %r7089, %r29844, %r7359; + xor.b32 %r7088, %r29845, %r7360; + xor.b32 %r7072, %r29842, %r7359; + xor.b32 %r7073, %r29843, %r7360; + // begin inline asm + shf.l.wrap.b32 %r6939, %r6900, %r6899, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6943, %r6899, %r6900, %r6832; + // end inline asm + xor.b32 %r7361, %r6939, %r6875; + xor.b32 %r7362, %r6943, %r6876; + xor.b32 %r7009, %r29864, %r7361; + xor.b32 %r7008, %r29865, %r7362; + xor.b32 %r7136, %r29862, %r7361; + xor.b32 %r7137, %r29863, %r7362; + xor.b32 %r7017, %r29840, %r7361; + xor.b32 %r7016, %r29841, %r7362; + xor.b32 %r7120, %r29838, %r7361; + xor.b32 %r7121, %r29839, %r7362; + xor.b32 %r6985, %r29836, %r7361; + xor.b32 %r6984, %r29837, %r7362; + // begin inline asm + shf.l.wrap.b32 %r6947, %r6912, %r6911, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6951, %r6911, %r6912, %r6832; + // end inline asm + xor.b32 %r7363, %r6947, %r6887; + xor.b32 %r7364, %r6951, %r6888; + xor.b32 %r7104, %r29860, %r7363; + xor.b32 %r7105, %r29861, %r7364; + xor.b32 %r7081, %r29834, %r7363; + xor.b32 %r7080, %r29835, %r7364; + xor.b32 %r7024, %r29832, %r7363; + xor.b32 %r7025, %r29833, %r7364; + xor.b32 %r7112, %r29830, %r7363; + xor.b32 %r7113, %r29831, %r7364; + xor.b32 %r7041, %r29828, %r7363; + xor.b32 %r7040, %r29829, %r7364; + // begin inline asm + shf.l.wrap.b32 %r6955, %r6864, %r6863, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6959, %r6863, %r6864, %r6832; + // end inline asm + xor.b32 %r7365, %r6955, %r6899; + xor.b32 %r7366, %r6959, %r6900; + xor.b32 %r7056, %r29858, %r7365; + xor.b32 %r7057, %r29859, %r7366; + xor.b32 %r6976, %r29826, %r7365; + xor.b32 %r6977, %r29827, %r7366; + xor.b32 %r6993, %r29824, %r7365; + xor.b32 %r6992, %r29825, %r7366; + xor.b32 %r7032, %r29822, %r7365; + xor.b32 %r7033, %r29823, %r7366; + xor.b32 %r7064, %r29820, %r7365; + xor.b32 %r7065, %r29821, %r7366; + mov.u32 %r6970, 44; + // begin inline asm 
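+ // rho: a 64-bit rotate is performed as two shf.l.wrap.b32 funnel shifts over
+ // the (lo, hi) halves of a lane; the mov.u32 constants 44, 20, 61, 39, 18, 62,
+ // 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3 (plus the
+ // rotate by 1, which reuses %r6832) are the Keccak rho offsets.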
+ shf.l.wrap.b32 %r6963, %r6969, %r6968, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6967, %r6968, %r6969, %r6970; + // end inline asm + mov.u32 %r6978, 20; + // begin inline asm + shf.l.wrap.b32 %r6971, %r6977, %r6976, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6975, %r6976, %r6977, %r6978; + // end inline asm + mov.u32 %r6986, 61; + // begin inline asm + shf.l.wrap.b32 %r6979, %r6985, %r6984, %r6986; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6983, %r6984, %r6985, %r6986; + // end inline asm + mov.u32 %r6994, 39; + // begin inline asm + shf.l.wrap.b32 %r6987, %r6993, %r6992, %r6994; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6991, %r6992, %r6993, %r6994; + // end inline asm + mov.u32 %r7002, 18; + // begin inline asm + shf.l.wrap.b32 %r6995, %r7001, %r7000, %r7002; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6999, %r7000, %r7001, %r7002; + // end inline asm + mov.u32 %r7010, 62; + // begin inline asm + shf.l.wrap.b32 %r7003, %r7009, %r7008, %r7010; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7007, %r7008, %r7009, %r7010; + // end inline asm + mov.u32 %r7018, 43; + // begin inline asm + shf.l.wrap.b32 %r7011, %r7017, %r7016, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7015, %r7016, %r7017, %r7018; + // end inline asm + mov.u32 %r7026, 25; + // begin inline asm + shf.l.wrap.b32 %r7019, %r7025, %r7024, %r7026; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7023, %r7024, %r7025, %r7026; + // end inline asm + mov.u32 %r7034, 8; + // begin inline asm + shf.l.wrap.b32 %r7027, %r7033, %r7032, %r7034; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7031, %r7032, %r7033, %r7034; + // end inline asm + mov.u32 %r7042, 56; + // begin inline asm + shf.l.wrap.b32 %r7035, %r7041, %r7040, %r7042; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7039, %r7040, %r7041, %r7042; + // end inline asm + mov.u32 %r7050, 41; + // begin inline asm + shf.l.wrap.b32 %r7043, %r7049, %r7048, %r7050; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7047, %r7048, %r7049, %r7050; + // end inline asm + mov.u32 %r7058, 27; + // begin inline asm + shf.l.wrap.b32 %r7051, %r7057, %r7056, %r7058; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7055, %r7056, %r7057, %r7058; + // end inline asm + mov.u32 %r7066, 14; + // begin inline asm + shf.l.wrap.b32 %r7059, %r7065, %r7064, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7063, %r7064, %r7065, %r7066; + // end inline asm + mov.u32 %r7074, 2; + // begin inline asm + shf.l.wrap.b32 %r7067, %r7073, %r7072, %r7074; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7071, %r7072, %r7073, %r7074; + // end inline asm + mov.u32 %r7082, 55; + // begin inline asm + shf.l.wrap.b32 %r7075, %r7081, %r7080, %r7082; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7079, %r7080, %r7081, %r7082; + // end inline asm + mov.u32 %r7090, 45; + // begin inline asm + shf.l.wrap.b32 %r7083, %r7089, %r7088, %r7090; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7087, %r7088, %r7089, %r7090; + // end inline asm + mov.u32 %r7098, 36; + // begin inline asm + shf.l.wrap.b32 %r7091, %r7097, %r7096, %r7098; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7095, %r7096, %r7097, %r7098; + // end inline asm + mov.u32 %r7106, 28; + // begin inline asm + shf.l.wrap.b32 %r7099, %r7105, %r7104, %r7106; + // end inline asm + // begin inline 
asm + shf.l.wrap.b32 %r7103, %r7104, %r7105, %r7106; + // end inline asm + mov.u32 %r7114, 21; + // begin inline asm + shf.l.wrap.b32 %r7107, %r7113, %r7112, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7111, %r7112, %r7113, %r7114; + // end inline asm + mov.u32 %r7122, 15; + // begin inline asm + shf.l.wrap.b32 %r7115, %r7121, %r7120, %r7122; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7119, %r7120, %r7121, %r7122; + // end inline asm + mov.u32 %r7130, 10; + // begin inline asm + shf.l.wrap.b32 %r7123, %r7129, %r7128, %r7130; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7127, %r7128, %r7129, %r7130; + // end inline asm + mov.u32 %r7138, 6; + // begin inline asm + shf.l.wrap.b32 %r7131, %r7137, %r7136, %r7138; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7135, %r7136, %r7137, %r7138; + // end inline asm + mov.u32 %r7146, 3; + // begin inline asm + shf.l.wrap.b32 %r7139, %r7145, %r7144, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7143, %r7144, %r7145, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7147, %r7153, %r7152, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7151, %r7152, %r7153, %r6832; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7155, %r7190, %r6963, %r7011, 0xD2; + lop3.b32 %r7156, %r7193, %r6967, %r7015, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29868, %r6963, %r7011, %r7107, 0xD2; + lop3.b32 %r29869, %r6967, %r7015, %r7111, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29864, %r7011, %r7107, %r7059, 0xD2; + lop3.b32 %r29865, %r7015, %r7111, %r7063, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29860, %r7107, %r7059, %r7190, 0xD2; + lop3.b32 %r29861, %r7111, %r7063, %r7193, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29858, %r7059, %r7190, %r6963, 0xD2; + lop3.b32 %r29859, %r7063, %r7193, %r6967, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29854, %r7099, %r6971, %r7139, 0xD2; + lop3.b32 %r29855, %r7103, %r6975, %r7143, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29866, %r6971, %r7139, %r7083, 0xD2; + lop3.b32 %r29867, %r6975, %r7143, %r7087, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29862, %r7139, %r7083, %r6979, 0xD2; + lop3.b32 %r29863, %r7143, %r7087, %r6983, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29834, %r7083, %r6979, %r7099, 0xD2; + lop3.b32 %r29835, %r7087, %r6983, %r7103, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r29834, %r29835}; + // begin inline asm + // chi + lop3.b32 %r29826, %r6979, %r7099, %r6971, 0xD2; + lop3.b32 %r29827, %r6983, %r7103, %r6975, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r29826, %r29827}; + // begin inline asm + // chi + lop3.b32 %r29852, %r7147, %r7131, %r7019, 0xD2; + lop3.b32 %r29853, %r7151, %r7135, %r7023, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r29852, %r29853}; + // begin inline asm + // chi + lop3.b32 %r29846, %r7131, %r7019, %r7027, 0xD2; + lop3.b32 %r29847, %r7135, %r7023, %r7031, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r29846, %r29847}; + // begin inline asm + // chi + lop3.b32 %r29840, %r7019, %r7027, %r6995, 0xD2; + lop3.b32 %r29841, %r7023, %r7031, %r6999, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r29840, %r29841}; + // begin inline asm + // chi + lop3.b32 %r29832, %r7027, %r6995, 
%r7147, 0xD2; + lop3.b32 %r29833, %r7031, %r6999, %r7151, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r29832, %r29833}; + // begin inline asm + // chi + lop3.b32 %r29824, %r6995, %r7147, %r7131, 0xD2; + lop3.b32 %r29825, %r6999, %r7151, %r7135, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r29824, %r29825}; + // begin inline asm + // chi + lop3.b32 %r29850, %r7051, %r7091, %r7123, 0xD2; + lop3.b32 %r29851, %r7055, %r7095, %r7127, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r29850, %r29851}; + // begin inline asm + // chi + lop3.b32 %r29844, %r7091, %r7123, %r7115, 0xD2; + lop3.b32 %r29845, %r7095, %r7127, %r7119, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r29844, %r29845}; + // begin inline asm + // chi + lop3.b32 %r29838, %r7123, %r7115, %r7035, 0xD2; + lop3.b32 %r29839, %r7127, %r7119, %r7039, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r29838, %r29839}; + // begin inline asm + // chi + lop3.b32 %r29830, %r7115, %r7035, %r7051, 0xD2; + lop3.b32 %r29831, %r7119, %r7039, %r7055, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r29830, %r29831}; + // begin inline asm + // chi + lop3.b32 %r29822, %r7035, %r7051, %r7091, 0xD2; + lop3.b32 %r29823, %r7039, %r7055, %r7095, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r29822, %r29823}; + // begin inline asm + // chi + lop3.b32 %r29848, %r7003, %r7075, %r6987, 0xD2; + lop3.b32 %r29849, %r7007, %r7079, %r6991, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r29848, %r29849}; + // begin inline asm + // chi + lop3.b32 %r29842, %r7075, %r6987, %r7043, 0xD2; + lop3.b32 %r29843, %r7079, %r6991, %r7047, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r29842, %r29843}; + // begin inline asm + // chi + lop3.b32 %r29836, %r6987, %r7043, %r7067, 0xD2; + lop3.b32 %r29837, %r6991, %r7047, %r7071, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r29836, %r29837}; + // begin inline asm + // chi + lop3.b32 %r29828, %r7043, %r7067, %r7003, 0xD2; + lop3.b32 %r29829, %r7047, %r7071, %r7007, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r29828, %r29829}; + // begin inline asm + // chi + lop3.b32 %r29820, %r7067, %r7003, %r7075, 0xD2; + lop3.b32 %r29821, %r7071, %r7007, %r7079, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r29820, %r29821}; + mul.wide.s32 %rd478, %r29870, 8; + mov.u64 %rd479, keccak_round_constants; + cvta.const.u64 %rd480, %rd479; + add.s64 %rd477, %rd480, %rd478; + // begin inline asm + ld.global.nc.v2.u32 {%r7355,%r7356}, [%rd477]; + // end inline asm + xor.b32 %r29856, %r7155, %r7355; + xor.b32 %r29857, %r7156, %r7356; + add.s32 %r29870, %r29870, 1; + setp.lt.u32 %p14, %r29870, 23; + @%p14 bra $L__BB2_15; + + add.u64 %rd53, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r29868, %r29869}; + st.local.v2.u32 [%rd2+72], {%r29866, %r29867}; + st.local.v2.u32 [%rd2+40], {%r29864, %r29865}; + st.local.v2.u32 [%rd2+80], {%r29862, %r29863}; + st.local.v2.u32 [%rd2+48], {%r29860, %r29861}; + st.local.v2.u32 [%rd2+56], {%r29858, %r29859}; + st.local.v2.u32 [%rd2+24], {%r29856, %r29857}; + // begin inline asm + // xor5 + lop3.b32 %r7367, %r29856, %r29854, %r29852, 0x96; + lop3.b32 %r7367, %r7367, %r29850, %r29848, 0x96; + lop3.b32 %r7368, %r29857, %r29855, %r29853, 0x96; + lop3.b32 %r7368, %r7368, %r29851, %r29849, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7379, %r29868, %r29866, %r29846, 0x96; + lop3.b32 %r7379, %r7379, %r29844, %r29842, 0x96; + lop3.b32 %r7380, %r29869, %r29867, %r29847, 
0x96; + lop3.b32 %r7380, %r7380, %r29845, %r29843, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7391, %r29864, %r29862, %r29840, 0x96; + lop3.b32 %r7391, %r7391, %r29838, %r29836, 0x96; + lop3.b32 %r7392, %r29865, %r29863, %r29841, 0x96; + lop3.b32 %r7392, %r7392, %r29839, %r29837, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7403, %r29860, %r29834, %r29832, 0x96; + lop3.b32 %r7403, %r7403, %r29830, %r29828, 0x96; + lop3.b32 %r7404, %r29861, %r29835, %r29833, 0x96; + lop3.b32 %r7404, %r7404, %r29831, %r29829, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7415, %r29858, %r29826, %r29824, 0x96; + lop3.b32 %r7415, %r7415, %r29822, %r29820, 0x96; + lop3.b32 %r7416, %r29859, %r29827, %r29825, 0x96; + lop3.b32 %r7416, %r7416, %r29823, %r29821, 0x96; + // end inline asm + mov.u32 %r7619, 1; + // begin inline asm + shf.l.wrap.b32 %r7427, %r7380, %r7379, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7431, %r7379, %r7380, %r7619; + // end inline asm + xor.b32 %r7646, %r7427, %r7415; + xor.b32 %r7647, %r7431, %r7416; + xor.b32 %r7574, %r29856, %r7646; + xor.b32 %r7577, %r29857, %r7647; + xor.b32 %r7537, %r29853, %r7647; + xor.b32 %r7536, %r29852, %r7646; + st.local.v2.u32 [%rd2+104], {%r7536, %r7537}; + // begin inline asm + shf.l.wrap.b32 %r7435, %r7392, %r7391, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7439, %r7391, %r7392, %r7619; + // end inline asm + xor.b32 %r7648, %r7435, %r7367; + xor.b32 %r7649, %r7439, %r7368; + xor.b32 %r7473, %r29866, %r7648; + xor.b32 %r7472, %r29867, %r7649; + xor.b32 %r7512, %r29845, %r7649; + xor.b32 %r7513, %r29844, %r7648; + st.local.v2.u32 [%rd2+152], {%r7513, %r7512}; + // begin inline asm + shf.l.wrap.b32 %r7443, %r7404, %r7403, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7447, %r7403, %r7404, %r7619; + // end inline asm + xor.b32 %r7650, %r7443, %r7379; + xor.b32 %r7651, %r7447, %r7380; + xor.b32 %r7496, %r29841, %r7651; + xor.b32 %r7497, %r29840, %r7650; + st.local.v2.u32 [%rd2+120], {%r7497, %r7496}; + xor.b32 %r7488, %r29837, %r7651; + xor.b32 %r7489, %r29836, %r7650; + st.local.v2.u32 [%rd2+200], {%r7489, %r7488}; + // begin inline asm + shf.l.wrap.b32 %r7451, %r7416, %r7415, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7455, %r7415, %r7416, %r7619; + // end inline asm + xor.b32 %r7652, %r7451, %r7391; + xor.b32 %r7653, %r7455, %r7392; + xor.b32 %r7520, %r29860, %r7652; + xor.b32 %r7521, %r29861, %r7653; + xor.b32 %r7529, %r29831, %r7653; + xor.b32 %r7528, %r29830, %r7652; + st.local.v2.u32 [%rd2+168], {%r7528, %r7529}; + // begin inline asm + shf.l.wrap.b32 %r7459, %r7368, %r7367, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7463, %r7367, %r7368, %r7619; + // end inline asm + xor.b32 %r7654, %r7459, %r7403; + xor.b32 %r7655, %r7463, %r7404; + xor.b32 %r7480, %r29826, %r7654; + xor.b32 %r7481, %r29827, %r7655; + xor.b32 %r7505, %r29821, %r7655; + xor.b32 %r7504, %r29820, %r7654; + st.local.v2.u32 [%rd2+216], {%r7504, %r7505}; + // begin inline asm + shf.l.wrap.b32 %r7467, %r7473, %r7472, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7471, %r7472, %r7473, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7475, %r7481, %r7480, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7479, %r7480, %r7481, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7487, %r7488, %r7489, %r6986; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r7483, %r7489, %r7488, %r6986; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r7483, %r7487}; + // begin inline asm + shf.l.wrap.b32 %r7491, %r7497, %r7496, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7495, %r7496, %r7497, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7499, %r7505, %r7504, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7503, %r7504, %r7505, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7511, %r7512, %r7513, %r7090; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7507, %r7513, %r7512, %r7090; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r7507, %r7511}; + // begin inline asm + shf.l.wrap.b32 %r7515, %r7521, %r7520, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7519, %r7520, %r7521, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7523, %r7529, %r7528, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7527, %r7528, %r7529, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7531, %r7537, %r7536, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7535, %r7536, %r7537, %r7146; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7539, %r7574, %r7467, %r7491, 0xD2; + lop3.b32 %r7540, %r7577, %r7471, %r7495, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30003, %r7467, %r7491, %r7523, 0xD2; + lop3.b32 %r30004, %r7471, %r7495, %r7527, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30003, %r30004}; + // begin inline asm + // chi + lop3.b32 %r29999, %r7491, %r7523, %r7499, 0xD2; + lop3.b32 %r30000, %r7495, %r7527, %r7503, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r29999, %r30000}; + // begin inline asm + // chi + lop3.b32 %r29995, %r7523, %r7499, %r7574, 0xD2; + lop3.b32 %r29996, %r7527, %r7503, %r7577, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r29995, %r29996}; + // begin inline asm + // chi + lop3.b32 %r29993, %r7499, %r7574, %r7467, 0xD2; + lop3.b32 %r29994, %r7503, %r7577, %r7471, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r29993, %r29994}; + // begin inline asm + // chi + lop3.b32 %r29989, %r7515, %r7475, %r7531, 0xD2; + lop3.b32 %r29990, %r7519, %r7479, %r7535, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r29989, %r29990}; + // begin inline asm + // chi + lop3.b32 %r30001, %r7475, %r7531, %r7507, 0xD2; + lop3.b32 %r30002, %r7479, %r7535, %r7511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30001, %r30002}; + // begin inline asm + // chi + lop3.b32 %r29997, %r7531, %r7507, %r7483, 0xD2; + lop3.b32 %r29998, %r7535, %r7511, %r7487, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r29997, %r29998}; + add.s64 %rd481, %rd480, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r7603,%r7604}, [%rd481]; + // end inline asm + xor.b32 %r29991, %r7539, %r7603; + xor.b32 %r29992, %r7540, %r7604; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + st.local.u64 [%rd53], %rd361; + mov.u64 %rd485, 1179641; + st.local.u64 [%rd53+8], %rd485; + add.s32 %r243, %r47, 1; + st.local.u32 [%rd53+16], %r243; + ld.global.u64 %rd486, [%rd33]; + ld.global.u64 %rd487, [%rd33+8]; + ld.global.u64 %rd488, [%rd33+16]; + ld.global.u64 %rd489, [%rd33+24]; + ld.global.u64 %rd490, [%rd33+32]; + ld.global.u64 %rd491, [%rd33+40]; + ld.global.u64 %rd492, [%rd33+48]; + ld.global.u64 %rd493, [%rd33+56]; + st.local.u64 [%rd53+32], %rd487; + 
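+ // second sponge context at %rd53: the same constant 1179641 is stored at +8,
+ // and what looks like a block counter (%r47 + 1) at +16 is XORed into the low
+ // word of lane 0; the next 64 bytes are loaded from %rd33, mirroring the %rd2
+ // setup above.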
st.local.u64 [%rd53+40], %rd488; + st.local.u64 [%rd53+48], %rd489; + st.local.u64 [%rd53+56], %rd490; + st.local.u64 [%rd53+64], %rd491; + st.local.u64 [%rd53+72], %rd492; + st.local.u64 [%rd53+80], %rd493; + cvt.u32.u64 %r7656, %rd486; + xor.b32 %r7657, %r243, %r7656; + st.local.u64 [%rd53+24], %rd486; + st.local.u32 [%rd53+24], %r7657; + mov.u32 %r29871, 0; + st.local.v2.u32 [%rd53+96], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+104], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+112], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+120], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+128], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+136], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+144], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+152], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+160], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+168], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+176], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+184], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+192], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+200], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+208], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+216], {%r29871, %r29871}; + mov.u32 %r29886, -2147483648; + st.local.v2.u32 [%rd53+88], {%r7619, %r29886}; + ld.local.v2.u32 {%r29907, %r29908}, [%rd53+24]; + mov.b64 {%r29905, %r29906}, %rd491; + shr.u64 %rd494, %rd487, 32; + cvt.u32.u64 %r29919, %rd487; + cvt.u32.u64 %r29920, %rd494; + shr.u64 %rd495, %rd492, 32; + cvt.u32.u64 %r29917, %rd492; + cvt.u32.u64 %r29918, %rd495; + shr.u64 %rd496, %rd488, 32; + cvt.u32.u64 %r29915, %rd488; + cvt.u32.u64 %r29916, %rd496; + shr.u64 %rd497, %rd493, 32; + cvt.u32.u64 %r29913, %rd493; + cvt.u32.u64 %r29914, %rd497; + shr.u64 %rd498, %rd489, 32; + cvt.u32.u64 %r29911, %rd489; + cvt.u32.u64 %r29912, %rd498; + shr.u64 %rd499, %rd490, 32; + cvt.u32.u64 %r29909, %rd490; + cvt.u32.u64 %r29910, %rd499; + mov.u32 %r29872, %r29871; + mov.u32 %r29873, %r29871; + mov.u32 %r29874, %r29871; + mov.u32 %r29875, %r29871; + mov.u32 %r29876, %r29871; + mov.u32 %r29877, %r29871; + mov.u32 %r29878, %r29871; + mov.u32 %r29879, %r29871; + mov.u32 %r29880, %r29871; + mov.u32 %r29881, %r29871; + mov.u32 %r29882, %r29871; + mov.u32 %r29883, %r29871; + mov.u32 %r29884, %r29871; + mov.u32 %r29885, %r7619; + mov.u32 %r29887, %r29871; + mov.u32 %r29888, %r29871; + mov.u32 %r29889, %r29871; + mov.u32 %r29890, %r29871; + mov.u32 %r29891, %r29871; + mov.u32 %r29892, %r29871; + mov.u32 %r29893, %r29871; + mov.u32 %r29894, %r29871; + mov.u32 %r29895, %r29871; + mov.u32 %r29896, %r29871; + mov.u32 %r29897, %r29871; + mov.u32 %r29898, %r29871; + mov.u32 %r29899, %r29871; + mov.u32 %r29900, %r29871; + mov.u32 %r29901, %r29871; + mov.u32 %r29902, %r29871; + mov.u32 %r29903, %r29871; + mov.u32 %r29904, %r29871; + mov.u32 %r29921, %r29871; + +$L__BB2_17: + // begin inline asm + // xor5 + lop3.b32 %r7660, %r29907, %r29905, %r29903, 0x96; + lop3.b32 %r7660, %r7660, %r29901, %r29899, 0x96; + lop3.b32 %r7661, %r29908, %r29906, %r29904, 0x96; + lop3.b32 %r7661, %r7661, %r29902, %r29900, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7672, %r29919, %r29917, %r29897, 0x96; + lop3.b32 %r7672, %r7672, %r29895, %r29893, 0x96; + lop3.b32 %r7673, %r29920, %r29918, %r29898, 0x96; + lop3.b32 %r7673, %r7673, %r29896, %r29894, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7684, %r29915, %r29913, %r29891, 0x96; + lop3.b32 %r7684, %r7684, %r29889, %r29887, 0x96; + lop3.b32 %r7685, %r29916, %r29914, %r29892, 0x96; + lop3.b32 
%r7685, %r7685, %r29890, %r29888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7696, %r29911, %r29885, %r29883, 0x96; + lop3.b32 %r7696, %r7696, %r29881, %r29879, 0x96; + lop3.b32 %r7697, %r29912, %r29886, %r29884, 0x96; + lop3.b32 %r7697, %r7697, %r29882, %r29880, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7708, %r29909, %r29877, %r29875, 0x96; + lop3.b32 %r7708, %r7708, %r29873, %r29871, 0x96; + lop3.b32 %r7709, %r29910, %r29878, %r29876, 0x96; + lop3.b32 %r7709, %r7709, %r29874, %r29872, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7720, %r7673, %r7672, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7724, %r7672, %r7673, %r7619; + // end inline asm + xor.b32 %r8154, %r7720, %r7708; + xor.b32 %r8155, %r7724, %r7709; + xor.b32 %r7987, %r29907, %r8154; + xor.b32 %r7990, %r29908, %r8155; + xor.b32 %r7894, %r29905, %r8154; + xor.b32 %r7893, %r29906, %r8155; + xor.b32 %r7941, %r29903, %r8154; + xor.b32 %r7942, %r29904, %r8155; + xor.b32 %r7846, %r29901, %r8154; + xor.b32 %r7845, %r29902, %r8155; + xor.b32 %r7797, %r29899, %r8154; + xor.b32 %r7798, %r29900, %r8155; + // begin inline asm + shf.l.wrap.b32 %r7728, %r7685, %r7684, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7732, %r7684, %r7685, %r7619; + // end inline asm + xor.b32 %r8156, %r7728, %r7660; + xor.b32 %r8157, %r7732, %r7661; + xor.b32 %r7949, %r29919, %r8156; + xor.b32 %r7950, %r29920, %r8157; + xor.b32 %r7766, %r29917, %r8156; + xor.b32 %r7765, %r29918, %r8157; + xor.b32 %r7925, %r29897, %r8156; + xor.b32 %r7926, %r29898, %r8157; + xor.b32 %r7886, %r29895, %r8156; + xor.b32 %r7885, %r29896, %r8157; + xor.b32 %r7869, %r29893, %r8156; + xor.b32 %r7870, %r29894, %r8157; + // begin inline asm + shf.l.wrap.b32 %r7736, %r7697, %r7696, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7740, %r7696, %r7697, %r7619; + // end inline asm + xor.b32 %r8158, %r7736, %r7672; + xor.b32 %r8159, %r7740, %r7673; + xor.b32 %r7806, %r29915, %r8158; + xor.b32 %r7805, %r29916, %r8159; + xor.b32 %r7933, %r29913, %r8158; + xor.b32 %r7934, %r29914, %r8159; + xor.b32 %r7814, %r29891, %r8158; + xor.b32 %r7813, %r29892, %r8159; + xor.b32 %r7917, %r29889, %r8158; + xor.b32 %r7918, %r29890, %r8159; + xor.b32 %r7782, %r29887, %r8158; + xor.b32 %r7781, %r29888, %r8159; + // begin inline asm + shf.l.wrap.b32 %r7744, %r7709, %r7708, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7748, %r7708, %r7709, %r7619; + // end inline asm + xor.b32 %r8160, %r7744, %r7684; + xor.b32 %r8161, %r7748, %r7685; + xor.b32 %r7901, %r29911, %r8160; + xor.b32 %r7902, %r29912, %r8161; + xor.b32 %r7878, %r29885, %r8160; + xor.b32 %r7877, %r29886, %r8161; + xor.b32 %r7821, %r29883, %r8160; + xor.b32 %r7822, %r29884, %r8161; + xor.b32 %r7909, %r29881, %r8160; + xor.b32 %r7910, %r29882, %r8161; + xor.b32 %r7838, %r29879, %r8160; + xor.b32 %r7837, %r29880, %r8161; + // begin inline asm + shf.l.wrap.b32 %r7752, %r7661, %r7660, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7756, %r7660, %r7661, %r7619; + // end inline asm + xor.b32 %r8162, %r7752, %r7696; + xor.b32 %r8163, %r7756, %r7697; + xor.b32 %r7853, %r29909, %r8162; + xor.b32 %r7854, %r29910, %r8163; + xor.b32 %r7773, %r29877, %r8162; + xor.b32 %r7774, %r29878, %r8163; + xor.b32 %r7790, %r29875, %r8162; + xor.b32 %r7789, %r29876, %r8163; + xor.b32 %r7829, %r29873, %r8162; + xor.b32 %r7830, %r29874, %r8163; + xor.b32 %r7861, %r29871, %r8162; + xor.b32 
%r7862, %r29872, %r8163; + mov.u32 %r7767, 44; + // begin inline asm + shf.l.wrap.b32 %r7760, %r7766, %r7765, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7764, %r7765, %r7766, %r7767; + // end inline asm + mov.u32 %r7775, 20; + // begin inline asm + shf.l.wrap.b32 %r7768, %r7774, %r7773, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7772, %r7773, %r7774, %r7775; + // end inline asm + mov.u32 %r7783, 61; + // begin inline asm + shf.l.wrap.b32 %r7776, %r7782, %r7781, %r7783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7780, %r7781, %r7782, %r7783; + // end inline asm + mov.u32 %r7791, 39; + // begin inline asm + shf.l.wrap.b32 %r7784, %r7790, %r7789, %r7791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7788, %r7789, %r7790, %r7791; + // end inline asm + mov.u32 %r7799, 18; + // begin inline asm + shf.l.wrap.b32 %r7792, %r7798, %r7797, %r7799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7796, %r7797, %r7798, %r7799; + // end inline asm + mov.u32 %r7807, 62; + // begin inline asm + shf.l.wrap.b32 %r7800, %r7806, %r7805, %r7807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7804, %r7805, %r7806, %r7807; + // end inline asm + mov.u32 %r7815, 43; + // begin inline asm + shf.l.wrap.b32 %r7808, %r7814, %r7813, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7812, %r7813, %r7814, %r7815; + // end inline asm + mov.u32 %r7823, 25; + // begin inline asm + shf.l.wrap.b32 %r7816, %r7822, %r7821, %r7823; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7820, %r7821, %r7822, %r7823; + // end inline asm + mov.u32 %r7831, 8; + // begin inline asm + shf.l.wrap.b32 %r7824, %r7830, %r7829, %r7831; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7828, %r7829, %r7830, %r7831; + // end inline asm + mov.u32 %r7839, 56; + // begin inline asm + shf.l.wrap.b32 %r7832, %r7838, %r7837, %r7839; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7836, %r7837, %r7838, %r7839; + // end inline asm + mov.u32 %r7847, 41; + // begin inline asm + shf.l.wrap.b32 %r7840, %r7846, %r7845, %r7847; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7844, %r7845, %r7846, %r7847; + // end inline asm + mov.u32 %r7855, 27; + // begin inline asm + shf.l.wrap.b32 %r7848, %r7854, %r7853, %r7855; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7852, %r7853, %r7854, %r7855; + // end inline asm + mov.u32 %r7863, 14; + // begin inline asm + shf.l.wrap.b32 %r7856, %r7862, %r7861, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7860, %r7861, %r7862, %r7863; + // end inline asm + mov.u32 %r7871, 2; + // begin inline asm + shf.l.wrap.b32 %r7864, %r7870, %r7869, %r7871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7868, %r7869, %r7870, %r7871; + // end inline asm + mov.u32 %r7879, 55; + // begin inline asm + shf.l.wrap.b32 %r7872, %r7878, %r7877, %r7879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7876, %r7877, %r7878, %r7879; + // end inline asm + mov.u32 %r7887, 45; + // begin inline asm + shf.l.wrap.b32 %r7880, %r7886, %r7885, %r7887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7884, %r7885, %r7886, %r7887; + // end inline asm + mov.u32 %r7895, 36; + // begin inline asm + shf.l.wrap.b32 %r7888, %r7894, %r7893, %r7895; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7892, %r7893, %r7894, %r7895; + // end inline asm + mov.u32 %r7903, 28; + // begin inline asm + shf.l.wrap.b32 
%r7896, %r7902, %r7901, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7900, %r7901, %r7902, %r7903; + // end inline asm + mov.u32 %r7911, 21; + // begin inline asm + shf.l.wrap.b32 %r7904, %r7910, %r7909, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7908, %r7909, %r7910, %r7911; + // end inline asm + mov.u32 %r7919, 15; + // begin inline asm + shf.l.wrap.b32 %r7912, %r7918, %r7917, %r7919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7916, %r7917, %r7918, %r7919; + // end inline asm + mov.u32 %r7927, 10; + // begin inline asm + shf.l.wrap.b32 %r7920, %r7926, %r7925, %r7927; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7924, %r7925, %r7926, %r7927; + // end inline asm + mov.u32 %r7935, 6; + // begin inline asm + shf.l.wrap.b32 %r7928, %r7934, %r7933, %r7935; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7932, %r7933, %r7934, %r7935; + // end inline asm + mov.u32 %r7943, 3; + // begin inline asm + shf.l.wrap.b32 %r7936, %r7942, %r7941, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7940, %r7941, %r7942, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7944, %r7950, %r7949, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7948, %r7949, %r7950, %r7619; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7952, %r7987, %r7760, %r7808, 0xD2; + lop3.b32 %r7953, %r7990, %r7764, %r7812, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29919, %r7760, %r7808, %r7904, 0xD2; + lop3.b32 %r29920, %r7764, %r7812, %r7908, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29915, %r7808, %r7904, %r7856, 0xD2; + lop3.b32 %r29916, %r7812, %r7908, %r7860, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29911, %r7904, %r7856, %r7987, 0xD2; + lop3.b32 %r29912, %r7908, %r7860, %r7990, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29909, %r7856, %r7987, %r7760, 0xD2; + lop3.b32 %r29910, %r7860, %r7990, %r7764, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29905, %r7896, %r7768, %r7936, 0xD2; + lop3.b32 %r29906, %r7900, %r7772, %r7940, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29917, %r7768, %r7936, %r7880, 0xD2; + lop3.b32 %r29918, %r7772, %r7940, %r7884, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29913, %r7936, %r7880, %r7776, 0xD2; + lop3.b32 %r29914, %r7940, %r7884, %r7780, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29885, %r7880, %r7776, %r7896, 0xD2; + lop3.b32 %r29886, %r7884, %r7780, %r7900, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r29885, %r29886}; + // begin inline asm + // chi + lop3.b32 %r29877, %r7776, %r7896, %r7768, 0xD2; + lop3.b32 %r29878, %r7780, %r7900, %r7772, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r29877, %r29878}; + // begin inline asm + // chi + lop3.b32 %r29903, %r7944, %r7928, %r7816, 0xD2; + lop3.b32 %r29904, %r7948, %r7932, %r7820, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+104], {%r29903, %r29904}; + // begin inline asm + // chi + lop3.b32 %r29897, %r7928, %r7816, %r7824, 0xD2; + lop3.b32 %r29898, %r7932, %r7820, %r7828, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+112], {%r29897, %r29898}; + // begin inline asm + // chi + lop3.b32 %r29891, %r7816, %r7824, %r7792, 0xD2; + lop3.b32 %r29892, %r7820, %r7828, %r7796, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+120], {%r29891, 
%r29892}; + // begin inline asm + // chi + lop3.b32 %r29883, %r7824, %r7792, %r7944, 0xD2; + lop3.b32 %r29884, %r7828, %r7796, %r7948, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+128], {%r29883, %r29884}; + // begin inline asm + // chi + lop3.b32 %r29875, %r7792, %r7944, %r7928, 0xD2; + lop3.b32 %r29876, %r7796, %r7948, %r7932, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+136], {%r29875, %r29876}; + // begin inline asm + // chi + lop3.b32 %r29901, %r7848, %r7888, %r7920, 0xD2; + lop3.b32 %r29902, %r7852, %r7892, %r7924, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+144], {%r29901, %r29902}; + // begin inline asm + // chi + lop3.b32 %r29895, %r7888, %r7920, %r7912, 0xD2; + lop3.b32 %r29896, %r7892, %r7924, %r7916, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+152], {%r29895, %r29896}; + // begin inline asm + // chi + lop3.b32 %r29889, %r7920, %r7912, %r7832, 0xD2; + lop3.b32 %r29890, %r7924, %r7916, %r7836, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+160], {%r29889, %r29890}; + // begin inline asm + // chi + lop3.b32 %r29881, %r7912, %r7832, %r7848, 0xD2; + lop3.b32 %r29882, %r7916, %r7836, %r7852, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+168], {%r29881, %r29882}; + // begin inline asm + // chi + lop3.b32 %r29873, %r7832, %r7848, %r7888, 0xD2; + lop3.b32 %r29874, %r7836, %r7852, %r7892, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+176], {%r29873, %r29874}; + // begin inline asm + // chi + lop3.b32 %r29899, %r7800, %r7872, %r7784, 0xD2; + lop3.b32 %r29900, %r7804, %r7876, %r7788, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+184], {%r29899, %r29900}; + // begin inline asm + // chi + lop3.b32 %r29893, %r7872, %r7784, %r7840, 0xD2; + lop3.b32 %r29894, %r7876, %r7788, %r7844, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+192], {%r29893, %r29894}; + // begin inline asm + // chi + lop3.b32 %r29887, %r7784, %r7840, %r7864, 0xD2; + lop3.b32 %r29888, %r7788, %r7844, %r7868, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+200], {%r29887, %r29888}; + // begin inline asm + // chi + lop3.b32 %r29879, %r7840, %r7864, %r7800, 0xD2; + lop3.b32 %r29880, %r7844, %r7868, %r7804, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+208], {%r29879, %r29880}; + // begin inline asm + // chi + lop3.b32 %r29871, %r7864, %r7800, %r7872, 0xD2; + lop3.b32 %r29872, %r7868, %r7804, %r7876, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+216], {%r29871, %r29872}; + mul.wide.s32 %rd501, %r29921, 8; + add.s64 %rd500, %rd480, %rd501; + // begin inline asm + ld.global.nc.v2.u32 {%r8152,%r8153}, [%rd500]; + // end inline asm + xor.b32 %r29907, %r7952, %r8152; + xor.b32 %r29908, %r7953, %r8153; + add.s32 %r29921, %r29921, 1; + setp.lt.u32 %p15, %r29921, 23; + @%p15 bra $L__BB2_17; + + mov.u32 %r29954, 0; + mov.u32 %r8263, 1; + st.local.v2.u32 [%rd53+32], {%r29919, %r29920}; + st.local.v2.u32 [%rd53+72], {%r29917, %r29918}; + st.local.v2.u32 [%rd53+40], {%r29915, %r29916}; + st.local.v2.u32 [%rd53+80], {%r29913, %r29914}; + st.local.v2.u32 [%rd53+48], {%r29911, %r29912}; + st.local.v2.u32 [%rd53+56], {%r29909, %r29910}; + st.local.v2.u32 [%rd53+24], {%r29907, %r29908}; + // begin inline asm + // xor5 + lop3.b32 %r8164, %r29907, %r29905, %r29903, 0x96; + lop3.b32 %r8164, %r8164, %r29901, %r29899, 0x96; + lop3.b32 %r8165, %r29908, %r29906, %r29904, 0x96; + lop3.b32 %r8165, %r8165, %r29902, %r29900, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8176, %r29919, %r29917, %r29897, 0x96; + lop3.b32 %r8176, %r8176, %r29895, %r29893, 0x96; + lop3.b32 
%r8177, %r29920, %r29918, %r29898, 0x96; + lop3.b32 %r8177, %r8177, %r29896, %r29894, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8188, %r29915, %r29913, %r29891, 0x96; + lop3.b32 %r8188, %r8188, %r29889, %r29887, 0x96; + lop3.b32 %r8189, %r29916, %r29914, %r29892, 0x96; + lop3.b32 %r8189, %r8189, %r29890, %r29888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8200, %r29911, %r29885, %r29883, 0x96; + lop3.b32 %r8200, %r8200, %r29881, %r29879, 0x96; + lop3.b32 %r8201, %r29912, %r29886, %r29884, 0x96; + lop3.b32 %r8201, %r8201, %r29882, %r29880, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8212, %r29909, %r29877, %r29875, 0x96; + lop3.b32 %r8212, %r8212, %r29873, %r29871, 0x96; + lop3.b32 %r8213, %r29910, %r29878, %r29876, 0x96; + lop3.b32 %r8213, %r8213, %r29874, %r29872, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8224, %r8177, %r8176, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8228, %r8176, %r8177, %r8263; + // end inline asm + xor.b32 %r8403, %r8224, %r8212; + xor.b32 %r8404, %r8228, %r8213; + xor.b32 %r8371, %r29907, %r8403; + xor.b32 %r8374, %r29908, %r8404; + xor.b32 %r8334, %r29904, %r8404; + xor.b32 %r8333, %r29903, %r8403; + st.local.v2.u32 [%rd53+104], {%r8333, %r8334}; + // begin inline asm + shf.l.wrap.b32 %r8232, %r8189, %r8188, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8236, %r8188, %r8189, %r8263; + // end inline asm + xor.b32 %r8405, %r8232, %r8164; + xor.b32 %r8406, %r8236, %r8165; + xor.b32 %r8270, %r29917, %r8405; + xor.b32 %r8269, %r29918, %r8406; + xor.b32 %r8309, %r29896, %r8406; + xor.b32 %r8310, %r29895, %r8405; + st.local.v2.u32 [%rd53+152], {%r8310, %r8309}; + // begin inline asm + shf.l.wrap.b32 %r8240, %r8201, %r8200, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8244, %r8200, %r8201, %r8263; + // end inline asm + xor.b32 %r8407, %r8240, %r8176; + xor.b32 %r8408, %r8244, %r8177; + xor.b32 %r8293, %r29892, %r8408; + xor.b32 %r8294, %r29891, %r8407; + st.local.v2.u32 [%rd53+120], {%r8294, %r8293}; + xor.b32 %r8285, %r29888, %r8408; + xor.b32 %r8286, %r29887, %r8407; + st.local.v2.u32 [%rd53+200], {%r8286, %r8285}; + // begin inline asm + shf.l.wrap.b32 %r8248, %r8213, %r8212, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8252, %r8212, %r8213, %r8263; + // end inline asm + xor.b32 %r8409, %r8248, %r8188; + xor.b32 %r8410, %r8252, %r8189; + xor.b32 %r8317, %r29911, %r8409; + xor.b32 %r8318, %r29912, %r8410; + xor.b32 %r8326, %r29882, %r8410; + xor.b32 %r8325, %r29881, %r8409; + st.local.v2.u32 [%rd53+168], {%r8325, %r8326}; + // begin inline asm + shf.l.wrap.b32 %r8256, %r8165, %r8164, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8260, %r8164, %r8165, %r8263; + // end inline asm + xor.b32 %r8411, %r8256, %r8200; + xor.b32 %r8412, %r8260, %r8201; + xor.b32 %r8277, %r29877, %r8411; + xor.b32 %r8278, %r29878, %r8412; + xor.b32 %r8302, %r29872, %r8412; + xor.b32 %r8301, %r29871, %r8411; + st.local.v2.u32 [%rd53+216], {%r8301, %r8302}; + // begin inline asm + shf.l.wrap.b32 %r8264, %r8270, %r8269, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8268, %r8269, %r8270, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8272, %r8278, %r8277, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8276, %r8277, %r8278, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8284, %r8285, 
%r8286, %r7783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8280, %r8286, %r8285, %r7783; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r8280, %r8284}; + // begin inline asm + shf.l.wrap.b32 %r8288, %r8294, %r8293, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8292, %r8293, %r8294, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8296, %r8302, %r8301, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8300, %r8301, %r8302, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8308, %r8309, %r8310, %r7887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8304, %r8310, %r8309, %r7887; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r8304, %r8308}; + // begin inline asm + shf.l.wrap.b32 %r8312, %r8318, %r8317, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8316, %r8317, %r8318, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8320, %r8326, %r8325, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8324, %r8325, %r8326, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8328, %r8334, %r8333, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8332, %r8333, %r8334, %r7943; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r8336, %r8371, %r8264, %r8288, 0xD2; + lop3.b32 %r8337, %r8374, %r8268, %r8292, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30054, %r8264, %r8288, %r8320, 0xD2; + lop3.b32 %r30055, %r8268, %r8292, %r8324, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+32], {%r30054, %r30055}; + // begin inline asm + // chi + lop3.b32 %r30050, %r8288, %r8320, %r8296, 0xD2; + lop3.b32 %r30051, %r8292, %r8324, %r8300, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+40], {%r30050, %r30051}; + // begin inline asm + // chi + lop3.b32 %r30046, %r8320, %r8296, %r8371, 0xD2; + lop3.b32 %r30047, %r8324, %r8300, %r8374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+48], {%r30046, %r30047}; + // begin inline asm + // chi + lop3.b32 %r30044, %r8296, %r8371, %r8264, 0xD2; + lop3.b32 %r30045, %r8300, %r8374, %r8268, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+56], {%r30044, %r30045}; + // begin inline asm + // chi + lop3.b32 %r30040, %r8312, %r8272, %r8328, 0xD2; + lop3.b32 %r30041, %r8316, %r8276, %r8332, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+64], {%r30040, %r30041}; + // begin inline asm + // chi + lop3.b32 %r30052, %r8272, %r8328, %r8304, 0xD2; + lop3.b32 %r30053, %r8276, %r8332, %r8308, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+72], {%r30052, %r30053}; + // begin inline asm + // chi + lop3.b32 %r30048, %r8328, %r8304, %r8280, 0xD2; + lop3.b32 %r30049, %r8332, %r8308, %r8284, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+80], {%r30048, %r30049}; + // begin inline asm + ld.global.nc.v2.u32 {%r8400,%r8401}, [%rd481]; + // end inline asm + xor.b32 %r30042, %r8336, %r8400; + xor.b32 %r30043, %r8337, %r8401; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + add.s64 %rd55, %rd53, 24; + add.s64 %rd56, %rd2, 24; + +$L__BB2_19: + cvta.to.global.u64 %rd1270, %rd361; + shl.b32 %r8413, %r29954, 2; + cvt.u64.u32 %rd511, %r8413; + and.b64 %rd512, %rd511, 60; + add.s64 %rd513, %rd56, %rd512; + xor.b32 %r8414, %r47, %r29954; + mul.lo.s32 %r8415, %r8414, 16777619; + ld.local.u32 %r8416, [%rd513]; + xor.b32 %r8417, %r8415, %r8416; + mul.wide.u32 %rd514, %r8417, -954391867; + shr.u64 %rd515, %rd514, 32; + cvt.u32.u64 %r8418, %rd515; 
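+ // Note: the mul.wide.u32 by the magic constant -954391867 and the
+ // sub/shr/add/shr.u32 20 sequence around this point appear to reduce the
+ // 32-bit FNV-style hash (fnv(x, y) = x * 16777619 ^ y) modulo 1179641,
+ // i.e. a reciprocal-multiplication division; the quotient is multiplied
+ // back and subtracted, and the remainder scaled by 64 indexes the global
+ // table at %rd1270. Consistent with an Ethash/FishHash-style parent
+ // lookup: two interleaved states, 512 parents per item (see the
+ // setp.lt.u32 %p16, %r29954, 512 loop bound below).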
+ sub.s32 %r8419, %r8417, %r8418; + shr.u32 %r8420, %r8419, 1; + add.s32 %r8421, %r8420, %r8418; + shr.u32 %r8422, %r8421, 20; + mul.lo.s32 %r8423, %r8422, 1179641; + sub.s32 %r8424, %r8417, %r8423; + mul.wide.u32 %rd516, %r8424, 64; + add.s64 %rd517, %rd1270, %rd516; + mul.lo.s32 %r8425, %r29991, 16777619; + ld.global.u32 %r8426, [%rd517]; + xor.b32 %r29991, %r8425, %r8426; + mul.lo.s32 %r8427, %r29992, 16777619; + ld.global.u32 %r8428, [%rd517+4]; + xor.b32 %r29992, %r8427, %r8428; + mul.lo.s32 %r8429, %r30003, 16777619; + ld.global.u32 %r8430, [%rd517+8]; + mul.lo.s32 %r8431, %r30004, 16777619; + ld.global.u32 %r8432, [%rd517+12]; + xor.b32 %r8433, %r8431, %r8432; + xor.b32 %r30003, %r8429, %r8430; + mov.b64 %rd518, {%r30003, %r8433}; + mul.lo.s32 %r8434, %r29999, 16777619; + ld.global.u32 %r8435, [%rd517+16]; + mul.lo.s32 %r8436, %r30000, 16777619; + ld.global.u32 %r8437, [%rd517+20]; + xor.b32 %r8438, %r8436, %r8437; + xor.b32 %r29999, %r8434, %r8435; + mov.b64 %rd519, {%r29999, %r8438}; + mul.lo.s32 %r8439, %r29995, 16777619; + ld.global.u32 %r8440, [%rd517+24]; + mul.lo.s32 %r8441, %r29996, 16777619; + ld.global.u32 %r8442, [%rd517+28]; + xor.b32 %r8443, %r8441, %r8442; + xor.b32 %r29995, %r8439, %r8440; + mov.b64 %rd520, {%r29995, %r8443}; + mul.lo.s32 %r8444, %r29993, 16777619; + ld.global.u32 %r8445, [%rd517+32]; + mul.lo.s32 %r8446, %r29994, 16777619; + ld.global.u32 %r8447, [%rd517+36]; + xor.b32 %r8448, %r8446, %r8447; + xor.b32 %r29993, %r8444, %r8445; + mov.b64 %rd521, {%r29993, %r8448}; + mul.lo.s32 %r8449, %r29989, 16777619; + ld.global.u32 %r8450, [%rd517+40]; + xor.b32 %r29989, %r8449, %r8450; + mul.lo.s32 %r8451, %r29990, 16777619; + ld.global.u32 %r8452, [%rd517+44]; + xor.b32 %r29990, %r8451, %r8452; + mul.lo.s32 %r8453, %r30001, 16777619; + ld.global.u32 %r8454, [%rd517+48]; + mul.lo.s32 %r8455, %r30002, 16777619; + ld.global.u32 %r8456, [%rd517+52]; + xor.b32 %r8457, %r8455, %r8456; + xor.b32 %r30001, %r8453, %r8454; + mov.b64 %rd522, {%r30001, %r8457}; + mul.lo.s32 %r8458, %r29997, 16777619; + ld.global.u32 %r8459, [%rd517+56]; + mul.lo.s32 %r8460, %r29998, 16777619; + ld.global.u32 %r8461, [%rd517+60]; + xor.b32 %r8462, %r8460, %r8461; + xor.b32 %r29997, %r8458, %r8459; + mov.b64 %rd523, {%r29997, %r8462}; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + st.local.v2.u32 [%rd2+32], {%r30003, %r8433}; + st.local.v2.u32 [%rd2+40], {%r29999, %r8438}; + st.local.v2.u32 [%rd2+48], {%r29995, %r8443}; + st.local.v2.u32 [%rd2+56], {%r29993, %r8448}; + st.local.v2.u32 [%rd2+64], {%r29989, %r29990}; + st.local.v2.u32 [%rd2+72], {%r30001, %r8457}; + st.local.v2.u32 [%rd2+80], {%r29997, %r8462}; + add.s64 %rd524, %rd55, %rd512; + xor.b32 %r8463, %r243, %r29954; + mul.lo.s32 %r8464, %r8463, 16777619; + ld.local.u32 %r8465, [%rd524]; + xor.b32 %r8466, %r8464, %r8465; + mul.wide.u32 %rd525, %r8466, -954391867; + shr.u64 %rd526, %rd525, 32; + cvt.u32.u64 %r8467, %rd526; + sub.s32 %r8468, %r8466, %r8467; + shr.u32 %r8469, %r8468, 1; + add.s32 %r8470, %r8469, %r8467; + shr.u32 %r8471, %r8470, 20; + mul.lo.s32 %r8472, %r8471, 1179641; + sub.s32 %r8473, %r8466, %r8472; + mul.wide.u32 %rd527, %r8473, 64; + add.s64 %rd528, %rd1270, %rd527; + mul.lo.s32 %r8474, %r30042, 16777619; + ld.global.u32 %r8475, [%rd528]; + xor.b32 %r30042, %r8474, %r8475; + mul.lo.s32 %r8476, %r30043, 16777619; + ld.global.u32 %r8477, [%rd528+4]; + xor.b32 %r30043, %r8476, %r8477; + mul.lo.s32 %r8478, %r30054, 16777619; + ld.global.u32 %r8479, [%rd528+8]; + mul.lo.s32 %r8480, %r30055, 16777619; + ld.global.u32 
%r8481, [%rd528+12]; + xor.b32 %r8482, %r8480, %r8481; + xor.b32 %r30054, %r8478, %r8479; + mov.b64 %rd529, {%r30054, %r8482}; + mul.lo.s32 %r8483, %r30050, 16777619; + ld.global.u32 %r8484, [%rd528+16]; + mul.lo.s32 %r8485, %r30051, 16777619; + ld.global.u32 %r8486, [%rd528+20]; + xor.b32 %r8487, %r8485, %r8486; + xor.b32 %r30050, %r8483, %r8484; + mov.b64 %rd530, {%r30050, %r8487}; + mul.lo.s32 %r8488, %r30046, 16777619; + ld.global.u32 %r8489, [%rd528+24]; + mul.lo.s32 %r8490, %r30047, 16777619; + ld.global.u32 %r8491, [%rd528+28]; + xor.b32 %r8492, %r8490, %r8491; + xor.b32 %r30046, %r8488, %r8489; + mov.b64 %rd531, {%r30046, %r8492}; + mul.lo.s32 %r8493, %r30044, 16777619; + ld.global.u32 %r8494, [%rd528+32]; + mul.lo.s32 %r8495, %r30045, 16777619; + ld.global.u32 %r8496, [%rd528+36]; + xor.b32 %r8497, %r8495, %r8496; + xor.b32 %r30044, %r8493, %r8494; + mov.b64 %rd532, {%r30044, %r8497}; + mul.lo.s32 %r8498, %r30040, 16777619; + ld.global.u32 %r8499, [%rd528+40]; + xor.b32 %r30040, %r8498, %r8499; + mul.lo.s32 %r8500, %r30041, 16777619; + ld.global.u32 %r8501, [%rd528+44]; + xor.b32 %r30041, %r8500, %r8501; + mul.lo.s32 %r8502, %r30052, 16777619; + ld.global.u32 %r8503, [%rd528+48]; + mul.lo.s32 %r8504, %r30053, 16777619; + ld.global.u32 %r8505, [%rd528+52]; + xor.b32 %r8506, %r8504, %r8505; + xor.b32 %r30052, %r8502, %r8503; + mov.b64 %rd533, {%r30052, %r8506}; + mul.lo.s32 %r8507, %r30048, 16777619; + ld.global.u32 %r8508, [%rd528+56]; + mul.lo.s32 %r8509, %r30049, 16777619; + ld.global.u32 %r8510, [%rd528+60]; + xor.b32 %r8511, %r8509, %r8510; + xor.b32 %r30048, %r8507, %r8508; + mov.b64 %rd534, {%r30048, %r8511}; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + st.local.v2.u32 [%rd53+32], {%r30054, %r8482}; + st.local.v2.u32 [%rd53+40], {%r30050, %r8487}; + st.local.v2.u32 [%rd53+48], {%r30046, %r8492}; + st.local.v2.u32 [%rd53+56], {%r30044, %r8497}; + st.local.v2.u32 [%rd53+64], {%r30040, %r30041}; + st.local.v2.u32 [%rd53+72], {%r30052, %r8506}; + st.local.v2.u32 [%rd53+80], {%r30048, %r8511}; + add.s32 %r29954, %r29954, 1; + setp.lt.u32 %p16, %r29954, 512; + shr.u64 %rd535, %rd518, 32; + cvt.u32.u64 %r30004, %rd535; + shr.u64 %rd536, %rd519, 32; + cvt.u32.u64 %r30000, %rd536; + shr.u64 %rd537, %rd520, 32; + cvt.u32.u64 %r29996, %rd537; + shr.u64 %rd538, %rd521, 32; + cvt.u32.u64 %r29994, %rd538; + shr.u64 %rd539, %rd522, 32; + cvt.u32.u64 %r30002, %rd539; + shr.u64 %rd540, %rd523, 32; + cvt.u32.u64 %r29998, %rd540; + shr.u64 %rd541, %rd529, 32; + cvt.u32.u64 %r30055, %rd541; + shr.u64 %rd542, %rd530, 32; + cvt.u32.u64 %r30051, %rd542; + shr.u64 %rd543, %rd531, 32; + cvt.u32.u64 %r30047, %rd543; + shr.u64 %rd544, %rd532, 32; + cvt.u32.u64 %r30045, %rd544; + shr.u64 %rd545, %rd533, 32; + cvt.u32.u64 %r30053, %rd545; + shr.u64 %rd546, %rd534, 32; + cvt.u32.u64 %r30049, %rd546; + @%p16 bra $L__BB2_19; + + mov.u32 %r29955, 0; + st.local.v2.u32 [%rd2+96], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+104], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+112], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+120], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+128], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+136], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+144], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+152], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+160], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+168], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+176], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+184], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+192], {%r29955, %r29955}; + 
st.local.v2.u32 [%rd2+200], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+208], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+216], {%r29955, %r29955}; + mov.u32 %r29970, -2147483648; + mov.u32 %r8526, 1; + st.local.v2.u32 [%rd2+88], {%r8526, %r29970}; + mov.u32 %r29956, %r29955; + mov.u32 %r29957, %r29955; + mov.u32 %r29958, %r29955; + mov.u32 %r29959, %r29955; + mov.u32 %r29960, %r29955; + mov.u32 %r29961, %r29955; + mov.u32 %r29962, %r29955; + mov.u32 %r29963, %r29955; + mov.u32 %r29964, %r29955; + mov.u32 %r29965, %r29955; + mov.u32 %r29966, %r29955; + mov.u32 %r29967, %r29955; + mov.u32 %r29968, %r29955; + mov.u32 %r29969, %r8526; + mov.u32 %r29971, %r29955; + mov.u32 %r29972, %r29955; + mov.u32 %r29973, %r29955; + mov.u32 %r29974, %r29955; + mov.u32 %r29975, %r29955; + mov.u32 %r29976, %r29955; + mov.u32 %r29977, %r29955; + mov.u32 %r29978, %r29955; + mov.u32 %r29979, %r29955; + mov.u32 %r29980, %r29955; + mov.u32 %r29981, %r29955; + mov.u32 %r29982, %r29955; + mov.u32 %r29983, %r29955; + mov.u32 %r29984, %r29955; + mov.u32 %r29985, %r29955; + mov.u32 %r29986, %r29955; + mov.u32 %r29987, %r29955; + mov.u32 %r29988, %r29955; + mov.u32 %r30005, %r29955; + +$L__BB2_21: + // begin inline asm + // xor5 + lop3.b32 %r8553, %r29991, %r29989, %r29987, 0x96; + lop3.b32 %r8553, %r8553, %r29985, %r29983, 0x96; + lop3.b32 %r8554, %r29992, %r29990, %r29988, 0x96; + lop3.b32 %r8554, %r8554, %r29986, %r29984, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8565, %r30003, %r30001, %r29981, 0x96; + lop3.b32 %r8565, %r8565, %r29979, %r29977, 0x96; + lop3.b32 %r8566, %r30004, %r30002, %r29982, 0x96; + lop3.b32 %r8566, %r8566, %r29980, %r29978, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8577, %r29999, %r29997, %r29975, 0x96; + lop3.b32 %r8577, %r8577, %r29973, %r29971, 0x96; + lop3.b32 %r8578, %r30000, %r29998, %r29976, 0x96; + lop3.b32 %r8578, %r8578, %r29974, %r29972, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8589, %r29995, %r29969, %r29967, 0x96; + lop3.b32 %r8589, %r8589, %r29965, %r29963, 0x96; + lop3.b32 %r8590, %r29996, %r29970, %r29968, 0x96; + lop3.b32 %r8590, %r8590, %r29966, %r29964, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8601, %r29993, %r29961, %r29959, 0x96; + lop3.b32 %r8601, %r8601, %r29957, %r29955, 0x96; + lop3.b32 %r8602, %r29994, %r29962, %r29960, 0x96; + lop3.b32 %r8602, %r8602, %r29958, %r29956, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8613, %r8566, %r8565, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8617, %r8565, %r8566, %r8526; + // end inline asm + xor.b32 %r9047, %r8613, %r8601; + xor.b32 %r9048, %r8617, %r8602; + xor.b32 %r8880, %r29991, %r9047; + xor.b32 %r8883, %r29992, %r9048; + xor.b32 %r8787, %r29989, %r9047; + xor.b32 %r8786, %r29990, %r9048; + xor.b32 %r8834, %r29987, %r9047; + xor.b32 %r8835, %r29988, %r9048; + xor.b32 %r8739, %r29985, %r9047; + xor.b32 %r8738, %r29986, %r9048; + xor.b32 %r8690, %r29983, %r9047; + xor.b32 %r8691, %r29984, %r9048; + // begin inline asm + shf.l.wrap.b32 %r8621, %r8578, %r8577, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8625, %r8577, %r8578, %r8526; + // end inline asm + xor.b32 %r9049, %r8621, %r8553; + xor.b32 %r9050, %r8625, %r8554; + xor.b32 %r8842, %r30003, %r9049; + xor.b32 %r8843, %r30004, %r9050; + xor.b32 %r8659, %r30001, %r9049; + xor.b32 %r8658, %r30002, %r9050; + xor.b32 %r8818, %r29981, %r9049; + xor.b32 %r8819, %r29982, %r9050; + xor.b32 %r8779, 
%r29979, %r9049; + xor.b32 %r8778, %r29980, %r9050; + xor.b32 %r8762, %r29977, %r9049; + xor.b32 %r8763, %r29978, %r9050; + // begin inline asm + shf.l.wrap.b32 %r8629, %r8590, %r8589, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8633, %r8589, %r8590, %r8526; + // end inline asm + xor.b32 %r9051, %r8629, %r8565; + xor.b32 %r9052, %r8633, %r8566; + xor.b32 %r8699, %r29999, %r9051; + xor.b32 %r8698, %r30000, %r9052; + xor.b32 %r8826, %r29997, %r9051; + xor.b32 %r8827, %r29998, %r9052; + xor.b32 %r8707, %r29975, %r9051; + xor.b32 %r8706, %r29976, %r9052; + xor.b32 %r8810, %r29973, %r9051; + xor.b32 %r8811, %r29974, %r9052; + xor.b32 %r8675, %r29971, %r9051; + xor.b32 %r8674, %r29972, %r9052; + // begin inline asm + shf.l.wrap.b32 %r8637, %r8602, %r8601, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8641, %r8601, %r8602, %r8526; + // end inline asm + xor.b32 %r9053, %r8637, %r8577; + xor.b32 %r9054, %r8641, %r8578; + xor.b32 %r8794, %r29995, %r9053; + xor.b32 %r8795, %r29996, %r9054; + xor.b32 %r8771, %r29969, %r9053; + xor.b32 %r8770, %r29970, %r9054; + xor.b32 %r8714, %r29967, %r9053; + xor.b32 %r8715, %r29968, %r9054; + xor.b32 %r8802, %r29965, %r9053; + xor.b32 %r8803, %r29966, %r9054; + xor.b32 %r8731, %r29963, %r9053; + xor.b32 %r8730, %r29964, %r9054; + // begin inline asm + shf.l.wrap.b32 %r8645, %r8554, %r8553, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8649, %r8553, %r8554, %r8526; + // end inline asm + xor.b32 %r9055, %r8645, %r8589; + xor.b32 %r9056, %r8649, %r8590; + xor.b32 %r8746, %r29993, %r9055; + xor.b32 %r8747, %r29994, %r9056; + xor.b32 %r8666, %r29961, %r9055; + xor.b32 %r8667, %r29962, %r9056; + xor.b32 %r8683, %r29959, %r9055; + xor.b32 %r8682, %r29960, %r9056; + xor.b32 %r8722, %r29957, %r9055; + xor.b32 %r8723, %r29958, %r9056; + xor.b32 %r8754, %r29955, %r9055; + xor.b32 %r8755, %r29956, %r9056; + mov.u32 %r8660, 44; + // begin inline asm + shf.l.wrap.b32 %r8653, %r8659, %r8658, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8657, %r8658, %r8659, %r8660; + // end inline asm + mov.u32 %r8668, 20; + // begin inline asm + shf.l.wrap.b32 %r8661, %r8667, %r8666, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8665, %r8666, %r8667, %r8668; + // end inline asm + mov.u32 %r8676, 61; + // begin inline asm + shf.l.wrap.b32 %r8669, %r8675, %r8674, %r8676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8673, %r8674, %r8675, %r8676; + // end inline asm + mov.u32 %r8684, 39; + // begin inline asm + shf.l.wrap.b32 %r8677, %r8683, %r8682, %r8684; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8681, %r8682, %r8683, %r8684; + // end inline asm + mov.u32 %r8692, 18; + // begin inline asm + shf.l.wrap.b32 %r8685, %r8691, %r8690, %r8692; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8689, %r8690, %r8691, %r8692; + // end inline asm + mov.u32 %r8700, 62; + // begin inline asm + shf.l.wrap.b32 %r8693, %r8699, %r8698, %r8700; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8697, %r8698, %r8699, %r8700; + // end inline asm + mov.u32 %r8708, 43; + // begin inline asm + shf.l.wrap.b32 %r8701, %r8707, %r8706, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8705, %r8706, %r8707, %r8708; + // end inline asm + mov.u32 %r8716, 25; + // begin inline asm + shf.l.wrap.b32 %r8709, %r8715, %r8714, %r8716; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8713, %r8714, %r8715, %r8716; + // end inline asm 
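+ // Note: these paired shf.l.wrap.b32 funnel shifts emulate 64-bit rotates
+ // on 32-bit register halves; the rotation amounts (1, 44, 20, 61, 39, 18,
+ // 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3)
+ // are the Keccak rho offsets. Together with the xor5 blocks (theta column
+ // parity) and the 0xD2 lop3 blocks (chi, a ^ (~b & c)), each of the
+ // $L__BB2_17 / $L__BB2_21 / $L__BB2_23 loop bodies is one round of
+ // Keccak-f[1600], with the round constant xor'd in (iota) from the table
+ // at %rd480 at the end of the iteration.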
+ mov.u32 %r8724, 8; + // begin inline asm + shf.l.wrap.b32 %r8717, %r8723, %r8722, %r8724; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8721, %r8722, %r8723, %r8724; + // end inline asm + mov.u32 %r8732, 56; + // begin inline asm + shf.l.wrap.b32 %r8725, %r8731, %r8730, %r8732; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8729, %r8730, %r8731, %r8732; + // end inline asm + mov.u32 %r8740, 41; + // begin inline asm + shf.l.wrap.b32 %r8733, %r8739, %r8738, %r8740; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8737, %r8738, %r8739, %r8740; + // end inline asm + mov.u32 %r8748, 27; + // begin inline asm + shf.l.wrap.b32 %r8741, %r8747, %r8746, %r8748; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8745, %r8746, %r8747, %r8748; + // end inline asm + mov.u32 %r8756, 14; + // begin inline asm + shf.l.wrap.b32 %r8749, %r8755, %r8754, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8753, %r8754, %r8755, %r8756; + // end inline asm + mov.u32 %r8764, 2; + // begin inline asm + shf.l.wrap.b32 %r8757, %r8763, %r8762, %r8764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8761, %r8762, %r8763, %r8764; + // end inline asm + mov.u32 %r8772, 55; + // begin inline asm + shf.l.wrap.b32 %r8765, %r8771, %r8770, %r8772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8769, %r8770, %r8771, %r8772; + // end inline asm + mov.u32 %r8780, 45; + // begin inline asm + shf.l.wrap.b32 %r8773, %r8779, %r8778, %r8780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8777, %r8778, %r8779, %r8780; + // end inline asm + mov.u32 %r8788, 36; + // begin inline asm + shf.l.wrap.b32 %r8781, %r8787, %r8786, %r8788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8785, %r8786, %r8787, %r8788; + // end inline asm + mov.u32 %r8796, 28; + // begin inline asm + shf.l.wrap.b32 %r8789, %r8795, %r8794, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8793, %r8794, %r8795, %r8796; + // end inline asm + mov.u32 %r8804, 21; + // begin inline asm + shf.l.wrap.b32 %r8797, %r8803, %r8802, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8801, %r8802, %r8803, %r8804; + // end inline asm + mov.u32 %r8812, 15; + // begin inline asm + shf.l.wrap.b32 %r8805, %r8811, %r8810, %r8812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8809, %r8810, %r8811, %r8812; + // end inline asm + mov.u32 %r8820, 10; + // begin inline asm + shf.l.wrap.b32 %r8813, %r8819, %r8818, %r8820; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8817, %r8818, %r8819, %r8820; + // end inline asm + mov.u32 %r8828, 6; + // begin inline asm + shf.l.wrap.b32 %r8821, %r8827, %r8826, %r8828; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8825, %r8826, %r8827, %r8828; + // end inline asm + mov.u32 %r8836, 3; + // begin inline asm + shf.l.wrap.b32 %r8829, %r8835, %r8834, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8833, %r8834, %r8835, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8837, %r8843, %r8842, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8841, %r8842, %r8843, %r8526; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r8845, %r8880, %r8653, %r8701, 0xD2; + lop3.b32 %r8846, %r8883, %r8657, %r8705, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30003, %r8653, %r8701, %r8797, 0xD2; + lop3.b32 %r30004, %r8657, %r8705, %r8801, 0xD2; + // end inline asm + // begin 
inline asm + // chi + lop3.b32 %r29999, %r8701, %r8797, %r8749, 0xD2; + lop3.b32 %r30000, %r8705, %r8801, %r8753, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29995, %r8797, %r8749, %r8880, 0xD2; + lop3.b32 %r29996, %r8801, %r8753, %r8883, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29993, %r8749, %r8880, %r8653, 0xD2; + lop3.b32 %r29994, %r8753, %r8883, %r8657, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29989, %r8789, %r8661, %r8829, 0xD2; + lop3.b32 %r29990, %r8793, %r8665, %r8833, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30001, %r8661, %r8829, %r8773, 0xD2; + lop3.b32 %r30002, %r8665, %r8833, %r8777, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29997, %r8829, %r8773, %r8669, 0xD2; + lop3.b32 %r29998, %r8833, %r8777, %r8673, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29969, %r8773, %r8669, %r8789, 0xD2; + lop3.b32 %r29970, %r8777, %r8673, %r8793, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r29969, %r29970}; + // begin inline asm + // chi + lop3.b32 %r29961, %r8669, %r8789, %r8661, 0xD2; + lop3.b32 %r29962, %r8673, %r8793, %r8665, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r29961, %r29962}; + // begin inline asm + // chi + lop3.b32 %r29987, %r8837, %r8821, %r8709, 0xD2; + lop3.b32 %r29988, %r8841, %r8825, %r8713, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r29987, %r29988}; + // begin inline asm + // chi + lop3.b32 %r29981, %r8821, %r8709, %r8717, 0xD2; + lop3.b32 %r29982, %r8825, %r8713, %r8721, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r29981, %r29982}; + // begin inline asm + // chi + lop3.b32 %r29975, %r8709, %r8717, %r8685, 0xD2; + lop3.b32 %r29976, %r8713, %r8721, %r8689, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r29975, %r29976}; + // begin inline asm + // chi + lop3.b32 %r29967, %r8717, %r8685, %r8837, 0xD2; + lop3.b32 %r29968, %r8721, %r8689, %r8841, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r29967, %r29968}; + // begin inline asm + // chi + lop3.b32 %r29959, %r8685, %r8837, %r8821, 0xD2; + lop3.b32 %r29960, %r8689, %r8841, %r8825, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r29959, %r29960}; + // begin inline asm + // chi + lop3.b32 %r29985, %r8741, %r8781, %r8813, 0xD2; + lop3.b32 %r29986, %r8745, %r8785, %r8817, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r29985, %r29986}; + // begin inline asm + // chi + lop3.b32 %r29979, %r8781, %r8813, %r8805, 0xD2; + lop3.b32 %r29980, %r8785, %r8817, %r8809, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r29979, %r29980}; + // begin inline asm + // chi + lop3.b32 %r29973, %r8813, %r8805, %r8725, 0xD2; + lop3.b32 %r29974, %r8817, %r8809, %r8729, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r29973, %r29974}; + // begin inline asm + // chi + lop3.b32 %r29965, %r8805, %r8725, %r8741, 0xD2; + lop3.b32 %r29966, %r8809, %r8729, %r8745, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r29965, %r29966}; + // begin inline asm + // chi + lop3.b32 %r29957, %r8725, %r8741, %r8781, 0xD2; + lop3.b32 %r29958, %r8729, %r8745, %r8785, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r29957, %r29958}; + // begin inline asm + // chi + lop3.b32 %r29983, %r8693, %r8765, %r8677, 0xD2; + lop3.b32 %r29984, %r8697, %r8769, %r8681, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r29983, %r29984}; + // begin inline asm + // chi + lop3.b32 
%r29977, %r8765, %r8677, %r8733, 0xD2; + lop3.b32 %r29978, %r8769, %r8681, %r8737, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r29977, %r29978}; + // begin inline asm + // chi + lop3.b32 %r29971, %r8677, %r8733, %r8757, 0xD2; + lop3.b32 %r29972, %r8681, %r8737, %r8761, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r29971, %r29972}; + // begin inline asm + // chi + lop3.b32 %r29963, %r8733, %r8757, %r8693, 0xD2; + lop3.b32 %r29964, %r8737, %r8761, %r8697, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r29963, %r29964}; + // begin inline asm + // chi + lop3.b32 %r29955, %r8757, %r8693, %r8765, 0xD2; + lop3.b32 %r29956, %r8761, %r8697, %r8769, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r29955, %r29956}; + mul.wide.s32 %rd548, %r30005, 8; + add.s64 %rd547, %rd480, %rd548; + // begin inline asm + ld.global.nc.v2.u32 {%r9045,%r9046}, [%rd547]; + // end inline asm + xor.b32 %r29991, %r8845, %r9045; + xor.b32 %r29992, %r8846, %r9046; + add.s32 %r30005, %r30005, 1; + setp.lt.u32 %p17, %r30005, 23; + @%p17 bra $L__BB2_21; + + st.local.v2.u32 [%rd2+32], {%r30003, %r30004}; + st.local.v2.u32 [%rd2+72], {%r30001, %r30002}; + st.local.v2.u32 [%rd2+40], {%r29999, %r30000}; + st.local.v2.u32 [%rd2+80], {%r29997, %r29998}; + st.local.v2.u32 [%rd2+48], {%r29995, %r29996}; + st.local.v2.u32 [%rd2+56], {%r29993, %r29994}; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + // begin inline asm + // xor5 + lop3.b32 %r9057, %r29991, %r29989, %r29987, 0x96; + lop3.b32 %r9057, %r9057, %r29985, %r29983, 0x96; + lop3.b32 %r9058, %r29992, %r29990, %r29988, 0x96; + lop3.b32 %r9058, %r9058, %r29986, %r29984, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9069, %r30003, %r30001, %r29981, 0x96; + lop3.b32 %r9069, %r9069, %r29979, %r29977, 0x96; + lop3.b32 %r9070, %r30004, %r30002, %r29982, 0x96; + lop3.b32 %r9070, %r9070, %r29980, %r29978, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9081, %r29999, %r29997, %r29975, 0x96; + lop3.b32 %r9081, %r9081, %r29973, %r29971, 0x96; + lop3.b32 %r9082, %r30000, %r29998, %r29976, 0x96; + lop3.b32 %r9082, %r9082, %r29974, %r29972, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9093, %r29995, %r29969, %r29967, 0x96; + lop3.b32 %r9093, %r9093, %r29965, %r29963, 0x96; + lop3.b32 %r9094, %r29996, %r29970, %r29968, 0x96; + lop3.b32 %r9094, %r9094, %r29966, %r29964, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9105, %r29993, %r29961, %r29959, 0x96; + lop3.b32 %r9105, %r9105, %r29957, %r29955, 0x96; + lop3.b32 %r9106, %r29994, %r29962, %r29960, 0x96; + lop3.b32 %r9106, %r9106, %r29958, %r29956, 0x96; + // end inline asm + mov.u32 %r9309, 1; + // begin inline asm + shf.l.wrap.b32 %r9117, %r9070, %r9069, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9121, %r9069, %r9070, %r9309; + // end inline asm + xor.b32 %r9336, %r9117, %r9105; + xor.b32 %r9337, %r9121, %r9106; + xor.b32 %r9264, %r29991, %r9336; + xor.b32 %r9267, %r29992, %r9337; + xor.b32 %r9227, %r29988, %r9337; + xor.b32 %r9226, %r29987, %r9336; + st.local.v2.u32 [%rd2+104], {%r9226, %r9227}; + // begin inline asm + shf.l.wrap.b32 %r9125, %r9082, %r9081, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9129, %r9081, %r9082, %r9309; + // end inline asm + xor.b32 %r9338, %r9125, %r9057; + xor.b32 %r9339, %r9129, %r9058; + xor.b32 %r9163, %r30001, %r9338; + xor.b32 %r9162, %r30002, %r9339; + xor.b32 %r9202, %r29980, %r9339; + xor.b32 %r9203, 
%r29979, %r9338; + st.local.v2.u32 [%rd2+152], {%r9203, %r9202}; + // begin inline asm + shf.l.wrap.b32 %r9133, %r9094, %r9093, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9137, %r9093, %r9094, %r9309; + // end inline asm + xor.b32 %r9340, %r9133, %r9069; + xor.b32 %r9341, %r9137, %r9070; + xor.b32 %r9186, %r29976, %r9341; + xor.b32 %r9187, %r29975, %r9340; + st.local.v2.u32 [%rd2+120], {%r9187, %r9186}; + xor.b32 %r9178, %r29972, %r9341; + xor.b32 %r9179, %r29971, %r9340; + st.local.v2.u32 [%rd2+200], {%r9179, %r9178}; + // begin inline asm + shf.l.wrap.b32 %r9141, %r9106, %r9105, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9145, %r9105, %r9106, %r9309; + // end inline asm + xor.b32 %r9342, %r9141, %r9081; + xor.b32 %r9343, %r9145, %r9082; + xor.b32 %r9210, %r29995, %r9342; + xor.b32 %r9211, %r29996, %r9343; + xor.b32 %r9219, %r29966, %r9343; + xor.b32 %r9218, %r29965, %r9342; + st.local.v2.u32 [%rd2+168], {%r9218, %r9219}; + // begin inline asm + shf.l.wrap.b32 %r9149, %r9058, %r9057, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9153, %r9057, %r9058, %r9309; + // end inline asm + xor.b32 %r9344, %r9149, %r9093; + xor.b32 %r9345, %r9153, %r9094; + xor.b32 %r9170, %r29961, %r9344; + xor.b32 %r9171, %r29962, %r9345; + xor.b32 %r9195, %r29956, %r9345; + xor.b32 %r9194, %r29955, %r9344; + st.local.v2.u32 [%rd2+216], {%r9194, %r9195}; + // begin inline asm + shf.l.wrap.b32 %r9157, %r9163, %r9162, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9161, %r9162, %r9163, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9165, %r9171, %r9170, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9169, %r9170, %r9171, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9177, %r9178, %r9179, %r8676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9173, %r9179, %r9178, %r8676; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r9173, %r9177}; + // begin inline asm + shf.l.wrap.b32 %r9181, %r9187, %r9186, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9185, %r9186, %r9187, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9189, %r9195, %r9194, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9193, %r9194, %r9195, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9201, %r9202, %r9203, %r8780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9197, %r9203, %r9202, %r8780; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r9197, %r9201}; + // begin inline asm + shf.l.wrap.b32 %r9205, %r9211, %r9210, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9209, %r9210, %r9211, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9213, %r9219, %r9218, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9217, %r9218, %r9219, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9221, %r9227, %r9226, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9225, %r9226, %r9227, %r8836; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9229, %r9264, %r9157, %r9181, 0xD2; + lop3.b32 %r9230, %r9267, %r9161, %r9185, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9237, %r9157, %r9181, %r9213, 0xD2; + lop3.b32 %r9238, %r9161, %r9185, %r9217, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r9237, %r9238}; + // begin inline asm + // chi + lop3.b32 %r9245, 
%r9181, %r9213, %r9189, 0xD2; + lop3.b32 %r9246, %r9185, %r9217, %r9193, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r9245, %r9246}; + // begin inline asm + // chi + lop3.b32 %r9253, %r9213, %r9189, %r9264, 0xD2; + lop3.b32 %r9254, %r9217, %r9193, %r9267, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r9253, %r9254}; + // begin inline asm + // chi + lop3.b32 %r9261, %r9189, %r9264, %r9157, 0xD2; + lop3.b32 %r9262, %r9193, %r9267, %r9161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r9261, %r9262}; + // begin inline asm + // chi + lop3.b32 %r9269, %r9205, %r9165, %r9221, 0xD2; + lop3.b32 %r9270, %r9209, %r9169, %r9225, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r9269, %r9270}; + // begin inline asm + // chi + lop3.b32 %r9277, %r9165, %r9221, %r9197, 0xD2; + lop3.b32 %r9278, %r9169, %r9225, %r9201, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r9277, %r9278}; + // begin inline asm + // chi + lop3.b32 %r9285, %r9221, %r9197, %r9173, 0xD2; + lop3.b32 %r9286, %r9225, %r9201, %r9177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r9285, %r9286}; + // begin inline asm + ld.global.nc.v2.u32 {%r9293,%r9294}, [%rd481]; + // end inline asm + xor.b32 %r9346, %r9230, %r9294; + xor.b32 %r9347, %r9229, %r9293; + mov.b64 %rd1317, {%r9347, %r9346}; + mov.b64 %rd1318, {%r9237, %r9238}; + mov.b64 %rd1319, {%r9245, %r9246}; + mov.b64 %rd1320, {%r9253, %r9254}; + mov.b64 %rd1321, {%r9261, %r9262}; + mov.b64 %rd1322, {%r9269, %r9270}; + mov.b64 %rd1323, {%r9277, %r9278}; + mov.b64 %rd1324, {%r9285, %r9286}; + mov.u32 %r30006, 0; + st.local.v2.u32 [%rd2+24], {%r9347, %r9346}; + st.local.v2.u32 [%rd53+96], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+104], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+112], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+120], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+128], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+136], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+144], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+152], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+160], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+168], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+176], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+184], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+192], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+200], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+208], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+216], {%r30006, %r30006}; + mov.u32 %r30021, -2147483648; + st.local.v2.u32 [%rd53+88], {%r9309, %r30021}; + mov.u32 %r30007, %r30006; + mov.u32 %r30008, %r30006; + mov.u32 %r30009, %r30006; + mov.u32 %r30010, %r30006; + mov.u32 %r30011, %r30006; + mov.u32 %r30012, %r30006; + mov.u32 %r30013, %r30006; + mov.u32 %r30014, %r30006; + mov.u32 %r30015, %r30006; + mov.u32 %r30016, %r30006; + mov.u32 %r30017, %r30006; + mov.u32 %r30018, %r30006; + mov.u32 %r30019, %r30006; + mov.u32 %r30020, %r9309; + mov.u32 %r30022, %r30006; + mov.u32 %r30023, %r30006; + mov.u32 %r30024, %r30006; + mov.u32 %r30025, %r30006; + mov.u32 %r30026, %r30006; + mov.u32 %r30027, %r30006; + mov.u32 %r30028, %r30006; + mov.u32 %r30029, %r30006; + mov.u32 %r30030, %r30006; + mov.u32 %r30031, %r30006; + mov.u32 %r30032, %r30006; + mov.u32 %r30033, %r30006; + mov.u32 %r30034, %r30006; + mov.u32 %r30035, %r30006; + mov.u32 %r30036, %r30006; + mov.u32 %r30037, %r30006; + mov.u32 %r30038, %r30006; + mov.u32 %r30039, %r30006; + mov.u32 %r30056, %r30006; + +$L__BB2_23: + // begin inline asm + // xor5 + lop3.b32 %r9348, %r30042, %r30040, 
%r30038, 0x96; + lop3.b32 %r9348, %r9348, %r30036, %r30034, 0x96; + lop3.b32 %r9349, %r30043, %r30041, %r30039, 0x96; + lop3.b32 %r9349, %r9349, %r30037, %r30035, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9360, %r30054, %r30052, %r30032, 0x96; + lop3.b32 %r9360, %r9360, %r30030, %r30028, 0x96; + lop3.b32 %r9361, %r30055, %r30053, %r30033, 0x96; + lop3.b32 %r9361, %r9361, %r30031, %r30029, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9372, %r30050, %r30048, %r30026, 0x96; + lop3.b32 %r9372, %r9372, %r30024, %r30022, 0x96; + lop3.b32 %r9373, %r30051, %r30049, %r30027, 0x96; + lop3.b32 %r9373, %r9373, %r30025, %r30023, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9384, %r30046, %r30020, %r30018, 0x96; + lop3.b32 %r9384, %r9384, %r30016, %r30014, 0x96; + lop3.b32 %r9385, %r30047, %r30021, %r30019, 0x96; + lop3.b32 %r9385, %r9385, %r30017, %r30015, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9396, %r30044, %r30012, %r30010, 0x96; + lop3.b32 %r9396, %r9396, %r30008, %r30006, 0x96; + lop3.b32 %r9397, %r30045, %r30013, %r30011, 0x96; + lop3.b32 %r9397, %r9397, %r30009, %r30007, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9408, %r9361, %r9360, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9412, %r9360, %r9361, %r9309; + // end inline asm + xor.b32 %r9842, %r9408, %r9396; + xor.b32 %r9843, %r9412, %r9397; + xor.b32 %r9675, %r30042, %r9842; + xor.b32 %r9678, %r30043, %r9843; + xor.b32 %r9582, %r30040, %r9842; + xor.b32 %r9581, %r30041, %r9843; + xor.b32 %r9629, %r30038, %r9842; + xor.b32 %r9630, %r30039, %r9843; + xor.b32 %r9534, %r30036, %r9842; + xor.b32 %r9533, %r30037, %r9843; + xor.b32 %r9485, %r30034, %r9842; + xor.b32 %r9486, %r30035, %r9843; + // begin inline asm + shf.l.wrap.b32 %r9416, %r9373, %r9372, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9420, %r9372, %r9373, %r9309; + // end inline asm + xor.b32 %r9844, %r9416, %r9348; + xor.b32 %r9845, %r9420, %r9349; + xor.b32 %r9637, %r30054, %r9844; + xor.b32 %r9638, %r30055, %r9845; + xor.b32 %r9454, %r30052, %r9844; + xor.b32 %r9453, %r30053, %r9845; + xor.b32 %r9613, %r30032, %r9844; + xor.b32 %r9614, %r30033, %r9845; + xor.b32 %r9574, %r30030, %r9844; + xor.b32 %r9573, %r30031, %r9845; + xor.b32 %r9557, %r30028, %r9844; + xor.b32 %r9558, %r30029, %r9845; + // begin inline asm + shf.l.wrap.b32 %r9424, %r9385, %r9384, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9428, %r9384, %r9385, %r9309; + // end inline asm + xor.b32 %r9846, %r9424, %r9360; + xor.b32 %r9847, %r9428, %r9361; + xor.b32 %r9494, %r30050, %r9846; + xor.b32 %r9493, %r30051, %r9847; + xor.b32 %r9621, %r30048, %r9846; + xor.b32 %r9622, %r30049, %r9847; + xor.b32 %r9502, %r30026, %r9846; + xor.b32 %r9501, %r30027, %r9847; + xor.b32 %r9605, %r30024, %r9846; + xor.b32 %r9606, %r30025, %r9847; + xor.b32 %r9470, %r30022, %r9846; + xor.b32 %r9469, %r30023, %r9847; + // begin inline asm + shf.l.wrap.b32 %r9432, %r9397, %r9396, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9436, %r9396, %r9397, %r9309; + // end inline asm + xor.b32 %r9848, %r9432, %r9372; + xor.b32 %r9849, %r9436, %r9373; + xor.b32 %r9589, %r30046, %r9848; + xor.b32 %r9590, %r30047, %r9849; + xor.b32 %r9566, %r30020, %r9848; + xor.b32 %r9565, %r30021, %r9849; + xor.b32 %r9509, %r30018, %r9848; + xor.b32 %r9510, %r30019, %r9849; + xor.b32 %r9597, %r30016, %r9848; + xor.b32 %r9598, %r30017, %r9849; + 
xor.b32 %r9526, %r30014, %r9848; + xor.b32 %r9525, %r30015, %r9849; + // begin inline asm + shf.l.wrap.b32 %r9440, %r9349, %r9348, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9444, %r9348, %r9349, %r9309; + // end inline asm + xor.b32 %r9850, %r9440, %r9384; + xor.b32 %r9851, %r9444, %r9385; + xor.b32 %r9541, %r30044, %r9850; + xor.b32 %r9542, %r30045, %r9851; + xor.b32 %r9461, %r30012, %r9850; + xor.b32 %r9462, %r30013, %r9851; + xor.b32 %r9478, %r30010, %r9850; + xor.b32 %r9477, %r30011, %r9851; + xor.b32 %r9517, %r30008, %r9850; + xor.b32 %r9518, %r30009, %r9851; + xor.b32 %r9549, %r30006, %r9850; + xor.b32 %r9550, %r30007, %r9851; + mov.u32 %r9455, 44; + // begin inline asm + shf.l.wrap.b32 %r9448, %r9454, %r9453, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9452, %r9453, %r9454, %r9455; + // end inline asm + mov.u32 %r9463, 20; + // begin inline asm + shf.l.wrap.b32 %r9456, %r9462, %r9461, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9460, %r9461, %r9462, %r9463; + // end inline asm + mov.u32 %r9471, 61; + // begin inline asm + shf.l.wrap.b32 %r9464, %r9470, %r9469, %r9471; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9468, %r9469, %r9470, %r9471; + // end inline asm + mov.u32 %r9479, 39; + // begin inline asm + shf.l.wrap.b32 %r9472, %r9478, %r9477, %r9479; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9476, %r9477, %r9478, %r9479; + // end inline asm + mov.u32 %r9487, 18; + // begin inline asm + shf.l.wrap.b32 %r9480, %r9486, %r9485, %r9487; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9484, %r9485, %r9486, %r9487; + // end inline asm + mov.u32 %r9495, 62; + // begin inline asm + shf.l.wrap.b32 %r9488, %r9494, %r9493, %r9495; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9492, %r9493, %r9494, %r9495; + // end inline asm + mov.u32 %r9503, 43; + // begin inline asm + shf.l.wrap.b32 %r9496, %r9502, %r9501, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9500, %r9501, %r9502, %r9503; + // end inline asm + mov.u32 %r9511, 25; + // begin inline asm + shf.l.wrap.b32 %r9504, %r9510, %r9509, %r9511; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9508, %r9509, %r9510, %r9511; + // end inline asm + mov.u32 %r9519, 8; + // begin inline asm + shf.l.wrap.b32 %r9512, %r9518, %r9517, %r9519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9516, %r9517, %r9518, %r9519; + // end inline asm + mov.u32 %r9527, 56; + // begin inline asm + shf.l.wrap.b32 %r9520, %r9526, %r9525, %r9527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9524, %r9525, %r9526, %r9527; + // end inline asm + mov.u32 %r9535, 41; + // begin inline asm + shf.l.wrap.b32 %r9528, %r9534, %r9533, %r9535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9532, %r9533, %r9534, %r9535; + // end inline asm + mov.u32 %r9543, 27; + // begin inline asm + shf.l.wrap.b32 %r9536, %r9542, %r9541, %r9543; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9540, %r9541, %r9542, %r9543; + // end inline asm + mov.u32 %r9551, 14; + // begin inline asm + shf.l.wrap.b32 %r9544, %r9550, %r9549, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9548, %r9549, %r9550, %r9551; + // end inline asm + mov.u32 %r9559, 2; + // begin inline asm + shf.l.wrap.b32 %r9552, %r9558, %r9557, %r9559; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9556, %r9557, %r9558, %r9559; + // end inline asm + mov.u32 %r9567, 55; + // begin 
inline asm + shf.l.wrap.b32 %r9560, %r9566, %r9565, %r9567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9564, %r9565, %r9566, %r9567; + // end inline asm + mov.u32 %r9575, 45; + // begin inline asm + shf.l.wrap.b32 %r9568, %r9574, %r9573, %r9575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9572, %r9573, %r9574, %r9575; + // end inline asm + mov.u32 %r9583, 36; + // begin inline asm + shf.l.wrap.b32 %r9576, %r9582, %r9581, %r9583; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9580, %r9581, %r9582, %r9583; + // end inline asm + mov.u32 %r9591, 28; + // begin inline asm + shf.l.wrap.b32 %r9584, %r9590, %r9589, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9588, %r9589, %r9590, %r9591; + // end inline asm + mov.u32 %r9599, 21; + // begin inline asm + shf.l.wrap.b32 %r9592, %r9598, %r9597, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9596, %r9597, %r9598, %r9599; + // end inline asm + mov.u32 %r9607, 15; + // begin inline asm + shf.l.wrap.b32 %r9600, %r9606, %r9605, %r9607; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9604, %r9605, %r9606, %r9607; + // end inline asm + mov.u32 %r9615, 10; + // begin inline asm + shf.l.wrap.b32 %r9608, %r9614, %r9613, %r9615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9612, %r9613, %r9614, %r9615; + // end inline asm + mov.u32 %r9623, 6; + // begin inline asm + shf.l.wrap.b32 %r9616, %r9622, %r9621, %r9623; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9620, %r9621, %r9622, %r9623; + // end inline asm + mov.u32 %r9631, 3; + // begin inline asm + shf.l.wrap.b32 %r9624, %r9630, %r9629, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9628, %r9629, %r9630, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9632, %r9638, %r9637, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9636, %r9637, %r9638, %r9309; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9640, %r9675, %r9448, %r9496, 0xD2; + lop3.b32 %r9641, %r9678, %r9452, %r9500, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30054, %r9448, %r9496, %r9592, 0xD2; + lop3.b32 %r30055, %r9452, %r9500, %r9596, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30050, %r9496, %r9592, %r9544, 0xD2; + lop3.b32 %r30051, %r9500, %r9596, %r9548, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30046, %r9592, %r9544, %r9675, 0xD2; + lop3.b32 %r30047, %r9596, %r9548, %r9678, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30044, %r9544, %r9675, %r9448, 0xD2; + lop3.b32 %r30045, %r9548, %r9678, %r9452, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30040, %r9584, %r9456, %r9624, 0xD2; + lop3.b32 %r30041, %r9588, %r9460, %r9628, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30052, %r9456, %r9624, %r9568, 0xD2; + lop3.b32 %r30053, %r9460, %r9628, %r9572, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30048, %r9624, %r9568, %r9464, 0xD2; + lop3.b32 %r30049, %r9628, %r9572, %r9468, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30020, %r9568, %r9464, %r9584, 0xD2; + lop3.b32 %r30021, %r9572, %r9468, %r9588, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r30020, %r30021}; + // begin inline asm + // chi + lop3.b32 %r30012, %r9464, %r9584, %r9456, 0xD2; + lop3.b32 %r30013, %r9468, %r9588, %r9460, 0xD2; + // end inline asm + 
st.local.v2.u32 [%rd53+96], {%r30012, %r30013}; + // begin inline asm + // chi + lop3.b32 %r30038, %r9632, %r9616, %r9504, 0xD2; + lop3.b32 %r30039, %r9636, %r9620, %r9508, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+104], {%r30038, %r30039}; + // begin inline asm + // chi + lop3.b32 %r30032, %r9616, %r9504, %r9512, 0xD2; + lop3.b32 %r30033, %r9620, %r9508, %r9516, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+112], {%r30032, %r30033}; + // begin inline asm + // chi + lop3.b32 %r30026, %r9504, %r9512, %r9480, 0xD2; + lop3.b32 %r30027, %r9508, %r9516, %r9484, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+120], {%r30026, %r30027}; + // begin inline asm + // chi + lop3.b32 %r30018, %r9512, %r9480, %r9632, 0xD2; + lop3.b32 %r30019, %r9516, %r9484, %r9636, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+128], {%r30018, %r30019}; + // begin inline asm + // chi + lop3.b32 %r30010, %r9480, %r9632, %r9616, 0xD2; + lop3.b32 %r30011, %r9484, %r9636, %r9620, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+136], {%r30010, %r30011}; + // begin inline asm + // chi + lop3.b32 %r30036, %r9536, %r9576, %r9608, 0xD2; + lop3.b32 %r30037, %r9540, %r9580, %r9612, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+144], {%r30036, %r30037}; + // begin inline asm + // chi + lop3.b32 %r30030, %r9576, %r9608, %r9600, 0xD2; + lop3.b32 %r30031, %r9580, %r9612, %r9604, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+152], {%r30030, %r30031}; + // begin inline asm + // chi + lop3.b32 %r30024, %r9608, %r9600, %r9520, 0xD2; + lop3.b32 %r30025, %r9612, %r9604, %r9524, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+160], {%r30024, %r30025}; + // begin inline asm + // chi + lop3.b32 %r30016, %r9600, %r9520, %r9536, 0xD2; + lop3.b32 %r30017, %r9604, %r9524, %r9540, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+168], {%r30016, %r30017}; + // begin inline asm + // chi + lop3.b32 %r30008, %r9520, %r9536, %r9576, 0xD2; + lop3.b32 %r30009, %r9524, %r9540, %r9580, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+176], {%r30008, %r30009}; + // begin inline asm + // chi + lop3.b32 %r30034, %r9488, %r9560, %r9472, 0xD2; + lop3.b32 %r30035, %r9492, %r9564, %r9476, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+184], {%r30034, %r30035}; + // begin inline asm + // chi + lop3.b32 %r30028, %r9560, %r9472, %r9528, 0xD2; + lop3.b32 %r30029, %r9564, %r9476, %r9532, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+192], {%r30028, %r30029}; + // begin inline asm + // chi + lop3.b32 %r30022, %r9472, %r9528, %r9552, 0xD2; + lop3.b32 %r30023, %r9476, %r9532, %r9556, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+200], {%r30022, %r30023}; + // begin inline asm + // chi + lop3.b32 %r30014, %r9528, %r9552, %r9488, 0xD2; + lop3.b32 %r30015, %r9532, %r9556, %r9492, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+208], {%r30014, %r30015}; + // begin inline asm + // chi + lop3.b32 %r30006, %r9552, %r9488, %r9560, 0xD2; + lop3.b32 %r30007, %r9556, %r9492, %r9564, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+216], {%r30006, %r30007}; + mul.wide.s32 %rd555, %r30056, 8; + add.s64 %rd554, %rd480, %rd555; + // begin inline asm + ld.global.nc.v2.u32 {%r9840,%r9841}, [%rd554]; + // end inline asm + xor.b32 %r30042, %r9640, %r9840; + xor.b32 %r30043, %r9641, %r9841; + add.s32 %r30056, %r30056, 1; + setp.lt.u32 %p18, %r30056, 23; + @%p18 bra $L__BB2_23; + + mov.u32 %r9951, 1; + st.local.v2.u32 [%rd53+32], {%r30054, %r30055}; + st.local.v2.u32 [%rd53+72], {%r30052, %r30053}; + st.local.v2.u32 [%rd53+40], 
{%r30050, %r30051}; + st.local.v2.u32 [%rd53+80], {%r30048, %r30049}; + st.local.v2.u32 [%rd53+48], {%r30046, %r30047}; + st.local.v2.u32 [%rd53+56], {%r30044, %r30045}; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + // begin inline asm + // xor5 + lop3.b32 %r9852, %r30042, %r30040, %r30038, 0x96; + lop3.b32 %r9852, %r9852, %r30036, %r30034, 0x96; + lop3.b32 %r9853, %r30043, %r30041, %r30039, 0x96; + lop3.b32 %r9853, %r9853, %r30037, %r30035, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9864, %r30054, %r30052, %r30032, 0x96; + lop3.b32 %r9864, %r9864, %r30030, %r30028, 0x96; + lop3.b32 %r9865, %r30055, %r30053, %r30033, 0x96; + lop3.b32 %r9865, %r9865, %r30031, %r30029, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9876, %r30050, %r30048, %r30026, 0x96; + lop3.b32 %r9876, %r9876, %r30024, %r30022, 0x96; + lop3.b32 %r9877, %r30051, %r30049, %r30027, 0x96; + lop3.b32 %r9877, %r9877, %r30025, %r30023, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9888, %r30046, %r30020, %r30018, 0x96; + lop3.b32 %r9888, %r9888, %r30016, %r30014, 0x96; + lop3.b32 %r9889, %r30047, %r30021, %r30019, 0x96; + lop3.b32 %r9889, %r9889, %r30017, %r30015, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9900, %r30044, %r30012, %r30010, 0x96; + lop3.b32 %r9900, %r9900, %r30008, %r30006, 0x96; + lop3.b32 %r9901, %r30045, %r30013, %r30011, 0x96; + lop3.b32 %r9901, %r9901, %r30009, %r30007, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9912, %r9865, %r9864, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9916, %r9864, %r9865, %r9951; + // end inline asm + xor.b32 %r10090, %r9912, %r9900; + xor.b32 %r10091, %r9916, %r9901; + xor.b32 %r10059, %r30042, %r10090; + xor.b32 %r10062, %r30043, %r10091; + xor.b32 %r10022, %r30039, %r10091; + xor.b32 %r10021, %r30038, %r10090; + st.local.v2.u32 [%rd53+104], {%r10021, %r10022}; + // begin inline asm + shf.l.wrap.b32 %r9920, %r9877, %r9876, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9924, %r9876, %r9877, %r9951; + // end inline asm + xor.b32 %r10092, %r9920, %r9852; + xor.b32 %r10093, %r9924, %r9853; + xor.b32 %r9958, %r30052, %r10092; + xor.b32 %r9957, %r30053, %r10093; + xor.b32 %r9997, %r30031, %r10093; + xor.b32 %r9998, %r30030, %r10092; + st.local.v2.u32 [%rd53+152], {%r9998, %r9997}; + // begin inline asm + shf.l.wrap.b32 %r9928, %r9889, %r9888, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9932, %r9888, %r9889, %r9951; + // end inline asm + xor.b32 %r10094, %r9928, %r9864; + xor.b32 %r10095, %r9932, %r9865; + xor.b32 %r9981, %r30027, %r10095; + xor.b32 %r9982, %r30026, %r10094; + st.local.v2.u32 [%rd53+120], {%r9982, %r9981}; + xor.b32 %r9973, %r30023, %r10095; + xor.b32 %r9974, %r30022, %r10094; + st.local.v2.u32 [%rd53+200], {%r9974, %r9973}; + // begin inline asm + shf.l.wrap.b32 %r9936, %r9901, %r9900, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9940, %r9900, %r9901, %r9951; + // end inline asm + xor.b32 %r10096, %r9936, %r9876; + xor.b32 %r10097, %r9940, %r9877; + xor.b32 %r10005, %r30046, %r10096; + xor.b32 %r10006, %r30047, %r10097; + xor.b32 %r10014, %r30017, %r10097; + xor.b32 %r10013, %r30016, %r10096; + st.local.v2.u32 [%rd53+168], {%r10013, %r10014}; + // begin inline asm + shf.l.wrap.b32 %r9944, %r9853, %r9852, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9948, %r9852, %r9853, %r9951; + // end inline asm + xor.b32 %r10098, 
%r9944, %r9888; + xor.b32 %r10099, %r9948, %r9889; + xor.b32 %r9965, %r30012, %r10098; + xor.b32 %r9966, %r30013, %r10099; + xor.b32 %r9990, %r30007, %r10099; + xor.b32 %r9989, %r30006, %r10098; + st.local.v2.u32 [%rd53+216], {%r9989, %r9990}; + // begin inline asm + shf.l.wrap.b32 %r9952, %r9958, %r9957, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9956, %r9957, %r9958, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9960, %r9966, %r9965, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9964, %r9965, %r9966, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9972, %r9973, %r9974, %r9471; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9968, %r9974, %r9973, %r9471; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r9968, %r9972}; + // begin inline asm + shf.l.wrap.b32 %r9976, %r9982, %r9981, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9980, %r9981, %r9982, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9984, %r9990, %r9989, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9988, %r9989, %r9990, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9996, %r9997, %r9998, %r9575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9992, %r9998, %r9997, %r9575; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r9992, %r9996}; + // begin inline asm + shf.l.wrap.b32 %r10000, %r10006, %r10005, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10004, %r10005, %r10006, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10008, %r10014, %r10013, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10012, %r10013, %r10014, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10016, %r10022, %r10021, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10020, %r10021, %r10022, %r9631; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10024, %r10059, %r9952, %r9976, 0xD2; + lop3.b32 %r10025, %r10062, %r9956, %r9980, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10032, %r9952, %r9976, %r10008, 0xD2; + lop3.b32 %r10033, %r9956, %r9980, %r10012, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+32], {%r10032, %r10033}; + // begin inline asm + // chi + lop3.b32 %r10040, %r9976, %r10008, %r9984, 0xD2; + lop3.b32 %r10041, %r9980, %r10012, %r9988, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+40], {%r10040, %r10041}; + // begin inline asm + // chi + lop3.b32 %r10048, %r10008, %r9984, %r10059, 0xD2; + lop3.b32 %r10049, %r10012, %r9988, %r10062, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+48], {%r10048, %r10049}; + // begin inline asm + // chi + lop3.b32 %r10056, %r9984, %r10059, %r9952, 0xD2; + lop3.b32 %r10057, %r9988, %r10062, %r9956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+56], {%r10056, %r10057}; + // begin inline asm + // chi + lop3.b32 %r10064, %r10000, %r9960, %r10016, 0xD2; + lop3.b32 %r10065, %r10004, %r9964, %r10020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+64], {%r10064, %r10065}; + // begin inline asm + // chi + lop3.b32 %r10072, %r9960, %r10016, %r9992, 0xD2; + lop3.b32 %r10073, %r9964, %r10020, %r9996, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+72], {%r10072, %r10073}; + // begin inline asm + // chi + lop3.b32 %r10080, %r10016, %r9992, %r9968, 0xD2; + lop3.b32 %r10081, %r10020, %r9996, %r9972, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+80], 
{%r10080, %r10081}; + // begin inline asm + ld.global.nc.v2.u32 {%r10088,%r10089}, [%rd481]; + // end inline asm + xor.b32 %r10100, %r10025, %r10089; + xor.b32 %r10101, %r10024, %r10088; + st.local.v2.u32 [%rd53+24], {%r10101, %r10100}; + mov.b64 %rd1326, {%r10032, %r10033}; + mov.b64 %rd1327, {%r10040, %r10041}; + mov.b64 %rd1330, {%r10064, %r10065}; + mov.b64 %rd1331, {%r10072, %r10073}; + mov.b64 %rd1332, {%r10080, %r10081}; + mov.b64 %rd1325, {%r10101, %r10100}; + mov.b64 %rd1328, {%r10048, %r10049}; + mov.b64 %rd1329, {%r10056, %r10057}; + st.global.u64 [%rd34], %rd1317; + st.global.u64 [%rd34+8], %rd1318; + st.global.u64 [%rd34+16], %rd1319; + st.global.u64 [%rd34+24], %rd1320; + st.global.u64 [%rd34+32], %rd1321; + st.global.u64 [%rd34+40], %rd1322; + st.global.u64 [%rd34+48], %rd1323; + st.global.u64 [%rd34+56], %rd1324; + st.global.v2.u32 [%rd34+64], {%r10101, %r10100}; + st.global.v2.u32 [%rd34+72], {%r10032, %r10033}; + st.global.v2.u32 [%rd34+80], {%r10040, %r10041}; + st.global.v2.u32 [%rd34+88], {%r10048, %r10049}; + st.global.v2.u32 [%rd34+96], {%r10056, %r10057}; + st.global.v2.u32 [%rd34+104], {%r10064, %r10065}; + st.global.v2.u32 [%rd34+112], {%r10072, %r10073}; + st.global.v2.u32 [%rd34+120], {%r10080, %r10081}; + +$L__BB2_36: + cvta.to.global.u64 %rd1265, %rd361; + shl.b32 %r1695, %r45, 1; + mul.wide.u32 %rd661, %r1695, -954391867; + shr.u64 %rd662, %rd661, 32; + cvt.u32.u64 %r13386, %rd662; + sub.s32 %r13387, %r1695, %r13386; + shr.u32 %r13388, %r13387, 1; + add.s32 %r13389, %r13388, %r13386; + shr.u32 %r13390, %r13389, 20; + mul.lo.s32 %r13391, %r13390, 1179641; + sub.s32 %r13392, %r1695, %r13391; + mul.wide.u32 %rd664, %r13392, 64; + add.s64 %rd126, %rd1265, %rd664; + or.b32 %r1696, %r1695, 1; + mul.wide.u32 %rd665, %r1696, -954391867; + shr.u64 %rd666, %rd665, 32; + cvt.u32.u64 %r13393, %rd666; + sub.s32 %r13394, %r1696, %r13393; + shr.u32 %r13395, %r13394, 1; + add.s32 %r13396, %r13395, %r13393; + shr.u32 %r13397, %r13396, 20; + mul.lo.s32 %r13398, %r13397, 1179641; + sub.s32 %r13399, %r1696, %r13398; + mul.wide.u32 %rd667, %r13399, 64; + add.s64 %rd127, %rd1265, %rd667; + @%p12 bra $L__BB2_50; + + cvta.to.global.u64 %rd668, %rd360; + mul.wide.u32 %rd669, %r45, 128; + add.s64 %rd128, %rd668, %rd669; + ld.global.u64 %rd1333, [%rd128]; + setp.eq.s64 %p25, %rd1333, 0; + @%p25 bra $L__BB2_39; + + ld.global.u64 %rd1348, [%rd128+120]; + ld.global.u64 %rd1347, [%rd128+112]; + ld.global.u64 %rd1346, [%rd128+104]; + ld.global.u64 %rd1345, [%rd128+96]; + ld.global.u64 %rd1344, [%rd128+88]; + ld.global.u64 %rd1343, [%rd128+80]; + ld.global.u64 %rd1342, [%rd128+72]; + ld.global.u64 %rd1341, [%rd128+64]; + ld.global.u64 %rd1340, [%rd128+56]; + ld.global.u64 %rd1339, [%rd128+48]; + ld.global.u64 %rd1338, [%rd128+40]; + ld.global.u64 %rd1337, [%rd128+32]; + ld.global.u64 %rd1336, [%rd128+24]; + ld.global.u64 %rd1335, [%rd128+16]; + ld.global.u64 %rd1334, [%rd128+8]; + bra.uni $L__BB2_61; + +$L__BB2_50: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd771, 1179641; + st.local.u64 [%rd2+8], %rd771; + st.local.u32 [%rd2+16], %r1695; + ld.global.u64 %rd772, [%rd126]; + ld.global.u64 %rd773, [%rd126+8]; + ld.global.u64 %rd774, [%rd126+16]; + ld.global.u64 %rd775, [%rd126+24]; + ld.global.u64 %rd776, [%rd126+32]; + ld.global.u64 %rd777, [%rd126+40]; + ld.global.u64 %rd778, [%rd126+48]; + ld.global.u64 %rd779, [%rd126+56]; + st.local.u64 [%rd2+24], %rd772; + st.local.u64 [%rd2+32], %rd773; + st.local.u64 [%rd2+40], %rd774; + st.local.u64 [%rd2+48], %rd775; + st.local.u64 [%rd2+56], %rd776; 
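(Note on reading the generated PTX in this hunk: the inline-asm blocks are the Keccak-f[1600] permutation, unrolled. Each `lop3.b32 ... 0x96` is a three-input XOR, so the `// xor5` pairs compute the five-way theta column parities; every 64-bit rho rotation (by 44, 20, 61, 39, ...) is emitted as a pair of `shf.l.wrap.b32` funnel shifts over the 32-bit register halves; chi's `a ^ (~b & c)` is fused into single `lop3.b32 ... 0xD2` ops; and iota loads its per-round constant from `keccak_round_constants` via `ld.global.nc`. The `$L__BB2_23`, `$L__BB2_51`, and `$L__BB2_53` loops each run the first 23 rounds, with the 24th round peeled after the loop — its constant is read at byte offset 184 = 23 * 8. For orientation only, here is a minimal CUDA sketch of that round structure; the kernel source is not part of this diff, so apart from `keccak_round_constants` (which mirrors the symbol in the PTX) all names and the simple 24-iteration loop layout are illustrative, not the generator source.)

// Hypothetical reading aid -- a plain-CUDA rendering of the round structure
// the PTX above unrolls. Not the shipped kernel source.
#include <stdint.h>

__constant__ uint64_t keccak_round_constants[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, 0x8000000080008000ULL,
    0x000000000000808BULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
    0x000000000000008AULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
    0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
    0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800AULL, 0x800000008000000AULL,
    0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL};

// With 32-bit registers this lowers to the paired shf.l.wrap.b32 seen above.
__device__ __forceinline__ uint64_t rotl64(uint64_t x, uint32_t n) {
    return (x << n) | (x >> (64u - n));
}

__device__ void keccak_f1600(uint64_t s[25]) {
    // rho rotation amounts and pi lane order (standard Keccak tables; the
    // rotation amounts match the mov.u32 44, 20, 61, 39, ... in the PTX).
    const int rotc[24] = {1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
                          27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44};
    const int piln[24] = {10, 7,  11, 17, 18, 3,  5,  16, 8,  21, 24, 4,
                          15, 23, 19, 13, 12, 2,  20, 14, 22, 9,  6,  1};
    for (int round = 0; round < 24; ++round) {
        // theta: five-way column parities -- the lop3 ... 0x96 "xor5" groups.
        uint64_t c[5];
        for (int x = 0; x < 5; ++x)
            c[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
        for (int x = 0; x < 5; ++x) {
            uint64_t d = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
            for (int y = 0; y < 25; y += 5) s[x + y] ^= d;
        }
        // rho + pi: the chain of funnel-shift rotations in the PTX.
        uint64_t t = s[1];
        for (int i = 0; i < 24; ++i) {
            int j = piln[i];
            uint64_t u = s[j];
            s[j] = rotl64(t, rotc[i]);
            t = u;
        }
        // chi: a ^ (~b & c), fused into one lop3 (immLut 0xD2) per 32-bit half.
        for (int y = 0; y < 25; y += 5) {
            uint64_t a0 = s[y], a1 = s[y + 1], a2 = s[y + 2], a3 = s[y + 3], a4 = s[y + 4];
            s[y]     = a0 ^ (~a1 & a2);
            s[y + 1] = a1 ^ (~a2 & a3);
            s[y + 2] = a2 ^ (~a3 & a4);
            s[y + 3] = a3 ^ (~a4 & a0);
            s[y + 4] = a4 ^ (~a0 & a1);
        }
        // iota: the ld.global.nc of keccak_round_constants[round] above.
        s[0] ^= keccak_round_constants[round];
    }
}

(The 0x96 and 0xD2 immediates are the lop3 truth tables for a ^ b ^ c and a ^ (~b & c), which is why theta and chi each collapse to a single instruction per 32-bit half in the generated code.)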
+ st.local.u64 [%rd2+64], %rd777; + st.local.u64 [%rd2+72], %rd778; + st.local.u64 [%rd2+80], %rd779; + cvt.u32.u64 %r16725, %rd772; + xor.b32 %r16726, %r1695, %r16725; + st.local.u32 [%rd2+24], %r16726; + mov.u32 %r30531, 0; + st.local.v2.u32 [%rd2+96], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+104], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+112], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+120], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+128], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+136], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+144], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+152], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+160], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+168], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+176], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+184], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+192], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+200], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+208], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+216], {%r30531, %r30531}; + mov.u32 %r30546, -2147483648; + mov.u32 %r16698, 1; + st.local.v2.u32 [%rd2+88], {%r16698, %r30546}; + ld.local.v2.u32 {%r30567, %r30568}, [%rd2+24]; + mov.b64 {%r30565, %r30566}, %rd777; + shr.u64 %rd780, %rd773, 32; + cvt.u32.u64 %r30579, %rd773; + cvt.u32.u64 %r30580, %rd780; + shr.u64 %rd781, %rd778, 32; + cvt.u32.u64 %r30577, %rd778; + cvt.u32.u64 %r30578, %rd781; + shr.u64 %rd782, %rd774, 32; + cvt.u32.u64 %r30575, %rd774; + cvt.u32.u64 %r30576, %rd782; + shr.u64 %rd783, %rd779, 32; + cvt.u32.u64 %r30573, %rd779; + cvt.u32.u64 %r30574, %rd783; + shr.u64 %rd784, %rd775, 32; + cvt.u32.u64 %r30571, %rd775; + cvt.u32.u64 %r30572, %rd784; + shr.u64 %rd785, %rd776, 32; + cvt.u32.u64 %r30569, %rd776; + cvt.u32.u64 %r30570, %rd785; + mov.u32 %r30532, %r30531; + mov.u32 %r30533, %r30531; + mov.u32 %r30534, %r30531; + mov.u32 %r30535, %r30531; + mov.u32 %r30536, %r30531; + mov.u32 %r30537, %r30531; + mov.u32 %r30538, %r30531; + mov.u32 %r30539, %r30531; + mov.u32 %r30540, %r30531; + mov.u32 %r30541, %r30531; + mov.u32 %r30542, %r30531; + mov.u32 %r30543, %r30531; + mov.u32 %r30544, %r30531; + mov.u32 %r30545, %r16698; + mov.u32 %r30547, %r30531; + mov.u32 %r30548, %r30531; + mov.u32 %r30549, %r30531; + mov.u32 %r30550, %r30531; + mov.u32 %r30551, %r30531; + mov.u32 %r30552, %r30531; + mov.u32 %r30553, %r30531; + mov.u32 %r30554, %r30531; + mov.u32 %r30555, %r30531; + mov.u32 %r30556, %r30531; + mov.u32 %r30557, %r30531; + mov.u32 %r30558, %r30531; + mov.u32 %r30559, %r30531; + mov.u32 %r30560, %r30531; + mov.u32 %r30561, %r30531; + mov.u32 %r30562, %r30531; + mov.u32 %r30563, %r30531; + mov.u32 %r30564, %r30531; + mov.u32 %r30581, %r30531; + +$L__BB2_51: + // begin inline asm + // xor5 + lop3.b32 %r16729, %r30567, %r30565, %r30563, 0x96; + lop3.b32 %r16729, %r16729, %r30561, %r30559, 0x96; + lop3.b32 %r16730, %r30568, %r30566, %r30564, 0x96; + lop3.b32 %r16730, %r16730, %r30562, %r30560, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16741, %r30579, %r30577, %r30557, 0x96; + lop3.b32 %r16741, %r16741, %r30555, %r30553, 0x96; + lop3.b32 %r16742, %r30580, %r30578, %r30558, 0x96; + lop3.b32 %r16742, %r16742, %r30556, %r30554, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16753, %r30575, %r30573, %r30551, 0x96; + lop3.b32 %r16753, %r16753, %r30549, %r30547, 0x96; + lop3.b32 %r16754, %r30576, %r30574, %r30552, 0x96; + lop3.b32 %r16754, %r16754, %r30550, %r30548, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16765, 
%r30571, %r30545, %r30543, 0x96; + lop3.b32 %r16765, %r16765, %r30541, %r30539, 0x96; + lop3.b32 %r16766, %r30572, %r30546, %r30544, 0x96; + lop3.b32 %r16766, %r16766, %r30542, %r30540, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16777, %r30569, %r30537, %r30535, 0x96; + lop3.b32 %r16777, %r16777, %r30533, %r30531, 0x96; + lop3.b32 %r16778, %r30570, %r30538, %r30536, 0x96; + lop3.b32 %r16778, %r16778, %r30534, %r30532, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16789, %r16742, %r16741, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16793, %r16741, %r16742, %r16698; + // end inline asm + xor.b32 %r17223, %r16789, %r16777; + xor.b32 %r17224, %r16793, %r16778; + xor.b32 %r17056, %r30567, %r17223; + xor.b32 %r17059, %r30568, %r17224; + xor.b32 %r16963, %r30565, %r17223; + xor.b32 %r16962, %r30566, %r17224; + xor.b32 %r17010, %r30563, %r17223; + xor.b32 %r17011, %r30564, %r17224; + xor.b32 %r16915, %r30561, %r17223; + xor.b32 %r16914, %r30562, %r17224; + xor.b32 %r16866, %r30559, %r17223; + xor.b32 %r16867, %r30560, %r17224; + // begin inline asm + shf.l.wrap.b32 %r16797, %r16754, %r16753, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16801, %r16753, %r16754, %r16698; + // end inline asm + xor.b32 %r17225, %r16797, %r16729; + xor.b32 %r17226, %r16801, %r16730; + xor.b32 %r17018, %r30579, %r17225; + xor.b32 %r17019, %r30580, %r17226; + xor.b32 %r16835, %r30577, %r17225; + xor.b32 %r16834, %r30578, %r17226; + xor.b32 %r16994, %r30557, %r17225; + xor.b32 %r16995, %r30558, %r17226; + xor.b32 %r16955, %r30555, %r17225; + xor.b32 %r16954, %r30556, %r17226; + xor.b32 %r16938, %r30553, %r17225; + xor.b32 %r16939, %r30554, %r17226; + // begin inline asm + shf.l.wrap.b32 %r16805, %r16766, %r16765, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16809, %r16765, %r16766, %r16698; + // end inline asm + xor.b32 %r17227, %r16805, %r16741; + xor.b32 %r17228, %r16809, %r16742; + xor.b32 %r16875, %r30575, %r17227; + xor.b32 %r16874, %r30576, %r17228; + xor.b32 %r17002, %r30573, %r17227; + xor.b32 %r17003, %r30574, %r17228; + xor.b32 %r16883, %r30551, %r17227; + xor.b32 %r16882, %r30552, %r17228; + xor.b32 %r16986, %r30549, %r17227; + xor.b32 %r16987, %r30550, %r17228; + xor.b32 %r16851, %r30547, %r17227; + xor.b32 %r16850, %r30548, %r17228; + // begin inline asm + shf.l.wrap.b32 %r16813, %r16778, %r16777, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16817, %r16777, %r16778, %r16698; + // end inline asm + xor.b32 %r17229, %r16813, %r16753; + xor.b32 %r17230, %r16817, %r16754; + xor.b32 %r16970, %r30571, %r17229; + xor.b32 %r16971, %r30572, %r17230; + xor.b32 %r16947, %r30545, %r17229; + xor.b32 %r16946, %r30546, %r17230; + xor.b32 %r16890, %r30543, %r17229; + xor.b32 %r16891, %r30544, %r17230; + xor.b32 %r16978, %r30541, %r17229; + xor.b32 %r16979, %r30542, %r17230; + xor.b32 %r16907, %r30539, %r17229; + xor.b32 %r16906, %r30540, %r17230; + // begin inline asm + shf.l.wrap.b32 %r16821, %r16730, %r16729, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16825, %r16729, %r16730, %r16698; + // end inline asm + xor.b32 %r17231, %r16821, %r16765; + xor.b32 %r17232, %r16825, %r16766; + xor.b32 %r16922, %r30569, %r17231; + xor.b32 %r16923, %r30570, %r17232; + xor.b32 %r16842, %r30537, %r17231; + xor.b32 %r16843, %r30538, %r17232; + xor.b32 %r16859, %r30535, %r17231; + xor.b32 %r16858, %r30536, %r17232; + xor.b32 %r16898, %r30533, %r17231; + xor.b32 %r16899, 
%r30534, %r17232; + xor.b32 %r16930, %r30531, %r17231; + xor.b32 %r16931, %r30532, %r17232; + mov.u32 %r16836, 44; + // begin inline asm + shf.l.wrap.b32 %r16829, %r16835, %r16834, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16833, %r16834, %r16835, %r16836; + // end inline asm + mov.u32 %r16844, 20; + // begin inline asm + shf.l.wrap.b32 %r16837, %r16843, %r16842, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16841, %r16842, %r16843, %r16844; + // end inline asm + mov.u32 %r16852, 61; + // begin inline asm + shf.l.wrap.b32 %r16845, %r16851, %r16850, %r16852; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16849, %r16850, %r16851, %r16852; + // end inline asm + mov.u32 %r16860, 39; + // begin inline asm + shf.l.wrap.b32 %r16853, %r16859, %r16858, %r16860; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16857, %r16858, %r16859, %r16860; + // end inline asm + mov.u32 %r16868, 18; + // begin inline asm + shf.l.wrap.b32 %r16861, %r16867, %r16866, %r16868; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16865, %r16866, %r16867, %r16868; + // end inline asm + mov.u32 %r16876, 62; + // begin inline asm + shf.l.wrap.b32 %r16869, %r16875, %r16874, %r16876; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16873, %r16874, %r16875, %r16876; + // end inline asm + mov.u32 %r16884, 43; + // begin inline asm + shf.l.wrap.b32 %r16877, %r16883, %r16882, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16881, %r16882, %r16883, %r16884; + // end inline asm + mov.u32 %r16892, 25; + // begin inline asm + shf.l.wrap.b32 %r16885, %r16891, %r16890, %r16892; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16889, %r16890, %r16891, %r16892; + // end inline asm + mov.u32 %r16900, 8; + // begin inline asm + shf.l.wrap.b32 %r16893, %r16899, %r16898, %r16900; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16897, %r16898, %r16899, %r16900; + // end inline asm + mov.u32 %r16908, 56; + // begin inline asm + shf.l.wrap.b32 %r16901, %r16907, %r16906, %r16908; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16905, %r16906, %r16907, %r16908; + // end inline asm + mov.u32 %r16916, 41; + // begin inline asm + shf.l.wrap.b32 %r16909, %r16915, %r16914, %r16916; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16913, %r16914, %r16915, %r16916; + // end inline asm + mov.u32 %r16924, 27; + // begin inline asm + shf.l.wrap.b32 %r16917, %r16923, %r16922, %r16924; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16921, %r16922, %r16923, %r16924; + // end inline asm + mov.u32 %r16932, 14; + // begin inline asm + shf.l.wrap.b32 %r16925, %r16931, %r16930, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16929, %r16930, %r16931, %r16932; + // end inline asm + mov.u32 %r16940, 2; + // begin inline asm + shf.l.wrap.b32 %r16933, %r16939, %r16938, %r16940; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16937, %r16938, %r16939, %r16940; + // end inline asm + mov.u32 %r16948, 55; + // begin inline asm + shf.l.wrap.b32 %r16941, %r16947, %r16946, %r16948; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16945, %r16946, %r16947, %r16948; + // end inline asm + mov.u32 %r16956, 45; + // begin inline asm + shf.l.wrap.b32 %r16949, %r16955, %r16954, %r16956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16953, %r16954, %r16955, %r16956; + // end inline asm + mov.u32 %r16964, 36; + // begin inline asm + 
shf.l.wrap.b32 %r16957, %r16963, %r16962, %r16964; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16961, %r16962, %r16963, %r16964; + // end inline asm + mov.u32 %r16972, 28; + // begin inline asm + shf.l.wrap.b32 %r16965, %r16971, %r16970, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16969, %r16970, %r16971, %r16972; + // end inline asm + mov.u32 %r16980, 21; + // begin inline asm + shf.l.wrap.b32 %r16973, %r16979, %r16978, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16977, %r16978, %r16979, %r16980; + // end inline asm + mov.u32 %r16988, 15; + // begin inline asm + shf.l.wrap.b32 %r16981, %r16987, %r16986, %r16988; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16985, %r16986, %r16987, %r16988; + // end inline asm + mov.u32 %r16996, 10; + // begin inline asm + shf.l.wrap.b32 %r16989, %r16995, %r16994, %r16996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16993, %r16994, %r16995, %r16996; + // end inline asm + mov.u32 %r17004, 6; + // begin inline asm + shf.l.wrap.b32 %r16997, %r17003, %r17002, %r17004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17001, %r17002, %r17003, %r17004; + // end inline asm + mov.u32 %r17012, 3; + // begin inline asm + shf.l.wrap.b32 %r17005, %r17011, %r17010, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17009, %r17010, %r17011, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17013, %r17019, %r17018, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17017, %r17018, %r17019, %r16698; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17021, %r17056, %r16829, %r16877, 0xD2; + lop3.b32 %r17022, %r17059, %r16833, %r16881, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30579, %r16829, %r16877, %r16973, 0xD2; + lop3.b32 %r30580, %r16833, %r16881, %r16977, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30575, %r16877, %r16973, %r16925, 0xD2; + lop3.b32 %r30576, %r16881, %r16977, %r16929, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30571, %r16973, %r16925, %r17056, 0xD2; + lop3.b32 %r30572, %r16977, %r16929, %r17059, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30569, %r16925, %r17056, %r16829, 0xD2; + lop3.b32 %r30570, %r16929, %r17059, %r16833, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30565, %r16965, %r16837, %r17005, 0xD2; + lop3.b32 %r30566, %r16969, %r16841, %r17009, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30577, %r16837, %r17005, %r16949, 0xD2; + lop3.b32 %r30578, %r16841, %r17009, %r16953, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30573, %r17005, %r16949, %r16845, 0xD2; + lop3.b32 %r30574, %r17009, %r16953, %r16849, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30545, %r16949, %r16845, %r16965, 0xD2; + lop3.b32 %r30546, %r16953, %r16849, %r16969, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30545, %r30546}; + // begin inline asm + // chi + lop3.b32 %r30537, %r16845, %r16965, %r16837, 0xD2; + lop3.b32 %r30538, %r16849, %r16969, %r16841, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30537, %r30538}; + // begin inline asm + // chi + lop3.b32 %r30563, %r17013, %r16997, %r16885, 0xD2; + lop3.b32 %r30564, %r17017, %r17001, %r16889, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30563, %r30564}; + // begin inline asm + // chi + lop3.b32 
%r30557, %r16997, %r16885, %r16893, 0xD2; + lop3.b32 %r30558, %r17001, %r16889, %r16897, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30557, %r30558}; + // begin inline asm + // chi + lop3.b32 %r30551, %r16885, %r16893, %r16861, 0xD2; + lop3.b32 %r30552, %r16889, %r16897, %r16865, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30551, %r30552}; + // begin inline asm + // chi + lop3.b32 %r30543, %r16893, %r16861, %r17013, 0xD2; + lop3.b32 %r30544, %r16897, %r16865, %r17017, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30543, %r30544}; + // begin inline asm + // chi + lop3.b32 %r30535, %r16861, %r17013, %r16997, 0xD2; + lop3.b32 %r30536, %r16865, %r17017, %r17001, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30535, %r30536}; + // begin inline asm + // chi + lop3.b32 %r30561, %r16917, %r16957, %r16989, 0xD2; + lop3.b32 %r30562, %r16921, %r16961, %r16993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30561, %r30562}; + // begin inline asm + // chi + lop3.b32 %r30555, %r16957, %r16989, %r16981, 0xD2; + lop3.b32 %r30556, %r16961, %r16993, %r16985, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30555, %r30556}; + // begin inline asm + // chi + lop3.b32 %r30549, %r16989, %r16981, %r16901, 0xD2; + lop3.b32 %r30550, %r16993, %r16985, %r16905, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30549, %r30550}; + // begin inline asm + // chi + lop3.b32 %r30541, %r16981, %r16901, %r16917, 0xD2; + lop3.b32 %r30542, %r16985, %r16905, %r16921, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30541, %r30542}; + // begin inline asm + // chi + lop3.b32 %r30533, %r16901, %r16917, %r16957, 0xD2; + lop3.b32 %r30534, %r16905, %r16921, %r16961, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30533, %r30534}; + // begin inline asm + // chi + lop3.b32 %r30559, %r16869, %r16941, %r16853, 0xD2; + lop3.b32 %r30560, %r16873, %r16945, %r16857, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30559, %r30560}; + // begin inline asm + // chi + lop3.b32 %r30553, %r16941, %r16853, %r16909, 0xD2; + lop3.b32 %r30554, %r16945, %r16857, %r16913, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30553, %r30554}; + // begin inline asm + // chi + lop3.b32 %r30547, %r16853, %r16909, %r16933, 0xD2; + lop3.b32 %r30548, %r16857, %r16913, %r16937, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30547, %r30548}; + // begin inline asm + // chi + lop3.b32 %r30539, %r16909, %r16933, %r16869, 0xD2; + lop3.b32 %r30540, %r16913, %r16937, %r16873, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30539, %r30540}; + // begin inline asm + // chi + lop3.b32 %r30531, %r16933, %r16869, %r16941, 0xD2; + lop3.b32 %r30532, %r16937, %r16873, %r16945, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30531, %r30532}; + mul.wide.s32 %rd787, %r30581, 8; + mov.u64 %rd788, keccak_round_constants; + cvta.const.u64 %rd789, %rd788; + add.s64 %rd786, %rd789, %rd787; + // begin inline asm + ld.global.nc.v2.u32 {%r17221,%r17222}, [%rd786]; + // end inline asm + xor.b32 %r30567, %r17021, %r17221; + xor.b32 %r30568, %r17022, %r17222; + add.s32 %r30581, %r30581, 1; + setp.lt.u32 %p31, %r30581, 23; + @%p31 bra $L__BB2_51; + + add.u64 %rd176, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30579, %r30580}; + st.local.v2.u32 [%rd2+72], {%r30577, %r30578}; + st.local.v2.u32 [%rd2+40], {%r30575, %r30576}; + st.local.v2.u32 [%rd2+80], {%r30573, %r30574}; + st.local.v2.u32 [%rd2+48], {%r30571, %r30572}; + st.local.v2.u32 
[%rd2+56], {%r30569, %r30570}; + st.local.v2.u32 [%rd2+24], {%r30567, %r30568}; + // begin inline asm + // xor5 + lop3.b32 %r17233, %r30567, %r30565, %r30563, 0x96; + lop3.b32 %r17233, %r17233, %r30561, %r30559, 0x96; + lop3.b32 %r17234, %r30568, %r30566, %r30564, 0x96; + lop3.b32 %r17234, %r17234, %r30562, %r30560, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17245, %r30579, %r30577, %r30557, 0x96; + lop3.b32 %r17245, %r17245, %r30555, %r30553, 0x96; + lop3.b32 %r17246, %r30580, %r30578, %r30558, 0x96; + lop3.b32 %r17246, %r17246, %r30556, %r30554, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17257, %r30575, %r30573, %r30551, 0x96; + lop3.b32 %r17257, %r17257, %r30549, %r30547, 0x96; + lop3.b32 %r17258, %r30576, %r30574, %r30552, 0x96; + lop3.b32 %r17258, %r17258, %r30550, %r30548, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17269, %r30571, %r30545, %r30543, 0x96; + lop3.b32 %r17269, %r17269, %r30541, %r30539, 0x96; + lop3.b32 %r17270, %r30572, %r30546, %r30544, 0x96; + lop3.b32 %r17270, %r17270, %r30542, %r30540, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17281, %r30569, %r30537, %r30535, 0x96; + lop3.b32 %r17281, %r17281, %r30533, %r30531, 0x96; + lop3.b32 %r17282, %r30570, %r30538, %r30536, 0x96; + lop3.b32 %r17282, %r17282, %r30534, %r30532, 0x96; + // end inline asm + mov.u32 %r17485, 1; + // begin inline asm + shf.l.wrap.b32 %r17293, %r17246, %r17245, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17297, %r17245, %r17246, %r17485; + // end inline asm + xor.b32 %r17512, %r17293, %r17281; + xor.b32 %r17513, %r17297, %r17282; + xor.b32 %r17440, %r30567, %r17512; + xor.b32 %r17443, %r30568, %r17513; + xor.b32 %r17403, %r30564, %r17513; + xor.b32 %r17402, %r30563, %r17512; + st.local.v2.u32 [%rd2+104], {%r17402, %r17403}; + // begin inline asm + shf.l.wrap.b32 %r17301, %r17258, %r17257, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17305, %r17257, %r17258, %r17485; + // end inline asm + xor.b32 %r17514, %r17301, %r17233; + xor.b32 %r17515, %r17305, %r17234; + xor.b32 %r17339, %r30577, %r17514; + xor.b32 %r17338, %r30578, %r17515; + xor.b32 %r17378, %r30556, %r17515; + xor.b32 %r17379, %r30555, %r17514; + st.local.v2.u32 [%rd2+152], {%r17379, %r17378}; + // begin inline asm + shf.l.wrap.b32 %r17309, %r17270, %r17269, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17313, %r17269, %r17270, %r17485; + // end inline asm + xor.b32 %r17516, %r17309, %r17245; + xor.b32 %r17517, %r17313, %r17246; + xor.b32 %r17362, %r30552, %r17517; + xor.b32 %r17363, %r30551, %r17516; + st.local.v2.u32 [%rd2+120], {%r17363, %r17362}; + xor.b32 %r17354, %r30548, %r17517; + xor.b32 %r17355, %r30547, %r17516; + st.local.v2.u32 [%rd2+200], {%r17355, %r17354}; + // begin inline asm + shf.l.wrap.b32 %r17317, %r17282, %r17281, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17321, %r17281, %r17282, %r17485; + // end inline asm + xor.b32 %r17518, %r17317, %r17257; + xor.b32 %r17519, %r17321, %r17258; + xor.b32 %r17386, %r30571, %r17518; + xor.b32 %r17387, %r30572, %r17519; + xor.b32 %r17395, %r30542, %r17519; + xor.b32 %r17394, %r30541, %r17518; + st.local.v2.u32 [%rd2+168], {%r17394, %r17395}; + // begin inline asm + shf.l.wrap.b32 %r17325, %r17234, %r17233, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17329, %r17233, %r17234, %r17485; + // end inline asm + xor.b32 %r17520, %r17325, %r17269; + 
xor.b32 %r17521, %r17329, %r17270; + xor.b32 %r17346, %r30537, %r17520; + xor.b32 %r17347, %r30538, %r17521; + xor.b32 %r17371, %r30532, %r17521; + xor.b32 %r17370, %r30531, %r17520; + st.local.v2.u32 [%rd2+216], {%r17370, %r17371}; + // begin inline asm + shf.l.wrap.b32 %r17333, %r17339, %r17338, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17337, %r17338, %r17339, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17341, %r17347, %r17346, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17345, %r17346, %r17347, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17353, %r17354, %r17355, %r16852; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17349, %r17355, %r17354, %r16852; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r17349, %r17353}; + // begin inline asm + shf.l.wrap.b32 %r17357, %r17363, %r17362, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17361, %r17362, %r17363, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17365, %r17371, %r17370, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17369, %r17370, %r17371, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17377, %r17378, %r17379, %r16956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17373, %r17379, %r17378, %r16956; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r17373, %r17377}; + // begin inline asm + shf.l.wrap.b32 %r17381, %r17387, %r17386, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17385, %r17386, %r17387, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17389, %r17395, %r17394, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17393, %r17394, %r17395, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17397, %r17403, %r17402, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17401, %r17402, %r17403, %r17012; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17405, %r17440, %r17333, %r17357, 0xD2; + lop3.b32 %r17406, %r17443, %r17337, %r17361, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30714, %r17333, %r17357, %r17389, 0xD2; + lop3.b32 %r30715, %r17337, %r17361, %r17393, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30714, %r30715}; + // begin inline asm + // chi + lop3.b32 %r30710, %r17357, %r17389, %r17365, 0xD2; + lop3.b32 %r30711, %r17361, %r17393, %r17369, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30710, %r30711}; + // begin inline asm + // chi + lop3.b32 %r30706, %r17389, %r17365, %r17440, 0xD2; + lop3.b32 %r30707, %r17393, %r17369, %r17443, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30706, %r30707}; + // begin inline asm + // chi + lop3.b32 %r30704, %r17365, %r17440, %r17333, 0xD2; + lop3.b32 %r30705, %r17369, %r17443, %r17337, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30704, %r30705}; + // begin inline asm + // chi + lop3.b32 %r30700, %r17381, %r17341, %r17397, 0xD2; + lop3.b32 %r30701, %r17385, %r17345, %r17401, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30700, %r30701}; + // begin inline asm + // chi + lop3.b32 %r30712, %r17341, %r17397, %r17373, 0xD2; + lop3.b32 %r30713, %r17345, %r17401, %r17377, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30712, %r30713}; + // begin inline asm + // chi + lop3.b32 %r30708, %r17397, %r17373, %r17349, 0xD2; + lop3.b32 %r30709, %r17401, 
%r17377, %r17353, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30708, %r30709}; + add.s64 %rd790, %rd789, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r17469,%r17470}, [%rd790]; + // end inline asm + xor.b32 %r30702, %r17405, %r17469; + xor.b32 %r30703, %r17406, %r17470; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + st.local.u64 [%rd176], %rd361; + mov.u64 %rd794, 1179641; + st.local.u64 [%rd176+8], %rd794; + st.local.u32 [%rd176+16], %r1696; + ld.global.u64 %rd795, [%rd127]; + ld.global.u64 %rd796, [%rd127+8]; + ld.global.u64 %rd797, [%rd127+16]; + ld.global.u64 %rd798, [%rd127+24]; + ld.global.u64 %rd799, [%rd127+32]; + ld.global.u64 %rd800, [%rd127+40]; + ld.global.u64 %rd801, [%rd127+48]; + ld.global.u64 %rd802, [%rd127+56]; + st.local.u64 [%rd176+32], %rd796; + st.local.u64 [%rd176+40], %rd797; + st.local.u64 [%rd176+48], %rd798; + st.local.u64 [%rd176+56], %rd799; + st.local.u64 [%rd176+64], %rd800; + st.local.u64 [%rd176+72], %rd801; + st.local.u64 [%rd176+80], %rd802; + cvt.u32.u64 %r17522, %rd795; + xor.b32 %r17523, %r1696, %r17522; + st.local.u64 [%rd176+24], %rd795; + st.local.u32 [%rd176+24], %r17523; + mov.u32 %r30582, 0; + st.local.v2.u32 [%rd176+96], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+104], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+112], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+120], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+128], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+136], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+144], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+152], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+160], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+168], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+176], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+184], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+192], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+200], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+208], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+216], {%r30582, %r30582}; + mov.u32 %r30597, -2147483648; + st.local.v2.u32 [%rd176+88], {%r17485, %r30597}; + ld.local.v2.u32 {%r30618, %r30619}, [%rd176+24]; + mov.b64 {%r30616, %r30617}, %rd800; + shr.u64 %rd803, %rd796, 32; + cvt.u32.u64 %r30630, %rd796; + cvt.u32.u64 %r30631, %rd803; + shr.u64 %rd804, %rd801, 32; + cvt.u32.u64 %r30628, %rd801; + cvt.u32.u64 %r30629, %rd804; + shr.u64 %rd805, %rd797, 32; + cvt.u32.u64 %r30626, %rd797; + cvt.u32.u64 %r30627, %rd805; + shr.u64 %rd806, %rd802, 32; + cvt.u32.u64 %r30624, %rd802; + cvt.u32.u64 %r30625, %rd806; + shr.u64 %rd807, %rd798, 32; + cvt.u32.u64 %r30622, %rd798; + cvt.u32.u64 %r30623, %rd807; + shr.u64 %rd808, %rd799, 32; + cvt.u32.u64 %r30620, %rd799; + cvt.u32.u64 %r30621, %rd808; + mov.u32 %r30583, %r30582; + mov.u32 %r30584, %r30582; + mov.u32 %r30585, %r30582; + mov.u32 %r30586, %r30582; + mov.u32 %r30587, %r30582; + mov.u32 %r30588, %r30582; + mov.u32 %r30589, %r30582; + mov.u32 %r30590, %r30582; + mov.u32 %r30591, %r30582; + mov.u32 %r30592, %r30582; + mov.u32 %r30593, %r30582; + mov.u32 %r30594, %r30582; + mov.u32 %r30595, %r30582; + mov.u32 %r30596, %r17485; + mov.u32 %r30598, %r30582; + mov.u32 %r30599, %r30582; + mov.u32 %r30600, %r30582; + mov.u32 %r30601, %r30582; + mov.u32 %r30602, %r30582; + mov.u32 %r30603, %r30582; + mov.u32 %r30604, %r30582; + mov.u32 %r30605, %r30582; + mov.u32 %r30606, %r30582; + mov.u32 %r30607, %r30582; + mov.u32 %r30608, %r30582; + mov.u32 %r30609, %r30582; + mov.u32 %r30610, %r30582; + mov.u32 %r30611, %r30582; + mov.u32 %r30612, %r30582; + mov.u32 
%r30613, %r30582; + mov.u32 %r30614, %r30582; + mov.u32 %r30615, %r30582; + mov.u32 %r30632, %r30582; + +$L__BB2_53: + // begin inline asm + // xor5 + lop3.b32 %r17526, %r30618, %r30616, %r30614, 0x96; + lop3.b32 %r17526, %r17526, %r30612, %r30610, 0x96; + lop3.b32 %r17527, %r30619, %r30617, %r30615, 0x96; + lop3.b32 %r17527, %r17527, %r30613, %r30611, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17538, %r30630, %r30628, %r30608, 0x96; + lop3.b32 %r17538, %r17538, %r30606, %r30604, 0x96; + lop3.b32 %r17539, %r30631, %r30629, %r30609, 0x96; + lop3.b32 %r17539, %r17539, %r30607, %r30605, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17550, %r30626, %r30624, %r30602, 0x96; + lop3.b32 %r17550, %r17550, %r30600, %r30598, 0x96; + lop3.b32 %r17551, %r30627, %r30625, %r30603, 0x96; + lop3.b32 %r17551, %r17551, %r30601, %r30599, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17562, %r30622, %r30596, %r30594, 0x96; + lop3.b32 %r17562, %r17562, %r30592, %r30590, 0x96; + lop3.b32 %r17563, %r30623, %r30597, %r30595, 0x96; + lop3.b32 %r17563, %r17563, %r30593, %r30591, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17574, %r30620, %r30588, %r30586, 0x96; + lop3.b32 %r17574, %r17574, %r30584, %r30582, 0x96; + lop3.b32 %r17575, %r30621, %r30589, %r30587, 0x96; + lop3.b32 %r17575, %r17575, %r30585, %r30583, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17586, %r17539, %r17538, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17590, %r17538, %r17539, %r17485; + // end inline asm + xor.b32 %r18020, %r17586, %r17574; + xor.b32 %r18021, %r17590, %r17575; + xor.b32 %r17853, %r30618, %r18020; + xor.b32 %r17856, %r30619, %r18021; + xor.b32 %r17760, %r30616, %r18020; + xor.b32 %r17759, %r30617, %r18021; + xor.b32 %r17807, %r30614, %r18020; + xor.b32 %r17808, %r30615, %r18021; + xor.b32 %r17712, %r30612, %r18020; + xor.b32 %r17711, %r30613, %r18021; + xor.b32 %r17663, %r30610, %r18020; + xor.b32 %r17664, %r30611, %r18021; + // begin inline asm + shf.l.wrap.b32 %r17594, %r17551, %r17550, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17598, %r17550, %r17551, %r17485; + // end inline asm + xor.b32 %r18022, %r17594, %r17526; + xor.b32 %r18023, %r17598, %r17527; + xor.b32 %r17815, %r30630, %r18022; + xor.b32 %r17816, %r30631, %r18023; + xor.b32 %r17632, %r30628, %r18022; + xor.b32 %r17631, %r30629, %r18023; + xor.b32 %r17791, %r30608, %r18022; + xor.b32 %r17792, %r30609, %r18023; + xor.b32 %r17752, %r30606, %r18022; + xor.b32 %r17751, %r30607, %r18023; + xor.b32 %r17735, %r30604, %r18022; + xor.b32 %r17736, %r30605, %r18023; + // begin inline asm + shf.l.wrap.b32 %r17602, %r17563, %r17562, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17606, %r17562, %r17563, %r17485; + // end inline asm + xor.b32 %r18024, %r17602, %r17538; + xor.b32 %r18025, %r17606, %r17539; + xor.b32 %r17672, %r30626, %r18024; + xor.b32 %r17671, %r30627, %r18025; + xor.b32 %r17799, %r30624, %r18024; + xor.b32 %r17800, %r30625, %r18025; + xor.b32 %r17680, %r30602, %r18024; + xor.b32 %r17679, %r30603, %r18025; + xor.b32 %r17783, %r30600, %r18024; + xor.b32 %r17784, %r30601, %r18025; + xor.b32 %r17648, %r30598, %r18024; + xor.b32 %r17647, %r30599, %r18025; + // begin inline asm + shf.l.wrap.b32 %r17610, %r17575, %r17574, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17614, %r17574, %r17575, %r17485; + // end inline asm + xor.b32 %r18026, 
%r17610, %r17550; + xor.b32 %r18027, %r17614, %r17551; + xor.b32 %r17767, %r30622, %r18026; + xor.b32 %r17768, %r30623, %r18027; + xor.b32 %r17744, %r30596, %r18026; + xor.b32 %r17743, %r30597, %r18027; + xor.b32 %r17687, %r30594, %r18026; + xor.b32 %r17688, %r30595, %r18027; + xor.b32 %r17775, %r30592, %r18026; + xor.b32 %r17776, %r30593, %r18027; + xor.b32 %r17704, %r30590, %r18026; + xor.b32 %r17703, %r30591, %r18027; + // begin inline asm + shf.l.wrap.b32 %r17618, %r17527, %r17526, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17622, %r17526, %r17527, %r17485; + // end inline asm + xor.b32 %r18028, %r17618, %r17562; + xor.b32 %r18029, %r17622, %r17563; + xor.b32 %r17719, %r30620, %r18028; + xor.b32 %r17720, %r30621, %r18029; + xor.b32 %r17639, %r30588, %r18028; + xor.b32 %r17640, %r30589, %r18029; + xor.b32 %r17656, %r30586, %r18028; + xor.b32 %r17655, %r30587, %r18029; + xor.b32 %r17695, %r30584, %r18028; + xor.b32 %r17696, %r30585, %r18029; + xor.b32 %r17727, %r30582, %r18028; + xor.b32 %r17728, %r30583, %r18029; + mov.u32 %r17633, 44; + // begin inline asm + shf.l.wrap.b32 %r17626, %r17632, %r17631, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17630, %r17631, %r17632, %r17633; + // end inline asm + mov.u32 %r17641, 20; + // begin inline asm + shf.l.wrap.b32 %r17634, %r17640, %r17639, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17638, %r17639, %r17640, %r17641; + // end inline asm + mov.u32 %r17649, 61; + // begin inline asm + shf.l.wrap.b32 %r17642, %r17648, %r17647, %r17649; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17646, %r17647, %r17648, %r17649; + // end inline asm + mov.u32 %r17657, 39; + // begin inline asm + shf.l.wrap.b32 %r17650, %r17656, %r17655, %r17657; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17654, %r17655, %r17656, %r17657; + // end inline asm + mov.u32 %r17665, 18; + // begin inline asm + shf.l.wrap.b32 %r17658, %r17664, %r17663, %r17665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17662, %r17663, %r17664, %r17665; + // end inline asm + mov.u32 %r17673, 62; + // begin inline asm + shf.l.wrap.b32 %r17666, %r17672, %r17671, %r17673; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17670, %r17671, %r17672, %r17673; + // end inline asm + mov.u32 %r17681, 43; + // begin inline asm + shf.l.wrap.b32 %r17674, %r17680, %r17679, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17678, %r17679, %r17680, %r17681; + // end inline asm + mov.u32 %r17689, 25; + // begin inline asm + shf.l.wrap.b32 %r17682, %r17688, %r17687, %r17689; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17686, %r17687, %r17688, %r17689; + // end inline asm + mov.u32 %r17697, 8; + // begin inline asm + shf.l.wrap.b32 %r17690, %r17696, %r17695, %r17697; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17694, %r17695, %r17696, %r17697; + // end inline asm + mov.u32 %r17705, 56; + // begin inline asm + shf.l.wrap.b32 %r17698, %r17704, %r17703, %r17705; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17702, %r17703, %r17704, %r17705; + // end inline asm + mov.u32 %r17713, 41; + // begin inline asm + shf.l.wrap.b32 %r17706, %r17712, %r17711, %r17713; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17710, %r17711, %r17712, %r17713; + // end inline asm + mov.u32 %r17721, 27; + // begin inline asm + shf.l.wrap.b32 %r17714, %r17720, %r17719, %r17721; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r17718, %r17719, %r17720, %r17721; + // end inline asm + mov.u32 %r17729, 14; + // begin inline asm + shf.l.wrap.b32 %r17722, %r17728, %r17727, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17726, %r17727, %r17728, %r17729; + // end inline asm + mov.u32 %r17737, 2; + // begin inline asm + shf.l.wrap.b32 %r17730, %r17736, %r17735, %r17737; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17734, %r17735, %r17736, %r17737; + // end inline asm + mov.u32 %r17745, 55; + // begin inline asm + shf.l.wrap.b32 %r17738, %r17744, %r17743, %r17745; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17742, %r17743, %r17744, %r17745; + // end inline asm + mov.u32 %r17753, 45; + // begin inline asm + shf.l.wrap.b32 %r17746, %r17752, %r17751, %r17753; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17750, %r17751, %r17752, %r17753; + // end inline asm + mov.u32 %r17761, 36; + // begin inline asm + shf.l.wrap.b32 %r17754, %r17760, %r17759, %r17761; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17758, %r17759, %r17760, %r17761; + // end inline asm + mov.u32 %r17769, 28; + // begin inline asm + shf.l.wrap.b32 %r17762, %r17768, %r17767, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17766, %r17767, %r17768, %r17769; + // end inline asm + mov.u32 %r17777, 21; + // begin inline asm + shf.l.wrap.b32 %r17770, %r17776, %r17775, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17774, %r17775, %r17776, %r17777; + // end inline asm + mov.u32 %r17785, 15; + // begin inline asm + shf.l.wrap.b32 %r17778, %r17784, %r17783, %r17785; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17782, %r17783, %r17784, %r17785; + // end inline asm + mov.u32 %r17793, 10; + // begin inline asm + shf.l.wrap.b32 %r17786, %r17792, %r17791, %r17793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17790, %r17791, %r17792, %r17793; + // end inline asm + mov.u32 %r17801, 6; + // begin inline asm + shf.l.wrap.b32 %r17794, %r17800, %r17799, %r17801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17798, %r17799, %r17800, %r17801; + // end inline asm + mov.u32 %r17809, 3; + // begin inline asm + shf.l.wrap.b32 %r17802, %r17808, %r17807, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17806, %r17807, %r17808, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17810, %r17816, %r17815, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17814, %r17815, %r17816, %r17485; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17818, %r17853, %r17626, %r17674, 0xD2; + lop3.b32 %r17819, %r17856, %r17630, %r17678, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30630, %r17626, %r17674, %r17770, 0xD2; + lop3.b32 %r30631, %r17630, %r17678, %r17774, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30626, %r17674, %r17770, %r17722, 0xD2; + lop3.b32 %r30627, %r17678, %r17774, %r17726, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30622, %r17770, %r17722, %r17853, 0xD2; + lop3.b32 %r30623, %r17774, %r17726, %r17856, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30620, %r17722, %r17853, %r17626, 0xD2; + lop3.b32 %r30621, %r17726, %r17856, %r17630, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30616, %r17762, %r17634, %r17802, 0xD2; + lop3.b32 %r30617, %r17766, %r17638, %r17806, 0xD2; + // end inline asm + // begin 
inline asm + // chi + lop3.b32 %r30628, %r17634, %r17802, %r17746, 0xD2; + lop3.b32 %r30629, %r17638, %r17806, %r17750, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30624, %r17802, %r17746, %r17642, 0xD2; + lop3.b32 %r30625, %r17806, %r17750, %r17646, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30596, %r17746, %r17642, %r17762, 0xD2; + lop3.b32 %r30597, %r17750, %r17646, %r17766, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r30596, %r30597}; + // begin inline asm + // chi + lop3.b32 %r30588, %r17642, %r17762, %r17634, 0xD2; + lop3.b32 %r30589, %r17646, %r17766, %r17638, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r30588, %r30589}; + // begin inline asm + // chi + lop3.b32 %r30614, %r17810, %r17794, %r17682, 0xD2; + lop3.b32 %r30615, %r17814, %r17798, %r17686, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+104], {%r30614, %r30615}; + // begin inline asm + // chi + lop3.b32 %r30608, %r17794, %r17682, %r17690, 0xD2; + lop3.b32 %r30609, %r17798, %r17686, %r17694, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+112], {%r30608, %r30609}; + // begin inline asm + // chi + lop3.b32 %r30602, %r17682, %r17690, %r17658, 0xD2; + lop3.b32 %r30603, %r17686, %r17694, %r17662, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+120], {%r30602, %r30603}; + // begin inline asm + // chi + lop3.b32 %r30594, %r17690, %r17658, %r17810, 0xD2; + lop3.b32 %r30595, %r17694, %r17662, %r17814, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+128], {%r30594, %r30595}; + // begin inline asm + // chi + lop3.b32 %r30586, %r17658, %r17810, %r17794, 0xD2; + lop3.b32 %r30587, %r17662, %r17814, %r17798, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+136], {%r30586, %r30587}; + // begin inline asm + // chi + lop3.b32 %r30612, %r17714, %r17754, %r17786, 0xD2; + lop3.b32 %r30613, %r17718, %r17758, %r17790, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+144], {%r30612, %r30613}; + // begin inline asm + // chi + lop3.b32 %r30606, %r17754, %r17786, %r17778, 0xD2; + lop3.b32 %r30607, %r17758, %r17790, %r17782, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+152], {%r30606, %r30607}; + // begin inline asm + // chi + lop3.b32 %r30600, %r17786, %r17778, %r17698, 0xD2; + lop3.b32 %r30601, %r17790, %r17782, %r17702, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+160], {%r30600, %r30601}; + // begin inline asm + // chi + lop3.b32 %r30592, %r17778, %r17698, %r17714, 0xD2; + lop3.b32 %r30593, %r17782, %r17702, %r17718, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+168], {%r30592, %r30593}; + // begin inline asm + // chi + lop3.b32 %r30584, %r17698, %r17714, %r17754, 0xD2; + lop3.b32 %r30585, %r17702, %r17718, %r17758, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+176], {%r30584, %r30585}; + // begin inline asm + // chi + lop3.b32 %r30610, %r17666, %r17738, %r17650, 0xD2; + lop3.b32 %r30611, %r17670, %r17742, %r17654, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+184], {%r30610, %r30611}; + // begin inline asm + // chi + lop3.b32 %r30604, %r17738, %r17650, %r17706, 0xD2; + lop3.b32 %r30605, %r17742, %r17654, %r17710, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+192], {%r30604, %r30605}; + // begin inline asm + // chi + lop3.b32 %r30598, %r17650, %r17706, %r17730, 0xD2; + lop3.b32 %r30599, %r17654, %r17710, %r17734, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+200], {%r30598, %r30599}; + // begin inline asm + // chi + lop3.b32 %r30590, %r17706, %r17730, %r17666, 0xD2; + lop3.b32 %r30591, %r17710, 
%r17734, %r17670, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+208], {%r30590, %r30591}; + // begin inline asm + // chi + lop3.b32 %r30582, %r17730, %r17666, %r17738, 0xD2; + lop3.b32 %r30583, %r17734, %r17670, %r17742, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+216], {%r30582, %r30583}; + mul.wide.s32 %rd810, %r30632, 8; + add.s64 %rd809, %rd789, %rd810; + // begin inline asm + ld.global.nc.v2.u32 {%r18018,%r18019}, [%rd809]; + // end inline asm + xor.b32 %r30618, %r17818, %r18018; + xor.b32 %r30619, %r17819, %r18019; + add.s32 %r30632, %r30632, 1; + setp.lt.u32 %p32, %r30632, 23; + @%p32 bra $L__BB2_53; + + mov.u32 %r30665, 0; + mov.u32 %r18129, 1; + st.local.v2.u32 [%rd176+32], {%r30630, %r30631}; + st.local.v2.u32 [%rd176+72], {%r30628, %r30629}; + st.local.v2.u32 [%rd176+40], {%r30626, %r30627}; + st.local.v2.u32 [%rd176+80], {%r30624, %r30625}; + st.local.v2.u32 [%rd176+48], {%r30622, %r30623}; + st.local.v2.u32 [%rd176+56], {%r30620, %r30621}; + st.local.v2.u32 [%rd176+24], {%r30618, %r30619}; + // begin inline asm + // xor5 + lop3.b32 %r18030, %r30618, %r30616, %r30614, 0x96; + lop3.b32 %r18030, %r18030, %r30612, %r30610, 0x96; + lop3.b32 %r18031, %r30619, %r30617, %r30615, 0x96; + lop3.b32 %r18031, %r18031, %r30613, %r30611, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18042, %r30630, %r30628, %r30608, 0x96; + lop3.b32 %r18042, %r18042, %r30606, %r30604, 0x96; + lop3.b32 %r18043, %r30631, %r30629, %r30609, 0x96; + lop3.b32 %r18043, %r18043, %r30607, %r30605, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18054, %r30626, %r30624, %r30602, 0x96; + lop3.b32 %r18054, %r18054, %r30600, %r30598, 0x96; + lop3.b32 %r18055, %r30627, %r30625, %r30603, 0x96; + lop3.b32 %r18055, %r18055, %r30601, %r30599, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18066, %r30622, %r30596, %r30594, 0x96; + lop3.b32 %r18066, %r18066, %r30592, %r30590, 0x96; + lop3.b32 %r18067, %r30623, %r30597, %r30595, 0x96; + lop3.b32 %r18067, %r18067, %r30593, %r30591, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18078, %r30620, %r30588, %r30586, 0x96; + lop3.b32 %r18078, %r18078, %r30584, %r30582, 0x96; + lop3.b32 %r18079, %r30621, %r30589, %r30587, 0x96; + lop3.b32 %r18079, %r18079, %r30585, %r30583, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18090, %r18043, %r18042, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18094, %r18042, %r18043, %r18129; + // end inline asm + xor.b32 %r18269, %r18090, %r18078; + xor.b32 %r18270, %r18094, %r18079; + xor.b32 %r18237, %r30618, %r18269; + xor.b32 %r18240, %r30619, %r18270; + xor.b32 %r18200, %r30615, %r18270; + xor.b32 %r18199, %r30614, %r18269; + st.local.v2.u32 [%rd176+104], {%r18199, %r18200}; + // begin inline asm + shf.l.wrap.b32 %r18098, %r18055, %r18054, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18102, %r18054, %r18055, %r18129; + // end inline asm + xor.b32 %r18271, %r18098, %r18030; + xor.b32 %r18272, %r18102, %r18031; + xor.b32 %r18136, %r30628, %r18271; + xor.b32 %r18135, %r30629, %r18272; + xor.b32 %r18175, %r30607, %r18272; + xor.b32 %r18176, %r30606, %r18271; + st.local.v2.u32 [%rd176+152], {%r18176, %r18175}; + // begin inline asm + shf.l.wrap.b32 %r18106, %r18067, %r18066, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18110, %r18066, %r18067, %r18129; + // end inline asm + xor.b32 %r18273, %r18106, %r18042; + xor.b32 %r18274, %r18110, %r18043; + xor.b32 
%r18159, %r30603, %r18274; + xor.b32 %r18160, %r30602, %r18273; + st.local.v2.u32 [%rd176+120], {%r18160, %r18159}; + xor.b32 %r18151, %r30599, %r18274; + xor.b32 %r18152, %r30598, %r18273; + st.local.v2.u32 [%rd176+200], {%r18152, %r18151}; + // begin inline asm + shf.l.wrap.b32 %r18114, %r18079, %r18078, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18118, %r18078, %r18079, %r18129; + // end inline asm + xor.b32 %r18275, %r18114, %r18054; + xor.b32 %r18276, %r18118, %r18055; + xor.b32 %r18183, %r30622, %r18275; + xor.b32 %r18184, %r30623, %r18276; + xor.b32 %r18192, %r30593, %r18276; + xor.b32 %r18191, %r30592, %r18275; + st.local.v2.u32 [%rd176+168], {%r18191, %r18192}; + // begin inline asm + shf.l.wrap.b32 %r18122, %r18031, %r18030, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18126, %r18030, %r18031, %r18129; + // end inline asm + xor.b32 %r18277, %r18122, %r18066; + xor.b32 %r18278, %r18126, %r18067; + xor.b32 %r18143, %r30588, %r18277; + xor.b32 %r18144, %r30589, %r18278; + xor.b32 %r18168, %r30583, %r18278; + xor.b32 %r18167, %r30582, %r18277; + st.local.v2.u32 [%rd176+216], {%r18167, %r18168}; + // begin inline asm + shf.l.wrap.b32 %r18130, %r18136, %r18135, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18134, %r18135, %r18136, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18138, %r18144, %r18143, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18142, %r18143, %r18144, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18150, %r18151, %r18152, %r17649; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18146, %r18152, %r18151, %r17649; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r18146, %r18150}; + // begin inline asm + shf.l.wrap.b32 %r18154, %r18160, %r18159, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18158, %r18159, %r18160, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18162, %r18168, %r18167, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18166, %r18167, %r18168, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18174, %r18175, %r18176, %r17753; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18170, %r18176, %r18175, %r17753; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r18170, %r18174}; + // begin inline asm + shf.l.wrap.b32 %r18178, %r18184, %r18183, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18182, %r18183, %r18184, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18186, %r18192, %r18191, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18190, %r18191, %r18192, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18194, %r18200, %r18199, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18198, %r18199, %r18200, %r17809; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18202, %r18237, %r18130, %r18154, 0xD2; + lop3.b32 %r18203, %r18240, %r18134, %r18158, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30765, %r18130, %r18154, %r18186, 0xD2; + lop3.b32 %r30766, %r18134, %r18158, %r18190, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+32], {%r30765, %r30766}; + // begin inline asm + // chi + lop3.b32 %r30761, %r18154, %r18186, %r18162, 0xD2; + lop3.b32 %r30762, %r18158, %r18190, %r18166, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+40], {%r30761, %r30762}; 
+ // begin inline asm + // chi + lop3.b32 %r30757, %r18186, %r18162, %r18237, 0xD2; + lop3.b32 %r30758, %r18190, %r18166, %r18240, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+48], {%r30757, %r30758}; + // begin inline asm + // chi + lop3.b32 %r30755, %r18162, %r18237, %r18130, 0xD2; + lop3.b32 %r30756, %r18166, %r18240, %r18134, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+56], {%r30755, %r30756}; + // begin inline asm + // chi + lop3.b32 %r30751, %r18178, %r18138, %r18194, 0xD2; + lop3.b32 %r30752, %r18182, %r18142, %r18198, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+64], {%r30751, %r30752}; + // begin inline asm + // chi + lop3.b32 %r30763, %r18138, %r18194, %r18170, 0xD2; + lop3.b32 %r30764, %r18142, %r18198, %r18174, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+72], {%r30763, %r30764}; + // begin inline asm + // chi + lop3.b32 %r30759, %r18194, %r18170, %r18146, 0xD2; + lop3.b32 %r30760, %r18198, %r18174, %r18150, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+80], {%r30759, %r30760}; + // begin inline asm + ld.global.nc.v2.u32 {%r18266,%r18267}, [%rd790]; + // end inline asm + xor.b32 %r30753, %r18202, %r18266; + xor.b32 %r30754, %r18203, %r18267; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + add.s64 %rd178, %rd2, 24; + add.s64 %rd179, %rd176, 24; + +$L__BB2_55: + cvta.to.global.u64 %rd1269, %rd361; + shl.b32 %r18279, %r30665, 2; + cvt.u64.u32 %rd820, %r18279; + and.b64 %rd821, %rd820, 60; + add.s64 %rd822, %rd178, %rd821; + xor.b32 %r18280, %r1695, %r30665; + mul.lo.s32 %r18281, %r18280, 16777619; + ld.local.u32 %r18282, [%rd822]; + xor.b32 %r18283, %r18281, %r18282; + mul.wide.u32 %rd823, %r18283, -954391867; + shr.u64 %rd824, %rd823, 32; + cvt.u32.u64 %r18284, %rd824; + sub.s32 %r18285, %r18283, %r18284; + shr.u32 %r18286, %r18285, 1; + add.s32 %r18287, %r18286, %r18284; + shr.u32 %r18288, %r18287, 20; + mul.lo.s32 %r18289, %r18288, 1179641; + sub.s32 %r18290, %r18283, %r18289; + mul.wide.u32 %rd825, %r18290, 64; + add.s64 %rd826, %rd1269, %rd825; + mul.lo.s32 %r18291, %r30702, 16777619; + ld.global.u32 %r18292, [%rd826]; + xor.b32 %r30702, %r18291, %r18292; + mul.lo.s32 %r18293, %r30703, 16777619; + ld.global.u32 %r18294, [%rd826+4]; + xor.b32 %r30703, %r18293, %r18294; + mul.lo.s32 %r18295, %r30714, 16777619; + ld.global.u32 %r18296, [%rd826+8]; + mul.lo.s32 %r18297, %r30715, 16777619; + ld.global.u32 %r18298, [%rd826+12]; + xor.b32 %r18299, %r18297, %r18298; + xor.b32 %r30714, %r18295, %r18296; + mov.b64 %rd827, {%r30714, %r18299}; + mul.lo.s32 %r18300, %r30710, 16777619; + ld.global.u32 %r18301, [%rd826+16]; + mul.lo.s32 %r18302, %r30711, 16777619; + ld.global.u32 %r18303, [%rd826+20]; + xor.b32 %r18304, %r18302, %r18303; + xor.b32 %r30710, %r18300, %r18301; + mov.b64 %rd828, {%r30710, %r18304}; + mul.lo.s32 %r18305, %r30706, 16777619; + ld.global.u32 %r18306, [%rd826+24]; + mul.lo.s32 %r18307, %r30707, 16777619; + ld.global.u32 %r18308, [%rd826+28]; + xor.b32 %r18309, %r18307, %r18308; + xor.b32 %r30706, %r18305, %r18306; + mov.b64 %rd829, {%r30706, %r18309}; + mul.lo.s32 %r18310, %r30704, 16777619; + ld.global.u32 %r18311, [%rd826+32]; + mul.lo.s32 %r18312, %r30705, 16777619; + ld.global.u32 %r18313, [%rd826+36]; + xor.b32 %r18314, %r18312, %r18313; + xor.b32 %r30704, %r18310, %r18311; + mov.b64 %rd830, {%r30704, %r18314}; + mul.lo.s32 %r18315, %r30700, 16777619; + ld.global.u32 %r18316, [%rd826+40]; + xor.b32 %r30700, %r18315, %r18316; + mul.lo.s32 %r18317, %r30701, 16777619; + ld.global.u32 %r18318, [%rd826+44]; + xor.b32 %r30701, 
%r18317, %r18318; + mul.lo.s32 %r18319, %r30712, 16777619; + ld.global.u32 %r18320, [%rd826+48]; + mul.lo.s32 %r18321, %r30713, 16777619; + ld.global.u32 %r18322, [%rd826+52]; + xor.b32 %r18323, %r18321, %r18322; + xor.b32 %r30712, %r18319, %r18320; + mov.b64 %rd831, {%r30712, %r18323}; + mul.lo.s32 %r18324, %r30708, 16777619; + ld.global.u32 %r18325, [%rd826+56]; + mul.lo.s32 %r18326, %r30709, 16777619; + ld.global.u32 %r18327, [%rd826+60]; + xor.b32 %r18328, %r18326, %r18327; + xor.b32 %r30708, %r18324, %r18325; + mov.b64 %rd832, {%r30708, %r18328}; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + st.local.v2.u32 [%rd2+32], {%r30714, %r18299}; + st.local.v2.u32 [%rd2+40], {%r30710, %r18304}; + st.local.v2.u32 [%rd2+48], {%r30706, %r18309}; + st.local.v2.u32 [%rd2+56], {%r30704, %r18314}; + st.local.v2.u32 [%rd2+64], {%r30700, %r30701}; + st.local.v2.u32 [%rd2+72], {%r30712, %r18323}; + st.local.v2.u32 [%rd2+80], {%r30708, %r18328}; + add.s64 %rd833, %rd179, %rd821; + xor.b32 %r18329, %r1696, %r30665; + mul.lo.s32 %r18330, %r18329, 16777619; + ld.local.u32 %r18331, [%rd833]; + xor.b32 %r18332, %r18330, %r18331; + mul.wide.u32 %rd834, %r18332, -954391867; + shr.u64 %rd835, %rd834, 32; + cvt.u32.u64 %r18333, %rd835; + sub.s32 %r18334, %r18332, %r18333; + shr.u32 %r18335, %r18334, 1; + add.s32 %r18336, %r18335, %r18333; + shr.u32 %r18337, %r18336, 20; + mul.lo.s32 %r18338, %r18337, 1179641; + sub.s32 %r18339, %r18332, %r18338; + mul.wide.u32 %rd836, %r18339, 64; + add.s64 %rd837, %rd1269, %rd836; + mul.lo.s32 %r18340, %r30753, 16777619; + ld.global.u32 %r18341, [%rd837]; + xor.b32 %r30753, %r18340, %r18341; + mul.lo.s32 %r18342, %r30754, 16777619; + ld.global.u32 %r18343, [%rd837+4]; + xor.b32 %r30754, %r18342, %r18343; + mul.lo.s32 %r18344, %r30765, 16777619; + ld.global.u32 %r18345, [%rd837+8]; + mul.lo.s32 %r18346, %r30766, 16777619; + ld.global.u32 %r18347, [%rd837+12]; + xor.b32 %r18348, %r18346, %r18347; + xor.b32 %r30765, %r18344, %r18345; + mov.b64 %rd838, {%r30765, %r18348}; + mul.lo.s32 %r18349, %r30761, 16777619; + ld.global.u32 %r18350, [%rd837+16]; + mul.lo.s32 %r18351, %r30762, 16777619; + ld.global.u32 %r18352, [%rd837+20]; + xor.b32 %r18353, %r18351, %r18352; + xor.b32 %r30761, %r18349, %r18350; + mov.b64 %rd839, {%r30761, %r18353}; + mul.lo.s32 %r18354, %r30757, 16777619; + ld.global.u32 %r18355, [%rd837+24]; + mul.lo.s32 %r18356, %r30758, 16777619; + ld.global.u32 %r18357, [%rd837+28]; + xor.b32 %r18358, %r18356, %r18357; + xor.b32 %r30757, %r18354, %r18355; + mov.b64 %rd840, {%r30757, %r18358}; + mul.lo.s32 %r18359, %r30755, 16777619; + ld.global.u32 %r18360, [%rd837+32]; + mul.lo.s32 %r18361, %r30756, 16777619; + ld.global.u32 %r18362, [%rd837+36]; + xor.b32 %r18363, %r18361, %r18362; + xor.b32 %r30755, %r18359, %r18360; + mov.b64 %rd841, {%r30755, %r18363}; + mul.lo.s32 %r18364, %r30751, 16777619; + ld.global.u32 %r18365, [%rd837+40]; + xor.b32 %r30751, %r18364, %r18365; + mul.lo.s32 %r18366, %r30752, 16777619; + ld.global.u32 %r18367, [%rd837+44]; + xor.b32 %r30752, %r18366, %r18367; + mul.lo.s32 %r18368, %r30763, 16777619; + ld.global.u32 %r18369, [%rd837+48]; + mul.lo.s32 %r18370, %r30764, 16777619; + ld.global.u32 %r18371, [%rd837+52]; + xor.b32 %r18372, %r18370, %r18371; + xor.b32 %r30763, %r18368, %r18369; + mov.b64 %rd842, {%r30763, %r18372}; + mul.lo.s32 %r18373, %r30759, 16777619; + ld.global.u32 %r18374, [%rd837+56]; + mul.lo.s32 %r18375, %r30760, 16777619; + ld.global.u32 %r18376, [%rd837+60]; + xor.b32 %r18377, %r18375, %r18376; + xor.b32 %r30759, %r18373, 
%r18374; + mov.b64 %rd843, {%r30759, %r18377}; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + st.local.v2.u32 [%rd176+32], {%r30765, %r18348}; + st.local.v2.u32 [%rd176+40], {%r30761, %r18353}; + st.local.v2.u32 [%rd176+48], {%r30757, %r18358}; + st.local.v2.u32 [%rd176+56], {%r30755, %r18363}; + st.local.v2.u32 [%rd176+64], {%r30751, %r30752}; + st.local.v2.u32 [%rd176+72], {%r30763, %r18372}; + st.local.v2.u32 [%rd176+80], {%r30759, %r18377}; + add.s32 %r30665, %r30665, 1; + setp.lt.u32 %p33, %r30665, 512; + shr.u64 %rd844, %rd827, 32; + cvt.u32.u64 %r30715, %rd844; + shr.u64 %rd845, %rd828, 32; + cvt.u32.u64 %r30711, %rd845; + shr.u64 %rd846, %rd829, 32; + cvt.u32.u64 %r30707, %rd846; + shr.u64 %rd847, %rd830, 32; + cvt.u32.u64 %r30705, %rd847; + shr.u64 %rd848, %rd831, 32; + cvt.u32.u64 %r30713, %rd848; + shr.u64 %rd849, %rd832, 32; + cvt.u32.u64 %r30709, %rd849; + shr.u64 %rd850, %rd838, 32; + cvt.u32.u64 %r30766, %rd850; + shr.u64 %rd851, %rd839, 32; + cvt.u32.u64 %r30762, %rd851; + shr.u64 %rd852, %rd840, 32; + cvt.u32.u64 %r30758, %rd852; + shr.u64 %rd853, %rd841, 32; + cvt.u32.u64 %r30756, %rd853; + shr.u64 %rd854, %rd842, 32; + cvt.u32.u64 %r30764, %rd854; + shr.u64 %rd855, %rd843, 32; + cvt.u32.u64 %r30760, %rd855; + @%p33 bra $L__BB2_55; + + mov.u32 %r30666, 0; + st.local.v2.u32 [%rd2+96], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+104], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+112], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+120], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+128], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+136], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+144], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+152], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+160], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+168], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+176], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+184], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+192], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+200], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+208], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+216], {%r30666, %r30666}; + mov.u32 %r30681, -2147483648; + mov.u32 %r18392, 1; + st.local.v2.u32 [%rd2+88], {%r18392, %r30681}; + mov.u32 %r30667, %r30666; + mov.u32 %r30668, %r30666; + mov.u32 %r30669, %r30666; + mov.u32 %r30670, %r30666; + mov.u32 %r30671, %r30666; + mov.u32 %r30672, %r30666; + mov.u32 %r30673, %r30666; + mov.u32 %r30674, %r30666; + mov.u32 %r30675, %r30666; + mov.u32 %r30676, %r30666; + mov.u32 %r30677, %r30666; + mov.u32 %r30678, %r30666; + mov.u32 %r30679, %r30666; + mov.u32 %r30680, %r18392; + mov.u32 %r30682, %r30666; + mov.u32 %r30683, %r30666; + mov.u32 %r30684, %r30666; + mov.u32 %r30685, %r30666; + mov.u32 %r30686, %r30666; + mov.u32 %r30687, %r30666; + mov.u32 %r30688, %r30666; + mov.u32 %r30689, %r30666; + mov.u32 %r30690, %r30666; + mov.u32 %r30691, %r30666; + mov.u32 %r30692, %r30666; + mov.u32 %r30693, %r30666; + mov.u32 %r30694, %r30666; + mov.u32 %r30695, %r30666; + mov.u32 %r30696, %r30666; + mov.u32 %r30697, %r30666; + mov.u32 %r30698, %r30666; + mov.u32 %r30699, %r30666; + mov.u32 %r30716, %r30666; + +$L__BB2_57: + // begin inline asm + // xor5 + lop3.b32 %r18419, %r30702, %r30700, %r30698, 0x96; + lop3.b32 %r18419, %r18419, %r30696, %r30694, 0x96; + lop3.b32 %r18420, %r30703, %r30701, %r30699, 0x96; + lop3.b32 %r18420, %r18420, %r30697, %r30695, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18431, %r30714, %r30712, %r30692, 0x96; + lop3.b32 %r18431, %r18431, %r30690, %r30688, 0x96; + lop3.b32 
%r18432, %r30715, %r30713, %r30693, 0x96; + lop3.b32 %r18432, %r18432, %r30691, %r30689, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18443, %r30710, %r30708, %r30686, 0x96; + lop3.b32 %r18443, %r18443, %r30684, %r30682, 0x96; + lop3.b32 %r18444, %r30711, %r30709, %r30687, 0x96; + lop3.b32 %r18444, %r18444, %r30685, %r30683, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18455, %r30706, %r30680, %r30678, 0x96; + lop3.b32 %r18455, %r18455, %r30676, %r30674, 0x96; + lop3.b32 %r18456, %r30707, %r30681, %r30679, 0x96; + lop3.b32 %r18456, %r18456, %r30677, %r30675, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18467, %r30704, %r30672, %r30670, 0x96; + lop3.b32 %r18467, %r18467, %r30668, %r30666, 0x96; + lop3.b32 %r18468, %r30705, %r30673, %r30671, 0x96; + lop3.b32 %r18468, %r18468, %r30669, %r30667, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18479, %r18432, %r18431, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18483, %r18431, %r18432, %r18392; + // end inline asm + xor.b32 %r18913, %r18479, %r18467; + xor.b32 %r18914, %r18483, %r18468; + xor.b32 %r18746, %r30702, %r18913; + xor.b32 %r18749, %r30703, %r18914; + xor.b32 %r18653, %r30700, %r18913; + xor.b32 %r18652, %r30701, %r18914; + xor.b32 %r18700, %r30698, %r18913; + xor.b32 %r18701, %r30699, %r18914; + xor.b32 %r18605, %r30696, %r18913; + xor.b32 %r18604, %r30697, %r18914; + xor.b32 %r18556, %r30694, %r18913; + xor.b32 %r18557, %r30695, %r18914; + // begin inline asm + shf.l.wrap.b32 %r18487, %r18444, %r18443, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18491, %r18443, %r18444, %r18392; + // end inline asm + xor.b32 %r18915, %r18487, %r18419; + xor.b32 %r18916, %r18491, %r18420; + xor.b32 %r18708, %r30714, %r18915; + xor.b32 %r18709, %r30715, %r18916; + xor.b32 %r18525, %r30712, %r18915; + xor.b32 %r18524, %r30713, %r18916; + xor.b32 %r18684, %r30692, %r18915; + xor.b32 %r18685, %r30693, %r18916; + xor.b32 %r18645, %r30690, %r18915; + xor.b32 %r18644, %r30691, %r18916; + xor.b32 %r18628, %r30688, %r18915; + xor.b32 %r18629, %r30689, %r18916; + // begin inline asm + shf.l.wrap.b32 %r18495, %r18456, %r18455, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18499, %r18455, %r18456, %r18392; + // end inline asm + xor.b32 %r18917, %r18495, %r18431; + xor.b32 %r18918, %r18499, %r18432; + xor.b32 %r18565, %r30710, %r18917; + xor.b32 %r18564, %r30711, %r18918; + xor.b32 %r18692, %r30708, %r18917; + xor.b32 %r18693, %r30709, %r18918; + xor.b32 %r18573, %r30686, %r18917; + xor.b32 %r18572, %r30687, %r18918; + xor.b32 %r18676, %r30684, %r18917; + xor.b32 %r18677, %r30685, %r18918; + xor.b32 %r18541, %r30682, %r18917; + xor.b32 %r18540, %r30683, %r18918; + // begin inline asm + shf.l.wrap.b32 %r18503, %r18468, %r18467, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18507, %r18467, %r18468, %r18392; + // end inline asm + xor.b32 %r18919, %r18503, %r18443; + xor.b32 %r18920, %r18507, %r18444; + xor.b32 %r18660, %r30706, %r18919; + xor.b32 %r18661, %r30707, %r18920; + xor.b32 %r18637, %r30680, %r18919; + xor.b32 %r18636, %r30681, %r18920; + xor.b32 %r18580, %r30678, %r18919; + xor.b32 %r18581, %r30679, %r18920; + xor.b32 %r18668, %r30676, %r18919; + xor.b32 %r18669, %r30677, %r18920; + xor.b32 %r18597, %r30674, %r18919; + xor.b32 %r18596, %r30675, %r18920; + // begin inline asm + shf.l.wrap.b32 %r18511, %r18420, %r18419, %r18392; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r18515, %r18419, %r18420, %r18392; + // end inline asm + xor.b32 %r18921, %r18511, %r18455; + xor.b32 %r18922, %r18515, %r18456; + xor.b32 %r18612, %r30704, %r18921; + xor.b32 %r18613, %r30705, %r18922; + xor.b32 %r18532, %r30672, %r18921; + xor.b32 %r18533, %r30673, %r18922; + xor.b32 %r18549, %r30670, %r18921; + xor.b32 %r18548, %r30671, %r18922; + xor.b32 %r18588, %r30668, %r18921; + xor.b32 %r18589, %r30669, %r18922; + xor.b32 %r18620, %r30666, %r18921; + xor.b32 %r18621, %r30667, %r18922; + mov.u32 %r18526, 44; + // begin inline asm + shf.l.wrap.b32 %r18519, %r18525, %r18524, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18523, %r18524, %r18525, %r18526; + // end inline asm + mov.u32 %r18534, 20; + // begin inline asm + shf.l.wrap.b32 %r18527, %r18533, %r18532, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18531, %r18532, %r18533, %r18534; + // end inline asm + mov.u32 %r18542, 61; + // begin inline asm + shf.l.wrap.b32 %r18535, %r18541, %r18540, %r18542; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18539, %r18540, %r18541, %r18542; + // end inline asm + mov.u32 %r18550, 39; + // begin inline asm + shf.l.wrap.b32 %r18543, %r18549, %r18548, %r18550; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18547, %r18548, %r18549, %r18550; + // end inline asm + mov.u32 %r18558, 18; + // begin inline asm + shf.l.wrap.b32 %r18551, %r18557, %r18556, %r18558; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18555, %r18556, %r18557, %r18558; + // end inline asm + mov.u32 %r18566, 62; + // begin inline asm + shf.l.wrap.b32 %r18559, %r18565, %r18564, %r18566; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18563, %r18564, %r18565, %r18566; + // end inline asm + mov.u32 %r18574, 43; + // begin inline asm + shf.l.wrap.b32 %r18567, %r18573, %r18572, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18571, %r18572, %r18573, %r18574; + // end inline asm + mov.u32 %r18582, 25; + // begin inline asm + shf.l.wrap.b32 %r18575, %r18581, %r18580, %r18582; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18579, %r18580, %r18581, %r18582; + // end inline asm + mov.u32 %r18590, 8; + // begin inline asm + shf.l.wrap.b32 %r18583, %r18589, %r18588, %r18590; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18587, %r18588, %r18589, %r18590; + // end inline asm + mov.u32 %r18598, 56; + // begin inline asm + shf.l.wrap.b32 %r18591, %r18597, %r18596, %r18598; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18595, %r18596, %r18597, %r18598; + // end inline asm + mov.u32 %r18606, 41; + // begin inline asm + shf.l.wrap.b32 %r18599, %r18605, %r18604, %r18606; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18603, %r18604, %r18605, %r18606; + // end inline asm + mov.u32 %r18614, 27; + // begin inline asm + shf.l.wrap.b32 %r18607, %r18613, %r18612, %r18614; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18611, %r18612, %r18613, %r18614; + // end inline asm + mov.u32 %r18622, 14; + // begin inline asm + shf.l.wrap.b32 %r18615, %r18621, %r18620, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18619, %r18620, %r18621, %r18622; + // end inline asm + mov.u32 %r18630, 2; + // begin inline asm + shf.l.wrap.b32 %r18623, %r18629, %r18628, %r18630; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18627, %r18628, %r18629, %r18630; + // end inline asm + mov.u32 %r18638, 55; + // begin inline asm + 
shf.l.wrap.b32 %r18631, %r18637, %r18636, %r18638; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18635, %r18636, %r18637, %r18638; + // end inline asm + mov.u32 %r18646, 45; + // begin inline asm + shf.l.wrap.b32 %r18639, %r18645, %r18644, %r18646; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18643, %r18644, %r18645, %r18646; + // end inline asm + mov.u32 %r18654, 36; + // begin inline asm + shf.l.wrap.b32 %r18647, %r18653, %r18652, %r18654; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18651, %r18652, %r18653, %r18654; + // end inline asm + mov.u32 %r18662, 28; + // begin inline asm + shf.l.wrap.b32 %r18655, %r18661, %r18660, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18659, %r18660, %r18661, %r18662; + // end inline asm + mov.u32 %r18670, 21; + // begin inline asm + shf.l.wrap.b32 %r18663, %r18669, %r18668, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18667, %r18668, %r18669, %r18670; + // end inline asm + mov.u32 %r18678, 15; + // begin inline asm + shf.l.wrap.b32 %r18671, %r18677, %r18676, %r18678; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18675, %r18676, %r18677, %r18678; + // end inline asm + mov.u32 %r18686, 10; + // begin inline asm + shf.l.wrap.b32 %r18679, %r18685, %r18684, %r18686; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18683, %r18684, %r18685, %r18686; + // end inline asm + mov.u32 %r18694, 6; + // begin inline asm + shf.l.wrap.b32 %r18687, %r18693, %r18692, %r18694; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18691, %r18692, %r18693, %r18694; + // end inline asm + mov.u32 %r18702, 3; + // begin inline asm + shf.l.wrap.b32 %r18695, %r18701, %r18700, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18699, %r18700, %r18701, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18703, %r18709, %r18708, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18707, %r18708, %r18709, %r18392; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18711, %r18746, %r18519, %r18567, 0xD2; + lop3.b32 %r18712, %r18749, %r18523, %r18571, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30714, %r18519, %r18567, %r18663, 0xD2; + lop3.b32 %r30715, %r18523, %r18571, %r18667, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30710, %r18567, %r18663, %r18615, 0xD2; + lop3.b32 %r30711, %r18571, %r18667, %r18619, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30706, %r18663, %r18615, %r18746, 0xD2; + lop3.b32 %r30707, %r18667, %r18619, %r18749, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30704, %r18615, %r18746, %r18519, 0xD2; + lop3.b32 %r30705, %r18619, %r18749, %r18523, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30700, %r18655, %r18527, %r18695, 0xD2; + lop3.b32 %r30701, %r18659, %r18531, %r18699, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30712, %r18527, %r18695, %r18639, 0xD2; + lop3.b32 %r30713, %r18531, %r18699, %r18643, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30708, %r18695, %r18639, %r18535, 0xD2; + lop3.b32 %r30709, %r18699, %r18643, %r18539, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30680, %r18639, %r18535, %r18655, 0xD2; + lop3.b32 %r30681, %r18643, %r18539, %r18659, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30680, %r30681}; + // begin inline asm + // chi + 
lop3.b32 %r30672, %r18535, %r18655, %r18527, 0xD2; + lop3.b32 %r30673, %r18539, %r18659, %r18531, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30672, %r30673}; + // begin inline asm + // chi + lop3.b32 %r30698, %r18703, %r18687, %r18575, 0xD2; + lop3.b32 %r30699, %r18707, %r18691, %r18579, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30698, %r30699}; + // begin inline asm + // chi + lop3.b32 %r30692, %r18687, %r18575, %r18583, 0xD2; + lop3.b32 %r30693, %r18691, %r18579, %r18587, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30692, %r30693}; + // begin inline asm + // chi + lop3.b32 %r30686, %r18575, %r18583, %r18551, 0xD2; + lop3.b32 %r30687, %r18579, %r18587, %r18555, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30686, %r30687}; + // begin inline asm + // chi + lop3.b32 %r30678, %r18583, %r18551, %r18703, 0xD2; + lop3.b32 %r30679, %r18587, %r18555, %r18707, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30678, %r30679}; + // begin inline asm + // chi + lop3.b32 %r30670, %r18551, %r18703, %r18687, 0xD2; + lop3.b32 %r30671, %r18555, %r18707, %r18691, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30670, %r30671}; + // begin inline asm + // chi + lop3.b32 %r30696, %r18607, %r18647, %r18679, 0xD2; + lop3.b32 %r30697, %r18611, %r18651, %r18683, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30696, %r30697}; + // begin inline asm + // chi + lop3.b32 %r30690, %r18647, %r18679, %r18671, 0xD2; + lop3.b32 %r30691, %r18651, %r18683, %r18675, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30690, %r30691}; + // begin inline asm + // chi + lop3.b32 %r30684, %r18679, %r18671, %r18591, 0xD2; + lop3.b32 %r30685, %r18683, %r18675, %r18595, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30684, %r30685}; + // begin inline asm + // chi + lop3.b32 %r30676, %r18671, %r18591, %r18607, 0xD2; + lop3.b32 %r30677, %r18675, %r18595, %r18611, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30676, %r30677}; + // begin inline asm + // chi + lop3.b32 %r30668, %r18591, %r18607, %r18647, 0xD2; + lop3.b32 %r30669, %r18595, %r18611, %r18651, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30668, %r30669}; + // begin inline asm + // chi + lop3.b32 %r30694, %r18559, %r18631, %r18543, 0xD2; + lop3.b32 %r30695, %r18563, %r18635, %r18547, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30694, %r30695}; + // begin inline asm + // chi + lop3.b32 %r30688, %r18631, %r18543, %r18599, 0xD2; + lop3.b32 %r30689, %r18635, %r18547, %r18603, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30688, %r30689}; + // begin inline asm + // chi + lop3.b32 %r30682, %r18543, %r18599, %r18623, 0xD2; + lop3.b32 %r30683, %r18547, %r18603, %r18627, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30682, %r30683}; + // begin inline asm + // chi + lop3.b32 %r30674, %r18599, %r18623, %r18559, 0xD2; + lop3.b32 %r30675, %r18603, %r18627, %r18563, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30674, %r30675}; + // begin inline asm + // chi + lop3.b32 %r30666, %r18623, %r18559, %r18631, 0xD2; + lop3.b32 %r30667, %r18627, %r18563, %r18635, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30666, %r30667}; + mul.wide.s32 %rd857, %r30716, 8; + add.s64 %rd856, %rd789, %rd857; + // begin inline asm + ld.global.nc.v2.u32 {%r18911,%r18912}, [%rd856]; + // end inline asm + xor.b32 %r30702, %r18711, %r18911; + xor.b32 %r30703, %r18712, %r18912; + add.s32 %r30716, %r30716, 1; + 
setp.lt.u32 %p34, %r30716, 23; + @%p34 bra $L__BB2_57; + + st.local.v2.u32 [%rd2+32], {%r30714, %r30715}; + st.local.v2.u32 [%rd2+72], {%r30712, %r30713}; + st.local.v2.u32 [%rd2+40], {%r30710, %r30711}; + st.local.v2.u32 [%rd2+80], {%r30708, %r30709}; + st.local.v2.u32 [%rd2+48], {%r30706, %r30707}; + st.local.v2.u32 [%rd2+56], {%r30704, %r30705}; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + // begin inline asm + // xor5 + lop3.b32 %r18923, %r30702, %r30700, %r30698, 0x96; + lop3.b32 %r18923, %r18923, %r30696, %r30694, 0x96; + lop3.b32 %r18924, %r30703, %r30701, %r30699, 0x96; + lop3.b32 %r18924, %r18924, %r30697, %r30695, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18935, %r30714, %r30712, %r30692, 0x96; + lop3.b32 %r18935, %r18935, %r30690, %r30688, 0x96; + lop3.b32 %r18936, %r30715, %r30713, %r30693, 0x96; + lop3.b32 %r18936, %r18936, %r30691, %r30689, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18947, %r30710, %r30708, %r30686, 0x96; + lop3.b32 %r18947, %r18947, %r30684, %r30682, 0x96; + lop3.b32 %r18948, %r30711, %r30709, %r30687, 0x96; + lop3.b32 %r18948, %r18948, %r30685, %r30683, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18959, %r30706, %r30680, %r30678, 0x96; + lop3.b32 %r18959, %r18959, %r30676, %r30674, 0x96; + lop3.b32 %r18960, %r30707, %r30681, %r30679, 0x96; + lop3.b32 %r18960, %r18960, %r30677, %r30675, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18971, %r30704, %r30672, %r30670, 0x96; + lop3.b32 %r18971, %r18971, %r30668, %r30666, 0x96; + lop3.b32 %r18972, %r30705, %r30673, %r30671, 0x96; + lop3.b32 %r18972, %r18972, %r30669, %r30667, 0x96; + // end inline asm + mov.u32 %r19175, 1; + // begin inline asm + shf.l.wrap.b32 %r18983, %r18936, %r18935, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18987, %r18935, %r18936, %r19175; + // end inline asm + xor.b32 %r19202, %r18983, %r18971; + xor.b32 %r19203, %r18987, %r18972; + xor.b32 %r19130, %r30702, %r19202; + xor.b32 %r19133, %r30703, %r19203; + xor.b32 %r19093, %r30699, %r19203; + xor.b32 %r19092, %r30698, %r19202; + st.local.v2.u32 [%rd2+104], {%r19092, %r19093}; + // begin inline asm + shf.l.wrap.b32 %r18991, %r18948, %r18947, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18995, %r18947, %r18948, %r19175; + // end inline asm + xor.b32 %r19204, %r18991, %r18923; + xor.b32 %r19205, %r18995, %r18924; + xor.b32 %r19029, %r30712, %r19204; + xor.b32 %r19028, %r30713, %r19205; + xor.b32 %r19068, %r30691, %r19205; + xor.b32 %r19069, %r30690, %r19204; + st.local.v2.u32 [%rd2+152], {%r19069, %r19068}; + // begin inline asm + shf.l.wrap.b32 %r18999, %r18960, %r18959, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19003, %r18959, %r18960, %r19175; + // end inline asm + xor.b32 %r19206, %r18999, %r18935; + xor.b32 %r19207, %r19003, %r18936; + xor.b32 %r19052, %r30687, %r19207; + xor.b32 %r19053, %r30686, %r19206; + st.local.v2.u32 [%rd2+120], {%r19053, %r19052}; + xor.b32 %r19044, %r30683, %r19207; + xor.b32 %r19045, %r30682, %r19206; + st.local.v2.u32 [%rd2+200], {%r19045, %r19044}; + // begin inline asm + shf.l.wrap.b32 %r19007, %r18972, %r18971, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19011, %r18971, %r18972, %r19175; + // end inline asm + xor.b32 %r19208, %r19007, %r18947; + xor.b32 %r19209, %r19011, %r18948; + xor.b32 %r19076, %r30706, %r19208; + xor.b32 %r19077, %r30707, %r19209; + xor.b32 %r19085, %r30677, %r19209; + 
xor.b32 %r19084, %r30676, %r19208; + st.local.v2.u32 [%rd2+168], {%r19084, %r19085}; + // begin inline asm + shf.l.wrap.b32 %r19015, %r18924, %r18923, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19019, %r18923, %r18924, %r19175; + // end inline asm + xor.b32 %r19210, %r19015, %r18959; + xor.b32 %r19211, %r19019, %r18960; + xor.b32 %r19036, %r30672, %r19210; + xor.b32 %r19037, %r30673, %r19211; + xor.b32 %r19061, %r30667, %r19211; + xor.b32 %r19060, %r30666, %r19210; + st.local.v2.u32 [%rd2+216], {%r19060, %r19061}; + // begin inline asm + shf.l.wrap.b32 %r19023, %r19029, %r19028, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19027, %r19028, %r19029, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19031, %r19037, %r19036, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19035, %r19036, %r19037, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19043, %r19044, %r19045, %r18542; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19039, %r19045, %r19044, %r18542; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r19039, %r19043}; + // begin inline asm + shf.l.wrap.b32 %r19047, %r19053, %r19052, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19051, %r19052, %r19053, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19055, %r19061, %r19060, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19059, %r19060, %r19061, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19067, %r19068, %r19069, %r18646; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19063, %r19069, %r19068, %r18646; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r19063, %r19067}; + // begin inline asm + shf.l.wrap.b32 %r19071, %r19077, %r19076, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19075, %r19076, %r19077, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19079, %r19085, %r19084, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19083, %r19084, %r19085, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19087, %r19093, %r19092, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19091, %r19092, %r19093, %r18702; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19095, %r19130, %r19023, %r19047, 0xD2; + lop3.b32 %r19096, %r19133, %r19027, %r19051, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19103, %r19023, %r19047, %r19079, 0xD2; + lop3.b32 %r19104, %r19027, %r19051, %r19083, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r19103, %r19104}; + // begin inline asm + // chi + lop3.b32 %r19111, %r19047, %r19079, %r19055, 0xD2; + lop3.b32 %r19112, %r19051, %r19083, %r19059, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r19111, %r19112}; + // begin inline asm + // chi + lop3.b32 %r19119, %r19079, %r19055, %r19130, 0xD2; + lop3.b32 %r19120, %r19083, %r19059, %r19133, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r19119, %r19120}; + // begin inline asm + // chi + lop3.b32 %r19127, %r19055, %r19130, %r19023, 0xD2; + lop3.b32 %r19128, %r19059, %r19133, %r19027, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r19127, %r19128}; + // begin inline asm + // chi + lop3.b32 %r19135, %r19071, %r19031, %r19087, 0xD2; + lop3.b32 %r19136, %r19075, %r19035, %r19091, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r19135, %r19136}; + // begin 
inline asm + // chi + lop3.b32 %r19143, %r19031, %r19087, %r19063, 0xD2; + lop3.b32 %r19144, %r19035, %r19091, %r19067, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r19143, %r19144}; + // begin inline asm + // chi + lop3.b32 %r19151, %r19087, %r19063, %r19039, 0xD2; + lop3.b32 %r19152, %r19091, %r19067, %r19043, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r19151, %r19152}; + // begin inline asm + ld.global.nc.v2.u32 {%r19159,%r19160}, [%rd790]; + // end inline asm + xor.b32 %r19212, %r19096, %r19160; + xor.b32 %r19213, %r19095, %r19159; + mov.b64 %rd1333, {%r19213, %r19212}; + mov.b64 %rd1334, {%r19103, %r19104}; + mov.b64 %rd1335, {%r19111, %r19112}; + mov.b64 %rd1336, {%r19119, %r19120}; + mov.b64 %rd1337, {%r19127, %r19128}; + mov.b64 %rd1338, {%r19135, %r19136}; + mov.b64 %rd1339, {%r19143, %r19144}; + mov.b64 %rd1340, {%r19151, %r19152}; + mov.u32 %r30717, 0; + st.local.v2.u32 [%rd2+24], {%r19213, %r19212}; + st.local.v2.u32 [%rd176+96], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+104], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+112], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+120], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+128], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+136], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+144], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+152], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+160], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+168], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+176], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+184], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+192], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+200], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+208], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+216], {%r30717, %r30717}; + mov.u32 %r30732, -2147483648; + st.local.v2.u32 [%rd176+88], {%r19175, %r30732}; + mov.u32 %r30718, %r30717; + mov.u32 %r30719, %r30717; + mov.u32 %r30720, %r30717; + mov.u32 %r30721, %r30717; + mov.u32 %r30722, %r30717; + mov.u32 %r30723, %r30717; + mov.u32 %r30724, %r30717; + mov.u32 %r30725, %r30717; + mov.u32 %r30726, %r30717; + mov.u32 %r30727, %r30717; + mov.u32 %r30728, %r30717; + mov.u32 %r30729, %r30717; + mov.u32 %r30730, %r30717; + mov.u32 %r30731, %r19175; + mov.u32 %r30733, %r30717; + mov.u32 %r30734, %r30717; + mov.u32 %r30735, %r30717; + mov.u32 %r30736, %r30717; + mov.u32 %r30737, %r30717; + mov.u32 %r30738, %r30717; + mov.u32 %r30739, %r30717; + mov.u32 %r30740, %r30717; + mov.u32 %r30741, %r30717; + mov.u32 %r30742, %r30717; + mov.u32 %r30743, %r30717; + mov.u32 %r30744, %r30717; + mov.u32 %r30745, %r30717; + mov.u32 %r30746, %r30717; + mov.u32 %r30747, %r30717; + mov.u32 %r30748, %r30717; + mov.u32 %r30749, %r30717; + mov.u32 %r30750, %r30717; + mov.u32 %r30767, %r30717; + +$L__BB2_59: + // begin inline asm + // xor5 + lop3.b32 %r19214, %r30753, %r30751, %r30749, 0x96; + lop3.b32 %r19214, %r19214, %r30747, %r30745, 0x96; + lop3.b32 %r19215, %r30754, %r30752, %r30750, 0x96; + lop3.b32 %r19215, %r19215, %r30748, %r30746, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19226, %r30765, %r30763, %r30743, 0x96; + lop3.b32 %r19226, %r19226, %r30741, %r30739, 0x96; + lop3.b32 %r19227, %r30766, %r30764, %r30744, 0x96; + lop3.b32 %r19227, %r19227, %r30742, %r30740, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19238, %r30761, %r30759, %r30737, 0x96; + lop3.b32 %r19238, %r19238, %r30735, %r30733, 0x96; + lop3.b32 %r19239, %r30762, %r30760, %r30738, 0x96; + lop3.b32 %r19239, %r19239, 
%r30736, %r30734, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19250, %r30757, %r30731, %r30729, 0x96; + lop3.b32 %r19250, %r19250, %r30727, %r30725, 0x96; + lop3.b32 %r19251, %r30758, %r30732, %r30730, 0x96; + lop3.b32 %r19251, %r19251, %r30728, %r30726, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19262, %r30755, %r30723, %r30721, 0x96; + lop3.b32 %r19262, %r19262, %r30719, %r30717, 0x96; + lop3.b32 %r19263, %r30756, %r30724, %r30722, 0x96; + lop3.b32 %r19263, %r19263, %r30720, %r30718, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19274, %r19227, %r19226, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19278, %r19226, %r19227, %r19175; + // end inline asm + xor.b32 %r19708, %r19274, %r19262; + xor.b32 %r19709, %r19278, %r19263; + xor.b32 %r19541, %r30753, %r19708; + xor.b32 %r19544, %r30754, %r19709; + xor.b32 %r19448, %r30751, %r19708; + xor.b32 %r19447, %r30752, %r19709; + xor.b32 %r19495, %r30749, %r19708; + xor.b32 %r19496, %r30750, %r19709; + xor.b32 %r19400, %r30747, %r19708; + xor.b32 %r19399, %r30748, %r19709; + xor.b32 %r19351, %r30745, %r19708; + xor.b32 %r19352, %r30746, %r19709; + // begin inline asm + shf.l.wrap.b32 %r19282, %r19239, %r19238, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19286, %r19238, %r19239, %r19175; + // end inline asm + xor.b32 %r19710, %r19282, %r19214; + xor.b32 %r19711, %r19286, %r19215; + xor.b32 %r19503, %r30765, %r19710; + xor.b32 %r19504, %r30766, %r19711; + xor.b32 %r19320, %r30763, %r19710; + xor.b32 %r19319, %r30764, %r19711; + xor.b32 %r19479, %r30743, %r19710; + xor.b32 %r19480, %r30744, %r19711; + xor.b32 %r19440, %r30741, %r19710; + xor.b32 %r19439, %r30742, %r19711; + xor.b32 %r19423, %r30739, %r19710; + xor.b32 %r19424, %r30740, %r19711; + // begin inline asm + shf.l.wrap.b32 %r19290, %r19251, %r19250, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19294, %r19250, %r19251, %r19175; + // end inline asm + xor.b32 %r19712, %r19290, %r19226; + xor.b32 %r19713, %r19294, %r19227; + xor.b32 %r19360, %r30761, %r19712; + xor.b32 %r19359, %r30762, %r19713; + xor.b32 %r19487, %r30759, %r19712; + xor.b32 %r19488, %r30760, %r19713; + xor.b32 %r19368, %r30737, %r19712; + xor.b32 %r19367, %r30738, %r19713; + xor.b32 %r19471, %r30735, %r19712; + xor.b32 %r19472, %r30736, %r19713; + xor.b32 %r19336, %r30733, %r19712; + xor.b32 %r19335, %r30734, %r19713; + // begin inline asm + shf.l.wrap.b32 %r19298, %r19263, %r19262, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19302, %r19262, %r19263, %r19175; + // end inline asm + xor.b32 %r19714, %r19298, %r19238; + xor.b32 %r19715, %r19302, %r19239; + xor.b32 %r19455, %r30757, %r19714; + xor.b32 %r19456, %r30758, %r19715; + xor.b32 %r19432, %r30731, %r19714; + xor.b32 %r19431, %r30732, %r19715; + xor.b32 %r19375, %r30729, %r19714; + xor.b32 %r19376, %r30730, %r19715; + xor.b32 %r19463, %r30727, %r19714; + xor.b32 %r19464, %r30728, %r19715; + xor.b32 %r19392, %r30725, %r19714; + xor.b32 %r19391, %r30726, %r19715; + // begin inline asm + shf.l.wrap.b32 %r19306, %r19215, %r19214, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19310, %r19214, %r19215, %r19175; + // end inline asm + xor.b32 %r19716, %r19306, %r19250; + xor.b32 %r19717, %r19310, %r19251; + xor.b32 %r19407, %r30755, %r19716; + xor.b32 %r19408, %r30756, %r19717; + xor.b32 %r19327, %r30723, %r19716; + xor.b32 %r19328, %r30724, %r19717; + xor.b32 %r19344, %r30721, 
%r19716; + xor.b32 %r19343, %r30722, %r19717; + xor.b32 %r19383, %r30719, %r19716; + xor.b32 %r19384, %r30720, %r19717; + xor.b32 %r19415, %r30717, %r19716; + xor.b32 %r19416, %r30718, %r19717; + mov.u32 %r19321, 44; + // begin inline asm + shf.l.wrap.b32 %r19314, %r19320, %r19319, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19318, %r19319, %r19320, %r19321; + // end inline asm + mov.u32 %r19329, 20; + // begin inline asm + shf.l.wrap.b32 %r19322, %r19328, %r19327, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19326, %r19327, %r19328, %r19329; + // end inline asm + mov.u32 %r19337, 61; + // begin inline asm + shf.l.wrap.b32 %r19330, %r19336, %r19335, %r19337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19334, %r19335, %r19336, %r19337; + // end inline asm + mov.u32 %r19345, 39; + // begin inline asm + shf.l.wrap.b32 %r19338, %r19344, %r19343, %r19345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19342, %r19343, %r19344, %r19345; + // end inline asm + mov.u32 %r19353, 18; + // begin inline asm + shf.l.wrap.b32 %r19346, %r19352, %r19351, %r19353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19350, %r19351, %r19352, %r19353; + // end inline asm + mov.u32 %r19361, 62; + // begin inline asm + shf.l.wrap.b32 %r19354, %r19360, %r19359, %r19361; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19358, %r19359, %r19360, %r19361; + // end inline asm + mov.u32 %r19369, 43; + // begin inline asm + shf.l.wrap.b32 %r19362, %r19368, %r19367, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19366, %r19367, %r19368, %r19369; + // end inline asm + mov.u32 %r19377, 25; + // begin inline asm + shf.l.wrap.b32 %r19370, %r19376, %r19375, %r19377; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19374, %r19375, %r19376, %r19377; + // end inline asm + mov.u32 %r19385, 8; + // begin inline asm + shf.l.wrap.b32 %r19378, %r19384, %r19383, %r19385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19382, %r19383, %r19384, %r19385; + // end inline asm + mov.u32 %r19393, 56; + // begin inline asm + shf.l.wrap.b32 %r19386, %r19392, %r19391, %r19393; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19390, %r19391, %r19392, %r19393; + // end inline asm + mov.u32 %r19401, 41; + // begin inline asm + shf.l.wrap.b32 %r19394, %r19400, %r19399, %r19401; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19398, %r19399, %r19400, %r19401; + // end inline asm + mov.u32 %r19409, 27; + // begin inline asm + shf.l.wrap.b32 %r19402, %r19408, %r19407, %r19409; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19406, %r19407, %r19408, %r19409; + // end inline asm + mov.u32 %r19417, 14; + // begin inline asm + shf.l.wrap.b32 %r19410, %r19416, %r19415, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19414, %r19415, %r19416, %r19417; + // end inline asm + mov.u32 %r19425, 2; + // begin inline asm + shf.l.wrap.b32 %r19418, %r19424, %r19423, %r19425; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19422, %r19423, %r19424, %r19425; + // end inline asm + mov.u32 %r19433, 55; + // begin inline asm + shf.l.wrap.b32 %r19426, %r19432, %r19431, %r19433; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19430, %r19431, %r19432, %r19433; + // end inline asm + mov.u32 %r19441, 45; + // begin inline asm + shf.l.wrap.b32 %r19434, %r19440, %r19439, %r19441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19438, 
%r19439, %r19440, %r19441; + // end inline asm + mov.u32 %r19449, 36; + // begin inline asm + shf.l.wrap.b32 %r19442, %r19448, %r19447, %r19449; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19446, %r19447, %r19448, %r19449; + // end inline asm + mov.u32 %r19457, 28; + // begin inline asm + shf.l.wrap.b32 %r19450, %r19456, %r19455, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19454, %r19455, %r19456, %r19457; + // end inline asm + mov.u32 %r19465, 21; + // begin inline asm + shf.l.wrap.b32 %r19458, %r19464, %r19463, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19462, %r19463, %r19464, %r19465; + // end inline asm + mov.u32 %r19473, 15; + // begin inline asm + shf.l.wrap.b32 %r19466, %r19472, %r19471, %r19473; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19470, %r19471, %r19472, %r19473; + // end inline asm + mov.u32 %r19481, 10; + // begin inline asm + shf.l.wrap.b32 %r19474, %r19480, %r19479, %r19481; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19478, %r19479, %r19480, %r19481; + // end inline asm + mov.u32 %r19489, 6; + // begin inline asm + shf.l.wrap.b32 %r19482, %r19488, %r19487, %r19489; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19486, %r19487, %r19488, %r19489; + // end inline asm + mov.u32 %r19497, 3; + // begin inline asm + shf.l.wrap.b32 %r19490, %r19496, %r19495, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19494, %r19495, %r19496, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19498, %r19504, %r19503, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19502, %r19503, %r19504, %r19175; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19506, %r19541, %r19314, %r19362, 0xD2; + lop3.b32 %r19507, %r19544, %r19318, %r19366, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30765, %r19314, %r19362, %r19458, 0xD2; + lop3.b32 %r30766, %r19318, %r19366, %r19462, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30761, %r19362, %r19458, %r19410, 0xD2; + lop3.b32 %r30762, %r19366, %r19462, %r19414, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30757, %r19458, %r19410, %r19541, 0xD2; + lop3.b32 %r30758, %r19462, %r19414, %r19544, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30755, %r19410, %r19541, %r19314, 0xD2; + lop3.b32 %r30756, %r19414, %r19544, %r19318, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30751, %r19450, %r19322, %r19490, 0xD2; + lop3.b32 %r30752, %r19454, %r19326, %r19494, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30763, %r19322, %r19490, %r19434, 0xD2; + lop3.b32 %r30764, %r19326, %r19494, %r19438, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30759, %r19490, %r19434, %r19330, 0xD2; + lop3.b32 %r30760, %r19494, %r19438, %r19334, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30731, %r19434, %r19330, %r19450, 0xD2; + lop3.b32 %r30732, %r19438, %r19334, %r19454, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r30731, %r30732}; + // begin inline asm + // chi + lop3.b32 %r30723, %r19330, %r19450, %r19322, 0xD2; + lop3.b32 %r30724, %r19334, %r19454, %r19326, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r30723, %r30724}; + // begin inline asm + // chi + lop3.b32 %r30749, %r19498, %r19482, %r19370, 0xD2; + lop3.b32 %r30750, %r19502, %r19486, %r19374, 0xD2; + // end inline asm 
+ st.local.v2.u32 [%rd176+104], {%r30749, %r30750}; + // begin inline asm + // chi + lop3.b32 %r30743, %r19482, %r19370, %r19378, 0xD2; + lop3.b32 %r30744, %r19486, %r19374, %r19382, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+112], {%r30743, %r30744}; + // begin inline asm + // chi + lop3.b32 %r30737, %r19370, %r19378, %r19346, 0xD2; + lop3.b32 %r30738, %r19374, %r19382, %r19350, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+120], {%r30737, %r30738}; + // begin inline asm + // chi + lop3.b32 %r30729, %r19378, %r19346, %r19498, 0xD2; + lop3.b32 %r30730, %r19382, %r19350, %r19502, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+128], {%r30729, %r30730}; + // begin inline asm + // chi + lop3.b32 %r30721, %r19346, %r19498, %r19482, 0xD2; + lop3.b32 %r30722, %r19350, %r19502, %r19486, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+136], {%r30721, %r30722}; + // begin inline asm + // chi + lop3.b32 %r30747, %r19402, %r19442, %r19474, 0xD2; + lop3.b32 %r30748, %r19406, %r19446, %r19478, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+144], {%r30747, %r30748}; + // begin inline asm + // chi + lop3.b32 %r30741, %r19442, %r19474, %r19466, 0xD2; + lop3.b32 %r30742, %r19446, %r19478, %r19470, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+152], {%r30741, %r30742}; + // begin inline asm + // chi + lop3.b32 %r30735, %r19474, %r19466, %r19386, 0xD2; + lop3.b32 %r30736, %r19478, %r19470, %r19390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+160], {%r30735, %r30736}; + // begin inline asm + // chi + lop3.b32 %r30727, %r19466, %r19386, %r19402, 0xD2; + lop3.b32 %r30728, %r19470, %r19390, %r19406, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+168], {%r30727, %r30728}; + // begin inline asm + // chi + lop3.b32 %r30719, %r19386, %r19402, %r19442, 0xD2; + lop3.b32 %r30720, %r19390, %r19406, %r19446, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+176], {%r30719, %r30720}; + // begin inline asm + // chi + lop3.b32 %r30745, %r19354, %r19426, %r19338, 0xD2; + lop3.b32 %r30746, %r19358, %r19430, %r19342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+184], {%r30745, %r30746}; + // begin inline asm + // chi + lop3.b32 %r30739, %r19426, %r19338, %r19394, 0xD2; + lop3.b32 %r30740, %r19430, %r19342, %r19398, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+192], {%r30739, %r30740}; + // begin inline asm + // chi + lop3.b32 %r30733, %r19338, %r19394, %r19418, 0xD2; + lop3.b32 %r30734, %r19342, %r19398, %r19422, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+200], {%r30733, %r30734}; + // begin inline asm + // chi + lop3.b32 %r30725, %r19394, %r19418, %r19354, 0xD2; + lop3.b32 %r30726, %r19398, %r19422, %r19358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+208], {%r30725, %r30726}; + // begin inline asm + // chi + lop3.b32 %r30717, %r19418, %r19354, %r19426, 0xD2; + lop3.b32 %r30718, %r19422, %r19358, %r19430, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+216], {%r30717, %r30718}; + mul.wide.s32 %rd864, %r30767, 8; + add.s64 %rd863, %rd789, %rd864; + // begin inline asm + ld.global.nc.v2.u32 {%r19706,%r19707}, [%rd863]; + // end inline asm + xor.b32 %r30753, %r19506, %r19706; + xor.b32 %r30754, %r19507, %r19707; + add.s32 %r30767, %r30767, 1; + setp.lt.u32 %p35, %r30767, 23; + @%p35 bra $L__BB2_59; + + mov.u32 %r19817, 1; + st.local.v2.u32 [%rd176+32], {%r30765, %r30766}; + st.local.v2.u32 [%rd176+72], {%r30763, %r30764}; + st.local.v2.u32 [%rd176+40], {%r30761, %r30762}; + st.local.v2.u32 [%rd176+80], {%r30759, %r30760}; + st.local.v2.u32 
[%rd176+48], {%r30757, %r30758}; + st.local.v2.u32 [%rd176+56], {%r30755, %r30756}; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + // begin inline asm + // xor5 + lop3.b32 %r19718, %r30753, %r30751, %r30749, 0x96; + lop3.b32 %r19718, %r19718, %r30747, %r30745, 0x96; + lop3.b32 %r19719, %r30754, %r30752, %r30750, 0x96; + lop3.b32 %r19719, %r19719, %r30748, %r30746, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19730, %r30765, %r30763, %r30743, 0x96; + lop3.b32 %r19730, %r19730, %r30741, %r30739, 0x96; + lop3.b32 %r19731, %r30766, %r30764, %r30744, 0x96; + lop3.b32 %r19731, %r19731, %r30742, %r30740, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19742, %r30761, %r30759, %r30737, 0x96; + lop3.b32 %r19742, %r19742, %r30735, %r30733, 0x96; + lop3.b32 %r19743, %r30762, %r30760, %r30738, 0x96; + lop3.b32 %r19743, %r19743, %r30736, %r30734, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19754, %r30757, %r30731, %r30729, 0x96; + lop3.b32 %r19754, %r19754, %r30727, %r30725, 0x96; + lop3.b32 %r19755, %r30758, %r30732, %r30730, 0x96; + lop3.b32 %r19755, %r19755, %r30728, %r30726, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19766, %r30755, %r30723, %r30721, 0x96; + lop3.b32 %r19766, %r19766, %r30719, %r30717, 0x96; + lop3.b32 %r19767, %r30756, %r30724, %r30722, 0x96; + lop3.b32 %r19767, %r19767, %r30720, %r30718, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19778, %r19731, %r19730, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19782, %r19730, %r19731, %r19817; + // end inline asm + xor.b32 %r19956, %r19778, %r19766; + xor.b32 %r19957, %r19782, %r19767; + xor.b32 %r19925, %r30753, %r19956; + xor.b32 %r19928, %r30754, %r19957; + xor.b32 %r19888, %r30750, %r19957; + xor.b32 %r19887, %r30749, %r19956; + st.local.v2.u32 [%rd176+104], {%r19887, %r19888}; + // begin inline asm + shf.l.wrap.b32 %r19786, %r19743, %r19742, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19790, %r19742, %r19743, %r19817; + // end inline asm + xor.b32 %r19958, %r19786, %r19718; + xor.b32 %r19959, %r19790, %r19719; + xor.b32 %r19824, %r30763, %r19958; + xor.b32 %r19823, %r30764, %r19959; + xor.b32 %r19863, %r30742, %r19959; + xor.b32 %r19864, %r30741, %r19958; + st.local.v2.u32 [%rd176+152], {%r19864, %r19863}; + // begin inline asm + shf.l.wrap.b32 %r19794, %r19755, %r19754, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19798, %r19754, %r19755, %r19817; + // end inline asm + xor.b32 %r19960, %r19794, %r19730; + xor.b32 %r19961, %r19798, %r19731; + xor.b32 %r19847, %r30738, %r19961; + xor.b32 %r19848, %r30737, %r19960; + st.local.v2.u32 [%rd176+120], {%r19848, %r19847}; + xor.b32 %r19839, %r30734, %r19961; + xor.b32 %r19840, %r30733, %r19960; + st.local.v2.u32 [%rd176+200], {%r19840, %r19839}; + // begin inline asm + shf.l.wrap.b32 %r19802, %r19767, %r19766, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19806, %r19766, %r19767, %r19817; + // end inline asm + xor.b32 %r19962, %r19802, %r19742; + xor.b32 %r19963, %r19806, %r19743; + xor.b32 %r19871, %r30757, %r19962; + xor.b32 %r19872, %r30758, %r19963; + xor.b32 %r19880, %r30728, %r19963; + xor.b32 %r19879, %r30727, %r19962; + st.local.v2.u32 [%rd176+168], {%r19879, %r19880}; + // begin inline asm + shf.l.wrap.b32 %r19810, %r19719, %r19718, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19814, %r19718, %r19719, %r19817; + // end inline asm + 
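+ // note: each shf.l.wrap.b32 pair above assembles a 64-bit rotate-left from two
+ // 32-bit funnel shifts; with shift amount 1 this is theta's rotl64(C, 1) term,
+ // which the xor.b32 instructions below fold into the remaining columns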
xor.b32 %r19964, %r19810, %r19754; + xor.b32 %r19965, %r19814, %r19755; + xor.b32 %r19831, %r30723, %r19964; + xor.b32 %r19832, %r30724, %r19965; + xor.b32 %r19856, %r30718, %r19965; + xor.b32 %r19855, %r30717, %r19964; + st.local.v2.u32 [%rd176+216], {%r19855, %r19856}; + // begin inline asm + shf.l.wrap.b32 %r19818, %r19824, %r19823, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19822, %r19823, %r19824, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19826, %r19832, %r19831, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19830, %r19831, %r19832, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19838, %r19839, %r19840, %r19337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19834, %r19840, %r19839, %r19337; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r19834, %r19838}; + // begin inline asm + shf.l.wrap.b32 %r19842, %r19848, %r19847, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19846, %r19847, %r19848, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19850, %r19856, %r19855, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19854, %r19855, %r19856, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19862, %r19863, %r19864, %r19441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19858, %r19864, %r19863, %r19441; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r19858, %r19862}; + // begin inline asm + shf.l.wrap.b32 %r19866, %r19872, %r19871, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19870, %r19871, %r19872, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19874, %r19880, %r19879, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19878, %r19879, %r19880, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19882, %r19888, %r19887, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19886, %r19887, %r19888, %r19497; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19890, %r19925, %r19818, %r19842, 0xD2; + lop3.b32 %r19891, %r19928, %r19822, %r19846, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19898, %r19818, %r19842, %r19874, 0xD2; + lop3.b32 %r19899, %r19822, %r19846, %r19878, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+32], {%r19898, %r19899}; + // begin inline asm + // chi + lop3.b32 %r19906, %r19842, %r19874, %r19850, 0xD2; + lop3.b32 %r19907, %r19846, %r19878, %r19854, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+40], {%r19906, %r19907}; + // begin inline asm + // chi + lop3.b32 %r19914, %r19874, %r19850, %r19925, 0xD2; + lop3.b32 %r19915, %r19878, %r19854, %r19928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+48], {%r19914, %r19915}; + // begin inline asm + // chi + lop3.b32 %r19922, %r19850, %r19925, %r19818, 0xD2; + lop3.b32 %r19923, %r19854, %r19928, %r19822, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+56], {%r19922, %r19923}; + // begin inline asm + // chi + lop3.b32 %r19930, %r19866, %r19826, %r19882, 0xD2; + lop3.b32 %r19931, %r19870, %r19830, %r19886, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+64], {%r19930, %r19931}; + // begin inline asm + // chi + lop3.b32 %r19938, %r19826, %r19882, %r19858, 0xD2; + lop3.b32 %r19939, %r19830, %r19886, %r19862, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+72], {%r19938, %r19939}; + // begin inline asm + // chi + lop3.b32 %r19946, %r19882, 
%r19858, %r19834, 0xD2; + lop3.b32 %r19947, %r19886, %r19862, %r19838, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+80], {%r19946, %r19947}; + // begin inline asm + ld.global.nc.v2.u32 {%r19954,%r19955}, [%rd790]; + // end inline asm + xor.b32 %r19966, %r19891, %r19955; + xor.b32 %r19967, %r19890, %r19954; + st.local.v2.u32 [%rd176+24], {%r19967, %r19966}; + mov.b64 %rd1342, {%r19898, %r19899}; + mov.b64 %rd1343, {%r19906, %r19907}; + mov.b64 %rd1346, {%r19930, %r19931}; + mov.b64 %rd1347, {%r19938, %r19939}; + mov.b64 %rd1348, {%r19946, %r19947}; + mov.b64 %rd1341, {%r19967, %r19966}; + mov.b64 %rd1344, {%r19914, %r19915}; + mov.b64 %rd1345, {%r19922, %r19923}; + bra.uni $L__BB2_61; + +$L__BB2_39: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd671, 1179641; + st.local.u64 [%rd2+8], %rd671; + st.local.u32 [%rd2+16], %r1695; + ld.global.u64 %rd672, [%rd126]; + ld.global.u64 %rd673, [%rd126+8]; + ld.global.u64 %rd674, [%rd126+16]; + ld.global.u64 %rd675, [%rd126+24]; + ld.global.u64 %rd676, [%rd126+32]; + ld.global.u64 %rd677, [%rd126+40]; + ld.global.u64 %rd678, [%rd126+48]; + ld.global.u64 %rd679, [%rd126+56]; + st.local.u64 [%rd2+24], %rd672; + st.local.u64 [%rd2+32], %rd673; + st.local.u64 [%rd2+40], %rd674; + st.local.u64 [%rd2+48], %rd675; + st.local.u64 [%rd2+56], %rd676; + st.local.u64 [%rd2+64], %rd677; + st.local.u64 [%rd2+72], %rd678; + st.local.u64 [%rd2+80], %rd679; + cvt.u32.u64 %r13441, %rd672; + xor.b32 %r13442, %r1695, %r13441; + st.local.u32 [%rd2+24], %r13442; + mov.u32 %r30294, 0; + st.local.v2.u32 [%rd2+96], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+104], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+112], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+120], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+128], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+136], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+144], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+152], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+160], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+168], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+176], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+184], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+192], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+200], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+208], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+216], {%r30294, %r30294}; + mov.u32 %r30309, -2147483648; + mov.u32 %r13414, 1; + st.local.v2.u32 [%rd2+88], {%r13414, %r30309}; + ld.local.v2.u32 {%r30330, %r30331}, [%rd2+24]; + mov.b64 {%r30328, %r30329}, %rd677; + shr.u64 %rd680, %rd673, 32; + cvt.u32.u64 %r30342, %rd673; + cvt.u32.u64 %r30343, %rd680; + shr.u64 %rd681, %rd678, 32; + cvt.u32.u64 %r30340, %rd678; + cvt.u32.u64 %r30341, %rd681; + shr.u64 %rd682, %rd674, 32; + cvt.u32.u64 %r30338, %rd674; + cvt.u32.u64 %r30339, %rd682; + shr.u64 %rd683, %rd679, 32; + cvt.u32.u64 %r30336, %rd679; + cvt.u32.u64 %r30337, %rd683; + shr.u64 %rd684, %rd675, 32; + cvt.u32.u64 %r30334, %rd675; + cvt.u32.u64 %r30335, %rd684; + shr.u64 %rd685, %rd676, 32; + cvt.u32.u64 %r30332, %rd676; + cvt.u32.u64 %r30333, %rd685; + mov.u32 %r30295, %r30294; + mov.u32 %r30296, %r30294; + mov.u32 %r30297, %r30294; + mov.u32 %r30298, %r30294; + mov.u32 %r30299, %r30294; + mov.u32 %r30300, %r30294; + mov.u32 %r30301, %r30294; + mov.u32 %r30302, %r30294; + mov.u32 %r30303, %r30294; + mov.u32 %r30304, %r30294; + mov.u32 %r30305, %r30294; + mov.u32 %r30306, %r30294; + mov.u32 %r30307, %r30294; + mov.u32 %r30308, %r13414; + mov.u32 %r30310, %r30294; + mov.u32 %r30311, %r30294; + mov.u32 %r30312, %r30294; + mov.u32 
%r30313, %r30294; + mov.u32 %r30314, %r30294; + mov.u32 %r30315, %r30294; + mov.u32 %r30316, %r30294; + mov.u32 %r30317, %r30294; + mov.u32 %r30318, %r30294; + mov.u32 %r30319, %r30294; + mov.u32 %r30320, %r30294; + mov.u32 %r30321, %r30294; + mov.u32 %r30322, %r30294; + mov.u32 %r30323, %r30294; + mov.u32 %r30324, %r30294; + mov.u32 %r30325, %r30294; + mov.u32 %r30326, %r30294; + mov.u32 %r30327, %r30294; + mov.u32 %r30344, %r30294; + +$L__BB2_40: + // begin inline asm + // xor5 + lop3.b32 %r13445, %r30330, %r30328, %r30326, 0x96; + lop3.b32 %r13445, %r13445, %r30324, %r30322, 0x96; + lop3.b32 %r13446, %r30331, %r30329, %r30327, 0x96; + lop3.b32 %r13446, %r13446, %r30325, %r30323, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13457, %r30342, %r30340, %r30320, 0x96; + lop3.b32 %r13457, %r13457, %r30318, %r30316, 0x96; + lop3.b32 %r13458, %r30343, %r30341, %r30321, 0x96; + lop3.b32 %r13458, %r13458, %r30319, %r30317, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13469, %r30338, %r30336, %r30314, 0x96; + lop3.b32 %r13469, %r13469, %r30312, %r30310, 0x96; + lop3.b32 %r13470, %r30339, %r30337, %r30315, 0x96; + lop3.b32 %r13470, %r13470, %r30313, %r30311, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13481, %r30334, %r30308, %r30306, 0x96; + lop3.b32 %r13481, %r13481, %r30304, %r30302, 0x96; + lop3.b32 %r13482, %r30335, %r30309, %r30307, 0x96; + lop3.b32 %r13482, %r13482, %r30305, %r30303, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13493, %r30332, %r30300, %r30298, 0x96; + lop3.b32 %r13493, %r13493, %r30296, %r30294, 0x96; + lop3.b32 %r13494, %r30333, %r30301, %r30299, 0x96; + lop3.b32 %r13494, %r13494, %r30297, %r30295, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13505, %r13458, %r13457, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13509, %r13457, %r13458, %r13414; + // end inline asm + xor.b32 %r13939, %r13505, %r13493; + xor.b32 %r13940, %r13509, %r13494; + xor.b32 %r13772, %r30330, %r13939; + xor.b32 %r13775, %r30331, %r13940; + xor.b32 %r13679, %r30328, %r13939; + xor.b32 %r13678, %r30329, %r13940; + xor.b32 %r13726, %r30326, %r13939; + xor.b32 %r13727, %r30327, %r13940; + xor.b32 %r13631, %r30324, %r13939; + xor.b32 %r13630, %r30325, %r13940; + xor.b32 %r13582, %r30322, %r13939; + xor.b32 %r13583, %r30323, %r13940; + // begin inline asm + shf.l.wrap.b32 %r13513, %r13470, %r13469, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13517, %r13469, %r13470, %r13414; + // end inline asm + xor.b32 %r13941, %r13513, %r13445; + xor.b32 %r13942, %r13517, %r13446; + xor.b32 %r13734, %r30342, %r13941; + xor.b32 %r13735, %r30343, %r13942; + xor.b32 %r13551, %r30340, %r13941; + xor.b32 %r13550, %r30341, %r13942; + xor.b32 %r13710, %r30320, %r13941; + xor.b32 %r13711, %r30321, %r13942; + xor.b32 %r13671, %r30318, %r13941; + xor.b32 %r13670, %r30319, %r13942; + xor.b32 %r13654, %r30316, %r13941; + xor.b32 %r13655, %r30317, %r13942; + // begin inline asm + shf.l.wrap.b32 %r13521, %r13482, %r13481, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13525, %r13481, %r13482, %r13414; + // end inline asm + xor.b32 %r13943, %r13521, %r13457; + xor.b32 %r13944, %r13525, %r13458; + xor.b32 %r13591, %r30338, %r13943; + xor.b32 %r13590, %r30339, %r13944; + xor.b32 %r13718, %r30336, %r13943; + xor.b32 %r13719, %r30337, %r13944; + xor.b32 %r13599, %r30314, %r13943; + xor.b32 %r13598, %r30315, %r13944; + xor.b32 %r13702, %r30312, 
%r13943; + xor.b32 %r13703, %r30313, %r13944; + xor.b32 %r13567, %r30310, %r13943; + xor.b32 %r13566, %r30311, %r13944; + // begin inline asm + shf.l.wrap.b32 %r13529, %r13494, %r13493, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13533, %r13493, %r13494, %r13414; + // end inline asm + xor.b32 %r13945, %r13529, %r13469; + xor.b32 %r13946, %r13533, %r13470; + xor.b32 %r13686, %r30334, %r13945; + xor.b32 %r13687, %r30335, %r13946; + xor.b32 %r13663, %r30308, %r13945; + xor.b32 %r13662, %r30309, %r13946; + xor.b32 %r13606, %r30306, %r13945; + xor.b32 %r13607, %r30307, %r13946; + xor.b32 %r13694, %r30304, %r13945; + xor.b32 %r13695, %r30305, %r13946; + xor.b32 %r13623, %r30302, %r13945; + xor.b32 %r13622, %r30303, %r13946; + // begin inline asm + shf.l.wrap.b32 %r13537, %r13446, %r13445, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13541, %r13445, %r13446, %r13414; + // end inline asm + xor.b32 %r13947, %r13537, %r13481; + xor.b32 %r13948, %r13541, %r13482; + xor.b32 %r13638, %r30332, %r13947; + xor.b32 %r13639, %r30333, %r13948; + xor.b32 %r13558, %r30300, %r13947; + xor.b32 %r13559, %r30301, %r13948; + xor.b32 %r13575, %r30298, %r13947; + xor.b32 %r13574, %r30299, %r13948; + xor.b32 %r13614, %r30296, %r13947; + xor.b32 %r13615, %r30297, %r13948; + xor.b32 %r13646, %r30294, %r13947; + xor.b32 %r13647, %r30295, %r13948; + mov.u32 %r13552, 44; + // begin inline asm + shf.l.wrap.b32 %r13545, %r13551, %r13550, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13549, %r13550, %r13551, %r13552; + // end inline asm + mov.u32 %r13560, 20; + // begin inline asm + shf.l.wrap.b32 %r13553, %r13559, %r13558, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13557, %r13558, %r13559, %r13560; + // end inline asm + mov.u32 %r13568, 61; + // begin inline asm + shf.l.wrap.b32 %r13561, %r13567, %r13566, %r13568; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13565, %r13566, %r13567, %r13568; + // end inline asm + mov.u32 %r13576, 39; + // begin inline asm + shf.l.wrap.b32 %r13569, %r13575, %r13574, %r13576; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13573, %r13574, %r13575, %r13576; + // end inline asm + mov.u32 %r13584, 18; + // begin inline asm + shf.l.wrap.b32 %r13577, %r13583, %r13582, %r13584; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13581, %r13582, %r13583, %r13584; + // end inline asm + mov.u32 %r13592, 62; + // begin inline asm + shf.l.wrap.b32 %r13585, %r13591, %r13590, %r13592; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13589, %r13590, %r13591, %r13592; + // end inline asm + mov.u32 %r13600, 43; + // begin inline asm + shf.l.wrap.b32 %r13593, %r13599, %r13598, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13597, %r13598, %r13599, %r13600; + // end inline asm + mov.u32 %r13608, 25; + // begin inline asm + shf.l.wrap.b32 %r13601, %r13607, %r13606, %r13608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13605, %r13606, %r13607, %r13608; + // end inline asm + mov.u32 %r13616, 8; + // begin inline asm + shf.l.wrap.b32 %r13609, %r13615, %r13614, %r13616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13613, %r13614, %r13615, %r13616; + // end inline asm + mov.u32 %r13624, 56; + // begin inline asm + shf.l.wrap.b32 %r13617, %r13623, %r13622, %r13624; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13621, %r13622, %r13623, %r13624; + // end inline asm + mov.u32 %r13632, 41; + // begin 
inline asm + shf.l.wrap.b32 %r13625, %r13631, %r13630, %r13632; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13629, %r13630, %r13631, %r13632; + // end inline asm + mov.u32 %r13640, 27; + // begin inline asm + shf.l.wrap.b32 %r13633, %r13639, %r13638, %r13640; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13637, %r13638, %r13639, %r13640; + // end inline asm + mov.u32 %r13648, 14; + // begin inline asm + shf.l.wrap.b32 %r13641, %r13647, %r13646, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13645, %r13646, %r13647, %r13648; + // end inline asm + mov.u32 %r13656, 2; + // begin inline asm + shf.l.wrap.b32 %r13649, %r13655, %r13654, %r13656; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13653, %r13654, %r13655, %r13656; + // end inline asm + mov.u32 %r13664, 55; + // begin inline asm + shf.l.wrap.b32 %r13657, %r13663, %r13662, %r13664; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13661, %r13662, %r13663, %r13664; + // end inline asm + mov.u32 %r13672, 45; + // begin inline asm + shf.l.wrap.b32 %r13665, %r13671, %r13670, %r13672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13669, %r13670, %r13671, %r13672; + // end inline asm + mov.u32 %r13680, 36; + // begin inline asm + shf.l.wrap.b32 %r13673, %r13679, %r13678, %r13680; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13677, %r13678, %r13679, %r13680; + // end inline asm + mov.u32 %r13688, 28; + // begin inline asm + shf.l.wrap.b32 %r13681, %r13687, %r13686, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13685, %r13686, %r13687, %r13688; + // end inline asm + mov.u32 %r13696, 21; + // begin inline asm + shf.l.wrap.b32 %r13689, %r13695, %r13694, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13693, %r13694, %r13695, %r13696; + // end inline asm + mov.u32 %r13704, 15; + // begin inline asm + shf.l.wrap.b32 %r13697, %r13703, %r13702, %r13704; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13701, %r13702, %r13703, %r13704; + // end inline asm + mov.u32 %r13712, 10; + // begin inline asm + shf.l.wrap.b32 %r13705, %r13711, %r13710, %r13712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13709, %r13710, %r13711, %r13712; + // end inline asm + mov.u32 %r13720, 6; + // begin inline asm + shf.l.wrap.b32 %r13713, %r13719, %r13718, %r13720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13717, %r13718, %r13719, %r13720; + // end inline asm + mov.u32 %r13728, 3; + // begin inline asm + shf.l.wrap.b32 %r13721, %r13727, %r13726, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13725, %r13726, %r13727, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13729, %r13735, %r13734, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13733, %r13734, %r13735, %r13414; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13737, %r13772, %r13545, %r13593, 0xD2; + lop3.b32 %r13738, %r13775, %r13549, %r13597, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30342, %r13545, %r13593, %r13689, 0xD2; + lop3.b32 %r30343, %r13549, %r13597, %r13693, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30338, %r13593, %r13689, %r13641, 0xD2; + lop3.b32 %r30339, %r13597, %r13693, %r13645, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30334, %r13689, %r13641, %r13772, 0xD2; + lop3.b32 %r30335, %r13693, %r13645, %r13775, 0xD2; + // end inline asm + // 
begin inline asm + // chi + lop3.b32 %r30332, %r13641, %r13772, %r13545, 0xD2; + lop3.b32 %r30333, %r13645, %r13775, %r13549, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30328, %r13681, %r13553, %r13721, 0xD2; + lop3.b32 %r30329, %r13685, %r13557, %r13725, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30340, %r13553, %r13721, %r13665, 0xD2; + lop3.b32 %r30341, %r13557, %r13725, %r13669, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30336, %r13721, %r13665, %r13561, 0xD2; + lop3.b32 %r30337, %r13725, %r13669, %r13565, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30308, %r13665, %r13561, %r13681, 0xD2; + lop3.b32 %r30309, %r13669, %r13565, %r13685, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30308, %r30309}; + // begin inline asm + // chi + lop3.b32 %r30300, %r13561, %r13681, %r13553, 0xD2; + lop3.b32 %r30301, %r13565, %r13685, %r13557, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30300, %r30301}; + // begin inline asm + // chi + lop3.b32 %r30326, %r13729, %r13713, %r13601, 0xD2; + lop3.b32 %r30327, %r13733, %r13717, %r13605, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30326, %r30327}; + // begin inline asm + // chi + lop3.b32 %r30320, %r13713, %r13601, %r13609, 0xD2; + lop3.b32 %r30321, %r13717, %r13605, %r13613, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30320, %r30321}; + // begin inline asm + // chi + lop3.b32 %r30314, %r13601, %r13609, %r13577, 0xD2; + lop3.b32 %r30315, %r13605, %r13613, %r13581, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30314, %r30315}; + // begin inline asm + // chi + lop3.b32 %r30306, %r13609, %r13577, %r13729, 0xD2; + lop3.b32 %r30307, %r13613, %r13581, %r13733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30306, %r30307}; + // begin inline asm + // chi + lop3.b32 %r30298, %r13577, %r13729, %r13713, 0xD2; + lop3.b32 %r30299, %r13581, %r13733, %r13717, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30298, %r30299}; + // begin inline asm + // chi + lop3.b32 %r30324, %r13633, %r13673, %r13705, 0xD2; + lop3.b32 %r30325, %r13637, %r13677, %r13709, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30324, %r30325}; + // begin inline asm + // chi + lop3.b32 %r30318, %r13673, %r13705, %r13697, 0xD2; + lop3.b32 %r30319, %r13677, %r13709, %r13701, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30318, %r30319}; + // begin inline asm + // chi + lop3.b32 %r30312, %r13705, %r13697, %r13617, 0xD2; + lop3.b32 %r30313, %r13709, %r13701, %r13621, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30312, %r30313}; + // begin inline asm + // chi + lop3.b32 %r30304, %r13697, %r13617, %r13633, 0xD2; + lop3.b32 %r30305, %r13701, %r13621, %r13637, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30304, %r30305}; + // begin inline asm + // chi + lop3.b32 %r30296, %r13617, %r13633, %r13673, 0xD2; + lop3.b32 %r30297, %r13621, %r13637, %r13677, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30296, %r30297}; + // begin inline asm + // chi + lop3.b32 %r30322, %r13585, %r13657, %r13569, 0xD2; + lop3.b32 %r30323, %r13589, %r13661, %r13573, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30322, %r30323}; + // begin inline asm + // chi + lop3.b32 %r30316, %r13657, %r13569, %r13625, 0xD2; + lop3.b32 %r30317, %r13661, %r13573, %r13629, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30316, %r30317}; + // begin inline asm + // chi + 
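+ // (chi again: a ^ (~b & c) via lop3 immediate 0xD2 on each 32-bit half)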
lop3.b32 %r30310, %r13569, %r13625, %r13649, 0xD2; + lop3.b32 %r30311, %r13573, %r13629, %r13653, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30310, %r30311}; + // begin inline asm + // chi + lop3.b32 %r30302, %r13625, %r13649, %r13585, 0xD2; + lop3.b32 %r30303, %r13629, %r13653, %r13589, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30302, %r30303}; + // begin inline asm + // chi + lop3.b32 %r30294, %r13649, %r13585, %r13657, 0xD2; + lop3.b32 %r30295, %r13653, %r13589, %r13661, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30294, %r30295}; + mul.wide.s32 %rd687, %r30344, 8; + mov.u64 %rd688, keccak_round_constants; + cvta.const.u64 %rd689, %rd688; + add.s64 %rd686, %rd689, %rd687; + // begin inline asm + ld.global.nc.v2.u32 {%r13937,%r13938}, [%rd686]; + // end inline asm + xor.b32 %r30330, %r13737, %r13937; + xor.b32 %r30331, %r13738, %r13938; + add.s32 %r30344, %r30344, 1; + setp.lt.u32 %p26, %r30344, 23; + @%p26 bra $L__BB2_40; + + add.u64 %rd147, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30342, %r30343}; + st.local.v2.u32 [%rd2+72], {%r30340, %r30341}; + st.local.v2.u32 [%rd2+40], {%r30338, %r30339}; + st.local.v2.u32 [%rd2+80], {%r30336, %r30337}; + st.local.v2.u32 [%rd2+48], {%r30334, %r30335}; + st.local.v2.u32 [%rd2+56], {%r30332, %r30333}; + st.local.v2.u32 [%rd2+24], {%r30330, %r30331}; + // begin inline asm + // xor5 + lop3.b32 %r13949, %r30330, %r30328, %r30326, 0x96; + lop3.b32 %r13949, %r13949, %r30324, %r30322, 0x96; + lop3.b32 %r13950, %r30331, %r30329, %r30327, 0x96; + lop3.b32 %r13950, %r13950, %r30325, %r30323, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13961, %r30342, %r30340, %r30320, 0x96; + lop3.b32 %r13961, %r13961, %r30318, %r30316, 0x96; + lop3.b32 %r13962, %r30343, %r30341, %r30321, 0x96; + lop3.b32 %r13962, %r13962, %r30319, %r30317, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13973, %r30338, %r30336, %r30314, 0x96; + lop3.b32 %r13973, %r13973, %r30312, %r30310, 0x96; + lop3.b32 %r13974, %r30339, %r30337, %r30315, 0x96; + lop3.b32 %r13974, %r13974, %r30313, %r30311, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13985, %r30334, %r30308, %r30306, 0x96; + lop3.b32 %r13985, %r13985, %r30304, %r30302, 0x96; + lop3.b32 %r13986, %r30335, %r30309, %r30307, 0x96; + lop3.b32 %r13986, %r13986, %r30305, %r30303, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13997, %r30332, %r30300, %r30298, 0x96; + lop3.b32 %r13997, %r13997, %r30296, %r30294, 0x96; + lop3.b32 %r13998, %r30333, %r30301, %r30299, 0x96; + lop3.b32 %r13998, %r13998, %r30297, %r30295, 0x96; + // end inline asm + mov.u32 %r14201, 1; + // begin inline asm + shf.l.wrap.b32 %r14009, %r13962, %r13961, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14013, %r13961, %r13962, %r14201; + // end inline asm + xor.b32 %r14228, %r14009, %r13997; + xor.b32 %r14229, %r14013, %r13998; + xor.b32 %r14156, %r30330, %r14228; + xor.b32 %r14159, %r30331, %r14229; + xor.b32 %r14119, %r30327, %r14229; + xor.b32 %r14118, %r30326, %r14228; + st.local.v2.u32 [%rd2+104], {%r14118, %r14119}; + // begin inline asm + shf.l.wrap.b32 %r14017, %r13974, %r13973, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14021, %r13973, %r13974, %r14201; + // end inline asm + xor.b32 %r14230, %r14017, %r13949; + xor.b32 %r14231, %r14021, %r13950; + xor.b32 %r14055, %r30340, %r14230; + xor.b32 %r14054, %r30341, %r14231; + xor.b32 %r14094, %r30319, %r14231; + xor.b32 
%r14095, %r30318, %r14230; + st.local.v2.u32 [%rd2+152], {%r14095, %r14094}; + // begin inline asm + shf.l.wrap.b32 %r14025, %r13986, %r13985, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14029, %r13985, %r13986, %r14201; + // end inline asm + xor.b32 %r14232, %r14025, %r13961; + xor.b32 %r14233, %r14029, %r13962; + xor.b32 %r14078, %r30315, %r14233; + xor.b32 %r14079, %r30314, %r14232; + st.local.v2.u32 [%rd2+120], {%r14079, %r14078}; + xor.b32 %r14070, %r30311, %r14233; + xor.b32 %r14071, %r30310, %r14232; + st.local.v2.u32 [%rd2+200], {%r14071, %r14070}; + // begin inline asm + shf.l.wrap.b32 %r14033, %r13998, %r13997, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14037, %r13997, %r13998, %r14201; + // end inline asm + xor.b32 %r14234, %r14033, %r13973; + xor.b32 %r14235, %r14037, %r13974; + xor.b32 %r14102, %r30334, %r14234; + xor.b32 %r14103, %r30335, %r14235; + xor.b32 %r14111, %r30305, %r14235; + xor.b32 %r14110, %r30304, %r14234; + st.local.v2.u32 [%rd2+168], {%r14110, %r14111}; + // begin inline asm + shf.l.wrap.b32 %r14041, %r13950, %r13949, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14045, %r13949, %r13950, %r14201; + // end inline asm + xor.b32 %r14236, %r14041, %r13985; + xor.b32 %r14237, %r14045, %r13986; + xor.b32 %r14062, %r30300, %r14236; + xor.b32 %r14063, %r30301, %r14237; + xor.b32 %r14087, %r30295, %r14237; + xor.b32 %r14086, %r30294, %r14236; + st.local.v2.u32 [%rd2+216], {%r14086, %r14087}; + // begin inline asm + shf.l.wrap.b32 %r14049, %r14055, %r14054, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14053, %r14054, %r14055, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14057, %r14063, %r14062, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14061, %r14062, %r14063, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14069, %r14070, %r14071, %r13568; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14065, %r14071, %r14070, %r13568; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r14065, %r14069}; + // begin inline asm + shf.l.wrap.b32 %r14073, %r14079, %r14078, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14077, %r14078, %r14079, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14081, %r14087, %r14086, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14085, %r14086, %r14087, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14093, %r14094, %r14095, %r13672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14089, %r14095, %r14094, %r13672; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r14089, %r14093}; + // begin inline asm + shf.l.wrap.b32 %r14097, %r14103, %r14102, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14101, %r14102, %r14103, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14105, %r14111, %r14110, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14109, %r14110, %r14111, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14113, %r14119, %r14118, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14117, %r14118, %r14119, %r13728; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14121, %r14156, %r14049, %r14073, 0xD2; + lop3.b32 %r14122, %r14159, %r14053, %r14077, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30477, %r14049, %r14073, 
%r14105, 0xD2; + lop3.b32 %r30478, %r14053, %r14077, %r14109, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30477, %r30478}; + // begin inline asm + // chi + lop3.b32 %r30473, %r14073, %r14105, %r14081, 0xD2; + lop3.b32 %r30474, %r14077, %r14109, %r14085, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30473, %r30474}; + // begin inline asm + // chi + lop3.b32 %r30469, %r14105, %r14081, %r14156, 0xD2; + lop3.b32 %r30470, %r14109, %r14085, %r14159, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30469, %r30470}; + // begin inline asm + // chi + lop3.b32 %r30467, %r14081, %r14156, %r14049, 0xD2; + lop3.b32 %r30468, %r14085, %r14159, %r14053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30467, %r30468}; + // begin inline asm + // chi + lop3.b32 %r30463, %r14097, %r14057, %r14113, 0xD2; + lop3.b32 %r30464, %r14101, %r14061, %r14117, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30463, %r30464}; + // begin inline asm + // chi + lop3.b32 %r30475, %r14057, %r14113, %r14089, 0xD2; + lop3.b32 %r30476, %r14061, %r14117, %r14093, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30475, %r30476}; + // begin inline asm + // chi + lop3.b32 %r30471, %r14113, %r14089, %r14065, 0xD2; + lop3.b32 %r30472, %r14117, %r14093, %r14069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30471, %r30472}; + add.s64 %rd690, %rd689, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r14185,%r14186}, [%rd690]; + // end inline asm + xor.b32 %r30465, %r14121, %r14185; + xor.b32 %r30466, %r14122, %r14186; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + st.local.u64 [%rd147], %rd361; + mov.u64 %rd694, 1179641; + st.local.u64 [%rd147+8], %rd694; + add.s32 %r1891, %r1695, 1; + st.local.u32 [%rd147+16], %r1891; + ld.global.u64 %rd695, [%rd127]; + ld.global.u64 %rd696, [%rd127+8]; + ld.global.u64 %rd697, [%rd127+16]; + ld.global.u64 %rd698, [%rd127+24]; + ld.global.u64 %rd699, [%rd127+32]; + ld.global.u64 %rd700, [%rd127+40]; + ld.global.u64 %rd701, [%rd127+48]; + ld.global.u64 %rd702, [%rd127+56]; + st.local.u64 [%rd147+32], %rd696; + st.local.u64 [%rd147+40], %rd697; + st.local.u64 [%rd147+48], %rd698; + st.local.u64 [%rd147+56], %rd699; + st.local.u64 [%rd147+64], %rd700; + st.local.u64 [%rd147+72], %rd701; + st.local.u64 [%rd147+80], %rd702; + cvt.u32.u64 %r14238, %rd695; + xor.b32 %r14239, %r1891, %r14238; + st.local.u64 [%rd147+24], %rd695; + st.local.u32 [%rd147+24], %r14239; + mov.u32 %r30345, 0; + st.local.v2.u32 [%rd147+96], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+104], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+112], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+120], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+128], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+136], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+144], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+152], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+160], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+168], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+176], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+184], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+192], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+200], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+208], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+216], {%r30345, %r30345}; + mov.u32 %r30360, -2147483648; + st.local.v2.u32 [%rd147+88], {%r14201, %r30360}; + ld.local.v2.u32 {%r30381, %r30382}, [%rd147+24]; + mov.b64 {%r30379, %r30380}, %rd700; + shr.u64 %rd703, %rd696, 32; + cvt.u32.u64 %r30393, %rd696; + 
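+ // (the cvt.u32.u64 pairs here unpack each 64-bit state lane into lo/hi
+ // 32-bit registers for the round loop)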
cvt.u32.u64 %r30394, %rd703; + shr.u64 %rd704, %rd701, 32; + cvt.u32.u64 %r30391, %rd701; + cvt.u32.u64 %r30392, %rd704; + shr.u64 %rd705, %rd697, 32; + cvt.u32.u64 %r30389, %rd697; + cvt.u32.u64 %r30390, %rd705; + shr.u64 %rd706, %rd702, 32; + cvt.u32.u64 %r30387, %rd702; + cvt.u32.u64 %r30388, %rd706; + shr.u64 %rd707, %rd698, 32; + cvt.u32.u64 %r30385, %rd698; + cvt.u32.u64 %r30386, %rd707; + shr.u64 %rd708, %rd699, 32; + cvt.u32.u64 %r30383, %rd699; + cvt.u32.u64 %r30384, %rd708; + mov.u32 %r30346, %r30345; + mov.u32 %r30347, %r30345; + mov.u32 %r30348, %r30345; + mov.u32 %r30349, %r30345; + mov.u32 %r30350, %r30345; + mov.u32 %r30351, %r30345; + mov.u32 %r30352, %r30345; + mov.u32 %r30353, %r30345; + mov.u32 %r30354, %r30345; + mov.u32 %r30355, %r30345; + mov.u32 %r30356, %r30345; + mov.u32 %r30357, %r30345; + mov.u32 %r30358, %r30345; + mov.u32 %r30359, %r14201; + mov.u32 %r30361, %r30345; + mov.u32 %r30362, %r30345; + mov.u32 %r30363, %r30345; + mov.u32 %r30364, %r30345; + mov.u32 %r30365, %r30345; + mov.u32 %r30366, %r30345; + mov.u32 %r30367, %r30345; + mov.u32 %r30368, %r30345; + mov.u32 %r30369, %r30345; + mov.u32 %r30370, %r30345; + mov.u32 %r30371, %r30345; + mov.u32 %r30372, %r30345; + mov.u32 %r30373, %r30345; + mov.u32 %r30374, %r30345; + mov.u32 %r30375, %r30345; + mov.u32 %r30376, %r30345; + mov.u32 %r30377, %r30345; + mov.u32 %r30378, %r30345; + mov.u32 %r30395, %r30345; + +$L__BB2_42: + // begin inline asm + // xor5 + lop3.b32 %r14242, %r30381, %r30379, %r30377, 0x96; + lop3.b32 %r14242, %r14242, %r30375, %r30373, 0x96; + lop3.b32 %r14243, %r30382, %r30380, %r30378, 0x96; + lop3.b32 %r14243, %r14243, %r30376, %r30374, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14254, %r30393, %r30391, %r30371, 0x96; + lop3.b32 %r14254, %r14254, %r30369, %r30367, 0x96; + lop3.b32 %r14255, %r30394, %r30392, %r30372, 0x96; + lop3.b32 %r14255, %r14255, %r30370, %r30368, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14266, %r30389, %r30387, %r30365, 0x96; + lop3.b32 %r14266, %r14266, %r30363, %r30361, 0x96; + lop3.b32 %r14267, %r30390, %r30388, %r30366, 0x96; + lop3.b32 %r14267, %r14267, %r30364, %r30362, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14278, %r30385, %r30359, %r30357, 0x96; + lop3.b32 %r14278, %r14278, %r30355, %r30353, 0x96; + lop3.b32 %r14279, %r30386, %r30360, %r30358, 0x96; + lop3.b32 %r14279, %r14279, %r30356, %r30354, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14290, %r30383, %r30351, %r30349, 0x96; + lop3.b32 %r14290, %r14290, %r30347, %r30345, 0x96; + lop3.b32 %r14291, %r30384, %r30352, %r30350, 0x96; + lop3.b32 %r14291, %r14291, %r30348, %r30346, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14302, %r14255, %r14254, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14306, %r14254, %r14255, %r14201; + // end inline asm + xor.b32 %r14736, %r14302, %r14290; + xor.b32 %r14737, %r14306, %r14291; + xor.b32 %r14569, %r30381, %r14736; + xor.b32 %r14572, %r30382, %r14737; + xor.b32 %r14476, %r30379, %r14736; + xor.b32 %r14475, %r30380, %r14737; + xor.b32 %r14523, %r30377, %r14736; + xor.b32 %r14524, %r30378, %r14737; + xor.b32 %r14428, %r30375, %r14736; + xor.b32 %r14427, %r30376, %r14737; + xor.b32 %r14379, %r30373, %r14736; + xor.b32 %r14380, %r30374, %r14737; + // begin inline asm + shf.l.wrap.b32 %r14310, %r14267, %r14266, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14314, %r14266, %r14267, %r14201; + 
// end inline asm + xor.b32 %r14738, %r14310, %r14242; + xor.b32 %r14739, %r14314, %r14243; + xor.b32 %r14531, %r30393, %r14738; + xor.b32 %r14532, %r30394, %r14739; + xor.b32 %r14348, %r30391, %r14738; + xor.b32 %r14347, %r30392, %r14739; + xor.b32 %r14507, %r30371, %r14738; + xor.b32 %r14508, %r30372, %r14739; + xor.b32 %r14468, %r30369, %r14738; + xor.b32 %r14467, %r30370, %r14739; + xor.b32 %r14451, %r30367, %r14738; + xor.b32 %r14452, %r30368, %r14739; + // begin inline asm + shf.l.wrap.b32 %r14318, %r14279, %r14278, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14322, %r14278, %r14279, %r14201; + // end inline asm + xor.b32 %r14740, %r14318, %r14254; + xor.b32 %r14741, %r14322, %r14255; + xor.b32 %r14388, %r30389, %r14740; + xor.b32 %r14387, %r30390, %r14741; + xor.b32 %r14515, %r30387, %r14740; + xor.b32 %r14516, %r30388, %r14741; + xor.b32 %r14396, %r30365, %r14740; + xor.b32 %r14395, %r30366, %r14741; + xor.b32 %r14499, %r30363, %r14740; + xor.b32 %r14500, %r30364, %r14741; + xor.b32 %r14364, %r30361, %r14740; + xor.b32 %r14363, %r30362, %r14741; + // begin inline asm + shf.l.wrap.b32 %r14326, %r14291, %r14290, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14330, %r14290, %r14291, %r14201; + // end inline asm + xor.b32 %r14742, %r14326, %r14266; + xor.b32 %r14743, %r14330, %r14267; + xor.b32 %r14483, %r30385, %r14742; + xor.b32 %r14484, %r30386, %r14743; + xor.b32 %r14460, %r30359, %r14742; + xor.b32 %r14459, %r30360, %r14743; + xor.b32 %r14403, %r30357, %r14742; + xor.b32 %r14404, %r30358, %r14743; + xor.b32 %r14491, %r30355, %r14742; + xor.b32 %r14492, %r30356, %r14743; + xor.b32 %r14420, %r30353, %r14742; + xor.b32 %r14419, %r30354, %r14743; + // begin inline asm + shf.l.wrap.b32 %r14334, %r14243, %r14242, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14338, %r14242, %r14243, %r14201; + // end inline asm + xor.b32 %r14744, %r14334, %r14278; + xor.b32 %r14745, %r14338, %r14279; + xor.b32 %r14435, %r30383, %r14744; + xor.b32 %r14436, %r30384, %r14745; + xor.b32 %r14355, %r30351, %r14744; + xor.b32 %r14356, %r30352, %r14745; + xor.b32 %r14372, %r30349, %r14744; + xor.b32 %r14371, %r30350, %r14745; + xor.b32 %r14411, %r30347, %r14744; + xor.b32 %r14412, %r30348, %r14745; + xor.b32 %r14443, %r30345, %r14744; + xor.b32 %r14444, %r30346, %r14745; + mov.u32 %r14349, 44; + // begin inline asm + shf.l.wrap.b32 %r14342, %r14348, %r14347, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14346, %r14347, %r14348, %r14349; + // end inline asm + mov.u32 %r14357, 20; + // begin inline asm + shf.l.wrap.b32 %r14350, %r14356, %r14355, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14354, %r14355, %r14356, %r14357; + // end inline asm + mov.u32 %r14365, 61; + // begin inline asm + shf.l.wrap.b32 %r14358, %r14364, %r14363, %r14365; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14362, %r14363, %r14364, %r14365; + // end inline asm + mov.u32 %r14373, 39; + // begin inline asm + shf.l.wrap.b32 %r14366, %r14372, %r14371, %r14373; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14370, %r14371, %r14372, %r14373; + // end inline asm + mov.u32 %r14381, 18; + // begin inline asm + shf.l.wrap.b32 %r14374, %r14380, %r14379, %r14381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14378, %r14379, %r14380, %r14381; + // end inline asm + mov.u32 %r14389, 62; + // begin inline asm + shf.l.wrap.b32 %r14382, %r14388, %r14387, %r14389; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r14386, %r14387, %r14388, %r14389; + // end inline asm + mov.u32 %r14397, 43; + // begin inline asm + shf.l.wrap.b32 %r14390, %r14396, %r14395, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14394, %r14395, %r14396, %r14397; + // end inline asm + mov.u32 %r14405, 25; + // begin inline asm + shf.l.wrap.b32 %r14398, %r14404, %r14403, %r14405; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14402, %r14403, %r14404, %r14405; + // end inline asm + mov.u32 %r14413, 8; + // begin inline asm + shf.l.wrap.b32 %r14406, %r14412, %r14411, %r14413; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14410, %r14411, %r14412, %r14413; + // end inline asm + mov.u32 %r14421, 56; + // begin inline asm + shf.l.wrap.b32 %r14414, %r14420, %r14419, %r14421; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14418, %r14419, %r14420, %r14421; + // end inline asm + mov.u32 %r14429, 41; + // begin inline asm + shf.l.wrap.b32 %r14422, %r14428, %r14427, %r14429; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14426, %r14427, %r14428, %r14429; + // end inline asm + mov.u32 %r14437, 27; + // begin inline asm + shf.l.wrap.b32 %r14430, %r14436, %r14435, %r14437; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14434, %r14435, %r14436, %r14437; + // end inline asm + mov.u32 %r14445, 14; + // begin inline asm + shf.l.wrap.b32 %r14438, %r14444, %r14443, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14442, %r14443, %r14444, %r14445; + // end inline asm + mov.u32 %r14453, 2; + // begin inline asm + shf.l.wrap.b32 %r14446, %r14452, %r14451, %r14453; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14450, %r14451, %r14452, %r14453; + // end inline asm + mov.u32 %r14461, 55; + // begin inline asm + shf.l.wrap.b32 %r14454, %r14460, %r14459, %r14461; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14458, %r14459, %r14460, %r14461; + // end inline asm + mov.u32 %r14469, 45; + // begin inline asm + shf.l.wrap.b32 %r14462, %r14468, %r14467, %r14469; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14466, %r14467, %r14468, %r14469; + // end inline asm + mov.u32 %r14477, 36; + // begin inline asm + shf.l.wrap.b32 %r14470, %r14476, %r14475, %r14477; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14474, %r14475, %r14476, %r14477; + // end inline asm + mov.u32 %r14485, 28; + // begin inline asm + shf.l.wrap.b32 %r14478, %r14484, %r14483, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14482, %r14483, %r14484, %r14485; + // end inline asm + mov.u32 %r14493, 21; + // begin inline asm + shf.l.wrap.b32 %r14486, %r14492, %r14491, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14490, %r14491, %r14492, %r14493; + // end inline asm + mov.u32 %r14501, 15; + // begin inline asm + shf.l.wrap.b32 %r14494, %r14500, %r14499, %r14501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14498, %r14499, %r14500, %r14501; + // end inline asm + mov.u32 %r14509, 10; + // begin inline asm + shf.l.wrap.b32 %r14502, %r14508, %r14507, %r14509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14506, %r14507, %r14508, %r14509; + // end inline asm + mov.u32 %r14517, 6; + // begin inline asm + shf.l.wrap.b32 %r14510, %r14516, %r14515, %r14517; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14514, %r14515, %r14516, %r14517; + // end inline asm + mov.u32 %r14525, 3; + // begin inline asm + 
shf.l.wrap.b32 %r14518, %r14524, %r14523, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14522, %r14523, %r14524, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14526, %r14532, %r14531, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14530, %r14531, %r14532, %r14201; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14534, %r14569, %r14342, %r14390, 0xD2; + lop3.b32 %r14535, %r14572, %r14346, %r14394, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30393, %r14342, %r14390, %r14486, 0xD2; + lop3.b32 %r30394, %r14346, %r14394, %r14490, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30389, %r14390, %r14486, %r14438, 0xD2; + lop3.b32 %r30390, %r14394, %r14490, %r14442, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30385, %r14486, %r14438, %r14569, 0xD2; + lop3.b32 %r30386, %r14490, %r14442, %r14572, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30383, %r14438, %r14569, %r14342, 0xD2; + lop3.b32 %r30384, %r14442, %r14572, %r14346, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30379, %r14478, %r14350, %r14518, 0xD2; + lop3.b32 %r30380, %r14482, %r14354, %r14522, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30391, %r14350, %r14518, %r14462, 0xD2; + lop3.b32 %r30392, %r14354, %r14522, %r14466, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30387, %r14518, %r14462, %r14358, 0xD2; + lop3.b32 %r30388, %r14522, %r14466, %r14362, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30359, %r14462, %r14358, %r14478, 0xD2; + lop3.b32 %r30360, %r14466, %r14362, %r14482, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r30359, %r30360}; + // begin inline asm + // chi + lop3.b32 %r30351, %r14358, %r14478, %r14350, 0xD2; + lop3.b32 %r30352, %r14362, %r14482, %r14354, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r30351, %r30352}; + // begin inline asm + // chi + lop3.b32 %r30377, %r14526, %r14510, %r14398, 0xD2; + lop3.b32 %r30378, %r14530, %r14514, %r14402, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+104], {%r30377, %r30378}; + // begin inline asm + // chi + lop3.b32 %r30371, %r14510, %r14398, %r14406, 0xD2; + lop3.b32 %r30372, %r14514, %r14402, %r14410, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+112], {%r30371, %r30372}; + // begin inline asm + // chi + lop3.b32 %r30365, %r14398, %r14406, %r14374, 0xD2; + lop3.b32 %r30366, %r14402, %r14410, %r14378, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+120], {%r30365, %r30366}; + // begin inline asm + // chi + lop3.b32 %r30357, %r14406, %r14374, %r14526, 0xD2; + lop3.b32 %r30358, %r14410, %r14378, %r14530, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+128], {%r30357, %r30358}; + // begin inline asm + // chi + lop3.b32 %r30349, %r14374, %r14526, %r14510, 0xD2; + lop3.b32 %r30350, %r14378, %r14530, %r14514, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+136], {%r30349, %r30350}; + // begin inline asm + // chi + lop3.b32 %r30375, %r14430, %r14470, %r14502, 0xD2; + lop3.b32 %r30376, %r14434, %r14474, %r14506, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+144], {%r30375, %r30376}; + // begin inline asm + // chi + lop3.b32 %r30369, %r14470, %r14502, %r14494, 0xD2; + lop3.b32 %r30370, %r14474, %r14506, %r14498, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+152], {%r30369, %r30370}; + // begin inline asm + // chi + lop3.b32 %r30363, %r14502, 
%r14494, %r14414, 0xD2; + lop3.b32 %r30364, %r14506, %r14498, %r14418, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+160], {%r30363, %r30364}; + // begin inline asm + // chi + lop3.b32 %r30355, %r14494, %r14414, %r14430, 0xD2; + lop3.b32 %r30356, %r14498, %r14418, %r14434, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+168], {%r30355, %r30356}; + // begin inline asm + // chi + lop3.b32 %r30347, %r14414, %r14430, %r14470, 0xD2; + lop3.b32 %r30348, %r14418, %r14434, %r14474, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+176], {%r30347, %r30348}; + // begin inline asm + // chi + lop3.b32 %r30373, %r14382, %r14454, %r14366, 0xD2; + lop3.b32 %r30374, %r14386, %r14458, %r14370, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+184], {%r30373, %r30374}; + // begin inline asm + // chi + lop3.b32 %r30367, %r14454, %r14366, %r14422, 0xD2; + lop3.b32 %r30368, %r14458, %r14370, %r14426, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+192], {%r30367, %r30368}; + // begin inline asm + // chi + lop3.b32 %r30361, %r14366, %r14422, %r14446, 0xD2; + lop3.b32 %r30362, %r14370, %r14426, %r14450, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+200], {%r30361, %r30362}; + // begin inline asm + // chi + lop3.b32 %r30353, %r14422, %r14446, %r14382, 0xD2; + lop3.b32 %r30354, %r14426, %r14450, %r14386, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+208], {%r30353, %r30354}; + // begin inline asm + // chi + lop3.b32 %r30345, %r14446, %r14382, %r14454, 0xD2; + lop3.b32 %r30346, %r14450, %r14386, %r14458, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+216], {%r30345, %r30346}; + mul.wide.s32 %rd710, %r30395, 8; + add.s64 %rd709, %rd689, %rd710; + // begin inline asm + ld.global.nc.v2.u32 {%r14734,%r14735}, [%rd709]; + // end inline asm + xor.b32 %r30381, %r14534, %r14734; + xor.b32 %r30382, %r14535, %r14735; + add.s32 %r30395, %r30395, 1; + setp.lt.u32 %p27, %r30395, 23; + @%p27 bra $L__BB2_42; + + mov.u32 %r30428, 0; + mov.u32 %r14845, 1; + st.local.v2.u32 [%rd147+32], {%r30393, %r30394}; + st.local.v2.u32 [%rd147+72], {%r30391, %r30392}; + st.local.v2.u32 [%rd147+40], {%r30389, %r30390}; + st.local.v2.u32 [%rd147+80], {%r30387, %r30388}; + st.local.v2.u32 [%rd147+48], {%r30385, %r30386}; + st.local.v2.u32 [%rd147+56], {%r30383, %r30384}; + st.local.v2.u32 [%rd147+24], {%r30381, %r30382}; + // begin inline asm + // xor5 + lop3.b32 %r14746, %r30381, %r30379, %r30377, 0x96; + lop3.b32 %r14746, %r14746, %r30375, %r30373, 0x96; + lop3.b32 %r14747, %r30382, %r30380, %r30378, 0x96; + lop3.b32 %r14747, %r14747, %r30376, %r30374, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14758, %r30393, %r30391, %r30371, 0x96; + lop3.b32 %r14758, %r14758, %r30369, %r30367, 0x96; + lop3.b32 %r14759, %r30394, %r30392, %r30372, 0x96; + lop3.b32 %r14759, %r14759, %r30370, %r30368, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14770, %r30389, %r30387, %r30365, 0x96; + lop3.b32 %r14770, %r14770, %r30363, %r30361, 0x96; + lop3.b32 %r14771, %r30390, %r30388, %r30366, 0x96; + lop3.b32 %r14771, %r14771, %r30364, %r30362, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14782, %r30385, %r30359, %r30357, 0x96; + lop3.b32 %r14782, %r14782, %r30355, %r30353, 0x96; + lop3.b32 %r14783, %r30386, %r30360, %r30358, 0x96; + lop3.b32 %r14783, %r14783, %r30356, %r30354, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14794, %r30383, %r30351, %r30349, 0x96; + lop3.b32 %r14794, %r14794, %r30347, %r30345, 0x96; + lop3.b32 %r14795, 
%r30384, %r30352, %r30350, 0x96; + lop3.b32 %r14795, %r14795, %r30348, %r30346, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14806, %r14759, %r14758, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14810, %r14758, %r14759, %r14845; + // end inline asm + xor.b32 %r14985, %r14806, %r14794; + xor.b32 %r14986, %r14810, %r14795; + xor.b32 %r14953, %r30381, %r14985; + xor.b32 %r14956, %r30382, %r14986; + xor.b32 %r14916, %r30378, %r14986; + xor.b32 %r14915, %r30377, %r14985; + st.local.v2.u32 [%rd147+104], {%r14915, %r14916}; + // begin inline asm + shf.l.wrap.b32 %r14814, %r14771, %r14770, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14818, %r14770, %r14771, %r14845; + // end inline asm + xor.b32 %r14987, %r14814, %r14746; + xor.b32 %r14988, %r14818, %r14747; + xor.b32 %r14852, %r30391, %r14987; + xor.b32 %r14851, %r30392, %r14988; + xor.b32 %r14891, %r30370, %r14988; + xor.b32 %r14892, %r30369, %r14987; + st.local.v2.u32 [%rd147+152], {%r14892, %r14891}; + // begin inline asm + shf.l.wrap.b32 %r14822, %r14783, %r14782, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14826, %r14782, %r14783, %r14845; + // end inline asm + xor.b32 %r14989, %r14822, %r14758; + xor.b32 %r14990, %r14826, %r14759; + xor.b32 %r14875, %r30366, %r14990; + xor.b32 %r14876, %r30365, %r14989; + st.local.v2.u32 [%rd147+120], {%r14876, %r14875}; + xor.b32 %r14867, %r30362, %r14990; + xor.b32 %r14868, %r30361, %r14989; + st.local.v2.u32 [%rd147+200], {%r14868, %r14867}; + // begin inline asm + shf.l.wrap.b32 %r14830, %r14795, %r14794, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14834, %r14794, %r14795, %r14845; + // end inline asm + xor.b32 %r14991, %r14830, %r14770; + xor.b32 %r14992, %r14834, %r14771; + xor.b32 %r14899, %r30385, %r14991; + xor.b32 %r14900, %r30386, %r14992; + xor.b32 %r14908, %r30356, %r14992; + xor.b32 %r14907, %r30355, %r14991; + st.local.v2.u32 [%rd147+168], {%r14907, %r14908}; + // begin inline asm + shf.l.wrap.b32 %r14838, %r14747, %r14746, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14842, %r14746, %r14747, %r14845; + // end inline asm + xor.b32 %r14993, %r14838, %r14782; + xor.b32 %r14994, %r14842, %r14783; + xor.b32 %r14859, %r30351, %r14993; + xor.b32 %r14860, %r30352, %r14994; + xor.b32 %r14884, %r30346, %r14994; + xor.b32 %r14883, %r30345, %r14993; + st.local.v2.u32 [%rd147+216], {%r14883, %r14884}; + // begin inline asm + shf.l.wrap.b32 %r14846, %r14852, %r14851, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14850, %r14851, %r14852, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14854, %r14860, %r14859, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14858, %r14859, %r14860, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14866, %r14867, %r14868, %r14365; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14862, %r14868, %r14867, %r14365; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r14862, %r14866}; + // begin inline asm + shf.l.wrap.b32 %r14870, %r14876, %r14875, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14874, %r14875, %r14876, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14878, %r14884, %r14883, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14882, %r14883, %r14884, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14890, %r14891, %r14892, %r14469; 
+ // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14886, %r14892, %r14891, %r14469; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r14886, %r14890}; + // begin inline asm + shf.l.wrap.b32 %r14894, %r14900, %r14899, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14898, %r14899, %r14900, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14902, %r14908, %r14907, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14906, %r14907, %r14908, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14910, %r14916, %r14915, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14914, %r14915, %r14916, %r14525; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14918, %r14953, %r14846, %r14870, 0xD2; + lop3.b32 %r14919, %r14956, %r14850, %r14874, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30528, %r14846, %r14870, %r14902, 0xD2; + lop3.b32 %r30529, %r14850, %r14874, %r14906, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+32], {%r30528, %r30529}; + // begin inline asm + // chi + lop3.b32 %r30524, %r14870, %r14902, %r14878, 0xD2; + lop3.b32 %r30525, %r14874, %r14906, %r14882, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+40], {%r30524, %r30525}; + // begin inline asm + // chi + lop3.b32 %r30520, %r14902, %r14878, %r14953, 0xD2; + lop3.b32 %r30521, %r14906, %r14882, %r14956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+48], {%r30520, %r30521}; + // begin inline asm + // chi + lop3.b32 %r30518, %r14878, %r14953, %r14846, 0xD2; + lop3.b32 %r30519, %r14882, %r14956, %r14850, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+56], {%r30518, %r30519}; + // begin inline asm + // chi + lop3.b32 %r30514, %r14894, %r14854, %r14910, 0xD2; + lop3.b32 %r30515, %r14898, %r14858, %r14914, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+64], {%r30514, %r30515}; + // begin inline asm + // chi + lop3.b32 %r30526, %r14854, %r14910, %r14886, 0xD2; + lop3.b32 %r30527, %r14858, %r14914, %r14890, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+72], {%r30526, %r30527}; + // begin inline asm + // chi + lop3.b32 %r30522, %r14910, %r14886, %r14862, 0xD2; + lop3.b32 %r30523, %r14914, %r14890, %r14866, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+80], {%r30522, %r30523}; + // begin inline asm + ld.global.nc.v2.u32 {%r14982,%r14983}, [%rd690]; + // end inline asm + xor.b32 %r30516, %r14918, %r14982; + xor.b32 %r30517, %r14919, %r14983; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + add.s64 %rd149, %rd2, 24; + add.s64 %rd150, %rd147, 24; + +$L__BB2_44: + cvta.to.global.u64 %rd1268, %rd361; + shl.b32 %r14995, %r30428, 2; + cvt.u64.u32 %rd720, %r14995; + and.b64 %rd721, %rd720, 60; + add.s64 %rd722, %rd149, %rd721; + xor.b32 %r14996, %r1695, %r30428; + mul.lo.s32 %r14997, %r14996, 16777619; + ld.local.u32 %r14998, [%rd722]; + xor.b32 %r14999, %r14997, %r14998; + mul.wide.u32 %rd723, %r14999, -954391867; + shr.u64 %rd724, %rd723, 32; + cvt.u32.u64 %r15000, %rd724; + sub.s32 %r15001, %r14999, %r15000; + shr.u32 %r15002, %r15001, 1; + add.s32 %r15003, %r15002, %r15000; + shr.u32 %r15004, %r15003, 20; + mul.lo.s32 %r15005, %r15004, 1179641; + sub.s32 %r15006, %r14999, %r15005; + mul.wide.u32 %rd725, %r15006, 64; + add.s64 %rd726, %rd1268, %rd725; + mul.lo.s32 %r15007, %r30465, 16777619; + ld.global.u32 %r15008, [%rd726]; + xor.b32 %r30465, %r15007, %r15008; + mul.lo.s32 %r15009, %r30466, 16777619; + ld.global.u32 %r15010, [%rd726+4]; + xor.b32 %r30466, 
%r15009, %r15010; + mul.lo.s32 %r15011, %r30477, 16777619; + ld.global.u32 %r15012, [%rd726+8]; + mul.lo.s32 %r15013, %r30478, 16777619; + ld.global.u32 %r15014, [%rd726+12]; + xor.b32 %r15015, %r15013, %r15014; + xor.b32 %r30477, %r15011, %r15012; + mov.b64 %rd727, {%r30477, %r15015}; + mul.lo.s32 %r15016, %r30473, 16777619; + ld.global.u32 %r15017, [%rd726+16]; + mul.lo.s32 %r15018, %r30474, 16777619; + ld.global.u32 %r15019, [%rd726+20]; + xor.b32 %r15020, %r15018, %r15019; + xor.b32 %r30473, %r15016, %r15017; + mov.b64 %rd728, {%r30473, %r15020}; + mul.lo.s32 %r15021, %r30469, 16777619; + ld.global.u32 %r15022, [%rd726+24]; + mul.lo.s32 %r15023, %r30470, 16777619; + ld.global.u32 %r15024, [%rd726+28]; + xor.b32 %r15025, %r15023, %r15024; + xor.b32 %r30469, %r15021, %r15022; + mov.b64 %rd729, {%r30469, %r15025}; + mul.lo.s32 %r15026, %r30467, 16777619; + ld.global.u32 %r15027, [%rd726+32]; + mul.lo.s32 %r15028, %r30468, 16777619; + ld.global.u32 %r15029, [%rd726+36]; + xor.b32 %r15030, %r15028, %r15029; + xor.b32 %r30467, %r15026, %r15027; + mov.b64 %rd730, {%r30467, %r15030}; + mul.lo.s32 %r15031, %r30463, 16777619; + ld.global.u32 %r15032, [%rd726+40]; + xor.b32 %r30463, %r15031, %r15032; + mul.lo.s32 %r15033, %r30464, 16777619; + ld.global.u32 %r15034, [%rd726+44]; + xor.b32 %r30464, %r15033, %r15034; + mul.lo.s32 %r15035, %r30475, 16777619; + ld.global.u32 %r15036, [%rd726+48]; + mul.lo.s32 %r15037, %r30476, 16777619; + ld.global.u32 %r15038, [%rd726+52]; + xor.b32 %r15039, %r15037, %r15038; + xor.b32 %r30475, %r15035, %r15036; + mov.b64 %rd731, {%r30475, %r15039}; + mul.lo.s32 %r15040, %r30471, 16777619; + ld.global.u32 %r15041, [%rd726+56]; + mul.lo.s32 %r15042, %r30472, 16777619; + ld.global.u32 %r15043, [%rd726+60]; + xor.b32 %r15044, %r15042, %r15043; + xor.b32 %r30471, %r15040, %r15041; + mov.b64 %rd732, {%r30471, %r15044}; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + st.local.v2.u32 [%rd2+32], {%r30477, %r15015}; + st.local.v2.u32 [%rd2+40], {%r30473, %r15020}; + st.local.v2.u32 [%rd2+48], {%r30469, %r15025}; + st.local.v2.u32 [%rd2+56], {%r30467, %r15030}; + st.local.v2.u32 [%rd2+64], {%r30463, %r30464}; + st.local.v2.u32 [%rd2+72], {%r30475, %r15039}; + st.local.v2.u32 [%rd2+80], {%r30471, %r15044}; + add.s64 %rd733, %rd150, %rd721; + xor.b32 %r15045, %r1891, %r30428; + mul.lo.s32 %r15046, %r15045, 16777619; + ld.local.u32 %r15047, [%rd733]; + xor.b32 %r15048, %r15046, %r15047; + mul.wide.u32 %rd734, %r15048, -954391867; + shr.u64 %rd735, %rd734, 32; + cvt.u32.u64 %r15049, %rd735; + sub.s32 %r15050, %r15048, %r15049; + shr.u32 %r15051, %r15050, 1; + add.s32 %r15052, %r15051, %r15049; + shr.u32 %r15053, %r15052, 20; + mul.lo.s32 %r15054, %r15053, 1179641; + sub.s32 %r15055, %r15048, %r15054; + mul.wide.u32 %rd736, %r15055, 64; + add.s64 %rd737, %rd1268, %rd736; + mul.lo.s32 %r15056, %r30516, 16777619; + ld.global.u32 %r15057, [%rd737]; + xor.b32 %r30516, %r15056, %r15057; + mul.lo.s32 %r15058, %r30517, 16777619; + ld.global.u32 %r15059, [%rd737+4]; + xor.b32 %r30517, %r15058, %r15059; + mul.lo.s32 %r15060, %r30528, 16777619; + ld.global.u32 %r15061, [%rd737+8]; + mul.lo.s32 %r15062, %r30529, 16777619; + ld.global.u32 %r15063, [%rd737+12]; + xor.b32 %r15064, %r15062, %r15063; + xor.b32 %r30528, %r15060, %r15061; + mov.b64 %rd738, {%r30528, %r15064}; + mul.lo.s32 %r15065, %r30524, 16777619; + ld.global.u32 %r15066, [%rd737+16]; + mul.lo.s32 %r15067, %r30525, 16777619; + ld.global.u32 %r15068, [%rd737+20]; + xor.b32 %r15069, %r15067, %r15068; + xor.b32 %r30524, %r15065, 
%r15066; + mov.b64 %rd739, {%r30524, %r15069}; + mul.lo.s32 %r15070, %r30520, 16777619; + ld.global.u32 %r15071, [%rd737+24]; + mul.lo.s32 %r15072, %r30521, 16777619; + ld.global.u32 %r15073, [%rd737+28]; + xor.b32 %r15074, %r15072, %r15073; + xor.b32 %r30520, %r15070, %r15071; + mov.b64 %rd740, {%r30520, %r15074}; + mul.lo.s32 %r15075, %r30518, 16777619; + ld.global.u32 %r15076, [%rd737+32]; + mul.lo.s32 %r15077, %r30519, 16777619; + ld.global.u32 %r15078, [%rd737+36]; + xor.b32 %r15079, %r15077, %r15078; + xor.b32 %r30518, %r15075, %r15076; + mov.b64 %rd741, {%r30518, %r15079}; + mul.lo.s32 %r15080, %r30514, 16777619; + ld.global.u32 %r15081, [%rd737+40]; + xor.b32 %r30514, %r15080, %r15081; + mul.lo.s32 %r15082, %r30515, 16777619; + ld.global.u32 %r15083, [%rd737+44]; + xor.b32 %r30515, %r15082, %r15083; + mul.lo.s32 %r15084, %r30526, 16777619; + ld.global.u32 %r15085, [%rd737+48]; + mul.lo.s32 %r15086, %r30527, 16777619; + ld.global.u32 %r15087, [%rd737+52]; + xor.b32 %r15088, %r15086, %r15087; + xor.b32 %r30526, %r15084, %r15085; + mov.b64 %rd742, {%r30526, %r15088}; + mul.lo.s32 %r15089, %r30522, 16777619; + ld.global.u32 %r15090, [%rd737+56]; + mul.lo.s32 %r15091, %r30523, 16777619; + ld.global.u32 %r15092, [%rd737+60]; + xor.b32 %r15093, %r15091, %r15092; + xor.b32 %r30522, %r15089, %r15090; + mov.b64 %rd743, {%r30522, %r15093}; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + st.local.v2.u32 [%rd147+32], {%r30528, %r15064}; + st.local.v2.u32 [%rd147+40], {%r30524, %r15069}; + st.local.v2.u32 [%rd147+48], {%r30520, %r15074}; + st.local.v2.u32 [%rd147+56], {%r30518, %r15079}; + st.local.v2.u32 [%rd147+64], {%r30514, %r30515}; + st.local.v2.u32 [%rd147+72], {%r30526, %r15088}; + st.local.v2.u32 [%rd147+80], {%r30522, %r15093}; + add.s32 %r30428, %r30428, 1; + setp.lt.u32 %p28, %r30428, 512; + shr.u64 %rd744, %rd727, 32; + cvt.u32.u64 %r30478, %rd744; + shr.u64 %rd745, %rd728, 32; + cvt.u32.u64 %r30474, %rd745; + shr.u64 %rd746, %rd729, 32; + cvt.u32.u64 %r30470, %rd746; + shr.u64 %rd747, %rd730, 32; + cvt.u32.u64 %r30468, %rd747; + shr.u64 %rd748, %rd731, 32; + cvt.u32.u64 %r30476, %rd748; + shr.u64 %rd749, %rd732, 32; + cvt.u32.u64 %r30472, %rd749; + shr.u64 %rd750, %rd738, 32; + cvt.u32.u64 %r30529, %rd750; + shr.u64 %rd751, %rd739, 32; + cvt.u32.u64 %r30525, %rd751; + shr.u64 %rd752, %rd740, 32; + cvt.u32.u64 %r30521, %rd752; + shr.u64 %rd753, %rd741, 32; + cvt.u32.u64 %r30519, %rd753; + shr.u64 %rd754, %rd742, 32; + cvt.u32.u64 %r30527, %rd754; + shr.u64 %rd755, %rd743, 32; + cvt.u32.u64 %r30523, %rd755; + @%p28 bra $L__BB2_44; + + mov.u32 %r30429, 0; + st.local.v2.u32 [%rd2+96], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+104], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+112], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+120], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+128], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+136], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+144], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+152], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+160], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+168], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+176], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+184], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+192], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+200], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+208], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+216], {%r30429, %r30429}; + mov.u32 %r30444, -2147483648; + mov.u32 %r15108, 1; + st.local.v2.u32 [%rd2+88], {%r15108, %r30444}; + mov.u32 %r30430, %r30429; + mov.u32 
%r30431, %r30429; + mov.u32 %r30432, %r30429; + mov.u32 %r30433, %r30429; + mov.u32 %r30434, %r30429; + mov.u32 %r30435, %r30429; + mov.u32 %r30436, %r30429; + mov.u32 %r30437, %r30429; + mov.u32 %r30438, %r30429; + mov.u32 %r30439, %r30429; + mov.u32 %r30440, %r30429; + mov.u32 %r30441, %r30429; + mov.u32 %r30442, %r30429; + mov.u32 %r30443, %r15108; + mov.u32 %r30445, %r30429; + mov.u32 %r30446, %r30429; + mov.u32 %r30447, %r30429; + mov.u32 %r30448, %r30429; + mov.u32 %r30449, %r30429; + mov.u32 %r30450, %r30429; + mov.u32 %r30451, %r30429; + mov.u32 %r30452, %r30429; + mov.u32 %r30453, %r30429; + mov.u32 %r30454, %r30429; + mov.u32 %r30455, %r30429; + mov.u32 %r30456, %r30429; + mov.u32 %r30457, %r30429; + mov.u32 %r30458, %r30429; + mov.u32 %r30459, %r30429; + mov.u32 %r30460, %r30429; + mov.u32 %r30461, %r30429; + mov.u32 %r30462, %r30429; + mov.u32 %r30479, %r30429; + +$L__BB2_46: + // begin inline asm + // xor5 + lop3.b32 %r15135, %r30465, %r30463, %r30461, 0x96; + lop3.b32 %r15135, %r15135, %r30459, %r30457, 0x96; + lop3.b32 %r15136, %r30466, %r30464, %r30462, 0x96; + lop3.b32 %r15136, %r15136, %r30460, %r30458, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15147, %r30477, %r30475, %r30455, 0x96; + lop3.b32 %r15147, %r15147, %r30453, %r30451, 0x96; + lop3.b32 %r15148, %r30478, %r30476, %r30456, 0x96; + lop3.b32 %r15148, %r15148, %r30454, %r30452, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15159, %r30473, %r30471, %r30449, 0x96; + lop3.b32 %r15159, %r15159, %r30447, %r30445, 0x96; + lop3.b32 %r15160, %r30474, %r30472, %r30450, 0x96; + lop3.b32 %r15160, %r15160, %r30448, %r30446, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15171, %r30469, %r30443, %r30441, 0x96; + lop3.b32 %r15171, %r15171, %r30439, %r30437, 0x96; + lop3.b32 %r15172, %r30470, %r30444, %r30442, 0x96; + lop3.b32 %r15172, %r15172, %r30440, %r30438, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15183, %r30467, %r30435, %r30433, 0x96; + lop3.b32 %r15183, %r15183, %r30431, %r30429, 0x96; + lop3.b32 %r15184, %r30468, %r30436, %r30434, 0x96; + lop3.b32 %r15184, %r15184, %r30432, %r30430, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15195, %r15148, %r15147, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15199, %r15147, %r15148, %r15108; + // end inline asm + xor.b32 %r15629, %r15195, %r15183; + xor.b32 %r15630, %r15199, %r15184; + xor.b32 %r15462, %r30465, %r15629; + xor.b32 %r15465, %r30466, %r15630; + xor.b32 %r15369, %r30463, %r15629; + xor.b32 %r15368, %r30464, %r15630; + xor.b32 %r15416, %r30461, %r15629; + xor.b32 %r15417, %r30462, %r15630; + xor.b32 %r15321, %r30459, %r15629; + xor.b32 %r15320, %r30460, %r15630; + xor.b32 %r15272, %r30457, %r15629; + xor.b32 %r15273, %r30458, %r15630; + // begin inline asm + shf.l.wrap.b32 %r15203, %r15160, %r15159, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15207, %r15159, %r15160, %r15108; + // end inline asm + xor.b32 %r15631, %r15203, %r15135; + xor.b32 %r15632, %r15207, %r15136; + xor.b32 %r15424, %r30477, %r15631; + xor.b32 %r15425, %r30478, %r15632; + xor.b32 %r15241, %r30475, %r15631; + xor.b32 %r15240, %r30476, %r15632; + xor.b32 %r15400, %r30455, %r15631; + xor.b32 %r15401, %r30456, %r15632; + xor.b32 %r15361, %r30453, %r15631; + xor.b32 %r15360, %r30454, %r15632; + xor.b32 %r15344, %r30451, %r15631; + xor.b32 %r15345, %r30452, %r15632; + // begin inline asm + shf.l.wrap.b32 %r15211, %r15172, %r15171, 
%r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15215, %r15171, %r15172, %r15108; + // end inline asm + xor.b32 %r15633, %r15211, %r15147; + xor.b32 %r15634, %r15215, %r15148; + xor.b32 %r15281, %r30473, %r15633; + xor.b32 %r15280, %r30474, %r15634; + xor.b32 %r15408, %r30471, %r15633; + xor.b32 %r15409, %r30472, %r15634; + xor.b32 %r15289, %r30449, %r15633; + xor.b32 %r15288, %r30450, %r15634; + xor.b32 %r15392, %r30447, %r15633; + xor.b32 %r15393, %r30448, %r15634; + xor.b32 %r15257, %r30445, %r15633; + xor.b32 %r15256, %r30446, %r15634; + // begin inline asm + shf.l.wrap.b32 %r15219, %r15184, %r15183, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15223, %r15183, %r15184, %r15108; + // end inline asm + xor.b32 %r15635, %r15219, %r15159; + xor.b32 %r15636, %r15223, %r15160; + xor.b32 %r15376, %r30469, %r15635; + xor.b32 %r15377, %r30470, %r15636; + xor.b32 %r15353, %r30443, %r15635; + xor.b32 %r15352, %r30444, %r15636; + xor.b32 %r15296, %r30441, %r15635; + xor.b32 %r15297, %r30442, %r15636; + xor.b32 %r15384, %r30439, %r15635; + xor.b32 %r15385, %r30440, %r15636; + xor.b32 %r15313, %r30437, %r15635; + xor.b32 %r15312, %r30438, %r15636; + // begin inline asm + shf.l.wrap.b32 %r15227, %r15136, %r15135, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15231, %r15135, %r15136, %r15108; + // end inline asm + xor.b32 %r15637, %r15227, %r15171; + xor.b32 %r15638, %r15231, %r15172; + xor.b32 %r15328, %r30467, %r15637; + xor.b32 %r15329, %r30468, %r15638; + xor.b32 %r15248, %r30435, %r15637; + xor.b32 %r15249, %r30436, %r15638; + xor.b32 %r15265, %r30433, %r15637; + xor.b32 %r15264, %r30434, %r15638; + xor.b32 %r15304, %r30431, %r15637; + xor.b32 %r15305, %r30432, %r15638; + xor.b32 %r15336, %r30429, %r15637; + xor.b32 %r15337, %r30430, %r15638; + mov.u32 %r15242, 44; + // begin inline asm + shf.l.wrap.b32 %r15235, %r15241, %r15240, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15239, %r15240, %r15241, %r15242; + // end inline asm + mov.u32 %r15250, 20; + // begin inline asm + shf.l.wrap.b32 %r15243, %r15249, %r15248, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15247, %r15248, %r15249, %r15250; + // end inline asm + mov.u32 %r15258, 61; + // begin inline asm + shf.l.wrap.b32 %r15251, %r15257, %r15256, %r15258; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15255, %r15256, %r15257, %r15258; + // end inline asm + mov.u32 %r15266, 39; + // begin inline asm + shf.l.wrap.b32 %r15259, %r15265, %r15264, %r15266; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15263, %r15264, %r15265, %r15266; + // end inline asm + mov.u32 %r15274, 18; + // begin inline asm + shf.l.wrap.b32 %r15267, %r15273, %r15272, %r15274; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15271, %r15272, %r15273, %r15274; + // end inline asm + mov.u32 %r15282, 62; + // begin inline asm + shf.l.wrap.b32 %r15275, %r15281, %r15280, %r15282; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15279, %r15280, %r15281, %r15282; + // end inline asm + mov.u32 %r15290, 43; + // begin inline asm + shf.l.wrap.b32 %r15283, %r15289, %r15288, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15287, %r15288, %r15289, %r15290; + // end inline asm + mov.u32 %r15298, 25; + // begin inline asm + shf.l.wrap.b32 %r15291, %r15297, %r15296, %r15298; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15295, %r15296, %r15297, %r15298; + // end inline asm + mov.u32 
%r15306, 8; + // begin inline asm + shf.l.wrap.b32 %r15299, %r15305, %r15304, %r15306; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15303, %r15304, %r15305, %r15306; + // end inline asm + mov.u32 %r15314, 56; + // begin inline asm + shf.l.wrap.b32 %r15307, %r15313, %r15312, %r15314; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15311, %r15312, %r15313, %r15314; + // end inline asm + mov.u32 %r15322, 41; + // begin inline asm + shf.l.wrap.b32 %r15315, %r15321, %r15320, %r15322; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15319, %r15320, %r15321, %r15322; + // end inline asm + mov.u32 %r15330, 27; + // begin inline asm + shf.l.wrap.b32 %r15323, %r15329, %r15328, %r15330; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15327, %r15328, %r15329, %r15330; + // end inline asm + mov.u32 %r15338, 14; + // begin inline asm + shf.l.wrap.b32 %r15331, %r15337, %r15336, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15335, %r15336, %r15337, %r15338; + // end inline asm + mov.u32 %r15346, 2; + // begin inline asm + shf.l.wrap.b32 %r15339, %r15345, %r15344, %r15346; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15343, %r15344, %r15345, %r15346; + // end inline asm + mov.u32 %r15354, 55; + // begin inline asm + shf.l.wrap.b32 %r15347, %r15353, %r15352, %r15354; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15351, %r15352, %r15353, %r15354; + // end inline asm + mov.u32 %r15362, 45; + // begin inline asm + shf.l.wrap.b32 %r15355, %r15361, %r15360, %r15362; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15359, %r15360, %r15361, %r15362; + // end inline asm + mov.u32 %r15370, 36; + // begin inline asm + shf.l.wrap.b32 %r15363, %r15369, %r15368, %r15370; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15367, %r15368, %r15369, %r15370; + // end inline asm + mov.u32 %r15378, 28; + // begin inline asm + shf.l.wrap.b32 %r15371, %r15377, %r15376, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15375, %r15376, %r15377, %r15378; + // end inline asm + mov.u32 %r15386, 21; + // begin inline asm + shf.l.wrap.b32 %r15379, %r15385, %r15384, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15383, %r15384, %r15385, %r15386; + // end inline asm + mov.u32 %r15394, 15; + // begin inline asm + shf.l.wrap.b32 %r15387, %r15393, %r15392, %r15394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15391, %r15392, %r15393, %r15394; + // end inline asm + mov.u32 %r15402, 10; + // begin inline asm + shf.l.wrap.b32 %r15395, %r15401, %r15400, %r15402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15399, %r15400, %r15401, %r15402; + // end inline asm + mov.u32 %r15410, 6; + // begin inline asm + shf.l.wrap.b32 %r15403, %r15409, %r15408, %r15410; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15407, %r15408, %r15409, %r15410; + // end inline asm + mov.u32 %r15418, 3; + // begin inline asm + shf.l.wrap.b32 %r15411, %r15417, %r15416, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15415, %r15416, %r15417, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15419, %r15425, %r15424, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15423, %r15424, %r15425, %r15108; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15427, %r15462, %r15235, %r15283, 0xD2; + lop3.b32 %r15428, %r15465, %r15239, %r15287, 0xD2; + // end inline asm + // begin inline asm + // 
chi + lop3.b32 %r30477, %r15235, %r15283, %r15379, 0xD2; + lop3.b32 %r30478, %r15239, %r15287, %r15383, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30473, %r15283, %r15379, %r15331, 0xD2; + lop3.b32 %r30474, %r15287, %r15383, %r15335, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30469, %r15379, %r15331, %r15462, 0xD2; + lop3.b32 %r30470, %r15383, %r15335, %r15465, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30467, %r15331, %r15462, %r15235, 0xD2; + lop3.b32 %r30468, %r15335, %r15465, %r15239, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30463, %r15371, %r15243, %r15411, 0xD2; + lop3.b32 %r30464, %r15375, %r15247, %r15415, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30475, %r15243, %r15411, %r15355, 0xD2; + lop3.b32 %r30476, %r15247, %r15415, %r15359, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30471, %r15411, %r15355, %r15251, 0xD2; + lop3.b32 %r30472, %r15415, %r15359, %r15255, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30443, %r15355, %r15251, %r15371, 0xD2; + lop3.b32 %r30444, %r15359, %r15255, %r15375, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30443, %r30444}; + // begin inline asm + // chi + lop3.b32 %r30435, %r15251, %r15371, %r15243, 0xD2; + lop3.b32 %r30436, %r15255, %r15375, %r15247, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30435, %r30436}; + // begin inline asm + // chi + lop3.b32 %r30461, %r15419, %r15403, %r15291, 0xD2; + lop3.b32 %r30462, %r15423, %r15407, %r15295, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30461, %r30462}; + // begin inline asm + // chi + lop3.b32 %r30455, %r15403, %r15291, %r15299, 0xD2; + lop3.b32 %r30456, %r15407, %r15295, %r15303, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30455, %r30456}; + // begin inline asm + // chi + lop3.b32 %r30449, %r15291, %r15299, %r15267, 0xD2; + lop3.b32 %r30450, %r15295, %r15303, %r15271, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30449, %r30450}; + // begin inline asm + // chi + lop3.b32 %r30441, %r15299, %r15267, %r15419, 0xD2; + lop3.b32 %r30442, %r15303, %r15271, %r15423, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30441, %r30442}; + // begin inline asm + // chi + lop3.b32 %r30433, %r15267, %r15419, %r15403, 0xD2; + lop3.b32 %r30434, %r15271, %r15423, %r15407, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30433, %r30434}; + // begin inline asm + // chi + lop3.b32 %r30459, %r15323, %r15363, %r15395, 0xD2; + lop3.b32 %r30460, %r15327, %r15367, %r15399, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30459, %r30460}; + // begin inline asm + // chi + lop3.b32 %r30453, %r15363, %r15395, %r15387, 0xD2; + lop3.b32 %r30454, %r15367, %r15399, %r15391, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30453, %r30454}; + // begin inline asm + // chi + lop3.b32 %r30447, %r15395, %r15387, %r15307, 0xD2; + lop3.b32 %r30448, %r15399, %r15391, %r15311, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30447, %r30448}; + // begin inline asm + // chi + lop3.b32 %r30439, %r15387, %r15307, %r15323, 0xD2; + lop3.b32 %r30440, %r15391, %r15311, %r15327, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30439, %r30440}; + // begin inline asm + // chi + lop3.b32 %r30431, %r15307, %r15323, %r15363, 0xD2; + lop3.b32 %r30432, %r15311, %r15327, %r15367, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30431, %r30432}; 
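+ // note: lop3.b32 evaluates an arbitrary 3-input boolean function selected by
+ // its immLut byte (with a=0xF0, b=0xCC, c=0xAA). The two tables used in this
+ // kernel are 0x96 = a^b^c (3-input XOR, applied twice per five-way theta
+ // parity) and 0xD2 = a^(~b&c) (the Keccak chi step), each evaluated
+ // independently on the low and high 32-bit halves of a 64-bit lane.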
+ // begin inline asm + // chi + lop3.b32 %r30457, %r15275, %r15347, %r15259, 0xD2; + lop3.b32 %r30458, %r15279, %r15351, %r15263, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30457, %r30458}; + // begin inline asm + // chi + lop3.b32 %r30451, %r15347, %r15259, %r15315, 0xD2; + lop3.b32 %r30452, %r15351, %r15263, %r15319, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30451, %r30452}; + // begin inline asm + // chi + lop3.b32 %r30445, %r15259, %r15315, %r15339, 0xD2; + lop3.b32 %r30446, %r15263, %r15319, %r15343, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30445, %r30446}; + // begin inline asm + // chi + lop3.b32 %r30437, %r15315, %r15339, %r15275, 0xD2; + lop3.b32 %r30438, %r15319, %r15343, %r15279, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30437, %r30438}; + // begin inline asm + // chi + lop3.b32 %r30429, %r15339, %r15275, %r15347, 0xD2; + lop3.b32 %r30430, %r15343, %r15279, %r15351, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30429, %r30430}; + mul.wide.s32 %rd757, %r30479, 8; + add.s64 %rd756, %rd689, %rd757; + // begin inline asm + ld.global.nc.v2.u32 {%r15627,%r15628}, [%rd756]; + // end inline asm + xor.b32 %r30465, %r15427, %r15627; + xor.b32 %r30466, %r15428, %r15628; + add.s32 %r30479, %r30479, 1; + setp.lt.u32 %p29, %r30479, 23; + @%p29 bra $L__BB2_46; + + st.local.v2.u32 [%rd2+32], {%r30477, %r30478}; + st.local.v2.u32 [%rd2+72], {%r30475, %r30476}; + st.local.v2.u32 [%rd2+40], {%r30473, %r30474}; + st.local.v2.u32 [%rd2+80], {%r30471, %r30472}; + st.local.v2.u32 [%rd2+48], {%r30469, %r30470}; + st.local.v2.u32 [%rd2+56], {%r30467, %r30468}; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + // begin inline asm + // xor5 + lop3.b32 %r15639, %r30465, %r30463, %r30461, 0x96; + lop3.b32 %r15639, %r15639, %r30459, %r30457, 0x96; + lop3.b32 %r15640, %r30466, %r30464, %r30462, 0x96; + lop3.b32 %r15640, %r15640, %r30460, %r30458, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15651, %r30477, %r30475, %r30455, 0x96; + lop3.b32 %r15651, %r15651, %r30453, %r30451, 0x96; + lop3.b32 %r15652, %r30478, %r30476, %r30456, 0x96; + lop3.b32 %r15652, %r15652, %r30454, %r30452, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15663, %r30473, %r30471, %r30449, 0x96; + lop3.b32 %r15663, %r15663, %r30447, %r30445, 0x96; + lop3.b32 %r15664, %r30474, %r30472, %r30450, 0x96; + lop3.b32 %r15664, %r15664, %r30448, %r30446, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15675, %r30469, %r30443, %r30441, 0x96; + lop3.b32 %r15675, %r15675, %r30439, %r30437, 0x96; + lop3.b32 %r15676, %r30470, %r30444, %r30442, 0x96; + lop3.b32 %r15676, %r15676, %r30440, %r30438, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15687, %r30467, %r30435, %r30433, 0x96; + lop3.b32 %r15687, %r15687, %r30431, %r30429, 0x96; + lop3.b32 %r15688, %r30468, %r30436, %r30434, 0x96; + lop3.b32 %r15688, %r15688, %r30432, %r30430, 0x96; + // end inline asm + mov.u32 %r15891, 1; + // begin inline asm + shf.l.wrap.b32 %r15699, %r15652, %r15651, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15703, %r15651, %r15652, %r15891; + // end inline asm + xor.b32 %r15918, %r15699, %r15687; + xor.b32 %r15919, %r15703, %r15688; + xor.b32 %r15846, %r30465, %r15918; + xor.b32 %r15849, %r30466, %r15919; + xor.b32 %r15809, %r30462, %r15919; + xor.b32 %r15808, %r30461, %r15918; + st.local.v2.u32 [%rd2+104], {%r15808, %r15809}; + // begin inline asm + shf.l.wrap.b32 
%r15707, %r15664, %r15663, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15711, %r15663, %r15664, %r15891; + // end inline asm + xor.b32 %r15920, %r15707, %r15639; + xor.b32 %r15921, %r15711, %r15640; + xor.b32 %r15745, %r30475, %r15920; + xor.b32 %r15744, %r30476, %r15921; + xor.b32 %r15784, %r30454, %r15921; + xor.b32 %r15785, %r30453, %r15920; + st.local.v2.u32 [%rd2+152], {%r15785, %r15784}; + // begin inline asm + shf.l.wrap.b32 %r15715, %r15676, %r15675, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15719, %r15675, %r15676, %r15891; + // end inline asm + xor.b32 %r15922, %r15715, %r15651; + xor.b32 %r15923, %r15719, %r15652; + xor.b32 %r15768, %r30450, %r15923; + xor.b32 %r15769, %r30449, %r15922; + st.local.v2.u32 [%rd2+120], {%r15769, %r15768}; + xor.b32 %r15760, %r30446, %r15923; + xor.b32 %r15761, %r30445, %r15922; + st.local.v2.u32 [%rd2+200], {%r15761, %r15760}; + // begin inline asm + shf.l.wrap.b32 %r15723, %r15688, %r15687, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15727, %r15687, %r15688, %r15891; + // end inline asm + xor.b32 %r15924, %r15723, %r15663; + xor.b32 %r15925, %r15727, %r15664; + xor.b32 %r15792, %r30469, %r15924; + xor.b32 %r15793, %r30470, %r15925; + xor.b32 %r15801, %r30440, %r15925; + xor.b32 %r15800, %r30439, %r15924; + st.local.v2.u32 [%rd2+168], {%r15800, %r15801}; + // begin inline asm + shf.l.wrap.b32 %r15731, %r15640, %r15639, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15735, %r15639, %r15640, %r15891; + // end inline asm + xor.b32 %r15926, %r15731, %r15675; + xor.b32 %r15927, %r15735, %r15676; + xor.b32 %r15752, %r30435, %r15926; + xor.b32 %r15753, %r30436, %r15927; + xor.b32 %r15777, %r30430, %r15927; + xor.b32 %r15776, %r30429, %r15926; + st.local.v2.u32 [%rd2+216], {%r15776, %r15777}; + // begin inline asm + shf.l.wrap.b32 %r15739, %r15745, %r15744, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15743, %r15744, %r15745, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15747, %r15753, %r15752, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15751, %r15752, %r15753, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15759, %r15760, %r15761, %r15258; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15755, %r15761, %r15760, %r15258; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r15755, %r15759}; + // begin inline asm + shf.l.wrap.b32 %r15763, %r15769, %r15768, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15767, %r15768, %r15769, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15771, %r15777, %r15776, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15775, %r15776, %r15777, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15783, %r15784, %r15785, %r15362; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15779, %r15785, %r15784, %r15362; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r15779, %r15783}; + // begin inline asm + shf.l.wrap.b32 %r15787, %r15793, %r15792, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15791, %r15792, %r15793, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15795, %r15801, %r15800, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15799, %r15800, %r15801, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15803, %r15809, %r15808, %r15418; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r15807, %r15808, %r15809, %r15418; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15811, %r15846, %r15739, %r15763, 0xD2; + lop3.b32 %r15812, %r15849, %r15743, %r15767, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15819, %r15739, %r15763, %r15795, 0xD2; + lop3.b32 %r15820, %r15743, %r15767, %r15799, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r15819, %r15820}; + // begin inline asm + // chi + lop3.b32 %r15827, %r15763, %r15795, %r15771, 0xD2; + lop3.b32 %r15828, %r15767, %r15799, %r15775, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r15827, %r15828}; + // begin inline asm + // chi + lop3.b32 %r15835, %r15795, %r15771, %r15846, 0xD2; + lop3.b32 %r15836, %r15799, %r15775, %r15849, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r15835, %r15836}; + // begin inline asm + // chi + lop3.b32 %r15843, %r15771, %r15846, %r15739, 0xD2; + lop3.b32 %r15844, %r15775, %r15849, %r15743, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r15843, %r15844}; + // begin inline asm + // chi + lop3.b32 %r15851, %r15787, %r15747, %r15803, 0xD2; + lop3.b32 %r15852, %r15791, %r15751, %r15807, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r15851, %r15852}; + // begin inline asm + // chi + lop3.b32 %r15859, %r15747, %r15803, %r15779, 0xD2; + lop3.b32 %r15860, %r15751, %r15807, %r15783, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r15859, %r15860}; + // begin inline asm + // chi + lop3.b32 %r15867, %r15803, %r15779, %r15755, 0xD2; + lop3.b32 %r15868, %r15807, %r15783, %r15759, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r15867, %r15868}; + // begin inline asm + ld.global.nc.v2.u32 {%r15875,%r15876}, [%rd690]; + // end inline asm + xor.b32 %r15928, %r15812, %r15876; + xor.b32 %r15929, %r15811, %r15875; + mov.b64 %rd1333, {%r15929, %r15928}; + mov.b64 %rd1334, {%r15819, %r15820}; + mov.b64 %rd1335, {%r15827, %r15828}; + mov.b64 %rd1336, {%r15835, %r15836}; + mov.b64 %rd1337, {%r15843, %r15844}; + mov.b64 %rd1338, {%r15851, %r15852}; + mov.b64 %rd1339, {%r15859, %r15860}; + mov.b64 %rd1340, {%r15867, %r15868}; + mov.u32 %r30480, 0; + st.local.v2.u32 [%rd2+24], {%r15929, %r15928}; + st.local.v2.u32 [%rd147+96], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+104], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+112], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+120], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+128], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+136], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+144], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+152], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+160], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+168], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+176], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+184], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+192], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+200], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+208], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+216], {%r30480, %r30480}; + mov.u32 %r30495, -2147483648; + st.local.v2.u32 [%rd147+88], {%r15891, %r30495}; + mov.u32 %r30481, %r30480; + mov.u32 %r30482, %r30480; + mov.u32 %r30483, %r30480; + mov.u32 %r30484, %r30480; + mov.u32 %r30485, %r30480; + mov.u32 %r30486, %r30480; + mov.u32 %r30487, %r30480; + mov.u32 %r30488, %r30480; + mov.u32 %r30489, %r30480; + mov.u32 %r30490, %r30480; + mov.u32 %r30491, %r30480; + mov.u32 %r30492, %r30480; + mov.u32 %r30493, %r30480; + mov.u32 
%r30494, %r15891; + mov.u32 %r30496, %r30480; + mov.u32 %r30497, %r30480; + mov.u32 %r30498, %r30480; + mov.u32 %r30499, %r30480; + mov.u32 %r30500, %r30480; + mov.u32 %r30501, %r30480; + mov.u32 %r30502, %r30480; + mov.u32 %r30503, %r30480; + mov.u32 %r30504, %r30480; + mov.u32 %r30505, %r30480; + mov.u32 %r30506, %r30480; + mov.u32 %r30507, %r30480; + mov.u32 %r30508, %r30480; + mov.u32 %r30509, %r30480; + mov.u32 %r30510, %r30480; + mov.u32 %r30511, %r30480; + mov.u32 %r30512, %r30480; + mov.u32 %r30513, %r30480; + mov.u32 %r30530, %r30480; + +$L__BB2_48: + // begin inline asm + // xor5 + lop3.b32 %r15930, %r30516, %r30514, %r30512, 0x96; + lop3.b32 %r15930, %r15930, %r30510, %r30508, 0x96; + lop3.b32 %r15931, %r30517, %r30515, %r30513, 0x96; + lop3.b32 %r15931, %r15931, %r30511, %r30509, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15942, %r30528, %r30526, %r30506, 0x96; + lop3.b32 %r15942, %r15942, %r30504, %r30502, 0x96; + lop3.b32 %r15943, %r30529, %r30527, %r30507, 0x96; + lop3.b32 %r15943, %r15943, %r30505, %r30503, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15954, %r30524, %r30522, %r30500, 0x96; + lop3.b32 %r15954, %r15954, %r30498, %r30496, 0x96; + lop3.b32 %r15955, %r30525, %r30523, %r30501, 0x96; + lop3.b32 %r15955, %r15955, %r30499, %r30497, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15966, %r30520, %r30494, %r30492, 0x96; + lop3.b32 %r15966, %r15966, %r30490, %r30488, 0x96; + lop3.b32 %r15967, %r30521, %r30495, %r30493, 0x96; + lop3.b32 %r15967, %r15967, %r30491, %r30489, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15978, %r30518, %r30486, %r30484, 0x96; + lop3.b32 %r15978, %r15978, %r30482, %r30480, 0x96; + lop3.b32 %r15979, %r30519, %r30487, %r30485, 0x96; + lop3.b32 %r15979, %r15979, %r30483, %r30481, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15990, %r15943, %r15942, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15994, %r15942, %r15943, %r15891; + // end inline asm + xor.b32 %r16424, %r15990, %r15978; + xor.b32 %r16425, %r15994, %r15979; + xor.b32 %r16257, %r30516, %r16424; + xor.b32 %r16260, %r30517, %r16425; + xor.b32 %r16164, %r30514, %r16424; + xor.b32 %r16163, %r30515, %r16425; + xor.b32 %r16211, %r30512, %r16424; + xor.b32 %r16212, %r30513, %r16425; + xor.b32 %r16116, %r30510, %r16424; + xor.b32 %r16115, %r30511, %r16425; + xor.b32 %r16067, %r30508, %r16424; + xor.b32 %r16068, %r30509, %r16425; + // begin inline asm + shf.l.wrap.b32 %r15998, %r15955, %r15954, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16002, %r15954, %r15955, %r15891; + // end inline asm + xor.b32 %r16426, %r15998, %r15930; + xor.b32 %r16427, %r16002, %r15931; + xor.b32 %r16219, %r30528, %r16426; + xor.b32 %r16220, %r30529, %r16427; + xor.b32 %r16036, %r30526, %r16426; + xor.b32 %r16035, %r30527, %r16427; + xor.b32 %r16195, %r30506, %r16426; + xor.b32 %r16196, %r30507, %r16427; + xor.b32 %r16156, %r30504, %r16426; + xor.b32 %r16155, %r30505, %r16427; + xor.b32 %r16139, %r30502, %r16426; + xor.b32 %r16140, %r30503, %r16427; + // begin inline asm + shf.l.wrap.b32 %r16006, %r15967, %r15966, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16010, %r15966, %r15967, %r15891; + // end inline asm + xor.b32 %r16428, %r16006, %r15942; + xor.b32 %r16429, %r16010, %r15943; + xor.b32 %r16076, %r30524, %r16428; + xor.b32 %r16075, %r30525, %r16429; + xor.b32 %r16203, %r30522, %r16428; + xor.b32 %r16204, 
%r30523, %r16429; + xor.b32 %r16084, %r30500, %r16428; + xor.b32 %r16083, %r30501, %r16429; + xor.b32 %r16187, %r30498, %r16428; + xor.b32 %r16188, %r30499, %r16429; + xor.b32 %r16052, %r30496, %r16428; + xor.b32 %r16051, %r30497, %r16429; + // begin inline asm + shf.l.wrap.b32 %r16014, %r15979, %r15978, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16018, %r15978, %r15979, %r15891; + // end inline asm + xor.b32 %r16430, %r16014, %r15954; + xor.b32 %r16431, %r16018, %r15955; + xor.b32 %r16171, %r30520, %r16430; + xor.b32 %r16172, %r30521, %r16431; + xor.b32 %r16148, %r30494, %r16430; + xor.b32 %r16147, %r30495, %r16431; + xor.b32 %r16091, %r30492, %r16430; + xor.b32 %r16092, %r30493, %r16431; + xor.b32 %r16179, %r30490, %r16430; + xor.b32 %r16180, %r30491, %r16431; + xor.b32 %r16108, %r30488, %r16430; + xor.b32 %r16107, %r30489, %r16431; + // begin inline asm + shf.l.wrap.b32 %r16022, %r15931, %r15930, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16026, %r15930, %r15931, %r15891; + // end inline asm + xor.b32 %r16432, %r16022, %r15966; + xor.b32 %r16433, %r16026, %r15967; + xor.b32 %r16123, %r30518, %r16432; + xor.b32 %r16124, %r30519, %r16433; + xor.b32 %r16043, %r30486, %r16432; + xor.b32 %r16044, %r30487, %r16433; + xor.b32 %r16060, %r30484, %r16432; + xor.b32 %r16059, %r30485, %r16433; + xor.b32 %r16099, %r30482, %r16432; + xor.b32 %r16100, %r30483, %r16433; + xor.b32 %r16131, %r30480, %r16432; + xor.b32 %r16132, %r30481, %r16433; + mov.u32 %r16037, 44; + // begin inline asm + shf.l.wrap.b32 %r16030, %r16036, %r16035, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16034, %r16035, %r16036, %r16037; + // end inline asm + mov.u32 %r16045, 20; + // begin inline asm + shf.l.wrap.b32 %r16038, %r16044, %r16043, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16042, %r16043, %r16044, %r16045; + // end inline asm + mov.u32 %r16053, 61; + // begin inline asm + shf.l.wrap.b32 %r16046, %r16052, %r16051, %r16053; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16050, %r16051, %r16052, %r16053; + // end inline asm + mov.u32 %r16061, 39; + // begin inline asm + shf.l.wrap.b32 %r16054, %r16060, %r16059, %r16061; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16058, %r16059, %r16060, %r16061; + // end inline asm + mov.u32 %r16069, 18; + // begin inline asm + shf.l.wrap.b32 %r16062, %r16068, %r16067, %r16069; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16066, %r16067, %r16068, %r16069; + // end inline asm + mov.u32 %r16077, 62; + // begin inline asm + shf.l.wrap.b32 %r16070, %r16076, %r16075, %r16077; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16074, %r16075, %r16076, %r16077; + // end inline asm + mov.u32 %r16085, 43; + // begin inline asm + shf.l.wrap.b32 %r16078, %r16084, %r16083, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16082, %r16083, %r16084, %r16085; + // end inline asm + mov.u32 %r16093, 25; + // begin inline asm + shf.l.wrap.b32 %r16086, %r16092, %r16091, %r16093; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16090, %r16091, %r16092, %r16093; + // end inline asm + mov.u32 %r16101, 8; + // begin inline asm + shf.l.wrap.b32 %r16094, %r16100, %r16099, %r16101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16098, %r16099, %r16100, %r16101; + // end inline asm + mov.u32 %r16109, 56; + // begin inline asm + shf.l.wrap.b32 %r16102, %r16108, %r16107, %r16109; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r16106, %r16107, %r16108, %r16109; + // end inline asm + mov.u32 %r16117, 41; + // begin inline asm + shf.l.wrap.b32 %r16110, %r16116, %r16115, %r16117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16114, %r16115, %r16116, %r16117; + // end inline asm + mov.u32 %r16125, 27; + // begin inline asm + shf.l.wrap.b32 %r16118, %r16124, %r16123, %r16125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16122, %r16123, %r16124, %r16125; + // end inline asm + mov.u32 %r16133, 14; + // begin inline asm + shf.l.wrap.b32 %r16126, %r16132, %r16131, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16130, %r16131, %r16132, %r16133; + // end inline asm + mov.u32 %r16141, 2; + // begin inline asm + shf.l.wrap.b32 %r16134, %r16140, %r16139, %r16141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16138, %r16139, %r16140, %r16141; + // end inline asm + mov.u32 %r16149, 55; + // begin inline asm + shf.l.wrap.b32 %r16142, %r16148, %r16147, %r16149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16146, %r16147, %r16148, %r16149; + // end inline asm + mov.u32 %r16157, 45; + // begin inline asm + shf.l.wrap.b32 %r16150, %r16156, %r16155, %r16157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16154, %r16155, %r16156, %r16157; + // end inline asm + mov.u32 %r16165, 36; + // begin inline asm + shf.l.wrap.b32 %r16158, %r16164, %r16163, %r16165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16162, %r16163, %r16164, %r16165; + // end inline asm + mov.u32 %r16173, 28; + // begin inline asm + shf.l.wrap.b32 %r16166, %r16172, %r16171, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16170, %r16171, %r16172, %r16173; + // end inline asm + mov.u32 %r16181, 21; + // begin inline asm + shf.l.wrap.b32 %r16174, %r16180, %r16179, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16178, %r16179, %r16180, %r16181; + // end inline asm + mov.u32 %r16189, 15; + // begin inline asm + shf.l.wrap.b32 %r16182, %r16188, %r16187, %r16189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16186, %r16187, %r16188, %r16189; + // end inline asm + mov.u32 %r16197, 10; + // begin inline asm + shf.l.wrap.b32 %r16190, %r16196, %r16195, %r16197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16194, %r16195, %r16196, %r16197; + // end inline asm + mov.u32 %r16205, 6; + // begin inline asm + shf.l.wrap.b32 %r16198, %r16204, %r16203, %r16205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16202, %r16203, %r16204, %r16205; + // end inline asm + mov.u32 %r16213, 3; + // begin inline asm + shf.l.wrap.b32 %r16206, %r16212, %r16211, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16210, %r16211, %r16212, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16214, %r16220, %r16219, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16218, %r16219, %r16220, %r15891; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16222, %r16257, %r16030, %r16078, 0xD2; + lop3.b32 %r16223, %r16260, %r16034, %r16082, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30528, %r16030, %r16078, %r16174, 0xD2; + lop3.b32 %r30529, %r16034, %r16082, %r16178, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30524, %r16078, %r16174, %r16126, 0xD2; + lop3.b32 %r30525, %r16082, %r16178, %r16130, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 
%r30520, %r16174, %r16126, %r16257, 0xD2; + lop3.b32 %r30521, %r16178, %r16130, %r16260, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30518, %r16126, %r16257, %r16030, 0xD2; + lop3.b32 %r30519, %r16130, %r16260, %r16034, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30514, %r16166, %r16038, %r16206, 0xD2; + lop3.b32 %r30515, %r16170, %r16042, %r16210, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30526, %r16038, %r16206, %r16150, 0xD2; + lop3.b32 %r30527, %r16042, %r16210, %r16154, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30522, %r16206, %r16150, %r16046, 0xD2; + lop3.b32 %r30523, %r16210, %r16154, %r16050, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30494, %r16150, %r16046, %r16166, 0xD2; + lop3.b32 %r30495, %r16154, %r16050, %r16170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r30494, %r30495}; + // begin inline asm + // chi + lop3.b32 %r30486, %r16046, %r16166, %r16038, 0xD2; + lop3.b32 %r30487, %r16050, %r16170, %r16042, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r30486, %r30487}; + // begin inline asm + // chi + lop3.b32 %r30512, %r16214, %r16198, %r16086, 0xD2; + lop3.b32 %r30513, %r16218, %r16202, %r16090, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+104], {%r30512, %r30513}; + // begin inline asm + // chi + lop3.b32 %r30506, %r16198, %r16086, %r16094, 0xD2; + lop3.b32 %r30507, %r16202, %r16090, %r16098, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+112], {%r30506, %r30507}; + // begin inline asm + // chi + lop3.b32 %r30500, %r16086, %r16094, %r16062, 0xD2; + lop3.b32 %r30501, %r16090, %r16098, %r16066, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+120], {%r30500, %r30501}; + // begin inline asm + // chi + lop3.b32 %r30492, %r16094, %r16062, %r16214, 0xD2; + lop3.b32 %r30493, %r16098, %r16066, %r16218, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+128], {%r30492, %r30493}; + // begin inline asm + // chi + lop3.b32 %r30484, %r16062, %r16214, %r16198, 0xD2; + lop3.b32 %r30485, %r16066, %r16218, %r16202, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+136], {%r30484, %r30485}; + // begin inline asm + // chi + lop3.b32 %r30510, %r16118, %r16158, %r16190, 0xD2; + lop3.b32 %r30511, %r16122, %r16162, %r16194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+144], {%r30510, %r30511}; + // begin inline asm + // chi + lop3.b32 %r30504, %r16158, %r16190, %r16182, 0xD2; + lop3.b32 %r30505, %r16162, %r16194, %r16186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+152], {%r30504, %r30505}; + // begin inline asm + // chi + lop3.b32 %r30498, %r16190, %r16182, %r16102, 0xD2; + lop3.b32 %r30499, %r16194, %r16186, %r16106, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+160], {%r30498, %r30499}; + // begin inline asm + // chi + lop3.b32 %r30490, %r16182, %r16102, %r16118, 0xD2; + lop3.b32 %r30491, %r16186, %r16106, %r16122, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+168], {%r30490, %r30491}; + // begin inline asm + // chi + lop3.b32 %r30482, %r16102, %r16118, %r16158, 0xD2; + lop3.b32 %r30483, %r16106, %r16122, %r16162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+176], {%r30482, %r30483}; + // begin inline asm + // chi + lop3.b32 %r30508, %r16070, %r16142, %r16054, 0xD2; + lop3.b32 %r30509, %r16074, %r16146, %r16058, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+184], {%r30508, %r30509}; + // begin inline asm + // chi + lop3.b32 %r30502, %r16142, %r16054, %r16110, 0xD2; + lop3.b32 
%r30503, %r16146, %r16058, %r16114, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+192], {%r30502, %r30503}; + // begin inline asm + // chi + lop3.b32 %r30496, %r16054, %r16110, %r16134, 0xD2; + lop3.b32 %r30497, %r16058, %r16114, %r16138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+200], {%r30496, %r30497}; + // begin inline asm + // chi + lop3.b32 %r30488, %r16110, %r16134, %r16070, 0xD2; + lop3.b32 %r30489, %r16114, %r16138, %r16074, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+208], {%r30488, %r30489}; + // begin inline asm + // chi + lop3.b32 %r30480, %r16134, %r16070, %r16142, 0xD2; + lop3.b32 %r30481, %r16138, %r16074, %r16146, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+216], {%r30480, %r30481}; + mul.wide.s32 %rd764, %r30530, 8; + add.s64 %rd763, %rd689, %rd764; + // begin inline asm + ld.global.nc.v2.u32 {%r16422,%r16423}, [%rd763]; + // end inline asm + xor.b32 %r30516, %r16222, %r16422; + xor.b32 %r30517, %r16223, %r16423; + add.s32 %r30530, %r30530, 1; + setp.lt.u32 %p30, %r30530, 23; + @%p30 bra $L__BB2_48; + + mov.u32 %r16533, 1; + st.local.v2.u32 [%rd147+32], {%r30528, %r30529}; + st.local.v2.u32 [%rd147+72], {%r30526, %r30527}; + st.local.v2.u32 [%rd147+40], {%r30524, %r30525}; + st.local.v2.u32 [%rd147+80], {%r30522, %r30523}; + st.local.v2.u32 [%rd147+48], {%r30520, %r30521}; + st.local.v2.u32 [%rd147+56], {%r30518, %r30519}; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + // begin inline asm + // xor5 + lop3.b32 %r16434, %r30516, %r30514, %r30512, 0x96; + lop3.b32 %r16434, %r16434, %r30510, %r30508, 0x96; + lop3.b32 %r16435, %r30517, %r30515, %r30513, 0x96; + lop3.b32 %r16435, %r16435, %r30511, %r30509, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16446, %r30528, %r30526, %r30506, 0x96; + lop3.b32 %r16446, %r16446, %r30504, %r30502, 0x96; + lop3.b32 %r16447, %r30529, %r30527, %r30507, 0x96; + lop3.b32 %r16447, %r16447, %r30505, %r30503, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16458, %r30524, %r30522, %r30500, 0x96; + lop3.b32 %r16458, %r16458, %r30498, %r30496, 0x96; + lop3.b32 %r16459, %r30525, %r30523, %r30501, 0x96; + lop3.b32 %r16459, %r16459, %r30499, %r30497, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16470, %r30520, %r30494, %r30492, 0x96; + lop3.b32 %r16470, %r16470, %r30490, %r30488, 0x96; + lop3.b32 %r16471, %r30521, %r30495, %r30493, 0x96; + lop3.b32 %r16471, %r16471, %r30491, %r30489, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16482, %r30518, %r30486, %r30484, 0x96; + lop3.b32 %r16482, %r16482, %r30482, %r30480, 0x96; + lop3.b32 %r16483, %r30519, %r30487, %r30485, 0x96; + lop3.b32 %r16483, %r16483, %r30483, %r30481, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16494, %r16447, %r16446, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16498, %r16446, %r16447, %r16533; + // end inline asm + xor.b32 %r16672, %r16494, %r16482; + xor.b32 %r16673, %r16498, %r16483; + xor.b32 %r16641, %r30516, %r16672; + xor.b32 %r16644, %r30517, %r16673; + xor.b32 %r16604, %r30513, %r16673; + xor.b32 %r16603, %r30512, %r16672; + st.local.v2.u32 [%rd147+104], {%r16603, %r16604}; + // begin inline asm + shf.l.wrap.b32 %r16502, %r16459, %r16458, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16506, %r16458, %r16459, %r16533; + // end inline asm + xor.b32 %r16674, %r16502, %r16434; + xor.b32 %r16675, %r16506, %r16435; + xor.b32 %r16540, %r30526, %r16674; + xor.b32 %r16539, 
%r30527, %r16675; + xor.b32 %r16579, %r30505, %r16675; + xor.b32 %r16580, %r30504, %r16674; + st.local.v2.u32 [%rd147+152], {%r16580, %r16579}; + // begin inline asm + shf.l.wrap.b32 %r16510, %r16471, %r16470, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16514, %r16470, %r16471, %r16533; + // end inline asm + xor.b32 %r16676, %r16510, %r16446; + xor.b32 %r16677, %r16514, %r16447; + xor.b32 %r16563, %r30501, %r16677; + xor.b32 %r16564, %r30500, %r16676; + st.local.v2.u32 [%rd147+120], {%r16564, %r16563}; + xor.b32 %r16555, %r30497, %r16677; + xor.b32 %r16556, %r30496, %r16676; + st.local.v2.u32 [%rd147+200], {%r16556, %r16555}; + // begin inline asm + shf.l.wrap.b32 %r16518, %r16483, %r16482, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16522, %r16482, %r16483, %r16533; + // end inline asm + xor.b32 %r16678, %r16518, %r16458; + xor.b32 %r16679, %r16522, %r16459; + xor.b32 %r16587, %r30520, %r16678; + xor.b32 %r16588, %r30521, %r16679; + xor.b32 %r16596, %r30491, %r16679; + xor.b32 %r16595, %r30490, %r16678; + st.local.v2.u32 [%rd147+168], {%r16595, %r16596}; + // begin inline asm + shf.l.wrap.b32 %r16526, %r16435, %r16434, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16530, %r16434, %r16435, %r16533; + // end inline asm + xor.b32 %r16680, %r16526, %r16470; + xor.b32 %r16681, %r16530, %r16471; + xor.b32 %r16547, %r30486, %r16680; + xor.b32 %r16548, %r30487, %r16681; + xor.b32 %r16572, %r30481, %r16681; + xor.b32 %r16571, %r30480, %r16680; + st.local.v2.u32 [%rd147+216], {%r16571, %r16572}; + // begin inline asm + shf.l.wrap.b32 %r16534, %r16540, %r16539, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16538, %r16539, %r16540, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16542, %r16548, %r16547, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16546, %r16547, %r16548, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16554, %r16555, %r16556, %r16053; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16550, %r16556, %r16555, %r16053; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r16550, %r16554}; + // begin inline asm + shf.l.wrap.b32 %r16558, %r16564, %r16563, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16562, %r16563, %r16564, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16566, %r16572, %r16571, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16570, %r16571, %r16572, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16578, %r16579, %r16580, %r16157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16574, %r16580, %r16579, %r16157; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r16574, %r16578}; + // begin inline asm + shf.l.wrap.b32 %r16582, %r16588, %r16587, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16586, %r16587, %r16588, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16590, %r16596, %r16595, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16594, %r16595, %r16596, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16598, %r16604, %r16603, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16602, %r16603, %r16604, %r16213; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16606, %r16641, %r16534, %r16558, 0xD2; + lop3.b32 %r16607, %r16644, %r16538, %r16562, 0xD2; + // end inline 
asm + // begin inline asm + // chi + lop3.b32 %r16614, %r16534, %r16558, %r16590, 0xD2; + lop3.b32 %r16615, %r16538, %r16562, %r16594, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+32], {%r16614, %r16615}; + // begin inline asm + // chi + lop3.b32 %r16622, %r16558, %r16590, %r16566, 0xD2; + lop3.b32 %r16623, %r16562, %r16594, %r16570, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+40], {%r16622, %r16623}; + // begin inline asm + // chi + lop3.b32 %r16630, %r16590, %r16566, %r16641, 0xD2; + lop3.b32 %r16631, %r16594, %r16570, %r16644, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+48], {%r16630, %r16631}; + // begin inline asm + // chi + lop3.b32 %r16638, %r16566, %r16641, %r16534, 0xD2; + lop3.b32 %r16639, %r16570, %r16644, %r16538, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+56], {%r16638, %r16639}; + // begin inline asm + // chi + lop3.b32 %r16646, %r16582, %r16542, %r16598, 0xD2; + lop3.b32 %r16647, %r16586, %r16546, %r16602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+64], {%r16646, %r16647}; + // begin inline asm + // chi + lop3.b32 %r16654, %r16542, %r16598, %r16574, 0xD2; + lop3.b32 %r16655, %r16546, %r16602, %r16578, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+72], {%r16654, %r16655}; + // begin inline asm + // chi + lop3.b32 %r16662, %r16598, %r16574, %r16550, 0xD2; + lop3.b32 %r16663, %r16602, %r16578, %r16554, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+80], {%r16662, %r16663}; + // begin inline asm + ld.global.nc.v2.u32 {%r16670,%r16671}, [%rd690]; + // end inline asm + xor.b32 %r16682, %r16607, %r16671; + xor.b32 %r16683, %r16606, %r16670; + st.local.v2.u32 [%rd147+24], {%r16683, %r16682}; + mov.b64 %rd1342, {%r16614, %r16615}; + mov.b64 %rd1343, {%r16622, %r16623}; + mov.b64 %rd1346, {%r16646, %r16647}; + mov.b64 %rd1347, {%r16654, %r16655}; + mov.b64 %rd1348, {%r16662, %r16663}; + mov.b64 %rd1341, {%r16683, %r16682}; + mov.b64 %rd1344, {%r16630, %r16631}; + mov.b64 %rd1345, {%r16638, %r16639}; + st.global.u64 [%rd128], %rd1333; + st.global.u64 [%rd128+8], %rd1334; + st.global.u64 [%rd128+16], %rd1335; + st.global.u64 [%rd128+24], %rd1336; + st.global.u64 [%rd128+32], %rd1337; + st.global.u64 [%rd128+40], %rd1338; + st.global.u64 [%rd128+48], %rd1339; + st.global.u64 [%rd128+56], %rd1340; + st.global.v2.u32 [%rd128+64], {%r16683, %r16682}; + st.global.v2.u32 [%rd128+72], {%r16614, %r16615}; + st.global.v2.u32 [%rd128+80], {%r16622, %r16623}; + st.global.v2.u32 [%rd128+88], {%r16630, %r16631}; + st.global.v2.u32 [%rd128+96], {%r16638, %r16639}; + st.global.v2.u32 [%rd128+104], {%r16646, %r16647}; + st.global.v2.u32 [%rd128+112], {%r16654, %r16655}; + st.global.v2.u32 [%rd128+120], {%r16662, %r16663}; + +$L__BB2_61: + cvta.to.global.u64 %rd1266, %rd361; + shl.b32 %r3343, %r46, 1; + mul.wide.u32 %rd870, %r3343, -954391867; + shr.u64 %rd871, %rd870, 32; + cvt.u32.u64 %r19968, %rd871; + sub.s32 %r19969, %r3343, %r19968; + shr.u32 %r19970, %r19969, 1; + add.s32 %r19971, %r19970, %r19968; + shr.u32 %r19972, %r19971, 20; + mul.lo.s32 %r19973, %r19972, 1179641; + sub.s32 %r19974, %r3343, %r19973; + mul.wide.u32 %rd873, %r19974, 64; + add.s64 %rd220, %rd1266, %rd873; + or.b32 %r3344, %r3343, 1; + mul.wide.u32 %rd874, %r3344, -954391867; + shr.u64 %rd875, %rd874, 32; + cvt.u32.u64 %r19975, %rd875; + sub.s32 %r19976, %r3344, %r19975; + shr.u32 %r19977, %r19976, 1; + add.s32 %r19978, %r19977, %r19975; + shr.u32 %r19979, %r19978, 20; + mul.lo.s32 %r19980, %r19979, 1179641; + sub.s32 %r19981, %r3344, %r19980; + mul.wide.u32 %rd876, %r19981, 
64; + add.s64 %rd221, %rd1266, %rd876; + @%p12 bra $L__BB2_75; + + cvta.to.global.u64 %rd877, %rd360; + mul.wide.u32 %rd878, %r46, 128; + add.s64 %rd222, %rd877, %rd878; + ld.global.u64 %rd1349, [%rd222]; + setp.eq.s64 %p37, %rd1349, 0; + @%p37 bra $L__BB2_64; + + ld.global.u64 %rd1364, [%rd222+120]; + ld.global.u64 %rd1363, [%rd222+112]; + ld.global.u64 %rd1362, [%rd222+104]; + ld.global.u64 %rd1361, [%rd222+96]; + ld.global.u64 %rd1360, [%rd222+88]; + ld.global.u64 %rd1359, [%rd222+80]; + ld.global.u64 %rd1358, [%rd222+72]; + ld.global.u64 %rd1357, [%rd222+64]; + ld.global.u64 %rd1356, [%rd222+56]; + ld.global.u64 %rd1355, [%rd222+48]; + ld.global.u64 %rd1354, [%rd222+40]; + ld.global.u64 %rd1353, [%rd222+32]; + ld.global.u64 %rd1352, [%rd222+24]; + ld.global.u64 %rd1351, [%rd222+16]; + ld.global.u64 %rd1350, [%rd222+8]; + bra.uni $L__BB2_86; + +$L__BB2_75: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd994, 1179641; + st.local.u64 [%rd2+8], %rd994; + st.local.u32 [%rd2+16], %r3343; + ld.global.u64 %rd995, [%rd220]; + ld.global.u64 %rd996, [%rd220+8]; + ld.global.u64 %rd997, [%rd220+16]; + ld.global.u64 %rd998, [%rd220+24]; + ld.global.u64 %rd999, [%rd220+32]; + ld.global.u64 %rd1000, [%rd220+40]; + ld.global.u64 %rd1001, [%rd220+48]; + ld.global.u64 %rd1002, [%rd220+56]; + st.local.u64 [%rd2+24], %rd995; + st.local.u64 [%rd2+32], %rd996; + st.local.u64 [%rd2+40], %rd997; + st.local.u64 [%rd2+48], %rd998; + st.local.u64 [%rd2+56], %rd999; + st.local.u64 [%rd2+64], %rd1000; + st.local.u64 [%rd2+72], %rd1001; + st.local.u64 [%rd2+80], %rd1002; + cvt.u32.u64 %r23308, %rd995; + xor.b32 %r23309, %r3343, %r23308; + st.local.u32 [%rd2+24], %r23309; + mov.u32 %r31005, 0; + st.local.v2.u32 [%rd2+96], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+104], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+112], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+120], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+128], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+136], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+144], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+152], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+160], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+168], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+176], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+184], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+192], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+200], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+208], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+216], {%r31005, %r31005}; + mov.u32 %r31020, -2147483648; + mov.u32 %r23281, 1; + st.local.v2.u32 [%rd2+88], {%r23281, %r31020}; + ld.local.v2.u32 {%r31041, %r31042}, [%rd2+24]; + mov.b64 {%r31039, %r31040}, %rd1000; + shr.u64 %rd1003, %rd996, 32; + cvt.u32.u64 %r31053, %rd996; + cvt.u32.u64 %r31054, %rd1003; + shr.u64 %rd1004, %rd1001, 32; + cvt.u32.u64 %r31051, %rd1001; + cvt.u32.u64 %r31052, %rd1004; + shr.u64 %rd1005, %rd997, 32; + cvt.u32.u64 %r31049, %rd997; + cvt.u32.u64 %r31050, %rd1005; + shr.u64 %rd1006, %rd1002, 32; + cvt.u32.u64 %r31047, %rd1002; + cvt.u32.u64 %r31048, %rd1006; + shr.u64 %rd1007, %rd998, 32; + cvt.u32.u64 %r31045, %rd998; + cvt.u32.u64 %r31046, %rd1007; + shr.u64 %rd1008, %rd999, 32; + cvt.u32.u64 %r31043, %rd999; + cvt.u32.u64 %r31044, %rd1008; + mov.u32 %r31006, %r31005; + mov.u32 %r31007, %r31005; + mov.u32 %r31008, %r31005; + mov.u32 %r31009, %r31005; + mov.u32 %r31010, %r31005; + mov.u32 %r31011, %r31005; + mov.u32 %r31012, %r31005; + mov.u32 %r31013, %r31005; + mov.u32 %r31014, %r31005; + mov.u32 %r31015, %r31005; + mov.u32 %r31016, 
%r31005; + mov.u32 %r31017, %r31005; + mov.u32 %r31018, %r31005; + mov.u32 %r31019, %r23281; + mov.u32 %r31021, %r31005; + mov.u32 %r31022, %r31005; + mov.u32 %r31023, %r31005; + mov.u32 %r31024, %r31005; + mov.u32 %r31025, %r31005; + mov.u32 %r31026, %r31005; + mov.u32 %r31027, %r31005; + mov.u32 %r31028, %r31005; + mov.u32 %r31029, %r31005; + mov.u32 %r31030, %r31005; + mov.u32 %r31031, %r31005; + mov.u32 %r31032, %r31005; + mov.u32 %r31033, %r31005; + mov.u32 %r31034, %r31005; + mov.u32 %r31035, %r31005; + mov.u32 %r31036, %r31005; + mov.u32 %r31037, %r31005; + mov.u32 %r31038, %r31005; + mov.u32 %r31055, %r31005; + +$L__BB2_76: + // begin inline asm + // xor5 + lop3.b32 %r23312, %r31041, %r31039, %r31037, 0x96; + lop3.b32 %r23312, %r23312, %r31035, %r31033, 0x96; + lop3.b32 %r23313, %r31042, %r31040, %r31038, 0x96; + lop3.b32 %r23313, %r23313, %r31036, %r31034, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23324, %r31053, %r31051, %r31031, 0x96; + lop3.b32 %r23324, %r23324, %r31029, %r31027, 0x96; + lop3.b32 %r23325, %r31054, %r31052, %r31032, 0x96; + lop3.b32 %r23325, %r23325, %r31030, %r31028, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23336, %r31049, %r31047, %r31025, 0x96; + lop3.b32 %r23336, %r23336, %r31023, %r31021, 0x96; + lop3.b32 %r23337, %r31050, %r31048, %r31026, 0x96; + lop3.b32 %r23337, %r23337, %r31024, %r31022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23348, %r31045, %r31019, %r31017, 0x96; + lop3.b32 %r23348, %r23348, %r31015, %r31013, 0x96; + lop3.b32 %r23349, %r31046, %r31020, %r31018, 0x96; + lop3.b32 %r23349, %r23349, %r31016, %r31014, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23360, %r31043, %r31011, %r31009, 0x96; + lop3.b32 %r23360, %r23360, %r31007, %r31005, 0x96; + lop3.b32 %r23361, %r31044, %r31012, %r31010, 0x96; + lop3.b32 %r23361, %r23361, %r31008, %r31006, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23372, %r23325, %r23324, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23376, %r23324, %r23325, %r23281; + // end inline asm + xor.b32 %r23806, %r23372, %r23360; + xor.b32 %r23807, %r23376, %r23361; + xor.b32 %r23639, %r31041, %r23806; + xor.b32 %r23642, %r31042, %r23807; + xor.b32 %r23546, %r31039, %r23806; + xor.b32 %r23545, %r31040, %r23807; + xor.b32 %r23593, %r31037, %r23806; + xor.b32 %r23594, %r31038, %r23807; + xor.b32 %r23498, %r31035, %r23806; + xor.b32 %r23497, %r31036, %r23807; + xor.b32 %r23449, %r31033, %r23806; + xor.b32 %r23450, %r31034, %r23807; + // begin inline asm + shf.l.wrap.b32 %r23380, %r23337, %r23336, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23384, %r23336, %r23337, %r23281; + // end inline asm + xor.b32 %r23808, %r23380, %r23312; + xor.b32 %r23809, %r23384, %r23313; + xor.b32 %r23601, %r31053, %r23808; + xor.b32 %r23602, %r31054, %r23809; + xor.b32 %r23418, %r31051, %r23808; + xor.b32 %r23417, %r31052, %r23809; + xor.b32 %r23577, %r31031, %r23808; + xor.b32 %r23578, %r31032, %r23809; + xor.b32 %r23538, %r31029, %r23808; + xor.b32 %r23537, %r31030, %r23809; + xor.b32 %r23521, %r31027, %r23808; + xor.b32 %r23522, %r31028, %r23809; + // begin inline asm + shf.l.wrap.b32 %r23388, %r23349, %r23348, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23392, %r23348, %r23349, %r23281; + // end inline asm + xor.b32 %r23810, %r23388, %r23324; + xor.b32 %r23811, %r23392, %r23325; + xor.b32 %r23458, %r31049, %r23810; + xor.b32 %r23457, 
%r31050, %r23811; + xor.b32 %r23585, %r31047, %r23810; + xor.b32 %r23586, %r31048, %r23811; + xor.b32 %r23466, %r31025, %r23810; + xor.b32 %r23465, %r31026, %r23811; + xor.b32 %r23569, %r31023, %r23810; + xor.b32 %r23570, %r31024, %r23811; + xor.b32 %r23434, %r31021, %r23810; + xor.b32 %r23433, %r31022, %r23811; + // begin inline asm + shf.l.wrap.b32 %r23396, %r23361, %r23360, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23400, %r23360, %r23361, %r23281; + // end inline asm + xor.b32 %r23812, %r23396, %r23336; + xor.b32 %r23813, %r23400, %r23337; + xor.b32 %r23553, %r31045, %r23812; + xor.b32 %r23554, %r31046, %r23813; + xor.b32 %r23530, %r31019, %r23812; + xor.b32 %r23529, %r31020, %r23813; + xor.b32 %r23473, %r31017, %r23812; + xor.b32 %r23474, %r31018, %r23813; + xor.b32 %r23561, %r31015, %r23812; + xor.b32 %r23562, %r31016, %r23813; + xor.b32 %r23490, %r31013, %r23812; + xor.b32 %r23489, %r31014, %r23813; + // begin inline asm + shf.l.wrap.b32 %r23404, %r23313, %r23312, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23408, %r23312, %r23313, %r23281; + // end inline asm + xor.b32 %r23814, %r23404, %r23348; + xor.b32 %r23815, %r23408, %r23349; + xor.b32 %r23505, %r31043, %r23814; + xor.b32 %r23506, %r31044, %r23815; + xor.b32 %r23425, %r31011, %r23814; + xor.b32 %r23426, %r31012, %r23815; + xor.b32 %r23442, %r31009, %r23814; + xor.b32 %r23441, %r31010, %r23815; + xor.b32 %r23481, %r31007, %r23814; + xor.b32 %r23482, %r31008, %r23815; + xor.b32 %r23513, %r31005, %r23814; + xor.b32 %r23514, %r31006, %r23815; + mov.u32 %r23419, 44; + // begin inline asm + shf.l.wrap.b32 %r23412, %r23418, %r23417, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23416, %r23417, %r23418, %r23419; + // end inline asm + mov.u32 %r23427, 20; + // begin inline asm + shf.l.wrap.b32 %r23420, %r23426, %r23425, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23424, %r23425, %r23426, %r23427; + // end inline asm + mov.u32 %r23435, 61; + // begin inline asm + shf.l.wrap.b32 %r23428, %r23434, %r23433, %r23435; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23432, %r23433, %r23434, %r23435; + // end inline asm + mov.u32 %r23443, 39; + // begin inline asm + shf.l.wrap.b32 %r23436, %r23442, %r23441, %r23443; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23440, %r23441, %r23442, %r23443; + // end inline asm + mov.u32 %r23451, 18; + // begin inline asm + shf.l.wrap.b32 %r23444, %r23450, %r23449, %r23451; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23448, %r23449, %r23450, %r23451; + // end inline asm + mov.u32 %r23459, 62; + // begin inline asm + shf.l.wrap.b32 %r23452, %r23458, %r23457, %r23459; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23456, %r23457, %r23458, %r23459; + // end inline asm + mov.u32 %r23467, 43; + // begin inline asm + shf.l.wrap.b32 %r23460, %r23466, %r23465, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23464, %r23465, %r23466, %r23467; + // end inline asm + mov.u32 %r23475, 25; + // begin inline asm + shf.l.wrap.b32 %r23468, %r23474, %r23473, %r23475; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23472, %r23473, %r23474, %r23475; + // end inline asm + mov.u32 %r23483, 8; + // begin inline asm + shf.l.wrap.b32 %r23476, %r23482, %r23481, %r23483; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23480, %r23481, %r23482, %r23483; + // end inline asm + mov.u32 %r23491, 56; + // begin inline asm + 
shf.l.wrap.b32 %r23484, %r23490, %r23489, %r23491; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23488, %r23489, %r23490, %r23491; + // end inline asm + mov.u32 %r23499, 41; + // begin inline asm + shf.l.wrap.b32 %r23492, %r23498, %r23497, %r23499; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23496, %r23497, %r23498, %r23499; + // end inline asm + mov.u32 %r23507, 27; + // begin inline asm + shf.l.wrap.b32 %r23500, %r23506, %r23505, %r23507; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23504, %r23505, %r23506, %r23507; + // end inline asm + mov.u32 %r23515, 14; + // begin inline asm + shf.l.wrap.b32 %r23508, %r23514, %r23513, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23512, %r23513, %r23514, %r23515; + // end inline asm + mov.u32 %r23523, 2; + // begin inline asm + shf.l.wrap.b32 %r23516, %r23522, %r23521, %r23523; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23520, %r23521, %r23522, %r23523; + // end inline asm + mov.u32 %r23531, 55; + // begin inline asm + shf.l.wrap.b32 %r23524, %r23530, %r23529, %r23531; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23528, %r23529, %r23530, %r23531; + // end inline asm + mov.u32 %r23539, 45; + // begin inline asm + shf.l.wrap.b32 %r23532, %r23538, %r23537, %r23539; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23536, %r23537, %r23538, %r23539; + // end inline asm + mov.u32 %r23547, 36; + // begin inline asm + shf.l.wrap.b32 %r23540, %r23546, %r23545, %r23547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23544, %r23545, %r23546, %r23547; + // end inline asm + mov.u32 %r23555, 28; + // begin inline asm + shf.l.wrap.b32 %r23548, %r23554, %r23553, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23552, %r23553, %r23554, %r23555; + // end inline asm + mov.u32 %r23563, 21; + // begin inline asm + shf.l.wrap.b32 %r23556, %r23562, %r23561, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23560, %r23561, %r23562, %r23563; + // end inline asm + mov.u32 %r23571, 15; + // begin inline asm + shf.l.wrap.b32 %r23564, %r23570, %r23569, %r23571; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23568, %r23569, %r23570, %r23571; + // end inline asm + mov.u32 %r23579, 10; + // begin inline asm + shf.l.wrap.b32 %r23572, %r23578, %r23577, %r23579; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23576, %r23577, %r23578, %r23579; + // end inline asm + mov.u32 %r23587, 6; + // begin inline asm + shf.l.wrap.b32 %r23580, %r23586, %r23585, %r23587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23584, %r23585, %r23586, %r23587; + // end inline asm + mov.u32 %r23595, 3; + // begin inline asm + shf.l.wrap.b32 %r23588, %r23594, %r23593, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23592, %r23593, %r23594, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23596, %r23602, %r23601, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23600, %r23601, %r23602, %r23281; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23604, %r23639, %r23412, %r23460, 0xD2; + lop3.b32 %r23605, %r23642, %r23416, %r23464, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31053, %r23412, %r23460, %r23556, 0xD2; + lop3.b32 %r31054, %r23416, %r23464, %r23560, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31049, %r23460, %r23556, %r23508, 0xD2; + lop3.b32 %r31050, %r23464, 
%r23560, %r23512, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31045, %r23556, %r23508, %r23639, 0xD2; + lop3.b32 %r31046, %r23560, %r23512, %r23642, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31043, %r23508, %r23639, %r23412, 0xD2; + lop3.b32 %r31044, %r23512, %r23642, %r23416, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31039, %r23548, %r23420, %r23588, 0xD2; + lop3.b32 %r31040, %r23552, %r23424, %r23592, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31051, %r23420, %r23588, %r23532, 0xD2; + lop3.b32 %r31052, %r23424, %r23592, %r23536, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31047, %r23588, %r23532, %r23428, 0xD2; + lop3.b32 %r31048, %r23592, %r23536, %r23432, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31019, %r23532, %r23428, %r23548, 0xD2; + lop3.b32 %r31020, %r23536, %r23432, %r23552, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r31019, %r31020}; + // begin inline asm + // chi + lop3.b32 %r31011, %r23428, %r23548, %r23420, 0xD2; + lop3.b32 %r31012, %r23432, %r23552, %r23424, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r31011, %r31012}; + // begin inline asm + // chi + lop3.b32 %r31037, %r23596, %r23580, %r23468, 0xD2; + lop3.b32 %r31038, %r23600, %r23584, %r23472, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r31037, %r31038}; + // begin inline asm + // chi + lop3.b32 %r31031, %r23580, %r23468, %r23476, 0xD2; + lop3.b32 %r31032, %r23584, %r23472, %r23480, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r31031, %r31032}; + // begin inline asm + // chi + lop3.b32 %r31025, %r23468, %r23476, %r23444, 0xD2; + lop3.b32 %r31026, %r23472, %r23480, %r23448, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r31025, %r31026}; + // begin inline asm + // chi + lop3.b32 %r31017, %r23476, %r23444, %r23596, 0xD2; + lop3.b32 %r31018, %r23480, %r23448, %r23600, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r31017, %r31018}; + // begin inline asm + // chi + lop3.b32 %r31009, %r23444, %r23596, %r23580, 0xD2; + lop3.b32 %r31010, %r23448, %r23600, %r23584, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r31009, %r31010}; + // begin inline asm + // chi + lop3.b32 %r31035, %r23500, %r23540, %r23572, 0xD2; + lop3.b32 %r31036, %r23504, %r23544, %r23576, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r31035, %r31036}; + // begin inline asm + // chi + lop3.b32 %r31029, %r23540, %r23572, %r23564, 0xD2; + lop3.b32 %r31030, %r23544, %r23576, %r23568, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r31029, %r31030}; + // begin inline asm + // chi + lop3.b32 %r31023, %r23572, %r23564, %r23484, 0xD2; + lop3.b32 %r31024, %r23576, %r23568, %r23488, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r31023, %r31024}; + // begin inline asm + // chi + lop3.b32 %r31015, %r23564, %r23484, %r23500, 0xD2; + lop3.b32 %r31016, %r23568, %r23488, %r23504, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r31015, %r31016}; + // begin inline asm + // chi + lop3.b32 %r31007, %r23484, %r23500, %r23540, 0xD2; + lop3.b32 %r31008, %r23488, %r23504, %r23544, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r31007, %r31008}; + // begin inline asm + // chi + lop3.b32 %r31033, %r23452, %r23524, %r23436, 0xD2; + lop3.b32 %r31034, %r23456, %r23528, %r23440, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r31033, %r31034}; + // begin inline asm + // chi + 
lop3.b32 %r31027, %r23524, %r23436, %r23492, 0xD2; + lop3.b32 %r31028, %r23528, %r23440, %r23496, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r31027, %r31028}; + // begin inline asm + // chi + lop3.b32 %r31021, %r23436, %r23492, %r23516, 0xD2; + lop3.b32 %r31022, %r23440, %r23496, %r23520, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r31021, %r31022}; + // begin inline asm + // chi + lop3.b32 %r31013, %r23492, %r23516, %r23452, 0xD2; + lop3.b32 %r31014, %r23496, %r23520, %r23456, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r31013, %r31014}; + // begin inline asm + // chi + lop3.b32 %r31005, %r23516, %r23452, %r23524, 0xD2; + lop3.b32 %r31006, %r23520, %r23456, %r23528, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r31005, %r31006}; + mul.wide.s32 %rd1010, %r31055, 8; + mov.u64 %rd1011, keccak_round_constants; + cvta.const.u64 %rd1012, %rd1011; + add.s64 %rd1009, %rd1012, %rd1010; + // begin inline asm + ld.global.nc.v2.u32 {%r23804,%r23805}, [%rd1009]; + // end inline asm + xor.b32 %r31041, %r23604, %r23804; + xor.b32 %r31042, %r23605, %r23805; + add.s32 %r31055, %r31055, 1; + setp.lt.u32 %p43, %r31055, 23; + @%p43 bra $L__BB2_76; + + add.u64 %rd270, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r31053, %r31054}; + st.local.v2.u32 [%rd2+72], {%r31051, %r31052}; + st.local.v2.u32 [%rd2+40], {%r31049, %r31050}; + st.local.v2.u32 [%rd2+80], {%r31047, %r31048}; + st.local.v2.u32 [%rd2+48], {%r31045, %r31046}; + st.local.v2.u32 [%rd2+56], {%r31043, %r31044}; + st.local.v2.u32 [%rd2+24], {%r31041, %r31042}; + // begin inline asm + // xor5 + lop3.b32 %r23816, %r31041, %r31039, %r31037, 0x96; + lop3.b32 %r23816, %r23816, %r31035, %r31033, 0x96; + lop3.b32 %r23817, %r31042, %r31040, %r31038, 0x96; + lop3.b32 %r23817, %r23817, %r31036, %r31034, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23828, %r31053, %r31051, %r31031, 0x96; + lop3.b32 %r23828, %r23828, %r31029, %r31027, 0x96; + lop3.b32 %r23829, %r31054, %r31052, %r31032, 0x96; + lop3.b32 %r23829, %r23829, %r31030, %r31028, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23840, %r31049, %r31047, %r31025, 0x96; + lop3.b32 %r23840, %r23840, %r31023, %r31021, 0x96; + lop3.b32 %r23841, %r31050, %r31048, %r31026, 0x96; + lop3.b32 %r23841, %r23841, %r31024, %r31022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23852, %r31045, %r31019, %r31017, 0x96; + lop3.b32 %r23852, %r23852, %r31015, %r31013, 0x96; + lop3.b32 %r23853, %r31046, %r31020, %r31018, 0x96; + lop3.b32 %r23853, %r23853, %r31016, %r31014, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23864, %r31043, %r31011, %r31009, 0x96; + lop3.b32 %r23864, %r23864, %r31007, %r31005, 0x96; + lop3.b32 %r23865, %r31044, %r31012, %r31010, 0x96; + lop3.b32 %r23865, %r23865, %r31008, %r31006, 0x96; + // end inline asm + mov.u32 %r31070, 1; + // begin inline asm + shf.l.wrap.b32 %r23876, %r23829, %r23828, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23880, %r23828, %r23829, %r31070; + // end inline asm + xor.b32 %r24095, %r23876, %r23864; + xor.b32 %r24096, %r23880, %r23865; + xor.b32 %r24023, %r31041, %r24095; + xor.b32 %r24026, %r31042, %r24096; + xor.b32 %r23986, %r31038, %r24096; + xor.b32 %r23985, %r31037, %r24095; + st.local.v2.u32 [%rd2+104], {%r23985, %r23986}; + // begin inline asm + shf.l.wrap.b32 %r23884, %r23841, %r23840, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23888, %r23840, %r23841, %r31070; + 
// end inline asm + xor.b32 %r24097, %r23884, %r23816; + xor.b32 %r24098, %r23888, %r23817; + xor.b32 %r23922, %r31051, %r24097; + xor.b32 %r23921, %r31052, %r24098; + xor.b32 %r23961, %r31030, %r24098; + xor.b32 %r23962, %r31029, %r24097; + st.local.v2.u32 [%rd2+152], {%r23962, %r23961}; + // begin inline asm + shf.l.wrap.b32 %r23892, %r23853, %r23852, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23896, %r23852, %r23853, %r31070; + // end inline asm + xor.b32 %r24099, %r23892, %r23828; + xor.b32 %r24100, %r23896, %r23829; + xor.b32 %r23945, %r31026, %r24100; + xor.b32 %r23946, %r31025, %r24099; + st.local.v2.u32 [%rd2+120], {%r23946, %r23945}; + xor.b32 %r23937, %r31022, %r24100; + xor.b32 %r23938, %r31021, %r24099; + st.local.v2.u32 [%rd2+200], {%r23938, %r23937}; + // begin inline asm + shf.l.wrap.b32 %r23900, %r23865, %r23864, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23904, %r23864, %r23865, %r31070; + // end inline asm + xor.b32 %r24101, %r23900, %r23840; + xor.b32 %r24102, %r23904, %r23841; + xor.b32 %r23969, %r31045, %r24101; + xor.b32 %r23970, %r31046, %r24102; + xor.b32 %r23978, %r31016, %r24102; + xor.b32 %r23977, %r31015, %r24101; + st.local.v2.u32 [%rd2+168], {%r23977, %r23978}; + // begin inline asm + shf.l.wrap.b32 %r23908, %r23817, %r23816, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23912, %r23816, %r23817, %r31070; + // end inline asm + xor.b32 %r24103, %r23908, %r23852; + xor.b32 %r24104, %r23912, %r23853; + xor.b32 %r23929, %r31011, %r24103; + xor.b32 %r23930, %r31012, %r24104; + xor.b32 %r23954, %r31006, %r24104; + xor.b32 %r23953, %r31005, %r24103; + st.local.v2.u32 [%rd2+216], {%r23953, %r23954}; + // begin inline asm + shf.l.wrap.b32 %r23916, %r23922, %r23921, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23920, %r23921, %r23922, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23924, %r23930, %r23929, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23928, %r23929, %r23930, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23936, %r23937, %r23938, %r23435; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23932, %r23938, %r23937, %r23435; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r23932, %r23936}; + // begin inline asm + shf.l.wrap.b32 %r23940, %r23946, %r23945, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23944, %r23945, %r23946, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23948, %r23954, %r23953, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23952, %r23953, %r23954, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23960, %r23961, %r23962, %r23539; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23956, %r23962, %r23961, %r23539; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r23956, %r23960}; + // begin inline asm + shf.l.wrap.b32 %r23964, %r23970, %r23969, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23968, %r23969, %r23970, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23972, %r23978, %r23977, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23976, %r23977, %r23978, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23980, %r23986, %r23985, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23984, %r23985, %r23986, %r23595; + // end inline asm + // begin inline 
asm + // chi + lop3.b32 %r23988, %r24023, %r23916, %r23940, 0xD2; + lop3.b32 %r23989, %r24026, %r23920, %r23944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31188, %r23916, %r23940, %r23972, 0xD2; + lop3.b32 %r31189, %r23920, %r23944, %r23976, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r31188, %r31189}; + // begin inline asm + // chi + lop3.b32 %r31184, %r23940, %r23972, %r23948, 0xD2; + lop3.b32 %r31185, %r23944, %r23976, %r23952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r31184, %r31185}; + // begin inline asm + // chi + lop3.b32 %r31180, %r23972, %r23948, %r24023, 0xD2; + lop3.b32 %r31181, %r23976, %r23952, %r24026, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r31180, %r31181}; + // begin inline asm + // chi + lop3.b32 %r31178, %r23948, %r24023, %r23916, 0xD2; + lop3.b32 %r31179, %r23952, %r24026, %r23920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r31178, %r31179}; + // begin inline asm + // chi + lop3.b32 %r31174, %r23964, %r23924, %r23980, 0xD2; + lop3.b32 %r31175, %r23968, %r23928, %r23984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r31174, %r31175}; + // begin inline asm + // chi + lop3.b32 %r31186, %r23924, %r23980, %r23956, 0xD2; + lop3.b32 %r31187, %r23928, %r23984, %r23960, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r31186, %r31187}; + // begin inline asm + // chi + lop3.b32 %r31182, %r23980, %r23956, %r23932, 0xD2; + lop3.b32 %r31183, %r23984, %r23960, %r23936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r31182, %r31183}; + add.s64 %rd1013, %rd1012, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r24052,%r24053}, [%rd1013]; + // end inline asm + xor.b32 %r31176, %r23988, %r24052; + xor.b32 %r31177, %r23989, %r24053; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + st.local.u64 [%rd270], %rd361; + mov.u64 %rd1017, 1179641; + st.local.u64 [%rd270+8], %rd1017; + st.local.u32 [%rd270+16], %r3344; + ld.global.u64 %rd1018, [%rd221]; + ld.global.u64 %rd1019, [%rd221+8]; + ld.global.u64 %rd1020, [%rd221+16]; + ld.global.u64 %rd1021, [%rd221+24]; + ld.global.u64 %rd1022, [%rd221+32]; + ld.global.u64 %rd1023, [%rd221+40]; + ld.global.u64 %rd1024, [%rd221+48]; + ld.global.u64 %rd1025, [%rd221+56]; + st.local.u64 [%rd270+32], %rd1019; + st.local.u64 [%rd270+40], %rd1020; + st.local.u64 [%rd270+48], %rd1021; + st.local.u64 [%rd270+56], %rd1022; + st.local.u64 [%rd270+64], %rd1023; + st.local.u64 [%rd270+72], %rd1024; + st.local.u64 [%rd270+80], %rd1025; + cvt.u32.u64 %r24105, %rd1018; + xor.b32 %r24106, %r3344, %r24105; + st.local.u64 [%rd270+24], %rd1018; + st.local.u32 [%rd270+24], %r24106; + mov.u32 %r31056, 0; + st.local.v2.u32 [%rd270+96], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+104], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+112], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+120], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+128], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+136], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+144], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+152], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+160], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+168], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+176], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+184], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+192], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+200], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+208], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+216], {%r31056, %r31056}; + mov.u32 %r31071, -2147483648; + 
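+ // annotation (editorial comment, not compiler output): the pair {%r31070, %r31071} =
+ // {1, 0x80000000} stored to [%rd270+88] below looks like the packed Keccak padding
+ // word 0x8000000000000001 (the 0x01 pad bit and the closing 0x80 bit land in the
+ // same 8-byte lane for this block size). The loop at $L__BB2_78 then runs rounds
+ // 0-22 of Keccak-f[1600] -- theta (lop3 immediate 0x96 = a^b^c parities), rho/pi
+ // (paired shf.l.wrap.b32 funnel-shift rotations on the 32-bit lane halves), chi
+ // (lop3 immediate 0xD2 = a ^ (~b & c)), iota (XOR with keccak_round_constants[i]) --
+ // and round 23 is peeled after the loop, loading the final round constant directly
+ // from byte offset 184 (= index 23 * 8).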
st.local.v2.u32 [%rd270+88], {%r31070, %r31071}; + ld.local.v2.u32 {%r31092, %r31093}, [%rd270+24]; + mov.b64 {%r31090, %r31091}, %rd1023; + shr.u64 %rd1026, %rd1019, 32; + cvt.u32.u64 %r31104, %rd1019; + cvt.u32.u64 %r31105, %rd1026; + shr.u64 %rd1027, %rd1024, 32; + cvt.u32.u64 %r31102, %rd1024; + cvt.u32.u64 %r31103, %rd1027; + shr.u64 %rd1028, %rd1020, 32; + cvt.u32.u64 %r31100, %rd1020; + cvt.u32.u64 %r31101, %rd1028; + shr.u64 %rd1029, %rd1025, 32; + cvt.u32.u64 %r31098, %rd1025; + cvt.u32.u64 %r31099, %rd1029; + shr.u64 %rd1030, %rd1021, 32; + cvt.u32.u64 %r31096, %rd1021; + cvt.u32.u64 %r31097, %rd1030; + shr.u64 %rd1031, %rd1022, 32; + cvt.u32.u64 %r31094, %rd1022; + cvt.u32.u64 %r31095, %rd1031; + mov.u32 %r31057, %r31056; + mov.u32 %r31058, %r31056; + mov.u32 %r31059, %r31056; + mov.u32 %r31060, %r31056; + mov.u32 %r31061, %r31056; + mov.u32 %r31062, %r31056; + mov.u32 %r31063, %r31056; + mov.u32 %r31064, %r31056; + mov.u32 %r31065, %r31056; + mov.u32 %r31066, %r31056; + mov.u32 %r31067, %r31056; + mov.u32 %r31068, %r31056; + mov.u32 %r31069, %r31056; + mov.u32 %r31072, %r31056; + mov.u32 %r31073, %r31056; + mov.u32 %r31074, %r31056; + mov.u32 %r31075, %r31056; + mov.u32 %r31076, %r31056; + mov.u32 %r31077, %r31056; + mov.u32 %r31078, %r31056; + mov.u32 %r31079, %r31056; + mov.u32 %r31080, %r31056; + mov.u32 %r31081, %r31056; + mov.u32 %r31082, %r31056; + mov.u32 %r31083, %r31056; + mov.u32 %r31084, %r31056; + mov.u32 %r31085, %r31056; + mov.u32 %r31086, %r31056; + mov.u32 %r31087, %r31056; + mov.u32 %r31088, %r31056; + mov.u32 %r31089, %r31056; + mov.u32 %r31106, %r31056; + +$L__BB2_78: + mov.u32 %r29797, 1; + mov.u64 %rd1296, keccak_round_constants; + cvta.const.u64 %rd1295, %rd1296; + // begin inline asm + // xor5 + lop3.b32 %r24109, %r31092, %r31090, %r31088, 0x96; + lop3.b32 %r24109, %r24109, %r31086, %r31084, 0x96; + lop3.b32 %r24110, %r31093, %r31091, %r31089, 0x96; + lop3.b32 %r24110, %r24110, %r31087, %r31085, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24121, %r31104, %r31102, %r31082, 0x96; + lop3.b32 %r24121, %r24121, %r31080, %r31078, 0x96; + lop3.b32 %r24122, %r31105, %r31103, %r31083, 0x96; + lop3.b32 %r24122, %r24122, %r31081, %r31079, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24133, %r31100, %r31098, %r31076, 0x96; + lop3.b32 %r24133, %r24133, %r31074, %r31072, 0x96; + lop3.b32 %r24134, %r31101, %r31099, %r31077, 0x96; + lop3.b32 %r24134, %r24134, %r31075, %r31073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24145, %r31096, %r31070, %r31068, 0x96; + lop3.b32 %r24145, %r24145, %r31066, %r31064, 0x96; + lop3.b32 %r24146, %r31097, %r31071, %r31069, 0x96; + lop3.b32 %r24146, %r24146, %r31067, %r31065, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24157, %r31094, %r31062, %r31060, 0x96; + lop3.b32 %r24157, %r24157, %r31058, %r31056, 0x96; + lop3.b32 %r24158, %r31095, %r31063, %r31061, 0x96; + lop3.b32 %r24158, %r24158, %r31059, %r31057, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24169, %r24122, %r24121, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24173, %r24121, %r24122, %r29797; + // end inline asm + xor.b32 %r24603, %r24169, %r24157; + xor.b32 %r24604, %r24173, %r24158; + xor.b32 %r24436, %r31092, %r24603; + xor.b32 %r24439, %r31093, %r24604; + xor.b32 %r24343, %r31090, %r24603; + xor.b32 %r24342, %r31091, %r24604; + xor.b32 %r24390, %r31088, %r24603; + xor.b32 %r24391, %r31089, %r24604; + xor.b32 %r24295, 
%r31086, %r24603; + xor.b32 %r24294, %r31087, %r24604; + xor.b32 %r24246, %r31084, %r24603; + xor.b32 %r24247, %r31085, %r24604; + // begin inline asm + shf.l.wrap.b32 %r24177, %r24134, %r24133, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24181, %r24133, %r24134, %r29797; + // end inline asm + xor.b32 %r24605, %r24177, %r24109; + xor.b32 %r24606, %r24181, %r24110; + xor.b32 %r24398, %r31104, %r24605; + xor.b32 %r24399, %r31105, %r24606; + xor.b32 %r24215, %r31102, %r24605; + xor.b32 %r24214, %r31103, %r24606; + xor.b32 %r24374, %r31082, %r24605; + xor.b32 %r24375, %r31083, %r24606; + xor.b32 %r24335, %r31080, %r24605; + xor.b32 %r24334, %r31081, %r24606; + xor.b32 %r24318, %r31078, %r24605; + xor.b32 %r24319, %r31079, %r24606; + // begin inline asm + shf.l.wrap.b32 %r24185, %r24146, %r24145, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24189, %r24145, %r24146, %r29797; + // end inline asm + xor.b32 %r24607, %r24185, %r24121; + xor.b32 %r24608, %r24189, %r24122; + xor.b32 %r24255, %r31100, %r24607; + xor.b32 %r24254, %r31101, %r24608; + xor.b32 %r24382, %r31098, %r24607; + xor.b32 %r24383, %r31099, %r24608; + xor.b32 %r24263, %r31076, %r24607; + xor.b32 %r24262, %r31077, %r24608; + xor.b32 %r24366, %r31074, %r24607; + xor.b32 %r24367, %r31075, %r24608; + xor.b32 %r24231, %r31072, %r24607; + xor.b32 %r24230, %r31073, %r24608; + // begin inline asm + shf.l.wrap.b32 %r24193, %r24158, %r24157, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24197, %r24157, %r24158, %r29797; + // end inline asm + xor.b32 %r24609, %r24193, %r24133; + xor.b32 %r24610, %r24197, %r24134; + xor.b32 %r24350, %r31096, %r24609; + xor.b32 %r24351, %r31097, %r24610; + xor.b32 %r24327, %r31070, %r24609; + xor.b32 %r24326, %r31071, %r24610; + xor.b32 %r24270, %r31068, %r24609; + xor.b32 %r24271, %r31069, %r24610; + xor.b32 %r24358, %r31066, %r24609; + xor.b32 %r24359, %r31067, %r24610; + xor.b32 %r24287, %r31064, %r24609; + xor.b32 %r24286, %r31065, %r24610; + // begin inline asm + shf.l.wrap.b32 %r24201, %r24110, %r24109, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24205, %r24109, %r24110, %r29797; + // end inline asm + xor.b32 %r24611, %r24201, %r24145; + xor.b32 %r24612, %r24205, %r24146; + xor.b32 %r24302, %r31094, %r24611; + xor.b32 %r24303, %r31095, %r24612; + xor.b32 %r24222, %r31062, %r24611; + xor.b32 %r24223, %r31063, %r24612; + xor.b32 %r24239, %r31060, %r24611; + xor.b32 %r24238, %r31061, %r24612; + xor.b32 %r24278, %r31058, %r24611; + xor.b32 %r24279, %r31059, %r24612; + xor.b32 %r24310, %r31056, %r24611; + xor.b32 %r24311, %r31057, %r24612; + mov.u32 %r24216, 44; + // begin inline asm + shf.l.wrap.b32 %r24209, %r24215, %r24214, %r24216; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24213, %r24214, %r24215, %r24216; + // end inline asm + mov.u32 %r24224, 20; + // begin inline asm + shf.l.wrap.b32 %r24217, %r24223, %r24222, %r24224; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24221, %r24222, %r24223, %r24224; + // end inline asm + mov.u32 %r24232, 61; + // begin inline asm + shf.l.wrap.b32 %r24225, %r24231, %r24230, %r24232; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24229, %r24230, %r24231, %r24232; + // end inline asm + mov.u32 %r24240, 39; + // begin inline asm + shf.l.wrap.b32 %r24233, %r24239, %r24238, %r24240; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24237, %r24238, %r24239, %r24240; + // end inline asm + mov.u32 %r24248, 18; + // begin 
inline asm + shf.l.wrap.b32 %r24241, %r24247, %r24246, %r24248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24245, %r24246, %r24247, %r24248; + // end inline asm + mov.u32 %r24256, 62; + // begin inline asm + shf.l.wrap.b32 %r24249, %r24255, %r24254, %r24256; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24253, %r24254, %r24255, %r24256; + // end inline asm + mov.u32 %r24264, 43; + // begin inline asm + shf.l.wrap.b32 %r24257, %r24263, %r24262, %r24264; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24261, %r24262, %r24263, %r24264; + // end inline asm + mov.u32 %r24272, 25; + // begin inline asm + shf.l.wrap.b32 %r24265, %r24271, %r24270, %r24272; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24269, %r24270, %r24271, %r24272; + // end inline asm + mov.u32 %r24280, 8; + // begin inline asm + shf.l.wrap.b32 %r24273, %r24279, %r24278, %r24280; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24277, %r24278, %r24279, %r24280; + // end inline asm + mov.u32 %r24288, 56; + // begin inline asm + shf.l.wrap.b32 %r24281, %r24287, %r24286, %r24288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24285, %r24286, %r24287, %r24288; + // end inline asm + mov.u32 %r24296, 41; + // begin inline asm + shf.l.wrap.b32 %r24289, %r24295, %r24294, %r24296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24293, %r24294, %r24295, %r24296; + // end inline asm + mov.u32 %r24304, 27; + // begin inline asm + shf.l.wrap.b32 %r24297, %r24303, %r24302, %r24304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24301, %r24302, %r24303, %r24304; + // end inline asm + mov.u32 %r24312, 14; + // begin inline asm + shf.l.wrap.b32 %r24305, %r24311, %r24310, %r24312; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24309, %r24310, %r24311, %r24312; + // end inline asm + mov.u32 %r24320, 2; + // begin inline asm + shf.l.wrap.b32 %r24313, %r24319, %r24318, %r24320; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24317, %r24318, %r24319, %r24320; + // end inline asm + mov.u32 %r24328, 55; + // begin inline asm + shf.l.wrap.b32 %r24321, %r24327, %r24326, %r24328; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24325, %r24326, %r24327, %r24328; + // end inline asm + mov.u32 %r24336, 45; + // begin inline asm + shf.l.wrap.b32 %r24329, %r24335, %r24334, %r24336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24333, %r24334, %r24335, %r24336; + // end inline asm + mov.u32 %r24344, 36; + // begin inline asm + shf.l.wrap.b32 %r24337, %r24343, %r24342, %r24344; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24341, %r24342, %r24343, %r24344; + // end inline asm + mov.u32 %r24352, 28; + // begin inline asm + shf.l.wrap.b32 %r24345, %r24351, %r24350, %r24352; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24349, %r24350, %r24351, %r24352; + // end inline asm + mov.u32 %r24360, 21; + // begin inline asm + shf.l.wrap.b32 %r24353, %r24359, %r24358, %r24360; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24357, %r24358, %r24359, %r24360; + // end inline asm + mov.u32 %r24368, 15; + // begin inline asm + shf.l.wrap.b32 %r24361, %r24367, %r24366, %r24368; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24365, %r24366, %r24367, %r24368; + // end inline asm + mov.u32 %r24376, 10; + // begin inline asm + shf.l.wrap.b32 %r24369, %r24375, %r24374, %r24376; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24373, %r24374, 
%r24375, %r24376; + // end inline asm + mov.u32 %r24384, 6; + // begin inline asm + shf.l.wrap.b32 %r24377, %r24383, %r24382, %r24384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24381, %r24382, %r24383, %r24384; + // end inline asm + mov.u32 %r24392, 3; + // begin inline asm + shf.l.wrap.b32 %r24385, %r24391, %r24390, %r24392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24389, %r24390, %r24391, %r24392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24393, %r24399, %r24398, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24397, %r24398, %r24399, %r29797; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24401, %r24436, %r24209, %r24257, 0xD2; + lop3.b32 %r24402, %r24439, %r24213, %r24261, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31104, %r24209, %r24257, %r24353, 0xD2; + lop3.b32 %r31105, %r24213, %r24261, %r24357, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31100, %r24257, %r24353, %r24305, 0xD2; + lop3.b32 %r31101, %r24261, %r24357, %r24309, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31096, %r24353, %r24305, %r24436, 0xD2; + lop3.b32 %r31097, %r24357, %r24309, %r24439, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31094, %r24305, %r24436, %r24209, 0xD2; + lop3.b32 %r31095, %r24309, %r24439, %r24213, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31090, %r24345, %r24217, %r24385, 0xD2; + lop3.b32 %r31091, %r24349, %r24221, %r24389, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31102, %r24217, %r24385, %r24329, 0xD2; + lop3.b32 %r31103, %r24221, %r24389, %r24333, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31098, %r24385, %r24329, %r24225, 0xD2; + lop3.b32 %r31099, %r24389, %r24333, %r24229, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31070, %r24329, %r24225, %r24345, 0xD2; + lop3.b32 %r31071, %r24333, %r24229, %r24349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r31070, %r31071}; + // begin inline asm + // chi + lop3.b32 %r31062, %r24225, %r24345, %r24217, 0xD2; + lop3.b32 %r31063, %r24229, %r24349, %r24221, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r31062, %r31063}; + // begin inline asm + // chi + lop3.b32 %r31088, %r24393, %r24377, %r24265, 0xD2; + lop3.b32 %r31089, %r24397, %r24381, %r24269, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+104], {%r31088, %r31089}; + // begin inline asm + // chi + lop3.b32 %r31082, %r24377, %r24265, %r24273, 0xD2; + lop3.b32 %r31083, %r24381, %r24269, %r24277, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+112], {%r31082, %r31083}; + // begin inline asm + // chi + lop3.b32 %r31076, %r24265, %r24273, %r24241, 0xD2; + lop3.b32 %r31077, %r24269, %r24277, %r24245, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+120], {%r31076, %r31077}; + // begin inline asm + // chi + lop3.b32 %r31068, %r24273, %r24241, %r24393, 0xD2; + lop3.b32 %r31069, %r24277, %r24245, %r24397, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+128], {%r31068, %r31069}; + // begin inline asm + // chi + lop3.b32 %r31060, %r24241, %r24393, %r24377, 0xD2; + lop3.b32 %r31061, %r24245, %r24397, %r24381, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+136], {%r31060, %r31061}; + // begin inline asm + // chi + lop3.b32 %r31086, %r24297, %r24337, %r24369, 0xD2; + lop3.b32 %r31087, %r24301, %r24341, %r24373, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd270+144], {%r31086, %r31087}; + // begin inline asm + // chi + lop3.b32 %r31080, %r24337, %r24369, %r24361, 0xD2; + lop3.b32 %r31081, %r24341, %r24373, %r24365, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+152], {%r31080, %r31081}; + // begin inline asm + // chi + lop3.b32 %r31074, %r24369, %r24361, %r24281, 0xD2; + lop3.b32 %r31075, %r24373, %r24365, %r24285, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+160], {%r31074, %r31075}; + // begin inline asm + // chi + lop3.b32 %r31066, %r24361, %r24281, %r24297, 0xD2; + lop3.b32 %r31067, %r24365, %r24285, %r24301, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+168], {%r31066, %r31067}; + // begin inline asm + // chi + lop3.b32 %r31058, %r24281, %r24297, %r24337, 0xD2; + lop3.b32 %r31059, %r24285, %r24301, %r24341, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+176], {%r31058, %r31059}; + // begin inline asm + // chi + lop3.b32 %r31084, %r24249, %r24321, %r24233, 0xD2; + lop3.b32 %r31085, %r24253, %r24325, %r24237, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+184], {%r31084, %r31085}; + // begin inline asm + // chi + lop3.b32 %r31078, %r24321, %r24233, %r24289, 0xD2; + lop3.b32 %r31079, %r24325, %r24237, %r24293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+192], {%r31078, %r31079}; + // begin inline asm + // chi + lop3.b32 %r31072, %r24233, %r24289, %r24313, 0xD2; + lop3.b32 %r31073, %r24237, %r24293, %r24317, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+200], {%r31072, %r31073}; + // begin inline asm + // chi + lop3.b32 %r31064, %r24289, %r24313, %r24249, 0xD2; + lop3.b32 %r31065, %r24293, %r24317, %r24253, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+208], {%r31064, %r31065}; + // begin inline asm + // chi + lop3.b32 %r31056, %r24313, %r24249, %r24321, 0xD2; + lop3.b32 %r31057, %r24317, %r24253, %r24325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+216], {%r31056, %r31057}; + mul.wide.s32 %rd1033, %r31106, 8; + add.s64 %rd1032, %rd1295, %rd1033; + // begin inline asm + ld.global.nc.v2.u32 {%r24601,%r24602}, [%rd1032]; + // end inline asm + xor.b32 %r31092, %r24401, %r24601; + xor.b32 %r31093, %r24402, %r24602; + add.s32 %r31106, %r31106, 1; + setp.lt.u32 %p44, %r31106, 23; + @%p44 bra $L__BB2_78; + + mov.u64 %rd1284, keccak_round_constants; + cvta.const.u64 %rd1283, %rd1284; + add.s64 %rd1282, %rd1283, 184; + mov.u32 %r29795, 3; + mov.u32 %r29794, 21; + mov.u32 %r29793, 28; + mov.u32 %r29792, 45; + mov.u32 %r29791, 14; + mov.u32 %r29790, 43; + mov.u32 %r29789, 61; + mov.u32 %r29788, 20; + mov.u32 %r29787, 44; + mov.u32 %r31139, 0; + mov.u32 %r24712, 1; + st.local.v2.u32 [%rd270+32], {%r31104, %r31105}; + st.local.v2.u32 [%rd270+72], {%r31102, %r31103}; + st.local.v2.u32 [%rd270+40], {%r31100, %r31101}; + st.local.v2.u32 [%rd270+80], {%r31098, %r31099}; + st.local.v2.u32 [%rd270+48], {%r31096, %r31097}; + st.local.v2.u32 [%rd270+56], {%r31094, %r31095}; + st.local.v2.u32 [%rd270+24], {%r31092, %r31093}; + // begin inline asm + // xor5 + lop3.b32 %r24613, %r31092, %r31090, %r31088, 0x96; + lop3.b32 %r24613, %r24613, %r31086, %r31084, 0x96; + lop3.b32 %r24614, %r31093, %r31091, %r31089, 0x96; + lop3.b32 %r24614, %r24614, %r31087, %r31085, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24625, %r31104, %r31102, %r31082, 0x96; + lop3.b32 %r24625, %r24625, %r31080, %r31078, 0x96; + lop3.b32 %r24626, %r31105, %r31103, %r31083, 0x96; + lop3.b32 %r24626, %r24626, %r31081, %r31079, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24637, %r31100, 
%r31098, %r31076, 0x96; + lop3.b32 %r24637, %r24637, %r31074, %r31072, 0x96; + lop3.b32 %r24638, %r31101, %r31099, %r31077, 0x96; + lop3.b32 %r24638, %r24638, %r31075, %r31073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24649, %r31096, %r31070, %r31068, 0x96; + lop3.b32 %r24649, %r24649, %r31066, %r31064, 0x96; + lop3.b32 %r24650, %r31097, %r31071, %r31069, 0x96; + lop3.b32 %r24650, %r24650, %r31067, %r31065, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24661, %r31094, %r31062, %r31060, 0x96; + lop3.b32 %r24661, %r24661, %r31058, %r31056, 0x96; + lop3.b32 %r24662, %r31095, %r31063, %r31061, 0x96; + lop3.b32 %r24662, %r24662, %r31059, %r31057, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24673, %r24626, %r24625, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24677, %r24625, %r24626, %r24712; + // end inline asm + xor.b32 %r24852, %r24673, %r24661; + xor.b32 %r24853, %r24677, %r24662; + xor.b32 %r24820, %r31092, %r24852; + xor.b32 %r24823, %r31093, %r24853; + xor.b32 %r24783, %r31089, %r24853; + xor.b32 %r24782, %r31088, %r24852; + st.local.v2.u32 [%rd270+104], {%r24782, %r24783}; + // begin inline asm + shf.l.wrap.b32 %r24681, %r24638, %r24637, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24685, %r24637, %r24638, %r24712; + // end inline asm + xor.b32 %r24854, %r24681, %r24613; + xor.b32 %r24855, %r24685, %r24614; + xor.b32 %r24719, %r31102, %r24854; + xor.b32 %r24718, %r31103, %r24855; + xor.b32 %r24758, %r31081, %r24855; + xor.b32 %r24759, %r31080, %r24854; + st.local.v2.u32 [%rd270+152], {%r24759, %r24758}; + // begin inline asm + shf.l.wrap.b32 %r24689, %r24650, %r24649, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24693, %r24649, %r24650, %r24712; + // end inline asm + xor.b32 %r24856, %r24689, %r24625; + xor.b32 %r24857, %r24693, %r24626; + xor.b32 %r24742, %r31077, %r24857; + xor.b32 %r24743, %r31076, %r24856; + st.local.v2.u32 [%rd270+120], {%r24743, %r24742}; + xor.b32 %r24734, %r31073, %r24857; + xor.b32 %r24735, %r31072, %r24856; + st.local.v2.u32 [%rd270+200], {%r24735, %r24734}; + // begin inline asm + shf.l.wrap.b32 %r24697, %r24662, %r24661, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24701, %r24661, %r24662, %r24712; + // end inline asm + xor.b32 %r24858, %r24697, %r24637; + xor.b32 %r24859, %r24701, %r24638; + xor.b32 %r24766, %r31096, %r24858; + xor.b32 %r24767, %r31097, %r24859; + xor.b32 %r24775, %r31067, %r24859; + xor.b32 %r24774, %r31066, %r24858; + st.local.v2.u32 [%rd270+168], {%r24774, %r24775}; + // begin inline asm + shf.l.wrap.b32 %r24705, %r24614, %r24613, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24709, %r24613, %r24614, %r24712; + // end inline asm + xor.b32 %r24860, %r24705, %r24649; + xor.b32 %r24861, %r24709, %r24650; + xor.b32 %r24726, %r31062, %r24860; + xor.b32 %r24727, %r31063, %r24861; + xor.b32 %r24751, %r31057, %r24861; + xor.b32 %r24750, %r31056, %r24860; + st.local.v2.u32 [%rd270+216], {%r24750, %r24751}; + // begin inline asm + shf.l.wrap.b32 %r24713, %r24719, %r24718, %r29787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24717, %r24718, %r24719, %r29787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24721, %r24727, %r24726, %r29788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24725, %r24726, %r24727, %r29788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24733, %r24734, %r24735, 
%r29789; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24729, %r24735, %r24734, %r29789; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r24729, %r24733}; + // begin inline asm + shf.l.wrap.b32 %r24737, %r24743, %r24742, %r29790; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24741, %r24742, %r24743, %r29790; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24745, %r24751, %r24750, %r29791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24749, %r24750, %r24751, %r29791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24757, %r24758, %r24759, %r29792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24753, %r24759, %r24758, %r29792; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r24753, %r24757}; + // begin inline asm + shf.l.wrap.b32 %r24761, %r24767, %r24766, %r29793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24765, %r24766, %r24767, %r29793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24769, %r24775, %r24774, %r29794; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24773, %r24774, %r24775, %r29794; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24777, %r24783, %r24782, %r29795; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24781, %r24782, %r24783, %r29795; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24785, %r24820, %r24713, %r24737, 0xD2; + lop3.b32 %r24786, %r24823, %r24717, %r24741, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31239, %r24713, %r24737, %r24769, 0xD2; + lop3.b32 %r31240, %r24717, %r24741, %r24773, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+32], {%r31239, %r31240}; + // begin inline asm + // chi + lop3.b32 %r31235, %r24737, %r24769, %r24745, 0xD2; + lop3.b32 %r31236, %r24741, %r24773, %r24749, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+40], {%r31235, %r31236}; + // begin inline asm + // chi + lop3.b32 %r31231, %r24769, %r24745, %r24820, 0xD2; + lop3.b32 %r31232, %r24773, %r24749, %r24823, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+48], {%r31231, %r31232}; + // begin inline asm + // chi + lop3.b32 %r31229, %r24745, %r24820, %r24713, 0xD2; + lop3.b32 %r31230, %r24749, %r24823, %r24717, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+56], {%r31229, %r31230}; + // begin inline asm + // chi + lop3.b32 %r31225, %r24761, %r24721, %r24777, 0xD2; + lop3.b32 %r31226, %r24765, %r24725, %r24781, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+64], {%r31225, %r31226}; + // begin inline asm + // chi + lop3.b32 %r31237, %r24721, %r24777, %r24753, 0xD2; + lop3.b32 %r31238, %r24725, %r24781, %r24757, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+72], {%r31237, %r31238}; + // begin inline asm + // chi + lop3.b32 %r31233, %r24777, %r24753, %r24729, 0xD2; + lop3.b32 %r31234, %r24781, %r24757, %r24733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+80], {%r31233, %r31234}; + // begin inline asm + ld.global.nc.v2.u32 {%r24849,%r24850}, [%rd1282]; + // end inline asm + xor.b32 %r31227, %r24785, %r24849; + xor.b32 %r31228, %r24786, %r24850; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + add.s64 %rd273, %rd270, 24; + add.s64 %rd274, %rd2, 24; + +$L__BB2_80: + or.b32 %r29796, %r3343, 1; + cvta.to.global.u64 %rd1267, %rd361; + shl.b32 %r24862, %r31139, 2; + cvt.u64.u32 %rd1041, %r24862; + and.b64 %rd1042, %rd1041, 60; + add.s64 %rd1043, %rd274, %rd1042; + xor.b32 %r24863, %r3343, %r31139; + mul.lo.s32 %r24864, %r24863, 16777619; + 
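+ // annotation (editorial comment, not compiler output): FNV-style dataset mixing --
+ // the product (%r3343 ^ loop_counter) * 16777619 is XORed with one 32-bit word of
+ // the local state, and the mul.wide.u32 by -954391867 / shr / mul.lo 1179641
+ // sequence below is magic-number division reducing that hash mod 1179641 to select
+ // a 64-byte dataset entry at %rd1267; each of the entry's 16 words is then folded
+ // into the state with the same (state * 16777619) ^ word mix. The loop at
+ // $L__BB2_80 runs 512 iterations.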
ld.local.u32 %r24865, [%rd1043]; + xor.b32 %r24866, %r24864, %r24865; + mul.wide.u32 %rd1044, %r24866, -954391867; + shr.u64 %rd1045, %rd1044, 32; + cvt.u32.u64 %r24867, %rd1045; + sub.s32 %r24868, %r24866, %r24867; + shr.u32 %r24869, %r24868, 1; + add.s32 %r24870, %r24869, %r24867; + shr.u32 %r24871, %r24870, 20; + mul.lo.s32 %r24872, %r24871, 1179641; + sub.s32 %r24873, %r24866, %r24872; + mul.wide.u32 %rd1046, %r24873, 64; + add.s64 %rd1047, %rd1267, %rd1046; + mul.lo.s32 %r24874, %r31176, 16777619; + ld.global.u32 %r24875, [%rd1047]; + xor.b32 %r31176, %r24874, %r24875; + mul.lo.s32 %r24876, %r31177, 16777619; + ld.global.u32 %r24877, [%rd1047+4]; + xor.b32 %r31177, %r24876, %r24877; + mul.lo.s32 %r24878, %r31188, 16777619; + ld.global.u32 %r24879, [%rd1047+8]; + mul.lo.s32 %r24880, %r31189, 16777619; + ld.global.u32 %r24881, [%rd1047+12]; + xor.b32 %r24882, %r24880, %r24881; + xor.b32 %r31188, %r24878, %r24879; + mov.b64 %rd1048, {%r31188, %r24882}; + mul.lo.s32 %r24883, %r31184, 16777619; + ld.global.u32 %r24884, [%rd1047+16]; + mul.lo.s32 %r24885, %r31185, 16777619; + ld.global.u32 %r24886, [%rd1047+20]; + xor.b32 %r24887, %r24885, %r24886; + xor.b32 %r31184, %r24883, %r24884; + mov.b64 %rd1049, {%r31184, %r24887}; + mul.lo.s32 %r24888, %r31180, 16777619; + ld.global.u32 %r24889, [%rd1047+24]; + mul.lo.s32 %r24890, %r31181, 16777619; + ld.global.u32 %r24891, [%rd1047+28]; + xor.b32 %r24892, %r24890, %r24891; + xor.b32 %r31180, %r24888, %r24889; + mov.b64 %rd1050, {%r31180, %r24892}; + mul.lo.s32 %r24893, %r31178, 16777619; + ld.global.u32 %r24894, [%rd1047+32]; + mul.lo.s32 %r24895, %r31179, 16777619; + ld.global.u32 %r24896, [%rd1047+36]; + xor.b32 %r24897, %r24895, %r24896; + xor.b32 %r31178, %r24893, %r24894; + mov.b64 %rd1051, {%r31178, %r24897}; + mul.lo.s32 %r24898, %r31174, 16777619; + ld.global.u32 %r24899, [%rd1047+40]; + xor.b32 %r31174, %r24898, %r24899; + mul.lo.s32 %r24900, %r31175, 16777619; + ld.global.u32 %r24901, [%rd1047+44]; + xor.b32 %r31175, %r24900, %r24901; + mul.lo.s32 %r24902, %r31186, 16777619; + ld.global.u32 %r24903, [%rd1047+48]; + mul.lo.s32 %r24904, %r31187, 16777619; + ld.global.u32 %r24905, [%rd1047+52]; + xor.b32 %r24906, %r24904, %r24905; + xor.b32 %r31186, %r24902, %r24903; + mov.b64 %rd1052, {%r31186, %r24906}; + mul.lo.s32 %r24907, %r31182, 16777619; + ld.global.u32 %r24908, [%rd1047+56]; + mul.lo.s32 %r24909, %r31183, 16777619; + ld.global.u32 %r24910, [%rd1047+60]; + xor.b32 %r24911, %r24909, %r24910; + xor.b32 %r31182, %r24907, %r24908; + mov.b64 %rd1053, {%r31182, %r24911}; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + st.local.v2.u32 [%rd2+32], {%r31188, %r24882}; + st.local.v2.u32 [%rd2+40], {%r31184, %r24887}; + st.local.v2.u32 [%rd2+48], {%r31180, %r24892}; + st.local.v2.u32 [%rd2+56], {%r31178, %r24897}; + st.local.v2.u32 [%rd2+64], {%r31174, %r31175}; + st.local.v2.u32 [%rd2+72], {%r31186, %r24906}; + st.local.v2.u32 [%rd2+80], {%r31182, %r24911}; + add.s64 %rd1054, %rd273, %rd1042; + xor.b32 %r24912, %r29796, %r31139; + mul.lo.s32 %r24913, %r24912, 16777619; + ld.local.u32 %r24914, [%rd1054]; + xor.b32 %r24915, %r24913, %r24914; + mul.wide.u32 %rd1055, %r24915, -954391867; + shr.u64 %rd1056, %rd1055, 32; + cvt.u32.u64 %r24916, %rd1056; + sub.s32 %r24917, %r24915, %r24916; + shr.u32 %r24918, %r24917, 1; + add.s32 %r24919, %r24918, %r24916; + shr.u32 %r24920, %r24919, 20; + mul.lo.s32 %r24921, %r24920, 1179641; + sub.s32 %r24922, %r24915, %r24921; + mul.wide.u32 %rd1057, %r24922, 64; + add.s64 %rd1058, %rd1267, %rd1057; + 
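+ // annotation (editorial comment, not compiler output): the identical mix/reduce is
+ // repeated here for the sibling hash (index %r29796 = %r3343 | 1), folding its
+ // dataset entry into the second local state at %rd270.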
mul.lo.s32 %r24923, %r31227, 16777619; + ld.global.u32 %r24924, [%rd1058]; + xor.b32 %r31227, %r24923, %r24924; + mul.lo.s32 %r24925, %r31228, 16777619; + ld.global.u32 %r24926, [%rd1058+4]; + xor.b32 %r31228, %r24925, %r24926; + mul.lo.s32 %r24927, %r31239, 16777619; + ld.global.u32 %r24928, [%rd1058+8]; + mul.lo.s32 %r24929, %r31240, 16777619; + ld.global.u32 %r24930, [%rd1058+12]; + xor.b32 %r24931, %r24929, %r24930; + xor.b32 %r31239, %r24927, %r24928; + mov.b64 %rd1059, {%r31239, %r24931}; + mul.lo.s32 %r24932, %r31235, 16777619; + ld.global.u32 %r24933, [%rd1058+16]; + mul.lo.s32 %r24934, %r31236, 16777619; + ld.global.u32 %r24935, [%rd1058+20]; + xor.b32 %r24936, %r24934, %r24935; + xor.b32 %r31235, %r24932, %r24933; + mov.b64 %rd1060, {%r31235, %r24936}; + mul.lo.s32 %r24937, %r31231, 16777619; + ld.global.u32 %r24938, [%rd1058+24]; + mul.lo.s32 %r24939, %r31232, 16777619; + ld.global.u32 %r24940, [%rd1058+28]; + xor.b32 %r24941, %r24939, %r24940; + xor.b32 %r31231, %r24937, %r24938; + mov.b64 %rd1061, {%r31231, %r24941}; + mul.lo.s32 %r24942, %r31229, 16777619; + ld.global.u32 %r24943, [%rd1058+32]; + mul.lo.s32 %r24944, %r31230, 16777619; + ld.global.u32 %r24945, [%rd1058+36]; + xor.b32 %r24946, %r24944, %r24945; + xor.b32 %r31229, %r24942, %r24943; + mov.b64 %rd1062, {%r31229, %r24946}; + mul.lo.s32 %r24947, %r31225, 16777619; + ld.global.u32 %r24948, [%rd1058+40]; + xor.b32 %r31225, %r24947, %r24948; + mul.lo.s32 %r24949, %r31226, 16777619; + ld.global.u32 %r24950, [%rd1058+44]; + xor.b32 %r31226, %r24949, %r24950; + mul.lo.s32 %r24951, %r31237, 16777619; + ld.global.u32 %r24952, [%rd1058+48]; + mul.lo.s32 %r24953, %r31238, 16777619; + ld.global.u32 %r24954, [%rd1058+52]; + xor.b32 %r24955, %r24953, %r24954; + xor.b32 %r31237, %r24951, %r24952; + mov.b64 %rd1063, {%r31237, %r24955}; + mul.lo.s32 %r24956, %r31233, 16777619; + ld.global.u32 %r24957, [%rd1058+56]; + mul.lo.s32 %r24958, %r31234, 16777619; + ld.global.u32 %r24959, [%rd1058+60]; + xor.b32 %r24960, %r24958, %r24959; + xor.b32 %r31233, %r24956, %r24957; + mov.b64 %rd1064, {%r31233, %r24960}; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + st.local.v2.u32 [%rd270+32], {%r31239, %r24931}; + st.local.v2.u32 [%rd270+40], {%r31235, %r24936}; + st.local.v2.u32 [%rd270+48], {%r31231, %r24941}; + st.local.v2.u32 [%rd270+56], {%r31229, %r24946}; + st.local.v2.u32 [%rd270+64], {%r31225, %r31226}; + st.local.v2.u32 [%rd270+72], {%r31237, %r24955}; + st.local.v2.u32 [%rd270+80], {%r31233, %r24960}; + add.s32 %r31139, %r31139, 1; + setp.lt.u32 %p45, %r31139, 512; + shr.u64 %rd1065, %rd1048, 32; + cvt.u32.u64 %r31189, %rd1065; + shr.u64 %rd1066, %rd1049, 32; + cvt.u32.u64 %r31185, %rd1066; + shr.u64 %rd1067, %rd1050, 32; + cvt.u32.u64 %r31181, %rd1067; + shr.u64 %rd1068, %rd1051, 32; + cvt.u32.u64 %r31179, %rd1068; + shr.u64 %rd1069, %rd1052, 32; + cvt.u32.u64 %r31187, %rd1069; + shr.u64 %rd1070, %rd1053, 32; + cvt.u32.u64 %r31183, %rd1070; + shr.u64 %rd1071, %rd1059, 32; + cvt.u32.u64 %r31240, %rd1071; + shr.u64 %rd1072, %rd1060, 32; + cvt.u32.u64 %r31236, %rd1072; + shr.u64 %rd1073, %rd1061, 32; + cvt.u32.u64 %r31232, %rd1073; + shr.u64 %rd1074, %rd1062, 32; + cvt.u32.u64 %r31230, %rd1074; + shr.u64 %rd1075, %rd1063, 32; + cvt.u32.u64 %r31238, %rd1075; + shr.u64 %rd1076, %rd1064, 32; + cvt.u32.u64 %r31234, %rd1076; + @%p45 bra $L__BB2_80; + + mov.u32 %r31140, 0; + st.local.v2.u32 [%rd2+96], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+104], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+112], {%r31140, %r31140}; + st.local.v2.u32 
[%rd2+120], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+128], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+136], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+144], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+152], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+160], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+168], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+176], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+184], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+192], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+200], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+208], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+216], {%r31140, %r31140}; + mov.u32 %r31155, -2147483648; + mov.u32 %r31154, 1; + st.local.v2.u32 [%rd2+88], {%r31154, %r31155}; + mov.u32 %r31141, %r31140; + mov.u32 %r31142, %r31140; + mov.u32 %r31143, %r31140; + mov.u32 %r31144, %r31140; + mov.u32 %r31145, %r31140; + mov.u32 %r31146, %r31140; + mov.u32 %r31147, %r31140; + mov.u32 %r31148, %r31140; + mov.u32 %r31149, %r31140; + mov.u32 %r31150, %r31140; + mov.u32 %r31151, %r31140; + mov.u32 %r31152, %r31140; + mov.u32 %r31153, %r31140; + mov.u32 %r31156, %r31140; + mov.u32 %r31157, %r31140; + mov.u32 %r31158, %r31140; + mov.u32 %r31159, %r31140; + mov.u32 %r31160, %r31140; + mov.u32 %r31161, %r31140; + mov.u32 %r31162, %r31140; + mov.u32 %r31163, %r31140; + mov.u32 %r31164, %r31140; + mov.u32 %r31165, %r31140; + mov.u32 %r31166, %r31140; + mov.u32 %r31167, %r31140; + mov.u32 %r31168, %r31140; + mov.u32 %r31169, %r31140; + mov.u32 %r31170, %r31140; + mov.u32 %r31171, %r31140; + mov.u32 %r31172, %r31140; + mov.u32 %r31173, %r31140; + mov.u32 %r31190, %r31140; + +$L__BB2_82: + mov.u32 %r29807, 1; + mov.u64 %rd1286, keccak_round_constants; + cvta.const.u64 %rd1285, %rd1286; + // begin inline asm + // xor5 + lop3.b32 %r25002, %r31176, %r31174, %r31172, 0x96; + lop3.b32 %r25002, %r25002, %r31170, %r31168, 0x96; + lop3.b32 %r25003, %r31177, %r31175, %r31173, 0x96; + lop3.b32 %r25003, %r25003, %r31171, %r31169, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25014, %r31188, %r31186, %r31166, 0x96; + lop3.b32 %r25014, %r25014, %r31164, %r31162, 0x96; + lop3.b32 %r25015, %r31189, %r31187, %r31167, 0x96; + lop3.b32 %r25015, %r25015, %r31165, %r31163, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25026, %r31184, %r31182, %r31160, 0x96; + lop3.b32 %r25026, %r25026, %r31158, %r31156, 0x96; + lop3.b32 %r25027, %r31185, %r31183, %r31161, 0x96; + lop3.b32 %r25027, %r25027, %r31159, %r31157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25038, %r31180, %r31154, %r31152, 0x96; + lop3.b32 %r25038, %r25038, %r31150, %r31148, 0x96; + lop3.b32 %r25039, %r31181, %r31155, %r31153, 0x96; + lop3.b32 %r25039, %r25039, %r31151, %r31149, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25050, %r31178, %r31146, %r31144, 0x96; + lop3.b32 %r25050, %r25050, %r31142, %r31140, 0x96; + lop3.b32 %r25051, %r31179, %r31147, %r31145, 0x96; + lop3.b32 %r25051, %r25051, %r31143, %r31141, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25062, %r25015, %r25014, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25066, %r25014, %r25015, %r29807; + // end inline asm + xor.b32 %r25496, %r25062, %r25050; + xor.b32 %r25497, %r25066, %r25051; + xor.b32 %r25329, %r31176, %r25496; + xor.b32 %r25332, %r31177, %r25497; + xor.b32 %r25236, %r31174, %r25496; + xor.b32 %r25235, %r31175, %r25497; + xor.b32 %r25283, %r31172, %r25496; + xor.b32 %r25284, %r31173, %r25497; + 
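+ // annotation (editorial comment, not compiler output): Keccak theta step -- the
+ // five 64-bit column parities from the lop3-0x96 "xor5" blocks above are rotated
+ // left by 1 bit via paired shf.l.wrap.b32 funnel shifts, and the resulting D values
+ // are XORed into every lane of the neighboring column, one 32-bit half at a time.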
xor.b32 %r25188, %r31170, %r25496; + xor.b32 %r25187, %r31171, %r25497; + xor.b32 %r25139, %r31168, %r25496; + xor.b32 %r25140, %r31169, %r25497; + // begin inline asm + shf.l.wrap.b32 %r25070, %r25027, %r25026, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25074, %r25026, %r25027, %r29807; + // end inline asm + xor.b32 %r25498, %r25070, %r25002; + xor.b32 %r25499, %r25074, %r25003; + xor.b32 %r25291, %r31188, %r25498; + xor.b32 %r25292, %r31189, %r25499; + xor.b32 %r25108, %r31186, %r25498; + xor.b32 %r25107, %r31187, %r25499; + xor.b32 %r25267, %r31166, %r25498; + xor.b32 %r25268, %r31167, %r25499; + xor.b32 %r25228, %r31164, %r25498; + xor.b32 %r25227, %r31165, %r25499; + xor.b32 %r25211, %r31162, %r25498; + xor.b32 %r25212, %r31163, %r25499; + // begin inline asm + shf.l.wrap.b32 %r25078, %r25039, %r25038, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25082, %r25038, %r25039, %r29807; + // end inline asm + xor.b32 %r25500, %r25078, %r25014; + xor.b32 %r25501, %r25082, %r25015; + xor.b32 %r25148, %r31184, %r25500; + xor.b32 %r25147, %r31185, %r25501; + xor.b32 %r25275, %r31182, %r25500; + xor.b32 %r25276, %r31183, %r25501; + xor.b32 %r25156, %r31160, %r25500; + xor.b32 %r25155, %r31161, %r25501; + xor.b32 %r25259, %r31158, %r25500; + xor.b32 %r25260, %r31159, %r25501; + xor.b32 %r25124, %r31156, %r25500; + xor.b32 %r25123, %r31157, %r25501; + // begin inline asm + shf.l.wrap.b32 %r25086, %r25051, %r25050, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25090, %r25050, %r25051, %r29807; + // end inline asm + xor.b32 %r25502, %r25086, %r25026; + xor.b32 %r25503, %r25090, %r25027; + xor.b32 %r25243, %r31180, %r25502; + xor.b32 %r25244, %r31181, %r25503; + xor.b32 %r25220, %r31154, %r25502; + xor.b32 %r25219, %r31155, %r25503; + xor.b32 %r25163, %r31152, %r25502; + xor.b32 %r25164, %r31153, %r25503; + xor.b32 %r25251, %r31150, %r25502; + xor.b32 %r25252, %r31151, %r25503; + xor.b32 %r25180, %r31148, %r25502; + xor.b32 %r25179, %r31149, %r25503; + // begin inline asm + shf.l.wrap.b32 %r25094, %r25003, %r25002, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25098, %r25002, %r25003, %r29807; + // end inline asm + xor.b32 %r25504, %r25094, %r25038; + xor.b32 %r25505, %r25098, %r25039; + xor.b32 %r25195, %r31178, %r25504; + xor.b32 %r25196, %r31179, %r25505; + xor.b32 %r25115, %r31146, %r25504; + xor.b32 %r25116, %r31147, %r25505; + xor.b32 %r25132, %r31144, %r25504; + xor.b32 %r25131, %r31145, %r25505; + xor.b32 %r25171, %r31142, %r25504; + xor.b32 %r25172, %r31143, %r25505; + xor.b32 %r25203, %r31140, %r25504; + xor.b32 %r25204, %r31141, %r25505; + mov.u32 %r25109, 44; + // begin inline asm + shf.l.wrap.b32 %r25102, %r25108, %r25107, %r25109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25106, %r25107, %r25108, %r25109; + // end inline asm + mov.u32 %r25117, 20; + // begin inline asm + shf.l.wrap.b32 %r25110, %r25116, %r25115, %r25117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25114, %r25115, %r25116, %r25117; + // end inline asm + mov.u32 %r25125, 61; + // begin inline asm + shf.l.wrap.b32 %r25118, %r25124, %r25123, %r25125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25122, %r25123, %r25124, %r25125; + // end inline asm + mov.u32 %r25133, 39; + // begin inline asm + shf.l.wrap.b32 %r25126, %r25132, %r25131, %r25133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25130, %r25131, %r25132, %r25133; + // end inline asm + mov.u32 %r25141, 
18; + // begin inline asm + shf.l.wrap.b32 %r25134, %r25140, %r25139, %r25141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25138, %r25139, %r25140, %r25141; + // end inline asm + mov.u32 %r25149, 62; + // begin inline asm + shf.l.wrap.b32 %r25142, %r25148, %r25147, %r25149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25146, %r25147, %r25148, %r25149; + // end inline asm + mov.u32 %r25157, 43; + // begin inline asm + shf.l.wrap.b32 %r25150, %r25156, %r25155, %r25157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25154, %r25155, %r25156, %r25157; + // end inline asm + mov.u32 %r25165, 25; + // begin inline asm + shf.l.wrap.b32 %r25158, %r25164, %r25163, %r25165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25162, %r25163, %r25164, %r25165; + // end inline asm + mov.u32 %r25173, 8; + // begin inline asm + shf.l.wrap.b32 %r25166, %r25172, %r25171, %r25173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25170, %r25171, %r25172, %r25173; + // end inline asm + mov.u32 %r25181, 56; + // begin inline asm + shf.l.wrap.b32 %r25174, %r25180, %r25179, %r25181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25178, %r25179, %r25180, %r25181; + // end inline asm + mov.u32 %r25189, 41; + // begin inline asm + shf.l.wrap.b32 %r25182, %r25188, %r25187, %r25189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25186, %r25187, %r25188, %r25189; + // end inline asm + mov.u32 %r25197, 27; + // begin inline asm + shf.l.wrap.b32 %r25190, %r25196, %r25195, %r25197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25194, %r25195, %r25196, %r25197; + // end inline asm + mov.u32 %r25205, 14; + // begin inline asm + shf.l.wrap.b32 %r25198, %r25204, %r25203, %r25205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25202, %r25203, %r25204, %r25205; + // end inline asm + mov.u32 %r25213, 2; + // begin inline asm + shf.l.wrap.b32 %r25206, %r25212, %r25211, %r25213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25210, %r25211, %r25212, %r25213; + // end inline asm + mov.u32 %r25221, 55; + // begin inline asm + shf.l.wrap.b32 %r25214, %r25220, %r25219, %r25221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25218, %r25219, %r25220, %r25221; + // end inline asm + mov.u32 %r25229, 45; + // begin inline asm + shf.l.wrap.b32 %r25222, %r25228, %r25227, %r25229; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25226, %r25227, %r25228, %r25229; + // end inline asm + mov.u32 %r25237, 36; + // begin inline asm + shf.l.wrap.b32 %r25230, %r25236, %r25235, %r25237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25234, %r25235, %r25236, %r25237; + // end inline asm + mov.u32 %r25245, 28; + // begin inline asm + shf.l.wrap.b32 %r25238, %r25244, %r25243, %r25245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25242, %r25243, %r25244, %r25245; + // end inline asm + mov.u32 %r25253, 21; + // begin inline asm + shf.l.wrap.b32 %r25246, %r25252, %r25251, %r25253; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25250, %r25251, %r25252, %r25253; + // end inline asm + mov.u32 %r25261, 15; + // begin inline asm + shf.l.wrap.b32 %r25254, %r25260, %r25259, %r25261; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25258, %r25259, %r25260, %r25261; + // end inline asm + mov.u32 %r25269, 10; + // begin inline asm + shf.l.wrap.b32 %r25262, %r25268, %r25267, %r25269; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25266, 
%r25267, %r25268, %r25269; + // end inline asm + mov.u32 %r25277, 6; + // begin inline asm + shf.l.wrap.b32 %r25270, %r25276, %r25275, %r25277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25274, %r25275, %r25276, %r25277; + // end inline asm + mov.u32 %r25285, 3; + // begin inline asm + shf.l.wrap.b32 %r25278, %r25284, %r25283, %r25285; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25282, %r25283, %r25284, %r25285; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25286, %r25292, %r25291, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25290, %r25291, %r25292, %r29807; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25294, %r25329, %r25102, %r25150, 0xD2; + lop3.b32 %r25295, %r25332, %r25106, %r25154, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31188, %r25102, %r25150, %r25246, 0xD2; + lop3.b32 %r31189, %r25106, %r25154, %r25250, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31184, %r25150, %r25246, %r25198, 0xD2; + lop3.b32 %r31185, %r25154, %r25250, %r25202, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31180, %r25246, %r25198, %r25329, 0xD2; + lop3.b32 %r31181, %r25250, %r25202, %r25332, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31178, %r25198, %r25329, %r25102, 0xD2; + lop3.b32 %r31179, %r25202, %r25332, %r25106, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31174, %r25238, %r25110, %r25278, 0xD2; + lop3.b32 %r31175, %r25242, %r25114, %r25282, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31186, %r25110, %r25278, %r25222, 0xD2; + lop3.b32 %r31187, %r25114, %r25282, %r25226, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31182, %r25278, %r25222, %r25118, 0xD2; + lop3.b32 %r31183, %r25282, %r25226, %r25122, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31154, %r25222, %r25118, %r25238, 0xD2; + lop3.b32 %r31155, %r25226, %r25122, %r25242, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r31154, %r31155}; + // begin inline asm + // chi + lop3.b32 %r31146, %r25118, %r25238, %r25110, 0xD2; + lop3.b32 %r31147, %r25122, %r25242, %r25114, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r31146, %r31147}; + // begin inline asm + // chi + lop3.b32 %r31172, %r25286, %r25270, %r25158, 0xD2; + lop3.b32 %r31173, %r25290, %r25274, %r25162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r31172, %r31173}; + // begin inline asm + // chi + lop3.b32 %r31166, %r25270, %r25158, %r25166, 0xD2; + lop3.b32 %r31167, %r25274, %r25162, %r25170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r31166, %r31167}; + // begin inline asm + // chi + lop3.b32 %r31160, %r25158, %r25166, %r25134, 0xD2; + lop3.b32 %r31161, %r25162, %r25170, %r25138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r31160, %r31161}; + // begin inline asm + // chi + lop3.b32 %r31152, %r25166, %r25134, %r25286, 0xD2; + lop3.b32 %r31153, %r25170, %r25138, %r25290, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r31152, %r31153}; + // begin inline asm + // chi + lop3.b32 %r31144, %r25134, %r25286, %r25270, 0xD2; + lop3.b32 %r31145, %r25138, %r25290, %r25274, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r31144, %r31145}; + // begin inline asm + // chi + lop3.b32 %r31170, %r25190, %r25230, %r25262, 0xD2; + lop3.b32 %r31171, %r25194, %r25234, %r25266, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], 
{%r31170, %r31171}; + // begin inline asm + // chi + lop3.b32 %r31164, %r25230, %r25262, %r25254, 0xD2; + lop3.b32 %r31165, %r25234, %r25266, %r25258, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r31164, %r31165}; + // begin inline asm + // chi + lop3.b32 %r31158, %r25262, %r25254, %r25174, 0xD2; + lop3.b32 %r31159, %r25266, %r25258, %r25178, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r31158, %r31159}; + // begin inline asm + // chi + lop3.b32 %r31150, %r25254, %r25174, %r25190, 0xD2; + lop3.b32 %r31151, %r25258, %r25178, %r25194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r31150, %r31151}; + // begin inline asm + // chi + lop3.b32 %r31142, %r25174, %r25190, %r25230, 0xD2; + lop3.b32 %r31143, %r25178, %r25194, %r25234, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r31142, %r31143}; + // begin inline asm + // chi + lop3.b32 %r31168, %r25142, %r25214, %r25126, 0xD2; + lop3.b32 %r31169, %r25146, %r25218, %r25130, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r31168, %r31169}; + // begin inline asm + // chi + lop3.b32 %r31162, %r25214, %r25126, %r25182, 0xD2; + lop3.b32 %r31163, %r25218, %r25130, %r25186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r31162, %r31163}; + // begin inline asm + // chi + lop3.b32 %r31156, %r25126, %r25182, %r25206, 0xD2; + lop3.b32 %r31157, %r25130, %r25186, %r25210, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r31156, %r31157}; + // begin inline asm + // chi + lop3.b32 %r31148, %r25182, %r25206, %r25142, 0xD2; + lop3.b32 %r31149, %r25186, %r25210, %r25146, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r31148, %r31149}; + // begin inline asm + // chi + lop3.b32 %r31140, %r25206, %r25142, %r25214, 0xD2; + lop3.b32 %r31141, %r25210, %r25146, %r25218, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r31140, %r31141}; + mul.wide.s32 %rd1080, %r31190, 8; + add.s64 %rd1079, %rd1285, %rd1080; + // begin inline asm + ld.global.nc.v2.u32 {%r25494,%r25495}, [%rd1079]; + // end inline asm + xor.b32 %r31176, %r25294, %r25494; + xor.b32 %r31177, %r25295, %r25495; + add.s32 %r31190, %r31190, 1; + setp.lt.u32 %p46, %r31190, 23; + @%p46 bra $L__BB2_82; + + mov.u32 %r29806, 3; + mov.u32 %r29805, 21; + mov.u32 %r29804, 28; + mov.u32 %r29803, 45; + mov.u32 %r29802, 14; + mov.u32 %r29801, 43; + mov.u32 %r29800, 61; + mov.u32 %r29799, 20; + mov.u32 %r29798, 44; + mov.u64 %rd1289, keccak_round_constants; + cvta.const.u64 %rd1288, %rd1289; + add.s64 %rd1287, %rd1288, 184; + st.local.v2.u32 [%rd2+32], {%r31188, %r31189}; + st.local.v2.u32 [%rd2+72], {%r31186, %r31187}; + st.local.v2.u32 [%rd2+40], {%r31184, %r31185}; + st.local.v2.u32 [%rd2+80], {%r31182, %r31183}; + st.local.v2.u32 [%rd2+48], {%r31180, %r31181}; + st.local.v2.u32 [%rd2+56], {%r31178, %r31179}; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + // begin inline asm + // xor5 + lop3.b32 %r25506, %r31176, %r31174, %r31172, 0x96; + lop3.b32 %r25506, %r25506, %r31170, %r31168, 0x96; + lop3.b32 %r25507, %r31177, %r31175, %r31173, 0x96; + lop3.b32 %r25507, %r25507, %r31171, %r31169, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25518, %r31188, %r31186, %r31166, 0x96; + lop3.b32 %r25518, %r25518, %r31164, %r31162, 0x96; + lop3.b32 %r25519, %r31189, %r31187, %r31167, 0x96; + lop3.b32 %r25519, %r25519, %r31165, %r31163, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25530, %r31184, %r31182, %r31160, 0x96; + lop3.b32 %r25530, %r25530, %r31158, %r31156, 0x96; + lop3.b32 
%r25531, %r31185, %r31183, %r31161, 0x96; + lop3.b32 %r25531, %r25531, %r31159, %r31157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25542, %r31180, %r31154, %r31152, 0x96; + lop3.b32 %r25542, %r25542, %r31150, %r31148, 0x96; + lop3.b32 %r25543, %r31181, %r31155, %r31153, 0x96; + lop3.b32 %r25543, %r25543, %r31151, %r31149, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25554, %r31178, %r31146, %r31144, 0x96; + lop3.b32 %r25554, %r25554, %r31142, %r31140, 0x96; + lop3.b32 %r25555, %r31179, %r31147, %r31145, 0x96; + lop3.b32 %r25555, %r25555, %r31143, %r31141, 0x96; + // end inline asm + mov.u32 %r31205, 1; + // begin inline asm + shf.l.wrap.b32 %r25566, %r25519, %r25518, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25570, %r25518, %r25519, %r31205; + // end inline asm + xor.b32 %r25785, %r25566, %r25554; + xor.b32 %r25786, %r25570, %r25555; + xor.b32 %r25713, %r31176, %r25785; + xor.b32 %r25716, %r31177, %r25786; + xor.b32 %r25676, %r31173, %r25786; + xor.b32 %r25675, %r31172, %r25785; + st.local.v2.u32 [%rd2+104], {%r25675, %r25676}; + // begin inline asm + shf.l.wrap.b32 %r25574, %r25531, %r25530, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25578, %r25530, %r25531, %r31205; + // end inline asm + xor.b32 %r25787, %r25574, %r25506; + xor.b32 %r25788, %r25578, %r25507; + xor.b32 %r25612, %r31186, %r25787; + xor.b32 %r25611, %r31187, %r25788; + xor.b32 %r25651, %r31165, %r25788; + xor.b32 %r25652, %r31164, %r25787; + st.local.v2.u32 [%rd2+152], {%r25652, %r25651}; + // begin inline asm + shf.l.wrap.b32 %r25582, %r25543, %r25542, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25586, %r25542, %r25543, %r31205; + // end inline asm + xor.b32 %r25789, %r25582, %r25518; + xor.b32 %r25790, %r25586, %r25519; + xor.b32 %r25635, %r31161, %r25790; + xor.b32 %r25636, %r31160, %r25789; + st.local.v2.u32 [%rd2+120], {%r25636, %r25635}; + xor.b32 %r25627, %r31157, %r25790; + xor.b32 %r25628, %r31156, %r25789; + st.local.v2.u32 [%rd2+200], {%r25628, %r25627}; + // begin inline asm + shf.l.wrap.b32 %r25590, %r25555, %r25554, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25594, %r25554, %r25555, %r31205; + // end inline asm + xor.b32 %r25791, %r25590, %r25530; + xor.b32 %r25792, %r25594, %r25531; + xor.b32 %r25659, %r31180, %r25791; + xor.b32 %r25660, %r31181, %r25792; + xor.b32 %r25668, %r31151, %r25792; + xor.b32 %r25667, %r31150, %r25791; + st.local.v2.u32 [%rd2+168], {%r25667, %r25668}; + // begin inline asm + shf.l.wrap.b32 %r25598, %r25507, %r25506, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25602, %r25506, %r25507, %r31205; + // end inline asm + xor.b32 %r25793, %r25598, %r25542; + xor.b32 %r25794, %r25602, %r25543; + xor.b32 %r25619, %r31146, %r25793; + xor.b32 %r25620, %r31147, %r25794; + xor.b32 %r25644, %r31141, %r25794; + xor.b32 %r25643, %r31140, %r25793; + st.local.v2.u32 [%rd2+216], {%r25643, %r25644}; + // begin inline asm + shf.l.wrap.b32 %r25606, %r25612, %r25611, %r29798; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25610, %r25611, %r25612, %r29798; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25614, %r25620, %r25619, %r29799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25618, %r25619, %r25620, %r29799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25626, %r25627, %r25628, %r29800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25622, 
%r25628, %r25627, %r29800; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r25622, %r25626}; + // begin inline asm + shf.l.wrap.b32 %r25630, %r25636, %r25635, %r29801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25634, %r25635, %r25636, %r29801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25638, %r25644, %r25643, %r29802; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25642, %r25643, %r25644, %r29802; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25650, %r25651, %r25652, %r29803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25646, %r25652, %r25651, %r29803; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r25646, %r25650}; + // begin inline asm + shf.l.wrap.b32 %r25654, %r25660, %r25659, %r29804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25658, %r25659, %r25660, %r29804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25662, %r25668, %r25667, %r29805; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25666, %r25667, %r25668, %r29805; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25670, %r25676, %r25675, %r29806; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25674, %r25675, %r25676, %r29806; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25678, %r25713, %r25606, %r25630, 0xD2; + lop3.b32 %r25679, %r25716, %r25610, %r25634, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25686, %r25606, %r25630, %r25662, 0xD2; + lop3.b32 %r25687, %r25610, %r25634, %r25666, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r25686, %r25687}; + // begin inline asm + // chi + lop3.b32 %r25694, %r25630, %r25662, %r25638, 0xD2; + lop3.b32 %r25695, %r25634, %r25666, %r25642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r25694, %r25695}; + // begin inline asm + // chi + lop3.b32 %r25702, %r25662, %r25638, %r25713, 0xD2; + lop3.b32 %r25703, %r25666, %r25642, %r25716, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r25702, %r25703}; + // begin inline asm + // chi + lop3.b32 %r25710, %r25638, %r25713, %r25606, 0xD2; + lop3.b32 %r25711, %r25642, %r25716, %r25610, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r25710, %r25711}; + // begin inline asm + // chi + lop3.b32 %r25718, %r25654, %r25614, %r25670, 0xD2; + lop3.b32 %r25719, %r25658, %r25618, %r25674, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r25718, %r25719}; + // begin inline asm + // chi + lop3.b32 %r25726, %r25614, %r25670, %r25646, 0xD2; + lop3.b32 %r25727, %r25618, %r25674, %r25650, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r25726, %r25727}; + // begin inline asm + // chi + lop3.b32 %r25734, %r25670, %r25646, %r25622, 0xD2; + lop3.b32 %r25735, %r25674, %r25650, %r25626, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r25734, %r25735}; + // begin inline asm + ld.global.nc.v2.u32 {%r25742,%r25743}, [%rd1287]; + // end inline asm + xor.b32 %r25795, %r25679, %r25743; + xor.b32 %r25796, %r25678, %r25742; + mov.b64 %rd1349, {%r25796, %r25795}; + mov.b64 %rd1350, {%r25686, %r25687}; + mov.b64 %rd1351, {%r25694, %r25695}; + mov.b64 %rd1352, {%r25702, %r25703}; + mov.b64 %rd1353, {%r25710, %r25711}; + mov.b64 %rd1354, {%r25718, %r25719}; + mov.b64 %rd1355, {%r25726, %r25727}; + mov.b64 %rd1356, {%r25734, %r25735}; + mov.u32 %r31191, 0; + st.local.v2.u32 [%rd2+24], {%r25796, %r25795}; + st.local.v2.u32 [%rd270+96], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+104], {%r31191, %r31191}; + 
st.local.v2.u32 [%rd270+112], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+120], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+128], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+136], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+144], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+152], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+160], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+168], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+176], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+184], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+192], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+200], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+208], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+216], {%r31191, %r31191}; + mov.u32 %r31206, -2147483648; + st.local.v2.u32 [%rd270+88], {%r31205, %r31206}; + mov.u32 %r31192, %r31191; + mov.u32 %r31193, %r31191; + mov.u32 %r31194, %r31191; + mov.u32 %r31195, %r31191; + mov.u32 %r31196, %r31191; + mov.u32 %r31197, %r31191; + mov.u32 %r31198, %r31191; + mov.u32 %r31199, %r31191; + mov.u32 %r31200, %r31191; + mov.u32 %r31201, %r31191; + mov.u32 %r31202, %r31191; + mov.u32 %r31203, %r31191; + mov.u32 %r31204, %r31191; + mov.u32 %r31207, %r31191; + mov.u32 %r31208, %r31191; + mov.u32 %r31209, %r31191; + mov.u32 %r31210, %r31191; + mov.u32 %r31211, %r31191; + mov.u32 %r31212, %r31191; + mov.u32 %r31213, %r31191; + mov.u32 %r31214, %r31191; + mov.u32 %r31215, %r31191; + mov.u32 %r31216, %r31191; + mov.u32 %r31217, %r31191; + mov.u32 %r31218, %r31191; + mov.u32 %r31219, %r31191; + mov.u32 %r31220, %r31191; + mov.u32 %r31221, %r31191; + mov.u32 %r31222, %r31191; + mov.u32 %r31223, %r31191; + mov.u32 %r31224, %r31191; + mov.u32 %r31241, %r31191; + +$L__BB2_84: + mov.u32 %r29817, 1; + mov.u64 %rd1291, keccak_round_constants; + cvta.const.u64 %rd1290, %rd1291; + // begin inline asm + // xor5 + lop3.b32 %r25797, %r31227, %r31225, %r31223, 0x96; + lop3.b32 %r25797, %r25797, %r31221, %r31219, 0x96; + lop3.b32 %r25798, %r31228, %r31226, %r31224, 0x96; + lop3.b32 %r25798, %r25798, %r31222, %r31220, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25809, %r31239, %r31237, %r31217, 0x96; + lop3.b32 %r25809, %r25809, %r31215, %r31213, 0x96; + lop3.b32 %r25810, %r31240, %r31238, %r31218, 0x96; + lop3.b32 %r25810, %r25810, %r31216, %r31214, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25821, %r31235, %r31233, %r31211, 0x96; + lop3.b32 %r25821, %r25821, %r31209, %r31207, 0x96; + lop3.b32 %r25822, %r31236, %r31234, %r31212, 0x96; + lop3.b32 %r25822, %r25822, %r31210, %r31208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25833, %r31231, %r31205, %r31203, 0x96; + lop3.b32 %r25833, %r25833, %r31201, %r31199, 0x96; + lop3.b32 %r25834, %r31232, %r31206, %r31204, 0x96; + lop3.b32 %r25834, %r25834, %r31202, %r31200, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25845, %r31229, %r31197, %r31195, 0x96; + lop3.b32 %r25845, %r25845, %r31193, %r31191, 0x96; + lop3.b32 %r25846, %r31230, %r31198, %r31196, 0x96; + lop3.b32 %r25846, %r25846, %r31194, %r31192, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25857, %r25810, %r25809, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25861, %r25809, %r25810, %r29817; + // end inline asm + xor.b32 %r26291, %r25857, %r25845; + xor.b32 %r26292, %r25861, %r25846; + xor.b32 %r26124, %r31227, %r26291; + xor.b32 %r26127, %r31228, %r26292; + xor.b32 %r26031, %r31225, %r26291; + xor.b32 %r26030, %r31226, %r26292; + 
xor.b32 %r26078, %r31223, %r26291; + xor.b32 %r26079, %r31224, %r26292; + xor.b32 %r25983, %r31221, %r26291; + xor.b32 %r25982, %r31222, %r26292; + xor.b32 %r25934, %r31219, %r26291; + xor.b32 %r25935, %r31220, %r26292; + // begin inline asm + shf.l.wrap.b32 %r25865, %r25822, %r25821, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25869, %r25821, %r25822, %r29817; + // end inline asm + xor.b32 %r26293, %r25865, %r25797; + xor.b32 %r26294, %r25869, %r25798; + xor.b32 %r26086, %r31239, %r26293; + xor.b32 %r26087, %r31240, %r26294; + xor.b32 %r25903, %r31237, %r26293; + xor.b32 %r25902, %r31238, %r26294; + xor.b32 %r26062, %r31217, %r26293; + xor.b32 %r26063, %r31218, %r26294; + xor.b32 %r26023, %r31215, %r26293; + xor.b32 %r26022, %r31216, %r26294; + xor.b32 %r26006, %r31213, %r26293; + xor.b32 %r26007, %r31214, %r26294; + // begin inline asm + shf.l.wrap.b32 %r25873, %r25834, %r25833, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25877, %r25833, %r25834, %r29817; + // end inline asm + xor.b32 %r26295, %r25873, %r25809; + xor.b32 %r26296, %r25877, %r25810; + xor.b32 %r25943, %r31235, %r26295; + xor.b32 %r25942, %r31236, %r26296; + xor.b32 %r26070, %r31233, %r26295; + xor.b32 %r26071, %r31234, %r26296; + xor.b32 %r25951, %r31211, %r26295; + xor.b32 %r25950, %r31212, %r26296; + xor.b32 %r26054, %r31209, %r26295; + xor.b32 %r26055, %r31210, %r26296; + xor.b32 %r25919, %r31207, %r26295; + xor.b32 %r25918, %r31208, %r26296; + // begin inline asm + shf.l.wrap.b32 %r25881, %r25846, %r25845, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25885, %r25845, %r25846, %r29817; + // end inline asm + xor.b32 %r26297, %r25881, %r25821; + xor.b32 %r26298, %r25885, %r25822; + xor.b32 %r26038, %r31231, %r26297; + xor.b32 %r26039, %r31232, %r26298; + xor.b32 %r26015, %r31205, %r26297; + xor.b32 %r26014, %r31206, %r26298; + xor.b32 %r25958, %r31203, %r26297; + xor.b32 %r25959, %r31204, %r26298; + xor.b32 %r26046, %r31201, %r26297; + xor.b32 %r26047, %r31202, %r26298; + xor.b32 %r25975, %r31199, %r26297; + xor.b32 %r25974, %r31200, %r26298; + // begin inline asm + shf.l.wrap.b32 %r25889, %r25798, %r25797, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25893, %r25797, %r25798, %r29817; + // end inline asm + xor.b32 %r26299, %r25889, %r25833; + xor.b32 %r26300, %r25893, %r25834; + xor.b32 %r25990, %r31229, %r26299; + xor.b32 %r25991, %r31230, %r26300; + xor.b32 %r25910, %r31197, %r26299; + xor.b32 %r25911, %r31198, %r26300; + xor.b32 %r25927, %r31195, %r26299; + xor.b32 %r25926, %r31196, %r26300; + xor.b32 %r25966, %r31193, %r26299; + xor.b32 %r25967, %r31194, %r26300; + xor.b32 %r25998, %r31191, %r26299; + xor.b32 %r25999, %r31192, %r26300; + mov.u32 %r25904, 44; + // begin inline asm + shf.l.wrap.b32 %r25897, %r25903, %r25902, %r25904; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25901, %r25902, %r25903, %r25904; + // end inline asm + mov.u32 %r25912, 20; + // begin inline asm + shf.l.wrap.b32 %r25905, %r25911, %r25910, %r25912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25909, %r25910, %r25911, %r25912; + // end inline asm + mov.u32 %r25920, 61; + // begin inline asm + shf.l.wrap.b32 %r25913, %r25919, %r25918, %r25920; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25917, %r25918, %r25919, %r25920; + // end inline asm + mov.u32 %r25928, 39; + // begin inline asm + shf.l.wrap.b32 %r25921, %r25927, %r25926, %r25928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r25925, %r25926, %r25927, %r25928; + // end inline asm + mov.u32 %r25936, 18; + // begin inline asm + shf.l.wrap.b32 %r25929, %r25935, %r25934, %r25936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25933, %r25934, %r25935, %r25936; + // end inline asm + mov.u32 %r25944, 62; + // begin inline asm + shf.l.wrap.b32 %r25937, %r25943, %r25942, %r25944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25941, %r25942, %r25943, %r25944; + // end inline asm + mov.u32 %r25952, 43; + // begin inline asm + shf.l.wrap.b32 %r25945, %r25951, %r25950, %r25952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25949, %r25950, %r25951, %r25952; + // end inline asm + mov.u32 %r25960, 25; + // begin inline asm + shf.l.wrap.b32 %r25953, %r25959, %r25958, %r25960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25957, %r25958, %r25959, %r25960; + // end inline asm + mov.u32 %r25968, 8; + // begin inline asm + shf.l.wrap.b32 %r25961, %r25967, %r25966, %r25968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25965, %r25966, %r25967, %r25968; + // end inline asm + mov.u32 %r25976, 56; + // begin inline asm + shf.l.wrap.b32 %r25969, %r25975, %r25974, %r25976; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25973, %r25974, %r25975, %r25976; + // end inline asm + mov.u32 %r25984, 41; + // begin inline asm + shf.l.wrap.b32 %r25977, %r25983, %r25982, %r25984; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25981, %r25982, %r25983, %r25984; + // end inline asm + mov.u32 %r25992, 27; + // begin inline asm + shf.l.wrap.b32 %r25985, %r25991, %r25990, %r25992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25989, %r25990, %r25991, %r25992; + // end inline asm + mov.u32 %r26000, 14; + // begin inline asm + shf.l.wrap.b32 %r25993, %r25999, %r25998, %r26000; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25997, %r25998, %r25999, %r26000; + // end inline asm + mov.u32 %r26008, 2; + // begin inline asm + shf.l.wrap.b32 %r26001, %r26007, %r26006, %r26008; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26005, %r26006, %r26007, %r26008; + // end inline asm + mov.u32 %r26016, 55; + // begin inline asm + shf.l.wrap.b32 %r26009, %r26015, %r26014, %r26016; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26013, %r26014, %r26015, %r26016; + // end inline asm + mov.u32 %r26024, 45; + // begin inline asm + shf.l.wrap.b32 %r26017, %r26023, %r26022, %r26024; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26021, %r26022, %r26023, %r26024; + // end inline asm + mov.u32 %r26032, 36; + // begin inline asm + shf.l.wrap.b32 %r26025, %r26031, %r26030, %r26032; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26029, %r26030, %r26031, %r26032; + // end inline asm + mov.u32 %r26040, 28; + // begin inline asm + shf.l.wrap.b32 %r26033, %r26039, %r26038, %r26040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26037, %r26038, %r26039, %r26040; + // end inline asm + mov.u32 %r26048, 21; + // begin inline asm + shf.l.wrap.b32 %r26041, %r26047, %r26046, %r26048; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26045, %r26046, %r26047, %r26048; + // end inline asm + mov.u32 %r26056, 15; + // begin inline asm + shf.l.wrap.b32 %r26049, %r26055, %r26054, %r26056; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26053, %r26054, %r26055, %r26056; + // end inline asm + mov.u32 %r26064, 10; + // begin inline asm + shf.l.wrap.b32 %r26057, %r26063, %r26062, 
%r26064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26061, %r26062, %r26063, %r26064; + // end inline asm + mov.u32 %r26072, 6; + // begin inline asm + shf.l.wrap.b32 %r26065, %r26071, %r26070, %r26072; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26069, %r26070, %r26071, %r26072; + // end inline asm + mov.u32 %r26080, 3; + // begin inline asm + shf.l.wrap.b32 %r26073, %r26079, %r26078, %r26080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26077, %r26078, %r26079, %r26080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26081, %r26087, %r26086, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26085, %r26086, %r26087, %r29817; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26089, %r26124, %r25897, %r25945, 0xD2; + lop3.b32 %r26090, %r26127, %r25901, %r25949, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31239, %r25897, %r25945, %r26041, 0xD2; + lop3.b32 %r31240, %r25901, %r25949, %r26045, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31235, %r25945, %r26041, %r25993, 0xD2; + lop3.b32 %r31236, %r25949, %r26045, %r25997, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31231, %r26041, %r25993, %r26124, 0xD2; + lop3.b32 %r31232, %r26045, %r25997, %r26127, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31229, %r25993, %r26124, %r25897, 0xD2; + lop3.b32 %r31230, %r25997, %r26127, %r25901, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31225, %r26033, %r25905, %r26073, 0xD2; + lop3.b32 %r31226, %r26037, %r25909, %r26077, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31237, %r25905, %r26073, %r26017, 0xD2; + lop3.b32 %r31238, %r25909, %r26077, %r26021, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31233, %r26073, %r26017, %r25913, 0xD2; + lop3.b32 %r31234, %r26077, %r26021, %r25917, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31205, %r26017, %r25913, %r26033, 0xD2; + lop3.b32 %r31206, %r26021, %r25917, %r26037, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r31205, %r31206}; + // begin inline asm + // chi + lop3.b32 %r31197, %r25913, %r26033, %r25905, 0xD2; + lop3.b32 %r31198, %r25917, %r26037, %r25909, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r31197, %r31198}; + // begin inline asm + // chi + lop3.b32 %r31223, %r26081, %r26065, %r25953, 0xD2; + lop3.b32 %r31224, %r26085, %r26069, %r25957, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+104], {%r31223, %r31224}; + // begin inline asm + // chi + lop3.b32 %r31217, %r26065, %r25953, %r25961, 0xD2; + lop3.b32 %r31218, %r26069, %r25957, %r25965, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+112], {%r31217, %r31218}; + // begin inline asm + // chi + lop3.b32 %r31211, %r25953, %r25961, %r25929, 0xD2; + lop3.b32 %r31212, %r25957, %r25965, %r25933, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+120], {%r31211, %r31212}; + // begin inline asm + // chi + lop3.b32 %r31203, %r25961, %r25929, %r26081, 0xD2; + lop3.b32 %r31204, %r25965, %r25933, %r26085, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+128], {%r31203, %r31204}; + // begin inline asm + // chi + lop3.b32 %r31195, %r25929, %r26081, %r26065, 0xD2; + lop3.b32 %r31196, %r25933, %r26085, %r26069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+136], {%r31195, %r31196}; + // begin inline asm + // chi + lop3.b32 %r31221, %r25985, %r26025, %r26057, 0xD2; + lop3.b32 
%r31222, %r25989, %r26029, %r26061, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+144], {%r31221, %r31222}; + // begin inline asm + // chi + lop3.b32 %r31215, %r26025, %r26057, %r26049, 0xD2; + lop3.b32 %r31216, %r26029, %r26061, %r26053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+152], {%r31215, %r31216}; + // begin inline asm + // chi + lop3.b32 %r31209, %r26057, %r26049, %r25969, 0xD2; + lop3.b32 %r31210, %r26061, %r26053, %r25973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+160], {%r31209, %r31210}; + // begin inline asm + // chi + lop3.b32 %r31201, %r26049, %r25969, %r25985, 0xD2; + lop3.b32 %r31202, %r26053, %r25973, %r25989, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+168], {%r31201, %r31202}; + // begin inline asm + // chi + lop3.b32 %r31193, %r25969, %r25985, %r26025, 0xD2; + lop3.b32 %r31194, %r25973, %r25989, %r26029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+176], {%r31193, %r31194}; + // begin inline asm + // chi + lop3.b32 %r31219, %r25937, %r26009, %r25921, 0xD2; + lop3.b32 %r31220, %r25941, %r26013, %r25925, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+184], {%r31219, %r31220}; + // begin inline asm + // chi + lop3.b32 %r31213, %r26009, %r25921, %r25977, 0xD2; + lop3.b32 %r31214, %r26013, %r25925, %r25981, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+192], {%r31213, %r31214}; + // begin inline asm + // chi + lop3.b32 %r31207, %r25921, %r25977, %r26001, 0xD2; + lop3.b32 %r31208, %r25925, %r25981, %r26005, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+200], {%r31207, %r31208}; + // begin inline asm + // chi + lop3.b32 %r31199, %r25977, %r26001, %r25937, 0xD2; + lop3.b32 %r31200, %r25981, %r26005, %r25941, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+208], {%r31199, %r31200}; + // begin inline asm + // chi + lop3.b32 %r31191, %r26001, %r25937, %r26009, 0xD2; + lop3.b32 %r31192, %r26005, %r25941, %r26013, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+216], {%r31191, %r31192}; + mul.wide.s32 %rd1091, %r31241, 8; + add.s64 %rd1090, %rd1290, %rd1091; + // begin inline asm + ld.global.nc.v2.u32 {%r26289,%r26290}, [%rd1090]; + // end inline asm + xor.b32 %r31227, %r26089, %r26289; + xor.b32 %r31228, %r26090, %r26290; + add.s32 %r31241, %r31241, 1; + setp.lt.u32 %p47, %r31241, 23; + @%p47 bra $L__BB2_84; + + mov.u32 %r29816, 3; + mov.u32 %r29815, 21; + mov.u32 %r29814, 28; + mov.u32 %r29813, 45; + mov.u32 %r29812, 14; + mov.u32 %r29811, 43; + mov.u32 %r29810, 61; + mov.u32 %r29809, 20; + mov.u32 %r29808, 44; + mov.u64 %rd1294, keccak_round_constants; + cvta.const.u64 %rd1293, %rd1294; + add.s64 %rd1292, %rd1293, 184; + mov.u32 %r26400, 1; + st.local.v2.u32 [%rd270+32], {%r31239, %r31240}; + st.local.v2.u32 [%rd270+72], {%r31237, %r31238}; + st.local.v2.u32 [%rd270+40], {%r31235, %r31236}; + st.local.v2.u32 [%rd270+80], {%r31233, %r31234}; + st.local.v2.u32 [%rd270+48], {%r31231, %r31232}; + st.local.v2.u32 [%rd270+56], {%r31229, %r31230}; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + // begin inline asm + // xor5 + lop3.b32 %r26301, %r31227, %r31225, %r31223, 0x96; + lop3.b32 %r26301, %r26301, %r31221, %r31219, 0x96; + lop3.b32 %r26302, %r31228, %r31226, %r31224, 0x96; + lop3.b32 %r26302, %r26302, %r31222, %r31220, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26313, %r31239, %r31237, %r31217, 0x96; + lop3.b32 %r26313, %r26313, %r31215, %r31213, 0x96; + lop3.b32 %r26314, %r31240, %r31238, %r31218, 0x96; + lop3.b32 %r26314, %r26314, %r31216, %r31214, 0x96; + // end inline asm + // 
begin inline asm + // xor5 + lop3.b32 %r26325, %r31235, %r31233, %r31211, 0x96; + lop3.b32 %r26325, %r26325, %r31209, %r31207, 0x96; + lop3.b32 %r26326, %r31236, %r31234, %r31212, 0x96; + lop3.b32 %r26326, %r26326, %r31210, %r31208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26337, %r31231, %r31205, %r31203, 0x96; + lop3.b32 %r26337, %r26337, %r31201, %r31199, 0x96; + lop3.b32 %r26338, %r31232, %r31206, %r31204, 0x96; + lop3.b32 %r26338, %r26338, %r31202, %r31200, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26349, %r31229, %r31197, %r31195, 0x96; + lop3.b32 %r26349, %r26349, %r31193, %r31191, 0x96; + lop3.b32 %r26350, %r31230, %r31198, %r31196, 0x96; + lop3.b32 %r26350, %r26350, %r31194, %r31192, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26361, %r26314, %r26313, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26365, %r26313, %r26314, %r26400; + // end inline asm + xor.b32 %r26539, %r26361, %r26349; + xor.b32 %r26540, %r26365, %r26350; + xor.b32 %r26508, %r31227, %r26539; + xor.b32 %r26511, %r31228, %r26540; + xor.b32 %r26471, %r31224, %r26540; + xor.b32 %r26470, %r31223, %r26539; + st.local.v2.u32 [%rd270+104], {%r26470, %r26471}; + // begin inline asm + shf.l.wrap.b32 %r26369, %r26326, %r26325, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26373, %r26325, %r26326, %r26400; + // end inline asm + xor.b32 %r26541, %r26369, %r26301; + xor.b32 %r26542, %r26373, %r26302; + xor.b32 %r26407, %r31237, %r26541; + xor.b32 %r26406, %r31238, %r26542; + xor.b32 %r26446, %r31216, %r26542; + xor.b32 %r26447, %r31215, %r26541; + st.local.v2.u32 [%rd270+152], {%r26447, %r26446}; + // begin inline asm + shf.l.wrap.b32 %r26377, %r26338, %r26337, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26381, %r26337, %r26338, %r26400; + // end inline asm + xor.b32 %r26543, %r26377, %r26313; + xor.b32 %r26544, %r26381, %r26314; + xor.b32 %r26430, %r31212, %r26544; + xor.b32 %r26431, %r31211, %r26543; + st.local.v2.u32 [%rd270+120], {%r26431, %r26430}; + xor.b32 %r26422, %r31208, %r26544; + xor.b32 %r26423, %r31207, %r26543; + st.local.v2.u32 [%rd270+200], {%r26423, %r26422}; + // begin inline asm + shf.l.wrap.b32 %r26385, %r26350, %r26349, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26389, %r26349, %r26350, %r26400; + // end inline asm + xor.b32 %r26545, %r26385, %r26325; + xor.b32 %r26546, %r26389, %r26326; + xor.b32 %r26454, %r31231, %r26545; + xor.b32 %r26455, %r31232, %r26546; + xor.b32 %r26463, %r31202, %r26546; + xor.b32 %r26462, %r31201, %r26545; + st.local.v2.u32 [%rd270+168], {%r26462, %r26463}; + // begin inline asm + shf.l.wrap.b32 %r26393, %r26302, %r26301, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26397, %r26301, %r26302, %r26400; + // end inline asm + xor.b32 %r26547, %r26393, %r26337; + xor.b32 %r26548, %r26397, %r26338; + xor.b32 %r26414, %r31197, %r26547; + xor.b32 %r26415, %r31198, %r26548; + xor.b32 %r26439, %r31192, %r26548; + xor.b32 %r26438, %r31191, %r26547; + st.local.v2.u32 [%rd270+216], {%r26438, %r26439}; + // begin inline asm + shf.l.wrap.b32 %r26401, %r26407, %r26406, %r29808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26405, %r26406, %r26407, %r29808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26409, %r26415, %r26414, %r29809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26413, %r26414, %r26415, %r29809; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r26421, %r26422, %r26423, %r29810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26417, %r26423, %r26422, %r29810; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r26417, %r26421}; + // begin inline asm + shf.l.wrap.b32 %r26425, %r26431, %r26430, %r29811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26429, %r26430, %r26431, %r29811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26433, %r26439, %r26438, %r29812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26437, %r26438, %r26439, %r29812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26445, %r26446, %r26447, %r29813; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26441, %r26447, %r26446, %r29813; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r26441, %r26445}; + // begin inline asm + shf.l.wrap.b32 %r26449, %r26455, %r26454, %r29814; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26453, %r26454, %r26455, %r29814; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26457, %r26463, %r26462, %r29815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26461, %r26462, %r26463, %r29815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26465, %r26471, %r26470, %r29816; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26469, %r26470, %r26471, %r29816; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26473, %r26508, %r26401, %r26425, 0xD2; + lop3.b32 %r26474, %r26511, %r26405, %r26429, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26481, %r26401, %r26425, %r26457, 0xD2; + lop3.b32 %r26482, %r26405, %r26429, %r26461, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+32], {%r26481, %r26482}; + // begin inline asm + // chi + lop3.b32 %r26489, %r26425, %r26457, %r26433, 0xD2; + lop3.b32 %r26490, %r26429, %r26461, %r26437, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+40], {%r26489, %r26490}; + // begin inline asm + // chi + lop3.b32 %r26497, %r26457, %r26433, %r26508, 0xD2; + lop3.b32 %r26498, %r26461, %r26437, %r26511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+48], {%r26497, %r26498}; + // begin inline asm + // chi + lop3.b32 %r26505, %r26433, %r26508, %r26401, 0xD2; + lop3.b32 %r26506, %r26437, %r26511, %r26405, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+56], {%r26505, %r26506}; + // begin inline asm + // chi + lop3.b32 %r26513, %r26449, %r26409, %r26465, 0xD2; + lop3.b32 %r26514, %r26453, %r26413, %r26469, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+64], {%r26513, %r26514}; + // begin inline asm + // chi + lop3.b32 %r26521, %r26409, %r26465, %r26441, 0xD2; + lop3.b32 %r26522, %r26413, %r26469, %r26445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+72], {%r26521, %r26522}; + // begin inline asm + // chi + lop3.b32 %r26529, %r26465, %r26441, %r26417, 0xD2; + lop3.b32 %r26530, %r26469, %r26445, %r26421, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+80], {%r26529, %r26530}; + // begin inline asm + ld.global.nc.v2.u32 {%r26537,%r26538}, [%rd1292]; + // end inline asm + xor.b32 %r26549, %r26474, %r26538; + xor.b32 %r26550, %r26473, %r26537; + st.local.v2.u32 [%rd270+24], {%r26550, %r26549}; + mov.b64 %rd1358, {%r26481, %r26482}; + mov.b64 %rd1359, {%r26489, %r26490}; + mov.b64 %rd1362, {%r26513, %r26514}; + mov.b64 %rd1363, {%r26521, %r26522}; + mov.b64 %rd1364, {%r26529, %r26530}; + mov.b64 %rd1357, {%r26550, %r26549}; + mov.b64 %rd1360, {%r26497, %r26498}; + mov.b64 %rd1361, {%r26505, 
%r26506}; + bra.uni $L__BB2_86; + +$L__BB2_64: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd881, 1179641; + st.local.u64 [%rd2+8], %rd881; + st.local.u32 [%rd2+16], %r3343; + ld.global.u64 %rd882, [%rd220]; + ld.global.u64 %rd883, [%rd220+8]; + ld.global.u64 %rd884, [%rd220+16]; + ld.global.u64 %rd885, [%rd220+24]; + ld.global.u64 %rd886, [%rd220+32]; + ld.global.u64 %rd887, [%rd220+40]; + ld.global.u64 %rd888, [%rd220+48]; + ld.global.u64 %rd889, [%rd220+56]; + st.local.u64 [%rd2+24], %rd882; + st.local.u64 [%rd2+32], %rd883; + st.local.u64 [%rd2+40], %rd884; + st.local.u64 [%rd2+48], %rd885; + st.local.u64 [%rd2+56], %rd886; + st.local.u64 [%rd2+64], %rd887; + st.local.u64 [%rd2+72], %rd888; + st.local.u64 [%rd2+80], %rd889; + cvt.u32.u64 %r20023, %rd882; + xor.b32 %r20024, %r3343, %r20023; + st.local.u32 [%rd2+24], %r20024; + mov.u32 %r30768, 0; + st.local.v2.u32 [%rd2+96], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+104], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+112], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+120], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+128], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+136], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+144], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+152], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+160], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+168], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+176], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+184], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+192], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+200], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+208], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+216], {%r30768, %r30768}; + mov.u32 %r30783, -2147483648; + mov.u32 %r19996, 1; + st.local.v2.u32 [%rd2+88], {%r19996, %r30783}; + ld.local.v2.u32 {%r30804, %r30805}, [%rd2+24]; + mov.b64 {%r30802, %r30803}, %rd887; + shr.u64 %rd890, %rd883, 32; + cvt.u32.u64 %r30816, %rd883; + cvt.u32.u64 %r30817, %rd890; + shr.u64 %rd891, %rd888, 32; + cvt.u32.u64 %r30814, %rd888; + cvt.u32.u64 %r30815, %rd891; + shr.u64 %rd892, %rd884, 32; + cvt.u32.u64 %r30812, %rd884; + cvt.u32.u64 %r30813, %rd892; + shr.u64 %rd893, %rd889, 32; + cvt.u32.u64 %r30810, %rd889; + cvt.u32.u64 %r30811, %rd893; + shr.u64 %rd894, %rd885, 32; + cvt.u32.u64 %r30808, %rd885; + cvt.u32.u64 %r30809, %rd894; + shr.u64 %rd895, %rd886, 32; + cvt.u32.u64 %r30806, %rd886; + cvt.u32.u64 %r30807, %rd895; + mov.u32 %r30769, %r30768; + mov.u32 %r30770, %r30768; + mov.u32 %r30771, %r30768; + mov.u32 %r30772, %r30768; + mov.u32 %r30773, %r30768; + mov.u32 %r30774, %r30768; + mov.u32 %r30775, %r30768; + mov.u32 %r30776, %r30768; + mov.u32 %r30777, %r30768; + mov.u32 %r30778, %r30768; + mov.u32 %r30779, %r30768; + mov.u32 %r30780, %r30768; + mov.u32 %r30781, %r30768; + mov.u32 %r30782, %r19996; + mov.u32 %r30784, %r30768; + mov.u32 %r30785, %r30768; + mov.u32 %r30786, %r30768; + mov.u32 %r30787, %r30768; + mov.u32 %r30788, %r30768; + mov.u32 %r30789, %r30768; + mov.u32 %r30790, %r30768; + mov.u32 %r30791, %r30768; + mov.u32 %r30792, %r30768; + mov.u32 %r30793, %r30768; + mov.u32 %r30794, %r30768; + mov.u32 %r30795, %r30768; + mov.u32 %r30796, %r30768; + mov.u32 %r30797, %r30768; + mov.u32 %r30798, %r30768; + mov.u32 %r30799, %r30768; + mov.u32 %r30800, %r30768; + mov.u32 %r30801, %r30768; + mov.u32 %r30818, %r30768; + +$L__BB2_65: + // begin inline asm + // xor5 + lop3.b32 %r20027, %r30804, %r30802, %r30800, 0x96; + lop3.b32 %r20027, %r20027, %r30798, %r30796, 0x96; + lop3.b32 %r20028, %r30805, %r30803, %r30801, 0x96; + lop3.b32 %r20028, 
%r20028, %r30799, %r30797, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20039, %r30816, %r30814, %r30794, 0x96; + lop3.b32 %r20039, %r20039, %r30792, %r30790, 0x96; + lop3.b32 %r20040, %r30817, %r30815, %r30795, 0x96; + lop3.b32 %r20040, %r20040, %r30793, %r30791, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20051, %r30812, %r30810, %r30788, 0x96; + lop3.b32 %r20051, %r20051, %r30786, %r30784, 0x96; + lop3.b32 %r20052, %r30813, %r30811, %r30789, 0x96; + lop3.b32 %r20052, %r20052, %r30787, %r30785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20063, %r30808, %r30782, %r30780, 0x96; + lop3.b32 %r20063, %r20063, %r30778, %r30776, 0x96; + lop3.b32 %r20064, %r30809, %r30783, %r30781, 0x96; + lop3.b32 %r20064, %r20064, %r30779, %r30777, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20075, %r30806, %r30774, %r30772, 0x96; + lop3.b32 %r20075, %r20075, %r30770, %r30768, 0x96; + lop3.b32 %r20076, %r30807, %r30775, %r30773, 0x96; + lop3.b32 %r20076, %r20076, %r30771, %r30769, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20087, %r20040, %r20039, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20091, %r20039, %r20040, %r19996; + // end inline asm + xor.b32 %r20521, %r20087, %r20075; + xor.b32 %r20522, %r20091, %r20076; + xor.b32 %r20354, %r30804, %r20521; + xor.b32 %r20357, %r30805, %r20522; + xor.b32 %r20261, %r30802, %r20521; + xor.b32 %r20260, %r30803, %r20522; + xor.b32 %r20308, %r30800, %r20521; + xor.b32 %r20309, %r30801, %r20522; + xor.b32 %r20213, %r30798, %r20521; + xor.b32 %r20212, %r30799, %r20522; + xor.b32 %r20164, %r30796, %r20521; + xor.b32 %r20165, %r30797, %r20522; + // begin inline asm + shf.l.wrap.b32 %r20095, %r20052, %r20051, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20099, %r20051, %r20052, %r19996; + // end inline asm + xor.b32 %r20523, %r20095, %r20027; + xor.b32 %r20524, %r20099, %r20028; + xor.b32 %r20316, %r30816, %r20523; + xor.b32 %r20317, %r30817, %r20524; + xor.b32 %r20133, %r30814, %r20523; + xor.b32 %r20132, %r30815, %r20524; + xor.b32 %r20292, %r30794, %r20523; + xor.b32 %r20293, %r30795, %r20524; + xor.b32 %r20253, %r30792, %r20523; + xor.b32 %r20252, %r30793, %r20524; + xor.b32 %r20236, %r30790, %r20523; + xor.b32 %r20237, %r30791, %r20524; + // begin inline asm + shf.l.wrap.b32 %r20103, %r20064, %r20063, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20107, %r20063, %r20064, %r19996; + // end inline asm + xor.b32 %r20525, %r20103, %r20039; + xor.b32 %r20526, %r20107, %r20040; + xor.b32 %r20173, %r30812, %r20525; + xor.b32 %r20172, %r30813, %r20526; + xor.b32 %r20300, %r30810, %r20525; + xor.b32 %r20301, %r30811, %r20526; + xor.b32 %r20181, %r30788, %r20525; + xor.b32 %r20180, %r30789, %r20526; + xor.b32 %r20284, %r30786, %r20525; + xor.b32 %r20285, %r30787, %r20526; + xor.b32 %r20149, %r30784, %r20525; + xor.b32 %r20148, %r30785, %r20526; + // begin inline asm + shf.l.wrap.b32 %r20111, %r20076, %r20075, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20115, %r20075, %r20076, %r19996; + // end inline asm + xor.b32 %r20527, %r20111, %r20051; + xor.b32 %r20528, %r20115, %r20052; + xor.b32 %r20268, %r30808, %r20527; + xor.b32 %r20269, %r30809, %r20528; + xor.b32 %r20245, %r30782, %r20527; + xor.b32 %r20244, %r30783, %r20528; + xor.b32 %r20188, %r30780, %r20527; + xor.b32 %r20189, %r30781, %r20528; + xor.b32 %r20276, %r30778, %r20527; + xor.b32 %r20277, 
%r30779, %r20528; + xor.b32 %r20205, %r30776, %r20527; + xor.b32 %r20204, %r30777, %r20528; + // begin inline asm + shf.l.wrap.b32 %r20119, %r20028, %r20027, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20123, %r20027, %r20028, %r19996; + // end inline asm + xor.b32 %r20529, %r20119, %r20063; + xor.b32 %r20530, %r20123, %r20064; + xor.b32 %r20220, %r30806, %r20529; + xor.b32 %r20221, %r30807, %r20530; + xor.b32 %r20140, %r30774, %r20529; + xor.b32 %r20141, %r30775, %r20530; + xor.b32 %r20157, %r30772, %r20529; + xor.b32 %r20156, %r30773, %r20530; + xor.b32 %r20196, %r30770, %r20529; + xor.b32 %r20197, %r30771, %r20530; + xor.b32 %r20228, %r30768, %r20529; + xor.b32 %r20229, %r30769, %r20530; + mov.u32 %r20134, 44; + // begin inline asm + shf.l.wrap.b32 %r20127, %r20133, %r20132, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20131, %r20132, %r20133, %r20134; + // end inline asm + mov.u32 %r20142, 20; + // begin inline asm + shf.l.wrap.b32 %r20135, %r20141, %r20140, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20139, %r20140, %r20141, %r20142; + // end inline asm + mov.u32 %r20150, 61; + // begin inline asm + shf.l.wrap.b32 %r20143, %r20149, %r20148, %r20150; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20147, %r20148, %r20149, %r20150; + // end inline asm + mov.u32 %r20158, 39; + // begin inline asm + shf.l.wrap.b32 %r20151, %r20157, %r20156, %r20158; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20155, %r20156, %r20157, %r20158; + // end inline asm + mov.u32 %r20166, 18; + // begin inline asm + shf.l.wrap.b32 %r20159, %r20165, %r20164, %r20166; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20163, %r20164, %r20165, %r20166; + // end inline asm + mov.u32 %r20174, 62; + // begin inline asm + shf.l.wrap.b32 %r20167, %r20173, %r20172, %r20174; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20171, %r20172, %r20173, %r20174; + // end inline asm + mov.u32 %r20182, 43; + // begin inline asm + shf.l.wrap.b32 %r20175, %r20181, %r20180, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20179, %r20180, %r20181, %r20182; + // end inline asm + mov.u32 %r20190, 25; + // begin inline asm + shf.l.wrap.b32 %r20183, %r20189, %r20188, %r20190; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20187, %r20188, %r20189, %r20190; + // end inline asm + mov.u32 %r20198, 8; + // begin inline asm + shf.l.wrap.b32 %r20191, %r20197, %r20196, %r20198; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20195, %r20196, %r20197, %r20198; + // end inline asm + mov.u32 %r20206, 56; + // begin inline asm + shf.l.wrap.b32 %r20199, %r20205, %r20204, %r20206; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20203, %r20204, %r20205, %r20206; + // end inline asm + mov.u32 %r20214, 41; + // begin inline asm + shf.l.wrap.b32 %r20207, %r20213, %r20212, %r20214; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20211, %r20212, %r20213, %r20214; + // end inline asm + mov.u32 %r20222, 27; + // begin inline asm + shf.l.wrap.b32 %r20215, %r20221, %r20220, %r20222; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20219, %r20220, %r20221, %r20222; + // end inline asm + mov.u32 %r20230, 14; + // begin inline asm + shf.l.wrap.b32 %r20223, %r20229, %r20228, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20227, %r20228, %r20229, %r20230; + // end inline asm + mov.u32 %r20238, 2; + // begin inline asm + shf.l.wrap.b32 
%r20231, %r20237, %r20236, %r20238; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20235, %r20236, %r20237, %r20238; + // end inline asm + mov.u32 %r20246, 55; + // begin inline asm + shf.l.wrap.b32 %r20239, %r20245, %r20244, %r20246; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20243, %r20244, %r20245, %r20246; + // end inline asm + mov.u32 %r20254, 45; + // begin inline asm + shf.l.wrap.b32 %r20247, %r20253, %r20252, %r20254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20251, %r20252, %r20253, %r20254; + // end inline asm + mov.u32 %r20262, 36; + // begin inline asm + shf.l.wrap.b32 %r20255, %r20261, %r20260, %r20262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20259, %r20260, %r20261, %r20262; + // end inline asm + mov.u32 %r20270, 28; + // begin inline asm + shf.l.wrap.b32 %r20263, %r20269, %r20268, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20267, %r20268, %r20269, %r20270; + // end inline asm + mov.u32 %r20278, 21; + // begin inline asm + shf.l.wrap.b32 %r20271, %r20277, %r20276, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20275, %r20276, %r20277, %r20278; + // end inline asm + mov.u32 %r20286, 15; + // begin inline asm + shf.l.wrap.b32 %r20279, %r20285, %r20284, %r20286; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20283, %r20284, %r20285, %r20286; + // end inline asm + mov.u32 %r20294, 10; + // begin inline asm + shf.l.wrap.b32 %r20287, %r20293, %r20292, %r20294; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20291, %r20292, %r20293, %r20294; + // end inline asm + mov.u32 %r20302, 6; + // begin inline asm + shf.l.wrap.b32 %r20295, %r20301, %r20300, %r20302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20299, %r20300, %r20301, %r20302; + // end inline asm + mov.u32 %r20310, 3; + // begin inline asm + shf.l.wrap.b32 %r20303, %r20309, %r20308, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20307, %r20308, %r20309, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20311, %r20317, %r20316, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20315, %r20316, %r20317, %r19996; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20319, %r20354, %r20127, %r20175, 0xD2; + lop3.b32 %r20320, %r20357, %r20131, %r20179, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30816, %r20127, %r20175, %r20271, 0xD2; + lop3.b32 %r30817, %r20131, %r20179, %r20275, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30812, %r20175, %r20271, %r20223, 0xD2; + lop3.b32 %r30813, %r20179, %r20275, %r20227, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30808, %r20271, %r20223, %r20354, 0xD2; + lop3.b32 %r30809, %r20275, %r20227, %r20357, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30806, %r20223, %r20354, %r20127, 0xD2; + lop3.b32 %r30807, %r20227, %r20357, %r20131, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30802, %r20263, %r20135, %r20303, 0xD2; + lop3.b32 %r30803, %r20267, %r20139, %r20307, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30814, %r20135, %r20303, %r20247, 0xD2; + lop3.b32 %r30815, %r20139, %r20307, %r20251, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30810, %r20303, %r20247, %r20143, 0xD2; + lop3.b32 %r30811, %r20307, %r20251, %r20147, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 
%r30782, %r20247, %r20143, %r20263, 0xD2; + lop3.b32 %r30783, %r20251, %r20147, %r20267, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30782, %r30783}; + // begin inline asm + // chi + lop3.b32 %r30774, %r20143, %r20263, %r20135, 0xD2; + lop3.b32 %r30775, %r20147, %r20267, %r20139, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30774, %r30775}; + // begin inline asm + // chi + lop3.b32 %r30800, %r20311, %r20295, %r20183, 0xD2; + lop3.b32 %r30801, %r20315, %r20299, %r20187, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30800, %r30801}; + // begin inline asm + // chi + lop3.b32 %r30794, %r20295, %r20183, %r20191, 0xD2; + lop3.b32 %r30795, %r20299, %r20187, %r20195, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30794, %r30795}; + // begin inline asm + // chi + lop3.b32 %r30788, %r20183, %r20191, %r20159, 0xD2; + lop3.b32 %r30789, %r20187, %r20195, %r20163, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30788, %r30789}; + // begin inline asm + // chi + lop3.b32 %r30780, %r20191, %r20159, %r20311, 0xD2; + lop3.b32 %r30781, %r20195, %r20163, %r20315, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30780, %r30781}; + // begin inline asm + // chi + lop3.b32 %r30772, %r20159, %r20311, %r20295, 0xD2; + lop3.b32 %r30773, %r20163, %r20315, %r20299, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30772, %r30773}; + // begin inline asm + // chi + lop3.b32 %r30798, %r20215, %r20255, %r20287, 0xD2; + lop3.b32 %r30799, %r20219, %r20259, %r20291, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30798, %r30799}; + // begin inline asm + // chi + lop3.b32 %r30792, %r20255, %r20287, %r20279, 0xD2; + lop3.b32 %r30793, %r20259, %r20291, %r20283, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30792, %r30793}; + // begin inline asm + // chi + lop3.b32 %r30786, %r20287, %r20279, %r20199, 0xD2; + lop3.b32 %r30787, %r20291, %r20283, %r20203, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30786, %r30787}; + // begin inline asm + // chi + lop3.b32 %r30778, %r20279, %r20199, %r20215, 0xD2; + lop3.b32 %r30779, %r20283, %r20203, %r20219, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30778, %r30779}; + // begin inline asm + // chi + lop3.b32 %r30770, %r20199, %r20215, %r20255, 0xD2; + lop3.b32 %r30771, %r20203, %r20219, %r20259, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30770, %r30771}; + // begin inline asm + // chi + lop3.b32 %r30796, %r20167, %r20239, %r20151, 0xD2; + lop3.b32 %r30797, %r20171, %r20243, %r20155, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30796, %r30797}; + // begin inline asm + // chi + lop3.b32 %r30790, %r20239, %r20151, %r20207, 0xD2; + lop3.b32 %r30791, %r20243, %r20155, %r20211, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30790, %r30791}; + // begin inline asm + // chi + lop3.b32 %r30784, %r20151, %r20207, %r20231, 0xD2; + lop3.b32 %r30785, %r20155, %r20211, %r20235, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30784, %r30785}; + // begin inline asm + // chi + lop3.b32 %r30776, %r20207, %r20231, %r20167, 0xD2; + lop3.b32 %r30777, %r20211, %r20235, %r20171, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30776, %r30777}; + // begin inline asm + // chi + lop3.b32 %r30768, %r20231, %r20167, %r20239, 0xD2; + lop3.b32 %r30769, %r20235, %r20171, %r20243, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30768, %r30769}; + mul.wide.s32 %rd899, %r30818, 8; + mov.u64 %rd900, keccak_round_constants; + 
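
The iota step that follows loads the round constant for round i from keccak_round_constants at byte offset i × 8 (mul.wide.s32 %rd899, %r30818, 8), reads it as a pair of 32-bit words with ld.global.nc.v2.u32, and XORs the pair into lane 0 of the state (offset +24). The rolled loop runs while the round counter stays below 23; the final round is unrolled further down and uses the constant at offset 184 (= 23 × 8). A minimal host-side sketch, assuming the standard Keccak-f[1600] constant table — the table itself is defined outside this hunk, so its contents are an assumption here:

    // iota_constants_sketch.cu -- illustrative; prints the word pairs that the
    // ld.global.nc.v2.u32 loads above would return at each byte offset.
    #include <cstdint>
    #include <cstdio>

    static const uint64_t RC[24] = {  // standard Keccak-f[1600] round constants
        0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
        0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
        0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
        0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
        0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
        0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
        0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
        0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL };

    int main() {
        for (int round = 0; round < 24; ++round) {
            uint32_t lo = (uint32_t)RC[round];          // first word of the v2 load
            uint32_t hi = (uint32_t)(RC[round] >> 32);  // second word
            printf("offset %3d: {%08x, %08x}\n", round * 8, lo, hi);
        }
        return 0;
    }

The last row printed (offset 184) is the pair applied in the unrolled final round below.
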
cvta.const.u64 %rd901, %rd900; + add.s64 %rd896, %rd901, %rd899; + // begin inline asm + ld.global.nc.v2.u32 {%r20519,%r20520}, [%rd896]; + // end inline asm + xor.b32 %r30804, %r20319, %r20519; + xor.b32 %r30805, %r20320, %r20520; + add.s32 %r30818, %r30818, 1; + setp.lt.u32 %p38, %r30818, 23; + @%p38 bra $L__BB2_65; + + st.local.v2.u32 [%rd2+32], {%r30816, %r30817}; + st.local.v2.u32 [%rd2+72], {%r30814, %r30815}; + st.local.v2.u32 [%rd2+40], {%r30812, %r30813}; + st.local.v2.u32 [%rd2+80], {%r30810, %r30811}; + st.local.v2.u32 [%rd2+48], {%r30808, %r30809}; + st.local.v2.u32 [%rd2+56], {%r30806, %r30807}; + st.local.v2.u32 [%rd2+24], {%r30804, %r30805}; + // begin inline asm + // xor5 + lop3.b32 %r20531, %r30804, %r30802, %r30800, 0x96; + lop3.b32 %r20531, %r20531, %r30798, %r30796, 0x96; + lop3.b32 %r20532, %r30805, %r30803, %r30801, 0x96; + lop3.b32 %r20532, %r20532, %r30799, %r30797, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20543, %r30816, %r30814, %r30794, 0x96; + lop3.b32 %r20543, %r20543, %r30792, %r30790, 0x96; + lop3.b32 %r20544, %r30817, %r30815, %r30795, 0x96; + lop3.b32 %r20544, %r20544, %r30793, %r30791, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20555, %r30812, %r30810, %r30788, 0x96; + lop3.b32 %r20555, %r20555, %r30786, %r30784, 0x96; + lop3.b32 %r20556, %r30813, %r30811, %r30789, 0x96; + lop3.b32 %r20556, %r20556, %r30787, %r30785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20567, %r30808, %r30782, %r30780, 0x96; + lop3.b32 %r20567, %r20567, %r30778, %r30776, 0x96; + lop3.b32 %r20568, %r30809, %r30783, %r30781, 0x96; + lop3.b32 %r20568, %r20568, %r30779, %r30777, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20579, %r30806, %r30774, %r30772, 0x96; + lop3.b32 %r20579, %r20579, %r30770, %r30768, 0x96; + lop3.b32 %r20580, %r30807, %r30775, %r30773, 0x96; + lop3.b32 %r20580, %r20580, %r30771, %r30769, 0x96; + // end inline asm + mov.u32 %r30833, 1; + // begin inline asm + shf.l.wrap.b32 %r20591, %r20544, %r20543, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20595, %r20543, %r20544, %r30833; + // end inline asm + xor.b32 %r20810, %r20591, %r20579; + xor.b32 %r20811, %r20595, %r20580; + xor.b32 %r20738, %r30804, %r20810; + xor.b32 %r20741, %r30805, %r20811; + xor.b32 %r20701, %r30801, %r20811; + xor.b32 %r20700, %r30800, %r20810; + st.local.v2.u32 [%rd2+104], {%r20700, %r20701}; + // begin inline asm + shf.l.wrap.b32 %r20599, %r20556, %r20555, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20603, %r20555, %r20556, %r30833; + // end inline asm + xor.b32 %r20812, %r20599, %r20531; + xor.b32 %r20813, %r20603, %r20532; + xor.b32 %r20637, %r30814, %r20812; + xor.b32 %r20636, %r30815, %r20813; + xor.b32 %r20676, %r30793, %r20813; + xor.b32 %r20677, %r30792, %r20812; + st.local.v2.u32 [%rd2+152], {%r20677, %r20676}; + // begin inline asm + shf.l.wrap.b32 %r20607, %r20568, %r20567, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20611, %r20567, %r20568, %r30833; + // end inline asm + xor.b32 %r20814, %r20607, %r20543; + xor.b32 %r20815, %r20611, %r20544; + xor.b32 %r20660, %r30789, %r20815; + xor.b32 %r20661, %r30788, %r20814; + st.local.v2.u32 [%rd2+120], {%r20661, %r20660}; + xor.b32 %r20652, %r30785, %r20815; + xor.b32 %r20653, %r30784, %r20814; + st.local.v2.u32 [%rd2+200], {%r20653, %r20652}; + // begin inline asm + shf.l.wrap.b32 %r20615, %r20580, %r20579, %r30833; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r20619, %r20579, %r20580, %r30833; + // end inline asm + xor.b32 %r20816, %r20615, %r20555; + xor.b32 %r20817, %r20619, %r20556; + xor.b32 %r20684, %r30808, %r20816; + xor.b32 %r20685, %r30809, %r20817; + xor.b32 %r20693, %r30779, %r20817; + xor.b32 %r20692, %r30778, %r20816; + st.local.v2.u32 [%rd2+168], {%r20692, %r20693}; + // begin inline asm + shf.l.wrap.b32 %r20623, %r20532, %r20531, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20627, %r20531, %r20532, %r30833; + // end inline asm + xor.b32 %r20818, %r20623, %r20567; + xor.b32 %r20819, %r20627, %r20568; + xor.b32 %r20644, %r30774, %r20818; + xor.b32 %r20645, %r30775, %r20819; + xor.b32 %r20669, %r30769, %r20819; + xor.b32 %r20668, %r30768, %r20818; + st.local.v2.u32 [%rd2+216], {%r20668, %r20669}; + // begin inline asm + shf.l.wrap.b32 %r20631, %r20637, %r20636, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20635, %r20636, %r20637, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20639, %r20645, %r20644, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20643, %r20644, %r20645, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20651, %r20652, %r20653, %r20150; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20647, %r20653, %r20652, %r20150; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r20647, %r20651}; + // begin inline asm + shf.l.wrap.b32 %r20655, %r20661, %r20660, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20659, %r20660, %r20661, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20663, %r20669, %r20668, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20667, %r20668, %r20669, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20675, %r20676, %r20677, %r20254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20671, %r20677, %r20676, %r20254; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r20671, %r20675}; + // begin inline asm + shf.l.wrap.b32 %r20679, %r20685, %r20684, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20683, %r20684, %r20685, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20687, %r20693, %r20692, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20691, %r20692, %r20693, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20695, %r20701, %r20700, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20699, %r20700, %r20701, %r20310; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20703, %r20738, %r20631, %r20655, 0xD2; + lop3.b32 %r20704, %r20741, %r20635, %r20659, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30951, %r20631, %r20655, %r20687, 0xD2; + lop3.b32 %r30952, %r20635, %r20659, %r20691, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30951, %r30952}; + // begin inline asm + // chi + lop3.b32 %r30947, %r20655, %r20687, %r20663, 0xD2; + lop3.b32 %r30948, %r20659, %r20691, %r20667, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30947, %r30948}; + // begin inline asm + // chi + lop3.b32 %r30943, %r20687, %r20663, %r20738, 0xD2; + lop3.b32 %r30944, %r20691, %r20667, %r20741, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30943, %r30944}; + // begin inline asm + // chi + lop3.b32 %r30941, %r20663, %r20738, %r20631, 0xD2; + lop3.b32 %r30942, %r20667, %r20741, %r20635, 0xD2; + // end inline 
asm + st.local.v2.u32 [%rd2+56], {%r30941, %r30942}; + // begin inline asm + // chi + lop3.b32 %r30937, %r20679, %r20639, %r20695, 0xD2; + lop3.b32 %r30938, %r20683, %r20643, %r20699, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30937, %r30938}; + // begin inline asm + // chi + lop3.b32 %r30949, %r20639, %r20695, %r20671, 0xD2; + lop3.b32 %r30950, %r20643, %r20699, %r20675, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30949, %r30950}; + // begin inline asm + // chi + lop3.b32 %r30945, %r20695, %r20671, %r20647, 0xD2; + lop3.b32 %r30946, %r20699, %r20675, %r20651, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30945, %r30946}; + add.s64 %rd902, %rd901, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r20767,%r20768}, [%rd902]; + // end inline asm + xor.b32 %r30939, %r20703, %r20767; + xor.b32 %r30940, %r20704, %r20768; + add.u64 %rd908, %SPL, 1912; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + st.local.u64 [%rd908], %rd361; + mov.u64 %rd909, 1179641; + st.local.u64 [%rd908+8], %rd909; + add.s32 %r20820, %r3343, 1; + st.local.u32 [%rd908+16], %r20820; + ld.global.u64 %rd910, [%rd221]; + ld.global.u64 %rd911, [%rd221+8]; + ld.global.u64 %rd912, [%rd221+16]; + ld.global.u64 %rd913, [%rd221+24]; + ld.global.u64 %rd914, [%rd221+32]; + ld.global.u64 %rd915, [%rd221+40]; + ld.global.u64 %rd916, [%rd221+48]; + ld.global.u64 %rd917, [%rd221+56]; + st.local.u64 [%rd908+32], %rd911; + st.local.u64 [%rd908+40], %rd912; + st.local.u64 [%rd908+48], %rd913; + st.local.u64 [%rd908+56], %rd914; + st.local.u64 [%rd908+64], %rd915; + st.local.u64 [%rd908+72], %rd916; + st.local.u64 [%rd908+80], %rd917; + cvt.u32.u64 %r20821, %rd910; + xor.b32 %r20822, %r20820, %r20821; + st.local.u64 [%rd908+24], %rd910; + st.local.u32 [%rd908+24], %r20822; + mov.u32 %r30819, 0; + st.local.v2.u32 [%rd908+96], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+104], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+112], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+120], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+128], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+136], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+144], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+152], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+160], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+168], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+176], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+184], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+192], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+200], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+208], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+216], {%r30819, %r30819}; + mov.u32 %r30834, -2147483648; + st.local.v2.u32 [%rd908+88], {%r30833, %r30834}; + ld.local.v2.u32 {%r30855, %r30856}, [%rd908+24]; + mov.b64 {%r30853, %r30854}, %rd915; + shr.u64 %rd918, %rd911, 32; + cvt.u32.u64 %r30867, %rd911; + cvt.u32.u64 %r30868, %rd918; + shr.u64 %rd919, %rd916, 32; + cvt.u32.u64 %r30865, %rd916; + cvt.u32.u64 %r30866, %rd919; + shr.u64 %rd920, %rd912, 32; + cvt.u32.u64 %r30863, %rd912; + cvt.u32.u64 %r30864, %rd920; + shr.u64 %rd921, %rd917, 32; + cvt.u32.u64 %r30861, %rd917; + cvt.u32.u64 %r30862, %rd921; + shr.u64 %rd922, %rd913, 32; + cvt.u32.u64 %r30859, %rd913; + cvt.u32.u64 %r30860, %rd922; + shr.u64 %rd923, %rd914, 32; + cvt.u32.u64 %r30857, %rd914; + cvt.u32.u64 %r30858, %rd923; + mov.u32 %r30820, %r30819; + mov.u32 %r30821, %r30819; + mov.u32 %r30822, %r30819; + mov.u32 %r30823, %r30819; + mov.u32 %r30824, %r30819; + mov.u32 %r30825, %r30819; + mov.u32 
%r30826, %r30819; + mov.u32 %r30827, %r30819; + mov.u32 %r30828, %r30819; + mov.u32 %r30829, %r30819; + mov.u32 %r30830, %r30819; + mov.u32 %r30831, %r30819; + mov.u32 %r30832, %r30819; + mov.u32 %r30835, %r30819; + mov.u32 %r30836, %r30819; + mov.u32 %r30837, %r30819; + mov.u32 %r30838, %r30819; + mov.u32 %r30839, %r30819; + mov.u32 %r30840, %r30819; + mov.u32 %r30841, %r30819; + mov.u32 %r30842, %r30819; + mov.u32 %r30843, %r30819; + mov.u32 %r30844, %r30819; + mov.u32 %r30845, %r30819; + mov.u32 %r30846, %r30819; + mov.u32 %r30847, %r30819; + mov.u32 %r30848, %r30819; + mov.u32 %r30849, %r30819; + mov.u32 %r30850, %r30819; + mov.u32 %r30851, %r30819; + mov.u32 %r30852, %r30819; + mov.u32 %r30869, %r30819; + +$L__BB2_67: + mov.u32 %r29766, 1; + // begin inline asm + // xor5 + lop3.b32 %r20825, %r30855, %r30853, %r30851, 0x96; + lop3.b32 %r20825, %r20825, %r30849, %r30847, 0x96; + lop3.b32 %r20826, %r30856, %r30854, %r30852, 0x96; + lop3.b32 %r20826, %r20826, %r30850, %r30848, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20837, %r30867, %r30865, %r30845, 0x96; + lop3.b32 %r20837, %r20837, %r30843, %r30841, 0x96; + lop3.b32 %r20838, %r30868, %r30866, %r30846, 0x96; + lop3.b32 %r20838, %r20838, %r30844, %r30842, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20849, %r30863, %r30861, %r30839, 0x96; + lop3.b32 %r20849, %r20849, %r30837, %r30835, 0x96; + lop3.b32 %r20850, %r30864, %r30862, %r30840, 0x96; + lop3.b32 %r20850, %r20850, %r30838, %r30836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20861, %r30859, %r30833, %r30831, 0x96; + lop3.b32 %r20861, %r20861, %r30829, %r30827, 0x96; + lop3.b32 %r20862, %r30860, %r30834, %r30832, 0x96; + lop3.b32 %r20862, %r20862, %r30830, %r30828, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20873, %r30857, %r30825, %r30823, 0x96; + lop3.b32 %r20873, %r20873, %r30821, %r30819, 0x96; + lop3.b32 %r20874, %r30858, %r30826, %r30824, 0x96; + lop3.b32 %r20874, %r20874, %r30822, %r30820, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20885, %r20838, %r20837, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20889, %r20837, %r20838, %r29766; + // end inline asm + xor.b32 %r21319, %r20885, %r20873; + xor.b32 %r21320, %r20889, %r20874; + xor.b32 %r21152, %r30855, %r21319; + xor.b32 %r21155, %r30856, %r21320; + xor.b32 %r21059, %r30853, %r21319; + xor.b32 %r21058, %r30854, %r21320; + xor.b32 %r21106, %r30851, %r21319; + xor.b32 %r21107, %r30852, %r21320; + xor.b32 %r21011, %r30849, %r21319; + xor.b32 %r21010, %r30850, %r21320; + xor.b32 %r20962, %r30847, %r21319; + xor.b32 %r20963, %r30848, %r21320; + // begin inline asm + shf.l.wrap.b32 %r20893, %r20850, %r20849, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20897, %r20849, %r20850, %r29766; + // end inline asm + xor.b32 %r21321, %r20893, %r20825; + xor.b32 %r21322, %r20897, %r20826; + xor.b32 %r21114, %r30867, %r21321; + xor.b32 %r21115, %r30868, %r21322; + xor.b32 %r20931, %r30865, %r21321; + xor.b32 %r20930, %r30866, %r21322; + xor.b32 %r21090, %r30845, %r21321; + xor.b32 %r21091, %r30846, %r21322; + xor.b32 %r21051, %r30843, %r21321; + xor.b32 %r21050, %r30844, %r21322; + xor.b32 %r21034, %r30841, %r21321; + xor.b32 %r21035, %r30842, %r21322; + // begin inline asm + shf.l.wrap.b32 %r20901, %r20862, %r20861, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20905, %r20861, %r20862, %r29766; + // end inline asm + xor.b32 %r21323, 
%r20901, %r20837; + xor.b32 %r21324, %r20905, %r20838; + xor.b32 %r20971, %r30863, %r21323; + xor.b32 %r20970, %r30864, %r21324; + xor.b32 %r21098, %r30861, %r21323; + xor.b32 %r21099, %r30862, %r21324; + xor.b32 %r20979, %r30839, %r21323; + xor.b32 %r20978, %r30840, %r21324; + xor.b32 %r21082, %r30837, %r21323; + xor.b32 %r21083, %r30838, %r21324; + xor.b32 %r20947, %r30835, %r21323; + xor.b32 %r20946, %r30836, %r21324; + // begin inline asm + shf.l.wrap.b32 %r20909, %r20874, %r20873, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20913, %r20873, %r20874, %r29766; + // end inline asm + xor.b32 %r21325, %r20909, %r20849; + xor.b32 %r21326, %r20913, %r20850; + xor.b32 %r21066, %r30859, %r21325; + xor.b32 %r21067, %r30860, %r21326; + xor.b32 %r21043, %r30833, %r21325; + xor.b32 %r21042, %r30834, %r21326; + xor.b32 %r20986, %r30831, %r21325; + xor.b32 %r20987, %r30832, %r21326; + xor.b32 %r21074, %r30829, %r21325; + xor.b32 %r21075, %r30830, %r21326; + xor.b32 %r21003, %r30827, %r21325; + xor.b32 %r21002, %r30828, %r21326; + // begin inline asm + shf.l.wrap.b32 %r20917, %r20826, %r20825, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20921, %r20825, %r20826, %r29766; + // end inline asm + xor.b32 %r21327, %r20917, %r20861; + xor.b32 %r21328, %r20921, %r20862; + xor.b32 %r21018, %r30857, %r21327; + xor.b32 %r21019, %r30858, %r21328; + xor.b32 %r20938, %r30825, %r21327; + xor.b32 %r20939, %r30826, %r21328; + xor.b32 %r20955, %r30823, %r21327; + xor.b32 %r20954, %r30824, %r21328; + xor.b32 %r20994, %r30821, %r21327; + xor.b32 %r20995, %r30822, %r21328; + xor.b32 %r21026, %r30819, %r21327; + xor.b32 %r21027, %r30820, %r21328; + mov.u32 %r20932, 44; + // begin inline asm + shf.l.wrap.b32 %r20925, %r20931, %r20930, %r20932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20929, %r20930, %r20931, %r20932; + // end inline asm + mov.u32 %r20940, 20; + // begin inline asm + shf.l.wrap.b32 %r20933, %r20939, %r20938, %r20940; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20937, %r20938, %r20939, %r20940; + // end inline asm + mov.u32 %r20948, 61; + // begin inline asm + shf.l.wrap.b32 %r20941, %r20947, %r20946, %r20948; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20945, %r20946, %r20947, %r20948; + // end inline asm + mov.u32 %r20956, 39; + // begin inline asm + shf.l.wrap.b32 %r20949, %r20955, %r20954, %r20956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20953, %r20954, %r20955, %r20956; + // end inline asm + mov.u32 %r20964, 18; + // begin inline asm + shf.l.wrap.b32 %r20957, %r20963, %r20962, %r20964; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20961, %r20962, %r20963, %r20964; + // end inline asm + mov.u32 %r20972, 62; + // begin inline asm + shf.l.wrap.b32 %r20965, %r20971, %r20970, %r20972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20969, %r20970, %r20971, %r20972; + // end inline asm + mov.u32 %r20980, 43; + // begin inline asm + shf.l.wrap.b32 %r20973, %r20979, %r20978, %r20980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20977, %r20978, %r20979, %r20980; + // end inline asm + mov.u32 %r20988, 25; + // begin inline asm + shf.l.wrap.b32 %r20981, %r20987, %r20986, %r20988; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20985, %r20986, %r20987, %r20988; + // end inline asm + mov.u32 %r20996, 8; + // begin inline asm + shf.l.wrap.b32 %r20989, %r20995, %r20994, %r20996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r20993, %r20994, %r20995, %r20996; + // end inline asm + mov.u32 %r21004, 56; + // begin inline asm + shf.l.wrap.b32 %r20997, %r21003, %r21002, %r21004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21001, %r21002, %r21003, %r21004; + // end inline asm + mov.u32 %r21012, 41; + // begin inline asm + shf.l.wrap.b32 %r21005, %r21011, %r21010, %r21012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21009, %r21010, %r21011, %r21012; + // end inline asm + mov.u32 %r21020, 27; + // begin inline asm + shf.l.wrap.b32 %r21013, %r21019, %r21018, %r21020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21017, %r21018, %r21019, %r21020; + // end inline asm + mov.u32 %r21028, 14; + // begin inline asm + shf.l.wrap.b32 %r21021, %r21027, %r21026, %r21028; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21025, %r21026, %r21027, %r21028; + // end inline asm + mov.u32 %r21036, 2; + // begin inline asm + shf.l.wrap.b32 %r21029, %r21035, %r21034, %r21036; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21033, %r21034, %r21035, %r21036; + // end inline asm + mov.u32 %r21044, 55; + // begin inline asm + shf.l.wrap.b32 %r21037, %r21043, %r21042, %r21044; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21041, %r21042, %r21043, %r21044; + // end inline asm + mov.u32 %r21052, 45; + // begin inline asm + shf.l.wrap.b32 %r21045, %r21051, %r21050, %r21052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21049, %r21050, %r21051, %r21052; + // end inline asm + mov.u32 %r21060, 36; + // begin inline asm + shf.l.wrap.b32 %r21053, %r21059, %r21058, %r21060; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21057, %r21058, %r21059, %r21060; + // end inline asm + mov.u32 %r21068, 28; + // begin inline asm + shf.l.wrap.b32 %r21061, %r21067, %r21066, %r21068; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21065, %r21066, %r21067, %r21068; + // end inline asm + mov.u32 %r21076, 21; + // begin inline asm + shf.l.wrap.b32 %r21069, %r21075, %r21074, %r21076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21073, %r21074, %r21075, %r21076; + // end inline asm + mov.u32 %r21084, 15; + // begin inline asm + shf.l.wrap.b32 %r21077, %r21083, %r21082, %r21084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21081, %r21082, %r21083, %r21084; + // end inline asm + mov.u32 %r21092, 10; + // begin inline asm + shf.l.wrap.b32 %r21085, %r21091, %r21090, %r21092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21089, %r21090, %r21091, %r21092; + // end inline asm + mov.u32 %r21100, 6; + // begin inline asm + shf.l.wrap.b32 %r21093, %r21099, %r21098, %r21100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21097, %r21098, %r21099, %r21100; + // end inline asm + mov.u32 %r21108, 3; + // begin inline asm + shf.l.wrap.b32 %r21101, %r21107, %r21106, %r21108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21105, %r21106, %r21107, %r21108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21109, %r21115, %r21114, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21113, %r21114, %r21115, %r29766; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21117, %r21152, %r20925, %r20973, 0xD2; + lop3.b32 %r21118, %r21155, %r20929, %r20977, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30867, %r20925, %r20973, %r21069, 0xD2; + lop3.b32 %r30868, %r20929, %r20977, %r21073, 0xD2; + // end inline asm + // begin 
inline asm + // chi + lop3.b32 %r30863, %r20973, %r21069, %r21021, 0xD2; + lop3.b32 %r30864, %r20977, %r21073, %r21025, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30859, %r21069, %r21021, %r21152, 0xD2; + lop3.b32 %r30860, %r21073, %r21025, %r21155, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30857, %r21021, %r21152, %r20925, 0xD2; + lop3.b32 %r30858, %r21025, %r21155, %r20929, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30853, %r21061, %r20933, %r21101, 0xD2; + lop3.b32 %r30854, %r21065, %r20937, %r21105, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30865, %r20933, %r21101, %r21045, 0xD2; + lop3.b32 %r30866, %r20937, %r21105, %r21049, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30861, %r21101, %r21045, %r20941, 0xD2; + lop3.b32 %r30862, %r21105, %r21049, %r20945, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30833, %r21045, %r20941, %r21061, 0xD2; + lop3.b32 %r30834, %r21049, %r20945, %r21065, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r30833, %r30834}; + // begin inline asm + // chi + lop3.b32 %r30825, %r20941, %r21061, %r20933, 0xD2; + lop3.b32 %r30826, %r20945, %r21065, %r20937, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r30825, %r30826}; + // begin inline asm + // chi + lop3.b32 %r30851, %r21109, %r21093, %r20981, 0xD2; + lop3.b32 %r30852, %r21113, %r21097, %r20985, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+104], {%r30851, %r30852}; + // begin inline asm + // chi + lop3.b32 %r30845, %r21093, %r20981, %r20989, 0xD2; + lop3.b32 %r30846, %r21097, %r20985, %r20993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+112], {%r30845, %r30846}; + // begin inline asm + // chi + lop3.b32 %r30839, %r20981, %r20989, %r20957, 0xD2; + lop3.b32 %r30840, %r20985, %r20993, %r20961, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+120], {%r30839, %r30840}; + // begin inline asm + // chi + lop3.b32 %r30831, %r20989, %r20957, %r21109, 0xD2; + lop3.b32 %r30832, %r20993, %r20961, %r21113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+128], {%r30831, %r30832}; + // begin inline asm + // chi + lop3.b32 %r30823, %r20957, %r21109, %r21093, 0xD2; + lop3.b32 %r30824, %r20961, %r21113, %r21097, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+136], {%r30823, %r30824}; + // begin inline asm + // chi + lop3.b32 %r30849, %r21013, %r21053, %r21085, 0xD2; + lop3.b32 %r30850, %r21017, %r21057, %r21089, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+144], {%r30849, %r30850}; + // begin inline asm + // chi + lop3.b32 %r30843, %r21053, %r21085, %r21077, 0xD2; + lop3.b32 %r30844, %r21057, %r21089, %r21081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+152], {%r30843, %r30844}; + // begin inline asm + // chi + lop3.b32 %r30837, %r21085, %r21077, %r20997, 0xD2; + lop3.b32 %r30838, %r21089, %r21081, %r21001, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+160], {%r30837, %r30838}; + // begin inline asm + // chi + lop3.b32 %r30829, %r21077, %r20997, %r21013, 0xD2; + lop3.b32 %r30830, %r21081, %r21001, %r21017, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+168], {%r30829, %r30830}; + // begin inline asm + // chi + lop3.b32 %r30821, %r20997, %r21013, %r21053, 0xD2; + lop3.b32 %r30822, %r21001, %r21017, %r21057, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+176], {%r30821, %r30822}; + // begin inline asm + // chi + lop3.b32 %r30847, %r20965, %r21037, %r20949, 0xD2; + lop3.b32 %r30848, %r20969, 
%r21041, %r20953, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+184], {%r30847, %r30848}; + // begin inline asm + // chi + lop3.b32 %r30841, %r21037, %r20949, %r21005, 0xD2; + lop3.b32 %r30842, %r21041, %r20953, %r21009, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+192], {%r30841, %r30842}; + // begin inline asm + // chi + lop3.b32 %r30835, %r20949, %r21005, %r21029, 0xD2; + lop3.b32 %r30836, %r20953, %r21009, %r21033, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+200], {%r30835, %r30836}; + // begin inline asm + // chi + lop3.b32 %r30827, %r21005, %r21029, %r20965, 0xD2; + lop3.b32 %r30828, %r21009, %r21033, %r20969, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+208], {%r30827, %r30828}; + // begin inline asm + // chi + lop3.b32 %r30819, %r21029, %r20965, %r21037, 0xD2; + lop3.b32 %r30820, %r21033, %r20969, %r21041, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+216], {%r30819, %r30820}; + mul.wide.s32 %rd927, %r30869, 8; + add.s64 %rd924, %rd901, %rd927; + // begin inline asm + ld.global.nc.v2.u32 {%r21317,%r21318}, [%rd924]; + // end inline asm + xor.b32 %r30855, %r21117, %r21317; + xor.b32 %r30856, %r21118, %r21318; + add.s32 %r30869, %r30869, 1; + setp.lt.u32 %p39, %r30869, 23; + @%p39 bra $L__BB2_67; + + mov.u32 %r29764, 3; + mov.u32 %r29763, 21; + mov.u32 %r29762, 28; + mov.u32 %r29761, 45; + mov.u32 %r29760, 14; + mov.u32 %r29759, 43; + mov.u32 %r29758, 61; + mov.u32 %r29757, 20; + mov.u32 %r29756, 44; + mov.u32 %r30902, 0; + mov.u32 %r21428, 1; + st.local.v2.u32 [%rd908+32], {%r30867, %r30868}; + st.local.v2.u32 [%rd908+72], {%r30865, %r30866}; + st.local.v2.u32 [%rd908+40], {%r30863, %r30864}; + st.local.v2.u32 [%rd908+80], {%r30861, %r30862}; + st.local.v2.u32 [%rd908+48], {%r30859, %r30860}; + st.local.v2.u32 [%rd908+56], {%r30857, %r30858}; + st.local.v2.u32 [%rd908+24], {%r30855, %r30856}; + // begin inline asm + // xor5 + lop3.b32 %r21329, %r30855, %r30853, %r30851, 0x96; + lop3.b32 %r21329, %r21329, %r30849, %r30847, 0x96; + lop3.b32 %r21330, %r30856, %r30854, %r30852, 0x96; + lop3.b32 %r21330, %r21330, %r30850, %r30848, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21341, %r30867, %r30865, %r30845, 0x96; + lop3.b32 %r21341, %r21341, %r30843, %r30841, 0x96; + lop3.b32 %r21342, %r30868, %r30866, %r30846, 0x96; + lop3.b32 %r21342, %r21342, %r30844, %r30842, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21353, %r30863, %r30861, %r30839, 0x96; + lop3.b32 %r21353, %r21353, %r30837, %r30835, 0x96; + lop3.b32 %r21354, %r30864, %r30862, %r30840, 0x96; + lop3.b32 %r21354, %r21354, %r30838, %r30836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21365, %r30859, %r30833, %r30831, 0x96; + lop3.b32 %r21365, %r21365, %r30829, %r30827, 0x96; + lop3.b32 %r21366, %r30860, %r30834, %r30832, 0x96; + lop3.b32 %r21366, %r21366, %r30830, %r30828, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21377, %r30857, %r30825, %r30823, 0x96; + lop3.b32 %r21377, %r21377, %r30821, %r30819, 0x96; + lop3.b32 %r21378, %r30858, %r30826, %r30824, 0x96; + lop3.b32 %r21378, %r21378, %r30822, %r30820, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21389, %r21342, %r21341, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21393, %r21341, %r21342, %r21428; + // end inline asm + xor.b32 %r21568, %r21389, %r21377; + xor.b32 %r21569, %r21393, %r21378; + xor.b32 %r21536, %r30855, %r21568; + xor.b32 %r21539, %r30856, %r21569; + xor.b32 %r21499, %r30852, 
%r21569; + xor.b32 %r21498, %r30851, %r21568; + st.local.v2.u32 [%rd908+104], {%r21498, %r21499}; + // begin inline asm + shf.l.wrap.b32 %r21397, %r21354, %r21353, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21401, %r21353, %r21354, %r21428; + // end inline asm + xor.b32 %r21570, %r21397, %r21329; + xor.b32 %r21571, %r21401, %r21330; + xor.b32 %r21435, %r30865, %r21570; + xor.b32 %r21434, %r30866, %r21571; + xor.b32 %r21474, %r30844, %r21571; + xor.b32 %r21475, %r30843, %r21570; + st.local.v2.u32 [%rd908+152], {%r21475, %r21474}; + // begin inline asm + shf.l.wrap.b32 %r21405, %r21366, %r21365, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21409, %r21365, %r21366, %r21428; + // end inline asm + xor.b32 %r21572, %r21405, %r21341; + xor.b32 %r21573, %r21409, %r21342; + xor.b32 %r21458, %r30840, %r21573; + xor.b32 %r21459, %r30839, %r21572; + st.local.v2.u32 [%rd908+120], {%r21459, %r21458}; + xor.b32 %r21450, %r30836, %r21573; + xor.b32 %r21451, %r30835, %r21572; + st.local.v2.u32 [%rd908+200], {%r21451, %r21450}; + // begin inline asm + shf.l.wrap.b32 %r21413, %r21378, %r21377, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21417, %r21377, %r21378, %r21428; + // end inline asm + xor.b32 %r21574, %r21413, %r21353; + xor.b32 %r21575, %r21417, %r21354; + xor.b32 %r21482, %r30859, %r21574; + xor.b32 %r21483, %r30860, %r21575; + xor.b32 %r21491, %r30830, %r21575; + xor.b32 %r21490, %r30829, %r21574; + st.local.v2.u32 [%rd908+168], {%r21490, %r21491}; + // begin inline asm + shf.l.wrap.b32 %r21421, %r21330, %r21329, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21425, %r21329, %r21330, %r21428; + // end inline asm + xor.b32 %r21576, %r21421, %r21365; + xor.b32 %r21577, %r21425, %r21366; + xor.b32 %r21442, %r30825, %r21576; + xor.b32 %r21443, %r30826, %r21577; + xor.b32 %r21467, %r30820, %r21577; + xor.b32 %r21466, %r30819, %r21576; + st.local.v2.u32 [%rd908+216], {%r21466, %r21467}; + // begin inline asm + shf.l.wrap.b32 %r21429, %r21435, %r21434, %r29756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21433, %r21434, %r21435, %r29756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21437, %r21443, %r21442, %r29757; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21441, %r21442, %r21443, %r29757; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21449, %r21450, %r21451, %r29758; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21445, %r21451, %r21450, %r29758; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r21445, %r21449}; + // begin inline asm + shf.l.wrap.b32 %r21453, %r21459, %r21458, %r29759; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21457, %r21458, %r21459, %r29759; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21461, %r21467, %r21466, %r29760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21465, %r21466, %r21467, %r29760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21473, %r21474, %r21475, %r29761; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21469, %r21475, %r21474, %r29761; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r21469, %r21473}; + // begin inline asm + shf.l.wrap.b32 %r21477, %r21483, %r21482, %r29762; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21481, %r21482, %r21483, %r29762; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21485, %r21491, %r21490, %r29763; + // end inline asm + // begin inline asm 
+ shf.l.wrap.b32 %r21489, %r21490, %r21491, %r29763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21493, %r21499, %r21498, %r29764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21497, %r21498, %r21499, %r29764; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21501, %r21536, %r21429, %r21453, 0xD2; + lop3.b32 %r21502, %r21539, %r21433, %r21457, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31002, %r21429, %r21453, %r21485, 0xD2; + lop3.b32 %r31003, %r21433, %r21457, %r21489, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+32], {%r31002, %r31003}; + // begin inline asm + // chi + lop3.b32 %r30998, %r21453, %r21485, %r21461, 0xD2; + lop3.b32 %r30999, %r21457, %r21489, %r21465, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+40], {%r30998, %r30999}; + // begin inline asm + // chi + lop3.b32 %r30994, %r21485, %r21461, %r21536, 0xD2; + lop3.b32 %r30995, %r21489, %r21465, %r21539, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+48], {%r30994, %r30995}; + // begin inline asm + // chi + lop3.b32 %r30992, %r21461, %r21536, %r21429, 0xD2; + lop3.b32 %r30993, %r21465, %r21539, %r21433, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+56], {%r30992, %r30993}; + // begin inline asm + // chi + lop3.b32 %r30988, %r21477, %r21437, %r21493, 0xD2; + lop3.b32 %r30989, %r21481, %r21441, %r21497, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+64], {%r30988, %r30989}; + // begin inline asm + // chi + lop3.b32 %r31000, %r21437, %r21493, %r21469, 0xD2; + lop3.b32 %r31001, %r21441, %r21497, %r21473, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+72], {%r31000, %r31001}; + // begin inline asm + // chi + lop3.b32 %r30996, %r21493, %r21469, %r21445, 0xD2; + lop3.b32 %r30997, %r21497, %r21473, %r21449, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+80], {%r30996, %r30997}; + // begin inline asm + ld.global.nc.v2.u32 {%r21565,%r21566}, [%rd902]; + // end inline asm + xor.b32 %r30990, %r21501, %r21565; + xor.b32 %r30991, %r21502, %r21566; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + add.s64 %rd242, %rd908, 24; + add.s64 %rd243, %rd2, 24; + +$L__BB2_69: + add.s32 %r29765, %r3343, 1; + cvta.to.global.u64 %rd1258, %rd361; + shl.b32 %r21578, %r30902, 2; + cvt.u64.u32 %rd935, %r21578; + and.b64 %rd936, %rd935, 60; + add.s64 %rd937, %rd243, %rd936; + xor.b32 %r21579, %r3343, %r30902; + mul.lo.s32 %r21580, %r21579, 16777619; + ld.local.u32 %r21581, [%rd937]; + xor.b32 %r21582, %r21580, %r21581; + mul.wide.u32 %rd938, %r21582, -954391867; + shr.u64 %rd939, %rd938, 32; + cvt.u32.u64 %r21583, %rd939; + sub.s32 %r21584, %r21582, %r21583; + shr.u32 %r21585, %r21584, 1; + add.s32 %r21586, %r21585, %r21583; + shr.u32 %r21587, %r21586, 20; + mul.lo.s32 %r21588, %r21587, 1179641; + sub.s32 %r21589, %r21582, %r21588; + mul.wide.u32 %rd940, %r21589, 64; + add.s64 %rd941, %rd1258, %rd940; + mul.lo.s32 %r21590, %r30939, 16777619; + ld.global.u32 %r21591, [%rd941]; + xor.b32 %r30939, %r21590, %r21591; + mul.lo.s32 %r21592, %r30940, 16777619; + ld.global.u32 %r21593, [%rd941+4]; + xor.b32 %r30940, %r21592, %r21593; + mul.lo.s32 %r21594, %r30951, 16777619; + ld.global.u32 %r21595, [%rd941+8]; + mul.lo.s32 %r21596, %r30952, 16777619; + ld.global.u32 %r21597, [%rd941+12]; + xor.b32 %r21598, %r21596, %r21597; + xor.b32 %r30951, %r21594, %r21595; + mov.b64 %rd942, {%r30951, %r21598}; + mul.lo.s32 %r21599, %r30947, 16777619; + ld.global.u32 %r21600, [%rd941+16]; + mul.lo.s32 %r21601, %r30948, 16777619; + ld.global.u32 %r21602, [%rd941+20]; 
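
The $L__BB2_69 loop above is a table-lookup mixing pass: across 512 iterations it folds each 32-bit word of the running state as state = (state × 16777619) ^ word, where 16777619 = 0x01000193 is the 32-bit FNV prime, and it derives the probe index from fnv(counter ^ i, selected state word) reduced modulo 1179641. The reduction avoids a divide by using the multiply-high sequence visible above (mul.wide.u32 by -954391867, subtract, shift, add, shift by 20), and the index is then scaled by 64 to address a 64-byte table entry. A sketch of both idioms; fnv_mix, mod_items, and the constant names are illustrative, not from the diff:

    // fnv_mod_sketch.cu -- reconstruction of the index arithmetic in $L__BB2_69.
    #include <cstdint>
    #include <cstdio>

    static const uint32_t FNV_PRIME = 16777619u;  // 0x01000193, the mul.lo.s32 factor
    static const uint32_t N_ITEMS   = 1179641u;   // modulus for the table index

    static uint32_t fnv_mix(uint32_t a, uint32_t b) { return (a * FNV_PRIME) ^ b; }

    // n % 1179641 with no divide, mirroring the PTX magic-number sequence:
    // hi = mulhi(n, 3340575429)   (-954391867 reinterpreted as unsigned),
    // q  = (((n - hi) >> 1) + hi) >> 20,   r = n - q * 1179641.
    static uint32_t mod_items(uint32_t n) {
        uint32_t hi = (uint32_t)(((uint64_t)n * 3340575429u) >> 32);
        uint32_t q  = (((n - hi) >> 1) + hi) >> 20;
        return n - q * N_ITEMS;
    }

    int main() {
        for (uint64_t n = 0; n <= 0xFFFFFFFFull; n += 65537u)  // sparse full-range check
            if (mod_items((uint32_t)n) != (uint32_t)n % N_ITEMS) {
                printf("mismatch at %llu\n", (unsigned long long)n);
                return 1;
            }
        printf("mod_items agrees with %% across the sampled range\n");
        printf("fnv_mix(1, 2) = %u\n", fnv_mix(1u, 2u));
        return 0;
    }
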
+ xor.b32 %r21603, %r21601, %r21602; + xor.b32 %r30947, %r21599, %r21600; + mov.b64 %rd943, {%r30947, %r21603}; + mul.lo.s32 %r21604, %r30943, 16777619; + ld.global.u32 %r21605, [%rd941+24]; + mul.lo.s32 %r21606, %r30944, 16777619; + ld.global.u32 %r21607, [%rd941+28]; + xor.b32 %r21608, %r21606, %r21607; + xor.b32 %r30943, %r21604, %r21605; + mov.b64 %rd944, {%r30943, %r21608}; + mul.lo.s32 %r21609, %r30941, 16777619; + ld.global.u32 %r21610, [%rd941+32]; + mul.lo.s32 %r21611, %r30942, 16777619; + ld.global.u32 %r21612, [%rd941+36]; + xor.b32 %r21613, %r21611, %r21612; + xor.b32 %r30941, %r21609, %r21610; + mov.b64 %rd945, {%r30941, %r21613}; + mul.lo.s32 %r21614, %r30937, 16777619; + ld.global.u32 %r21615, [%rd941+40]; + xor.b32 %r30937, %r21614, %r21615; + mul.lo.s32 %r21616, %r30938, 16777619; + ld.global.u32 %r21617, [%rd941+44]; + xor.b32 %r30938, %r21616, %r21617; + mul.lo.s32 %r21618, %r30949, 16777619; + ld.global.u32 %r21619, [%rd941+48]; + mul.lo.s32 %r21620, %r30950, 16777619; + ld.global.u32 %r21621, [%rd941+52]; + xor.b32 %r21622, %r21620, %r21621; + xor.b32 %r30949, %r21618, %r21619; + mov.b64 %rd946, {%r30949, %r21622}; + mul.lo.s32 %r21623, %r30945, 16777619; + ld.global.u32 %r21624, [%rd941+56]; + mul.lo.s32 %r21625, %r30946, 16777619; + ld.global.u32 %r21626, [%rd941+60]; + xor.b32 %r21627, %r21625, %r21626; + xor.b32 %r30945, %r21623, %r21624; + mov.b64 %rd947, {%r30945, %r21627}; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + st.local.v2.u32 [%rd2+32], {%r30951, %r21598}; + st.local.v2.u32 [%rd2+40], {%r30947, %r21603}; + st.local.v2.u32 [%rd2+48], {%r30943, %r21608}; + st.local.v2.u32 [%rd2+56], {%r30941, %r21613}; + st.local.v2.u32 [%rd2+64], {%r30937, %r30938}; + st.local.v2.u32 [%rd2+72], {%r30949, %r21622}; + st.local.v2.u32 [%rd2+80], {%r30945, %r21627}; + add.s64 %rd948, %rd242, %rd936; + xor.b32 %r21628, %r29765, %r30902; + mul.lo.s32 %r21629, %r21628, 16777619; + ld.local.u32 %r21630, [%rd948]; + xor.b32 %r21631, %r21629, %r21630; + mul.wide.u32 %rd949, %r21631, -954391867; + shr.u64 %rd950, %rd949, 32; + cvt.u32.u64 %r21632, %rd950; + sub.s32 %r21633, %r21631, %r21632; + shr.u32 %r21634, %r21633, 1; + add.s32 %r21635, %r21634, %r21632; + shr.u32 %r21636, %r21635, 20; + mul.lo.s32 %r21637, %r21636, 1179641; + sub.s32 %r21638, %r21631, %r21637; + mul.wide.u32 %rd951, %r21638, 64; + add.s64 %rd952, %rd1258, %rd951; + mul.lo.s32 %r21639, %r30990, 16777619; + ld.global.u32 %r21640, [%rd952]; + xor.b32 %r30990, %r21639, %r21640; + mul.lo.s32 %r21641, %r30991, 16777619; + ld.global.u32 %r21642, [%rd952+4]; + xor.b32 %r30991, %r21641, %r21642; + mul.lo.s32 %r21643, %r31002, 16777619; + ld.global.u32 %r21644, [%rd952+8]; + mul.lo.s32 %r21645, %r31003, 16777619; + ld.global.u32 %r21646, [%rd952+12]; + xor.b32 %r21647, %r21645, %r21646; + xor.b32 %r31002, %r21643, %r21644; + mov.b64 %rd953, {%r31002, %r21647}; + mul.lo.s32 %r21648, %r30998, 16777619; + ld.global.u32 %r21649, [%rd952+16]; + mul.lo.s32 %r21650, %r30999, 16777619; + ld.global.u32 %r21651, [%rd952+20]; + xor.b32 %r21652, %r21650, %r21651; + xor.b32 %r30998, %r21648, %r21649; + mov.b64 %rd954, {%r30998, %r21652}; + mul.lo.s32 %r21653, %r30994, 16777619; + ld.global.u32 %r21654, [%rd952+24]; + mul.lo.s32 %r21655, %r30995, 16777619; + ld.global.u32 %r21656, [%rd952+28]; + xor.b32 %r21657, %r21655, %r21656; + xor.b32 %r30994, %r21653, %r21654; + mov.b64 %rd955, {%r30994, %r21657}; + mul.lo.s32 %r21658, %r30992, 16777619; + ld.global.u32 %r21659, [%rd952+32]; + mul.lo.s32 %r21660, %r30993, 16777619; + 
ld.global.u32 %r21661, [%rd952+36]; + xor.b32 %r21662, %r21660, %r21661; + xor.b32 %r30992, %r21658, %r21659; + mov.b64 %rd956, {%r30992, %r21662}; + mul.lo.s32 %r21663, %r30988, 16777619; + ld.global.u32 %r21664, [%rd952+40]; + xor.b32 %r30988, %r21663, %r21664; + mul.lo.s32 %r21665, %r30989, 16777619; + ld.global.u32 %r21666, [%rd952+44]; + xor.b32 %r30989, %r21665, %r21666; + mul.lo.s32 %r21667, %r31000, 16777619; + ld.global.u32 %r21668, [%rd952+48]; + mul.lo.s32 %r21669, %r31001, 16777619; + ld.global.u32 %r21670, [%rd952+52]; + xor.b32 %r21671, %r21669, %r21670; + xor.b32 %r31000, %r21667, %r21668; + mov.b64 %rd957, {%r31000, %r21671}; + mul.lo.s32 %r21672, %r30996, 16777619; + ld.global.u32 %r21673, [%rd952+56]; + mul.lo.s32 %r21674, %r30997, 16777619; + ld.global.u32 %r21675, [%rd952+60]; + xor.b32 %r21676, %r21674, %r21675; + xor.b32 %r30996, %r21672, %r21673; + mov.b64 %rd958, {%r30996, %r21676}; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + st.local.v2.u32 [%rd908+32], {%r31002, %r21647}; + st.local.v2.u32 [%rd908+40], {%r30998, %r21652}; + st.local.v2.u32 [%rd908+48], {%r30994, %r21657}; + st.local.v2.u32 [%rd908+56], {%r30992, %r21662}; + st.local.v2.u32 [%rd908+64], {%r30988, %r30989}; + st.local.v2.u32 [%rd908+72], {%r31000, %r21671}; + st.local.v2.u32 [%rd908+80], {%r30996, %r21676}; + add.s32 %r30902, %r30902, 1; + setp.lt.u32 %p40, %r30902, 512; + shr.u64 %rd959, %rd942, 32; + cvt.u32.u64 %r30952, %rd959; + shr.u64 %rd960, %rd943, 32; + cvt.u32.u64 %r30948, %rd960; + shr.u64 %rd961, %rd944, 32; + cvt.u32.u64 %r30944, %rd961; + shr.u64 %rd962, %rd945, 32; + cvt.u32.u64 %r30942, %rd962; + shr.u64 %rd963, %rd946, 32; + cvt.u32.u64 %r30950, %rd963; + shr.u64 %rd964, %rd947, 32; + cvt.u32.u64 %r30946, %rd964; + shr.u64 %rd965, %rd953, 32; + cvt.u32.u64 %r31003, %rd965; + shr.u64 %rd966, %rd954, 32; + cvt.u32.u64 %r30999, %rd966; + shr.u64 %rd967, %rd955, 32; + cvt.u32.u64 %r30995, %rd967; + shr.u64 %rd968, %rd956, 32; + cvt.u32.u64 %r30993, %rd968; + shr.u64 %rd969, %rd957, 32; + cvt.u32.u64 %r31001, %rd969; + shr.u64 %rd970, %rd958, 32; + cvt.u32.u64 %r30997, %rd970; + @%p40 bra $L__BB2_69; + + mov.u32 %r30903, 0; + st.local.v2.u32 [%rd2+96], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+104], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+112], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+120], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+128], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+136], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+144], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+152], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+160], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+168], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+176], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+184], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+192], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+200], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+208], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+216], {%r30903, %r30903}; + mov.u32 %r30918, -2147483648; + mov.u32 %r30917, 1; + st.local.v2.u32 [%rd2+88], {%r30917, %r30918}; + mov.u32 %r30904, %r30903; + mov.u32 %r30905, %r30903; + mov.u32 %r30906, %r30903; + mov.u32 %r30907, %r30903; + mov.u32 %r30908, %r30903; + mov.u32 %r30909, %r30903; + mov.u32 %r30910, %r30903; + mov.u32 %r30911, %r30903; + mov.u32 %r30912, %r30903; + mov.u32 %r30913, %r30903; + mov.u32 %r30914, %r30903; + mov.u32 %r30915, %r30903; + mov.u32 %r30916, %r30903; + mov.u32 %r30919, %r30903; + mov.u32 %r30920, %r30903; + mov.u32 %r30921, %r30903; + mov.u32 %r30922, %r30903; + 
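
In the state re-initialisation above, lanes 9-24 of the sponge (byte offsets +96 through +216) are zeroed, and the pair {1, 0x80000000} stored at +88 packs the single 64-bit word 0x8000000000000001 into lane 8. That is Keccak pad10*1 for a message ending exactly at lane 8, consistent with a 72-byte rate absorbing the 64 data bytes just written to +24..+80; the rate and block size are inferred from these offsets, so treat them as assumptions. A minimal sketch of the absorb-and-pad layout:

    // absorb_pad_sketch.cu -- the lane layout implied by the stores above.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main() {
        uint64_t state[25] = {0};   // Keccak-f[1600] state: 25 64-bit lanes
        uint8_t  msg[64];           // one 64-byte block (size inferred from offsets)
        for (int i = 0; i < 64; ++i) msg[i] = (uint8_t)i;

        // Absorb: the PTX stores the words directly because the state is zero.
        for (int i = 0; i < 8; ++i) {
            uint64_t lane;
            memcpy(&lane, msg + 8 * i, 8);
            state[i] ^= lane;       // lanes 0..7 <- 64 data bytes
        }
        // pad10*1: the first pad bit sits right after the data, the final bit at
        // the end of the 72-byte rate -- both fall in lane 8, giving the word below.
        state[8] ^= 0x8000000000000001ULL;

        printf("lane 8 = %016llx\n", (unsigned long long)state[8]);
        return 0;
    }
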
mov.u32 %r30923, %r30903; + mov.u32 %r30924, %r30903; + mov.u32 %r30925, %r30903; + mov.u32 %r30926, %r30903; + mov.u32 %r30927, %r30903; + mov.u32 %r30928, %r30903; + mov.u32 %r30929, %r30903; + mov.u32 %r30930, %r30903; + mov.u32 %r30931, %r30903; + mov.u32 %r30932, %r30903; + mov.u32 %r30933, %r30903; + mov.u32 %r30934, %r30903; + mov.u32 %r30935, %r30903; + mov.u32 %r30936, %r30903; + mov.u32 %r30953, %r30903; + +$L__BB2_71: + mov.u32 %r29776, 1; + mov.u64 %rd1281, keccak_round_constants; + cvta.const.u64 %rd1280, %rd1281; + // begin inline asm + // xor5 + lop3.b32 %r21718, %r30939, %r30937, %r30935, 0x96; + lop3.b32 %r21718, %r21718, %r30933, %r30931, 0x96; + lop3.b32 %r21719, %r30940, %r30938, %r30936, 0x96; + lop3.b32 %r21719, %r21719, %r30934, %r30932, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21730, %r30951, %r30949, %r30929, 0x96; + lop3.b32 %r21730, %r21730, %r30927, %r30925, 0x96; + lop3.b32 %r21731, %r30952, %r30950, %r30930, 0x96; + lop3.b32 %r21731, %r21731, %r30928, %r30926, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21742, %r30947, %r30945, %r30923, 0x96; + lop3.b32 %r21742, %r21742, %r30921, %r30919, 0x96; + lop3.b32 %r21743, %r30948, %r30946, %r30924, 0x96; + lop3.b32 %r21743, %r21743, %r30922, %r30920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21754, %r30943, %r30917, %r30915, 0x96; + lop3.b32 %r21754, %r21754, %r30913, %r30911, 0x96; + lop3.b32 %r21755, %r30944, %r30918, %r30916, 0x96; + lop3.b32 %r21755, %r21755, %r30914, %r30912, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21766, %r30941, %r30909, %r30907, 0x96; + lop3.b32 %r21766, %r21766, %r30905, %r30903, 0x96; + lop3.b32 %r21767, %r30942, %r30910, %r30908, 0x96; + lop3.b32 %r21767, %r21767, %r30906, %r30904, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21778, %r21731, %r21730, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21782, %r21730, %r21731, %r29776; + // end inline asm + xor.b32 %r22212, %r21778, %r21766; + xor.b32 %r22213, %r21782, %r21767; + xor.b32 %r22045, %r30939, %r22212; + xor.b32 %r22048, %r30940, %r22213; + xor.b32 %r21952, %r30937, %r22212; + xor.b32 %r21951, %r30938, %r22213; + xor.b32 %r21999, %r30935, %r22212; + xor.b32 %r22000, %r30936, %r22213; + xor.b32 %r21904, %r30933, %r22212; + xor.b32 %r21903, %r30934, %r22213; + xor.b32 %r21855, %r30931, %r22212; + xor.b32 %r21856, %r30932, %r22213; + // begin inline asm + shf.l.wrap.b32 %r21786, %r21743, %r21742, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21790, %r21742, %r21743, %r29776; + // end inline asm + xor.b32 %r22214, %r21786, %r21718; + xor.b32 %r22215, %r21790, %r21719; + xor.b32 %r22007, %r30951, %r22214; + xor.b32 %r22008, %r30952, %r22215; + xor.b32 %r21824, %r30949, %r22214; + xor.b32 %r21823, %r30950, %r22215; + xor.b32 %r21983, %r30929, %r22214; + xor.b32 %r21984, %r30930, %r22215; + xor.b32 %r21944, %r30927, %r22214; + xor.b32 %r21943, %r30928, %r22215; + xor.b32 %r21927, %r30925, %r22214; + xor.b32 %r21928, %r30926, %r22215; + // begin inline asm + shf.l.wrap.b32 %r21794, %r21755, %r21754, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21798, %r21754, %r21755, %r29776; + // end inline asm + xor.b32 %r22216, %r21794, %r21730; + xor.b32 %r22217, %r21798, %r21731; + xor.b32 %r21864, %r30947, %r22216; + xor.b32 %r21863, %r30948, %r22217; + xor.b32 %r21991, %r30945, %r22216; + xor.b32 %r21992, %r30946, %r22217; + xor.b32 %r21872, 
%r30923, %r22216; + xor.b32 %r21871, %r30924, %r22217; + xor.b32 %r21975, %r30921, %r22216; + xor.b32 %r21976, %r30922, %r22217; + xor.b32 %r21840, %r30919, %r22216; + xor.b32 %r21839, %r30920, %r22217; + // begin inline asm + shf.l.wrap.b32 %r21802, %r21767, %r21766, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21806, %r21766, %r21767, %r29776; + // end inline asm + xor.b32 %r22218, %r21802, %r21742; + xor.b32 %r22219, %r21806, %r21743; + xor.b32 %r21959, %r30943, %r22218; + xor.b32 %r21960, %r30944, %r22219; + xor.b32 %r21936, %r30917, %r22218; + xor.b32 %r21935, %r30918, %r22219; + xor.b32 %r21879, %r30915, %r22218; + xor.b32 %r21880, %r30916, %r22219; + xor.b32 %r21967, %r30913, %r22218; + xor.b32 %r21968, %r30914, %r22219; + xor.b32 %r21896, %r30911, %r22218; + xor.b32 %r21895, %r30912, %r22219; + // begin inline asm + shf.l.wrap.b32 %r21810, %r21719, %r21718, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21814, %r21718, %r21719, %r29776; + // end inline asm + xor.b32 %r22220, %r21810, %r21754; + xor.b32 %r22221, %r21814, %r21755; + xor.b32 %r21911, %r30941, %r22220; + xor.b32 %r21912, %r30942, %r22221; + xor.b32 %r21831, %r30909, %r22220; + xor.b32 %r21832, %r30910, %r22221; + xor.b32 %r21848, %r30907, %r22220; + xor.b32 %r21847, %r30908, %r22221; + xor.b32 %r21887, %r30905, %r22220; + xor.b32 %r21888, %r30906, %r22221; + xor.b32 %r21919, %r30903, %r22220; + xor.b32 %r21920, %r30904, %r22221; + mov.u32 %r21825, 44; + // begin inline asm + shf.l.wrap.b32 %r21818, %r21824, %r21823, %r21825; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21822, %r21823, %r21824, %r21825; + // end inline asm + mov.u32 %r21833, 20; + // begin inline asm + shf.l.wrap.b32 %r21826, %r21832, %r21831, %r21833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21830, %r21831, %r21832, %r21833; + // end inline asm + mov.u32 %r21841, 61; + // begin inline asm + shf.l.wrap.b32 %r21834, %r21840, %r21839, %r21841; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21838, %r21839, %r21840, %r21841; + // end inline asm + mov.u32 %r21849, 39; + // begin inline asm + shf.l.wrap.b32 %r21842, %r21848, %r21847, %r21849; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21846, %r21847, %r21848, %r21849; + // end inline asm + mov.u32 %r21857, 18; + // begin inline asm + shf.l.wrap.b32 %r21850, %r21856, %r21855, %r21857; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21854, %r21855, %r21856, %r21857; + // end inline asm + mov.u32 %r21865, 62; + // begin inline asm + shf.l.wrap.b32 %r21858, %r21864, %r21863, %r21865; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21862, %r21863, %r21864, %r21865; + // end inline asm + mov.u32 %r21873, 43; + // begin inline asm + shf.l.wrap.b32 %r21866, %r21872, %r21871, %r21873; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21870, %r21871, %r21872, %r21873; + // end inline asm + mov.u32 %r21881, 25; + // begin inline asm + shf.l.wrap.b32 %r21874, %r21880, %r21879, %r21881; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21878, %r21879, %r21880, %r21881; + // end inline asm + mov.u32 %r21889, 8; + // begin inline asm + shf.l.wrap.b32 %r21882, %r21888, %r21887, %r21889; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21886, %r21887, %r21888, %r21889; + // end inline asm + mov.u32 %r21897, 56; + // begin inline asm + shf.l.wrap.b32 %r21890, %r21896, %r21895, %r21897; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21894, 
%r21895, %r21896, %r21897; + // end inline asm + mov.u32 %r21905, 41; + // begin inline asm + shf.l.wrap.b32 %r21898, %r21904, %r21903, %r21905; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21902, %r21903, %r21904, %r21905; + // end inline asm + mov.u32 %r21913, 27; + // begin inline asm + shf.l.wrap.b32 %r21906, %r21912, %r21911, %r21913; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21910, %r21911, %r21912, %r21913; + // end inline asm + mov.u32 %r21921, 14; + // begin inline asm + shf.l.wrap.b32 %r21914, %r21920, %r21919, %r21921; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21918, %r21919, %r21920, %r21921; + // end inline asm + mov.u32 %r21929, 2; + // begin inline asm + shf.l.wrap.b32 %r21922, %r21928, %r21927, %r21929; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21926, %r21927, %r21928, %r21929; + // end inline asm + mov.u32 %r21937, 55; + // begin inline asm + shf.l.wrap.b32 %r21930, %r21936, %r21935, %r21937; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21934, %r21935, %r21936, %r21937; + // end inline asm + mov.u32 %r21945, 45; + // begin inline asm + shf.l.wrap.b32 %r21938, %r21944, %r21943, %r21945; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21942, %r21943, %r21944, %r21945; + // end inline asm + mov.u32 %r21953, 36; + // begin inline asm + shf.l.wrap.b32 %r21946, %r21952, %r21951, %r21953; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21950, %r21951, %r21952, %r21953; + // end inline asm + mov.u32 %r21961, 28; + // begin inline asm + shf.l.wrap.b32 %r21954, %r21960, %r21959, %r21961; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21958, %r21959, %r21960, %r21961; + // end inline asm + mov.u32 %r21969, 21; + // begin inline asm + shf.l.wrap.b32 %r21962, %r21968, %r21967, %r21969; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21966, %r21967, %r21968, %r21969; + // end inline asm + mov.u32 %r21977, 15; + // begin inline asm + shf.l.wrap.b32 %r21970, %r21976, %r21975, %r21977; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21974, %r21975, %r21976, %r21977; + // end inline asm + mov.u32 %r21985, 10; + // begin inline asm + shf.l.wrap.b32 %r21978, %r21984, %r21983, %r21985; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21982, %r21983, %r21984, %r21985; + // end inline asm + mov.u32 %r21993, 6; + // begin inline asm + shf.l.wrap.b32 %r21986, %r21992, %r21991, %r21993; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21990, %r21991, %r21992, %r21993; + // end inline asm + mov.u32 %r22001, 3; + // begin inline asm + shf.l.wrap.b32 %r21994, %r22000, %r21999, %r22001; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21998, %r21999, %r22000, %r22001; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22002, %r22008, %r22007, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22006, %r22007, %r22008, %r29776; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22010, %r22045, %r21818, %r21866, 0xD2; + lop3.b32 %r22011, %r22048, %r21822, %r21870, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30951, %r21818, %r21866, %r21962, 0xD2; + lop3.b32 %r30952, %r21822, %r21870, %r21966, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30947, %r21866, %r21962, %r21914, 0xD2; + lop3.b32 %r30948, %r21870, %r21966, %r21918, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30943, %r21962, %r21914, %r22045, 
0xD2; + lop3.b32 %r30944, %r21966, %r21918, %r22048, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30941, %r21914, %r22045, %r21818, 0xD2; + lop3.b32 %r30942, %r21918, %r22048, %r21822, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30937, %r21954, %r21826, %r21994, 0xD2; + lop3.b32 %r30938, %r21958, %r21830, %r21998, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30949, %r21826, %r21994, %r21938, 0xD2; + lop3.b32 %r30950, %r21830, %r21998, %r21942, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30945, %r21994, %r21938, %r21834, 0xD2; + lop3.b32 %r30946, %r21998, %r21942, %r21838, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30917, %r21938, %r21834, %r21954, 0xD2; + lop3.b32 %r30918, %r21942, %r21838, %r21958, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30917, %r30918}; + // begin inline asm + // chi + lop3.b32 %r30909, %r21834, %r21954, %r21826, 0xD2; + lop3.b32 %r30910, %r21838, %r21958, %r21830, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30909, %r30910}; + // begin inline asm + // chi + lop3.b32 %r30935, %r22002, %r21986, %r21874, 0xD2; + lop3.b32 %r30936, %r22006, %r21990, %r21878, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30935, %r30936}; + // begin inline asm + // chi + lop3.b32 %r30929, %r21986, %r21874, %r21882, 0xD2; + lop3.b32 %r30930, %r21990, %r21878, %r21886, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30929, %r30930}; + // begin inline asm + // chi + lop3.b32 %r30923, %r21874, %r21882, %r21850, 0xD2; + lop3.b32 %r30924, %r21878, %r21886, %r21854, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30923, %r30924}; + // begin inline asm + // chi + lop3.b32 %r30915, %r21882, %r21850, %r22002, 0xD2; + lop3.b32 %r30916, %r21886, %r21854, %r22006, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30915, %r30916}; + // begin inline asm + // chi + lop3.b32 %r30907, %r21850, %r22002, %r21986, 0xD2; + lop3.b32 %r30908, %r21854, %r22006, %r21990, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30907, %r30908}; + // begin inline asm + // chi + lop3.b32 %r30933, %r21906, %r21946, %r21978, 0xD2; + lop3.b32 %r30934, %r21910, %r21950, %r21982, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30933, %r30934}; + // begin inline asm + // chi + lop3.b32 %r30927, %r21946, %r21978, %r21970, 0xD2; + lop3.b32 %r30928, %r21950, %r21982, %r21974, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30927, %r30928}; + // begin inline asm + // chi + lop3.b32 %r30921, %r21978, %r21970, %r21890, 0xD2; + lop3.b32 %r30922, %r21982, %r21974, %r21894, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30921, %r30922}; + // begin inline asm + // chi + lop3.b32 %r30913, %r21970, %r21890, %r21906, 0xD2; + lop3.b32 %r30914, %r21974, %r21894, %r21910, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30913, %r30914}; + // begin inline asm + // chi + lop3.b32 %r30905, %r21890, %r21906, %r21946, 0xD2; + lop3.b32 %r30906, %r21894, %r21910, %r21950, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30905, %r30906}; + // begin inline asm + // chi + lop3.b32 %r30931, %r21858, %r21930, %r21842, 0xD2; + lop3.b32 %r30932, %r21862, %r21934, %r21846, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30931, %r30932}; + // begin inline asm + // chi + lop3.b32 %r30925, %r21930, %r21842, %r21898, 0xD2; + lop3.b32 %r30926, %r21934, %r21846, %r21902, 0xD2; + // end inline asm + 
st.local.v2.u32 [%rd2+192], {%r30925, %r30926}; + // begin inline asm + // chi + lop3.b32 %r30919, %r21842, %r21898, %r21922, 0xD2; + lop3.b32 %r30920, %r21846, %r21902, %r21926, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30919, %r30920}; + // begin inline asm + // chi + lop3.b32 %r30911, %r21898, %r21922, %r21858, 0xD2; + lop3.b32 %r30912, %r21902, %r21926, %r21862, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30911, %r30912}; + // begin inline asm + // chi + lop3.b32 %r30903, %r21922, %r21858, %r21930, 0xD2; + lop3.b32 %r30904, %r21926, %r21862, %r21934, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30903, %r30904}; + mul.wide.s32 %rd974, %r30953, 8; + add.s64 %rd973, %rd1280, %rd974; + // begin inline asm + ld.global.nc.v2.u32 {%r22210,%r22211}, [%rd973]; + // end inline asm + xor.b32 %r30939, %r22010, %r22210; + xor.b32 %r30940, %r22011, %r22211; + add.s32 %r30953, %r30953, 1; + setp.lt.u32 %p41, %r30953, 23; + @%p41 bra $L__BB2_71; + + mov.u32 %r29775, 3; + mov.u32 %r29774, 21; + mov.u32 %r29773, 28; + mov.u32 %r29772, 45; + mov.u32 %r29771, 14; + mov.u32 %r29770, 43; + mov.u32 %r29769, 61; + mov.u32 %r29768, 20; + mov.u32 %r29767, 44; + mov.u64 %rd1274, keccak_round_constants; + cvta.const.u64 %rd1273, %rd1274; + add.s64 %rd1272, %rd1273, 184; + st.local.v2.u32 [%rd2+32], {%r30951, %r30952}; + st.local.v2.u32 [%rd2+72], {%r30949, %r30950}; + st.local.v2.u32 [%rd2+40], {%r30947, %r30948}; + st.local.v2.u32 [%rd2+80], {%r30945, %r30946}; + st.local.v2.u32 [%rd2+48], {%r30943, %r30944}; + st.local.v2.u32 [%rd2+56], {%r30941, %r30942}; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + // begin inline asm + // xor5 + lop3.b32 %r22222, %r30939, %r30937, %r30935, 0x96; + lop3.b32 %r22222, %r22222, %r30933, %r30931, 0x96; + lop3.b32 %r22223, %r30940, %r30938, %r30936, 0x96; + lop3.b32 %r22223, %r22223, %r30934, %r30932, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22234, %r30951, %r30949, %r30929, 0x96; + lop3.b32 %r22234, %r22234, %r30927, %r30925, 0x96; + lop3.b32 %r22235, %r30952, %r30950, %r30930, 0x96; + lop3.b32 %r22235, %r22235, %r30928, %r30926, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22246, %r30947, %r30945, %r30923, 0x96; + lop3.b32 %r22246, %r22246, %r30921, %r30919, 0x96; + lop3.b32 %r22247, %r30948, %r30946, %r30924, 0x96; + lop3.b32 %r22247, %r22247, %r30922, %r30920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22258, %r30943, %r30917, %r30915, 0x96; + lop3.b32 %r22258, %r22258, %r30913, %r30911, 0x96; + lop3.b32 %r22259, %r30944, %r30918, %r30916, 0x96; + lop3.b32 %r22259, %r22259, %r30914, %r30912, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22270, %r30941, %r30909, %r30907, 0x96; + lop3.b32 %r22270, %r22270, %r30905, %r30903, 0x96; + lop3.b32 %r22271, %r30942, %r30910, %r30908, 0x96; + lop3.b32 %r22271, %r22271, %r30906, %r30904, 0x96; + // end inline asm + mov.u32 %r30968, 1; + // begin inline asm + shf.l.wrap.b32 %r22282, %r22235, %r22234, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22286, %r22234, %r22235, %r30968; + // end inline asm + xor.b32 %r22501, %r22282, %r22270; + xor.b32 %r22502, %r22286, %r22271; + xor.b32 %r22429, %r30939, %r22501; + xor.b32 %r22432, %r30940, %r22502; + xor.b32 %r22392, %r30936, %r22502; + xor.b32 %r22391, %r30935, %r22501; + st.local.v2.u32 [%rd2+104], {%r22391, %r22392}; + // begin inline asm + shf.l.wrap.b32 %r22290, %r22247, %r22246, %r30968; + // end inline asm 
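+ // NB: the loop above iterates the round function while the round index is
+ // below 23, with the final Keccak-f[1600] round apparently unrolled here;
+ // iota is the xor with the entry loaded from keccak_round_constants. The
+ // xor5 blocks (lop3 immediate 0x96 = a ^ b ^ c) compute the five theta
+ // column parities, and each 64-bit rotate is emulated with a pair of
+ // shf.l.wrap.b32 funnel shifts over the lane's two 32-bit halves.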
+ // begin inline asm + shf.l.wrap.b32 %r22294, %r22246, %r22247, %r30968; + // end inline asm + xor.b32 %r22503, %r22290, %r22222; + xor.b32 %r22504, %r22294, %r22223; + xor.b32 %r22328, %r30949, %r22503; + xor.b32 %r22327, %r30950, %r22504; + xor.b32 %r22367, %r30928, %r22504; + xor.b32 %r22368, %r30927, %r22503; + st.local.v2.u32 [%rd2+152], {%r22368, %r22367}; + // begin inline asm + shf.l.wrap.b32 %r22298, %r22259, %r22258, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22302, %r22258, %r22259, %r30968; + // end inline asm + xor.b32 %r22505, %r22298, %r22234; + xor.b32 %r22506, %r22302, %r22235; + xor.b32 %r22351, %r30924, %r22506; + xor.b32 %r22352, %r30923, %r22505; + st.local.v2.u32 [%rd2+120], {%r22352, %r22351}; + xor.b32 %r22343, %r30920, %r22506; + xor.b32 %r22344, %r30919, %r22505; + st.local.v2.u32 [%rd2+200], {%r22344, %r22343}; + // begin inline asm + shf.l.wrap.b32 %r22306, %r22271, %r22270, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22310, %r22270, %r22271, %r30968; + // end inline asm + xor.b32 %r22507, %r22306, %r22246; + xor.b32 %r22508, %r22310, %r22247; + xor.b32 %r22375, %r30943, %r22507; + xor.b32 %r22376, %r30944, %r22508; + xor.b32 %r22384, %r30914, %r22508; + xor.b32 %r22383, %r30913, %r22507; + st.local.v2.u32 [%rd2+168], {%r22383, %r22384}; + // begin inline asm + shf.l.wrap.b32 %r22314, %r22223, %r22222, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22318, %r22222, %r22223, %r30968; + // end inline asm + xor.b32 %r22509, %r22314, %r22258; + xor.b32 %r22510, %r22318, %r22259; + xor.b32 %r22335, %r30909, %r22509; + xor.b32 %r22336, %r30910, %r22510; + xor.b32 %r22360, %r30904, %r22510; + xor.b32 %r22359, %r30903, %r22509; + st.local.v2.u32 [%rd2+216], {%r22359, %r22360}; + // begin inline asm + shf.l.wrap.b32 %r22322, %r22328, %r22327, %r29767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22326, %r22327, %r22328, %r29767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22330, %r22336, %r22335, %r29768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22334, %r22335, %r22336, %r29768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22342, %r22343, %r22344, %r29769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22338, %r22344, %r22343, %r29769; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r22338, %r22342}; + // begin inline asm + shf.l.wrap.b32 %r22346, %r22352, %r22351, %r29770; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22350, %r22351, %r22352, %r29770; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22354, %r22360, %r22359, %r29771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22358, %r22359, %r22360, %r29771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22366, %r22367, %r22368, %r29772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22362, %r22368, %r22367, %r29772; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r22362, %r22366}; + // begin inline asm + shf.l.wrap.b32 %r22370, %r22376, %r22375, %r29773; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22374, %r22375, %r22376, %r29773; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22378, %r22384, %r22383, %r29774; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22382, %r22383, %r22384, %r29774; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22386, %r22392, %r22391, %r29775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r22390, %r22391, %r22392, %r29775; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22394, %r22429, %r22322, %r22346, 0xD2; + lop3.b32 %r22395, %r22432, %r22326, %r22350, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22402, %r22322, %r22346, %r22378, 0xD2; + lop3.b32 %r22403, %r22326, %r22350, %r22382, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r22402, %r22403}; + // begin inline asm + // chi + lop3.b32 %r22410, %r22346, %r22378, %r22354, 0xD2; + lop3.b32 %r22411, %r22350, %r22382, %r22358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r22410, %r22411}; + // begin inline asm + // chi + lop3.b32 %r22418, %r22378, %r22354, %r22429, 0xD2; + lop3.b32 %r22419, %r22382, %r22358, %r22432, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r22418, %r22419}; + // begin inline asm + // chi + lop3.b32 %r22426, %r22354, %r22429, %r22322, 0xD2; + lop3.b32 %r22427, %r22358, %r22432, %r22326, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r22426, %r22427}; + // begin inline asm + // chi + lop3.b32 %r22434, %r22370, %r22330, %r22386, 0xD2; + lop3.b32 %r22435, %r22374, %r22334, %r22390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r22434, %r22435}; + // begin inline asm + // chi + lop3.b32 %r22442, %r22330, %r22386, %r22362, 0xD2; + lop3.b32 %r22443, %r22334, %r22390, %r22366, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r22442, %r22443}; + // begin inline asm + // chi + lop3.b32 %r22450, %r22386, %r22362, %r22338, 0xD2; + lop3.b32 %r22451, %r22390, %r22366, %r22342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r22450, %r22451}; + // begin inline asm + ld.global.nc.v2.u32 {%r22458,%r22459}, [%rd1272]; + // end inline asm + xor.b32 %r22511, %r22395, %r22459; + xor.b32 %r22512, %r22394, %r22458; + mov.b64 %rd1349, {%r22512, %r22511}; + mov.b64 %rd1350, {%r22402, %r22403}; + mov.b64 %rd1351, {%r22410, %r22411}; + mov.b64 %rd1352, {%r22418, %r22419}; + mov.b64 %rd1353, {%r22426, %r22427}; + mov.b64 %rd1354, {%r22434, %r22435}; + mov.b64 %rd1355, {%r22442, %r22443}; + mov.b64 %rd1356, {%r22450, %r22451}; + mov.u32 %r30954, 0; + st.local.v2.u32 [%rd2+24], {%r22512, %r22511}; + st.local.v2.u32 [%rd908+96], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+104], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+112], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+120], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+128], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+136], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+144], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+152], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+160], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+168], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+176], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+184], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+192], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+200], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+208], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+216], {%r30954, %r30954}; + mov.u32 %r30969, -2147483648; + st.local.v2.u32 [%rd908+88], {%r30968, %r30969}; + mov.u32 %r30955, %r30954; + mov.u32 %r30956, %r30954; + mov.u32 %r30957, %r30954; + mov.u32 %r30958, %r30954; + mov.u32 %r30959, %r30954; + mov.u32 %r30960, %r30954; + mov.u32 %r30961, %r30954; + mov.u32 %r30962, %r30954; + mov.u32 %r30963, %r30954; + mov.u32 %r30964, %r30954; + mov.u32 %r30965, %r30954; + mov.u32 %r30966, %r30954; + mov.u32 %r30967, %r30954; + mov.u32 %r30970, %r30954; + mov.u32 %r30971, %r30954; + mov.u32 
%r30972, %r30954; + mov.u32 %r30973, %r30954; + mov.u32 %r30974, %r30954; + mov.u32 %r30975, %r30954; + mov.u32 %r30976, %r30954; + mov.u32 %r30977, %r30954; + mov.u32 %r30978, %r30954; + mov.u32 %r30979, %r30954; + mov.u32 %r30980, %r30954; + mov.u32 %r30981, %r30954; + mov.u32 %r30982, %r30954; + mov.u32 %r30983, %r30954; + mov.u32 %r30984, %r30954; + mov.u32 %r30985, %r30954; + mov.u32 %r30986, %r30954; + mov.u32 %r30987, %r30954; + mov.u32 %r31004, %r30954; + +$L__BB2_73: + mov.u32 %r29786, 1; + mov.u64 %rd1276, keccak_round_constants; + cvta.const.u64 %rd1275, %rd1276; + // begin inline asm + // xor5 + lop3.b32 %r22513, %r30990, %r30988, %r30986, 0x96; + lop3.b32 %r22513, %r22513, %r30984, %r30982, 0x96; + lop3.b32 %r22514, %r30991, %r30989, %r30987, 0x96; + lop3.b32 %r22514, %r22514, %r30985, %r30983, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22525, %r31002, %r31000, %r30980, 0x96; + lop3.b32 %r22525, %r22525, %r30978, %r30976, 0x96; + lop3.b32 %r22526, %r31003, %r31001, %r30981, 0x96; + lop3.b32 %r22526, %r22526, %r30979, %r30977, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22537, %r30998, %r30996, %r30974, 0x96; + lop3.b32 %r22537, %r22537, %r30972, %r30970, 0x96; + lop3.b32 %r22538, %r30999, %r30997, %r30975, 0x96; + lop3.b32 %r22538, %r22538, %r30973, %r30971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22549, %r30994, %r30968, %r30966, 0x96; + lop3.b32 %r22549, %r22549, %r30964, %r30962, 0x96; + lop3.b32 %r22550, %r30995, %r30969, %r30967, 0x96; + lop3.b32 %r22550, %r22550, %r30965, %r30963, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22561, %r30992, %r30960, %r30958, 0x96; + lop3.b32 %r22561, %r22561, %r30956, %r30954, 0x96; + lop3.b32 %r22562, %r30993, %r30961, %r30959, 0x96; + lop3.b32 %r22562, %r22562, %r30957, %r30955, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22573, %r22526, %r22525, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22577, %r22525, %r22526, %r29786; + // end inline asm + xor.b32 %r23007, %r22573, %r22561; + xor.b32 %r23008, %r22577, %r22562; + xor.b32 %r22840, %r30990, %r23007; + xor.b32 %r22843, %r30991, %r23008; + xor.b32 %r22747, %r30988, %r23007; + xor.b32 %r22746, %r30989, %r23008; + xor.b32 %r22794, %r30986, %r23007; + xor.b32 %r22795, %r30987, %r23008; + xor.b32 %r22699, %r30984, %r23007; + xor.b32 %r22698, %r30985, %r23008; + xor.b32 %r22650, %r30982, %r23007; + xor.b32 %r22651, %r30983, %r23008; + // begin inline asm + shf.l.wrap.b32 %r22581, %r22538, %r22537, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22585, %r22537, %r22538, %r29786; + // end inline asm + xor.b32 %r23009, %r22581, %r22513; + xor.b32 %r23010, %r22585, %r22514; + xor.b32 %r22802, %r31002, %r23009; + xor.b32 %r22803, %r31003, %r23010; + xor.b32 %r22619, %r31000, %r23009; + xor.b32 %r22618, %r31001, %r23010; + xor.b32 %r22778, %r30980, %r23009; + xor.b32 %r22779, %r30981, %r23010; + xor.b32 %r22739, %r30978, %r23009; + xor.b32 %r22738, %r30979, %r23010; + xor.b32 %r22722, %r30976, %r23009; + xor.b32 %r22723, %r30977, %r23010; + // begin inline asm + shf.l.wrap.b32 %r22589, %r22550, %r22549, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22593, %r22549, %r22550, %r29786; + // end inline asm + xor.b32 %r23011, %r22589, %r22525; + xor.b32 %r23012, %r22593, %r22526; + xor.b32 %r22659, %r30998, %r23011; + xor.b32 %r22658, %r30999, %r23012; + xor.b32 %r22786, %r30996, %r23011; + xor.b32 
%r22787, %r30997, %r23012; + xor.b32 %r22667, %r30974, %r23011; + xor.b32 %r22666, %r30975, %r23012; + xor.b32 %r22770, %r30972, %r23011; + xor.b32 %r22771, %r30973, %r23012; + xor.b32 %r22635, %r30970, %r23011; + xor.b32 %r22634, %r30971, %r23012; + // begin inline asm + shf.l.wrap.b32 %r22597, %r22562, %r22561, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22601, %r22561, %r22562, %r29786; + // end inline asm + xor.b32 %r23013, %r22597, %r22537; + xor.b32 %r23014, %r22601, %r22538; + xor.b32 %r22754, %r30994, %r23013; + xor.b32 %r22755, %r30995, %r23014; + xor.b32 %r22731, %r30968, %r23013; + xor.b32 %r22730, %r30969, %r23014; + xor.b32 %r22674, %r30966, %r23013; + xor.b32 %r22675, %r30967, %r23014; + xor.b32 %r22762, %r30964, %r23013; + xor.b32 %r22763, %r30965, %r23014; + xor.b32 %r22691, %r30962, %r23013; + xor.b32 %r22690, %r30963, %r23014; + // begin inline asm + shf.l.wrap.b32 %r22605, %r22514, %r22513, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22609, %r22513, %r22514, %r29786; + // end inline asm + xor.b32 %r23015, %r22605, %r22549; + xor.b32 %r23016, %r22609, %r22550; + xor.b32 %r22706, %r30992, %r23015; + xor.b32 %r22707, %r30993, %r23016; + xor.b32 %r22626, %r30960, %r23015; + xor.b32 %r22627, %r30961, %r23016; + xor.b32 %r22643, %r30958, %r23015; + xor.b32 %r22642, %r30959, %r23016; + xor.b32 %r22682, %r30956, %r23015; + xor.b32 %r22683, %r30957, %r23016; + xor.b32 %r22714, %r30954, %r23015; + xor.b32 %r22715, %r30955, %r23016; + mov.u32 %r22620, 44; + // begin inline asm + shf.l.wrap.b32 %r22613, %r22619, %r22618, %r22620; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22617, %r22618, %r22619, %r22620; + // end inline asm + mov.u32 %r22628, 20; + // begin inline asm + shf.l.wrap.b32 %r22621, %r22627, %r22626, %r22628; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22625, %r22626, %r22627, %r22628; + // end inline asm + mov.u32 %r22636, 61; + // begin inline asm + shf.l.wrap.b32 %r22629, %r22635, %r22634, %r22636; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22633, %r22634, %r22635, %r22636; + // end inline asm + mov.u32 %r22644, 39; + // begin inline asm + shf.l.wrap.b32 %r22637, %r22643, %r22642, %r22644; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22641, %r22642, %r22643, %r22644; + // end inline asm + mov.u32 %r22652, 18; + // begin inline asm + shf.l.wrap.b32 %r22645, %r22651, %r22650, %r22652; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22649, %r22650, %r22651, %r22652; + // end inline asm + mov.u32 %r22660, 62; + // begin inline asm + shf.l.wrap.b32 %r22653, %r22659, %r22658, %r22660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22657, %r22658, %r22659, %r22660; + // end inline asm + mov.u32 %r22668, 43; + // begin inline asm + shf.l.wrap.b32 %r22661, %r22667, %r22666, %r22668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22665, %r22666, %r22667, %r22668; + // end inline asm + mov.u32 %r22676, 25; + // begin inline asm + shf.l.wrap.b32 %r22669, %r22675, %r22674, %r22676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22673, %r22674, %r22675, %r22676; + // end inline asm + mov.u32 %r22684, 8; + // begin inline asm + shf.l.wrap.b32 %r22677, %r22683, %r22682, %r22684; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22681, %r22682, %r22683, %r22684; + // end inline asm + mov.u32 %r22692, 56; + // begin inline asm + shf.l.wrap.b32 %r22685, %r22691, %r22690, %r22692; + // end inline asm + 
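+ // NB: the mov.u32 constants fed to shf.l.wrap.b32 in this stretch (44, 20,
+ // 61, 39, 18, 62, 43, 25, 8, 56, ...) are consistent with the Keccak rho
+ // rotation offsets, applied lane by lane ahead of the chi step.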
// begin inline asm + shf.l.wrap.b32 %r22689, %r22690, %r22691, %r22692; + // end inline asm + mov.u32 %r22700, 41; + // begin inline asm + shf.l.wrap.b32 %r22693, %r22699, %r22698, %r22700; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22697, %r22698, %r22699, %r22700; + // end inline asm + mov.u32 %r22708, 27; + // begin inline asm + shf.l.wrap.b32 %r22701, %r22707, %r22706, %r22708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22705, %r22706, %r22707, %r22708; + // end inline asm + mov.u32 %r22716, 14; + // begin inline asm + shf.l.wrap.b32 %r22709, %r22715, %r22714, %r22716; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22713, %r22714, %r22715, %r22716; + // end inline asm + mov.u32 %r22724, 2; + // begin inline asm + shf.l.wrap.b32 %r22717, %r22723, %r22722, %r22724; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22721, %r22722, %r22723, %r22724; + // end inline asm + mov.u32 %r22732, 55; + // begin inline asm + shf.l.wrap.b32 %r22725, %r22731, %r22730, %r22732; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22729, %r22730, %r22731, %r22732; + // end inline asm + mov.u32 %r22740, 45; + // begin inline asm + shf.l.wrap.b32 %r22733, %r22739, %r22738, %r22740; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22737, %r22738, %r22739, %r22740; + // end inline asm + mov.u32 %r22748, 36; + // begin inline asm + shf.l.wrap.b32 %r22741, %r22747, %r22746, %r22748; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22745, %r22746, %r22747, %r22748; + // end inline asm + mov.u32 %r22756, 28; + // begin inline asm + shf.l.wrap.b32 %r22749, %r22755, %r22754, %r22756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22753, %r22754, %r22755, %r22756; + // end inline asm + mov.u32 %r22764, 21; + // begin inline asm + shf.l.wrap.b32 %r22757, %r22763, %r22762, %r22764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22761, %r22762, %r22763, %r22764; + // end inline asm + mov.u32 %r22772, 15; + // begin inline asm + shf.l.wrap.b32 %r22765, %r22771, %r22770, %r22772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22769, %r22770, %r22771, %r22772; + // end inline asm + mov.u32 %r22780, 10; + // begin inline asm + shf.l.wrap.b32 %r22773, %r22779, %r22778, %r22780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22777, %r22778, %r22779, %r22780; + // end inline asm + mov.u32 %r22788, 6; + // begin inline asm + shf.l.wrap.b32 %r22781, %r22787, %r22786, %r22788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22785, %r22786, %r22787, %r22788; + // end inline asm + mov.u32 %r22796, 3; + // begin inline asm + shf.l.wrap.b32 %r22789, %r22795, %r22794, %r22796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22793, %r22794, %r22795, %r22796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22797, %r22803, %r22802, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22801, %r22802, %r22803, %r29786; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22805, %r22840, %r22613, %r22661, 0xD2; + lop3.b32 %r22806, %r22843, %r22617, %r22665, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31002, %r22613, %r22661, %r22757, 0xD2; + lop3.b32 %r31003, %r22617, %r22665, %r22761, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30998, %r22661, %r22757, %r22709, 0xD2; + lop3.b32 %r30999, %r22665, %r22761, %r22713, 0xD2; + // end inline asm + // begin inline asm + // chi + 
lop3.b32 %r30994, %r22757, %r22709, %r22840, 0xD2; + lop3.b32 %r30995, %r22761, %r22713, %r22843, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30992, %r22709, %r22840, %r22613, 0xD2; + lop3.b32 %r30993, %r22713, %r22843, %r22617, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30988, %r22749, %r22621, %r22789, 0xD2; + lop3.b32 %r30989, %r22753, %r22625, %r22793, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31000, %r22621, %r22789, %r22733, 0xD2; + lop3.b32 %r31001, %r22625, %r22793, %r22737, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30996, %r22789, %r22733, %r22629, 0xD2; + lop3.b32 %r30997, %r22793, %r22737, %r22633, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30968, %r22733, %r22629, %r22749, 0xD2; + lop3.b32 %r30969, %r22737, %r22633, %r22753, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r30968, %r30969}; + // begin inline asm + // chi + lop3.b32 %r30960, %r22629, %r22749, %r22621, 0xD2; + lop3.b32 %r30961, %r22633, %r22753, %r22625, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r30960, %r30961}; + // begin inline asm + // chi + lop3.b32 %r30986, %r22797, %r22781, %r22669, 0xD2; + lop3.b32 %r30987, %r22801, %r22785, %r22673, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+104], {%r30986, %r30987}; + // begin inline asm + // chi + lop3.b32 %r30980, %r22781, %r22669, %r22677, 0xD2; + lop3.b32 %r30981, %r22785, %r22673, %r22681, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+112], {%r30980, %r30981}; + // begin inline asm + // chi + lop3.b32 %r30974, %r22669, %r22677, %r22645, 0xD2; + lop3.b32 %r30975, %r22673, %r22681, %r22649, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+120], {%r30974, %r30975}; + // begin inline asm + // chi + lop3.b32 %r30966, %r22677, %r22645, %r22797, 0xD2; + lop3.b32 %r30967, %r22681, %r22649, %r22801, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+128], {%r30966, %r30967}; + // begin inline asm + // chi + lop3.b32 %r30958, %r22645, %r22797, %r22781, 0xD2; + lop3.b32 %r30959, %r22649, %r22801, %r22785, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+136], {%r30958, %r30959}; + // begin inline asm + // chi + lop3.b32 %r30984, %r22701, %r22741, %r22773, 0xD2; + lop3.b32 %r30985, %r22705, %r22745, %r22777, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+144], {%r30984, %r30985}; + // begin inline asm + // chi + lop3.b32 %r30978, %r22741, %r22773, %r22765, 0xD2; + lop3.b32 %r30979, %r22745, %r22777, %r22769, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+152], {%r30978, %r30979}; + // begin inline asm + // chi + lop3.b32 %r30972, %r22773, %r22765, %r22685, 0xD2; + lop3.b32 %r30973, %r22777, %r22769, %r22689, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+160], {%r30972, %r30973}; + // begin inline asm + // chi + lop3.b32 %r30964, %r22765, %r22685, %r22701, 0xD2; + lop3.b32 %r30965, %r22769, %r22689, %r22705, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+168], {%r30964, %r30965}; + // begin inline asm + // chi + lop3.b32 %r30956, %r22685, %r22701, %r22741, 0xD2; + lop3.b32 %r30957, %r22689, %r22705, %r22745, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+176], {%r30956, %r30957}; + // begin inline asm + // chi + lop3.b32 %r30982, %r22653, %r22725, %r22637, 0xD2; + lop3.b32 %r30983, %r22657, %r22729, %r22641, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+184], {%r30982, %r30983}; + // begin inline asm + // chi + lop3.b32 %r30976, %r22725, %r22637, %r22693, 0xD2; + 
lop3.b32 %r30977, %r22729, %r22641, %r22697, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+192], {%r30976, %r30977}; + // begin inline asm + // chi + lop3.b32 %r30970, %r22637, %r22693, %r22717, 0xD2; + lop3.b32 %r30971, %r22641, %r22697, %r22721, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+200], {%r30970, %r30971}; + // begin inline asm + // chi + lop3.b32 %r30962, %r22693, %r22717, %r22653, 0xD2; + lop3.b32 %r30963, %r22697, %r22721, %r22657, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+208], {%r30962, %r30963}; + // begin inline asm + // chi + lop3.b32 %r30954, %r22717, %r22653, %r22725, 0xD2; + lop3.b32 %r30955, %r22721, %r22657, %r22729, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+216], {%r30954, %r30955}; + mul.wide.s32 %rd985, %r31004, 8; + add.s64 %rd984, %rd1275, %rd985; + // begin inline asm + ld.global.nc.v2.u32 {%r23005,%r23006}, [%rd984]; + // end inline asm + xor.b32 %r30990, %r22805, %r23005; + xor.b32 %r30991, %r22806, %r23006; + add.s32 %r31004, %r31004, 1; + setp.lt.u32 %p42, %r31004, 23; + @%p42 bra $L__BB2_73; + + mov.u32 %r29785, 3; + mov.u32 %r29784, 21; + mov.u32 %r29783, 28; + mov.u32 %r29782, 45; + mov.u32 %r29781, 14; + mov.u32 %r29780, 43; + mov.u32 %r29779, 61; + mov.u32 %r29778, 20; + mov.u32 %r29777, 44; + mov.u64 %rd1279, keccak_round_constants; + cvta.const.u64 %rd1278, %rd1279; + add.s64 %rd1277, %rd1278, 184; + mov.u32 %r23116, 1; + st.local.v2.u32 [%rd908+32], {%r31002, %r31003}; + st.local.v2.u32 [%rd908+72], {%r31000, %r31001}; + st.local.v2.u32 [%rd908+40], {%r30998, %r30999}; + st.local.v2.u32 [%rd908+80], {%r30996, %r30997}; + st.local.v2.u32 [%rd908+48], {%r30994, %r30995}; + st.local.v2.u32 [%rd908+56], {%r30992, %r30993}; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + // begin inline asm + // xor5 + lop3.b32 %r23017, %r30990, %r30988, %r30986, 0x96; + lop3.b32 %r23017, %r23017, %r30984, %r30982, 0x96; + lop3.b32 %r23018, %r30991, %r30989, %r30987, 0x96; + lop3.b32 %r23018, %r23018, %r30985, %r30983, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23029, %r31002, %r31000, %r30980, 0x96; + lop3.b32 %r23029, %r23029, %r30978, %r30976, 0x96; + lop3.b32 %r23030, %r31003, %r31001, %r30981, 0x96; + lop3.b32 %r23030, %r23030, %r30979, %r30977, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23041, %r30998, %r30996, %r30974, 0x96; + lop3.b32 %r23041, %r23041, %r30972, %r30970, 0x96; + lop3.b32 %r23042, %r30999, %r30997, %r30975, 0x96; + lop3.b32 %r23042, %r23042, %r30973, %r30971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23053, %r30994, %r30968, %r30966, 0x96; + lop3.b32 %r23053, %r23053, %r30964, %r30962, 0x96; + lop3.b32 %r23054, %r30995, %r30969, %r30967, 0x96; + lop3.b32 %r23054, %r23054, %r30965, %r30963, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23065, %r30992, %r30960, %r30958, 0x96; + lop3.b32 %r23065, %r23065, %r30956, %r30954, 0x96; + lop3.b32 %r23066, %r30993, %r30961, %r30959, 0x96; + lop3.b32 %r23066, %r23066, %r30957, %r30955, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23077, %r23030, %r23029, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23081, %r23029, %r23030, %r23116; + // end inline asm + xor.b32 %r23255, %r23077, %r23065; + xor.b32 %r23256, %r23081, %r23066; + xor.b32 %r23224, %r30990, %r23255; + xor.b32 %r23227, %r30991, %r23256; + xor.b32 %r23187, %r30987, %r23256; + xor.b32 %r23186, %r30986, %r23255; + st.local.v2.u32 [%rd908+104], {%r23186, 
%r23187}; + // begin inline asm + shf.l.wrap.b32 %r23085, %r23042, %r23041, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23089, %r23041, %r23042, %r23116; + // end inline asm + xor.b32 %r23257, %r23085, %r23017; + xor.b32 %r23258, %r23089, %r23018; + xor.b32 %r23123, %r31000, %r23257; + xor.b32 %r23122, %r31001, %r23258; + xor.b32 %r23162, %r30979, %r23258; + xor.b32 %r23163, %r30978, %r23257; + st.local.v2.u32 [%rd908+152], {%r23163, %r23162}; + // begin inline asm + shf.l.wrap.b32 %r23093, %r23054, %r23053, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23097, %r23053, %r23054, %r23116; + // end inline asm + xor.b32 %r23259, %r23093, %r23029; + xor.b32 %r23260, %r23097, %r23030; + xor.b32 %r23146, %r30975, %r23260; + xor.b32 %r23147, %r30974, %r23259; + st.local.v2.u32 [%rd908+120], {%r23147, %r23146}; + xor.b32 %r23138, %r30971, %r23260; + xor.b32 %r23139, %r30970, %r23259; + st.local.v2.u32 [%rd908+200], {%r23139, %r23138}; + // begin inline asm + shf.l.wrap.b32 %r23101, %r23066, %r23065, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23105, %r23065, %r23066, %r23116; + // end inline asm + xor.b32 %r23261, %r23101, %r23041; + xor.b32 %r23262, %r23105, %r23042; + xor.b32 %r23170, %r30994, %r23261; + xor.b32 %r23171, %r30995, %r23262; + xor.b32 %r23179, %r30965, %r23262; + xor.b32 %r23178, %r30964, %r23261; + st.local.v2.u32 [%rd908+168], {%r23178, %r23179}; + // begin inline asm + shf.l.wrap.b32 %r23109, %r23018, %r23017, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23113, %r23017, %r23018, %r23116; + // end inline asm + xor.b32 %r23263, %r23109, %r23053; + xor.b32 %r23264, %r23113, %r23054; + xor.b32 %r23130, %r30960, %r23263; + xor.b32 %r23131, %r30961, %r23264; + xor.b32 %r23155, %r30955, %r23264; + xor.b32 %r23154, %r30954, %r23263; + st.local.v2.u32 [%rd908+216], {%r23154, %r23155}; + // begin inline asm + shf.l.wrap.b32 %r23117, %r23123, %r23122, %r29777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23121, %r23122, %r23123, %r29777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23125, %r23131, %r23130, %r29778; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23129, %r23130, %r23131, %r29778; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23137, %r23138, %r23139, %r29779; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23133, %r23139, %r23138, %r29779; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r23133, %r23137}; + // begin inline asm + shf.l.wrap.b32 %r23141, %r23147, %r23146, %r29780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23145, %r23146, %r23147, %r29780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23149, %r23155, %r23154, %r29781; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23153, %r23154, %r23155, %r29781; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23161, %r23162, %r23163, %r29782; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23157, %r23163, %r23162, %r29782; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r23157, %r23161}; + // begin inline asm + shf.l.wrap.b32 %r23165, %r23171, %r23170, %r29783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23169, %r23170, %r23171, %r29783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23173, %r23179, %r23178, %r29784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23177, %r23178, %r23179, %r29784; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r23181, %r23187, %r23186, %r29785; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23185, %r23186, %r23187, %r29785; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23189, %r23224, %r23117, %r23141, 0xD2; + lop3.b32 %r23190, %r23227, %r23121, %r23145, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23197, %r23117, %r23141, %r23173, 0xD2; + lop3.b32 %r23198, %r23121, %r23145, %r23177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+32], {%r23197, %r23198}; + // begin inline asm + // chi + lop3.b32 %r23205, %r23141, %r23173, %r23149, 0xD2; + lop3.b32 %r23206, %r23145, %r23177, %r23153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+40], {%r23205, %r23206}; + // begin inline asm + // chi + lop3.b32 %r23213, %r23173, %r23149, %r23224, 0xD2; + lop3.b32 %r23214, %r23177, %r23153, %r23227, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+48], {%r23213, %r23214}; + // begin inline asm + // chi + lop3.b32 %r23221, %r23149, %r23224, %r23117, 0xD2; + lop3.b32 %r23222, %r23153, %r23227, %r23121, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+56], {%r23221, %r23222}; + // begin inline asm + // chi + lop3.b32 %r23229, %r23165, %r23125, %r23181, 0xD2; + lop3.b32 %r23230, %r23169, %r23129, %r23185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+64], {%r23229, %r23230}; + // begin inline asm + // chi + lop3.b32 %r23237, %r23125, %r23181, %r23157, 0xD2; + lop3.b32 %r23238, %r23129, %r23185, %r23161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+72], {%r23237, %r23238}; + // begin inline asm + // chi + lop3.b32 %r23245, %r23181, %r23157, %r23133, 0xD2; + lop3.b32 %r23246, %r23185, %r23161, %r23137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+80], {%r23245, %r23246}; + // begin inline asm + ld.global.nc.v2.u32 {%r23253,%r23254}, [%rd1277]; + // end inline asm + xor.b32 %r23265, %r23190, %r23254; + xor.b32 %r23266, %r23189, %r23253; + st.local.v2.u32 [%rd908+24], {%r23266, %r23265}; + mov.b64 %rd1358, {%r23197, %r23198}; + mov.b64 %rd1359, {%r23205, %r23206}; + mov.b64 %rd1362, {%r23229, %r23230}; + mov.b64 %rd1363, {%r23237, %r23238}; + mov.b64 %rd1364, {%r23245, %r23246}; + mov.b64 %rd1357, {%r23266, %r23265}; + mov.b64 %rd1360, {%r23213, %r23214}; + mov.b64 %rd1361, {%r23221, %r23222}; + st.global.u64 [%rd222], %rd1349; + st.global.u64 [%rd222+8], %rd1350; + st.global.u64 [%rd222+16], %rd1351; + st.global.u64 [%rd222+24], %rd1352; + st.global.u64 [%rd222+32], %rd1353; + st.global.u64 [%rd222+40], %rd1354; + st.global.u64 [%rd222+48], %rd1355; + st.global.u64 [%rd222+56], %rd1356; + st.global.v2.u32 [%rd222+64], {%r23266, %r23265}; + st.global.v2.u32 [%rd222+72], {%r23197, %r23198}; + st.global.v2.u32 [%rd222+80], {%r23205, %r23206}; + st.global.v2.u32 [%rd222+88], {%r23213, %r23214}; + st.global.v2.u32 [%rd222+96], {%r23221, %r23222}; + st.global.v2.u32 [%rd222+104], {%r23229, %r23230}; + st.global.v2.u32 [%rd222+112], {%r23237, %r23238}; + st.global.v2.u32 [%rd222+120], {%r23245, %r23246}; + +$L__BB2_86: + mul.lo.s32 %r26551, %r12, 16777619; + mov.b64 {%r26552, %r26553}, %rd1333; + mul.lo.s32 %r26554, %r13, 16777619; + xor.b32 %r26555, %r26551, %r26552; + xor.b32 %r26556, %r26554, %r26553; + mov.b64 %rd1099, {%r26555, %r26556}; + mov.b64 {%r26557, %r26558}, %rd1349; + xor.b32 %r26559, %r26558, %r13; + xor.b32 %r26560, %r26557, %r12; + mov.b64 %rd1100, {%r26560, %r26559}; + mul.lo.s32 %r26561, %r14, 16777619; + mov.b64 {%r26562, %r26563}, %rd1334; + mul.lo.s32 %r26564, %r15, 16777619; + 
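+ // NB: 16777619 is the 32-bit FNV prime (0x01000193); the multiply/xor
+ // pattern in this block appears to implement an FNV-style mix of the hash
+ // output words with the loop-carried 64-bit state.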
xor.b32 %r26565, %r26564, %r26563; + xor.b32 %r26566, %r26561, %r26562; + mov.b64 %rd1101, {%r26566, %r26565}; + mov.b64 {%r26567, %r26568}, %rd1350; + xor.b32 %r26569, %r26568, %r15; + xor.b32 %r26570, %r26567, %r14; + mov.b64 %rd1102, {%r26570, %r26569}; + mul.lo.s32 %r26571, %r16, 16777619; + mov.b64 {%r26572, %r26573}, %rd1335; + mul.lo.s32 %r26574, %r17, 16777619; + xor.b32 %r26575, %r26574, %r26573; + xor.b32 %r26576, %r26571, %r26572; + mov.b64 %rd1103, {%r26576, %r26575}; + mov.b64 {%r26577, %r26578}, %rd1351; + xor.b32 %r26579, %r26578, %r17; + xor.b32 %r26580, %r26577, %r16; + mov.b64 %rd1104, {%r26580, %r26579}; + mul.lo.s32 %r26581, %r18, 16777619; + mov.b64 {%r26582, %r26583}, %rd1336; + mul.lo.s32 %r26584, %r19, 16777619; + xor.b32 %r26585, %r26584, %r26583; + xor.b32 %r26586, %r26581, %r26582; + mov.b64 %rd1105, {%r26586, %r26585}; + mov.b64 {%r26587, %r26588}, %rd1352; + xor.b32 %r26589, %r26588, %r19; + xor.b32 %r26590, %r26587, %r18; + mov.b64 %rd1106, {%r26590, %r26589}; + mul.lo.s32 %r26591, %r20, 16777619; + mov.b64 {%r26592, %r26593}, %rd1337; + mul.lo.s32 %r26594, %r21, 16777619; + xor.b32 %r26595, %r26594, %r26593; + xor.b32 %r26596, %r26591, %r26592; + mov.b64 %rd1107, {%r26596, %r26595}; + mov.b64 {%r26597, %r26598}, %rd1353; + xor.b32 %r26599, %r26598, %r21; + xor.b32 %r26600, %r26597, %r20; + mov.b64 %rd1108, {%r26600, %r26599}; + mul.lo.s32 %r26601, %r22, 16777619; + mov.b64 {%r26602, %r26603}, %rd1338; + mul.lo.s32 %r26604, %r23, 16777619; + xor.b32 %r26605, %r26604, %r26603; + xor.b32 %r26606, %r26601, %r26602; + mov.b64 %rd1109, {%r26606, %r26605}; + mov.b64 {%r26607, %r26608}, %rd1354; + xor.b32 %r26609, %r26608, %r23; + xor.b32 %r26610, %r26607, %r22; + mov.b64 %rd1110, {%r26610, %r26609}; + mul.lo.s32 %r26611, %r24, 16777619; + mov.b64 {%r26612, %r26613}, %rd1339; + mul.lo.s32 %r26614, %r25, 16777619; + xor.b32 %r26615, %r26614, %r26613; + xor.b32 %r26616, %r26611, %r26612; + mov.b64 %rd1111, {%r26616, %r26615}; + mov.b64 {%r26617, %r26618}, %rd1355; + xor.b32 %r26619, %r26618, %r25; + xor.b32 %r26620, %r26617, %r24; + mov.b64 %rd1112, {%r26620, %r26619}; + mul.lo.s32 %r26621, %r26, 16777619; + mov.b64 {%r26622, %r26623}, %rd1340; + mul.lo.s32 %r26624, %r27, 16777619; + xor.b32 %r26625, %r26624, %r26623; + xor.b32 %r26626, %r26621, %r26622; + mov.b64 %rd1113, {%r26626, %r26625}; + mov.b64 {%r26627, %r26628}, %rd1356; + xor.b32 %r26629, %r26628, %r27; + xor.b32 %r26630, %r26627, %r26; + mov.b64 %rd1114, {%r26630, %r26629}; + mul.lo.s32 %r26631, %r28, 16777619; + mov.b64 {%r26632, %r26633}, %rd1341; + mul.lo.s32 %r26634, %r29, 16777619; + xor.b32 %r26635, %r26634, %r26633; + xor.b32 %r26636, %r26631, %r26632; + mov.b64 %rd1115, {%r26636, %r26635}; + mov.b64 {%r26637, %r26638}, %rd1357; + xor.b32 %r26639, %r26638, %r29; + xor.b32 %r26640, %r26637, %r28; + mov.b64 %rd1116, {%r26640, %r26639}; + mul.lo.s32 %r26641, %r30, 16777619; + mov.b64 {%r26642, %r26643}, %rd1342; + mul.lo.s32 %r26644, %r31, 16777619; + xor.b32 %r26645, %r26644, %r26643; + xor.b32 %r26646, %r26641, %r26642; + mov.b64 %rd1117, {%r26646, %r26645}; + mov.b64 {%r26647, %r26648}, %rd1358; + xor.b32 %r26649, %r26648, %r31; + xor.b32 %r26650, %r26647, %r30; + mov.b64 %rd1118, {%r26650, %r26649}; + mul.lo.s32 %r26651, %r32, 16777619; + mov.b64 {%r26652, %r26653}, %rd1343; + mul.lo.s32 %r26654, %r33, 16777619; + xor.b32 %r26655, %r26654, %r26653; + xor.b32 %r26656, %r26651, %r26652; + mov.b64 %rd1119, {%r26656, %r26655}; + mov.b64 {%r26657, %r26658}, %rd1359; + xor.b32 %r26659, %r26658, %r33; + 
xor.b32 %r26660, %r26657, %r32; + mov.b64 %rd1120, {%r26660, %r26659}; + mul.lo.s32 %r26661, %r34, 16777619; + mov.b64 {%r26662, %r26663}, %rd1344; + mul.lo.s32 %r26664, %r35, 16777619; + xor.b32 %r26665, %r26664, %r26663; + xor.b32 %r26666, %r26661, %r26662; + mov.b64 %rd1121, {%r26666, %r26665}; + mov.b64 {%r26667, %r26668}, %rd1360; + xor.b32 %r26669, %r26668, %r35; + xor.b32 %r26670, %r26667, %r34; + mov.b64 %rd1122, {%r26670, %r26669}; + mul.lo.s32 %r26671, %r36, 16777619; + mov.b64 {%r26672, %r26673}, %rd1345; + mul.lo.s32 %r26674, %r37, 16777619; + xor.b32 %r26675, %r26674, %r26673; + xor.b32 %r26676, %r26671, %r26672; + mov.b64 %rd1123, {%r26676, %r26675}; + mov.b64 {%r26677, %r26678}, %rd1361; + xor.b32 %r26679, %r26678, %r37; + xor.b32 %r26680, %r26677, %r36; + mov.b64 %rd1124, {%r26680, %r26679}; + mul.lo.s32 %r26681, %r38, 16777619; + mov.b64 {%r26682, %r26683}, %rd1346; + mul.lo.s32 %r26684, %r39, 16777619; + xor.b32 %r26685, %r26684, %r26683; + xor.b32 %r26686, %r26681, %r26682; + mov.b64 %rd1125, {%r26686, %r26685}; + mov.b64 {%r26687, %r26688}, %rd1362; + xor.b32 %r26689, %r26688, %r39; + xor.b32 %r26690, %r26687, %r38; + mov.b64 %rd1126, {%r26690, %r26689}; + mul.lo.s32 %r26691, %r40, 16777619; + mov.b64 {%r26692, %r26693}, %rd1347; + mul.lo.s32 %r26694, %r41, 16777619; + xor.b32 %r26695, %r26694, %r26693; + xor.b32 %r26696, %r26691, %r26692; + mov.b64 %rd1127, {%r26696, %r26695}; + mov.b64 {%r26697, %r26698}, %rd1363; + xor.b32 %r26699, %r26698, %r41; + xor.b32 %r26700, %r26697, %r40; + mov.b64 %rd1128, {%r26700, %r26699}; + mul.lo.s32 %r26701, %r42, 16777619; + mov.b64 {%r26702, %r26703}, %rd1348; + mul.lo.s32 %r26704, %r43, 16777619; + xor.b32 %r26705, %r26704, %r26703; + xor.b32 %r26706, %r26701, %r26702; + mov.b64 %rd1129, {%r26706, %r26705}; + mov.b64 {%r26707, %r26708}, %rd1364; + xor.b32 %r26709, %r26708, %r43; + xor.b32 %r26710, %r26707, %r42; + mov.b64 %rd1130, {%r26710, %r26709}; + mul.lo.s64 %rd1131, %rd1317, %rd1099; + add.s64 %rd1316, %rd1131, %rd1100; + mul.lo.s64 %rd1132, %rd1318, %rd1101; + add.s64 %rd1315, %rd1132, %rd1102; + mul.lo.s64 %rd1133, %rd1319, %rd1103; + add.s64 %rd1314, %rd1133, %rd1104; + mul.lo.s64 %rd1134, %rd1320, %rd1105; + add.s64 %rd1313, %rd1134, %rd1106; + mul.lo.s64 %rd1135, %rd1321, %rd1107; + add.s64 %rd1312, %rd1135, %rd1108; + mul.lo.s64 %rd1136, %rd1322, %rd1109; + add.s64 %rd1311, %rd1136, %rd1110; + mul.lo.s64 %rd1137, %rd1323, %rd1111; + add.s64 %rd1310, %rd1137, %rd1112; + mul.lo.s64 %rd1138, %rd1324, %rd1113; + add.s64 %rd1309, %rd1138, %rd1114; + mul.lo.s64 %rd1139, %rd1325, %rd1115; + add.s64 %rd1308, %rd1139, %rd1116; + mul.lo.s64 %rd1140, %rd1326, %rd1117; + add.s64 %rd1307, %rd1140, %rd1118; + mul.lo.s64 %rd1141, %rd1327, %rd1119; + add.s64 %rd1306, %rd1141, %rd1120; + mul.lo.s64 %rd1142, %rd1328, %rd1121; + add.s64 %rd1305, %rd1142, %rd1122; + mul.lo.s64 %rd1143, %rd1329, %rd1123; + add.s64 %rd1304, %rd1143, %rd1124; + mul.lo.s64 %rd1144, %rd1330, %rd1125; + add.s64 %rd1303, %rd1144, %rd1126; + mul.lo.s64 %rd1145, %rd1331, %rd1127; + add.s64 %rd1302, %rd1145, %rd1128; + mul.lo.s64 %rd1146, %rd1332, %rd1129; + add.s64 %rd1301, %rd1146, %rd1130; + add.s32 %r29819, %r29819, 1; + setp.lt.u32 %p48, %r29819, 32; + @%p48 bra $L__BB2_11; + + add.u64 %rd1259, %SPL, 2000; + add.u64 %rd1256, %SP, 2000; + add.u64 %rd1255, %SP, 0; + mov.u64 %rd1147, 0; + mov.b64 {%r26711, %r26712}, %rd1316; + mul.lo.s32 %r26713, %r26711, 16777619; + xor.b32 %r26714, %r26713, %r26712; + mul.lo.s32 %r26715, %r26714, 16777619; + mov.b64 {%r26716, 
%r26717}, %rd1315; + xor.b32 %r26718, %r26715, %r26716; + mul.lo.s32 %r26719, %r26718, 16777619; + mov.b64 {%r26720, %r26721}, %rd1314; + mul.lo.s32 %r26722, %r26720, 16777619; + xor.b32 %r26723, %r26722, %r26721; + mul.lo.s32 %r26724, %r26723, 16777619; + mov.b64 {%r26725, %r26726}, %rd1313; + xor.b32 %r26727, %r26724, %r26725; + mul.lo.s32 %r26728, %r26727, 16777619; + mov.b64 {%r26729, %r26730}, %rd1312; + mul.lo.s32 %r26731, %r26729, 16777619; + xor.b32 %r26732, %r26731, %r26730; + mul.lo.s32 %r26733, %r26732, 16777619; + mov.b64 {%r26734, %r26735}, %rd1311; + xor.b32 %r26736, %r26733, %r26734; + mul.lo.s32 %r26737, %r26736, 16777619; + mov.b64 {%r26738, %r26739}, %rd1310; + mul.lo.s32 %r26740, %r26738, 16777619; + xor.b32 %r26741, %r26740, %r26739; + mul.lo.s32 %r26742, %r26741, 16777619; + mov.b64 {%r26743, %r26744}, %rd1309; + xor.b32 %r26745, %r26742, %r26743; + mul.lo.s32 %r26746, %r26745, 16777619; + mov.b64 {%r26747, %r26748}, %rd1308; + mul.lo.s32 %r26749, %r26747, 16777619; + xor.b32 %r26750, %r26749, %r26748; + mul.lo.s32 %r26751, %r26750, 16777619; + mov.b64 {%r26752, %r26753}, %rd1307; + xor.b32 %r26754, %r26751, %r26752; + mul.lo.s32 %r26755, %r26754, 16777619; + mov.b64 {%r26756, %r26757}, %rd1306; + mul.lo.s32 %r26758, %r26756, 16777619; + xor.b32 %r26759, %r26758, %r26757; + mul.lo.s32 %r26760, %r26759, 16777619; + mov.b64 {%r26761, %r26762}, %rd1305; + xor.b32 %r26763, %r26760, %r26761; + mul.lo.s32 %r26764, %r26763, 16777619; + mov.b64 {%r26765, %r26766}, %rd1304; + mul.lo.s32 %r26767, %r26765, 16777619; + xor.b32 %r26768, %r26767, %r26766; + mul.lo.s32 %r26769, %r26768, 16777619; + mov.b64 {%r26770, %r26771}, %rd1303; + xor.b32 %r26772, %r26769, %r26770; + mul.lo.s32 %r26773, %r26772, 16777619; + mov.b64 {%r26774, %r26775}, %rd1302; + mul.lo.s32 %r26776, %r26774, 16777619; + xor.b32 %r26777, %r26776, %r26775; + mul.lo.s32 %r26778, %r26777, 16777619; + mov.b64 {%r26779, %r26780}, %rd1301; + xor.b32 %r26781, %r26778, %r26779; + mul.lo.s32 %r26782, %r26781, 16777619; + mov.u32 %r26783, 0; + st.local.v4.u32 [%rd1259+32], {%r26783, %r26783, %r26783, %r26783}; + st.local.v4.u32 [%rd1259+48], {%r26783, %r26783, %r26783, %r26783}; + st.local.v4.u32 [%rd1259+64], {%r26783, %r26783, %r26783, %r26783}; + xor.b32 %r26784, %r26746, %r26744; + xor.b32 %r26785, %r26737, %r26735; + xor.b32 %r26786, %r26728, %r26726; + xor.b32 %r26787, %r26719, %r26717; + st.local.v4.u32 [%rd1259], {%r26787, %r26786, %r26785, %r26784}; + xor.b32 %r26788, %r26782, %r26780; + xor.b32 %r26789, %r26773, %r26771; + xor.b32 %r26790, %r26764, %r26762; + xor.b32 %r26791, %r26755, %r26753; + st.local.v4.u32 [%rd1259+16], {%r26791, %r26790, %r26789, %r26788}; + mov.u32 %r26792, -1150833019; + mov.u32 %r26793, 1779033703; + st.local.v2.u32 [%rd2], {%r26793, %r26792}; + mov.u32 %r26794, -1521486534; + mov.u32 %r26795, 1013904242; + st.local.v2.u32 [%rd2+8], {%r26795, %r26794}; + mov.u32 %r26796, -1694144372; + mov.u32 %r26797, 1359893119; + st.local.v2.u32 [%rd2+16], {%r26797, %r26796}; + mov.u32 %r26798, 1541459225; + mov.u32 %r26799, 528734635; + st.local.v2.u32 [%rd2+24], {%r26799, %r26798}; + st.local.v2.u32 [%rd2+32], {%r26793, %r26792}; + st.local.v2.u32 [%rd2+40], {%r26795, %r26794}; + st.local.v2.u32 [%rd2+48], {%r26797, %r26796}; + st.local.v2.u32 [%rd2+56], {%r26799, %r26798}; + st.local.u64 [%rd2+64], %rd1147; + st.local.v2.u32 [%rd2+72], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+80], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+88], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+96], {%r26783, 
%r26783}; + st.local.v2.u32 [%rd2+104], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+112], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+120], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+128], {%r26783, %r26783}; + mov.u16 %rs498, 0; + st.local.v2.u8 [%rd2+136], {%rs498, %rs498}; + st.local.u8 [%rd2+138], %rs498; + st.local.u8 [%rd2+144], %rs498; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1255; + .param .b64 param1; + st.param.b64 [param1+0], %rd1256; + call.uni + _Z20blake3_hasher_updateP13blake3_hasherPKvy, + ( + param0, + param1 + ); + } // callseq 3 + ld.local.u8 %rd1367, [%rd2+144]; + setp.eq.s64 %p49, %rd1367, 0; + @%p49 bra $L__BB2_95; + + ld.local.v2.u8 {%rs862, %rs500}, [%rd2+136]; + cvt.u32.u16 %r26800, %rs500; + mul.wide.u32 %rd1151, %r26800, 64; + cvt.u64.u16 %rd1152, %rs862; + neg.s64 %rd1153, %rd1152; + setp.eq.s64 %p50, %rd1151, %rd1153; + @%p50 bra $L__BB2_90; + bra.uni $L__BB2_89; + +$L__BB2_90: + add.s64 %rd1367, %rd1367, -2; + shl.b64 %rd1155, %rd1367, 5; + add.s64 %rd1158, %rd2, %rd1155; + ld.local.u8 %rs665, [%rd2+138]; + mov.u64 %rd1368, 0; + or.b16 %rs732, %rs665, 4; + ld.local.v2.u32 {%r31257, %r31256}, [%rd2]; + ld.local.v2.u32 {%r31255, %r31254}, [%rd2+8]; + ld.local.v2.u32 {%r31253, %r31252}, [%rd2+16]; + ld.local.v2.u32 {%r31251, %r31250}, [%rd2+24]; + ld.local.u8 %rs798, [%rd1158+145]; + ld.local.u8 %rs799, [%rd1158+146]; + ld.local.u8 %rs800, [%rd1158+147]; + ld.local.u8 %rs801, [%rd1158+148]; + ld.local.u8 %rs802, [%rd1158+149]; + ld.local.u8 %rs803, [%rd1158+150]; + ld.local.u8 %rs804, [%rd1158+151]; + ld.local.u8 %rs805, [%rd1158+152]; + ld.local.u8 %rs806, [%rd1158+153]; + ld.local.u8 %rs807, [%rd1158+154]; + ld.local.u8 %rs808, [%rd1158+155]; + ld.local.u8 %rs809, [%rd1158+156]; + ld.local.u8 %rs810, [%rd1158+157]; + ld.local.u8 %rs811, [%rd1158+158]; + ld.local.u8 %rs812, [%rd1158+159]; + ld.local.u8 %rs813, [%rd1158+160]; + ld.local.u8 %rs814, [%rd1158+161]; + ld.local.u8 %rs815, [%rd1158+162]; + ld.local.u8 %rs816, [%rd1158+163]; + ld.local.u8 %rs817, [%rd1158+164]; + ld.local.u8 %rs818, [%rd1158+165]; + ld.local.u8 %rs819, [%rd1158+166]; + ld.local.u8 %rs820, [%rd1158+167]; + ld.local.u8 %rs821, [%rd1158+168]; + ld.local.u8 %rs822, [%rd1158+169]; + ld.local.u8 %rs823, [%rd1158+170]; + ld.local.u8 %rs824, [%rd1158+171]; + ld.local.u8 %rs825, [%rd1158+172]; + ld.local.u8 %rs826, [%rd1158+173]; + ld.local.u8 %rs827, [%rd1158+174]; + ld.local.u8 %rs828, [%rd1158+175]; + ld.local.u8 %rs829, [%rd1158+176]; + ld.local.u8 %rs830, [%rd1158+177]; + ld.local.u8 %rs831, [%rd1158+178]; + ld.local.u8 %rs832, [%rd1158+179]; + ld.local.u8 %rs833, [%rd1158+180]; + ld.local.u8 %rs834, [%rd1158+181]; + ld.local.u8 %rs835, [%rd1158+182]; + ld.local.u8 %rs836, [%rd1158+183]; + ld.local.u8 %rs837, [%rd1158+184]; + ld.local.u8 %rs838, [%rd1158+185]; + ld.local.u8 %rs839, [%rd1158+186]; + ld.local.u8 %rs840, [%rd1158+187]; + ld.local.u8 %rs841, [%rd1158+188]; + ld.local.u8 %rs842, [%rd1158+189]; + ld.local.u8 %rs843, [%rd1158+190]; + ld.local.u8 %rs844, [%rd1158+191]; + ld.local.u8 %rs845, [%rd1158+192]; + ld.local.u8 %rs846, [%rd1158+193]; + ld.local.u8 %rs847, [%rd1158+194]; + ld.local.u8 %rs848, [%rd1158+195]; + ld.local.u8 %rs849, [%rd1158+196]; + ld.local.u8 %rs850, [%rd1158+197]; + ld.local.u8 %rs851, [%rd1158+198]; + ld.local.u8 %rs852, [%rd1158+199]; + ld.local.v4.u16 {%rs853, %rs855, %rs857, %rs859}, [%rd1158+200]; + shr.u16 %rs854, %rs853, 8; + shr.u16 %rs856, %rs855, 8; + shr.u16 %rs858, %rs857, 8; + shr.u16 
%rs860, %rs859, 8; + ld.local.u8 %rs861, [%rd1158+208]; + mov.u16 %rs862, 64; + bra.uni $L__BB2_91; + +$L__BB2_95: + ld.local.v4.u8 {%rs568, %rs569, %rs570, %rs571}, [%rd2+136]; + setp.eq.s16 %p54, %rs569, 0; + selp.u16 %rs573, 1, 0, %p54; + ld.local.v2.u32 {%r28817, %r28818}, [%rd2+32]; + ld.local.v2.u32 {%r28821, %r28822}, [%rd2+40]; + ld.local.v2.u32 {%r28825, %r28826}, [%rd2+48]; + ld.local.v2.u32 {%r28829, %r28830}, [%rd2+56]; + ld.local.v4.u16 {%rs574, %rs575, %rs576, %rs577}, [%rd2+72]; + shr.u16 %rs579, %rs574, 8; + shr.u16 %rs581, %rs575, 8; + shr.u16 %rs583, %rs576, 8; + shr.u16 %rs585, %rs577, 8; + ld.local.v4.u16 {%rs586, %rs587, %rs588, %rs589}, [%rd2+80]; + shr.u16 %rs591, %rs586, 8; + shr.u16 %rs593, %rs587, 8; + shr.u16 %rs595, %rs588, 8; + shr.u16 %rs597, %rs589, 8; + ld.local.v4.u16 {%rs598, %rs599, %rs600, %rs601}, [%rd2+88]; + shr.u16 %rs603, %rs598, 8; + shr.u16 %rs605, %rs599, 8; + shr.u16 %rs607, %rs600, 8; + shr.u16 %rs609, %rs601, 8; + ld.local.v4.u16 {%rs610, %rs611, %rs612, %rs613}, [%rd2+96]; + shr.u16 %rs615, %rs610, 8; + shr.u16 %rs617, %rs611, 8; + shr.u16 %rs619, %rs612, 8; + shr.u16 %rs621, %rs613, 8; + ld.local.v4.u16 {%rs622, %rs623, %rs624, %rs625}, [%rd2+104]; + shr.u16 %rs627, %rs622, 8; + shr.u16 %rs629, %rs623, 8; + shr.u16 %rs631, %rs624, 8; + shr.u16 %rs633, %rs625, 8; + ld.local.v4.u16 {%rs634, %rs635, %rs636, %rs637}, [%rd2+112]; + shr.u16 %rs639, %rs634, 8; + shr.u16 %rs641, %rs635, 8; + shr.u16 %rs643, %rs636, 8; + shr.u16 %rs645, %rs637, 8; + ld.local.v4.u16 {%rs646, %rs647, %rs648, %rs649}, [%rd2+120]; + shr.u16 %rs651, %rs646, 8; + shr.u16 %rs653, %rs647, 8; + ld.local.v2.u8 {%rs655, %rs656}, [%rd2+126]; + ld.local.u16 %r28833, [%rd2+132]; + ld.local.v2.u8 {%rs659, %rs660}, [%rd2+134]; + or.b16 %rs663, %rs570, %rs573; + or.b16 %rs664, %rs663, 10; + cvt.u32.u16 %r28834, %rs574; + and.b32 %r28835, %r28834, 255; + cvt.u32.u16 %r28836, %rs579; + prmt.b32 %r28837, %r28836, %r28835, 30212; + cvt.u32.u16 %r28838, %rs575; + prmt.b32 %r28839, %r28838, %r28837, 28756; + cvt.u32.u16 %r28840, %rs581; + prmt.b32 %r28841, %r28840, %r28839, 1620; + cvt.u32.u16 %r28842, %rs576; + and.b32 %r28843, %r28842, 255; + cvt.u32.u16 %r28844, %rs583; + prmt.b32 %r28845, %r28844, %r28843, 30212; + cvt.u32.u16 %r28846, %rs577; + prmt.b32 %r28847, %r28846, %r28845, 28756; + cvt.u32.u16 %r28848, %rs585; + prmt.b32 %r28849, %r28848, %r28847, 1620; + cvt.u32.u16 %r28850, %rs586; + and.b32 %r28851, %r28850, 255; + cvt.u32.u16 %r28852, %rs591; + prmt.b32 %r28853, %r28852, %r28851, 30212; + cvt.u32.u16 %r28854, %rs587; + prmt.b32 %r28855, %r28854, %r28853, 28756; + cvt.u32.u16 %r28856, %rs593; + prmt.b32 %r28857, %r28856, %r28855, 1620; + cvt.u32.u16 %r28858, %rs588; + and.b32 %r28859, %r28858, 255; + cvt.u32.u16 %r28860, %rs595; + prmt.b32 %r28861, %r28860, %r28859, 30212; + cvt.u32.u16 %r28862, %rs589; + prmt.b32 %r28863, %r28862, %r28861, 28756; + cvt.u32.u16 %r28864, %rs597; + prmt.b32 %r28865, %r28864, %r28863, 1620; + cvt.u32.u16 %r28866, %rs598; + and.b32 %r28867, %r28866, 255; + cvt.u32.u16 %r28868, %rs603; + prmt.b32 %r28869, %r28868, %r28867, 30212; + cvt.u32.u16 %r28870, %rs599; + prmt.b32 %r28871, %r28870, %r28869, 28756; + cvt.u32.u16 %r28872, %rs605; + prmt.b32 %r28873, %r28872, %r28871, 1620; + cvt.u32.u16 %r28874, %rs600; + and.b32 %r28875, %r28874, 255; + cvt.u32.u16 %r28876, %rs607; + prmt.b32 %r28877, %r28876, %r28875, 30212; + cvt.u32.u16 %r28878, %rs601; + prmt.b32 %r28879, %r28878, %r28877, 28756; + cvt.u32.u16 %r28880, %rs609; + prmt.b32 %r28881, 
%r28880, %r28879, 1620; + cvt.u32.u16 %r28882, %rs610; + and.b32 %r28883, %r28882, 255; + cvt.u32.u16 %r28884, %rs615; + prmt.b32 %r28885, %r28884, %r28883, 30212; + cvt.u32.u16 %r28886, %rs611; + prmt.b32 %r28887, %r28886, %r28885, 28756; + cvt.u32.u16 %r28888, %rs617; + prmt.b32 %r28889, %r28888, %r28887, 1620; + cvt.u32.u16 %r28890, %rs612; + and.b32 %r28891, %r28890, 255; + cvt.u32.u16 %r28892, %rs619; + prmt.b32 %r28893, %r28892, %r28891, 30212; + cvt.u32.u16 %r28894, %rs613; + prmt.b32 %r28895, %r28894, %r28893, 28756; + cvt.u32.u16 %r28896, %rs621; + prmt.b32 %r28897, %r28896, %r28895, 1620; + cvt.u32.u16 %r28898, %rs622; + and.b32 %r28899, %r28898, 255; + cvt.u32.u16 %r28900, %rs627; + prmt.b32 %r28901, %r28900, %r28899, 30212; + cvt.u32.u16 %r28902, %rs623; + prmt.b32 %r28903, %r28902, %r28901, 28756; + cvt.u32.u16 %r28904, %rs629; + prmt.b32 %r28905, %r28904, %r28903, 1620; + cvt.u32.u16 %r28906, %rs624; + and.b32 %r28907, %r28906, 255; + cvt.u32.u16 %r28908, %rs631; + prmt.b32 %r28909, %r28908, %r28907, 30212; + cvt.u32.u16 %r28910, %rs625; + prmt.b32 %r28911, %r28910, %r28909, 28756; + cvt.u32.u16 %r28912, %rs633; + prmt.b32 %r28913, %r28912, %r28911, 1620; + cvt.u32.u16 %r28914, %rs634; + and.b32 %r28915, %r28914, 255; + cvt.u32.u16 %r28916, %rs639; + prmt.b32 %r28917, %r28916, %r28915, 30212; + cvt.u32.u16 %r28918, %rs635; + prmt.b32 %r28919, %r28918, %r28917, 28756; + cvt.u32.u16 %r28920, %rs641; + prmt.b32 %r28921, %r28920, %r28919, 1620; + cvt.u32.u16 %r28922, %rs636; + and.b32 %r28923, %r28922, 255; + cvt.u32.u16 %r28924, %rs643; + prmt.b32 %r28925, %r28924, %r28923, 30212; + cvt.u32.u16 %r28926, %rs637; + prmt.b32 %r28927, %r28926, %r28925, 28756; + cvt.u32.u16 %r28928, %rs645; + prmt.b32 %r28929, %r28928, %r28927, 1620; + cvt.u32.u16 %r28930, %rs646; + and.b32 %r28931, %r28930, 255; + cvt.u32.u16 %r28932, %rs651; + prmt.b32 %r28933, %r28932, %r28931, 30212; + cvt.u32.u16 %r28934, %rs647; + prmt.b32 %r28935, %r28934, %r28933, 28756; + cvt.u32.u16 %r28936, %rs653; + prmt.b32 %r28937, %r28936, %r28935, 1620; + cvt.u32.u16 %r28938, %rs648; + and.b32 %r28939, %r28938, 255; + ld.local.u8 %r28940, [%rd2+125]; + prmt.b32 %r28941, %r28940, %r28939, 30212; + cvt.u32.u16 %r28942, %rs655; + prmt.b32 %r28943, %r28942, %r28941, 28756; + cvt.u32.u16 %r28944, %rs656; + prmt.b32 %r28945, %r28944, %r28943, 1620; + ld.local.u32 %r28946, [%rd2+128]; + cvt.u32.u16 %r28947, %rs659; + prmt.b32 %r28948, %r28947, %r28833, 28756; + cvt.u32.u16 %r28949, %rs660; + prmt.b32 %r28950, %r28949, %r28948, 1620; + cvt.u32.u16 %r28951, %rs568; + cvt.u32.u16 %r28952, %rs664; + and.b32 %r28953, %r28952, 255; + add.s32 %r28954, %r28825, %r28817; + add.s32 %r28955, %r28954, %r28841; + add.s32 %r28956, %r28849, %r28955; + add.s32 %r28957, %r28826, %r28818; + add.s32 %r28958, %r28957, %r28857; + add.s32 %r28959, %r28865, %r28958; + add.s32 %r28960, %r28829, %r28821; + add.s32 %r28961, %r28960, %r28873; + xor.b32 %r28962, %r28961, %r28951; + shr.u32 %r28963, %r28961, 16; + shl.b32 %r28964, %r28962, 16; + or.b32 %r28965, %r28964, %r28963; + add.s32 %r28966, %r28965, 1013904242; + xor.b32 %r28967, %r28966, %r28829; + shf.l.wrap.b32 %r28968, %r28967, %r28967, 20; + add.s32 %r28969, %r28881, %r28961; + add.s32 %r28970, %r28969, %r28968; + xor.b32 %r28971, %r28970, %r28965; + shf.l.wrap.b32 %r28972, %r28971, %r28971, 24; + add.s32 %r28973, %r28972, %r28966; + xor.b32 %r28974, %r28973, %r28968; + shf.l.wrap.b32 %r28975, %r28974, %r28974, 25; + add.s32 %r28976, %r28830, %r28822; + add.s32 %r28977, %r28976, %r28889; + 
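+ // NB: this block is consistent with an inlined BLAKE3 compression (see the
+ // call to _Z20blake3_hasher_updateP13blake3_hasherPKvy above): the words
+ // 1779033703, -1150833019, ... are the BLAKE3 (SHA-256) IV, and the
+ // shf.l.wrap.b32 rotations by 16, 20, 24 and 25 are left-rotates equal to
+ // the G function's right-rotates by 16, 12, 8 and 7.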
xor.b32 %r28978, %r28977, %r28953; + shr.u32 %r28979, %r28977, 16; + shl.b32 %r28980, %r28978, 16; + or.b32 %r28981, %r28980, %r28979; + add.s32 %r28982, %r28981, -1521486534; + xor.b32 %r28983, %r28982, %r28830; + shf.l.wrap.b32 %r28984, %r28983, %r28983, 20; + add.s32 %r28985, %r28897, %r28977; + add.s32 %r28986, %r28985, %r28984; + xor.b32 %r28987, %r28986, %r28981; + shf.l.wrap.b32 %r28988, %r28987, %r28987, 24; + add.s32 %r28989, %r28988, %r28982; + xor.b32 %r28990, %r28989, %r28984; + shf.l.wrap.b32 %r28991, %r28990, %r28990, 25; + add.s32 %r28992, %r28921, %r28975; + add.s32 %r28993, %r28991, %r28970; + add.s32 %r28994, %r28993, %r28937; + add.s32 %r28995, %r28945, %r28994; + add.s32 %r28996, %r28946, %r28986; + shf.l.wrap.b32 %r28997, %r28955, %r28955, 16; + add.s32 %r28998, %r28997, 1779033703; + xor.b32 %r28999, %r28998, %r28825; + shf.l.wrap.b32 %r29000, %r28999, %r28999, 20; + add.s32 %r29001, %r28956, %r29000; + xor.b32 %r29002, %r29001, %r28997; + shf.l.wrap.b32 %r29003, %r29002, %r29002, 24; + add.s32 %r29004, %r29003, %r28998; + xor.b32 %r29005, %r29004, %r29000; + shf.l.wrap.b32 %r29006, %r29005, %r29005, 25; + shf.l.wrap.b32 %r29007, %r28958, %r28958, 16; + add.s32 %r29008, %r29007, -1150833019; + xor.b32 %r29009, %r29008, %r28826; + shf.l.wrap.b32 %r29010, %r29009, %r29009, 20; + add.s32 %r29011, %r28959, %r29010; + xor.b32 %r29012, %r29011, %r29007; + shf.l.wrap.b32 %r29013, %r29012, %r29012, 24; + add.s32 %r29014, %r29013, %r29008; + xor.b32 %r29015, %r29014, %r29010; + shf.l.wrap.b32 %r29016, %r29015, %r29015, 25; + add.s32 %r29017, %r29001, %r28905; + add.s32 %r29018, %r29017, %r29016; + xor.b32 %r29019, %r29018, %r28988; + shf.l.wrap.b32 %r29020, %r29019, %r29019, 16; + add.s32 %r29021, %r29020, %r28973; + xor.b32 %r29022, %r29021, %r29016; + shf.l.wrap.b32 %r29023, %r29022, %r29022, 20; + add.s32 %r29024, %r29018, %r28913; + add.s32 %r29025, %r29024, %r29023; + xor.b32 %r29026, %r29025, %r29020; + shf.l.wrap.b32 %r29027, %r29026, %r29026, 24; + add.s32 %r29028, %r29027, %r29021; + xor.b32 %r29029, %r29028, %r29023; + shf.l.wrap.b32 %r29030, %r29029, %r29029, 25; + add.s32 %r29031, %r28992, %r29011; + xor.b32 %r29032, %r29003, %r29031; + shf.l.wrap.b32 %r29033, %r29032, %r29032, 16; + add.s32 %r29034, %r29033, %r28989; + xor.b32 %r29035, %r29034, %r28975; + shf.l.wrap.b32 %r29036, %r29035, %r29035, 20; + add.s32 %r29037, %r29031, %r28929; + add.s32 %r29038, %r29037, %r29036; + xor.b32 %r29039, %r29038, %r29033; + shf.l.wrap.b32 %r29040, %r29039, %r29039, 24; + add.s32 %r29041, %r29040, %r29034; + xor.b32 %r29042, %r29041, %r29036; + shf.l.wrap.b32 %r29043, %r29042, %r29042, 25; + xor.b32 %r29044, %r29013, %r28994; + shf.l.wrap.b32 %r29045, %r29044, %r29044, 16; + add.s32 %r29046, %r29045, %r29004; + xor.b32 %r29047, %r29046, %r28991; + shf.l.wrap.b32 %r29048, %r29047, %r29047, 20; + add.s32 %r29049, %r28995, %r29048; + xor.b32 %r29050, %r29049, %r29045; + shf.l.wrap.b32 %r29051, %r29050, %r29050, 24; + add.s32 %r29052, %r29051, %r29046; + xor.b32 %r29053, %r29052, %r29048; + shf.l.wrap.b32 %r29054, %r29053, %r29053, 25; + add.s32 %r29055, %r28996, %r29006; + xor.b32 %r29056, %r29055, %r28972; + shf.l.wrap.b32 %r29057, %r29056, %r29056, 16; + add.s32 %r29058, %r29057, %r29014; + xor.b32 %r29059, %r29058, %r29006; + shf.l.wrap.b32 %r29060, %r29059, %r29059, 20; + add.s32 %r29061, %r29055, %r28950; + add.s32 %r29062, %r29061, %r29060; + xor.b32 %r29063, %r29062, %r29057; + shf.l.wrap.b32 %r29064, %r29063, %r29063, 24; + add.s32 %r29065, %r29064, %r29058; + xor.b32 
%r29066, %r29065, %r29060; + shf.l.wrap.b32 %r29067, %r29066, %r29066, 25; + add.s32 %r29068, %r29025, %r28857; + add.s32 %r29069, %r29068, %r29067; + xor.b32 %r29070, %r29069, %r29040; + shf.l.wrap.b32 %r29071, %r29070, %r29070, 16; + add.s32 %r29072, %r29071, %r29052; + xor.b32 %r29073, %r29072, %r29067; + shf.l.wrap.b32 %r29074, %r29073, %r29073, 20; + add.s32 %r29075, %r29069, %r28889; + add.s32 %r29076, %r29075, %r29074; + xor.b32 %r29077, %r29076, %r29071; + shf.l.wrap.b32 %r29078, %r29077, %r29077, 24; + add.s32 %r29079, %r29078, %r29072; + xor.b32 %r29080, %r29079, %r29074; + shf.l.wrap.b32 %r29081, %r29080, %r29080, 25; + add.s32 %r29082, %r29038, %r28865; + add.s32 %r29083, %r29082, %r29030; + xor.b32 %r29084, %r29083, %r29051; + shf.l.wrap.b32 %r29085, %r29084, %r29084, 16; + add.s32 %r29086, %r29085, %r29065; + xor.b32 %r29087, %r29086, %r29030; + shf.l.wrap.b32 %r29088, %r29087, %r29087, 20; + add.s32 %r29089, %r29083, %r28921; + add.s32 %r29090, %r29089, %r29088; + xor.b32 %r29091, %r29090, %r29085; + shf.l.wrap.b32 %r29092, %r29091, %r29091, 24; + add.s32 %r29093, %r29092, %r29086; + xor.b32 %r29094, %r29093, %r29088; + shf.l.wrap.b32 %r29095, %r29094, %r29094, 25; + add.s32 %r29096, %r29049, %r28897; + add.s32 %r29097, %r29096, %r29043; + xor.b32 %r29098, %r29064, %r29097; + shf.l.wrap.b32 %r29099, %r29098, %r29098, 16; + add.s32 %r29100, %r29099, %r29028; + xor.b32 %r29101, %r29100, %r29043; + shf.l.wrap.b32 %r29102, %r29101, %r29101, 20; + add.s32 %r29103, %r29097, %r28841; + add.s32 %r29104, %r29103, %r29102; + xor.b32 %r29105, %r29104, %r29099; + shf.l.wrap.b32 %r29106, %r29105, %r29105, 24; + add.s32 %r29107, %r29106, %r29100; + xor.b32 %r29108, %r29107, %r29102; + shf.l.wrap.b32 %r29109, %r29108, %r29108, 25; + add.s32 %r29110, %r29062, %r28873; + add.s32 %r29111, %r29110, %r29054; + xor.b32 %r29112, %r29027, %r29111; + shf.l.wrap.b32 %r29113, %r29112, %r29112, 16; + add.s32 %r29114, %r29113, %r29041; + xor.b32 %r29115, %r29114, %r29054; + shf.l.wrap.b32 %r29116, %r29115, %r29115, 20; + add.s32 %r29117, %r29111, %r28945; + add.s32 %r29118, %r29117, %r29116; + xor.b32 %r29119, %r29118, %r29113; + shf.l.wrap.b32 %r29120, %r29119, %r29119, 24; + add.s32 %r29121, %r29120, %r29114; + xor.b32 %r29122, %r29121, %r29116; + shf.l.wrap.b32 %r29123, %r29122, %r29122, 25; + add.s32 %r29124, %r29076, %r28849; + add.s32 %r29125, %r29124, %r29095; + xor.b32 %r29126, %r29125, %r29120; + shf.l.wrap.b32 %r29127, %r29126, %r29126, 16; + add.s32 %r29128, %r29127, %r29107; + xor.b32 %r29129, %r29128, %r29095; + shf.l.wrap.b32 %r29130, %r29129, %r29129, 20; + add.s32 %r29131, %r29125, %r28929; + add.s32 %r29132, %r29131, %r29130; + xor.b32 %r29133, %r29132, %r29127; + shf.l.wrap.b32 %r29134, %r29133, %r29133, 24; + add.s32 %r29135, %r29134, %r29128; + xor.b32 %r29136, %r29135, %r29130; + shf.l.wrap.b32 %r29137, %r29136, %r29136, 25; + add.s32 %r29138, %r29109, %r28937; + add.s32 %r29139, %r29138, %r29090; + xor.b32 %r29140, %r29078, %r29139; + shf.l.wrap.b32 %r29141, %r29140, %r29140, 16; + add.s32 %r29142, %r29141, %r29121; + xor.b32 %r29143, %r29142, %r29109; + shf.l.wrap.b32 %r29144, %r29143, %r29143, 20; + add.s32 %r29145, %r29139, %r28881; + add.s32 %r29146, %r29145, %r29144; + xor.b32 %r29147, %r29146, %r29141; + shf.l.wrap.b32 %r29148, %r29147, %r29147, 24; + add.s32 %r29149, %r29148, %r29142; + xor.b32 %r29150, %r29149, %r29144; + shf.l.wrap.b32 %r29151, %r29150, %r29150, 25; + add.s32 %r29152, %r29104, %r28913; + add.s32 %r29153, %r29152, %r29123; + xor.b32 %r29154, %r29092, 
%r29153; + shf.l.wrap.b32 %r29155, %r29154, %r29154, 16; + add.s32 %r29156, %r29155, %r29079; + xor.b32 %r29157, %r29156, %r29123; + shf.l.wrap.b32 %r29158, %r29157, %r29157, 20; + add.s32 %r29159, %r29153, %r28946; + add.s32 %r29160, %r29159, %r29158; + xor.b32 %r29161, %r29160, %r29155; + shf.l.wrap.b32 %r29162, %r29161, %r29161, 24; + add.s32 %r29163, %r29162, %r29156; + xor.b32 %r29164, %r29163, %r29158; + shf.l.wrap.b32 %r29165, %r29164, %r29164, 25; + add.s32 %r29166, %r29118, %r28950; + add.s32 %r29167, %r29166, %r29081; + xor.b32 %r29168, %r29167, %r29106; + shf.l.wrap.b32 %r29169, %r29168, %r29168, 16; + add.s32 %r29170, %r29169, %r29093; + xor.b32 %r29171, %r29170, %r29081; + shf.l.wrap.b32 %r29172, %r29171, %r29171, 20; + add.s32 %r29173, %r29167, %r28905; + add.s32 %r29174, %r29173, %r29172; + xor.b32 %r29175, %r29174, %r29169; + shf.l.wrap.b32 %r29176, %r29175, %r29175, 24; + add.s32 %r29177, %r29176, %r29170; + xor.b32 %r29178, %r29177, %r29172; + shf.l.wrap.b32 %r29179, %r29178, %r29178, 25; + add.s32 %r29180, %r29132, %r28865; + add.s32 %r29181, %r29180, %r29179; + xor.b32 %r29182, %r29181, %r29148; + shf.l.wrap.b32 %r29183, %r29182, %r29182, 16; + add.s32 %r29184, %r29183, %r29163; + xor.b32 %r29185, %r29184, %r29179; + shf.l.wrap.b32 %r29186, %r29185, %r29185, 20; + add.s32 %r29187, %r29181, %r28873; + add.s32 %r29188, %r29187, %r29186; + xor.b32 %r29189, %r29188, %r29183; + shf.l.wrap.b32 %r29190, %r29189, %r29189, 24; + add.s32 %r29191, %r29190, %r29184; + xor.b32 %r29192, %r29191, %r29186; + shf.l.wrap.b32 %r29193, %r29192, %r29192, 25; + add.s32 %r29194, %r29146, %r28921; + add.s32 %r29195, %r29194, %r29137; + xor.b32 %r29196, %r29195, %r29162; + shf.l.wrap.b32 %r29197, %r29196, %r29196, 16; + add.s32 %r29198, %r29197, %r29177; + xor.b32 %r29199, %r29198, %r29137; + shf.l.wrap.b32 %r29200, %r29199, %r29199, 20; + add.s32 %r29201, %r29195, %r28937; + add.s32 %r29202, %r29201, %r29200; + xor.b32 %r29203, %r29202, %r29197; + shf.l.wrap.b32 %r29204, %r29203, %r29203, 24; + add.s32 %r29205, %r29204, %r29198; + xor.b32 %r29206, %r29205, %r29200; + shf.l.wrap.b32 %r29207, %r29206, %r29206, 25; + add.s32 %r29208, %r29160, %r28945; + add.s32 %r29209, %r29208, %r29151; + xor.b32 %r29210, %r29176, %r29209; + shf.l.wrap.b32 %r29211, %r29210, %r29210, 16; + add.s32 %r29212, %r29211, %r29135; + xor.b32 %r29213, %r29212, %r29151; + shf.l.wrap.b32 %r29214, %r29213, %r29213, 20; + add.s32 %r29215, %r29209, %r28857; + add.s32 %r29216, %r29215, %r29214; + xor.b32 %r29217, %r29216, %r29211; + shf.l.wrap.b32 %r29218, %r29217, %r29217, 24; + add.s32 %r29219, %r29218, %r29212; + xor.b32 %r29220, %r29219, %r29214; + shf.l.wrap.b32 %r29221, %r29220, %r29220, 25; + add.s32 %r29222, %r29174, %r28897; + add.s32 %r29223, %r29222, %r29165; + xor.b32 %r29224, %r29134, %r29223; + shf.l.wrap.b32 %r29225, %r29224, %r29224, 16; + add.s32 %r29226, %r29225, %r29149; + xor.b32 %r29227, %r29226, %r29165; + shf.l.wrap.b32 %r29228, %r29227, %r29227, 20; + add.s32 %r29229, %r29223, %r28946; + add.s32 %r29230, %r29229, %r29228; + xor.b32 %r29231, %r29230, %r29225; + shf.l.wrap.b32 %r29232, %r29231, %r29231, 24; + add.s32 %r29233, %r29232, %r29226; + xor.b32 %r29234, %r29233, %r29228; + shf.l.wrap.b32 %r29235, %r29234, %r29234, 25; + add.s32 %r29236, %r29188, %r28889; + add.s32 %r29237, %r29236, %r29207; + xor.b32 %r29238, %r29237, %r29232; + shf.l.wrap.b32 %r29239, %r29238, %r29238, 16; + add.s32 %r29240, %r29239, %r29219; + xor.b32 %r29241, %r29240, %r29207; + shf.l.wrap.b32 %r29242, %r29241, %r29241, 20; + 
add.s32 %r29243, %r29237, %r28881; + add.s32 %r29244, %r29243, %r29242; + xor.b32 %r29245, %r29244, %r29239; + shf.l.wrap.b32 %r29246, %r29245, %r29245, 24; + add.s32 %r29247, %r29246, %r29240; + xor.b32 %r29248, %r29247, %r29242; + shf.l.wrap.b32 %r29249, %r29248, %r29248, 25; + add.s32 %r29250, %r29221, %r28913; + add.s32 %r29251, %r29250, %r29202; + xor.b32 %r29252, %r29190, %r29251; + shf.l.wrap.b32 %r29253, %r29252, %r29252, 16; + add.s32 %r29254, %r29253, %r29233; + xor.b32 %r29255, %r29254, %r29221; + shf.l.wrap.b32 %r29256, %r29255, %r29255, 20; + add.s32 %r29257, %r29251, %r28841; + add.s32 %r29258, %r29257, %r29256; + xor.b32 %r29259, %r29258, %r29253; + shf.l.wrap.b32 %r29260, %r29259, %r29259, 24; + add.s32 %r29261, %r29260, %r29254; + xor.b32 %r29262, %r29261, %r29256; + shf.l.wrap.b32 %r29263, %r29262, %r29262, 25; + add.s32 %r29264, %r29216, %r28929; + add.s32 %r29265, %r29264, %r29235; + xor.b32 %r29266, %r29204, %r29265; + shf.l.wrap.b32 %r29267, %r29266, %r29266, 16; + add.s32 %r29268, %r29267, %r29191; + xor.b32 %r29269, %r29268, %r29235; + shf.l.wrap.b32 %r29270, %r29269, %r29269, 20; + add.s32 %r29271, %r29265, %r28950; + add.s32 %r29272, %r29271, %r29270; + xor.b32 %r29273, %r29272, %r29267; + shf.l.wrap.b32 %r29274, %r29273, %r29273, 24; + add.s32 %r29275, %r29274, %r29268; + xor.b32 %r29276, %r29275, %r29270; + shf.l.wrap.b32 %r29277, %r29276, %r29276, 25; + add.s32 %r29278, %r29230, %r28905; + add.s32 %r29279, %r29278, %r29193; + xor.b32 %r29280, %r29279, %r29218; + shf.l.wrap.b32 %r29281, %r29280, %r29280, 16; + add.s32 %r29282, %r29281, %r29205; + xor.b32 %r29283, %r29282, %r29193; + shf.l.wrap.b32 %r29284, %r29283, %r29283, 20; + add.s32 %r29285, %r29279, %r28849; + add.s32 %r29286, %r29285, %r29284; + xor.b32 %r29287, %r29286, %r29281; + shf.l.wrap.b32 %r29288, %r29287, %r29287, 24; + add.s32 %r29289, %r29288, %r29282; + xor.b32 %r29290, %r29289, %r29284; + shf.l.wrap.b32 %r29291, %r29290, %r29290, 25; + add.s32 %r29292, %r29244, %r28921; + add.s32 %r29293, %r29292, %r29291; + xor.b32 %r29294, %r29293, %r29260; + shf.l.wrap.b32 %r29295, %r29294, %r29294, 16; + add.s32 %r29296, %r29295, %r29275; + xor.b32 %r29297, %r29296, %r29291; + shf.l.wrap.b32 %r29298, %r29297, %r29297, 20; + add.s32 %r29299, %r29293, %r28897; + add.s32 %r29300, %r29299, %r29298; + xor.b32 %r29301, %r29300, %r29295; + shf.l.wrap.b32 %r29302, %r29301, %r29301, 24; + add.s32 %r29303, %r29302, %r29296; + xor.b32 %r29304, %r29303, %r29298; + shf.l.wrap.b32 %r29305, %r29304, %r29304, 25; + add.s32 %r29306, %r29258, %r28937; + add.s32 %r29307, %r29306, %r29249; + xor.b32 %r29308, %r29307, %r29274; + shf.l.wrap.b32 %r29309, %r29308, %r29308, 16; + add.s32 %r29310, %r29309, %r29289; + xor.b32 %r29311, %r29310, %r29249; + shf.l.wrap.b32 %r29312, %r29311, %r29311, 20; + add.s32 %r29313, %r29307, %r28913; + add.s32 %r29314, %r29313, %r29312; + xor.b32 %r29315, %r29314, %r29309; + shf.l.wrap.b32 %r29316, %r29315, %r29315, 24; + add.s32 %r29317, %r29316, %r29310; + xor.b32 %r29318, %r29317, %r29312; + shf.l.wrap.b32 %r29319, %r29318, %r29318, 25; + add.s32 %r29320, %r29272, %r28946; + add.s32 %r29321, %r29320, %r29263; + xor.b32 %r29322, %r29288, %r29321; + shf.l.wrap.b32 %r29323, %r29322, %r29322, 16; + add.s32 %r29324, %r29323, %r29247; + xor.b32 %r29325, %r29324, %r29263; + shf.l.wrap.b32 %r29326, %r29325, %r29325, 20; + add.s32 %r29327, %r29321, %r28865; + add.s32 %r29328, %r29327, %r29326; + xor.b32 %r29329, %r29328, %r29323; + shf.l.wrap.b32 %r29330, %r29329, %r29329, 24; + add.s32 %r29331, 
%r29330, %r29324; + xor.b32 %r29332, %r29331, %r29326; + shf.l.wrap.b32 %r29333, %r29332, %r29332, 25; + add.s32 %r29334, %r29286, %r28945; + add.s32 %r29335, %r29334, %r29277; + xor.b32 %r29336, %r29246, %r29335; + shf.l.wrap.b32 %r29337, %r29336, %r29336, 16; + add.s32 %r29338, %r29337, %r29261; + xor.b32 %r29339, %r29338, %r29277; + shf.l.wrap.b32 %r29340, %r29339, %r29339, 20; + add.s32 %r29341, %r29335, %r28950; + add.s32 %r29342, %r29341, %r29340; + xor.b32 %r29343, %r29342, %r29337; + shf.l.wrap.b32 %r29344, %r29343, %r29343, 24; + add.s32 %r29345, %r29344, %r29338; + xor.b32 %r29346, %r29345, %r29340; + shf.l.wrap.b32 %r29347, %r29346, %r29346, 25; + add.s32 %r29348, %r29300, %r28873; + add.s32 %r29349, %r29348, %r29319; + xor.b32 %r29350, %r29349, %r29344; + shf.l.wrap.b32 %r29351, %r29350, %r29350, 16; + add.s32 %r29352, %r29351, %r29331; + xor.b32 %r29353, %r29352, %r29319; + shf.l.wrap.b32 %r29354, %r29353, %r29353, 20; + add.s32 %r29355, %r29349, %r28841; + add.s32 %r29356, %r29355, %r29354; + xor.b32 %r29357, %r29356, %r29351; + shf.l.wrap.b32 %r29358, %r29357, %r29357, 24; + add.s32 %r29359, %r29358, %r29352; + xor.b32 %r29360, %r29359, %r29354; + shf.l.wrap.b32 %r29361, %r29360, %r29360, 25; + add.s32 %r29362, %r29333, %r28929; + add.s32 %r29363, %r29362, %r29314; + xor.b32 %r29364, %r29302, %r29363; + shf.l.wrap.b32 %r29365, %r29364, %r29364, 16; + add.s32 %r29366, %r29365, %r29345; + xor.b32 %r29367, %r29366, %r29333; + shf.l.wrap.b32 %r29368, %r29367, %r29367, 20; + add.s32 %r29369, %r29363, %r28857; + add.s32 %r29370, %r29369, %r29368; + xor.b32 %r29371, %r29370, %r29365; + shf.l.wrap.b32 %r29372, %r29371, %r29371, 24; + add.s32 %r29373, %r29372, %r29366; + xor.b32 %r29374, %r29373, %r29368; + shf.l.wrap.b32 %r29375, %r29374, %r29374, 25; + add.s32 %r29376, %r29328, %r28881; + add.s32 %r29377, %r29376, %r29347; + xor.b32 %r29378, %r29316, %r29377; + shf.l.wrap.b32 %r29379, %r29378, %r29378, 16; + add.s32 %r29380, %r29379, %r29303; + xor.b32 %r29381, %r29380, %r29347; + shf.l.wrap.b32 %r29382, %r29381, %r29381, 20; + add.s32 %r29383, %r29377, %r28905; + add.s32 %r29384, %r29383, %r29382; + xor.b32 %r29385, %r29384, %r29379; + shf.l.wrap.b32 %r29386, %r29385, %r29385, 24; + add.s32 %r29387, %r29386, %r29380; + xor.b32 %r29388, %r29387, %r29382; + shf.l.wrap.b32 %r29389, %r29388, %r29388, 25; + add.s32 %r29390, %r29342, %r28849; + add.s32 %r29391, %r29390, %r29305; + xor.b32 %r29392, %r29391, %r29330; + shf.l.wrap.b32 %r29393, %r29392, %r29392, 16; + add.s32 %r29394, %r29393, %r29317; + xor.b32 %r29395, %r29394, %r29305; + shf.l.wrap.b32 %r29396, %r29395, %r29395, 20; + add.s32 %r29397, %r29391, %r28889; + add.s32 %r29398, %r29397, %r29396; + xor.b32 %r29399, %r29398, %r29393; + shf.l.wrap.b32 %r29400, %r29399, %r29399, 24; + add.s32 %r29401, %r29400, %r29394; + xor.b32 %r29402, %r29401, %r29396; + shf.l.wrap.b32 %r29403, %r29402, %r29402, 25; + add.s32 %r29404, %r29356, %r28937; + add.s32 %r29405, %r29404, %r29403; + xor.b32 %r29406, %r29405, %r29372; + shf.l.wrap.b32 %r29407, %r29406, %r29406, 16; + add.s32 %r29408, %r29407, %r29387; + xor.b32 %r29409, %r29408, %r29403; + shf.l.wrap.b32 %r29410, %r29409, %r29409, 20; + add.s32 %r29411, %r29405, %r28945; + add.s32 %r29412, %r29411, %r29410; + xor.b32 %r29413, %r29412, %r29407; + shf.l.wrap.b32 %r29414, %r29413, %r29413, 24; + add.s32 %r29415, %r29414, %r29408; + xor.b32 %r29416, %r29415, %r29410; + shf.l.wrap.b32 %r29417, %r29416, %r29416, 25; + add.s32 %r29418, %r29370, %r28913; + add.s32 %r29419, %r29418, %r29361; + 
xor.b32 %r29420, %r29419, %r29386; + shf.l.wrap.b32 %r29421, %r29420, %r29420, 16; + add.s32 %r29422, %r29421, %r29401; + xor.b32 %r29423, %r29422, %r29361; + shf.l.wrap.b32 %r29424, %r29423, %r29423, 20; + add.s32 %r29425, %r29419, %r28929; + add.s32 %r29426, %r29425, %r29424; + xor.b32 %r29427, %r29426, %r29421; + shf.l.wrap.b32 %r29428, %r29427, %r29427, 24; + add.s32 %r29429, %r29428, %r29422; + xor.b32 %r29430, %r29429, %r29424; + shf.l.wrap.b32 %r29431, %r29430, %r29430, 25; + add.s32 %r29432, %r29384, %r28950; + add.s32 %r29433, %r29432, %r29375; + xor.b32 %r29434, %r29400, %r29433; + shf.l.wrap.b32 %r29435, %r29434, %r29434, 16; + add.s32 %r29436, %r29435, %r29359; + xor.b32 %r29437, %r29436, %r29375; + shf.l.wrap.b32 %r29438, %r29437, %r29437, 20; + add.s32 %r29439, %r29433, %r28921; + add.s32 %r29440, %r29439, %r29438; + xor.b32 %r29441, %r29440, %r29435; + shf.l.wrap.b32 %r29442, %r29441, %r29441, 24; + add.s32 %r29443, %r29442, %r29436; + xor.b32 %r29444, %r29443, %r29438; + shf.l.wrap.b32 %r29445, %r29444, %r29444, 25; + add.s32 %r29446, %r29398, %r28946; + add.s32 %r29447, %r29446, %r29389; + xor.b32 %r29448, %r29358, %r29447; + shf.l.wrap.b32 %r29449, %r29448, %r29448, 16; + add.s32 %r29450, %r29449, %r29373; + xor.b32 %r29451, %r29450, %r29389; + shf.l.wrap.b32 %r29452, %r29451, %r29451, 20; + add.s32 %r29453, %r29447, %r28905; + add.s32 %r29454, %r29453, %r29452; + xor.b32 %r29455, %r29454, %r29449; + shf.l.wrap.b32 %r29456, %r29455, %r29455, 24; + add.s32 %r29457, %r29456, %r29450; + xor.b32 %r29458, %r29457, %r29452; + shf.l.wrap.b32 %r29459, %r29458, %r29458, 25; + add.s32 %r29460, %r29412, %r28897; + add.s32 %r29461, %r29460, %r29431; + xor.b32 %r29462, %r29461, %r29456; + shf.l.wrap.b32 %r29463, %r29462, %r29462, 16; + add.s32 %r29464, %r29463, %r29443; + xor.b32 %r29465, %r29464, %r29431; + shf.l.wrap.b32 %r29466, %r29465, %r29465, 20; + add.s32 %r29467, %r29461, %r28857; + add.s32 %r29468, %r29467, %r29466; + xor.b32 %r29469, %r29468, %r29463; + shf.l.wrap.b32 %r29470, %r29469, %r29469, 24; + add.s32 %r29471, %r29470, %r29464; + xor.b32 %r29472, %r29471, %r29466; + shf.l.wrap.b32 %r29473, %r29472, %r29472, 25; + add.s32 %r29474, %r29445, %r28881; + add.s32 %r29475, %r29474, %r29426; + xor.b32 %r29476, %r29414, %r29475; + shf.l.wrap.b32 %r29477, %r29476, %r29476, 16; + add.s32 %r29478, %r29477, %r29457; + xor.b32 %r29479, %r29478, %r29445; + shf.l.wrap.b32 %r29480, %r29479, %r29479, 20; + add.s32 %r29481, %r29475, %r28865; + add.s32 %r29482, %r29481, %r29480; + xor.b32 %r29483, %r29482, %r29477; + shf.l.wrap.b32 %r29484, %r29483, %r29483, 24; + add.s32 %r29485, %r29484, %r29478; + xor.b32 %r29486, %r29485, %r29480; + shf.l.wrap.b32 %r29487, %r29486, %r29486, 25; + add.s32 %r29488, %r29440, %r28841; + add.s32 %r29489, %r29488, %r29459; + xor.b32 %r29490, %r29428, %r29489; + shf.l.wrap.b32 %r29491, %r29490, %r29490, 16; + add.s32 %r29492, %r29491, %r29415; + xor.b32 %r29493, %r29492, %r29459; + shf.l.wrap.b32 %r29494, %r29493, %r29493, 20; + add.s32 %r29495, %r29489, %r28849; + add.s32 %r29496, %r29495, %r29494; + xor.b32 %r29497, %r29496, %r29491; + shf.l.wrap.b32 %r29498, %r29497, %r29497, 24; + add.s32 %r29499, %r29498, %r29492; + xor.b32 %r29500, %r29499, %r29494; + shf.l.wrap.b32 %r29501, %r29500, %r29500, 25; + add.s32 %r29502, %r29454, %r28889; + add.s32 %r29503, %r29502, %r29417; + xor.b32 %r29504, %r29503, %r29442; + shf.l.wrap.b32 %r29505, %r29504, %r29504, 16; + add.s32 %r29506, %r29505, %r29429; + xor.b32 %r29507, %r29506, %r29417; + shf.l.wrap.b32 %r29508, 
%r29507, %r29507, 20; + add.s32 %r29509, %r29503, %r28873; + add.s32 %r29510, %r29509, %r29508; + xor.b32 %r29511, %r29510, %r29505; + shf.l.wrap.b32 %r29512, %r29511, %r29511, 24; + add.s32 %r29513, %r29512, %r29506; + xor.b32 %r29514, %r29513, %r29508; + shf.l.wrap.b32 %r29515, %r29514, %r29514, 25; + add.s32 %r29516, %r29468, %r28913; + add.s32 %r29517, %r29516, %r29515; + xor.b32 %r29518, %r29517, %r29484; + shf.l.wrap.b32 %r29519, %r29518, %r29518, 16; + add.s32 %r29520, %r29519, %r29499; + xor.b32 %r29521, %r29520, %r29515; + shf.l.wrap.b32 %r29522, %r29521, %r29521, 20; + add.s32 %r29523, %r29517, %r28946; + add.s32 %r29524, %r29523, %r29522; + xor.b32 %r29525, %r29524, %r29519; + shf.l.wrap.b32 %r29526, %r29525, %r29525, 24; + add.s32 %r29527, %r29526, %r29520; + xor.b32 %r29528, %r29527, %r29522; + shf.l.wrap.b32 %r29529, %r29528, %r29528, 25; + add.s32 %r29530, %r29482, %r28929; + add.s32 %r29531, %r29530, %r29473; + xor.b32 %r29532, %r29531, %r29498; + shf.l.wrap.b32 %r29533, %r29532, %r29532, 16; + add.s32 %r29534, %r29533, %r29513; + xor.b32 %r29535, %r29534, %r29473; + shf.l.wrap.b32 %r29536, %r29535, %r29535, 20; + add.s32 %r29537, %r29531, %r28881; + add.s32 %r29538, %r29537, %r29536; + xor.b32 %r29539, %r29538, %r29533; + shf.l.wrap.b32 %r29540, %r29539, %r29539, 24; + add.s32 %r29541, %r29540, %r29534; + xor.b32 %r29542, %r29541, %r29536; + shf.l.wrap.b32 %r29543, %r29542, %r29542, 25; + add.s32 %r29544, %r29496, %r28905; + add.s32 %r29545, %r29544, %r29487; + xor.b32 %r29546, %r29512, %r29545; + shf.l.wrap.b32 %r29547, %r29546, %r29546, 16; + add.s32 %r29548, %r29547, %r29471; + xor.b32 %r29549, %r29548, %r29487; + shf.l.wrap.b32 %r29550, %r29549, %r29549, 20; + add.s32 %r29551, %r29545, %r28937; + add.s32 %r29552, %r29551, %r29550; + xor.b32 %r29553, %r29552, %r29547; + shf.l.wrap.b32 %r29554, %r29553, %r29553, 24; + add.s32 %r29555, %r29554, %r29548; + xor.b32 %r29556, %r29555, %r29550; + shf.l.wrap.b32 %r29557, %r29556, %r29556, 25; + add.s32 %r29558, %r29510, %r28950; + add.s32 %r29559, %r29558, %r29501; + xor.b32 %r29560, %r29470, %r29559; + shf.l.wrap.b32 %r29561, %r29560, %r29560, 16; + add.s32 %r29562, %r29561, %r29485; + xor.b32 %r29563, %r29562, %r29501; + shf.l.wrap.b32 %r29564, %r29563, %r29563, 20; + add.s32 %r29565, %r29559, %r28849; + add.s32 %r29566, %r29565, %r29564; + xor.b32 %r29567, %r29566, %r29561; + shf.l.wrap.b32 %r29568, %r29567, %r29567, 24; + add.s32 %r29569, %r29568, %r29562; + xor.b32 %r29570, %r29569, %r29564; + shf.l.wrap.b32 %r29571, %r29570, %r29570, 25; + add.s32 %r29572, %r29524, %r28945; + add.s32 %r29573, %r29572, %r29543; + xor.b32 %r29574, %r29573, %r29568; + shf.l.wrap.b32 %r29575, %r29574, %r29574, 16; + add.s32 %r29576, %r29575, %r29555; + xor.b32 %r29577, %r29576, %r29543; + shf.l.wrap.b32 %r29578, %r29577, %r29577, 20; + add.s32 %r29579, %r29573, %r28865; + add.s32 %r29580, %r29579, %r29578; + xor.b32 %r29581, %r29580, %r29575; + shf.l.wrap.b32 %r29582, %r29581, %r29581, 24; + add.s32 %r29583, %r29582, %r29576; + xor.b32 %r29584, %r29583, %r29578; + shf.l.wrap.b32 %r29585, %r29584, %r29584, 25; + add.s32 %r29586, %r29557, %r28841; + add.s32 %r29587, %r29586, %r29538; + xor.b32 %r29588, %r29526, %r29587; + shf.l.wrap.b32 %r29589, %r29588, %r29588, 16; + add.s32 %r29590, %r29589, %r29569; + xor.b32 %r29591, %r29590, %r29557; + shf.l.wrap.b32 %r29592, %r29591, %r29591, 20; + add.s32 %r29593, %r29587, %r28921; + add.s32 %r29594, %r29593, %r29592; + xor.b32 %r29595, %r29594, %r29589; + shf.l.wrap.b32 %r29596, %r29595, %r29595, 24; + 
add.s32 %r29597, %r29596, %r29590; + xor.b32 %r29598, %r29597, %r29592; + shf.l.wrap.b32 %r29599, %r29598, %r29598, 25; + add.s32 %r29600, %r29552, %r28857; + add.s32 %r29601, %r29600, %r29571; + xor.b32 %r29602, %r29540, %r29601; + shf.l.wrap.b32 %r29603, %r29602, %r29602, 16; + add.s32 %r29604, %r29603, %r29527; + xor.b32 %r29605, %r29604, %r29571; + shf.l.wrap.b32 %r29606, %r29605, %r29605, 20; + add.s32 %r29607, %r29601, %r28889; + add.s32 %r29608, %r29607, %r29606; + xor.b32 %r29609, %r29608, %r29603; + shf.l.wrap.b32 %r29610, %r29609, %r29609, 24; + add.s32 %r29611, %r29610, %r29604; + xor.b32 %r29612, %r29611, %r29606; + shf.l.wrap.b32 %r29613, %r29612, %r29612, 25; + add.s32 %r29614, %r29566, %r28873; + add.s32 %r29615, %r29614, %r29529; + xor.b32 %r29616, %r29615, %r29554; + shf.l.wrap.b32 %r29617, %r29616, %r29616, 16; + add.s32 %r29618, %r29617, %r29541; + xor.b32 %r29619, %r29618, %r29529; + shf.l.wrap.b32 %r29620, %r29619, %r29619, 20; + add.s32 %r29621, %r29615, %r28897; + add.s32 %r29622, %r29621, %r29620; + xor.b32 %r29623, %r29622, %r29617; + shf.l.wrap.b32 %r29624, %r29623, %r29623, 24; + add.s32 %r29625, %r29624, %r29618; + xor.b32 %r29626, %r29625, %r29620; + shf.l.wrap.b32 %r29627, %r29626, %r29626, 25; + add.s32 %r29628, %r29580, %r28929; + add.s32 %r29629, %r29628, %r29627; + xor.b32 %r29630, %r29629, %r29596; + shf.l.wrap.b32 %r29631, %r29630, %r29630, 16; + add.s32 %r29632, %r29631, %r29611; + xor.b32 %r29633, %r29632, %r29627; + shf.l.wrap.b32 %r29634, %r29633, %r29633, 20; + add.s32 %r29635, %r29629, %r28950; + add.s32 %r29636, %r29635, %r29634; + xor.b32 %r29637, %r29636, %r29631; + shf.l.wrap.b32 %r29638, %r29637, %r29637, 24; + add.s32 %r29639, %r29638, %r29632; + xor.b32 %r29640, %r29639, %r29634; + shf.l.wrap.b32 %r29641, %r29640, %r29640, 25; + add.s32 %r29642, %r29594, %r28881; + add.s32 %r29643, %r29642, %r29585; + xor.b32 %r29644, %r29643, %r29610; + shf.l.wrap.b32 %r29645, %r29644, %r29644, 16; + add.s32 %r29646, %r29645, %r29625; + xor.b32 %r29647, %r29646, %r29585; + shf.l.wrap.b32 %r29648, %r29647, %r29647, 20; + add.s32 %r29649, %r29643, %r28841; + add.s32 %r29650, %r29649, %r29648; + xor.b32 %r29651, %r29650, %r29645; + shf.l.wrap.b32 %r29652, %r29651, %r29651, 24; + add.s32 %r29653, %r29652, %r29646; + xor.b32 %r29654, %r29653, %r29648; + shf.l.wrap.b32 %r29655, %r29654, %r29654, 25; + add.s32 %r29656, %r29608, %r28849; + add.s32 %r29657, %r29656, %r29599; + xor.b32 %r29658, %r29624, %r29657; + shf.l.wrap.b32 %r29659, %r29658, %r29658, 16; + add.s32 %r29660, %r29659, %r29583; + xor.b32 %r29661, %r29660, %r29599; + shf.l.wrap.b32 %r29662, %r29661, %r29661, 20; + add.s32 %r29663, %r29657, %r28913; + add.s32 %r29664, %r29663, %r29662; + xor.b32 %r29665, %r29664, %r29659; + shf.l.wrap.b32 %r29666, %r29665, %r29665, 24; + add.s32 %r29667, %r29666, %r29660; + xor.b32 %r29668, %r29667, %r29662; + shf.l.wrap.b32 %r29669, %r29668, %r29668, 25; + add.s32 %r29670, %r29622, %r28905; + add.s32 %r29671, %r29670, %r29613; + xor.b32 %r29672, %r29582, %r29671; + shf.l.wrap.b32 %r29673, %r29672, %r29672, 16; + add.s32 %r29674, %r29673, %r29597; + xor.b32 %r29675, %r29674, %r29613; + shf.l.wrap.b32 %r29676, %r29675, %r29675, 20; + add.s32 %r29677, %r29671, %r28889; + add.s32 %r29678, %r29677, %r29676; + xor.b32 %r29679, %r29678, %r29673; + shf.l.wrap.b32 %r29680, %r29679, %r29679, 24; + add.s32 %r29681, %r29680, %r29674; + xor.b32 %r29682, %r29681, %r29676; + shf.l.wrap.b32 %r29683, %r29682, %r29682, 25; + add.s32 %r29684, %r29636, %r28946; + add.s32 %r29685, 
%r29684, %r29655; + xor.b32 %r29686, %r29685, %r29680; + shf.l.wrap.b32 %r29687, %r29686, %r29686, 16; + add.s32 %r29688, %r29687, %r29667; + xor.b32 %r29689, %r29688, %r29655; + shf.l.wrap.b32 %r29690, %r29689, %r29689, 20; + add.s32 %r29691, %r29685, %r28921; + add.s32 %r29692, %r29691, %r29690; + xor.b32 %r29693, %r29692, %r29687; + shf.l.wrap.b32 %r29694, %r29693, %r29693, 24; + add.s32 %r29695, %r29694, %r29688; + xor.b32 %r29696, %r29695, %r29690; + shf.l.wrap.b32 %r29697, %r29696, %r29696, 25; + add.s32 %r29698, %r29669, %r28857; + add.s32 %r29699, %r29698, %r29650; + xor.b32 %r29700, %r29638, %r29699; + shf.l.wrap.b32 %r29701, %r29700, %r29700, 16; + add.s32 %r29702, %r29701, %r29681; + xor.b32 %r29703, %r29702, %r29669; + shf.l.wrap.b32 %r29704, %r29703, %r29703, 20; + add.s32 %r29705, %r29699, %r28937; + add.s32 %r29706, %r29705, %r29704; + xor.b32 %r29707, %r29706, %r29701; + shf.l.wrap.b32 %r29708, %r29707, %r29707, 24; + add.s32 %r29709, %r29708, %r29702; + xor.b32 %r29710, %r29709, %r29704; + shf.l.wrap.b32 %r29711, %r29710, %r29710, 25; + add.s32 %r29712, %r29664, %r28865; + add.s32 %r29713, %r29712, %r29683; + xor.b32 %r29714, %r29652, %r29713; + shf.l.wrap.b32 %r29715, %r29714, %r29714, 16; + add.s32 %r29716, %r29715, %r29639; + xor.b32 %r29717, %r29716, %r29683; + shf.l.wrap.b32 %r29718, %r29717, %r29717, 20; + add.s32 %r29719, %r29713, %r28873; + add.s32 %r29720, %r29719, %r29718; + xor.b32 %r29721, %r29720, %r29715; + shf.l.wrap.b32 %r29722, %r29721, %r29721, 24; + add.s32 %r29723, %r29722, %r29716; + xor.b32 %r29724, %r29723, %r29718; + shf.l.wrap.b32 %r29725, %r29724, %r29724, 25; + add.s32 %r29726, %r29678, %r28897; + add.s32 %r29727, %r29726, %r29641; + xor.b32 %r29728, %r29727, %r29666; + shf.l.wrap.b32 %r29729, %r29728, %r29728, 16; + add.s32 %r29730, %r29729, %r29653; + xor.b32 %r29731, %r29730, %r29641; + shf.l.wrap.b32 %r29732, %r29731, %r29731, 20; + add.s32 %r29733, %r29727, %r28945; + add.s32 %r29734, %r29733, %r29732; + xor.b32 %r29735, %r29734, %r29729; + shf.l.wrap.b32 %r29736, %r29735, %r29735, 24; + add.s32 %r29737, %r29736, %r29730; + xor.b32 %r29738, %r29737, %r29732; + shf.l.wrap.b32 %r29739, %r29738, %r29738, 25; + xor.b32 %r29740, %r29692, %r29723; + cvt.u64.u32 %rd1207, %r29740; + xor.b32 %r29741, %r29737, %r29706; + and.b32 %r29742, %r29741, 255; + cvt.u64.u32 %rd1208, %r29742; + cvt.u64.u32 %rd1209, %r29741; + shl.b64 %rd1210, %rd1209, 32; + and.b64 %rd1211, %rd1210, 280375465082880; + and.b64 %rd1212, %rd1210, 71776119061217280; + shr.u32 %r29743, %r29741, 24; + cvt.u64.u32 %rd1213, %r29743; + shl.b64 %rd1214, %rd1213, 56; + bfi.b64 %rd1215, %rd1208, %rd1207, 32, 32; + or.b64 %rd1216, %rd1215, %rd1211; + or.b64 %rd1217, %rd1216, %rd1212; + or.b64 %rd353, %rd1217, %rd1214; + xor.b32 %r29744, %r29695, %r29720; + cvt.u64.u32 %rd1218, %r29744; + xor.b32 %r29745, %r29734, %r29709; + and.b32 %r29746, %r29745, 255; + cvt.u64.u32 %rd1219, %r29746; + cvt.u64.u32 %rd1220, %r29745; + shl.b64 %rd1221, %rd1220, 32; + and.b64 %rd1222, %rd1221, 280375465082880; + and.b64 %rd1223, %rd1221, 71776119061217280; + shr.u32 %r29747, %r29745, 24; + cvt.u64.u32 %rd1224, %r29747; + shl.b64 %rd1225, %rd1224, 56; + bfi.b64 %rd1226, %rd1219, %rd1218, 32, 32; + or.b64 %rd1227, %rd1226, %rd1222; + or.b64 %rd1228, %rd1227, %rd1223; + or.b64 %rd352, %rd1228, %rd1225; + xor.b32 %r29748, %r29739, %r29708; + cvt.u64.u32 %rd1229, %r29748; + xor.b32 %r29749, %r29697, %r29722; + and.b32 %r29750, %r29749, 255; + cvt.u64.u32 %rd1230, %r29750; + cvt.u64.u32 %rd1231, %r29749; + shl.b64 
%rd1232, %rd1231, 32; + and.b64 %rd1233, %rd1232, 280375465082880; + and.b64 %rd1234, %rd1232, 71776119061217280; + shr.u32 %r29751, %r29749, 24; + cvt.u64.u32 %rd1235, %r29751; + shl.b64 %rd1236, %rd1235, 56; + bfi.b64 %rd1237, %rd1230, %rd1229, 32, 32; + or.b64 %rd1238, %rd1237, %rd1233; + or.b64 %rd1239, %rd1238, %rd1234; + or.b64 %rd1370, %rd1239, %rd1236; + xor.b32 %r29752, %r29736, %r29711; + cvt.u64.u32 %rd1240, %r29752; + xor.b32 %r29753, %r29694, %r29725; + and.b32 %r29754, %r29753, 255; + cvt.u64.u32 %rd1241, %r29754; + cvt.u64.u32 %rd1242, %r29753; + shl.b64 %rd1243, %rd1242, 32; + and.b64 %rd1244, %rd1243, 280375465082880; + and.b64 %rd1245, %rd1243, 71776119061217280; + shr.u32 %r29755, %r29753, 24; + cvt.u64.u32 %rd1246, %r29755; + shl.b64 %rd1247, %rd1246, 56; + bfi.b64 %rd1248, %rd1241, %rd1240, 32, 32; + or.b64 %rd1249, %rd1248, %rd1244; + or.b64 %rd1250, %rd1249, %rd1245; + or.b64 %rd1369, %rd1250, %rd1247; + bra.uni $L__BB2_96; + +$L__BB2_89: + setp.eq.s16 %p51, %rs500, 0; + selp.u16 %rs502, 1, 0, %p51; + ld.local.u8 %rs665, [%rd2+138]; + or.b16 %rs503, %rs665, %rs502; + or.b16 %rs732, %rs503, 2; + ld.local.u64 %rd1368, [%rd2+64]; + ld.local.v2.u32 {%r31257, %r31256}, [%rd2+32]; + ld.local.v2.u32 {%r31255, %r31254}, [%rd2+40]; + ld.local.v2.u32 {%r31253, %r31252}, [%rd2+48]; + ld.local.v2.u32 {%r31251, %r31250}, [%rd2+56]; + ld.local.v4.u16 {%rs798, %rs800, %rs802, %rs804}, [%rd2+72]; + shr.u16 %rs799, %rs798, 8; + shr.u16 %rs801, %rs800, 8; + shr.u16 %rs803, %rs802, 8; + shr.u16 %rs805, %rs804, 8; + ld.local.v4.u16 {%rs806, %rs808, %rs810, %rs812}, [%rd2+80]; + shr.u16 %rs807, %rs806, 8; + shr.u16 %rs809, %rs808, 8; + shr.u16 %rs811, %rs810, 8; + shr.u16 %rs813, %rs812, 8; + ld.local.v4.u16 {%rs814, %rs816, %rs818, %rs820}, [%rd2+88]; + shr.u16 %rs815, %rs814, 8; + shr.u16 %rs817, %rs816, 8; + shr.u16 %rs819, %rs818, 8; + shr.u16 %rs821, %rs820, 8; + ld.local.v4.u16 {%rs822, %rs824, %rs826, %rs828}, [%rd2+96]; + shr.u16 %rs823, %rs822, 8; + shr.u16 %rs825, %rs824, 8; + shr.u16 %rs827, %rs826, 8; + shr.u16 %rs829, %rs828, 8; + ld.local.v4.u16 {%rs830, %rs832, %rs834, %rs836}, [%rd2+104]; + shr.u16 %rs831, %rs830, 8; + shr.u16 %rs833, %rs832, 8; + shr.u16 %rs835, %rs834, 8; + shr.u16 %rs837, %rs836, 8; + ld.local.v4.u16 {%rs838, %rs840, %rs842, %rs844}, [%rd2+112]; + shr.u16 %rs839, %rs838, 8; + shr.u16 %rs841, %rs840, 8; + shr.u16 %rs843, %rs842, 8; + shr.u16 %rs845, %rs844, 8; + ld.local.v4.u8 {%rs846, %rs847, %rs848, %rs849}, [%rd2+120]; + ld.local.v2.u8 {%rs850, %rs851}, [%rd2+124]; + ld.local.v2.u8 {%rs852, %rs853}, [%rd2+126]; + ld.local.v4.u8 {%rs854, %rs855, %rs856, %rs857}, [%rd2+128]; + ld.local.v2.u8 {%rs858, %rs859}, [%rd2+132]; + ld.local.v2.u8 {%rs860, %rs861}, [%rd2+134]; + +$L__BB2_91: + setp.eq.s64 %p52, %rd1367, 0; + mov.u32 %r31258, %r31257; + mov.u32 %r31259, %r31256; + mov.u32 %r31260, %r31255; + mov.u32 %r31261, %r31254; + mov.u32 %r31262, %r31253; + mov.u32 %r31263, %r31252; + mov.u32 %r31264, %r31251; + mov.u32 %r31265, %r31250; + mov.u16 %rs863, %rs732; + @%p52 bra $L__BB2_94; + + or.b16 %rs863, %rs665, 4; + ld.local.v2.u32 {%r31258, %r31259}, [%rd2]; + ld.local.v2.u32 {%r31260, %r31261}, [%rd2+8]; + ld.local.v2.u32 {%r31262, %r31263}, [%rd2+16]; + ld.local.v2.u32 {%r31264, %r31265}, [%rd2+24]; + mov.u16 %rs766, %rs829; + mov.u16 %rs767, %rs828; + mov.u16 %rs768, %rs827; + mov.u16 %rs769, %rs826; + mov.u16 %rs770, %rs825; + mov.u16 %rs771, %rs824; + mov.u16 %rs772, %rs823; + mov.u16 %rs773, %rs822; + mov.u16 %rs774, %rs821; + mov.u16 %rs775, 
%rs820; + mov.u16 %rs776, %rs819; + mov.u16 %rs777, %rs818; + mov.u16 %rs778, %rs817; + mov.u16 %rs779, %rs816; + mov.u16 %rs780, %rs815; + mov.u16 %rs781, %rs814; + mov.u16 %rs782, %rs813; + mov.u16 %rs783, %rs812; + mov.u16 %rs784, %rs811; + mov.u16 %rs785, %rs810; + mov.u16 %rs786, %rs809; + mov.u16 %rs787, %rs808; + mov.u16 %rs788, %rs807; + mov.u16 %rs789, %rs806; + mov.u16 %rs790, %rs805; + mov.u16 %rs791, %rs804; + mov.u16 %rs792, %rs803; + mov.u16 %rs793, %rs802; + mov.u16 %rs794, %rs801; + mov.u16 %rs795, %rs800; + mov.u16 %rs796, %rs799; + mov.u16 %rs797, %rs798; + +$L__BB2_93: + add.s64 %rd1367, %rd1367, -1; + shl.b64 %rd1160, %rd1367, 5; + add.s64 %rd1161, %rd2, %rd1160; + ld.local.u8 %rs798, [%rd1161+145]; + mov.u64 %rd1159, 0; + ld.local.u8 %rs799, [%rd1161+146]; + ld.local.u8 %rs800, [%rd1161+147]; + ld.local.u8 %rs801, [%rd1161+148]; + ld.local.u8 %rs802, [%rd1161+149]; + ld.local.u8 %rs803, [%rd1161+150]; + ld.local.u8 %rs804, [%rd1161+151]; + ld.local.u8 %rs805, [%rd1161+152]; + ld.local.u8 %rs806, [%rd1161+153]; + ld.local.u8 %rs807, [%rd1161+154]; + ld.local.u8 %rs808, [%rd1161+155]; + ld.local.u8 %rs809, [%rd1161+156]; + ld.local.u8 %rs810, [%rd1161+157]; + ld.local.u8 %rs811, [%rd1161+158]; + ld.local.u8 %rs812, [%rd1161+159]; + ld.local.u8 %rs813, [%rd1161+160]; + ld.local.u8 %rs814, [%rd1161+161]; + ld.local.u8 %rs815, [%rd1161+162]; + ld.local.u8 %rs816, [%rd1161+163]; + ld.local.u8 %rs817, [%rd1161+164]; + ld.local.u8 %rs818, [%rd1161+165]; + ld.local.u8 %rs819, [%rd1161+166]; + ld.local.u8 %rs820, [%rd1161+167]; + ld.local.u8 %rs821, [%rd1161+168]; + ld.local.u8 %rs822, [%rd1161+169]; + ld.local.u8 %rs823, [%rd1161+170]; + ld.local.u8 %rs824, [%rd1161+171]; + ld.local.u8 %rs825, [%rd1161+172]; + ld.local.u8 %rs826, [%rd1161+173]; + ld.local.u8 %rs827, [%rd1161+174]; + ld.local.u8 %rs828, [%rd1161+175]; + ld.local.u8 %rs829, [%rd1161+176]; + cvt.u32.u16 %r26825, %rs797; + and.b32 %r26826, %r26825, 255; + cvt.u32.u16 %r26827, %rs796; + prmt.b32 %r26828, %r26827, %r26826, 30212; + cvt.u32.u16 %r26829, %rs795; + shl.b32 %r26830, %r26829, 16; + and.b32 %r26831, %r26830, 16711680; + or.b32 %r26832, %r26828, %r26831; + cvt.u32.u16 %r26833, %rs794; + shl.b32 %r26834, %r26833, 24; + or.b32 %r26835, %r26832, %r26834; + cvt.u32.u16 %r26836, %rs793; + and.b32 %r26837, %r26836, 255; + cvt.u32.u16 %r26838, %rs792; + prmt.b32 %r26839, %r26838, %r26837, 30212; + cvt.u32.u16 %r26840, %rs791; + shl.b32 %r26841, %r26840, 16; + and.b32 %r26842, %r26841, 16711680; + or.b32 %r26843, %r26839, %r26842; + cvt.u32.u16 %r26844, %rs790; + shl.b32 %r26845, %r26844, 24; + or.b32 %r26846, %r26843, %r26845; + cvt.u32.u16 %r26847, %rs789; + and.b32 %r26848, %r26847, 255; + cvt.u32.u16 %r26849, %rs788; + prmt.b32 %r26850, %r26849, %r26848, 30212; + cvt.u32.u16 %r26851, %rs787; + shl.b32 %r26852, %r26851, 16; + and.b32 %r26853, %r26852, 16711680; + or.b32 %r26854, %r26850, %r26853; + cvt.u32.u16 %r26855, %rs786; + shl.b32 %r26856, %r26855, 24; + or.b32 %r26857, %r26854, %r26856; + cvt.u32.u16 %r26858, %rs785; + and.b32 %r26859, %r26858, 255; + cvt.u32.u16 %r26860, %rs784; + prmt.b32 %r26861, %r26860, %r26859, 30212; + cvt.u32.u16 %r26862, %rs783; + shl.b32 %r26863, %r26862, 16; + and.b32 %r26864, %r26863, 16711680; + or.b32 %r26865, %r26861, %r26864; + cvt.u32.u16 %r26866, %rs782; + shl.b32 %r26867, %r26866, 24; + or.b32 %r26868, %r26865, %r26867; + cvt.u32.u16 %r26869, %rs781; + and.b32 %r26870, %r26869, 255; + cvt.u32.u16 %r26871, %rs780; + prmt.b32 %r26872, %r26871, %r26870, 30212; + 
cvt.u32.u16 %r26873, %rs779; + shl.b32 %r26874, %r26873, 16; + and.b32 %r26875, %r26874, 16711680; + or.b32 %r26876, %r26872, %r26875; + cvt.u32.u16 %r26877, %rs778; + shl.b32 %r26878, %r26877, 24; + or.b32 %r26879, %r26876, %r26878; + cvt.u32.u16 %r26880, %rs777; + and.b32 %r26881, %r26880, 255; + cvt.u32.u16 %r26882, %rs776; + prmt.b32 %r26883, %r26882, %r26881, 30212; + cvt.u32.u16 %r26884, %rs775; + shl.b32 %r26885, %r26884, 16; + and.b32 %r26886, %r26885, 16711680; + or.b32 %r26887, %r26883, %r26886; + cvt.u32.u16 %r26888, %rs774; + shl.b32 %r26889, %r26888, 24; + or.b32 %r26890, %r26887, %r26889; + cvt.u32.u16 %r26891, %rs773; + and.b32 %r26892, %r26891, 255; + cvt.u32.u16 %r26893, %rs772; + prmt.b32 %r26894, %r26893, %r26892, 30212; + cvt.u32.u16 %r26895, %rs771; + shl.b32 %r26896, %r26895, 16; + and.b32 %r26897, %r26896, 16711680; + or.b32 %r26898, %r26894, %r26897; + cvt.u32.u16 %r26899, %rs770; + shl.b32 %r26900, %r26899, 24; + or.b32 %r26901, %r26898, %r26900; + cvt.u32.u16 %r26902, %rs769; + and.b32 %r26903, %r26902, 255; + cvt.u32.u16 %r26904, %rs768; + prmt.b32 %r26905, %r26904, %r26903, 30212; + cvt.u32.u16 %r26906, %rs767; + shl.b32 %r26907, %r26906, 16; + and.b32 %r26908, %r26907, 16711680; + or.b32 %r26909, %r26905, %r26908; + cvt.u32.u16 %r26910, %rs766; + shl.b32 %r26911, %r26910, 24; + or.b32 %r26912, %r26909, %r26911; + cvt.u32.u16 %r26913, %rs830; + and.b32 %r26914, %r26913, 255; + cvt.u32.u16 %r26915, %rs831; + prmt.b32 %r26916, %r26915, %r26914, 30212; + cvt.u32.u16 %r26917, %rs832; + shl.b32 %r26918, %r26917, 16; + and.b32 %r26919, %r26918, 16711680; + or.b32 %r26920, %r26916, %r26919; + cvt.u32.u16 %r26921, %rs833; + shl.b32 %r26922, %r26921, 24; + or.b32 %r26923, %r26920, %r26922; + cvt.u32.u16 %r26924, %rs834; + and.b32 %r26925, %r26924, 255; + cvt.u32.u16 %r26926, %rs835; + prmt.b32 %r26927, %r26926, %r26925, 30212; + cvt.u32.u16 %r26928, %rs836; + shl.b32 %r26929, %r26928, 16; + and.b32 %r26930, %r26929, 16711680; + or.b32 %r26931, %r26927, %r26930; + cvt.u32.u16 %r26932, %rs837; + shl.b32 %r26933, %r26932, 24; + or.b32 %r26934, %r26931, %r26933; + cvt.u32.u16 %r26935, %rs838; + and.b32 %r26936, %r26935, 255; + cvt.u32.u16 %r26937, %rs839; + prmt.b32 %r26938, %r26937, %r26936, 30212; + cvt.u32.u16 %r26939, %rs840; + shl.b32 %r26940, %r26939, 16; + and.b32 %r26941, %r26940, 16711680; + or.b32 %r26942, %r26938, %r26941; + cvt.u32.u16 %r26943, %rs841; + shl.b32 %r26944, %r26943, 24; + or.b32 %r26945, %r26942, %r26944; + cvt.u32.u16 %r26946, %rs842; + and.b32 %r26947, %r26946, 255; + cvt.u32.u16 %r26948, %rs843; + prmt.b32 %r26949, %r26948, %r26947, 30212; + cvt.u32.u16 %r26950, %rs844; + shl.b32 %r26951, %r26950, 16; + and.b32 %r26952, %r26951, 16711680; + or.b32 %r26953, %r26949, %r26952; + cvt.u32.u16 %r26954, %rs845; + shl.b32 %r26955, %r26954, 24; + or.b32 %r26956, %r26953, %r26955; + cvt.u32.u16 %r26957, %rs846; + and.b32 %r26958, %r26957, 255; + cvt.u32.u16 %r26959, %rs847; + prmt.b32 %r26960, %r26959, %r26958, 30212; + cvt.u32.u16 %r26961, %rs848; + shl.b32 %r26962, %r26961, 16; + and.b32 %r26963, %r26962, 16711680; + or.b32 %r26964, %r26960, %r26963; + cvt.u32.u16 %r26965, %rs849; + shl.b32 %r26966, %r26965, 24; + or.b32 %r26967, %r26964, %r26966; + cvt.u32.u16 %r26968, %rs850; + and.b32 %r26969, %r26968, 255; + cvt.u32.u16 %r26970, %rs851; + prmt.b32 %r26971, %r26970, %r26969, 30212; + cvt.u32.u16 %r26972, %rs852; + shl.b32 %r26973, %r26972, 16; + and.b32 %r26974, %r26973, 16711680; + or.b32 %r26975, %r26971, %r26974; + cvt.u32.u16 %r26976, %rs853; + 
shl.b32 %r26977, %r26976, 24; + or.b32 %r26978, %r26975, %r26977; + cvt.u32.u16 %r26979, %rs854; + and.b32 %r26980, %r26979, 255; + cvt.u32.u16 %r26981, %rs855; + prmt.b32 %r26982, %r26981, %r26980, 30212; + cvt.u32.u16 %r26983, %rs856; + shl.b32 %r26984, %r26983, 16; + and.b32 %r26985, %r26984, 16711680; + or.b32 %r26986, %r26982, %r26985; + cvt.u32.u16 %r26987, %rs857; + shl.b32 %r26988, %r26987, 24; + or.b32 %r26989, %r26986, %r26988; + cvt.u32.u16 %r26990, %rs858; + and.b32 %r26991, %r26990, 255; + cvt.u32.u16 %r26992, %rs859; + prmt.b32 %r26993, %r26992, %r26991, 30212; + cvt.u32.u16 %r26994, %rs860; + shl.b32 %r26995, %r26994, 16; + and.b32 %r26996, %r26995, 16711680; + or.b32 %r26997, %r26993, %r26996; + cvt.u32.u16 %r26998, %rs861; + shl.b32 %r26999, %r26998, 24; + or.b32 %r27000, %r26997, %r26999; + shr.u64 %rd1162, %rd1368, 32; + cvt.u32.u64 %r27001, %rd1162; + add.s32 %r27002, %r31257, %r26835; + add.s32 %r27003, %r27002, %r31253; + cvt.u32.u64 %r27004, %rd1368; + xor.b32 %r27005, %r27003, %r27004; + shf.l.wrap.b32 %r27006, %r27005, %r27005, 16; + add.s32 %r27007, %r27006, 1779033703; + xor.b32 %r27008, %r27007, %r31253; + shf.l.wrap.b32 %r27009, %r27008, %r27008, 20; + add.s32 %r27010, %r27003, %r26846; + add.s32 %r27011, %r27010, %r27009; + xor.b32 %r27012, %r27011, %r27006; + shf.l.wrap.b32 %r27013, %r27012, %r27012, 24; + add.s32 %r27014, %r27013, %r27007; + xor.b32 %r27015, %r27014, %r27009; + shf.l.wrap.b32 %r27016, %r27015, %r27015, 25; + add.s32 %r27017, %r31256, %r26857; + add.s32 %r27018, %r27017, %r31252; + xor.b32 %r27019, %r27018, %r27001; + shf.l.wrap.b32 %r27020, %r27019, %r27019, 16; + add.s32 %r27021, %r27020, -1150833019; + xor.b32 %r27022, %r27021, %r31252; + shf.l.wrap.b32 %r27023, %r27022, %r27022, 20; + add.s32 %r27024, %r27018, %r26868; + add.s32 %r27025, %r27024, %r27023; + xor.b32 %r27026, %r27025, %r27020; + shf.l.wrap.b32 %r27027, %r27026, %r27026, 24; + add.s32 %r27028, %r27027, %r27021; + xor.b32 %r27029, %r27028, %r27023; + shf.l.wrap.b32 %r27030, %r27029, %r27029, 25; + add.s32 %r27031, %r31255, %r26879; + add.s32 %r27032, %r27031, %r31251; + cvt.u32.u16 %r27033, %rs862; + and.b32 %r27034, %r27033, 255; + xor.b32 %r27035, %r27032, %r27034; + shr.u32 %r27036, %r27032, 16; + shl.b32 %r27037, %r27035, 16; + or.b32 %r27038, %r27037, %r27036; + add.s32 %r27039, %r27038, 1013904242; + xor.b32 %r27040, %r27039, %r31251; + shf.l.wrap.b32 %r27041, %r27040, %r27040, 20; + add.s32 %r27042, %r27032, %r26890; + add.s32 %r27043, %r27042, %r27041; + xor.b32 %r27044, %r27043, %r27038; + shf.l.wrap.b32 %r27045, %r27044, %r27044, 24; + add.s32 %r27046, %r27045, %r27039; + xor.b32 %r27047, %r27046, %r27041; + shf.l.wrap.b32 %r27048, %r27047, %r27047, 25; + add.s32 %r27049, %r31254, %r26901; + add.s32 %r27050, %r27049, %r31250; + cvt.u32.u16 %r27051, %rs732; + and.b32 %r27052, %r27051, 255; + xor.b32 %r27053, %r27050, %r27052; + shr.u32 %r27054, %r27050, 16; + shl.b32 %r27055, %r27053, 16; + or.b32 %r27056, %r27055, %r27054; + add.s32 %r27057, %r27056, -1521486534; + xor.b32 %r27058, %r27057, %r31250; + shf.l.wrap.b32 %r27059, %r27058, %r27058, 20; + add.s32 %r27060, %r27050, %r26912; + add.s32 %r27061, %r27060, %r27059; + xor.b32 %r27062, %r27061, %r27056; + shf.l.wrap.b32 %r27063, %r27062, %r27062, 24; + add.s32 %r27064, %r27063, %r27057; + xor.b32 %r27065, %r27064, %r27059; + shf.l.wrap.b32 %r27066, %r27065, %r27065, 25; + add.s32 %r27067, %r27011, %r26923; + add.s32 %r27068, %r27067, %r27030; + xor.b32 %r27069, %r27068, %r27063; + shf.l.wrap.b32 %r27070, %r27069, 
%r27069, 16; + add.s32 %r27071, %r27070, %r27046; + xor.b32 %r27072, %r27071, %r27030; + shf.l.wrap.b32 %r27073, %r27072, %r27072, 20; + add.s32 %r27074, %r27068, %r26934; + add.s32 %r27075, %r27074, %r27073; + xor.b32 %r27076, %r27075, %r27070; + shf.l.wrap.b32 %r27077, %r27076, %r27076, 24; + add.s32 %r27078, %r27077, %r27071; + xor.b32 %r27079, %r27078, %r27073; + shf.l.wrap.b32 %r27080, %r27079, %r27079, 25; + add.s32 %r27081, %r27025, %r26945; + add.s32 %r27082, %r27081, %r27048; + xor.b32 %r27083, %r27082, %r27013; + shf.l.wrap.b32 %r27084, %r27083, %r27083, 16; + add.s32 %r27085, %r27084, %r27064; + xor.b32 %r27086, %r27085, %r27048; + shf.l.wrap.b32 %r27087, %r27086, %r27086, 20; + add.s32 %r27088, %r27082, %r26956; + add.s32 %r27089, %r27088, %r27087; + xor.b32 %r27090, %r27089, %r27084; + shf.l.wrap.b32 %r27091, %r27090, %r27090, 24; + add.s32 %r27092, %r27091, %r27085; + xor.b32 %r27093, %r27092, %r27087; + shf.l.wrap.b32 %r27094, %r27093, %r27093, 25; + add.s32 %r27095, %r27043, %r26967; + add.s32 %r27096, %r27095, %r27066; + xor.b32 %r27097, %r27096, %r27027; + shf.l.wrap.b32 %r27098, %r27097, %r27097, 16; + add.s32 %r27099, %r27098, %r27014; + xor.b32 %r27100, %r27099, %r27066; + shf.l.wrap.b32 %r27101, %r27100, %r27100, 20; + add.s32 %r27102, %r27096, %r26978; + add.s32 %r27103, %r27102, %r27101; + xor.b32 %r27104, %r27103, %r27098; + shf.l.wrap.b32 %r27105, %r27104, %r27104, 24; + add.s32 %r27106, %r27105, %r27099; + xor.b32 %r27107, %r27106, %r27101; + shf.l.wrap.b32 %r27108, %r27107, %r27107, 25; + add.s32 %r27109, %r27061, %r26989; + add.s32 %r27110, %r27109, %r27016; + xor.b32 %r27111, %r27110, %r27045; + shf.l.wrap.b32 %r27112, %r27111, %r27111, 16; + add.s32 %r27113, %r27112, %r27028; + xor.b32 %r27114, %r27113, %r27016; + shf.l.wrap.b32 %r27115, %r27114, %r27114, 20; + add.s32 %r27116, %r27110, %r27000; + add.s32 %r27117, %r27116, %r27115; + xor.b32 %r27118, %r27117, %r27112; + shf.l.wrap.b32 %r27119, %r27118, %r27118, 24; + add.s32 %r27120, %r27119, %r27113; + xor.b32 %r27121, %r27120, %r27115; + shf.l.wrap.b32 %r27122, %r27121, %r27121, 25; + add.s32 %r27123, %r27075, %r26857; + add.s32 %r27124, %r27123, %r27122; + xor.b32 %r27125, %r27124, %r27091; + shf.l.wrap.b32 %r27126, %r27125, %r27125, 16; + add.s32 %r27127, %r27126, %r27106; + xor.b32 %r27128, %r27127, %r27122; + shf.l.wrap.b32 %r27129, %r27128, %r27128, 20; + add.s32 %r27130, %r27124, %r26901; + add.s32 %r27131, %r27130, %r27129; + xor.b32 %r27132, %r27131, %r27126; + shf.l.wrap.b32 %r27133, %r27132, %r27132, 24; + add.s32 %r27134, %r27133, %r27127; + xor.b32 %r27135, %r27134, %r27129; + shf.l.wrap.b32 %r27136, %r27135, %r27135, 25; + add.s32 %r27137, %r27089, %r26868; + add.s32 %r27138, %r27137, %r27080; + xor.b32 %r27139, %r27138, %r27105; + shf.l.wrap.b32 %r27140, %r27139, %r27139, 16; + add.s32 %r27141, %r27140, %r27120; + xor.b32 %r27142, %r27141, %r27080; + shf.l.wrap.b32 %r27143, %r27142, %r27142, 20; + add.s32 %r27144, %r27138, %r26945; + add.s32 %r27145, %r27144, %r27143; + xor.b32 %r27146, %r27145, %r27140; + shf.l.wrap.b32 %r27147, %r27146, %r27146, 24; + add.s32 %r27148, %r27147, %r27141; + xor.b32 %r27149, %r27148, %r27143; + shf.l.wrap.b32 %r27150, %r27149, %r27149, 25; + add.s32 %r27151, %r27103, %r26912; + add.s32 %r27152, %r27151, %r27094; + xor.b32 %r27153, %r27152, %r27119; + shf.l.wrap.b32 %r27154, %r27153, %r27153, 16; + add.s32 %r27155, %r27154, %r27078; + xor.b32 %r27156, %r27155, %r27094; + shf.l.wrap.b32 %r27157, %r27156, %r27156, 20; + add.s32 %r27158, %r27152, %r26835; + add.s32 
%r27159, %r27158, %r27157; + xor.b32 %r27160, %r27159, %r27154; + shf.l.wrap.b32 %r27161, %r27160, %r27160, 24; + add.s32 %r27162, %r27161, %r27155; + xor.b32 %r27163, %r27162, %r27157; + shf.l.wrap.b32 %r27164, %r27163, %r27163, 25; + add.s32 %r27165, %r27117, %r26879; + add.s32 %r27166, %r27165, %r27108; + xor.b32 %r27167, %r27166, %r27077; + shf.l.wrap.b32 %r27168, %r27167, %r27167, 16; + add.s32 %r27169, %r27168, %r27092; + xor.b32 %r27170, %r27169, %r27108; + shf.l.wrap.b32 %r27171, %r27170, %r27170, 20; + add.s32 %r27172, %r27166, %r26978; + add.s32 %r27173, %r27172, %r27171; + xor.b32 %r27174, %r27173, %r27168; + shf.l.wrap.b32 %r27175, %r27174, %r27174, 24; + add.s32 %r27176, %r27175, %r27169; + xor.b32 %r27177, %r27176, %r27171; + shf.l.wrap.b32 %r27178, %r27177, %r27177, 25; + add.s32 %r27179, %r27131, %r26846; + add.s32 %r27180, %r27179, %r27150; + xor.b32 %r27181, %r27180, %r27175; + shf.l.wrap.b32 %r27182, %r27181, %r27181, 16; + add.s32 %r27183, %r27182, %r27162; + xor.b32 %r27184, %r27183, %r27150; + shf.l.wrap.b32 %r27185, %r27184, %r27184, 20; + add.s32 %r27186, %r27180, %r26956; + add.s32 %r27187, %r27186, %r27185; + xor.b32 %r27188, %r27187, %r27182; + shf.l.wrap.b32 %r27189, %r27188, %r27188, 24; + add.s32 %r27190, %r27189, %r27183; + xor.b32 %r27191, %r27190, %r27185; + shf.l.wrap.b32 %r27192, %r27191, %r27191, 25; + add.s32 %r27193, %r27145, %r26967; + add.s32 %r27194, %r27193, %r27164; + xor.b32 %r27195, %r27194, %r27133; + shf.l.wrap.b32 %r27196, %r27195, %r27195, 16; + add.s32 %r27197, %r27196, %r27176; + xor.b32 %r27198, %r27197, %r27164; + shf.l.wrap.b32 %r27199, %r27198, %r27198, 20; + add.s32 %r27200, %r27194, %r26890; + add.s32 %r27201, %r27200, %r27199; + xor.b32 %r27202, %r27201, %r27196; + shf.l.wrap.b32 %r27203, %r27202, %r27202, 24; + add.s32 %r27204, %r27203, %r27197; + xor.b32 %r27205, %r27204, %r27199; + shf.l.wrap.b32 %r27206, %r27205, %r27205, 25; + add.s32 %r27207, %r27159, %r26934; + add.s32 %r27208, %r27207, %r27178; + xor.b32 %r27209, %r27208, %r27147; + shf.l.wrap.b32 %r27210, %r27209, %r27209, 16; + add.s32 %r27211, %r27210, %r27134; + xor.b32 %r27212, %r27211, %r27178; + shf.l.wrap.b32 %r27213, %r27212, %r27212, 20; + add.s32 %r27214, %r27208, %r26989; + add.s32 %r27215, %r27214, %r27213; + xor.b32 %r27216, %r27215, %r27210; + shf.l.wrap.b32 %r27217, %r27216, %r27216, 24; + add.s32 %r27218, %r27217, %r27211; + xor.b32 %r27219, %r27218, %r27213; + shf.l.wrap.b32 %r27220, %r27219, %r27219, 25; + add.s32 %r27221, %r27173, %r27000; + add.s32 %r27222, %r27221, %r27136; + xor.b32 %r27223, %r27222, %r27161; + shf.l.wrap.b32 %r27224, %r27223, %r27223, 16; + add.s32 %r27225, %r27224, %r27148; + xor.b32 %r27226, %r27225, %r27136; + shf.l.wrap.b32 %r27227, %r27226, %r27226, 20; + add.s32 %r27228, %r27222, %r26923; + add.s32 %r27229, %r27228, %r27227; + xor.b32 %r27230, %r27229, %r27224; + shf.l.wrap.b32 %r27231, %r27230, %r27230, 24; + add.s32 %r27232, %r27231, %r27225; + xor.b32 %r27233, %r27232, %r27227; + shf.l.wrap.b32 %r27234, %r27233, %r27233, 25; + add.s32 %r27235, %r27187, %r26868; + add.s32 %r27236, %r27235, %r27234; + xor.b32 %r27237, %r27236, %r27203; + shf.l.wrap.b32 %r27238, %r27237, %r27237, 16; + add.s32 %r27239, %r27238, %r27218; + xor.b32 %r27240, %r27239, %r27234; + shf.l.wrap.b32 %r27241, %r27240, %r27240, 20; + add.s32 %r27242, %r27236, %r26879; + add.s32 %r27243, %r27242, %r27241; + xor.b32 %r27244, %r27243, %r27238; + shf.l.wrap.b32 %r27245, %r27244, %r27244, 24; + add.s32 %r27246, %r27245, %r27239; + xor.b32 %r27247, %r27246, 
%r27241; + shf.l.wrap.b32 %r27248, %r27247, %r27247, 25; + add.s32 %r27249, %r27201, %r26945; + add.s32 %r27250, %r27249, %r27192; + xor.b32 %r27251, %r27250, %r27217; + shf.l.wrap.b32 %r27252, %r27251, %r27251, 16; + add.s32 %r27253, %r27252, %r27232; + xor.b32 %r27254, %r27253, %r27192; + shf.l.wrap.b32 %r27255, %r27254, %r27254, 20; + add.s32 %r27256, %r27250, %r26967; + add.s32 %r27257, %r27256, %r27255; + xor.b32 %r27258, %r27257, %r27252; + shf.l.wrap.b32 %r27259, %r27258, %r27258, 24; + add.s32 %r27260, %r27259, %r27253; + xor.b32 %r27261, %r27260, %r27255; + shf.l.wrap.b32 %r27262, %r27261, %r27261, 25; + add.s32 %r27263, %r27215, %r26978; + add.s32 %r27264, %r27263, %r27206; + xor.b32 %r27265, %r27264, %r27231; + shf.l.wrap.b32 %r27266, %r27265, %r27265, 16; + add.s32 %r27267, %r27266, %r27190; + xor.b32 %r27268, %r27267, %r27206; + shf.l.wrap.b32 %r27269, %r27268, %r27268, 20; + add.s32 %r27270, %r27264, %r26857; + add.s32 %r27271, %r27270, %r27269; + xor.b32 %r27272, %r27271, %r27266; + shf.l.wrap.b32 %r27273, %r27272, %r27272, 24; + add.s32 %r27274, %r27273, %r27267; + xor.b32 %r27275, %r27274, %r27269; + shf.l.wrap.b32 %r27276, %r27275, %r27275, 25; + add.s32 %r27277, %r27229, %r26912; + add.s32 %r27278, %r27277, %r27220; + xor.b32 %r27279, %r27278, %r27189; + shf.l.wrap.b32 %r27280, %r27279, %r27279, 16; + add.s32 %r27281, %r27280, %r27204; + xor.b32 %r27282, %r27281, %r27220; + shf.l.wrap.b32 %r27283, %r27282, %r27282, 20; + add.s32 %r27284, %r27278, %r26989; + add.s32 %r27285, %r27284, %r27283; + xor.b32 %r27286, %r27285, %r27280; + shf.l.wrap.b32 %r27287, %r27286, %r27286, 24; + add.s32 %r27288, %r27287, %r27281; + xor.b32 %r27289, %r27288, %r27283; + shf.l.wrap.b32 %r27290, %r27289, %r27289, 25; + add.s32 %r27291, %r27243, %r26901; + add.s32 %r27292, %r27291, %r27262; + xor.b32 %r27293, %r27292, %r27287; + shf.l.wrap.b32 %r27294, %r27293, %r27293, 16; + add.s32 %r27295, %r27294, %r27274; + xor.b32 %r27296, %r27295, %r27262; + shf.l.wrap.b32 %r27297, %r27296, %r27296, 20; + add.s32 %r27298, %r27292, %r26890; + add.s32 %r27299, %r27298, %r27297; + xor.b32 %r27300, %r27299, %r27294; + shf.l.wrap.b32 %r27301, %r27300, %r27300, 24; + add.s32 %r27302, %r27301, %r27295; + xor.b32 %r27303, %r27302, %r27297; + shf.l.wrap.b32 %r27304, %r27303, %r27303, 25; + add.s32 %r27305, %r27257, %r26934; + add.s32 %r27306, %r27305, %r27276; + xor.b32 %r27307, %r27306, %r27245; + shf.l.wrap.b32 %r27308, %r27307, %r27307, 16; + add.s32 %r27309, %r27308, %r27288; + xor.b32 %r27310, %r27309, %r27276; + shf.l.wrap.b32 %r27311, %r27310, %r27310, 20; + add.s32 %r27312, %r27306, %r26835; + add.s32 %r27313, %r27312, %r27311; + xor.b32 %r27314, %r27313, %r27308; + shf.l.wrap.b32 %r27315, %r27314, %r27314, 24; + add.s32 %r27316, %r27315, %r27309; + xor.b32 %r27317, %r27316, %r27311; + shf.l.wrap.b32 %r27318, %r27317, %r27317, 25; + add.s32 %r27319, %r27271, %r26956; + add.s32 %r27320, %r27319, %r27290; + xor.b32 %r27321, %r27320, %r27259; + shf.l.wrap.b32 %r27322, %r27321, %r27321, 16; + add.s32 %r27323, %r27322, %r27246; + xor.b32 %r27324, %r27323, %r27290; + shf.l.wrap.b32 %r27325, %r27324, %r27324, 20; + add.s32 %r27326, %r27320, %r27000; + add.s32 %r27327, %r27326, %r27325; + xor.b32 %r27328, %r27327, %r27322; + shf.l.wrap.b32 %r27329, %r27328, %r27328, 24; + add.s32 %r27330, %r27329, %r27323; + xor.b32 %r27331, %r27330, %r27325; + shf.l.wrap.b32 %r27332, %r27331, %r27331, 25; + add.s32 %r27333, %r27285, %r26923; + add.s32 %r27334, %r27333, %r27248; + xor.b32 %r27335, %r27334, %r27273; + 
shf.l.wrap.b32 %r27336, %r27335, %r27335, 16; + add.s32 %r27337, %r27336, %r27260; + xor.b32 %r27338, %r27337, %r27248; + shf.l.wrap.b32 %r27339, %r27338, %r27338, 20; + add.s32 %r27340, %r27334, %r26846; + add.s32 %r27341, %r27340, %r27339; + xor.b32 %r27342, %r27341, %r27336; + shf.l.wrap.b32 %r27343, %r27342, %r27342, 24; + add.s32 %r27344, %r27343, %r27337; + xor.b32 %r27345, %r27344, %r27339; + shf.l.wrap.b32 %r27346, %r27345, %r27345, 25; + add.s32 %r27347, %r27299, %r26945; + add.s32 %r27348, %r27347, %r27346; + xor.b32 %r27349, %r27348, %r27315; + shf.l.wrap.b32 %r27350, %r27349, %r27349, 16; + add.s32 %r27351, %r27350, %r27330; + xor.b32 %r27352, %r27351, %r27346; + shf.l.wrap.b32 %r27353, %r27352, %r27352, 20; + add.s32 %r27354, %r27348, %r26912; + add.s32 %r27355, %r27354, %r27353; + xor.b32 %r27356, %r27355, %r27350; + shf.l.wrap.b32 %r27357, %r27356, %r27356, 24; + add.s32 %r27358, %r27357, %r27351; + xor.b32 %r27359, %r27358, %r27353; + shf.l.wrap.b32 %r27360, %r27359, %r27359, 25; + add.s32 %r27361, %r27313, %r26967; + add.s32 %r27362, %r27361, %r27304; + xor.b32 %r27363, %r27362, %r27329; + shf.l.wrap.b32 %r27364, %r27363, %r27363, 16; + add.s32 %r27365, %r27364, %r27344; + xor.b32 %r27366, %r27365, %r27304; + shf.l.wrap.b32 %r27367, %r27366, %r27366, 20; + add.s32 %r27368, %r27362, %r26934; + add.s32 %r27369, %r27368, %r27367; + xor.b32 %r27370, %r27369, %r27364; + shf.l.wrap.b32 %r27371, %r27370, %r27370, 24; + add.s32 %r27372, %r27371, %r27365; + xor.b32 %r27373, %r27372, %r27367; + shf.l.wrap.b32 %r27374, %r27373, %r27373, 25; + add.s32 %r27375, %r27327, %r26989; + add.s32 %r27376, %r27375, %r27318; + xor.b32 %r27377, %r27376, %r27343; + shf.l.wrap.b32 %r27378, %r27377, %r27377, 16; + add.s32 %r27379, %r27378, %r27302; + xor.b32 %r27380, %r27379, %r27318; + shf.l.wrap.b32 %r27381, %r27380, %r27380, 20; + add.s32 %r27382, %r27376, %r26868; + add.s32 %r27383, %r27382, %r27381; + xor.b32 %r27384, %r27383, %r27378; + shf.l.wrap.b32 %r27385, %r27384, %r27384, 24; + add.s32 %r27386, %r27385, %r27379; + xor.b32 %r27387, %r27386, %r27381; + shf.l.wrap.b32 %r27388, %r27387, %r27387, 25; + add.s32 %r27389, %r27341, %r26978; + add.s32 %r27390, %r27389, %r27332; + xor.b32 %r27391, %r27390, %r27301; + shf.l.wrap.b32 %r27392, %r27391, %r27391, 16; + add.s32 %r27393, %r27392, %r27316; + xor.b32 %r27394, %r27393, %r27332; + shf.l.wrap.b32 %r27395, %r27394, %r27394, 20; + add.s32 %r27396, %r27390, %r27000; + add.s32 %r27397, %r27396, %r27395; + xor.b32 %r27398, %r27397, %r27392; + shf.l.wrap.b32 %r27399, %r27398, %r27398, 24; + add.s32 %r27400, %r27399, %r27393; + xor.b32 %r27401, %r27400, %r27395; + shf.l.wrap.b32 %r27402, %r27401, %r27401, 25; + add.s32 %r27403, %r27355, %r26879; + add.s32 %r27404, %r27403, %r27374; + xor.b32 %r27405, %r27404, %r27399; + shf.l.wrap.b32 %r27406, %r27405, %r27405, 16; + add.s32 %r27407, %r27406, %r27386; + xor.b32 %r27408, %r27407, %r27374; + shf.l.wrap.b32 %r27409, %r27408, %r27408, 20; + add.s32 %r27410, %r27404, %r26835; + add.s32 %r27411, %r27410, %r27409; + xor.b32 %r27412, %r27411, %r27406; + shf.l.wrap.b32 %r27413, %r27412, %r27412, 24; + add.s32 %r27414, %r27413, %r27407; + xor.b32 %r27415, %r27414, %r27409; + shf.l.wrap.b32 %r27416, %r27415, %r27415, 25; + add.s32 %r27417, %r27369, %r26956; + add.s32 %r27418, %r27417, %r27388; + xor.b32 %r27419, %r27418, %r27357; + shf.l.wrap.b32 %r27420, %r27419, %r27419, 16; + add.s32 %r27421, %r27420, %r27400; + xor.b32 %r27422, %r27421, %r27388; + shf.l.wrap.b32 %r27423, %r27422, %r27422, 20; + add.s32 
%r27424, %r27418, %r26857; + add.s32 %r27425, %r27424, %r27423; + xor.b32 %r27426, %r27425, %r27420; + shf.l.wrap.b32 %r27427, %r27426, %r27426, 24; + add.s32 %r27428, %r27427, %r27421; + xor.b32 %r27429, %r27428, %r27423; + shf.l.wrap.b32 %r27430, %r27429, %r27429, 25; + add.s32 %r27431, %r27383, %r26890; + add.s32 %r27432, %r27431, %r27402; + xor.b32 %r27433, %r27432, %r27371; + shf.l.wrap.b32 %r27434, %r27433, %r27433, 16; + add.s32 %r27435, %r27434, %r27358; + xor.b32 %r27436, %r27435, %r27402; + shf.l.wrap.b32 %r27437, %r27436, %r27436, 20; + add.s32 %r27438, %r27432, %r26923; + add.s32 %r27439, %r27438, %r27437; + xor.b32 %r27440, %r27439, %r27434; + shf.l.wrap.b32 %r27441, %r27440, %r27440, 24; + add.s32 %r27442, %r27441, %r27435; + xor.b32 %r27443, %r27442, %r27437; + shf.l.wrap.b32 %r27444, %r27443, %r27443, 25; + add.s32 %r27445, %r27397, %r26846; + add.s32 %r27446, %r27445, %r27360; + xor.b32 %r27447, %r27446, %r27385; + shf.l.wrap.b32 %r27448, %r27447, %r27447, 16; + add.s32 %r27449, %r27448, %r27372; + xor.b32 %r27450, %r27449, %r27360; + shf.l.wrap.b32 %r27451, %r27450, %r27450, 20; + add.s32 %r27452, %r27446, %r26901; + add.s32 %r27453, %r27452, %r27451; + xor.b32 %r27454, %r27453, %r27448; + shf.l.wrap.b32 %r27455, %r27454, %r27454, 24; + add.s32 %r27456, %r27455, %r27449; + xor.b32 %r27457, %r27456, %r27451; + shf.l.wrap.b32 %r27458, %r27457, %r27457, 25; + add.s32 %r27459, %r27411, %r26967; + add.s32 %r27460, %r27459, %r27458; + xor.b32 %r27461, %r27460, %r27427; + shf.l.wrap.b32 %r27462, %r27461, %r27461, 16; + add.s32 %r27463, %r27462, %r27442; + xor.b32 %r27464, %r27463, %r27458; + shf.l.wrap.b32 %r27465, %r27464, %r27464, 20; + add.s32 %r27466, %r27460, %r26978; + add.s32 %r27467, %r27466, %r27465; + xor.b32 %r27468, %r27467, %r27462; + shf.l.wrap.b32 %r27469, %r27468, %r27468, 24; + add.s32 %r27470, %r27469, %r27463; + xor.b32 %r27471, %r27470, %r27465; + shf.l.wrap.b32 %r27472, %r27471, %r27471, 25; + add.s32 %r27473, %r27425, %r26934; + add.s32 %r27474, %r27473, %r27416; + xor.b32 %r27475, %r27474, %r27441; + shf.l.wrap.b32 %r27476, %r27475, %r27475, 16; + add.s32 %r27477, %r27476, %r27456; + xor.b32 %r27478, %r27477, %r27416; + shf.l.wrap.b32 %r27479, %r27478, %r27478, 20; + add.s32 %r27480, %r27474, %r26956; + add.s32 %r27481, %r27480, %r27479; + xor.b32 %r27482, %r27481, %r27476; + shf.l.wrap.b32 %r27483, %r27482, %r27482, 24; + add.s32 %r27484, %r27483, %r27477; + xor.b32 %r27485, %r27484, %r27479; + shf.l.wrap.b32 %r27486, %r27485, %r27485, 25; + add.s32 %r27487, %r27439, %r27000; + add.s32 %r27488, %r27487, %r27430; + xor.b32 %r27489, %r27488, %r27455; + shf.l.wrap.b32 %r27490, %r27489, %r27489, 16; + add.s32 %r27491, %r27490, %r27414; + xor.b32 %r27492, %r27491, %r27430; + shf.l.wrap.b32 %r27493, %r27492, %r27492, 20; + add.s32 %r27494, %r27488, %r26945; + add.s32 %r27495, %r27494, %r27493; + xor.b32 %r27496, %r27495, %r27490; + shf.l.wrap.b32 %r27497, %r27496, %r27496, 24; + add.s32 %r27498, %r27497, %r27491; + xor.b32 %r27499, %r27498, %r27493; + shf.l.wrap.b32 %r27500, %r27499, %r27499, 25; + add.s32 %r27501, %r27453, %r26989; + add.s32 %r27502, %r27501, %r27444; + xor.b32 %r27503, %r27502, %r27413; + shf.l.wrap.b32 %r27504, %r27503, %r27503, 16; + add.s32 %r27505, %r27504, %r27428; + xor.b32 %r27506, %r27505, %r27444; + shf.l.wrap.b32 %r27507, %r27506, %r27506, 20; + add.s32 %r27508, %r27502, %r26923; + add.s32 %r27509, %r27508, %r27507; + xor.b32 %r27510, %r27509, %r27504; + shf.l.wrap.b32 %r27511, %r27510, %r27510, 24; + add.s32 %r27512, %r27511, 
%r27505; + xor.b32 %r27513, %r27512, %r27507; + shf.l.wrap.b32 %r27514, %r27513, %r27513, 25; + add.s32 %r27515, %r27467, %r26912; + add.s32 %r27516, %r27515, %r27486; + xor.b32 %r27517, %r27516, %r27511; + shf.l.wrap.b32 %r27518, %r27517, %r27517, 16; + add.s32 %r27519, %r27518, %r27498; + xor.b32 %r27520, %r27519, %r27486; + shf.l.wrap.b32 %r27521, %r27520, %r27520, 20; + add.s32 %r27522, %r27516, %r26857; + add.s32 %r27523, %r27522, %r27521; + xor.b32 %r27524, %r27523, %r27518; + shf.l.wrap.b32 %r27525, %r27524, %r27524, 24; + add.s32 %r27526, %r27525, %r27519; + xor.b32 %r27527, %r27526, %r27521; + shf.l.wrap.b32 %r27528, %r27527, %r27527, 25; + add.s32 %r27529, %r27481, %r26890; + add.s32 %r27530, %r27529, %r27500; + xor.b32 %r27531, %r27530, %r27469; + shf.l.wrap.b32 %r27532, %r27531, %r27531, 16; + add.s32 %r27533, %r27532, %r27512; + xor.b32 %r27534, %r27533, %r27500; + shf.l.wrap.b32 %r27535, %r27534, %r27534, 20; + add.s32 %r27536, %r27530, %r26868; + add.s32 %r27537, %r27536, %r27535; + xor.b32 %r27538, %r27537, %r27532; + shf.l.wrap.b32 %r27539, %r27538, %r27538, 24; + add.s32 %r27540, %r27539, %r27533; + xor.b32 %r27541, %r27540, %r27535; + shf.l.wrap.b32 %r27542, %r27541, %r27541, 25; + add.s32 %r27543, %r27495, %r26835; + add.s32 %r27544, %r27543, %r27514; + xor.b32 %r27545, %r27544, %r27483; + shf.l.wrap.b32 %r27546, %r27545, %r27545, 16; + add.s32 %r27547, %r27546, %r27470; + xor.b32 %r27548, %r27547, %r27514; + shf.l.wrap.b32 %r27549, %r27548, %r27548, 20; + add.s32 %r27550, %r27544, %r26846; + add.s32 %r27551, %r27550, %r27549; + xor.b32 %r27552, %r27551, %r27546; + shf.l.wrap.b32 %r27553, %r27552, %r27552, 24; + add.s32 %r27554, %r27553, %r27547; + xor.b32 %r27555, %r27554, %r27549; + shf.l.wrap.b32 %r27556, %r27555, %r27555, 25; + add.s32 %r27557, %r27509, %r26901; + add.s32 %r27558, %r27557, %r27472; + xor.b32 %r27559, %r27558, %r27497; + shf.l.wrap.b32 %r27560, %r27559, %r27559, 16; + add.s32 %r27561, %r27560, %r27484; + xor.b32 %r27562, %r27561, %r27472; + shf.l.wrap.b32 %r27563, %r27562, %r27562, 20; + add.s32 %r27564, %r27558, %r26879; + add.s32 %r27565, %r27564, %r27563; + xor.b32 %r27566, %r27565, %r27560; + shf.l.wrap.b32 %r27567, %r27566, %r27566, 24; + add.s32 %r27568, %r27567, %r27561; + xor.b32 %r27569, %r27568, %r27563; + shf.l.wrap.b32 %r27570, %r27569, %r27569, 25; + add.s32 %r27571, %r27523, %r26934; + add.s32 %r27572, %r27571, %r27570; + xor.b32 %r27573, %r27572, %r27539; + shf.l.wrap.b32 %r27574, %r27573, %r27573, 16; + add.s32 %r27575, %r27574, %r27554; + xor.b32 %r27576, %r27575, %r27570; + shf.l.wrap.b32 %r27577, %r27576, %r27576, 20; + add.s32 %r27578, %r27572, %r26989; + add.s32 %r27579, %r27578, %r27577; + xor.b32 %r27580, %r27579, %r27574; + shf.l.wrap.b32 %r27581, %r27580, %r27580, 24; + add.s32 %r27582, %r27581, %r27575; + xor.b32 %r27583, %r27582, %r27577; + shf.l.wrap.b32 %r27584, %r27583, %r27583, 25; + add.s32 %r27585, %r27537, %r26956; + add.s32 %r27586, %r27585, %r27528; + xor.b32 %r27587, %r27586, %r27553; + shf.l.wrap.b32 %r27588, %r27587, %r27587, 16; + add.s32 %r27589, %r27588, %r27568; + xor.b32 %r27590, %r27589, %r27528; + shf.l.wrap.b32 %r27591, %r27590, %r27590, 20; + add.s32 %r27592, %r27586, %r26890; + add.s32 %r27593, %r27592, %r27591; + xor.b32 %r27594, %r27593, %r27588; + shf.l.wrap.b32 %r27595, %r27594, %r27594, 24; + add.s32 %r27596, %r27595, %r27589; + xor.b32 %r27597, %r27596, %r27591; + shf.l.wrap.b32 %r27598, %r27597, %r27597, 25; + add.s32 %r27599, %r27551, %r26923; + add.s32 %r27600, %r27599, %r27542; + xor.b32 
%r27601, %r27600, %r27567; + shf.l.wrap.b32 %r27602, %r27601, %r27601, 16; + add.s32 %r27603, %r27602, %r27526; + xor.b32 %r27604, %r27603, %r27542; + shf.l.wrap.b32 %r27605, %r27604, %r27604, 20; + add.s32 %r27606, %r27600, %r26967; + add.s32 %r27607, %r27606, %r27605; + xor.b32 %r27608, %r27607, %r27602; + shf.l.wrap.b32 %r27609, %r27608, %r27608, 24; + add.s32 %r27610, %r27609, %r27603; + xor.b32 %r27611, %r27610, %r27605; + shf.l.wrap.b32 %r27612, %r27611, %r27611, 25; + add.s32 %r27613, %r27565, %r27000; + add.s32 %r27614, %r27613, %r27556; + xor.b32 %r27615, %r27614, %r27525; + shf.l.wrap.b32 %r27616, %r27615, %r27615, 16; + add.s32 %r27617, %r27616, %r27540; + xor.b32 %r27618, %r27617, %r27556; + shf.l.wrap.b32 %r27619, %r27618, %r27618, 20; + add.s32 %r27620, %r27614, %r26846; + add.s32 %r27621, %r27620, %r27619; + xor.b32 %r27622, %r27621, %r27616; + shf.l.wrap.b32 %r27623, %r27622, %r27622, 24; + add.s32 %r27624, %r27623, %r27617; + xor.b32 %r27625, %r27624, %r27619; + shf.l.wrap.b32 %r27626, %r27625, %r27625, 25; + add.s32 %r27627, %r27579, %r26978; + add.s32 %r27628, %r27627, %r27598; + xor.b32 %r27629, %r27628, %r27623; + shf.l.wrap.b32 %r27630, %r27629, %r27629, 16; + add.s32 %r27631, %r27630, %r27610; + xor.b32 %r27632, %r27631, %r27598; + shf.l.wrap.b32 %r27633, %r27632, %r27632, 20; + add.s32 %r27634, %r27628, %r26868; + add.s32 %r27635, %r27634, %r27633; + xor.b32 %r27636, %r27635, %r27630; + shf.l.wrap.b32 %r27637, %r27636, %r27636, 24; + add.s32 %r27638, %r27637, %r27631; + xor.b32 %r27639, %r27638, %r27633; + shf.l.wrap.b32 %r27640, %r27639, %r27639, 25; + add.s32 %r27641, %r27593, %r26835; + add.s32 %r27642, %r27641, %r27612; + xor.b32 %r27643, %r27642, %r27581; + shf.l.wrap.b32 %r27644, %r27643, %r27643, 16; + add.s32 %r27645, %r27644, %r27624; + xor.b32 %r27646, %r27645, %r27612; + shf.l.wrap.b32 %r27647, %r27646, %r27646, 20; + add.s32 %r27648, %r27642, %r26945; + add.s32 %r27649, %r27648, %r27647; + xor.b32 %r27650, %r27649, %r27644; + shf.l.wrap.b32 %r27651, %r27650, %r27650, 24; + add.s32 %r27652, %r27651, %r27645; + xor.b32 %r27653, %r27652, %r27647; + shf.l.wrap.b32 %r27654, %r27653, %r27653, 25; + add.s32 %r27655, %r27607, %r26857; + add.s32 %r27656, %r27655, %r27626; + xor.b32 %r27657, %r27656, %r27595; + shf.l.wrap.b32 %r27658, %r27657, %r27657, 16; + add.s32 %r27659, %r27658, %r27582; + xor.b32 %r27660, %r27659, %r27626; + shf.l.wrap.b32 %r27661, %r27660, %r27660, 20; + add.s32 %r27662, %r27656, %r26901; + add.s32 %r27663, %r27662, %r27661; + xor.b32 %r27664, %r27663, %r27658; + shf.l.wrap.b32 %r27665, %r27664, %r27664, 24; + add.s32 %r27666, %r27665, %r27659; + xor.b32 %r27667, %r27666, %r27661; + shf.l.wrap.b32 %r27668, %r27667, %r27667, 25; + add.s32 %r27669, %r27621, %r26879; + add.s32 %r27670, %r27669, %r27584; + xor.b32 %r27671, %r27670, %r27609; + shf.l.wrap.b32 %r27672, %r27671, %r27671, 16; + add.s32 %r27673, %r27672, %r27596; + xor.b32 %r27674, %r27673, %r27584; + shf.l.wrap.b32 %r27675, %r27674, %r27674, 20; + add.s32 %r27676, %r27670, %r26912; + add.s32 %r27677, %r27676, %r27675; + xor.b32 %r27678, %r27677, %r27672; + shf.l.wrap.b32 %r27679, %r27678, %r27678, 24; + add.s32 %r27680, %r27679, %r27673; + xor.b32 %r27681, %r27680, %r27675; + shf.l.wrap.b32 %r27682, %r27681, %r27681, 25; + add.s32 %r27683, %r27635, %r26956; + add.s32 %r27684, %r27683, %r27682; + xor.b32 %r27685, %r27684, %r27651; + shf.l.wrap.b32 %r27686, %r27685, %r27685, 16; + add.s32 %r27687, %r27686, %r27666; + xor.b32 %r27688, %r27687, %r27682; + shf.l.wrap.b32 %r27689, 
%r27688, %r27688, 20; + add.s32 %r27690, %r27684, %r27000; + add.s32 %r27691, %r27690, %r27689; + xor.b32 %r27692, %r27691, %r27686; + shf.l.wrap.b32 %r27693, %r27692, %r27692, 24; + add.s32 %r27694, %r27693, %r27687; + xor.b32 %r27695, %r27694, %r27689; + shf.l.wrap.b32 %r27696, %r27695, %r27695, 25; + add.s32 %r27697, %r27649, %r26890; + add.s32 %r27698, %r27697, %r27640; + xor.b32 %r27699, %r27698, %r27665; + shf.l.wrap.b32 %r27700, %r27699, %r27699, 16; + add.s32 %r27701, %r27700, %r27680; + xor.b32 %r27702, %r27701, %r27640; + shf.l.wrap.b32 %r27703, %r27702, %r27702, 20; + add.s32 %r27704, %r27698, %r26835; + add.s32 %r27705, %r27704, %r27703; + xor.b32 %r27706, %r27705, %r27700; + shf.l.wrap.b32 %r27707, %r27706, %r27706, 24; + add.s32 %r27708, %r27707, %r27701; + xor.b32 %r27709, %r27708, %r27703; + shf.l.wrap.b32 %r27710, %r27709, %r27709, 25; + add.s32 %r27711, %r27663, %r26846; + add.s32 %r27712, %r27711, %r27654; + xor.b32 %r27713, %r27712, %r27679; + shf.l.wrap.b32 %r27714, %r27713, %r27713, 16; + add.s32 %r27715, %r27714, %r27638; + xor.b32 %r27716, %r27715, %r27654; + shf.l.wrap.b32 %r27717, %r27716, %r27716, 20; + add.s32 %r27718, %r27712, %r26934; + add.s32 %r27719, %r27718, %r27717; + xor.b32 %r27720, %r27719, %r27714; + shf.l.wrap.b32 %r27721, %r27720, %r27720, 24; + add.s32 %r27722, %r27721, %r27715; + xor.b32 %r27723, %r27722, %r27717; + shf.l.wrap.b32 %r27724, %r27723, %r27723, 25; + add.s32 %r27725, %r27677, %r26923; + add.s32 %r27726, %r27725, %r27668; + xor.b32 %r27727, %r27726, %r27637; + shf.l.wrap.b32 %r27728, %r27727, %r27727, 16; + add.s32 %r27729, %r27728, %r27652; + xor.b32 %r27730, %r27729, %r27668; + shf.l.wrap.b32 %r27731, %r27730, %r27730, 20; + add.s32 %r27732, %r27726, %r26901; + add.s32 %r27733, %r27732, %r27731; + xor.b32 %r27734, %r27733, %r27728; + shf.l.wrap.b32 %r27735, %r27734, %r27734, 24; + add.s32 %r27736, %r27735, %r27729; + xor.b32 %r27737, %r27736, %r27731; + shf.l.wrap.b32 %r27738, %r27737, %r27737, 25; + add.s32 %r27739, %r27691, %r26989; + add.s32 %r27740, %r27739, %r27710; + xor.b32 %r27741, %r27740, %r27735; + shf.l.wrap.b32 %r27742, %r27741, %r27741, 16; + add.s32 %r27743, %r27742, %r27722; + xor.b32 %r27744, %r27743, %r27710; + shf.l.wrap.b32 %r27745, %r27744, %r27744, 20; + add.s32 %r27746, %r27740, %r26945; + add.s32 %r27747, %r27746, %r27745; + xor.b32 %r27748, %r27747, %r27742; + shr.u32 %r27749, %r27748, 8; + shf.l.wrap.b32 %r27750, %r27748, %r27748, 24; + add.s32 %r27751, %r27750, %r27743; + xor.b32 %r27752, %r27751, %r27745; + shr.u32 %r27753, %r27752, 7; + shf.l.wrap.b32 %r27754, %r27752, %r27752, 25; + add.s32 %r27755, %r27705, %r26857; + add.s32 %r27756, %r27755, %r27724; + xor.b32 %r27757, %r27756, %r27693; + shf.l.wrap.b32 %r27758, %r27757, %r27757, 16; + add.s32 %r27759, %r27758, %r27736; + xor.b32 %r27760, %r27759, %r27724; + shf.l.wrap.b32 %r27761, %r27760, %r27760, 20; + add.s32 %r27762, %r27756, %r26967; + add.s32 %r27763, %r27762, %r27761; + xor.b32 %r27764, %r27763, %r27758; + shr.u32 %r27765, %r27764, 8; + shf.l.wrap.b32 %r27766, %r27764, %r27764, 24; + add.s32 %r27767, %r27766, %r27759; + xor.b32 %r27768, %r27767, %r27761; + shr.u32 %r27769, %r27768, 7; + shf.l.wrap.b32 %r27770, %r27768, %r27768, 25; + add.s32 %r27771, %r27719, %r26868; + add.s32 %r27772, %r27771, %r27738; + xor.b32 %r27773, %r27772, %r27707; + shf.l.wrap.b32 %r27774, %r27773, %r27773, 16; + add.s32 %r27775, %r27774, %r27694; + xor.b32 %r27776, %r27775, %r27738; + shf.l.wrap.b32 %r27777, %r27776, %r27776, 20; + add.s32 %r27778, %r27772, %r26879; 
+ add.s32 %r27779, %r27778, %r27777; + xor.b32 %r27780, %r27779, %r27774; + shr.u32 %r27781, %r27780, 8; + shf.l.wrap.b32 %r27782, %r27780, %r27780, 24; + add.s32 %r27783, %r27782, %r27775; + xor.b32 %r27784, %r27783, %r27777; + shr.u32 %r27785, %r27784, 7; + shf.l.wrap.b32 %r27786, %r27784, %r27784, 25; + add.s32 %r27787, %r27733, %r26912; + add.s32 %r27788, %r27787, %r27696; + xor.b32 %r27789, %r27788, %r27721; + shf.l.wrap.b32 %r27790, %r27789, %r27789, 16; + add.s32 %r27791, %r27790, %r27708; + xor.b32 %r27792, %r27791, %r27696; + shf.l.wrap.b32 %r27793, %r27792, %r27792, 20; + add.s32 %r27794, %r27788, %r26978; + add.s32 %r27795, %r27794, %r27793; + xor.b32 %r27796, %r27795, %r27790; + shr.u32 %r27797, %r27796, 8; + shf.l.wrap.b32 %r27798, %r27796, %r27796, 24; + add.s32 %r27799, %r27798, %r27791; + xor.b32 %r27800, %r27799, %r27793; + shr.u32 %r27801, %r27800, 7; + shf.l.wrap.b32 %r27802, %r27800, %r27800, 25; + xor.b32 %r27803, %r27783, %r27747; + xor.b32 %r27804, %r27799, %r27763; + xor.b32 %r27805, %r27751, %r27779; + xor.b32 %r27806, %r27767, %r27795; + xor.b32 %r27807, %r27802, %r27766; + xor.b32 %r27808, %r27754, %r27782; + xor.b32 %r27809, %r27770, %r27798; + xor.b32 %r27810, %r27786, %r27750; + cvt.u16.u32 %rs551, %r27783; + cvt.u16.u32 %rs552, %r27747; + xor.b16 %rs830, %rs551, %rs552; + shr.u32 %r27811, %r27803, 8; + cvt.u16.u32 %rs831, %r27811; + shr.u32 %r27812, %r27803, 16; + cvt.u16.u32 %rs832, %r27812; + shr.u32 %r27813, %r27803, 24; + cvt.u16.u32 %rs833, %r27813; + cvt.u16.u32 %rs553, %r27799; + cvt.u16.u32 %rs554, %r27763; + xor.b16 %rs834, %rs553, %rs554; + shr.u32 %r27814, %r27804, 8; + cvt.u16.u32 %rs835, %r27814; + shr.u32 %r27815, %r27804, 16; + cvt.u16.u32 %rs836, %r27815; + shr.u32 %r27816, %r27804, 24; + cvt.u16.u32 %rs837, %r27816; + cvt.u16.u32 %rs555, %r27779; + cvt.u16.u32 %rs556, %r27751; + xor.b16 %rs838, %rs556, %rs555; + shr.u32 %r27817, %r27805, 8; + cvt.u16.u32 %rs839, %r27817; + shr.u32 %r27818, %r27805, 16; + cvt.u16.u32 %rs840, %r27818; + shr.u32 %r27819, %r27805, 24; + cvt.u16.u32 %rs841, %r27819; + cvt.u16.u32 %rs557, %r27767; + cvt.u16.u32 %rs558, %r27795; + xor.b16 %rs842, %rs557, %rs558; + shr.u32 %r27820, %r27806, 8; + cvt.u16.u32 %rs843, %r27820; + shr.u32 %r27821, %r27806, 16; + cvt.u16.u32 %rs844, %r27821; + shr.u32 %r27822, %r27806, 24; + cvt.u16.u32 %rs845, %r27822; + cvt.u16.u32 %rs559, %r27801; + cvt.u16.u32 %rs560, %r27765; + xor.b16 %rs846, %rs559, %rs560; + shr.u32 %r27823, %r27807, 8; + cvt.u16.u32 %rs847, %r27823; + shr.u32 %r27824, %r27807, 16; + cvt.u16.u32 %rs848, %r27824; + shr.u32 %r27825, %r27807, 24; + cvt.u16.u32 %rs849, %r27825; + cvt.u16.u32 %rs561, %r27781; + cvt.u16.u32 %rs562, %r27753; + xor.b16 %rs850, %rs562, %rs561; + shr.u32 %r27826, %r27808, 8; + cvt.u16.u32 %rs851, %r27826; + shr.u32 %r27827, %r27808, 16; + cvt.u16.u32 %rs852, %r27827; + shr.u32 %r27828, %r27808, 24; + cvt.u16.u32 %rs853, %r27828; + cvt.u16.u32 %rs563, %r27797; + cvt.u16.u32 %rs564, %r27769; + xor.b16 %rs854, %rs564, %rs563; + shr.u32 %r27829, %r27809, 8; + cvt.u16.u32 %rs855, %r27829; + shr.u32 %r27830, %r27809, 16; + cvt.u16.u32 %rs856, %r27830; + shr.u32 %r27831, %r27809, 24; + cvt.u16.u32 %rs857, %r27831; + cvt.u16.u32 %rs565, %r27749; + cvt.u16.u32 %rs566, %r27785; + xor.b16 %rs858, %rs566, %rs565; + shr.u32 %r27832, %r27810, 8; + cvt.u16.u32 %rs859, %r27832; + shr.u32 %r27833, %r27810, 16; + cvt.u16.u32 %rs860, %r27833; + shr.u32 %r27834, %r27810, 24; + cvt.u16.u32 %rs861, %r27834; + setp.ne.s64 %p53, %rd1367, 0; + mov.u16 %rs862, 64; + 
mov.u16 %rs732, %rs863; + mov.u16 %rs766, %rs829; + mov.u16 %rs767, %rs828; + mov.u16 %rs768, %rs827; + mov.u16 %rs769, %rs826; + mov.u16 %rs770, %rs825; + mov.u16 %rs771, %rs824; + mov.u16 %rs772, %rs823; + mov.u16 %rs773, %rs822; + mov.u16 %rs774, %rs821; + mov.u16 %rs775, %rs820; + mov.u16 %rs776, %rs819; + mov.u16 %rs777, %rs818; + mov.u16 %rs778, %rs817; + mov.u16 %rs779, %rs816; + mov.u16 %rs780, %rs815; + mov.u16 %rs781, %rs814; + mov.u16 %rs782, %rs813; + mov.u16 %rs783, %rs812; + mov.u16 %rs784, %rs811; + mov.u16 %rs785, %rs810; + mov.u16 %rs786, %rs809; + mov.u16 %rs787, %rs808; + mov.u16 %rs788, %rs807; + mov.u16 %rs789, %rs806; + mov.u16 %rs790, %rs805; + mov.u16 %rs791, %rs804; + mov.u16 %rs792, %rs803; + mov.u16 %rs793, %rs802; + mov.u16 %rs794, %rs801; + mov.u16 %rs795, %rs800; + mov.u16 %rs796, %rs799; + mov.u16 %rs797, %rs798; + mov.u64 %rd1368, %rd1159; + mov.u32 %r31250, %r31265; + mov.u32 %r31251, %r31264; + mov.u32 %r31252, %r31263; + mov.u32 %r31253, %r31262; + mov.u32 %r31254, %r31261; + mov.u32 %r31255, %r31260; + mov.u32 %r31256, %r31259; + mov.u32 %r31257, %r31258; + @%p53 bra $L__BB2_93; + +$L__BB2_94: + cvt.u32.u16 %r27835, %rs798; + and.b32 %r27836, %r27835, 255; + cvt.u32.u16 %r27837, %rs799; + prmt.b32 %r27838, %r27837, %r27836, 30212; + cvt.u32.u16 %r27839, %rs800; + shl.b32 %r27840, %r27839, 16; + and.b32 %r27841, %r27840, 16711680; + or.b32 %r27842, %r27838, %r27841; + cvt.u32.u16 %r27843, %rs801; + shl.b32 %r27844, %r27843, 24; + or.b32 %r27845, %r27842, %r27844; + cvt.u32.u16 %r27846, %rs802; + and.b32 %r27847, %r27846, 255; + cvt.u32.u16 %r27848, %rs803; + prmt.b32 %r27849, %r27848, %r27847, 30212; + cvt.u32.u16 %r27850, %rs804; + shl.b32 %r27851, %r27850, 16; + and.b32 %r27852, %r27851, 16711680; + or.b32 %r27853, %r27849, %r27852; + cvt.u32.u16 %r27854, %rs805; + shl.b32 %r27855, %r27854, 24; + or.b32 %r27856, %r27853, %r27855; + cvt.u32.u16 %r27857, %rs806; + and.b32 %r27858, %r27857, 255; + cvt.u32.u16 %r27859, %rs807; + prmt.b32 %r27860, %r27859, %r27858, 30212; + cvt.u32.u16 %r27861, %rs808; + shl.b32 %r27862, %r27861, 16; + and.b32 %r27863, %r27862, 16711680; + or.b32 %r27864, %r27860, %r27863; + cvt.u32.u16 %r27865, %rs809; + shl.b32 %r27866, %r27865, 24; + or.b32 %r27867, %r27864, %r27866; + cvt.u32.u16 %r27868, %rs810; + and.b32 %r27869, %r27868, 255; + cvt.u32.u16 %r27870, %rs811; + prmt.b32 %r27871, %r27870, %r27869, 30212; + cvt.u32.u16 %r27872, %rs812; + shl.b32 %r27873, %r27872, 16; + and.b32 %r27874, %r27873, 16711680; + or.b32 %r27875, %r27871, %r27874; + cvt.u32.u16 %r27876, %rs813; + shl.b32 %r27877, %r27876, 24; + or.b32 %r27878, %r27875, %r27877; + cvt.u32.u16 %r27879, %rs814; + and.b32 %r27880, %r27879, 255; + cvt.u32.u16 %r27881, %rs815; + prmt.b32 %r27882, %r27881, %r27880, 30212; + cvt.u32.u16 %r27883, %rs816; + shl.b32 %r27884, %r27883, 16; + and.b32 %r27885, %r27884, 16711680; + or.b32 %r27886, %r27882, %r27885; + cvt.u32.u16 %r27887, %rs817; + shl.b32 %r27888, %r27887, 24; + or.b32 %r27889, %r27886, %r27888; + cvt.u32.u16 %r27890, %rs818; + and.b32 %r27891, %r27890, 255; + cvt.u32.u16 %r27892, %rs819; + prmt.b32 %r27893, %r27892, %r27891, 30212; + cvt.u32.u16 %r27894, %rs820; + shl.b32 %r27895, %r27894, 16; + and.b32 %r27896, %r27895, 16711680; + or.b32 %r27897, %r27893, %r27896; + cvt.u32.u16 %r27898, %rs821; + shl.b32 %r27899, %r27898, 24; + or.b32 %r27900, %r27897, %r27899; + cvt.u32.u16 %r27901, %rs822; + and.b32 %r27902, %r27901, 255; + cvt.u32.u16 %r27903, %rs823; + prmt.b32 %r27904, %r27903, %r27902, 30212; + 
cvt.u32.u16 %r27905, %rs824; + shl.b32 %r27906, %r27905, 16; + and.b32 %r27907, %r27906, 16711680; + or.b32 %r27908, %r27904, %r27907; + cvt.u32.u16 %r27909, %rs825; + shl.b32 %r27910, %r27909, 24; + or.b32 %r27911, %r27908, %r27910; + cvt.u32.u16 %r27912, %rs826; + and.b32 %r27913, %r27912, 255; + cvt.u32.u16 %r27914, %rs827; + prmt.b32 %r27915, %r27914, %r27913, 30212; + cvt.u32.u16 %r27916, %rs828; + shl.b32 %r27917, %r27916, 16; + and.b32 %r27918, %r27917, 16711680; + or.b32 %r27919, %r27915, %r27918; + cvt.u32.u16 %r27920, %rs829; + shl.b32 %r27921, %r27920, 24; + or.b32 %r27922, %r27919, %r27921; + cvt.u32.u16 %r27923, %rs830; + and.b32 %r27924, %r27923, 255; + cvt.u32.u16 %r27925, %rs831; + prmt.b32 %r27926, %r27925, %r27924, 30212; + cvt.u32.u16 %r27927, %rs832; + shl.b32 %r27928, %r27927, 16; + and.b32 %r27929, %r27928, 16711680; + or.b32 %r27930, %r27926, %r27929; + cvt.u32.u16 %r27931, %rs833; + shl.b32 %r27932, %r27931, 24; + or.b32 %r27933, %r27930, %r27932; + cvt.u32.u16 %r27934, %rs834; + and.b32 %r27935, %r27934, 255; + cvt.u32.u16 %r27936, %rs835; + prmt.b32 %r27937, %r27936, %r27935, 30212; + cvt.u32.u16 %r27938, %rs836; + shl.b32 %r27939, %r27938, 16; + and.b32 %r27940, %r27939, 16711680; + or.b32 %r27941, %r27937, %r27940; + cvt.u32.u16 %r27942, %rs837; + shl.b32 %r27943, %r27942, 24; + or.b32 %r27944, %r27941, %r27943; + cvt.u32.u16 %r27945, %rs838; + and.b32 %r27946, %r27945, 255; + cvt.u32.u16 %r27947, %rs839; + prmt.b32 %r27948, %r27947, %r27946, 30212; + cvt.u32.u16 %r27949, %rs840; + shl.b32 %r27950, %r27949, 16; + and.b32 %r27951, %r27950, 16711680; + or.b32 %r27952, %r27948, %r27951; + cvt.u32.u16 %r27953, %rs841; + shl.b32 %r27954, %r27953, 24; + or.b32 %r27955, %r27952, %r27954; + cvt.u32.u16 %r27956, %rs842; + and.b32 %r27957, %r27956, 255; + cvt.u32.u16 %r27958, %rs843; + prmt.b32 %r27959, %r27958, %r27957, 30212; + cvt.u32.u16 %r27960, %rs844; + shl.b32 %r27961, %r27960, 16; + and.b32 %r27962, %r27961, 16711680; + or.b32 %r27963, %r27959, %r27962; + cvt.u32.u16 %r27964, %rs845; + shl.b32 %r27965, %r27964, 24; + or.b32 %r27966, %r27963, %r27965; + cvt.u32.u16 %r27967, %rs846; + and.b32 %r27968, %r27967, 255; + cvt.u32.u16 %r27969, %rs847; + prmt.b32 %r27970, %r27969, %r27968, 30212; + cvt.u32.u16 %r27971, %rs848; + shl.b32 %r27972, %r27971, 16; + and.b32 %r27973, %r27972, 16711680; + or.b32 %r27974, %r27970, %r27973; + cvt.u32.u16 %r27975, %rs849; + shl.b32 %r27976, %r27975, 24; + or.b32 %r27977, %r27974, %r27976; + cvt.u32.u16 %r27978, %rs850; + and.b32 %r27979, %r27978, 255; + cvt.u32.u16 %r27980, %rs851; + prmt.b32 %r27981, %r27980, %r27979, 30212; + cvt.u32.u16 %r27982, %rs852; + shl.b32 %r27983, %r27982, 16; + and.b32 %r27984, %r27983, 16711680; + or.b32 %r27985, %r27981, %r27984; + cvt.u32.u16 %r27986, %rs853; + shl.b32 %r27987, %r27986, 24; + or.b32 %r27988, %r27985, %r27987; + cvt.u32.u16 %r27989, %rs854; + and.b32 %r27990, %r27989, 255; + cvt.u32.u16 %r27991, %rs855; + prmt.b32 %r27992, %r27991, %r27990, 30212; + cvt.u32.u16 %r27993, %rs856; + shl.b32 %r27994, %r27993, 16; + and.b32 %r27995, %r27994, 16711680; + or.b32 %r27996, %r27992, %r27995; + cvt.u32.u16 %r27997, %rs857; + shl.b32 %r27998, %r27997, 24; + or.b32 %r27999, %r27996, %r27998; + cvt.u32.u16 %r28000, %rs858; + and.b32 %r28001, %r28000, 255; + cvt.u32.u16 %r28002, %rs859; + prmt.b32 %r28003, %r28002, %r28001, 30212; + cvt.u32.u16 %r28004, %rs860; + shl.b32 %r28005, %r28004, 16; + and.b32 %r28006, %r28005, 16711680; + or.b32 %r28007, %r28003, %r28006; + cvt.u32.u16 %r28008, %rs861; + 
shl.b32 %r28009, %r28008, 24; + or.b32 %r28010, %r28007, %r28009; + or.b16 %rs567, %rs863, 8; + cvt.u32.u16 %r28011, %rs567; + and.b32 %r28012, %r28011, 255; + add.s32 %r28013, %r31262, %r31258; + add.s32 %r28014, %r28013, %r27845; + add.s32 %r28015, %r27856, %r28014; + add.s32 %r28016, %r31263, %r31259; + add.s32 %r28017, %r28016, %r27867; + add.s32 %r28018, %r27878, %r28017; + add.s32 %r28019, %r31264, %r31260; + add.s32 %r28020, %r28019, %r27889; + cvt.u32.u16 %r28021, %rs862; + and.b32 %r28022, %r28021, 255; + xor.b32 %r28023, %r28020, %r28022; + shr.u32 %r28024, %r28020, 16; + shl.b32 %r28025, %r28023, 16; + or.b32 %r28026, %r28025, %r28024; + add.s32 %r28027, %r28026, 1013904242; + xor.b32 %r28028, %r28027, %r31264; + shf.l.wrap.b32 %r28029, %r28028, %r28028, 20; + add.s32 %r28030, %r27900, %r28020; + add.s32 %r28031, %r28030, %r28029; + xor.b32 %r28032, %r28031, %r28026; + shf.l.wrap.b32 %r28033, %r28032, %r28032, 24; + add.s32 %r28034, %r28033, %r28027; + xor.b32 %r28035, %r28034, %r28029; + shf.l.wrap.b32 %r28036, %r28035, %r28035, 25; + add.s32 %r28037, %r31265, %r31261; + add.s32 %r28038, %r28037, %r27911; + xor.b32 %r28039, %r28038, %r28012; + shr.u32 %r28040, %r28038, 16; + shl.b32 %r28041, %r28039, 16; + or.b32 %r28042, %r28041, %r28040; + add.s32 %r28043, %r28042, -1521486534; + xor.b32 %r28044, %r28043, %r31265; + shf.l.wrap.b32 %r28045, %r28044, %r28044, 20; + add.s32 %r28046, %r27922, %r28038; + add.s32 %r28047, %r28046, %r28045; + xor.b32 %r28048, %r28047, %r28042; + shf.l.wrap.b32 %r28049, %r28048, %r28048, 24; + add.s32 %r28050, %r28049, %r28043; + xor.b32 %r28051, %r28050, %r28045; + shf.l.wrap.b32 %r28052, %r28051, %r28051, 25; + add.s32 %r28053, %r28036, %r27955; + add.s32 %r28054, %r28031, %r27977; + add.s32 %r28055, %r28054, %r28052; + add.s32 %r28056, %r28055, %r27988; + add.s32 %r28057, %r28047, %r27999; + shf.l.wrap.b32 %r28058, %r28014, %r28014, 16; + add.s32 %r28059, %r28058, 1779033703; + xor.b32 %r28060, %r28059, %r31262; + shf.l.wrap.b32 %r28061, %r28060, %r28060, 20; + add.s32 %r28062, %r28015, %r28061; + xor.b32 %r28063, %r28062, %r28058; + shf.l.wrap.b32 %r28064, %r28063, %r28063, 24; + add.s32 %r28065, %r28064, %r28059; + xor.b32 %r28066, %r28065, %r28061; + shf.l.wrap.b32 %r28067, %r28066, %r28066, 25; + shf.l.wrap.b32 %r28068, %r28017, %r28017, 16; + add.s32 %r28069, %r28068, -1150833019; + xor.b32 %r28070, %r28069, %r31263; + shf.l.wrap.b32 %r28071, %r28070, %r28070, 20; + add.s32 %r28072, %r28018, %r28071; + xor.b32 %r28073, %r28072, %r28068; + shf.l.wrap.b32 %r28074, %r28073, %r28073, 24; + add.s32 %r28075, %r28074, %r28069; + xor.b32 %r28076, %r28075, %r28071; + shf.l.wrap.b32 %r28077, %r28076, %r28076, 25; + add.s32 %r28078, %r28062, %r27933; + add.s32 %r28079, %r28078, %r28077; + xor.b32 %r28080, %r28079, %r28049; + shf.l.wrap.b32 %r28081, %r28080, %r28080, 16; + add.s32 %r28082, %r28081, %r28034; + xor.b32 %r28083, %r28082, %r28077; + shf.l.wrap.b32 %r28084, %r28083, %r28083, 20; + add.s32 %r28085, %r28079, %r27944; + add.s32 %r28086, %r28085, %r28084; + xor.b32 %r28087, %r28086, %r28081; + shf.l.wrap.b32 %r28088, %r28087, %r28087, 24; + add.s32 %r28089, %r28088, %r28082; + xor.b32 %r28090, %r28089, %r28084; + shf.l.wrap.b32 %r28091, %r28090, %r28090, 25; + add.s32 %r28092, %r28053, %r28072; + xor.b32 %r28093, %r28064, %r28092; + shf.l.wrap.b32 %r28094, %r28093, %r28093, 16; + add.s32 %r28095, %r28094, %r28050; + xor.b32 %r28096, %r28095, %r28036; + shf.l.wrap.b32 %r28097, %r28096, %r28096, 20; + add.s32 %r28098, %r28092, %r27966; + add.s32 
%r28099, %r28098, %r28097; + xor.b32 %r28100, %r28099, %r28094; + shf.l.wrap.b32 %r28101, %r28100, %r28100, 24; + add.s32 %r28102, %r28101, %r28095; + xor.b32 %r28103, %r28102, %r28097; + shf.l.wrap.b32 %r28104, %r28103, %r28103, 25; + xor.b32 %r28105, %r28074, %r28055; + shf.l.wrap.b32 %r28106, %r28105, %r28105, 16; + add.s32 %r28107, %r28106, %r28065; + xor.b32 %r28108, %r28107, %r28052; + shf.l.wrap.b32 %r28109, %r28108, %r28108, 20; + add.s32 %r28110, %r28056, %r28109; + xor.b32 %r28111, %r28110, %r28106; + shf.l.wrap.b32 %r28112, %r28111, %r28111, 24; + add.s32 %r28113, %r28112, %r28107; + xor.b32 %r28114, %r28113, %r28109; + shf.l.wrap.b32 %r28115, %r28114, %r28114, 25; + add.s32 %r28116, %r28057, %r28067; + xor.b32 %r28117, %r28116, %r28033; + shf.l.wrap.b32 %r28118, %r28117, %r28117, 16; + add.s32 %r28119, %r28118, %r28075; + xor.b32 %r28120, %r28119, %r28067; + shf.l.wrap.b32 %r28121, %r28120, %r28120, 20; + add.s32 %r28122, %r28116, %r28010; + add.s32 %r28123, %r28122, %r28121; + xor.b32 %r28124, %r28123, %r28118; + shf.l.wrap.b32 %r28125, %r28124, %r28124, 24; + add.s32 %r28126, %r28125, %r28119; + xor.b32 %r28127, %r28126, %r28121; + shf.l.wrap.b32 %r28128, %r28127, %r28127, 25; + add.s32 %r28129, %r28086, %r27867; + add.s32 %r28130, %r28129, %r28128; + xor.b32 %r28131, %r28130, %r28101; + shf.l.wrap.b32 %r28132, %r28131, %r28131, 16; + add.s32 %r28133, %r28132, %r28113; + xor.b32 %r28134, %r28133, %r28128; + shf.l.wrap.b32 %r28135, %r28134, %r28134, 20; + add.s32 %r28136, %r28130, %r27911; + add.s32 %r28137, %r28136, %r28135; + xor.b32 %r28138, %r28137, %r28132; + shf.l.wrap.b32 %r28139, %r28138, %r28138, 24; + add.s32 %r28140, %r28139, %r28133; + xor.b32 %r28141, %r28140, %r28135; + shf.l.wrap.b32 %r28142, %r28141, %r28141, 25; + add.s32 %r28143, %r28099, %r27878; + add.s32 %r28144, %r28143, %r28091; + xor.b32 %r28145, %r28144, %r28112; + shf.l.wrap.b32 %r28146, %r28145, %r28145, 16; + add.s32 %r28147, %r28146, %r28126; + xor.b32 %r28148, %r28147, %r28091; + shf.l.wrap.b32 %r28149, %r28148, %r28148, 20; + add.s32 %r28150, %r28144, %r27955; + add.s32 %r28151, %r28150, %r28149; + xor.b32 %r28152, %r28151, %r28146; + shf.l.wrap.b32 %r28153, %r28152, %r28152, 24; + add.s32 %r28154, %r28153, %r28147; + xor.b32 %r28155, %r28154, %r28149; + shf.l.wrap.b32 %r28156, %r28155, %r28155, 25; + add.s32 %r28157, %r28110, %r27922; + add.s32 %r28158, %r28157, %r28104; + xor.b32 %r28159, %r28125, %r28158; + shf.l.wrap.b32 %r28160, %r28159, %r28159, 16; + add.s32 %r28161, %r28160, %r28089; + xor.b32 %r28162, %r28161, %r28104; + shf.l.wrap.b32 %r28163, %r28162, %r28162, 20; + add.s32 %r28164, %r28158, %r27845; + add.s32 %r28165, %r28164, %r28163; + xor.b32 %r28166, %r28165, %r28160; + shf.l.wrap.b32 %r28167, %r28166, %r28166, 24; + add.s32 %r28168, %r28167, %r28161; + xor.b32 %r28169, %r28168, %r28163; + shf.l.wrap.b32 %r28170, %r28169, %r28169, 25; + add.s32 %r28171, %r28123, %r27889; + add.s32 %r28172, %r28171, %r28115; + xor.b32 %r28173, %r28088, %r28172; + shf.l.wrap.b32 %r28174, %r28173, %r28173, 16; + add.s32 %r28175, %r28174, %r28102; + xor.b32 %r28176, %r28175, %r28115; + shf.l.wrap.b32 %r28177, %r28176, %r28176, 20; + add.s32 %r28178, %r28172, %r27988; + add.s32 %r28179, %r28178, %r28177; + xor.b32 %r28180, %r28179, %r28174; + shf.l.wrap.b32 %r28181, %r28180, %r28180, 24; + add.s32 %r28182, %r28181, %r28175; + xor.b32 %r28183, %r28182, %r28177; + shf.l.wrap.b32 %r28184, %r28183, %r28183, 25; + add.s32 %r28185, %r28137, %r27856; + add.s32 %r28186, %r28185, %r28156; + xor.b32 %r28187, 
%r28186, %r28181; + shf.l.wrap.b32 %r28188, %r28187, %r28187, 16; + add.s32 %r28189, %r28188, %r28168; + xor.b32 %r28190, %r28189, %r28156; + shf.l.wrap.b32 %r28191, %r28190, %r28190, 20; + add.s32 %r28192, %r28186, %r27966; + add.s32 %r28193, %r28192, %r28191; + xor.b32 %r28194, %r28193, %r28188; + shf.l.wrap.b32 %r28195, %r28194, %r28194, 24; + add.s32 %r28196, %r28195, %r28189; + xor.b32 %r28197, %r28196, %r28191; + shf.l.wrap.b32 %r28198, %r28197, %r28197, 25; + add.s32 %r28199, %r28170, %r27977; + add.s32 %r28200, %r28199, %r28151; + xor.b32 %r28201, %r28139, %r28200; + shf.l.wrap.b32 %r28202, %r28201, %r28201, 16; + add.s32 %r28203, %r28202, %r28182; + xor.b32 %r28204, %r28203, %r28170; + shf.l.wrap.b32 %r28205, %r28204, %r28204, 20; + add.s32 %r28206, %r28200, %r27900; + add.s32 %r28207, %r28206, %r28205; + xor.b32 %r28208, %r28207, %r28202; + shf.l.wrap.b32 %r28209, %r28208, %r28208, 24; + add.s32 %r28210, %r28209, %r28203; + xor.b32 %r28211, %r28210, %r28205; + shf.l.wrap.b32 %r28212, %r28211, %r28211, 25; + add.s32 %r28213, %r28165, %r27944; + add.s32 %r28214, %r28213, %r28184; + xor.b32 %r28215, %r28153, %r28214; + shf.l.wrap.b32 %r28216, %r28215, %r28215, 16; + add.s32 %r28217, %r28216, %r28140; + xor.b32 %r28218, %r28217, %r28184; + shf.l.wrap.b32 %r28219, %r28218, %r28218, 20; + add.s32 %r28220, %r28214, %r27999; + add.s32 %r28221, %r28220, %r28219; + xor.b32 %r28222, %r28221, %r28216; + shf.l.wrap.b32 %r28223, %r28222, %r28222, 24; + add.s32 %r28224, %r28223, %r28217; + xor.b32 %r28225, %r28224, %r28219; + shf.l.wrap.b32 %r28226, %r28225, %r28225, 25; + add.s32 %r28227, %r28179, %r28010; + add.s32 %r28228, %r28227, %r28142; + xor.b32 %r28229, %r28228, %r28167; + shf.l.wrap.b32 %r28230, %r28229, %r28229, 16; + add.s32 %r28231, %r28230, %r28154; + xor.b32 %r28232, %r28231, %r28142; + shf.l.wrap.b32 %r28233, %r28232, %r28232, 20; + add.s32 %r28234, %r28228, %r27933; + add.s32 %r28235, %r28234, %r28233; + xor.b32 %r28236, %r28235, %r28230; + shf.l.wrap.b32 %r28237, %r28236, %r28236, 24; + add.s32 %r28238, %r28237, %r28231; + xor.b32 %r28239, %r28238, %r28233; + shf.l.wrap.b32 %r28240, %r28239, %r28239, 25; + add.s32 %r28241, %r28193, %r27878; + add.s32 %r28242, %r28241, %r28240; + xor.b32 %r28243, %r28242, %r28209; + shf.l.wrap.b32 %r28244, %r28243, %r28243, 16; + add.s32 %r28245, %r28244, %r28224; + xor.b32 %r28246, %r28245, %r28240; + shf.l.wrap.b32 %r28247, %r28246, %r28246, 20; + add.s32 %r28248, %r28242, %r27889; + add.s32 %r28249, %r28248, %r28247; + xor.b32 %r28250, %r28249, %r28244; + shf.l.wrap.b32 %r28251, %r28250, %r28250, 24; + add.s32 %r28252, %r28251, %r28245; + xor.b32 %r28253, %r28252, %r28247; + shf.l.wrap.b32 %r28254, %r28253, %r28253, 25; + add.s32 %r28255, %r28207, %r27955; + add.s32 %r28256, %r28255, %r28198; + xor.b32 %r28257, %r28256, %r28223; + shf.l.wrap.b32 %r28258, %r28257, %r28257, 16; + add.s32 %r28259, %r28258, %r28238; + xor.b32 %r28260, %r28259, %r28198; + shf.l.wrap.b32 %r28261, %r28260, %r28260, 20; + add.s32 %r28262, %r28256, %r27977; + add.s32 %r28263, %r28262, %r28261; + xor.b32 %r28264, %r28263, %r28258; + shf.l.wrap.b32 %r28265, %r28264, %r28264, 24; + add.s32 %r28266, %r28265, %r28259; + xor.b32 %r28267, %r28266, %r28261; + shf.l.wrap.b32 %r28268, %r28267, %r28267, 25; + add.s32 %r28269, %r28221, %r27988; + add.s32 %r28270, %r28269, %r28212; + xor.b32 %r28271, %r28237, %r28270; + shf.l.wrap.b32 %r28272, %r28271, %r28271, 16; + add.s32 %r28273, %r28272, %r28196; + xor.b32 %r28274, %r28273, %r28212; + shf.l.wrap.b32 %r28275, %r28274, 
%r28274, 20; + add.s32 %r28276, %r28270, %r27867; + add.s32 %r28277, %r28276, %r28275; + xor.b32 %r28278, %r28277, %r28272; + shf.l.wrap.b32 %r28279, %r28278, %r28278, 24; + add.s32 %r28280, %r28279, %r28273; + xor.b32 %r28281, %r28280, %r28275; + shf.l.wrap.b32 %r28282, %r28281, %r28281, 25; + add.s32 %r28283, %r28235, %r27922; + add.s32 %r28284, %r28283, %r28226; + xor.b32 %r28285, %r28195, %r28284; + shf.l.wrap.b32 %r28286, %r28285, %r28285, 16; + add.s32 %r28287, %r28286, %r28210; + xor.b32 %r28288, %r28287, %r28226; + shf.l.wrap.b32 %r28289, %r28288, %r28288, 20; + add.s32 %r28290, %r28284, %r27999; + add.s32 %r28291, %r28290, %r28289; + xor.b32 %r28292, %r28291, %r28286; + shf.l.wrap.b32 %r28293, %r28292, %r28292, 24; + add.s32 %r28294, %r28293, %r28287; + xor.b32 %r28295, %r28294, %r28289; + shf.l.wrap.b32 %r28296, %r28295, %r28295, 25; + add.s32 %r28297, %r28249, %r27911; + add.s32 %r28298, %r28297, %r28268; + xor.b32 %r28299, %r28298, %r28293; + shf.l.wrap.b32 %r28300, %r28299, %r28299, 16; + add.s32 %r28301, %r28300, %r28280; + xor.b32 %r28302, %r28301, %r28268; + shf.l.wrap.b32 %r28303, %r28302, %r28302, 20; + add.s32 %r28304, %r28298, %r27900; + add.s32 %r28305, %r28304, %r28303; + xor.b32 %r28306, %r28305, %r28300; + shf.l.wrap.b32 %r28307, %r28306, %r28306, 24; + add.s32 %r28308, %r28307, %r28301; + xor.b32 %r28309, %r28308, %r28303; + shf.l.wrap.b32 %r28310, %r28309, %r28309, 25; + add.s32 %r28311, %r28282, %r27944; + add.s32 %r28312, %r28311, %r28263; + xor.b32 %r28313, %r28251, %r28312; + shf.l.wrap.b32 %r28314, %r28313, %r28313, 16; + add.s32 %r28315, %r28314, %r28294; + xor.b32 %r28316, %r28315, %r28282; + shf.l.wrap.b32 %r28317, %r28316, %r28316, 20; + add.s32 %r28318, %r28312, %r27845; + add.s32 %r28319, %r28318, %r28317; + xor.b32 %r28320, %r28319, %r28314; + shf.l.wrap.b32 %r28321, %r28320, %r28320, 24; + add.s32 %r28322, %r28321, %r28315; + xor.b32 %r28323, %r28322, %r28317; + shf.l.wrap.b32 %r28324, %r28323, %r28323, 25; + add.s32 %r28325, %r28277, %r27966; + add.s32 %r28326, %r28325, %r28296; + xor.b32 %r28327, %r28265, %r28326; + shf.l.wrap.b32 %r28328, %r28327, %r28327, 16; + add.s32 %r28329, %r28328, %r28252; + xor.b32 %r28330, %r28329, %r28296; + shf.l.wrap.b32 %r28331, %r28330, %r28330, 20; + add.s32 %r28332, %r28326, %r28010; + add.s32 %r28333, %r28332, %r28331; + xor.b32 %r28334, %r28333, %r28328; + shf.l.wrap.b32 %r28335, %r28334, %r28334, 24; + add.s32 %r28336, %r28335, %r28329; + xor.b32 %r28337, %r28336, %r28331; + shf.l.wrap.b32 %r28338, %r28337, %r28337, 25; + add.s32 %r28339, %r28291, %r27933; + add.s32 %r28340, %r28339, %r28254; + xor.b32 %r28341, %r28340, %r28279; + shf.l.wrap.b32 %r28342, %r28341, %r28341, 16; + add.s32 %r28343, %r28342, %r28266; + xor.b32 %r28344, %r28343, %r28254; + shf.l.wrap.b32 %r28345, %r28344, %r28344, 20; + add.s32 %r28346, %r28340, %r27856; + add.s32 %r28347, %r28346, %r28345; + xor.b32 %r28348, %r28347, %r28342; + shf.l.wrap.b32 %r28349, %r28348, %r28348, 24; + add.s32 %r28350, %r28349, %r28343; + xor.b32 %r28351, %r28350, %r28345; + shf.l.wrap.b32 %r28352, %r28351, %r28351, 25; + add.s32 %r28353, %r28305, %r27955; + add.s32 %r28354, %r28353, %r28352; + xor.b32 %r28355, %r28354, %r28321; + shf.l.wrap.b32 %r28356, %r28355, %r28355, 16; + add.s32 %r28357, %r28356, %r28336; + xor.b32 %r28358, %r28357, %r28352; + shf.l.wrap.b32 %r28359, %r28358, %r28358, 20; + add.s32 %r28360, %r28354, %r27922; + add.s32 %r28361, %r28360, %r28359; + xor.b32 %r28362, %r28361, %r28356; + shf.l.wrap.b32 %r28363, %r28362, %r28362, 24; + add.s32 
%r28364, %r28363, %r28357; + xor.b32 %r28365, %r28364, %r28359; + shf.l.wrap.b32 %r28366, %r28365, %r28365, 25; + add.s32 %r28367, %r28319, %r27977; + add.s32 %r28368, %r28367, %r28310; + xor.b32 %r28369, %r28368, %r28335; + shf.l.wrap.b32 %r28370, %r28369, %r28369, 16; + add.s32 %r28371, %r28370, %r28350; + xor.b32 %r28372, %r28371, %r28310; + shf.l.wrap.b32 %r28373, %r28372, %r28372, 20; + add.s32 %r28374, %r28368, %r27944; + add.s32 %r28375, %r28374, %r28373; + xor.b32 %r28376, %r28375, %r28370; + shf.l.wrap.b32 %r28377, %r28376, %r28376, 24; + add.s32 %r28378, %r28377, %r28371; + xor.b32 %r28379, %r28378, %r28373; + shf.l.wrap.b32 %r28380, %r28379, %r28379, 25; + add.s32 %r28381, %r28333, %r27999; + add.s32 %r28382, %r28381, %r28324; + xor.b32 %r28383, %r28349, %r28382; + shf.l.wrap.b32 %r28384, %r28383, %r28383, 16; + add.s32 %r28385, %r28384, %r28308; + xor.b32 %r28386, %r28385, %r28324; + shf.l.wrap.b32 %r28387, %r28386, %r28386, 20; + add.s32 %r28388, %r28382, %r27878; + add.s32 %r28389, %r28388, %r28387; + xor.b32 %r28390, %r28389, %r28384; + shf.l.wrap.b32 %r28391, %r28390, %r28390, 24; + add.s32 %r28392, %r28391, %r28385; + xor.b32 %r28393, %r28392, %r28387; + shf.l.wrap.b32 %r28394, %r28393, %r28393, 25; + add.s32 %r28395, %r28347, %r27988; + add.s32 %r28396, %r28395, %r28338; + xor.b32 %r28397, %r28307, %r28396; + shf.l.wrap.b32 %r28398, %r28397, %r28397, 16; + add.s32 %r28399, %r28398, %r28322; + xor.b32 %r28400, %r28399, %r28338; + shf.l.wrap.b32 %r28401, %r28400, %r28400, 20; + add.s32 %r28402, %r28396, %r28010; + add.s32 %r28403, %r28402, %r28401; + xor.b32 %r28404, %r28403, %r28398; + shf.l.wrap.b32 %r28405, %r28404, %r28404, 24; + add.s32 %r28406, %r28405, %r28399; + xor.b32 %r28407, %r28406, %r28401; + shf.l.wrap.b32 %r28408, %r28407, %r28407, 25; + add.s32 %r28409, %r28361, %r27889; + add.s32 %r28410, %r28409, %r28380; + xor.b32 %r28411, %r28410, %r28405; + shf.l.wrap.b32 %r28412, %r28411, %r28411, 16; + add.s32 %r28413, %r28412, %r28392; + xor.b32 %r28414, %r28413, %r28380; + shf.l.wrap.b32 %r28415, %r28414, %r28414, 20; + add.s32 %r28416, %r28410, %r27845; + add.s32 %r28417, %r28416, %r28415; + xor.b32 %r28418, %r28417, %r28412; + shf.l.wrap.b32 %r28419, %r28418, %r28418, 24; + add.s32 %r28420, %r28419, %r28413; + xor.b32 %r28421, %r28420, %r28415; + shf.l.wrap.b32 %r28422, %r28421, %r28421, 25; + add.s32 %r28423, %r28394, %r27966; + add.s32 %r28424, %r28423, %r28375; + xor.b32 %r28425, %r28363, %r28424; + shf.l.wrap.b32 %r28426, %r28425, %r28425, 16; + add.s32 %r28427, %r28426, %r28406; + xor.b32 %r28428, %r28427, %r28394; + shf.l.wrap.b32 %r28429, %r28428, %r28428, 20; + add.s32 %r28430, %r28424, %r27867; + add.s32 %r28431, %r28430, %r28429; + xor.b32 %r28432, %r28431, %r28426; + shf.l.wrap.b32 %r28433, %r28432, %r28432, 24; + add.s32 %r28434, %r28433, %r28427; + xor.b32 %r28435, %r28434, %r28429; + shf.l.wrap.b32 %r28436, %r28435, %r28435, 25; + add.s32 %r28437, %r28389, %r27900; + add.s32 %r28438, %r28437, %r28408; + xor.b32 %r28439, %r28377, %r28438; + shf.l.wrap.b32 %r28440, %r28439, %r28439, 16; + add.s32 %r28441, %r28440, %r28364; + xor.b32 %r28442, %r28441, %r28408; + shf.l.wrap.b32 %r28443, %r28442, %r28442, 20; + add.s32 %r28444, %r28438, %r27933; + add.s32 %r28445, %r28444, %r28443; + xor.b32 %r28446, %r28445, %r28440; + shf.l.wrap.b32 %r28447, %r28446, %r28446, 24; + add.s32 %r28448, %r28447, %r28441; + xor.b32 %r28449, %r28448, %r28443; + shf.l.wrap.b32 %r28450, %r28449, %r28449, 25; + add.s32 %r28451, %r28403, %r27856; + add.s32 %r28452, %r28451, 
%r28366; + xor.b32 %r28453, %r28452, %r28391; + shf.l.wrap.b32 %r28454, %r28453, %r28453, 16; + add.s32 %r28455, %r28454, %r28378; + xor.b32 %r28456, %r28455, %r28366; + shf.l.wrap.b32 %r28457, %r28456, %r28456, 20; + add.s32 %r28458, %r28452, %r27911; + add.s32 %r28459, %r28458, %r28457; + xor.b32 %r28460, %r28459, %r28454; + shf.l.wrap.b32 %r28461, %r28460, %r28460, 24; + add.s32 %r28462, %r28461, %r28455; + xor.b32 %r28463, %r28462, %r28457; + shf.l.wrap.b32 %r28464, %r28463, %r28463, 25; + add.s32 %r28465, %r28417, %r27977; + add.s32 %r28466, %r28465, %r28464; + xor.b32 %r28467, %r28466, %r28433; + shf.l.wrap.b32 %r28468, %r28467, %r28467, 16; + add.s32 %r28469, %r28468, %r28448; + xor.b32 %r28470, %r28469, %r28464; + shf.l.wrap.b32 %r28471, %r28470, %r28470, 20; + add.s32 %r28472, %r28466, %r27988; + add.s32 %r28473, %r28472, %r28471; + xor.b32 %r28474, %r28473, %r28468; + shf.l.wrap.b32 %r28475, %r28474, %r28474, 24; + add.s32 %r28476, %r28475, %r28469; + xor.b32 %r28477, %r28476, %r28471; + shf.l.wrap.b32 %r28478, %r28477, %r28477, 25; + add.s32 %r28479, %r28431, %r27944; + add.s32 %r28480, %r28479, %r28422; + xor.b32 %r28481, %r28480, %r28447; + shf.l.wrap.b32 %r28482, %r28481, %r28481, 16; + add.s32 %r28483, %r28482, %r28462; + xor.b32 %r28484, %r28483, %r28422; + shf.l.wrap.b32 %r28485, %r28484, %r28484, 20; + add.s32 %r28486, %r28480, %r27966; + add.s32 %r28487, %r28486, %r28485; + xor.b32 %r28488, %r28487, %r28482; + shf.l.wrap.b32 %r28489, %r28488, %r28488, 24; + add.s32 %r28490, %r28489, %r28483; + xor.b32 %r28491, %r28490, %r28485; + shf.l.wrap.b32 %r28492, %r28491, %r28491, 25; + add.s32 %r28493, %r28445, %r28010; + add.s32 %r28494, %r28493, %r28436; + xor.b32 %r28495, %r28461, %r28494; + shf.l.wrap.b32 %r28496, %r28495, %r28495, 16; + add.s32 %r28497, %r28496, %r28420; + xor.b32 %r28498, %r28497, %r28436; + shf.l.wrap.b32 %r28499, %r28498, %r28498, 20; + add.s32 %r28500, %r28494, %r27955; + add.s32 %r28501, %r28500, %r28499; + xor.b32 %r28502, %r28501, %r28496; + shf.l.wrap.b32 %r28503, %r28502, %r28502, 24; + add.s32 %r28504, %r28503, %r28497; + xor.b32 %r28505, %r28504, %r28499; + shf.l.wrap.b32 %r28506, %r28505, %r28505, 25; + add.s32 %r28507, %r28459, %r27999; + add.s32 %r28508, %r28507, %r28450; + xor.b32 %r28509, %r28419, %r28508; + shf.l.wrap.b32 %r28510, %r28509, %r28509, 16; + add.s32 %r28511, %r28510, %r28434; + xor.b32 %r28512, %r28511, %r28450; + shf.l.wrap.b32 %r28513, %r28512, %r28512, 20; + add.s32 %r28514, %r28508, %r27933; + add.s32 %r28515, %r28514, %r28513; + xor.b32 %r28516, %r28515, %r28510; + shf.l.wrap.b32 %r28517, %r28516, %r28516, 24; + add.s32 %r28518, %r28517, %r28511; + xor.b32 %r28519, %r28518, %r28513; + shf.l.wrap.b32 %r28520, %r28519, %r28519, 25; + add.s32 %r28521, %r28473, %r27922; + add.s32 %r28522, %r28521, %r28492; + xor.b32 %r28523, %r28522, %r28517; + shf.l.wrap.b32 %r28524, %r28523, %r28523, 16; + add.s32 %r28525, %r28524, %r28504; + xor.b32 %r28526, %r28525, %r28492; + shf.l.wrap.b32 %r28527, %r28526, %r28526, 20; + add.s32 %r28528, %r28522, %r27867; + add.s32 %r28529, %r28528, %r28527; + xor.b32 %r28530, %r28529, %r28524; + shf.l.wrap.b32 %r28531, %r28530, %r28530, 24; + add.s32 %r28532, %r28531, %r28525; + xor.b32 %r28533, %r28532, %r28527; + shf.l.wrap.b32 %r28534, %r28533, %r28533, 25; + add.s32 %r28535, %r28506, %r27900; + add.s32 %r28536, %r28535, %r28487; + xor.b32 %r28537, %r28475, %r28536; + shf.l.wrap.b32 %r28538, %r28537, %r28537, 16; + add.s32 %r28539, %r28538, %r28518; + xor.b32 %r28540, %r28539, %r28506; + 
shf.l.wrap.b32 %r28541, %r28540, %r28540, 20; + add.s32 %r28542, %r28536, %r27878; + add.s32 %r28543, %r28542, %r28541; + xor.b32 %r28544, %r28543, %r28538; + shf.l.wrap.b32 %r28545, %r28544, %r28544, 24; + add.s32 %r28546, %r28545, %r28539; + xor.b32 %r28547, %r28546, %r28541; + shf.l.wrap.b32 %r28548, %r28547, %r28547, 25; + add.s32 %r28549, %r28501, %r27845; + add.s32 %r28550, %r28549, %r28520; + xor.b32 %r28551, %r28489, %r28550; + shf.l.wrap.b32 %r28552, %r28551, %r28551, 16; + add.s32 %r28553, %r28552, %r28476; + xor.b32 %r28554, %r28553, %r28520; + shf.l.wrap.b32 %r28555, %r28554, %r28554, 20; + add.s32 %r28556, %r28550, %r27856; + add.s32 %r28557, %r28556, %r28555; + xor.b32 %r28558, %r28557, %r28552; + shf.l.wrap.b32 %r28559, %r28558, %r28558, 24; + add.s32 %r28560, %r28559, %r28553; + xor.b32 %r28561, %r28560, %r28555; + shf.l.wrap.b32 %r28562, %r28561, %r28561, 25; + add.s32 %r28563, %r28515, %r27911; + add.s32 %r28564, %r28563, %r28478; + xor.b32 %r28565, %r28564, %r28503; + shf.l.wrap.b32 %r28566, %r28565, %r28565, 16; + add.s32 %r28567, %r28566, %r28490; + xor.b32 %r28568, %r28567, %r28478; + shf.l.wrap.b32 %r28569, %r28568, %r28568, 20; + add.s32 %r28570, %r28564, %r27889; + add.s32 %r28571, %r28570, %r28569; + xor.b32 %r28572, %r28571, %r28566; + shf.l.wrap.b32 %r28573, %r28572, %r28572, 24; + add.s32 %r28574, %r28573, %r28567; + xor.b32 %r28575, %r28574, %r28569; + shf.l.wrap.b32 %r28576, %r28575, %r28575, 25; + add.s32 %r28577, %r28529, %r27944; + add.s32 %r28578, %r28577, %r28576; + xor.b32 %r28579, %r28578, %r28545; + shf.l.wrap.b32 %r28580, %r28579, %r28579, 16; + add.s32 %r28581, %r28580, %r28560; + xor.b32 %r28582, %r28581, %r28576; + shf.l.wrap.b32 %r28583, %r28582, %r28582, 20; + add.s32 %r28584, %r28578, %r27999; + add.s32 %r28585, %r28584, %r28583; + xor.b32 %r28586, %r28585, %r28580; + shf.l.wrap.b32 %r28587, %r28586, %r28586, 24; + add.s32 %r28588, %r28587, %r28581; + xor.b32 %r28589, %r28588, %r28583; + shf.l.wrap.b32 %r28590, %r28589, %r28589, 25; + add.s32 %r28591, %r28543, %r27966; + add.s32 %r28592, %r28591, %r28534; + xor.b32 %r28593, %r28592, %r28559; + shf.l.wrap.b32 %r28594, %r28593, %r28593, 16; + add.s32 %r28595, %r28594, %r28574; + xor.b32 %r28596, %r28595, %r28534; + shf.l.wrap.b32 %r28597, %r28596, %r28596, 20; + add.s32 %r28598, %r28592, %r27900; + add.s32 %r28599, %r28598, %r28597; + xor.b32 %r28600, %r28599, %r28594; + shf.l.wrap.b32 %r28601, %r28600, %r28600, 24; + add.s32 %r28602, %r28601, %r28595; + xor.b32 %r28603, %r28602, %r28597; + shf.l.wrap.b32 %r28604, %r28603, %r28603, 25; + add.s32 %r28605, %r28557, %r27933; + add.s32 %r28606, %r28605, %r28548; + xor.b32 %r28607, %r28573, %r28606; + shf.l.wrap.b32 %r28608, %r28607, %r28607, 16; + add.s32 %r28609, %r28608, %r28532; + xor.b32 %r28610, %r28609, %r28548; + shf.l.wrap.b32 %r28611, %r28610, %r28610, 20; + add.s32 %r28612, %r28606, %r27977; + add.s32 %r28613, %r28612, %r28611; + xor.b32 %r28614, %r28613, %r28608; + shf.l.wrap.b32 %r28615, %r28614, %r28614, 24; + add.s32 %r28616, %r28615, %r28609; + xor.b32 %r28617, %r28616, %r28611; + shf.l.wrap.b32 %r28618, %r28617, %r28617, 25; + add.s32 %r28619, %r28571, %r28010; + add.s32 %r28620, %r28619, %r28562; + xor.b32 %r28621, %r28531, %r28620; + shf.l.wrap.b32 %r28622, %r28621, %r28621, 16; + add.s32 %r28623, %r28622, %r28546; + xor.b32 %r28624, %r28623, %r28562; + shf.l.wrap.b32 %r28625, %r28624, %r28624, 20; + add.s32 %r28626, %r28620, %r27856; + add.s32 %r28627, %r28626, %r28625; + xor.b32 %r28628, %r28627, %r28622; + shf.l.wrap.b32 %r28629, 
%r28628, %r28628, 24; + add.s32 %r28630, %r28629, %r28623; + xor.b32 %r28631, %r28630, %r28625; + shf.l.wrap.b32 %r28632, %r28631, %r28631, 25; + add.s32 %r28633, %r28585, %r27988; + add.s32 %r28634, %r28633, %r28604; + xor.b32 %r28635, %r28634, %r28629; + shf.l.wrap.b32 %r28636, %r28635, %r28635, 16; + add.s32 %r28637, %r28636, %r28616; + xor.b32 %r28638, %r28637, %r28604; + shf.l.wrap.b32 %r28639, %r28638, %r28638, 20; + add.s32 %r28640, %r28634, %r27878; + add.s32 %r28641, %r28640, %r28639; + xor.b32 %r28642, %r28641, %r28636; + shf.l.wrap.b32 %r28643, %r28642, %r28642, 24; + add.s32 %r28644, %r28643, %r28637; + xor.b32 %r28645, %r28644, %r28639; + shf.l.wrap.b32 %r28646, %r28645, %r28645, 25; + add.s32 %r28647, %r28618, %r27845; + add.s32 %r28648, %r28647, %r28599; + xor.b32 %r28649, %r28587, %r28648; + shf.l.wrap.b32 %r28650, %r28649, %r28649, 16; + add.s32 %r28651, %r28650, %r28630; + xor.b32 %r28652, %r28651, %r28618; + shf.l.wrap.b32 %r28653, %r28652, %r28652, 20; + add.s32 %r28654, %r28648, %r27955; + add.s32 %r28655, %r28654, %r28653; + xor.b32 %r28656, %r28655, %r28650; + shf.l.wrap.b32 %r28657, %r28656, %r28656, 24; + add.s32 %r28658, %r28657, %r28651; + xor.b32 %r28659, %r28658, %r28653; + shf.l.wrap.b32 %r28660, %r28659, %r28659, 25; + add.s32 %r28661, %r28613, %r27867; + add.s32 %r28662, %r28661, %r28632; + xor.b32 %r28663, %r28601, %r28662; + shf.l.wrap.b32 %r28664, %r28663, %r28663, 16; + add.s32 %r28665, %r28664, %r28588; + xor.b32 %r28666, %r28665, %r28632; + shf.l.wrap.b32 %r28667, %r28666, %r28666, 20; + add.s32 %r28668, %r28662, %r27911; + add.s32 %r28669, %r28668, %r28667; + xor.b32 %r28670, %r28669, %r28664; + shf.l.wrap.b32 %r28671, %r28670, %r28670, 24; + add.s32 %r28672, %r28671, %r28665; + xor.b32 %r28673, %r28672, %r28667; + shf.l.wrap.b32 %r28674, %r28673, %r28673, 25; + add.s32 %r28675, %r28627, %r27889; + add.s32 %r28676, %r28675, %r28590; + xor.b32 %r28677, %r28676, %r28615; + shf.l.wrap.b32 %r28678, %r28677, %r28677, 16; + add.s32 %r28679, %r28678, %r28602; + xor.b32 %r28680, %r28679, %r28590; + shf.l.wrap.b32 %r28681, %r28680, %r28680, 20; + add.s32 %r28682, %r28676, %r27922; + add.s32 %r28683, %r28682, %r28681; + xor.b32 %r28684, %r28683, %r28678; + shf.l.wrap.b32 %r28685, %r28684, %r28684, 24; + add.s32 %r28686, %r28685, %r28679; + xor.b32 %r28687, %r28686, %r28681; + shf.l.wrap.b32 %r28688, %r28687, %r28687, 25; + add.s32 %r28689, %r28641, %r27966; + add.s32 %r28690, %r28689, %r28688; + xor.b32 %r28691, %r28690, %r28657; + shf.l.wrap.b32 %r28692, %r28691, %r28691, 16; + add.s32 %r28693, %r28692, %r28672; + xor.b32 %r28694, %r28693, %r28688; + shf.l.wrap.b32 %r28695, %r28694, %r28694, 20; + add.s32 %r28696, %r28690, %r28010; + add.s32 %r28697, %r28696, %r28695; + xor.b32 %r28698, %r28697, %r28692; + shf.l.wrap.b32 %r28699, %r28698, %r28698, 24; + add.s32 %r28700, %r28699, %r28693; + xor.b32 %r28701, %r28700, %r28695; + shf.l.wrap.b32 %r28702, %r28701, %r28701, 25; + add.s32 %r28703, %r28655, %r27900; + add.s32 %r28704, %r28703, %r28646; + xor.b32 %r28705, %r28704, %r28671; + shf.l.wrap.b32 %r28706, %r28705, %r28705, 16; + add.s32 %r28707, %r28706, %r28686; + xor.b32 %r28708, %r28707, %r28646; + shf.l.wrap.b32 %r28709, %r28708, %r28708, 20; + add.s32 %r28710, %r28704, %r27845; + add.s32 %r28711, %r28710, %r28709; + xor.b32 %r28712, %r28711, %r28706; + shf.l.wrap.b32 %r28713, %r28712, %r28712, 24; + add.s32 %r28714, %r28713, %r28707; + xor.b32 %r28715, %r28714, %r28709; + shf.l.wrap.b32 %r28716, %r28715, %r28715, 25; + add.s32 %r28717, %r28669, %r27856; + 
add.s32 %r28718, %r28717, %r28660; + xor.b32 %r28719, %r28685, %r28718; + shf.l.wrap.b32 %r28720, %r28719, %r28719, 16; + add.s32 %r28721, %r28720, %r28644; + xor.b32 %r28722, %r28721, %r28660; + shf.l.wrap.b32 %r28723, %r28722, %r28722, 20; + add.s32 %r28724, %r28718, %r27944; + add.s32 %r28725, %r28724, %r28723; + xor.b32 %r28726, %r28725, %r28720; + shf.l.wrap.b32 %r28727, %r28726, %r28726, 24; + add.s32 %r28728, %r28727, %r28721; + xor.b32 %r28729, %r28728, %r28723; + shf.l.wrap.b32 %r28730, %r28729, %r28729, 25; + add.s32 %r28731, %r28683, %r27933; + add.s32 %r28732, %r28731, %r28674; + xor.b32 %r28733, %r28643, %r28732; + shf.l.wrap.b32 %r28734, %r28733, %r28733, 16; + add.s32 %r28735, %r28734, %r28658; + xor.b32 %r28736, %r28735, %r28674; + shf.l.wrap.b32 %r28737, %r28736, %r28736, 20; + add.s32 %r28738, %r28732, %r27911; + add.s32 %r28739, %r28738, %r28737; + xor.b32 %r28740, %r28739, %r28734; + shf.l.wrap.b32 %r28741, %r28740, %r28740, 24; + add.s32 %r28742, %r28741, %r28735; + xor.b32 %r28743, %r28742, %r28737; + shf.l.wrap.b32 %r28744, %r28743, %r28743, 25; + add.s32 %r28745, %r28697, %r27999; + add.s32 %r28746, %r28745, %r28716; + xor.b32 %r28747, %r28746, %r28741; + shf.l.wrap.b32 %r28748, %r28747, %r28747, 16; + add.s32 %r28749, %r28748, %r28728; + xor.b32 %r28750, %r28749, %r28716; + shf.l.wrap.b32 %r28751, %r28750, %r28750, 20; + add.s32 %r28752, %r28746, %r27955; + add.s32 %r28753, %r28752, %r28751; + xor.b32 %r28754, %r28753, %r28748; + shf.l.wrap.b32 %r28755, %r28754, %r28754, 24; + add.s32 %r28756, %r28755, %r28749; + xor.b32 %r28757, %r28756, %r28751; + shf.l.wrap.b32 %r28758, %r28757, %r28757, 25; + add.s32 %r28759, %r28730, %r27867; + add.s32 %r28760, %r28759, %r28711; + xor.b32 %r28761, %r28699, %r28760; + shf.l.wrap.b32 %r28762, %r28761, %r28761, 16; + add.s32 %r28763, %r28762, %r28742; + xor.b32 %r28764, %r28763, %r28730; + shf.l.wrap.b32 %r28765, %r28764, %r28764, 20; + add.s32 %r28766, %r28760, %r27977; + add.s32 %r28767, %r28766, %r28765; + xor.b32 %r28768, %r28767, %r28762; + shf.l.wrap.b32 %r28769, %r28768, %r28768, 24; + add.s32 %r28770, %r28769, %r28763; + xor.b32 %r28771, %r28770, %r28765; + shf.l.wrap.b32 %r28772, %r28771, %r28771, 25; + add.s32 %r28773, %r28725, %r27878; + add.s32 %r28774, %r28773, %r28744; + xor.b32 %r28775, %r28713, %r28774; + shf.l.wrap.b32 %r28776, %r28775, %r28775, 16; + add.s32 %r28777, %r28776, %r28700; + xor.b32 %r28778, %r28777, %r28744; + shf.l.wrap.b32 %r28779, %r28778, %r28778, 20; + add.s32 %r28780, %r28774, %r27889; + add.s32 %r28781, %r28780, %r28779; + xor.b32 %r28782, %r28781, %r28776; + shf.l.wrap.b32 %r28783, %r28782, %r28782, 24; + add.s32 %r28784, %r28783, %r28777; + xor.b32 %r28785, %r28784, %r28779; + shf.l.wrap.b32 %r28786, %r28785, %r28785, 25; + add.s32 %r28787, %r28739, %r27922; + add.s32 %r28788, %r28787, %r28702; + xor.b32 %r28789, %r28788, %r28727; + shf.l.wrap.b32 %r28790, %r28789, %r28789, 16; + add.s32 %r28791, %r28790, %r28714; + xor.b32 %r28792, %r28791, %r28702; + shf.l.wrap.b32 %r28793, %r28792, %r28792, 20; + add.s32 %r28794, %r28788, %r27988; + add.s32 %r28795, %r28794, %r28793; + xor.b32 %r28796, %r28795, %r28790; + shf.l.wrap.b32 %r28797, %r28796, %r28796, 24; + add.s32 %r28798, %r28797, %r28791; + xor.b32 %r28799, %r28798, %r28793; + shf.l.wrap.b32 %r28800, %r28799, %r28799, 25; + xor.b32 %r28801, %r28753, %r28784; + cvt.u64.u32 %rd1163, %r28801; + xor.b32 %r28802, %r28798, %r28767; + and.b32 %r28803, %r28802, 255; + cvt.u64.u32 %rd1164, %r28803; + cvt.u64.u32 %rd1165, %r28802; + shl.b64 %rd1166, 
%rd1165, 32; + and.b64 %rd1167, %rd1166, 280375465082880; + and.b64 %rd1168, %rd1166, 71776119061217280; + shr.u32 %r28804, %r28802, 24; + cvt.u64.u32 %rd1169, %r28804; + shl.b64 %rd1170, %rd1169, 56; + bfi.b64 %rd1171, %rd1164, %rd1163, 32, 32; + or.b64 %rd1172, %rd1171, %rd1167; + or.b64 %rd1173, %rd1172, %rd1168; + or.b64 %rd353, %rd1173, %rd1170; + xor.b32 %r28805, %r28756, %r28781; + cvt.u64.u32 %rd1174, %r28805; + xor.b32 %r28806, %r28795, %r28770; + and.b32 %r28807, %r28806, 255; + cvt.u64.u32 %rd1175, %r28807; + cvt.u64.u32 %rd1176, %r28806; + shl.b64 %rd1177, %rd1176, 32; + and.b64 %rd1178, %rd1177, 280375465082880; + and.b64 %rd1179, %rd1177, 71776119061217280; + shr.u32 %r28808, %r28806, 24; + cvt.u64.u32 %rd1180, %r28808; + shl.b64 %rd1181, %rd1180, 56; + bfi.b64 %rd1182, %rd1175, %rd1174, 32, 32; + or.b64 %rd1183, %rd1182, %rd1178; + or.b64 %rd1184, %rd1183, %rd1179; + or.b64 %rd352, %rd1184, %rd1181; + xor.b32 %r28809, %r28800, %r28769; + cvt.u64.u32 %rd1185, %r28809; + xor.b32 %r28810, %r28758, %r28783; + and.b32 %r28811, %r28810, 255; + cvt.u64.u32 %rd1186, %r28811; + cvt.u64.u32 %rd1187, %r28810; + shl.b64 %rd1188, %rd1187, 32; + and.b64 %rd1189, %rd1188, 280375465082880; + and.b64 %rd1190, %rd1188, 71776119061217280; + shr.u32 %r28812, %r28810, 24; + cvt.u64.u32 %rd1191, %r28812; + shl.b64 %rd1192, %rd1191, 56; + bfi.b64 %rd1193, %rd1186, %rd1185, 32, 32; + or.b64 %rd1194, %rd1193, %rd1189; + or.b64 %rd1195, %rd1194, %rd1190; + or.b64 %rd1370, %rd1195, %rd1192; + xor.b32 %r28813, %r28797, %r28772; + cvt.u64.u32 %rd1196, %r28813; + xor.b32 %r28814, %r28755, %r28786; + and.b32 %r28815, %r28814, 255; + cvt.u64.u32 %rd1197, %r28815; + cvt.u64.u32 %rd1198, %r28814; + shl.b64 %rd1199, %rd1198, 32; + and.b64 %rd1200, %rd1199, 280375465082880; + and.b64 %rd1201, %rd1199, 71776119061217280; + shr.u32 %r28816, %r28814, 24; + cvt.u64.u32 %rd1202, %r28816; + shl.b64 %rd1203, %rd1202, 56; + bfi.b64 %rd1204, %rd1197, %rd1196, 32, 32; + or.b64 %rd1205, %rd1204, %rd1200; + or.b64 %rd1206, %rd1205, %rd1201; + or.b64 %rd1369, %rd1206, %rd1203; + +$L__BB2_96: + ld.const.u64 %rd354, [target+24]; + setp.eq.s64 %p55, %rd1369, %rd354; + @%p55 bra $L__BB2_98; + bra.uni $L__BB2_97; + +$L__BB2_98: + ld.const.u64 %rd355, [target+16]; + setp.eq.s64 %p56, %rd1370, %rd355; + @%p56 bra $L__BB2_100; + bra.uni $L__BB2_99; + +$L__BB2_100: + ld.const.u64 %rd356, [target+8]; + setp.eq.s64 %p57, %rd352, %rd356; + @%p57 bra $L__BB2_102; + bra.uni $L__BB2_101; + +$L__BB2_102: + ld.const.u64 %rd1251, [target]; + setp.lt.u64 %p59, %rd353, %rd1251; + bra.uni $L__BB2_103; + +$L__BB2_97: + setp.lt.u64 %p59, %rd1369, %rd354; + bra.uni $L__BB2_103; + +$L__BB2_99: + setp.lt.u64 %p59, %rd1370, %rd355; + bra.uni $L__BB2_103; + +$L__BB2_101: + setp.lt.u64 %p59, %rd352, %rd356; + +$L__BB2_103: + not.pred %p58, %p59; + @%p58 bra $L__BB2_105; + + ld.param.u64 %rd1264, [heavy_hash_param_0]; + ld.param.u64 %rd1263, [heavy_hash_param_1]; + and.b64 %rd1262, %rd1299, %rd1264; + or.b64 %rd1261, %rd1262, %rd1263; + ld.param.u64 %rd1257, [heavy_hash_param_5]; + cvta.to.global.u64 %rd1252, %rd1257; + mov.u64 %rd1253, 0; + atom.global.cas.b64 %rd1254, [%rd1252], %rd1253, %rd1261; + +$L__BB2_105: + ret; + +} + diff --git a/plugins/cuda/resources/karlsen-cuda-sm75.ptx b/plugins/cuda/resources/karlsen-cuda-sm75.ptx new file mode 100644 index 0000000..c66b59b --- /dev/null +++ b/plugins/cuda/resources/karlsen-cuda-sm75.ptx @@ -0,0 +1,42131 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-31833905 +// Cuda 
compilation tools, release 11.8, V11.8.89 +// Based on NVVM 7.0.1 +// + +.version 7.8 +.target sm_75 +.address_size 64 + +.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; +.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; +.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; +.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; +.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; +.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; +.const .align 8 .b8 target[32]; + +.func (.param .b64 func_retval0) _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh( + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1, + .param .b64 
_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3, + .param .b32 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5 +) +{ + .local .align 16 .b8 __local_depot0[224]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<28>; + .reg .b16 %rs<233>; + .reg .b32 %r<3965>; + .reg .b64 %rd<175>; + + + mov.u64 %SPL, __local_depot0; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs75, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4]; + ld.param.u64 %rd69, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd171, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + ld.param.u64 %rd71, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2]; + ld.param.u64 %rd165, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + ld.param.u64 %rd73, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd155, %rd73; + cvta.to.local.u64 %rd2, %rd71; + add.u64 %rd153, %SPL, 16; + add.u64 %rd149, %SP, 96; + cvta.to.local.u64 %rd4, %rd149; + setp.lt.u64 %p1, %rd171, 1025; + @%p1 bra $L__BB0_14; + bra.uni $L__BB0_1; + +$L__BB0_14: + add.u64 %rd162, %SPL, 0; + setp.ne.s64 %p16, %rd171, 1024; + mov.u64 %rd159, 0; + mov.u64 %rd151, %rd159; + @%p16 bra $L__BB0_16; + + mov.u64 %rd171, 0; + st.local.u64 [%rd162], %rd69; + mov.u64 %rd151, 1; + mov.u64 %rd159, 1024; + +$L__BB0_16: + setp.eq.s64 %p17, %rd151, 0; + @%p17 bra $L__BB0_21; + + or.b16 %rs1, %rs75, 1; + mov.u64 %rd163, %rd151; + +$L__BB0_18: + ld.local.u64 %rd166, [%rd162]; + ld.local.u8 %r1060, [%rd2]; + ld.local.u8 %r1061, [%rd2+1]; + prmt.b32 %r1062, %r1061, %r1060, 30212; + ld.local.u8 %r1063, [%rd2+2]; + ld.local.u8 %r1064, [%rd2+3]; + prmt.b32 %r1065, %r1064, %r1063, 30212; + prmt.b32 %r3948, %r1065, %r1062, 4180; + ld.local.u8 %r1066, [%rd2+4]; + ld.local.u8 %r1067, [%rd2+5]; + prmt.b32 %r1068, %r1067, %r1066, 30212; + ld.local.u8 %r1069, [%rd2+6]; + ld.local.u8 %r1070, [%rd2+7]; + prmt.b32 %r1071, %r1070, %r1069, 30212; + prmt.b32 %r3947, %r1071, %r1068, 4180; + ld.local.u8 %r1072, [%rd2+8]; + ld.local.u8 %r1073, [%rd2+9]; + prmt.b32 %r1074, %r1073, %r1072, 30212; + ld.local.u8 %r1075, [%rd2+10]; + ld.local.u8 %r1076, [%rd2+11]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + prmt.b32 %r3946, %r1077, %r1074, 4180; + ld.local.u8 %r1078, [%rd2+12]; + ld.local.u8 %r1079, [%rd2+13]; + prmt.b32 %r1080, %r1079, %r1078, 30212; + ld.local.u8 %r1081, [%rd2+14]; + ld.local.u8 %r1082, [%rd2+15]; + prmt.b32 %r1083, %r1082, %r1081, 30212; + prmt.b32 %r3945, %r1083, %r1080, 4180; + mov.u64 %rd167, 16; + ld.local.u8 %r1084, [%rd2+16]; + ld.local.u8 %r1085, [%rd2+17]; + prmt.b32 %r1086, %r1085, %r1084, 30212; + ld.local.u8 %r1087, [%rd2+18]; + ld.local.u8 %r1088, [%rd2+19]; + prmt.b32 %r1089, %r1088, %r1087, 30212; + prmt.b32 %r3944, %r1089, %r1086, 4180; + ld.local.u8 %r1090, [%rd2+20]; + ld.local.u8 %r1091, [%rd2+21]; + prmt.b32 %r1092, %r1091, %r1090, 30212; + ld.local.u8 %r1093, [%rd2+22]; + ld.local.u8 
%r1094, [%rd2+23]; + prmt.b32 %r1095, %r1094, %r1093, 30212; + prmt.b32 %r3943, %r1095, %r1092, 4180; + ld.local.u8 %r1096, [%rd2+24]; + ld.local.u8 %r1097, [%rd2+25]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd2+26]; + ld.local.u8 %r1100, [%rd2+27]; + prmt.b32 %r1101, %r1100, %r1099, 30212; + prmt.b32 %r3942, %r1101, %r1098, 4180; + ld.local.u8 %r1102, [%rd2+28]; + ld.local.u8 %r1103, [%rd2+29]; + prmt.b32 %r1104, %r1103, %r1102, 30212; + ld.local.u8 %r1105, [%rd2+30]; + ld.local.u8 %r1106, [%rd2+31]; + prmt.b32 %r1107, %r1106, %r1105, 30212; + prmt.b32 %r3941, %r1107, %r1104, 4180; + mov.u16 %rs197, %rs1; + +$L__BB0_19: + shr.u64 %rd143, %rd165, 32; + cvt.u32.u64 %r3940, %rd143; + cvt.u32.u64 %r3939, %rd165; + setp.eq.s64 %p18, %rd167, 1; + selp.b16 %rs79, 2, 0, %p18; + or.b16 %rs80, %rs79, %rs197; + ld.u8 %r1108, [%rd166]; + ld.u8 %r1109, [%rd166+1]; + prmt.b32 %r1110, %r1109, %r1108, 30212; + ld.u8 %r1111, [%rd166+2]; + prmt.b32 %r1112, %r1111, %r1110, 28756; + ld.u8 %r1113, [%rd166+3]; + prmt.b32 %r1114, %r1113, %r1112, 1620; + ld.u8 %r1115, [%rd166+4]; + ld.u8 %r1116, [%rd166+5]; + prmt.b32 %r1117, %r1116, %r1115, 30212; + ld.u8 %r1118, [%rd166+6]; + prmt.b32 %r1119, %r1118, %r1117, 28756; + ld.u8 %r1120, [%rd166+7]; + prmt.b32 %r1121, %r1120, %r1119, 1620; + ld.u8 %r1122, [%rd166+8]; + ld.u8 %r1123, [%rd166+9]; + prmt.b32 %r1124, %r1123, %r1122, 30212; + ld.u8 %r1125, [%rd166+10]; + prmt.b32 %r1126, %r1125, %r1124, 28756; + ld.u8 %r1127, [%rd166+11]; + prmt.b32 %r1128, %r1127, %r1126, 1620; + ld.u8 %r1129, [%rd166+12]; + ld.u8 %r1130, [%rd166+13]; + prmt.b32 %r1131, %r1130, %r1129, 30212; + ld.u8 %r1132, [%rd166+14]; + prmt.b32 %r1133, %r1132, %r1131, 28756; + ld.u8 %r1134, [%rd166+15]; + prmt.b32 %r1135, %r1134, %r1133, 1620; + ld.u8 %r1136, [%rd166+16]; + ld.u8 %r1137, [%rd166+17]; + prmt.b32 %r1138, %r1137, %r1136, 30212; + ld.u8 %r1139, [%rd166+18]; + prmt.b32 %r1140, %r1139, %r1138, 28756; + ld.u8 %r1141, [%rd166+19]; + prmt.b32 %r1142, %r1141, %r1140, 1620; + ld.u8 %r1143, [%rd166+20]; + ld.u8 %r1144, [%rd166+21]; + prmt.b32 %r1145, %r1144, %r1143, 30212; + ld.u8 %r1146, [%rd166+22]; + prmt.b32 %r1147, %r1146, %r1145, 28756; + ld.u8 %r1148, [%rd166+23]; + prmt.b32 %r1149, %r1148, %r1147, 1620; + ld.u8 %r1150, [%rd166+24]; + ld.u8 %r1151, [%rd166+25]; + prmt.b32 %r1152, %r1151, %r1150, 30212; + ld.u8 %r1153, [%rd166+26]; + prmt.b32 %r1154, %r1153, %r1152, 28756; + ld.u8 %r1155, [%rd166+27]; + prmt.b32 %r1156, %r1155, %r1154, 1620; + ld.u8 %r1157, [%rd166+28]; + ld.u8 %r1158, [%rd166+29]; + prmt.b32 %r1159, %r1158, %r1157, 30212; + ld.u8 %r1160, [%rd166+30]; + prmt.b32 %r1161, %r1160, %r1159, 28756; + ld.u8 %r1162, [%rd166+31]; + prmt.b32 %r1163, %r1162, %r1161, 1620; + ld.u8 %r1164, [%rd166+32]; + ld.u8 %r1165, [%rd166+33]; + prmt.b32 %r1166, %r1165, %r1164, 30212; + ld.u8 %r1167, [%rd166+34]; + prmt.b32 %r1168, %r1167, %r1166, 28756; + ld.u8 %r1169, [%rd166+35]; + prmt.b32 %r1170, %r1169, %r1168, 1620; + ld.u8 %r1171, [%rd166+36]; + ld.u8 %r1172, [%rd166+37]; + prmt.b32 %r1173, %r1172, %r1171, 30212; + ld.u8 %r1174, [%rd166+38]; + prmt.b32 %r1175, %r1174, %r1173, 28756; + ld.u8 %r1176, [%rd166+39]; + prmt.b32 %r1177, %r1176, %r1175, 1620; + ld.u8 %r1178, [%rd166+40]; + ld.u8 %r1179, [%rd166+41]; + prmt.b32 %r1180, %r1179, %r1178, 30212; + ld.u8 %r1181, [%rd166+42]; + prmt.b32 %r1182, %r1181, %r1180, 28756; + ld.u8 %r1183, [%rd166+43]; + prmt.b32 %r1184, %r1183, %r1182, 1620; + ld.u8 %r1185, [%rd166+44]; + ld.u8 %r1186, [%rd166+45]; + prmt.b32 %r1187, 
%r1186, %r1185, 30212; + ld.u8 %r1188, [%rd166+46]; + prmt.b32 %r1189, %r1188, %r1187, 28756; + ld.u8 %r1190, [%rd166+47]; + prmt.b32 %r1191, %r1190, %r1189, 1620; + ld.u8 %r1192, [%rd166+48]; + ld.u8 %r1193, [%rd166+49]; + prmt.b32 %r1194, %r1193, %r1192, 30212; + ld.u8 %r1195, [%rd166+50]; + prmt.b32 %r1196, %r1195, %r1194, 28756; + ld.u8 %r1197, [%rd166+51]; + prmt.b32 %r1198, %r1197, %r1196, 1620; + ld.u8 %r1199, [%rd166+52]; + ld.u8 %r1200, [%rd166+53]; + prmt.b32 %r1201, %r1200, %r1199, 30212; + ld.u8 %r1202, [%rd166+54]; + prmt.b32 %r1203, %r1202, %r1201, 28756; + ld.u8 %r1204, [%rd166+55]; + prmt.b32 %r1205, %r1204, %r1203, 1620; + ld.u8 %r1206, [%rd166+56]; + ld.u8 %r1207, [%rd166+57]; + prmt.b32 %r1208, %r1207, %r1206, 30212; + ld.u8 %r1209, [%rd166+58]; + prmt.b32 %r1210, %r1209, %r1208, 28756; + ld.u8 %r1211, [%rd166+59]; + prmt.b32 %r1212, %r1211, %r1210, 1620; + ld.u8 %r1213, [%rd166+60]; + ld.u8 %r1214, [%rd166+61]; + prmt.b32 %r1215, %r1214, %r1213, 30212; + ld.u8 %r1216, [%rd166+62]; + prmt.b32 %r1217, %r1216, %r1215, 28756; + ld.u8 %r1218, [%rd166+63]; + prmt.b32 %r1219, %r1218, %r1217, 1620; + cvt.u32.u16 %r1220, %rs80; + and.b32 %r1221, %r1220, 255; + add.s32 %r1222, %r3944, %r3948; + add.s32 %r1223, %r1222, %r1114; + xor.b32 %r1224, %r1223, %r3939; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 16; + add.s32 %r1226, %r1225, 1779033703; + xor.b32 %r1227, %r1226, %r3944; + shf.l.wrap.b32 %r1228, %r1227, %r1227, 20; + add.s32 %r1229, %r1121, %r1223; + add.s32 %r1230, %r1229, %r1228; + xor.b32 %r1231, %r1230, %r1225; + shf.l.wrap.b32 %r1232, %r1231, %r1231, 24; + add.s32 %r1233, %r1232, %r1226; + xor.b32 %r1234, %r1233, %r1228; + shf.l.wrap.b32 %r1235, %r1234, %r1234, 25; + add.s32 %r1236, %r3943, %r3947; + add.s32 %r1237, %r1236, %r1128; + xor.b32 %r1238, %r1237, %r3940; + shf.l.wrap.b32 %r1239, %r1238, %r1238, 16; + add.s32 %r1240, %r1239, -1150833019; + xor.b32 %r1241, %r1240, %r3943; + shf.l.wrap.b32 %r1242, %r1241, %r1241, 20; + add.s32 %r1243, %r1135, %r1237; + add.s32 %r1244, %r1243, %r1242; + xor.b32 %r1245, %r1244, %r1239; + shf.l.wrap.b32 %r1246, %r1245, %r1245, 24; + add.s32 %r1247, %r1246, %r1240; + xor.b32 %r1248, %r1247, %r1242; + shf.l.wrap.b32 %r1249, %r1248, %r1248, 25; + add.s32 %r1250, %r3942, %r3946; + add.s32 %r1251, %r1250, %r1142; + shr.u32 %r1252, %r1251, 16; + shl.b32 %r1253, %r1251, 16; + xor.b32 %r1254, %r1253, 4194304; + or.b32 %r1255, %r1254, %r1252; + add.s32 %r1256, %r1255, 1013904242; + xor.b32 %r1257, %r1256, %r3942; + shf.l.wrap.b32 %r1258, %r1257, %r1257, 20; + add.s32 %r1259, %r1149, %r1251; + add.s32 %r1260, %r1259, %r1258; + xor.b32 %r1261, %r1260, %r1255; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 24; + add.s32 %r1263, %r1262, %r1256; + xor.b32 %r1264, %r1263, %r1258; + shf.l.wrap.b32 %r1265, %r1264, %r1264, 25; + add.s32 %r1266, %r3941, %r3945; + add.s32 %r1267, %r1266, %r1156; + xor.b32 %r1268, %r1267, %r1221; + shr.u32 %r1269, %r1267, 16; + shl.b32 %r1270, %r1268, 16; + or.b32 %r1271, %r1270, %r1269; + add.s32 %r1272, %r1271, -1521486534; + xor.b32 %r1273, %r1272, %r3941; + shf.l.wrap.b32 %r1274, %r1273, %r1273, 20; + add.s32 %r1275, %r1163, %r1267; + add.s32 %r1276, %r1275, %r1274; + xor.b32 %r1277, %r1276, %r1271; + shf.l.wrap.b32 %r1278, %r1277, %r1277, 24; + add.s32 %r1279, %r1278, %r1272; + xor.b32 %r1280, %r1279, %r1274; + shf.l.wrap.b32 %r1281, %r1280, %r1280, 25; + add.s32 %r1282, %r1249, %r1230; + add.s32 %r1283, %r1282, %r1170; + xor.b32 %r1284, %r1278, %r1283; + shf.l.wrap.b32 %r1285, %r1284, %r1284, 16; + add.s32 %r1286, 
%r1285, %r1263; + xor.b32 %r1287, %r1286, %r1249; + shf.l.wrap.b32 %r1288, %r1287, %r1287, 20; + add.s32 %r1289, %r1177, %r1283; + add.s32 %r1290, %r1289, %r1288; + xor.b32 %r1291, %r1290, %r1285; + shf.l.wrap.b32 %r1292, %r1291, %r1291, 24; + add.s32 %r1293, %r1292, %r1286; + xor.b32 %r1294, %r1293, %r1288; + shf.l.wrap.b32 %r1295, %r1294, %r1294, 25; + add.s32 %r1296, %r1265, %r1244; + add.s32 %r1297, %r1296, %r1184; + xor.b32 %r1298, %r1297, %r1232; + shf.l.wrap.b32 %r1299, %r1298, %r1298, 16; + add.s32 %r1300, %r1299, %r1279; + xor.b32 %r1301, %r1300, %r1265; + shf.l.wrap.b32 %r1302, %r1301, %r1301, 20; + add.s32 %r1303, %r1191, %r1297; + add.s32 %r1304, %r1303, %r1302; + xor.b32 %r1305, %r1304, %r1299; + shf.l.wrap.b32 %r1306, %r1305, %r1305, 24; + add.s32 %r1307, %r1306, %r1300; + xor.b32 %r1308, %r1307, %r1302; + shf.l.wrap.b32 %r1309, %r1308, %r1308, 25; + add.s32 %r1310, %r1281, %r1260; + add.s32 %r1311, %r1310, %r1198; + xor.b32 %r1312, %r1311, %r1246; + shf.l.wrap.b32 %r1313, %r1312, %r1312, 16; + add.s32 %r1314, %r1313, %r1233; + xor.b32 %r1315, %r1314, %r1281; + shf.l.wrap.b32 %r1316, %r1315, %r1315, 20; + add.s32 %r1317, %r1205, %r1311; + add.s32 %r1318, %r1317, %r1316; + xor.b32 %r1319, %r1318, %r1313; + shf.l.wrap.b32 %r1320, %r1319, %r1319, 24; + add.s32 %r1321, %r1320, %r1314; + xor.b32 %r1322, %r1321, %r1316; + shf.l.wrap.b32 %r1323, %r1322, %r1322, 25; + add.s32 %r1324, %r1276, %r1235; + add.s32 %r1325, %r1324, %r1212; + xor.b32 %r1326, %r1325, %r1262; + shf.l.wrap.b32 %r1327, %r1326, %r1326, 16; + add.s32 %r1328, %r1327, %r1247; + xor.b32 %r1329, %r1328, %r1235; + shf.l.wrap.b32 %r1330, %r1329, %r1329, 20; + add.s32 %r1331, %r1219, %r1325; + add.s32 %r1332, %r1331, %r1330; + xor.b32 %r1333, %r1332, %r1327; + shf.l.wrap.b32 %r1334, %r1333, %r1333, 24; + add.s32 %r1335, %r1334, %r1328; + xor.b32 %r1336, %r1335, %r1330; + shf.l.wrap.b32 %r1337, %r1336, %r1336, 25; + add.s32 %r1338, %r1290, %r1128; + add.s32 %r1339, %r1338, %r1337; + xor.b32 %r1340, %r1339, %r1306; + shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; + add.s32 %r1342, %r1341, %r1321; + xor.b32 %r1343, %r1342, %r1337; + shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; + add.s32 %r1345, %r1339, %r1156; + add.s32 %r1346, %r1345, %r1344; + xor.b32 %r1347, %r1346, %r1341; + shf.l.wrap.b32 %r1348, %r1347, %r1347, 24; + add.s32 %r1349, %r1348, %r1342; + xor.b32 %r1350, %r1349, %r1344; + shf.l.wrap.b32 %r1351, %r1350, %r1350, 25; + add.s32 %r1352, %r1304, %r1135; + add.s32 %r1353, %r1352, %r1295; + xor.b32 %r1354, %r1320, %r1353; + shf.l.wrap.b32 %r1355, %r1354, %r1354, 16; + add.s32 %r1356, %r1335, %r1355; + xor.b32 %r1357, %r1356, %r1295; + shf.l.wrap.b32 %r1358, %r1357, %r1357, 20; + add.s32 %r1359, %r1353, %r1184; + add.s32 %r1360, %r1359, %r1358; + xor.b32 %r1361, %r1360, %r1355; + shf.l.wrap.b32 %r1362, %r1361, %r1361, 24; + add.s32 %r1363, %r1362, %r1356; + xor.b32 %r1364, %r1363, %r1358; + shf.l.wrap.b32 %r1365, %r1364, %r1364, 25; + add.s32 %r1366, %r1309, %r1163; + add.s32 %r1367, %r1366, %r1318; + xor.b32 %r1368, %r1334, %r1367; + shf.l.wrap.b32 %r1369, %r1368, %r1368, 16; + add.s32 %r1370, %r1369, %r1293; + xor.b32 %r1371, %r1370, %r1309; + shf.l.wrap.b32 %r1372, %r1371, %r1371, 20; + add.s32 %r1373, %r1367, %r1114; + add.s32 %r1374, %r1373, %r1372; + xor.b32 %r1375, %r1374, %r1369; + shf.l.wrap.b32 %r1376, %r1375, %r1375, 24; + add.s32 %r1377, %r1376, %r1370; + xor.b32 %r1378, %r1377, %r1372; + shf.l.wrap.b32 %r1379, %r1378, %r1378, 25; + add.s32 %r1380, %r1323, %r1142; + add.s32 %r1381, %r1380, %r1332; + xor.b32 
%r1382, %r1381, %r1292; + shf.l.wrap.b32 %r1383, %r1382, %r1382, 16; + add.s32 %r1384, %r1383, %r1307; + xor.b32 %r1385, %r1384, %r1323; + shf.l.wrap.b32 %r1386, %r1385, %r1385, 20; + add.s32 %r1387, %r1381, %r1205; + add.s32 %r1388, %r1387, %r1386; + xor.b32 %r1389, %r1388, %r1383; + shf.l.wrap.b32 %r1390, %r1389, %r1389, 24; + add.s32 %r1391, %r1390, %r1384; + xor.b32 %r1392, %r1391, %r1386; + shf.l.wrap.b32 %r1393, %r1392, %r1392, 25; + add.s32 %r1394, %r1365, %r1121; + add.s32 %r1395, %r1394, %r1346; + xor.b32 %r1396, %r1395, %r1390; + shf.l.wrap.b32 %r1397, %r1396, %r1396, 16; + add.s32 %r1398, %r1397, %r1377; + xor.b32 %r1399, %r1398, %r1365; + shf.l.wrap.b32 %r1400, %r1399, %r1399, 20; + add.s32 %r1401, %r1395, %r1191; + add.s32 %r1402, %r1401, %r1400; + xor.b32 %r1403, %r1402, %r1397; + shf.l.wrap.b32 %r1404, %r1403, %r1403, 24; + add.s32 %r1405, %r1404, %r1398; + xor.b32 %r1406, %r1405, %r1400; + shf.l.wrap.b32 %r1407, %r1406, %r1406, 25; + add.s32 %r1408, %r1360, %r1198; + add.s32 %r1409, %r1408, %r1379; + xor.b32 %r1410, %r1348, %r1409; + shf.l.wrap.b32 %r1411, %r1410, %r1410, 16; + add.s32 %r1412, %r1411, %r1391; + xor.b32 %r1413, %r1412, %r1379; + shf.l.wrap.b32 %r1414, %r1413, %r1413, 20; + add.s32 %r1415, %r1409, %r1149; + add.s32 %r1416, %r1415, %r1414; + xor.b32 %r1417, %r1416, %r1411; + shf.l.wrap.b32 %r1418, %r1417, %r1417, 24; + add.s32 %r1419, %r1418, %r1412; + xor.b32 %r1420, %r1419, %r1414; + shf.l.wrap.b32 %r1421, %r1420, %r1420, 25; + add.s32 %r1422, %r1374, %r1177; + add.s32 %r1423, %r1422, %r1393; + xor.b32 %r1424, %r1423, %r1362; + shf.l.wrap.b32 %r1425, %r1424, %r1424, 16; + add.s32 %r1426, %r1425, %r1349; + xor.b32 %r1427, %r1426, %r1393; + shf.l.wrap.b32 %r1428, %r1427, %r1427, 20; + add.s32 %r1429, %r1423, %r1212; + add.s32 %r1430, %r1429, %r1428; + xor.b32 %r1431, %r1430, %r1425; + shf.l.wrap.b32 %r1432, %r1431, %r1431, 24; + add.s32 %r1433, %r1432, %r1426; + xor.b32 %r1434, %r1433, %r1428; + shf.l.wrap.b32 %r1435, %r1434, %r1434, 25; + add.s32 %r1436, %r1388, %r1219; + add.s32 %r1437, %r1436, %r1351; + xor.b32 %r1438, %r1437, %r1376; + shf.l.wrap.b32 %r1439, %r1438, %r1438, 16; + add.s32 %r1440, %r1439, %r1363; + xor.b32 %r1441, %r1440, %r1351; + shf.l.wrap.b32 %r1442, %r1441, %r1441, 20; + add.s32 %r1443, %r1437, %r1170; + add.s32 %r1444, %r1443, %r1442; + xor.b32 %r1445, %r1444, %r1439; + shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; + add.s32 %r1447, %r1446, %r1440; + xor.b32 %r1448, %r1447, %r1442; + shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; + add.s32 %r1450, %r1402, %r1135; + add.s32 %r1451, %r1450, %r1449; + xor.b32 %r1452, %r1451, %r1418; + shf.l.wrap.b32 %r1453, %r1452, %r1452, 16; + add.s32 %r1454, %r1453, %r1433; + xor.b32 %r1455, %r1454, %r1449; + shf.l.wrap.b32 %r1456, %r1455, %r1455, 20; + add.s32 %r1457, %r1451, %r1142; + add.s32 %r1458, %r1457, %r1456; + xor.b32 %r1459, %r1458, %r1453; + shf.l.wrap.b32 %r1460, %r1459, %r1459, 24; + add.s32 %r1461, %r1460, %r1454; + xor.b32 %r1462, %r1461, %r1456; + shf.l.wrap.b32 %r1463, %r1462, %r1462, 25; + add.s32 %r1464, %r1416, %r1184; + add.s32 %r1465, %r1464, %r1407; + xor.b32 %r1466, %r1465, %r1432; + shf.l.wrap.b32 %r1467, %r1466, %r1466, 16; + add.s32 %r1468, %r1467, %r1447; + xor.b32 %r1469, %r1468, %r1407; + shf.l.wrap.b32 %r1470, %r1469, %r1469, 20; + add.s32 %r1471, %r1465, %r1198; + add.s32 %r1472, %r1471, %r1470; + xor.b32 %r1473, %r1472, %r1467; + shf.l.wrap.b32 %r1474, %r1473, %r1473, 24; + add.s32 %r1475, %r1474, %r1468; + xor.b32 %r1476, %r1475, %r1470; + shf.l.wrap.b32 %r1477, %r1476, 
%r1476, 25; + add.s32 %r1478, %r1430, %r1205; + add.s32 %r1479, %r1478, %r1421; + xor.b32 %r1480, %r1446, %r1479; + shf.l.wrap.b32 %r1481, %r1480, %r1480, 16; + add.s32 %r1482, %r1481, %r1405; + xor.b32 %r1483, %r1482, %r1421; + shf.l.wrap.b32 %r1484, %r1483, %r1483, 20; + add.s32 %r1485, %r1479, %r1128; + add.s32 %r1486, %r1485, %r1484; + xor.b32 %r1487, %r1486, %r1481; + shf.l.wrap.b32 %r1488, %r1487, %r1487, 24; + add.s32 %r1489, %r1488, %r1482; + xor.b32 %r1490, %r1489, %r1484; + shf.l.wrap.b32 %r1491, %r1490, %r1490, 25; + add.s32 %r1492, %r1435, %r1163; + add.s32 %r1493, %r1492, %r1444; + xor.b32 %r1494, %r1493, %r1404; + shf.l.wrap.b32 %r1495, %r1494, %r1494, 16; + add.s32 %r1496, %r1495, %r1419; + xor.b32 %r1497, %r1496, %r1435; + shf.l.wrap.b32 %r1498, %r1497, %r1497, 20; + add.s32 %r1499, %r1493, %r1212; + add.s32 %r1500, %r1499, %r1498; + xor.b32 %r1501, %r1500, %r1495; + shf.l.wrap.b32 %r1502, %r1501, %r1501, 24; + add.s32 %r1503, %r1502, %r1496; + xor.b32 %r1504, %r1503, %r1498; + shf.l.wrap.b32 %r1505, %r1504, %r1504, 25; + add.s32 %r1506, %r1477, %r1156; + add.s32 %r1507, %r1506, %r1458; + xor.b32 %r1508, %r1507, %r1502; + shf.l.wrap.b32 %r1509, %r1508, %r1508, 16; + add.s32 %r1510, %r1509, %r1489; + xor.b32 %r1511, %r1510, %r1477; + shf.l.wrap.b32 %r1512, %r1511, %r1511, 20; + add.s32 %r1513, %r1507, %r1149; + add.s32 %r1514, %r1513, %r1512; + xor.b32 %r1515, %r1514, %r1509; + shf.l.wrap.b32 %r1516, %r1515, %r1515, 24; + add.s32 %r1517, %r1516, %r1510; + xor.b32 %r1518, %r1517, %r1512; + shf.l.wrap.b32 %r1519, %r1518, %r1518, 25; + add.s32 %r1520, %r1472, %r1177; + add.s32 %r1521, %r1520, %r1491; + xor.b32 %r1522, %r1460, %r1521; + shf.l.wrap.b32 %r1523, %r1522, %r1522, 16; + add.s32 %r1524, %r1523, %r1503; + xor.b32 %r1525, %r1524, %r1491; + shf.l.wrap.b32 %r1526, %r1525, %r1525, 20; + add.s32 %r1527, %r1521, %r1114; + add.s32 %r1528, %r1527, %r1526; + xor.b32 %r1529, %r1528, %r1523; + shf.l.wrap.b32 %r1530, %r1529, %r1529, 24; + add.s32 %r1531, %r1530, %r1524; + xor.b32 %r1532, %r1531, %r1526; + shf.l.wrap.b32 %r1533, %r1532, %r1532, 25; + add.s32 %r1534, %r1486, %r1191; + add.s32 %r1535, %r1534, %r1505; + xor.b32 %r1536, %r1535, %r1474; + shf.l.wrap.b32 %r1537, %r1536, %r1536, 16; + add.s32 %r1538, %r1537, %r1461; + xor.b32 %r1539, %r1538, %r1505; + shf.l.wrap.b32 %r1540, %r1539, %r1539, 20; + add.s32 %r1541, %r1535, %r1219; + add.s32 %r1542, %r1541, %r1540; + xor.b32 %r1543, %r1542, %r1537; + shf.l.wrap.b32 %r1544, %r1543, %r1543, 24; + add.s32 %r1545, %r1544, %r1538; + xor.b32 %r1546, %r1545, %r1540; + shf.l.wrap.b32 %r1547, %r1546, %r1546, 25; + add.s32 %r1548, %r1500, %r1170; + add.s32 %r1549, %r1548, %r1463; + xor.b32 %r1550, %r1549, %r1488; + shf.l.wrap.b32 %r1551, %r1550, %r1550, 16; + add.s32 %r1552, %r1551, %r1475; + xor.b32 %r1553, %r1552, %r1463; + shf.l.wrap.b32 %r1554, %r1553, %r1553, 20; + add.s32 %r1555, %r1549, %r1121; + add.s32 %r1556, %r1555, %r1554; + xor.b32 %r1557, %r1556, %r1551; + shf.l.wrap.b32 %r1558, %r1557, %r1557, 24; + add.s32 %r1559, %r1558, %r1552; + xor.b32 %r1560, %r1559, %r1554; + shf.l.wrap.b32 %r1561, %r1560, %r1560, 25; + add.s32 %r1562, %r1514, %r1184; + add.s32 %r1563, %r1562, %r1561; + xor.b32 %r1564, %r1563, %r1530; + shf.l.wrap.b32 %r1565, %r1564, %r1564, 16; + add.s32 %r1566, %r1565, %r1545; + xor.b32 %r1567, %r1566, %r1561; + shf.l.wrap.b32 %r1568, %r1567, %r1567, 20; + add.s32 %r1569, %r1563, %r1163; + add.s32 %r1570, %r1569, %r1568; + xor.b32 %r1571, %r1570, %r1565; + shf.l.wrap.b32 %r1572, %r1571, %r1571, 24; + add.s32 
%r1573, %r1572, %r1566; + xor.b32 %r1574, %r1573, %r1568; + shf.l.wrap.b32 %r1575, %r1574, %r1574, 25; + add.s32 %r1576, %r1528, %r1198; + add.s32 %r1577, %r1576, %r1519; + xor.b32 %r1578, %r1577, %r1544; + shf.l.wrap.b32 %r1579, %r1578, %r1578, 16; + add.s32 %r1580, %r1579, %r1559; + xor.b32 %r1581, %r1580, %r1519; + shf.l.wrap.b32 %r1582, %r1581, %r1581, 20; + add.s32 %r1583, %r1577, %r1177; + add.s32 %r1584, %r1583, %r1582; + xor.b32 %r1585, %r1584, %r1579; + shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; + add.s32 %r1587, %r1586, %r1580; + xor.b32 %r1588, %r1587, %r1582; + shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; + add.s32 %r1590, %r1542, %r1212; + add.s32 %r1591, %r1590, %r1533; + xor.b32 %r1592, %r1558, %r1591; + shf.l.wrap.b32 %r1593, %r1592, %r1592, 16; + add.s32 %r1594, %r1593, %r1517; + xor.b32 %r1595, %r1594, %r1533; + shf.l.wrap.b32 %r1596, %r1595, %r1595, 20; + add.s32 %r1597, %r1591, %r1135; + add.s32 %r1598, %r1597, %r1596; + xor.b32 %r1599, %r1598, %r1593; + shf.l.wrap.b32 %r1600, %r1599, %r1599, 24; + add.s32 %r1601, %r1600, %r1594; + xor.b32 %r1602, %r1601, %r1596; + shf.l.wrap.b32 %r1603, %r1602, %r1602, 25; + add.s32 %r1604, %r1547, %r1205; + add.s32 %r1605, %r1604, %r1556; + xor.b32 %r1606, %r1605, %r1516; + shf.l.wrap.b32 %r1607, %r1606, %r1606, 16; + add.s32 %r1608, %r1607, %r1531; + xor.b32 %r1609, %r1608, %r1547; + shf.l.wrap.b32 %r1610, %r1609, %r1609, 20; + add.s32 %r1611, %r1605, %r1219; + add.s32 %r1612, %r1611, %r1610; + xor.b32 %r1613, %r1612, %r1607; + shf.l.wrap.b32 %r1614, %r1613, %r1613, 24; + add.s32 %r1615, %r1614, %r1608; + xor.b32 %r1616, %r1615, %r1610; + shf.l.wrap.b32 %r1617, %r1616, %r1616, 25; + add.s32 %r1618, %r1589, %r1142; + add.s32 %r1619, %r1618, %r1570; + xor.b32 %r1620, %r1619, %r1614; + shf.l.wrap.b32 %r1621, %r1620, %r1620, 16; + add.s32 %r1622, %r1621, %r1601; + xor.b32 %r1623, %r1622, %r1589; + shf.l.wrap.b32 %r1624, %r1623, %r1623, 20; + add.s32 %r1625, %r1619, %r1114; + add.s32 %r1626, %r1625, %r1624; + xor.b32 %r1627, %r1626, %r1621; + shf.l.wrap.b32 %r1628, %r1627, %r1627, 24; + add.s32 %r1629, %r1628, %r1622; + xor.b32 %r1630, %r1629, %r1624; + shf.l.wrap.b32 %r1631, %r1630, %r1630, 25; + add.s32 %r1632, %r1584, %r1191; + add.s32 %r1633, %r1632, %r1603; + xor.b32 %r1634, %r1572, %r1633; + shf.l.wrap.b32 %r1635, %r1634, %r1634, 16; + add.s32 %r1636, %r1635, %r1615; + xor.b32 %r1637, %r1636, %r1603; + shf.l.wrap.b32 %r1638, %r1637, %r1637, 20; + add.s32 %r1639, %r1633, %r1128; + add.s32 %r1640, %r1639, %r1638; + xor.b32 %r1641, %r1640, %r1635; + shf.l.wrap.b32 %r1642, %r1641, %r1641, 24; + add.s32 %r1643, %r1642, %r1636; + xor.b32 %r1644, %r1643, %r1638; + shf.l.wrap.b32 %r1645, %r1644, %r1644, 25; + add.s32 %r1646, %r1598, %r1149; + add.s32 %r1647, %r1646, %r1617; + xor.b32 %r1648, %r1647, %r1586; + shf.l.wrap.b32 %r1649, %r1648, %r1648, 16; + add.s32 %r1650, %r1649, %r1573; + xor.b32 %r1651, %r1650, %r1617; + shf.l.wrap.b32 %r1652, %r1651, %r1651, 20; + add.s32 %r1653, %r1647, %r1170; + add.s32 %r1654, %r1653, %r1652; + xor.b32 %r1655, %r1654, %r1649; + shf.l.wrap.b32 %r1656, %r1655, %r1655, 24; + add.s32 %r1657, %r1656, %r1650; + xor.b32 %r1658, %r1657, %r1652; + shf.l.wrap.b32 %r1659, %r1658, %r1658, 25; + add.s32 %r1660, %r1612, %r1121; + add.s32 %r1661, %r1660, %r1575; + xor.b32 %r1662, %r1661, %r1600; + shf.l.wrap.b32 %r1663, %r1662, %r1662, 16; + add.s32 %r1664, %r1663, %r1587; + xor.b32 %r1665, %r1664, %r1575; + shf.l.wrap.b32 %r1666, %r1665, %r1665, 20; + add.s32 %r1667, %r1661, %r1156; + add.s32 %r1668, %r1667, %r1666; + 
xor.b32 %r1669, %r1668, %r1663; + shf.l.wrap.b32 %r1670, %r1669, %r1669, 24; + add.s32 %r1671, %r1670, %r1664; + xor.b32 %r1672, %r1671, %r1666; + shf.l.wrap.b32 %r1673, %r1672, %r1672, 25; + add.s32 %r1674, %r1626, %r1198; + add.s32 %r1675, %r1674, %r1673; + xor.b32 %r1676, %r1675, %r1642; + shf.l.wrap.b32 %r1677, %r1676, %r1676, 16; + add.s32 %r1678, %r1677, %r1657; + xor.b32 %r1679, %r1678, %r1673; + shf.l.wrap.b32 %r1680, %r1679, %r1679, 20; + add.s32 %r1681, %r1675, %r1205; + add.s32 %r1682, %r1681, %r1680; + xor.b32 %r1683, %r1682, %r1677; + shf.l.wrap.b32 %r1684, %r1683, %r1683, 24; + add.s32 %r1685, %r1684, %r1678; + xor.b32 %r1686, %r1685, %r1680; + shf.l.wrap.b32 %r1687, %r1686, %r1686, 25; + add.s32 %r1688, %r1640, %r1177; + add.s32 %r1689, %r1688, %r1631; + xor.b32 %r1690, %r1689, %r1656; + shf.l.wrap.b32 %r1691, %r1690, %r1690, 16; + add.s32 %r1692, %r1691, %r1671; + xor.b32 %r1693, %r1692, %r1631; + shf.l.wrap.b32 %r1694, %r1693, %r1693, 20; + add.s32 %r1695, %r1689, %r1191; + add.s32 %r1696, %r1695, %r1694; + xor.b32 %r1697, %r1696, %r1691; + shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; + add.s32 %r1699, %r1698, %r1692; + xor.b32 %r1700, %r1699, %r1694; + shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; + add.s32 %r1702, %r1654, %r1219; + add.s32 %r1703, %r1702, %r1645; + xor.b32 %r1704, %r1670, %r1703; + shf.l.wrap.b32 %r1705, %r1704, %r1704, 16; + add.s32 %r1706, %r1705, %r1629; + xor.b32 %r1707, %r1706, %r1645; + shf.l.wrap.b32 %r1708, %r1707, %r1707, 20; + add.s32 %r1709, %r1703, %r1184; + add.s32 %r1710, %r1709, %r1708; + xor.b32 %r1711, %r1710, %r1705; + shf.l.wrap.b32 %r1712, %r1711, %r1711, 24; + add.s32 %r1713, %r1712, %r1706; + xor.b32 %r1714, %r1713, %r1708; + shf.l.wrap.b32 %r1715, %r1714, %r1714, 25; + add.s32 %r1716, %r1659, %r1212; + add.s32 %r1717, %r1716, %r1668; + xor.b32 %r1718, %r1717, %r1628; + shf.l.wrap.b32 %r1719, %r1718, %r1718, 16; + add.s32 %r1720, %r1719, %r1643; + xor.b32 %r1721, %r1720, %r1659; + shf.l.wrap.b32 %r1722, %r1721, %r1721, 20; + add.s32 %r1723, %r1717, %r1170; + add.s32 %r1724, %r1723, %r1722; + xor.b32 %r1725, %r1724, %r1719; + shf.l.wrap.b32 %r1726, %r1725, %r1725, 24; + add.s32 %r1727, %r1726, %r1720; + xor.b32 %r1728, %r1727, %r1722; + shf.l.wrap.b32 %r1729, %r1728, %r1728, 25; + add.s32 %r1730, %r1701, %r1163; + add.s32 %r1731, %r1730, %r1682; + xor.b32 %r1732, %r1731, %r1726; + shf.l.wrap.b32 %r1733, %r1732, %r1732, 16; + add.s32 %r1734, %r1733, %r1713; + xor.b32 %r1735, %r1734, %r1701; + shf.l.wrap.b32 %r1736, %r1735, %r1735, 20; + add.s32 %r1737, %r1731, %r1128; + add.s32 %r1738, %r1737, %r1736; + xor.b32 %r1739, %r1738, %r1733; + shf.l.wrap.b32 %r1740, %r1739, %r1739, 24; + add.s32 %r1741, %r1740, %r1734; + xor.b32 %r1742, %r1741, %r1736; + shf.l.wrap.b32 %r1743, %r1742, %r1742, 25; + add.s32 %r1744, %r1696, %r1149; + add.s32 %r1745, %r1744, %r1715; + xor.b32 %r1746, %r1684, %r1745; + shf.l.wrap.b32 %r1747, %r1746, %r1746, 16; + add.s32 %r1748, %r1747, %r1727; + xor.b32 %r1749, %r1748, %r1715; + shf.l.wrap.b32 %r1750, %r1749, %r1749, 20; + add.s32 %r1751, %r1745, %r1135; + add.s32 %r1752, %r1751, %r1750; + xor.b32 %r1753, %r1752, %r1747; + shf.l.wrap.b32 %r1754, %r1753, %r1753, 24; + add.s32 %r1755, %r1754, %r1748; + xor.b32 %r1756, %r1755, %r1750; + shf.l.wrap.b32 %r1757, %r1756, %r1756, 25; + add.s32 %r1758, %r1710, %r1114; + add.s32 %r1759, %r1758, %r1729; + xor.b32 %r1760, %r1759, %r1698; + shf.l.wrap.b32 %r1761, %r1760, %r1760, 16; + add.s32 %r1762, %r1761, %r1685; + xor.b32 %r1763, %r1762, %r1729; + shf.l.wrap.b32 %r1764, 
%r1763, %r1763, 20; + add.s32 %r1765, %r1759, %r1121; + add.s32 %r1766, %r1765, %r1764; + xor.b32 %r1767, %r1766, %r1761; + shf.l.wrap.b32 %r1768, %r1767, %r1767, 24; + add.s32 %r1769, %r1768, %r1762; + xor.b32 %r1770, %r1769, %r1764; + shf.l.wrap.b32 %r1771, %r1770, %r1770, 25; + add.s32 %r1772, %r1724, %r1156; + add.s32 %r1773, %r1772, %r1687; + xor.b32 %r1774, %r1773, %r1712; + shf.l.wrap.b32 %r1775, %r1774, %r1774, 16; + add.s32 %r1776, %r1775, %r1699; + xor.b32 %r1777, %r1776, %r1687; + shf.l.wrap.b32 %r1778, %r1777, %r1777, 20; + add.s32 %r1779, %r1773, %r1142; + add.s32 %r1780, %r1779, %r1778; + xor.b32 %r1781, %r1780, %r1775; + shf.l.wrap.b32 %r1782, %r1781, %r1781, 24; + add.s32 %r1783, %r1782, %r1776; + xor.b32 %r1784, %r1783, %r1778; + shf.l.wrap.b32 %r1785, %r1784, %r1784, 25; + add.s32 %r1786, %r1738, %r1177; + add.s32 %r1787, %r1786, %r1785; + xor.b32 %r1788, %r1787, %r1754; + shf.l.wrap.b32 %r1789, %r1788, %r1788, 16; + add.s32 %r1790, %r1789, %r1769; + xor.b32 %r1791, %r1790, %r1785; + shf.l.wrap.b32 %r1792, %r1791, %r1791, 20; + add.s32 %r1793, %r1787, %r1212; + add.s32 %r1794, %r1793, %r1792; + xor.b32 %r1795, %r1794, %r1789; + shf.l.wrap.b32 %r1796, %r1795, %r1795, 24; + add.s32 %r1797, %r1796, %r1790; + xor.b32 %r1798, %r1797, %r1792; + shf.l.wrap.b32 %r1799, %r1798, %r1798, 25; + add.s32 %r1800, %r1752, %r1191; + add.s32 %r1801, %r1800, %r1743; + xor.b32 %r1802, %r1801, %r1768; + shf.l.wrap.b32 %r1803, %r1802, %r1802, 16; + add.s32 %r1804, %r1803, %r1783; + xor.b32 %r1805, %r1804, %r1743; + shf.l.wrap.b32 %r1806, %r1805, %r1805, 20; + add.s32 %r1807, %r1801, %r1149; + add.s32 %r1808, %r1807, %r1806; + xor.b32 %r1809, %r1808, %r1803; + shf.l.wrap.b32 %r1810, %r1809, %r1809, 24; + add.s32 %r1811, %r1810, %r1804; + xor.b32 %r1812, %r1811, %r1806; + shf.l.wrap.b32 %r1813, %r1812, %r1812, 25; + add.s32 %r1814, %r1766, %r1170; + add.s32 %r1815, %r1814, %r1757; + xor.b32 %r1816, %r1782, %r1815; + shf.l.wrap.b32 %r1817, %r1816, %r1816, 16; + add.s32 %r1818, %r1817, %r1741; + xor.b32 %r1819, %r1818, %r1757; + shf.l.wrap.b32 %r1820, %r1819, %r1819, 20; + add.s32 %r1821, %r1815, %r1198; + add.s32 %r1822, %r1821, %r1820; + xor.b32 %r1823, %r1822, %r1817; + shf.l.wrap.b32 %r1824, %r1823, %r1823, 24; + add.s32 %r1825, %r1824, %r1818; + xor.b32 %r1826, %r1825, %r1820; + shf.l.wrap.b32 %r1827, %r1826, %r1826, 25; + add.s32 %r1828, %r1771, %r1219; + add.s32 %r1829, %r1828, %r1780; + xor.b32 %r1830, %r1829, %r1740; + shf.l.wrap.b32 %r1831, %r1830, %r1830, 16; + add.s32 %r1832, %r1831, %r1755; + xor.b32 %r1833, %r1832, %r1771; + shf.l.wrap.b32 %r1834, %r1833, %r1833, 20; + add.s32 %r1835, %r1829, %r1121; + add.s32 %r1836, %r1835, %r1834; + xor.b32 %r1837, %r1836, %r1831; + shf.l.wrap.b32 %r1838, %r1837, %r1837, 24; + add.s32 %r1839, %r1838, %r1832; + xor.b32 %r1840, %r1839, %r1834; + shf.l.wrap.b32 %r1841, %r1840, %r1840, 25; + add.s32 %r1842, %r1813, %r1205; + add.s32 %r1843, %r1842, %r1794; + xor.b32 %r1844, %r1843, %r1838; + shf.l.wrap.b32 %r1845, %r1844, %r1844, 16; + add.s32 %r1846, %r1845, %r1825; + xor.b32 %r1847, %r1846, %r1813; + shf.l.wrap.b32 %r1848, %r1847, %r1847, 20; + add.s32 %r1849, %r1843, %r1135; + add.s32 %r1850, %r1849, %r1848; + xor.b32 %r1851, %r1850, %r1845; + shf.l.wrap.b32 %r1852, %r1851, %r1851, 24; + add.s32 %r1853, %r1852, %r1846; + xor.b32 %r1854, %r1853, %r1848; + shf.l.wrap.b32 %r1855, %r1854, %r1854, 25; + add.s32 %r1856, %r1808, %r1114; + add.s32 %r1857, %r1856, %r1827; + xor.b32 %r1858, %r1796, %r1857; + shf.l.wrap.b32 %r1859, %r1858, %r1858, 16; + 
add.s32 %r1860, %r1859, %r1839; + xor.b32 %r1861, %r1860, %r1827; + shf.l.wrap.b32 %r1862, %r1861, %r1861, 20; + add.s32 %r1863, %r1857, %r1184; + add.s32 %r1864, %r1863, %r1862; + xor.b32 %r1865, %r1864, %r1859; + shf.l.wrap.b32 %r1866, %r1865, %r1865, 24; + add.s32 %r1867, %r1866, %r1860; + xor.b32 %r1868, %r1867, %r1862; + shf.l.wrap.b32 %r1869, %r1868, %r1868, 25; + add.s32 %r1870, %r1822, %r1128; + add.s32 %r1871, %r1870, %r1841; + xor.b32 %r1872, %r1871, %r1810; + shf.l.wrap.b32 %r1873, %r1872, %r1872, 16; + add.s32 %r1874, %r1873, %r1797; + xor.b32 %r1875, %r1874, %r1841; + shf.l.wrap.b32 %r1876, %r1875, %r1875, 20; + add.s32 %r1877, %r1871, %r1156; + add.s32 %r1878, %r1877, %r1876; + xor.b32 %r1879, %r1878, %r1873; + shf.l.wrap.b32 %r1880, %r1879, %r1879, 24; + add.s32 %r1881, %r1880, %r1874; + xor.b32 %r1882, %r1881, %r1876; + shf.l.wrap.b32 %r1883, %r1882, %r1882, 25; + add.s32 %r1884, %r1836, %r1142; + add.s32 %r1885, %r1884, %r1799; + xor.b32 %r1886, %r1885, %r1824; + shf.l.wrap.b32 %r1887, %r1886, %r1886, 16; + add.s32 %r1888, %r1887, %r1811; + xor.b32 %r1889, %r1888, %r1799; + shf.l.wrap.b32 %r1890, %r1889, %r1889, 20; + add.s32 %r1891, %r1885, %r1163; + add.s32 %r1892, %r1891, %r1890; + xor.b32 %r1893, %r1892, %r1887; + shf.l.wrap.b32 %r1894, %r1893, %r1893, 24; + add.s32 %r1895, %r1894, %r1888; + xor.b32 %r1896, %r1895, %r1890; + shf.l.wrap.b32 %r1897, %r1896, %r1896, 25; + add.s32 %r1898, %r1850, %r1191; + add.s32 %r1899, %r1898, %r1897; + xor.b32 %r1900, %r1899, %r1866; + shf.l.wrap.b32 %r1901, %r1900, %r1900, 16; + add.s32 %r1902, %r1901, %r1881; + xor.b32 %r1903, %r1902, %r1897; + shf.l.wrap.b32 %r1904, %r1903, %r1903, 20; + add.s32 %r1905, %r1899, %r1219; + add.s32 %r1906, %r1905, %r1904; + xor.b32 %r1907, %r1906, %r1901; + shf.l.wrap.b32 %r1908, %r1907, %r1907, 24; + add.s32 %r1909, %r1908, %r1902; + xor.b32 %r1910, %r1909, %r1904; + shf.l.wrap.b32 %r1911, %r1910, %r1910, 25; + add.s32 %r1912, %r1864, %r1149; + add.s32 %r1913, %r1912, %r1855; + xor.b32 %r1914, %r1913, %r1880; + shf.l.wrap.b32 %r1915, %r1914, %r1914, 16; + add.s32 %r1916, %r1915, %r1895; + xor.b32 %r1917, %r1916, %r1855; + shf.l.wrap.b32 %r1918, %r1917, %r1917, 20; + add.s32 %r1919, %r1913, %r1114; + add.s32 %r1920, %r1919, %r1918; + xor.b32 %r1921, %r1920, %r1915; + shf.l.wrap.b32 %r1922, %r1921, %r1921, 24; + add.s32 %r1923, %r1922, %r1916; + xor.b32 %r1924, %r1923, %r1918; + shf.l.wrap.b32 %r1925, %r1924, %r1924, 25; + add.s32 %r1926, %r1878, %r1121; + add.s32 %r1927, %r1926, %r1869; + xor.b32 %r1928, %r1894, %r1927; + shf.l.wrap.b32 %r1929, %r1928, %r1928, 16; + add.s32 %r1930, %r1929, %r1853; + xor.b32 %r1931, %r1930, %r1869; + shf.l.wrap.b32 %r1932, %r1931, %r1931, 20; + add.s32 %r1933, %r1927, %r1177; + add.s32 %r1934, %r1933, %r1932; + xor.b32 %r1935, %r1934, %r1929; + shf.l.wrap.b32 %r1936, %r1935, %r1935, 24; + add.s32 %r1937, %r1936, %r1930; + xor.b32 %r1938, %r1937, %r1932; + shf.l.wrap.b32 %r1939, %r1938, %r1938, 25; + add.s32 %r1940, %r1883, %r1170; + add.s32 %r1941, %r1940, %r1892; + xor.b32 %r1942, %r1941, %r1852; + shf.l.wrap.b32 %r1943, %r1942, %r1942, 16; + add.s32 %r1944, %r1943, %r1867; + xor.b32 %r1945, %r1944, %r1883; + shf.l.wrap.b32 %r1946, %r1945, %r1945, 20; + add.s32 %r1947, %r1941, %r1156; + add.s32 %r1948, %r1947, %r1946; + xor.b32 %r1949, %r1948, %r1943; + shf.l.wrap.b32 %r1950, %r1949, %r1949, 24; + add.s32 %r1951, %r1950, %r1944; + xor.b32 %r1952, %r1951, %r1946; + shf.l.wrap.b32 %r1953, %r1952, %r1952, 25; + add.s32 %r1954, %r1925, %r1212; + add.s32 %r1955, %r1954, 
%r1906; + xor.b32 %r1956, %r1955, %r1950; + shf.l.wrap.b32 %r1957, %r1956, %r1956, 16; + add.s32 %r1958, %r1957, %r1937; + xor.b32 %r1959, %r1958, %r1925; + shf.l.wrap.b32 %r1960, %r1959, %r1959, 20; + add.s32 %r1961, %r1955, %r1184; + add.s32 %r1962, %r1961, %r1960; + xor.b32 %r1963, %r1962, %r1957; + shf.l.wrap.b32 %r1964, %r1963, %r1963, 24; + add.s32 %r1965, %r1964, %r1958; + xor.b32 %r1966, %r1965, %r1960; + shf.l.wrap.b32 %r1967, %r1966, %r1966, 25; + add.s32 %r1968, %r1920, %r1128; + add.s32 %r1969, %r1968, %r1939; + xor.b32 %r1970, %r1908, %r1969; + shf.l.wrap.b32 %r1971, %r1970, %r1970, 16; + add.s32 %r1972, %r1971, %r1951; + xor.b32 %r1973, %r1972, %r1939; + shf.l.wrap.b32 %r1974, %r1973, %r1973, 20; + add.s32 %r1975, %r1969, %r1198; + add.s32 %r1976, %r1975, %r1974; + xor.b32 %r1977, %r1976, %r1971; + shf.l.wrap.b32 %r1978, %r1977, %r1977, 24; + add.s32 %r1979, %r1978, %r1972; + xor.b32 %r1980, %r1979, %r1974; + shf.l.wrap.b32 %r1981, %r1980, %r1980, 25; + add.s32 %r1982, %r1934, %r1135; + add.s32 %r1983, %r1982, %r1953; + xor.b32 %r1984, %r1983, %r1922; + shf.l.wrap.b32 %r1985, %r1984, %r1984, 16; + add.s32 %r1986, %r1985, %r1909; + xor.b32 %r1987, %r1986, %r1953; + shf.l.wrap.b32 %r1988, %r1987, %r1987, 20; + add.s32 %r1989, %r1983, %r1142; + add.s32 %r1990, %r1989, %r1988; + xor.b32 %r1991, %r1990, %r1985; + shf.l.wrap.b32 %r1992, %r1991, %r1991, 24; + add.s32 %r1993, %r1992, %r1986; + xor.b32 %r1994, %r1993, %r1988; + shf.l.wrap.b32 %r1995, %r1994, %r1994, 25; + add.s32 %r1996, %r1948, %r1163; + add.s32 %r1997, %r1996, %r1911; + xor.b32 %r1998, %r1997, %r1936; + shf.l.wrap.b32 %r1999, %r1998, %r1998, 16; + add.s32 %r2000, %r1999, %r1923; + xor.b32 %r2001, %r2000, %r1911; + shf.l.wrap.b32 %r2002, %r2001, %r2001, 20; + add.s32 %r2003, %r1997, %r1205; + add.s32 %r2004, %r2003, %r2002; + xor.b32 %r2005, %r2004, %r1999; + shf.l.wrap.b32 %r2006, %r2005, %r2005, 24; + add.s32 %r2007, %r2006, %r2000; + xor.b32 %r2008, %r2007, %r2002; + shf.l.wrap.b32 %r2009, %r2008, %r2008, 25; + xor.b32 %r3948, %r1993, %r1962; + xor.b32 %r3947, %r2007, %r1976; + xor.b32 %r3946, %r1965, %r1990; + xor.b32 %r3945, %r2004, %r1979; + xor.b32 %r3944, %r2009, %r1978; + xor.b32 %r3943, %r1967, %r1992; + xor.b32 %r3942, %r2006, %r1981; + xor.b32 %r3941, %r1995, %r1964; + add.s64 %rd166, %rd166, 64; + add.s64 %rd167, %rd167, -1; + setp.ne.s64 %p19, %rd167, 0; + mov.u16 %rs197, %rs75; + @%p19 bra $L__BB0_19; + + st.local.u8 [%rd155], %r3948; + shr.u32 %r2010, %r3948, 8; + st.local.u8 [%rd155+1], %r2010; + shr.u32 %r2011, %r3948, 16; + st.local.u8 [%rd155+2], %r2011; + shr.u32 %r2012, %r3948, 24; + st.local.u8 [%rd155+3], %r2012; + st.local.u8 [%rd155+4], %r3947; + shr.u32 %r2013, %r3947, 8; + st.local.u8 [%rd155+5], %r2013; + shr.u32 %r2014, %r3947, 16; + st.local.u8 [%rd155+6], %r2014; + shr.u32 %r2015, %r3947, 24; + st.local.u8 [%rd155+7], %r2015; + st.local.u8 [%rd155+8], %r3946; + shr.u32 %r2016, %r3946, 8; + st.local.u8 [%rd155+9], %r2016; + shr.u32 %r2017, %r3946, 16; + st.local.u8 [%rd155+10], %r2017; + shr.u32 %r2018, %r3946, 24; + st.local.u8 [%rd155+11], %r2018; + st.local.u8 [%rd155+12], %r3945; + shr.u32 %r2019, %r3945, 8; + st.local.u8 [%rd155+13], %r2019; + shr.u32 %r2020, %r3945, 16; + st.local.u8 [%rd155+14], %r2020; + shr.u32 %r2021, %r3945, 24; + st.local.u8 [%rd155+15], %r2021; + st.local.u8 [%rd155+16], %r3944; + shr.u32 %r2022, %r3944, 8; + st.local.u8 [%rd155+17], %r2022; + shr.u32 %r2023, %r3944, 16; + st.local.u8 [%rd155+18], %r2023; + shr.u32 %r2024, %r3944, 24; + st.local.u8 
[%rd155+19], %r2024; + st.local.u8 [%rd155+20], %r3943; + shr.u32 %r2025, %r3943, 8; + st.local.u8 [%rd155+21], %r2025; + shr.u32 %r2026, %r3943, 16; + st.local.u8 [%rd155+22], %r2026; + shr.u32 %r2027, %r3943, 24; + st.local.u8 [%rd155+23], %r2027; + st.local.u8 [%rd155+24], %r3942; + shr.u32 %r2028, %r3942, 8; + st.local.u8 [%rd155+25], %r2028; + shr.u32 %r2029, %r3942, 16; + st.local.u8 [%rd155+26], %r2029; + shr.u32 %r2030, %r3942, 24; + st.local.u8 [%rd155+27], %r2030; + st.local.u8 [%rd155+28], %r3941; + shr.u32 %r2031, %r3941, 8; + st.local.u8 [%rd155+29], %r2031; + shr.u32 %r2032, %r3941, 16; + st.local.u8 [%rd155+30], %r2032; + shr.u32 %r2033, %r3941, 24; + st.local.u8 [%rd155+31], %r2033; + add.s64 %rd165, %rd165, 1; + add.s64 %rd162, %rd162, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd163, %rd163, -1; + setp.ne.s64 %p20, %rd163, 0; + @%p20 bra $L__BB0_18; + +$L__BB0_21: + ld.param.u64 %rd139, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + setp.ge.u64 %p21, %rd159, %rd139; + @%p21 bra $L__BB0_30; + + ld.param.u64 %rd140, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd135, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + add.s64 %rd127, %rd151, %rd135; + ld.local.u8 %r2034, [%rd2]; + ld.local.u8 %r2035, [%rd2+1]; + prmt.b32 %r2036, %r2035, %r2034, 30212; + ld.local.u8 %r2037, [%rd2+2]; + ld.local.u8 %r2038, [%rd2+3]; + prmt.b32 %r2039, %r2038, %r2037, 30212; + prmt.b32 %r3964, %r2039, %r2036, 4180; + ld.local.u8 %r2040, [%rd2+4]; + ld.local.u8 %r2041, [%rd2+5]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd2+6]; + ld.local.u8 %r2044, [%rd2+7]; + prmt.b32 %r2045, %r2044, %r2043, 30212; + prmt.b32 %r3963, %r2045, %r2042, 4180; + ld.local.u8 %r2046, [%rd2+8]; + ld.local.u8 %r2047, [%rd2+9]; + prmt.b32 %r2048, %r2047, %r2046, 30212; + ld.local.u8 %r2049, [%rd2+10]; + ld.local.u8 %r2050, [%rd2+11]; + prmt.b32 %r2051, %r2050, %r2049, 30212; + prmt.b32 %r3962, %r2051, %r2048, 4180; + ld.local.u8 %r2052, [%rd2+12]; + ld.local.u8 %r2053, [%rd2+13]; + prmt.b32 %r2054, %r2053, %r2052, 30212; + ld.local.u8 %r2055, [%rd2+14]; + ld.local.u8 %r2056, [%rd2+15]; + prmt.b32 %r2057, %r2056, %r2055, 30212; + prmt.b32 %r3961, %r2057, %r2054, 4180; + ld.local.u8 %r2058, [%rd2+16]; + ld.local.u8 %r2059, [%rd2+17]; + prmt.b32 %r2060, %r2059, %r2058, 30212; + ld.local.u8 %r2061, [%rd2+18]; + ld.local.u8 %r2062, [%rd2+19]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + prmt.b32 %r3960, %r2063, %r2060, 4180; + ld.local.u8 %r2064, [%rd2+20]; + ld.local.u8 %r2065, [%rd2+21]; + prmt.b32 %r2066, %r2065, %r2064, 30212; + ld.local.u8 %r2067, [%rd2+22]; + ld.local.u8 %r2068, [%rd2+23]; + prmt.b32 %r2069, %r2068, %r2067, 30212; + prmt.b32 %r3959, %r2069, %r2066, 4180; + ld.local.u8 %r2070, [%rd2+24]; + ld.local.u8 %r2071, [%rd2+25]; + prmt.b32 %r2072, %r2071, %r2070, 30212; + ld.local.u8 %r2073, [%rd2+26]; + ld.local.u8 %r2074, [%rd2+27]; + prmt.b32 %r2075, %r2074, %r2073, 30212; + prmt.b32 %r3958, %r2075, %r2072, 4180; + ld.local.u8 %r2076, [%rd2+28]; + ld.local.u8 %r2077, [%rd2+29]; + prmt.b32 %r2078, %r2077, %r2076, 30212; + ld.local.u8 %r2079, [%rd2+30]; + ld.local.u8 %r2080, [%rd2+31]; + prmt.b32 %r2081, %r2080, %r2079, 30212; + prmt.b32 %r3957, %r2081, %r2078, 4180; + add.u64 %rd53, %SPL, 16; + mov.u32 %r2082, 0; + st.local.v2.u32 [%rd53], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+8], {%r2082, 
%r2082}; + st.local.v2.u32 [%rd53+16], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+24], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+32], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+40], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+48], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+56], {%r2082, %r2082}; + mov.u16 %rs199, 0; + st.local.v2.u8 [%rd53+64], {%rs199, %rs199}; + st.local.u8 [%rd53+66], %rs75; + add.s64 %rd170, %rd140, %rd159; + cvt.u32.u64 %r36, %rd127; + shr.u64 %rd129, %rd127, 32; + cvt.u32.u64 %r37, %rd129; + setp.lt.u64 %p22, %rd171, 65; + @%p22 bra $L__BB0_25; + + add.s64 %rd56, %rd53, 64; + mov.u16 %rs198, 0; + +$L__BB0_24: + and.b16 %rs83, %rs198, 255; + setp.eq.s16 %p23, %rs83, 0; + selp.u16 %rs84, 1, 0, %p23; + or.b16 %rs85, %rs84, %rs75; + ld.u8 %r2083, [%rd170]; + ld.u8 %r2084, [%rd170+1]; + prmt.b32 %r2085, %r2084, %r2083, 30212; + ld.u8 %r2086, [%rd170+2]; + prmt.b32 %r2087, %r2086, %r2085, 28756; + ld.u8 %r2088, [%rd170+3]; + prmt.b32 %r2089, %r2088, %r2087, 1620; + ld.u8 %r2090, [%rd170+4]; + ld.u8 %r2091, [%rd170+5]; + prmt.b32 %r2092, %r2091, %r2090, 30212; + ld.u8 %r2093, [%rd170+6]; + prmt.b32 %r2094, %r2093, %r2092, 28756; + ld.u8 %r2095, [%rd170+7]; + prmt.b32 %r2096, %r2095, %r2094, 1620; + ld.u8 %r2097, [%rd170+8]; + ld.u8 %r2098, [%rd170+9]; + prmt.b32 %r2099, %r2098, %r2097, 30212; + ld.u8 %r2100, [%rd170+10]; + prmt.b32 %r2101, %r2100, %r2099, 28756; + ld.u8 %r2102, [%rd170+11]; + prmt.b32 %r2103, %r2102, %r2101, 1620; + ld.u8 %r2104, [%rd170+12]; + ld.u8 %r2105, [%rd170+13]; + prmt.b32 %r2106, %r2105, %r2104, 30212; + ld.u8 %r2107, [%rd170+14]; + prmt.b32 %r2108, %r2107, %r2106, 28756; + ld.u8 %r2109, [%rd170+15]; + prmt.b32 %r2110, %r2109, %r2108, 1620; + ld.u8 %r2111, [%rd170+16]; + ld.u8 %r2112, [%rd170+17]; + prmt.b32 %r2113, %r2112, %r2111, 30212; + ld.u8 %r2114, [%rd170+18]; + prmt.b32 %r2115, %r2114, %r2113, 28756; + ld.u8 %r2116, [%rd170+19]; + prmt.b32 %r2117, %r2116, %r2115, 1620; + ld.u8 %r2118, [%rd170+20]; + ld.u8 %r2119, [%rd170+21]; + prmt.b32 %r2120, %r2119, %r2118, 30212; + ld.u8 %r2121, [%rd170+22]; + prmt.b32 %r2122, %r2121, %r2120, 28756; + ld.u8 %r2123, [%rd170+23]; + prmt.b32 %r2124, %r2123, %r2122, 1620; + ld.u8 %r2125, [%rd170+24]; + ld.u8 %r2126, [%rd170+25]; + prmt.b32 %r2127, %r2126, %r2125, 30212; + ld.u8 %r2128, [%rd170+26]; + prmt.b32 %r2129, %r2128, %r2127, 28756; + ld.u8 %r2130, [%rd170+27]; + prmt.b32 %r2131, %r2130, %r2129, 1620; + ld.u8 %r2132, [%rd170+28]; + ld.u8 %r2133, [%rd170+29]; + prmt.b32 %r2134, %r2133, %r2132, 30212; + ld.u8 %r2135, [%rd170+30]; + prmt.b32 %r2136, %r2135, %r2134, 28756; + ld.u8 %r2137, [%rd170+31]; + prmt.b32 %r2138, %r2137, %r2136, 1620; + ld.u8 %r2139, [%rd170+32]; + ld.u8 %r2140, [%rd170+33]; + prmt.b32 %r2141, %r2140, %r2139, 30212; + ld.u8 %r2142, [%rd170+34]; + prmt.b32 %r2143, %r2142, %r2141, 28756; + ld.u8 %r2144, [%rd170+35]; + prmt.b32 %r2145, %r2144, %r2143, 1620; + ld.u8 %r2146, [%rd170+36]; + ld.u8 %r2147, [%rd170+37]; + prmt.b32 %r2148, %r2147, %r2146, 30212; + ld.u8 %r2149, [%rd170+38]; + prmt.b32 %r2150, %r2149, %r2148, 28756; + ld.u8 %r2151, [%rd170+39]; + prmt.b32 %r2152, %r2151, %r2150, 1620; + ld.u8 %r2153, [%rd170+40]; + ld.u8 %r2154, [%rd170+41]; + prmt.b32 %r2155, %r2154, %r2153, 30212; + ld.u8 %r2156, [%rd170+42]; + prmt.b32 %r2157, %r2156, %r2155, 28756; + ld.u8 %r2158, [%rd170+43]; + prmt.b32 %r2159, %r2158, %r2157, 1620; + ld.u8 %r2160, [%rd170+44]; + ld.u8 %r2161, [%rd170+45]; + prmt.b32 %r2162, %r2161, %r2160, 30212; + ld.u8 %r2163, [%rd170+46]; + prmt.b32 %r2164, %r2163, 
%r2162, 28756; + ld.u8 %r2165, [%rd170+47]; + prmt.b32 %r2166, %r2165, %r2164, 1620; + ld.u8 %r2167, [%rd170+48]; + ld.u8 %r2168, [%rd170+49]; + prmt.b32 %r2169, %r2168, %r2167, 30212; + ld.u8 %r2170, [%rd170+50]; + prmt.b32 %r2171, %r2170, %r2169, 28756; + ld.u8 %r2172, [%rd170+51]; + prmt.b32 %r2173, %r2172, %r2171, 1620; + ld.u8 %r2174, [%rd170+52]; + ld.u8 %r2175, [%rd170+53]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.u8 %r2177, [%rd170+54]; + prmt.b32 %r2178, %r2177, %r2176, 28756; + ld.u8 %r2179, [%rd170+55]; + prmt.b32 %r2180, %r2179, %r2178, 1620; + ld.u8 %r2181, [%rd170+56]; + ld.u8 %r2182, [%rd170+57]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.u8 %r2184, [%rd170+58]; + prmt.b32 %r2185, %r2184, %r2183, 28756; + ld.u8 %r2186, [%rd170+59]; + prmt.b32 %r2187, %r2186, %r2185, 1620; + ld.u8 %r2188, [%rd170+60]; + ld.u8 %r2189, [%rd170+61]; + prmt.b32 %r2190, %r2189, %r2188, 30212; + ld.u8 %r2191, [%rd170+62]; + prmt.b32 %r2192, %r2191, %r2190, 28756; + ld.u8 %r2193, [%rd170+63]; + prmt.b32 %r2194, %r2193, %r2192, 1620; + cvt.u32.u16 %r2195, %rs85; + add.s32 %r2196, %r3964, %r2089; + add.s32 %r2197, %r2196, %r3960; + xor.b32 %r2198, %r2197, %r36; + shf.l.wrap.b32 %r2199, %r2198, %r2198, 16; + add.s32 %r2200, %r2199, 1779033703; + xor.b32 %r2201, %r2200, %r3960; + shf.l.wrap.b32 %r2202, %r2201, %r2201, 20; + add.s32 %r2203, %r2197, %r2096; + add.s32 %r2204, %r2203, %r2202; + xor.b32 %r2205, %r2204, %r2199; + shf.l.wrap.b32 %r2206, %r2205, %r2205, 24; + add.s32 %r2207, %r2206, %r2200; + xor.b32 %r2208, %r2207, %r2202; + shf.l.wrap.b32 %r2209, %r2208, %r2208, 25; + add.s32 %r2210, %r3963, %r2103; + add.s32 %r2211, %r2210, %r3959; + xor.b32 %r2212, %r2211, %r37; + shf.l.wrap.b32 %r2213, %r2212, %r2212, 16; + add.s32 %r2214, %r2213, -1150833019; + xor.b32 %r2215, %r2214, %r3959; + shf.l.wrap.b32 %r2216, %r2215, %r2215, 20; + add.s32 %r2217, %r2211, %r2110; + add.s32 %r2218, %r2217, %r2216; + xor.b32 %r2219, %r2218, %r2213; + shf.l.wrap.b32 %r2220, %r2219, %r2219, 24; + add.s32 %r2221, %r2220, %r2214; + xor.b32 %r2222, %r2221, %r2216; + shf.l.wrap.b32 %r2223, %r2222, %r2222, 25; + add.s32 %r2224, %r3962, %r2117; + add.s32 %r2225, %r2224, %r3958; + shr.u32 %r2226, %r2225, 16; + shl.b32 %r2227, %r2225, 16; + xor.b32 %r2228, %r2227, 4194304; + or.b32 %r2229, %r2228, %r2226; + add.s32 %r2230, %r2229, 1013904242; + xor.b32 %r2231, %r2230, %r3958; + shf.l.wrap.b32 %r2232, %r2231, %r2231, 20; + add.s32 %r2233, %r2225, %r2124; + add.s32 %r2234, %r2233, %r2232; + xor.b32 %r2235, %r2234, %r2229; + shf.l.wrap.b32 %r2236, %r2235, %r2235, 24; + add.s32 %r2237, %r2236, %r2230; + xor.b32 %r2238, %r2237, %r2232; + shf.l.wrap.b32 %r2239, %r2238, %r2238, 25; + add.s32 %r2240, %r3961, %r2131; + add.s32 %r2241, %r2240, %r3957; + xor.b32 %r2242, %r2241, %r2195; + shr.u32 %r2243, %r2241, 16; + shl.b32 %r2244, %r2242, 16; + or.b32 %r2245, %r2244, %r2243; + add.s32 %r2246, %r2245, -1521486534; + xor.b32 %r2247, %r2246, %r3957; + shf.l.wrap.b32 %r2248, %r2247, %r2247, 20; + add.s32 %r2249, %r2241, %r2138; + add.s32 %r2250, %r2249, %r2248; + xor.b32 %r2251, %r2250, %r2245; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 24; + add.s32 %r2253, %r2252, %r2246; + xor.b32 %r2254, %r2253, %r2248; + shf.l.wrap.b32 %r2255, %r2254, %r2254, 25; + add.s32 %r2256, %r2204, %r2145; + add.s32 %r2257, %r2256, %r2223; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 16; + add.s32 %r2260, %r2259, %r2237; + xor.b32 %r2261, %r2260, %r2223; + shf.l.wrap.b32 %r2262, %r2261, %r2261, 20; + add.s32 %r2263, %r2257, 
%r2152; + add.s32 %r2264, %r2263, %r2262; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 24; + add.s32 %r2267, %r2266, %r2260; + xor.b32 %r2268, %r2267, %r2262; + shf.l.wrap.b32 %r2269, %r2268, %r2268, 25; + add.s32 %r2270, %r2218, %r2159; + add.s32 %r2271, %r2270, %r2239; + xor.b32 %r2272, %r2271, %r2206; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 16; + add.s32 %r2274, %r2273, %r2253; + xor.b32 %r2275, %r2274, %r2239; + shf.l.wrap.b32 %r2276, %r2275, %r2275, 20; + add.s32 %r2277, %r2271, %r2166; + add.s32 %r2278, %r2277, %r2276; + xor.b32 %r2279, %r2278, %r2273; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 24; + add.s32 %r2281, %r2280, %r2274; + xor.b32 %r2282, %r2281, %r2276; + shf.l.wrap.b32 %r2283, %r2282, %r2282, 25; + add.s32 %r2284, %r2234, %r2173; + add.s32 %r2285, %r2284, %r2255; + xor.b32 %r2286, %r2285, %r2220; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 16; + add.s32 %r2288, %r2287, %r2207; + xor.b32 %r2289, %r2288, %r2255; + shf.l.wrap.b32 %r2290, %r2289, %r2289, 20; + add.s32 %r2291, %r2285, %r2180; + add.s32 %r2292, %r2291, %r2290; + xor.b32 %r2293, %r2292, %r2287; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 24; + add.s32 %r2295, %r2294, %r2288; + xor.b32 %r2296, %r2295, %r2290; + shf.l.wrap.b32 %r2297, %r2296, %r2296, 25; + add.s32 %r2298, %r2250, %r2187; + add.s32 %r2299, %r2298, %r2209; + xor.b32 %r2300, %r2299, %r2236; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 16; + add.s32 %r2302, %r2301, %r2221; + xor.b32 %r2303, %r2302, %r2209; + shf.l.wrap.b32 %r2304, %r2303, %r2303, 20; + add.s32 %r2305, %r2299, %r2194; + add.s32 %r2306, %r2305, %r2304; + xor.b32 %r2307, %r2306, %r2301; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 24; + add.s32 %r2309, %r2308, %r2302; + xor.b32 %r2310, %r2309, %r2304; + shf.l.wrap.b32 %r2311, %r2310, %r2310, 25; + add.s32 %r2312, %r2264, %r2103; + add.s32 %r2313, %r2312, %r2311; + xor.b32 %r2314, %r2313, %r2280; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 16; + add.s32 %r2316, %r2315, %r2295; + xor.b32 %r2317, %r2316, %r2311; + shf.l.wrap.b32 %r2318, %r2317, %r2317, 20; + add.s32 %r2319, %r2313, %r2131; + add.s32 %r2320, %r2319, %r2318; + xor.b32 %r2321, %r2320, %r2315; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 24; + add.s32 %r2323, %r2322, %r2316; + xor.b32 %r2324, %r2323, %r2318; + shf.l.wrap.b32 %r2325, %r2324, %r2324, 25; + add.s32 %r2326, %r2278, %r2110; + add.s32 %r2327, %r2326, %r2269; + xor.b32 %r2328, %r2327, %r2294; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 16; + add.s32 %r2330, %r2329, %r2309; + xor.b32 %r2331, %r2330, %r2269; + shf.l.wrap.b32 %r2332, %r2331, %r2331, 20; + add.s32 %r2333, %r2327, %r2159; + add.s32 %r2334, %r2333, %r2332; + xor.b32 %r2335, %r2334, %r2329; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 24; + add.s32 %r2337, %r2336, %r2330; + xor.b32 %r2338, %r2337, %r2332; + shf.l.wrap.b32 %r2339, %r2338, %r2338, 25; + add.s32 %r2340, %r2292, %r2138; + add.s32 %r2341, %r2340, %r2283; + xor.b32 %r2342, %r2341, %r2308; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 16; + add.s32 %r2344, %r2343, %r2267; + xor.b32 %r2345, %r2344, %r2283; + shf.l.wrap.b32 %r2346, %r2345, %r2345, 20; + add.s32 %r2347, %r2341, %r2089; + add.s32 %r2348, %r2347, %r2346; + xor.b32 %r2349, %r2348, %r2343; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 24; + add.s32 %r2351, %r2350, %r2344; + xor.b32 %r2352, %r2351, %r2346; + shf.l.wrap.b32 %r2353, %r2352, %r2352, 25; + add.s32 %r2354, %r2306, %r2117; + add.s32 %r2355, %r2354, %r2297; + xor.b32 %r2356, %r2355, %r2266; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 16; + add.s32 %r2358, %r2357, %r2281; + xor.b32 %r2359, 
%r2358, %r2297; + shf.l.wrap.b32 %r2360, %r2359, %r2359, 20; + add.s32 %r2361, %r2355, %r2180; + add.s32 %r2362, %r2361, %r2360; + xor.b32 %r2363, %r2362, %r2357; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 24; + add.s32 %r2365, %r2364, %r2358; + xor.b32 %r2366, %r2365, %r2360; + shf.l.wrap.b32 %r2367, %r2366, %r2366, 25; + add.s32 %r2368, %r2320, %r2096; + add.s32 %r2369, %r2368, %r2339; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 16; + add.s32 %r2372, %r2371, %r2351; + xor.b32 %r2373, %r2372, %r2339; + shf.l.wrap.b32 %r2374, %r2373, %r2373, 20; + add.s32 %r2375, %r2369, %r2166; + add.s32 %r2376, %r2375, %r2374; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 24; + add.s32 %r2379, %r2378, %r2372; + xor.b32 %r2380, %r2379, %r2374; + shf.l.wrap.b32 %r2381, %r2380, %r2380, 25; + add.s32 %r2382, %r2334, %r2173; + add.s32 %r2383, %r2382, %r2353; + xor.b32 %r2384, %r2383, %r2322; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 16; + add.s32 %r2386, %r2385, %r2365; + xor.b32 %r2387, %r2386, %r2353; + shf.l.wrap.b32 %r2388, %r2387, %r2387, 20; + add.s32 %r2389, %r2383, %r2124; + add.s32 %r2390, %r2389, %r2388; + xor.b32 %r2391, %r2390, %r2385; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 24; + add.s32 %r2393, %r2392, %r2386; + xor.b32 %r2394, %r2393, %r2388; + shf.l.wrap.b32 %r2395, %r2394, %r2394, 25; + add.s32 %r2396, %r2348, %r2152; + add.s32 %r2397, %r2396, %r2367; + xor.b32 %r2398, %r2397, %r2336; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 16; + add.s32 %r2400, %r2399, %r2323; + xor.b32 %r2401, %r2400, %r2367; + shf.l.wrap.b32 %r2402, %r2401, %r2401, 20; + add.s32 %r2403, %r2397, %r2187; + add.s32 %r2404, %r2403, %r2402; + xor.b32 %r2405, %r2404, %r2399; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 24; + add.s32 %r2407, %r2406, %r2400; + xor.b32 %r2408, %r2407, %r2402; + shf.l.wrap.b32 %r2409, %r2408, %r2408, 25; + add.s32 %r2410, %r2362, %r2194; + add.s32 %r2411, %r2410, %r2325; + xor.b32 %r2412, %r2411, %r2350; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 16; + add.s32 %r2414, %r2413, %r2337; + xor.b32 %r2415, %r2414, %r2325; + shf.l.wrap.b32 %r2416, %r2415, %r2415, 20; + add.s32 %r2417, %r2411, %r2145; + add.s32 %r2418, %r2417, %r2416; + xor.b32 %r2419, %r2418, %r2413; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 24; + add.s32 %r2421, %r2420, %r2414; + xor.b32 %r2422, %r2421, %r2416; + shf.l.wrap.b32 %r2423, %r2422, %r2422, 25; + add.s32 %r2424, %r2376, %r2110; + add.s32 %r2425, %r2424, %r2423; + xor.b32 %r2426, %r2425, %r2392; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 16; + add.s32 %r2428, %r2427, %r2407; + xor.b32 %r2429, %r2428, %r2423; + shf.l.wrap.b32 %r2430, %r2429, %r2429, 20; + add.s32 %r2431, %r2425, %r2117; + add.s32 %r2432, %r2431, %r2430; + xor.b32 %r2433, %r2432, %r2427; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 24; + add.s32 %r2435, %r2434, %r2428; + xor.b32 %r2436, %r2435, %r2430; + shf.l.wrap.b32 %r2437, %r2436, %r2436, 25; + add.s32 %r2438, %r2390, %r2159; + add.s32 %r2439, %r2438, %r2381; + xor.b32 %r2440, %r2439, %r2406; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 16; + add.s32 %r2442, %r2441, %r2421; + xor.b32 %r2443, %r2442, %r2381; + shf.l.wrap.b32 %r2444, %r2443, %r2443, 20; + add.s32 %r2445, %r2439, %r2173; + add.s32 %r2446, %r2445, %r2444; + xor.b32 %r2447, %r2446, %r2441; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 24; + add.s32 %r2449, %r2448, %r2442; + xor.b32 %r2450, %r2449, %r2444; + shf.l.wrap.b32 %r2451, %r2450, %r2450, 25; + add.s32 %r2452, %r2404, %r2180; + add.s32 %r2453, %r2452, %r2395; + xor.b32 %r2454, %r2453, %r2420; + 
shf.l.wrap.b32 %r2455, %r2454, %r2454, 16; + add.s32 %r2456, %r2455, %r2379; + xor.b32 %r2457, %r2456, %r2395; + shf.l.wrap.b32 %r2458, %r2457, %r2457, 20; + add.s32 %r2459, %r2453, %r2103; + add.s32 %r2460, %r2459, %r2458; + xor.b32 %r2461, %r2460, %r2455; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 24; + add.s32 %r2463, %r2462, %r2456; + xor.b32 %r2464, %r2463, %r2458; + shf.l.wrap.b32 %r2465, %r2464, %r2464, 25; + add.s32 %r2466, %r2418, %r2138; + add.s32 %r2467, %r2466, %r2409; + xor.b32 %r2468, %r2467, %r2378; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 16; + add.s32 %r2470, %r2469, %r2393; + xor.b32 %r2471, %r2470, %r2409; + shf.l.wrap.b32 %r2472, %r2471, %r2471, 20; + add.s32 %r2473, %r2467, %r2187; + add.s32 %r2474, %r2473, %r2472; + xor.b32 %r2475, %r2474, %r2469; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 24; + add.s32 %r2477, %r2476, %r2470; + xor.b32 %r2478, %r2477, %r2472; + shf.l.wrap.b32 %r2479, %r2478, %r2478, 25; + add.s32 %r2480, %r2432, %r2131; + add.s32 %r2481, %r2480, %r2451; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 16; + add.s32 %r2484, %r2483, %r2463; + xor.b32 %r2485, %r2484, %r2451; + shf.l.wrap.b32 %r2486, %r2485, %r2485, 20; + add.s32 %r2487, %r2481, %r2124; + add.s32 %r2488, %r2487, %r2486; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 24; + add.s32 %r2491, %r2490, %r2484; + xor.b32 %r2492, %r2491, %r2486; + shf.l.wrap.b32 %r2493, %r2492, %r2492, 25; + add.s32 %r2494, %r2446, %r2152; + add.s32 %r2495, %r2494, %r2465; + xor.b32 %r2496, %r2495, %r2434; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 16; + add.s32 %r2498, %r2497, %r2477; + xor.b32 %r2499, %r2498, %r2465; + shf.l.wrap.b32 %r2500, %r2499, %r2499, 20; + add.s32 %r2501, %r2495, %r2089; + add.s32 %r2502, %r2501, %r2500; + xor.b32 %r2503, %r2502, %r2497; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 24; + add.s32 %r2505, %r2504, %r2498; + xor.b32 %r2506, %r2505, %r2500; + shf.l.wrap.b32 %r2507, %r2506, %r2506, 25; + add.s32 %r2508, %r2460, %r2166; + add.s32 %r2509, %r2508, %r2479; + xor.b32 %r2510, %r2509, %r2448; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 16; + add.s32 %r2512, %r2511, %r2435; + xor.b32 %r2513, %r2512, %r2479; + shf.l.wrap.b32 %r2514, %r2513, %r2513, 20; + add.s32 %r2515, %r2509, %r2194; + add.s32 %r2516, %r2515, %r2514; + xor.b32 %r2517, %r2516, %r2511; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 24; + add.s32 %r2519, %r2518, %r2512; + xor.b32 %r2520, %r2519, %r2514; + shf.l.wrap.b32 %r2521, %r2520, %r2520, 25; + add.s32 %r2522, %r2474, %r2145; + add.s32 %r2523, %r2522, %r2437; + xor.b32 %r2524, %r2523, %r2462; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 16; + add.s32 %r2526, %r2525, %r2449; + xor.b32 %r2527, %r2526, %r2437; + shf.l.wrap.b32 %r2528, %r2527, %r2527, 20; + add.s32 %r2529, %r2523, %r2096; + add.s32 %r2530, %r2529, %r2528; + xor.b32 %r2531, %r2530, %r2525; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 24; + add.s32 %r2533, %r2532, %r2526; + xor.b32 %r2534, %r2533, %r2528; + shf.l.wrap.b32 %r2535, %r2534, %r2534, 25; + add.s32 %r2536, %r2488, %r2159; + add.s32 %r2537, %r2536, %r2535; + xor.b32 %r2538, %r2537, %r2504; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 16; + add.s32 %r2540, %r2539, %r2519; + xor.b32 %r2541, %r2540, %r2535; + shf.l.wrap.b32 %r2542, %r2541, %r2541, 20; + add.s32 %r2543, %r2537, %r2138; + add.s32 %r2544, %r2543, %r2542; + xor.b32 %r2545, %r2544, %r2539; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 24; + add.s32 %r2547, %r2546, %r2540; + xor.b32 %r2548, %r2547, %r2542; + shf.l.wrap.b32 %r2549, %r2548, %r2548, 25; + add.s32 %r2550, 
%r2502, %r2173; + add.s32 %r2551, %r2550, %r2493; + xor.b32 %r2552, %r2551, %r2518; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 16; + add.s32 %r2554, %r2553, %r2533; + xor.b32 %r2555, %r2554, %r2493; + shf.l.wrap.b32 %r2556, %r2555, %r2555, 20; + add.s32 %r2557, %r2551, %r2152; + add.s32 %r2558, %r2557, %r2556; + xor.b32 %r2559, %r2558, %r2553; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 24; + add.s32 %r2561, %r2560, %r2554; + xor.b32 %r2562, %r2561, %r2556; + shf.l.wrap.b32 %r2563, %r2562, %r2562, 25; + add.s32 %r2564, %r2516, %r2187; + add.s32 %r2565, %r2564, %r2507; + xor.b32 %r2566, %r2565, %r2532; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 16; + add.s32 %r2568, %r2567, %r2491; + xor.b32 %r2569, %r2568, %r2507; + shf.l.wrap.b32 %r2570, %r2569, %r2569, 20; + add.s32 %r2571, %r2565, %r2110; + add.s32 %r2572, %r2571, %r2570; + xor.b32 %r2573, %r2572, %r2567; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 24; + add.s32 %r2575, %r2574, %r2568; + xor.b32 %r2576, %r2575, %r2570; + shf.l.wrap.b32 %r2577, %r2576, %r2576, 25; + add.s32 %r2578, %r2530, %r2180; + add.s32 %r2579, %r2578, %r2521; + xor.b32 %r2580, %r2579, %r2490; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 16; + add.s32 %r2582, %r2581, %r2505; + xor.b32 %r2583, %r2582, %r2521; + shf.l.wrap.b32 %r2584, %r2583, %r2583, 20; + add.s32 %r2585, %r2579, %r2194; + add.s32 %r2586, %r2585, %r2584; + xor.b32 %r2587, %r2586, %r2581; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 24; + add.s32 %r2589, %r2588, %r2582; + xor.b32 %r2590, %r2589, %r2584; + shf.l.wrap.b32 %r2591, %r2590, %r2590, 25; + add.s32 %r2592, %r2544, %r2117; + add.s32 %r2593, %r2592, %r2563; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 16; + add.s32 %r2596, %r2595, %r2575; + xor.b32 %r2597, %r2596, %r2563; + shf.l.wrap.b32 %r2598, %r2597, %r2597, 20; + add.s32 %r2599, %r2593, %r2089; + add.s32 %r2600, %r2599, %r2598; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 24; + add.s32 %r2603, %r2602, %r2596; + xor.b32 %r2604, %r2603, %r2598; + shf.l.wrap.b32 %r2605, %r2604, %r2604, 25; + add.s32 %r2606, %r2558, %r2166; + add.s32 %r2607, %r2606, %r2577; + xor.b32 %r2608, %r2607, %r2546; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 16; + add.s32 %r2610, %r2609, %r2589; + xor.b32 %r2611, %r2610, %r2577; + shf.l.wrap.b32 %r2612, %r2611, %r2611, 20; + add.s32 %r2613, %r2607, %r2103; + add.s32 %r2614, %r2613, %r2612; + xor.b32 %r2615, %r2614, %r2609; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 24; + add.s32 %r2617, %r2616, %r2610; + xor.b32 %r2618, %r2617, %r2612; + shf.l.wrap.b32 %r2619, %r2618, %r2618, 25; + add.s32 %r2620, %r2572, %r2124; + add.s32 %r2621, %r2620, %r2591; + xor.b32 %r2622, %r2621, %r2560; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 16; + add.s32 %r2624, %r2623, %r2547; + xor.b32 %r2625, %r2624, %r2591; + shf.l.wrap.b32 %r2626, %r2625, %r2625, 20; + add.s32 %r2627, %r2621, %r2145; + add.s32 %r2628, %r2627, %r2626; + xor.b32 %r2629, %r2628, %r2623; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 24; + add.s32 %r2631, %r2630, %r2624; + xor.b32 %r2632, %r2631, %r2626; + shf.l.wrap.b32 %r2633, %r2632, %r2632, 25; + add.s32 %r2634, %r2586, %r2096; + add.s32 %r2635, %r2634, %r2549; + xor.b32 %r2636, %r2635, %r2574; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 16; + add.s32 %r2638, %r2637, %r2561; + xor.b32 %r2639, %r2638, %r2549; + shf.l.wrap.b32 %r2640, %r2639, %r2639, 20; + add.s32 %r2641, %r2635, %r2131; + add.s32 %r2642, %r2641, %r2640; + xor.b32 %r2643, %r2642, %r2637; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 24; + add.s32 %r2645, %r2644, %r2638; + xor.b32 
%r2646, %r2645, %r2640; + shf.l.wrap.b32 %r2647, %r2646, %r2646, 25; + add.s32 %r2648, %r2600, %r2173; + add.s32 %r2649, %r2648, %r2647; + xor.b32 %r2650, %r2649, %r2616; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 16; + add.s32 %r2652, %r2651, %r2631; + xor.b32 %r2653, %r2652, %r2647; + shf.l.wrap.b32 %r2654, %r2653, %r2653, 20; + add.s32 %r2655, %r2649, %r2180; + add.s32 %r2656, %r2655, %r2654; + xor.b32 %r2657, %r2656, %r2651; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 24; + add.s32 %r2659, %r2658, %r2652; + xor.b32 %r2660, %r2659, %r2654; + shf.l.wrap.b32 %r2661, %r2660, %r2660, 25; + add.s32 %r2662, %r2614, %r2152; + add.s32 %r2663, %r2662, %r2605; + xor.b32 %r2664, %r2663, %r2630; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 16; + add.s32 %r2666, %r2665, %r2645; + xor.b32 %r2667, %r2666, %r2605; + shf.l.wrap.b32 %r2668, %r2667, %r2667, 20; + add.s32 %r2669, %r2663, %r2166; + add.s32 %r2670, %r2669, %r2668; + xor.b32 %r2671, %r2670, %r2665; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 24; + add.s32 %r2673, %r2672, %r2666; + xor.b32 %r2674, %r2673, %r2668; + shf.l.wrap.b32 %r2675, %r2674, %r2674, 25; + add.s32 %r2676, %r2628, %r2194; + add.s32 %r2677, %r2676, %r2619; + xor.b32 %r2678, %r2677, %r2644; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 16; + add.s32 %r2680, %r2679, %r2603; + xor.b32 %r2681, %r2680, %r2619; + shf.l.wrap.b32 %r2682, %r2681, %r2681, 20; + add.s32 %r2683, %r2677, %r2159; + add.s32 %r2684, %r2683, %r2682; + xor.b32 %r2685, %r2684, %r2679; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 24; + add.s32 %r2687, %r2686, %r2680; + xor.b32 %r2688, %r2687, %r2682; + shf.l.wrap.b32 %r2689, %r2688, %r2688, 25; + add.s32 %r2690, %r2642, %r2187; + add.s32 %r2691, %r2690, %r2633; + xor.b32 %r2692, %r2691, %r2602; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 16; + add.s32 %r2694, %r2693, %r2617; + xor.b32 %r2695, %r2694, %r2633; + shf.l.wrap.b32 %r2696, %r2695, %r2695, 20; + add.s32 %r2697, %r2691, %r2145; + add.s32 %r2698, %r2697, %r2696; + xor.b32 %r2699, %r2698, %r2693; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 24; + add.s32 %r2701, %r2700, %r2694; + xor.b32 %r2702, %r2701, %r2696; + shf.l.wrap.b32 %r2703, %r2702, %r2702, 25; + add.s32 %r2704, %r2656, %r2138; + add.s32 %r2705, %r2704, %r2675; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 16; + add.s32 %r2708, %r2707, %r2687; + xor.b32 %r2709, %r2708, %r2675; + shf.l.wrap.b32 %r2710, %r2709, %r2709, 20; + add.s32 %r2711, %r2705, %r2103; + add.s32 %r2712, %r2711, %r2710; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 24; + add.s32 %r2715, %r2714, %r2708; + xor.b32 %r2716, %r2715, %r2710; + shf.l.wrap.b32 %r2717, %r2716, %r2716, 25; + add.s32 %r2718, %r2670, %r2124; + add.s32 %r2719, %r2718, %r2689; + xor.b32 %r2720, %r2719, %r2658; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 16; + add.s32 %r2722, %r2721, %r2701; + xor.b32 %r2723, %r2722, %r2689; + shf.l.wrap.b32 %r2724, %r2723, %r2723, 20; + add.s32 %r2725, %r2719, %r2110; + add.s32 %r2726, %r2725, %r2724; + xor.b32 %r2727, %r2726, %r2721; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 24; + add.s32 %r2729, %r2728, %r2722; + xor.b32 %r2730, %r2729, %r2724; + shf.l.wrap.b32 %r2731, %r2730, %r2730, 25; + add.s32 %r2732, %r2684, %r2089; + add.s32 %r2733, %r2732, %r2703; + xor.b32 %r2734, %r2733, %r2672; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 16; + add.s32 %r2736, %r2735, %r2659; + xor.b32 %r2737, %r2736, %r2703; + shf.l.wrap.b32 %r2738, %r2737, %r2737, 20; + add.s32 %r2739, %r2733, %r2096; + add.s32 %r2740, %r2739, %r2738; + xor.b32 %r2741, %r2740, %r2735; + 
shf.l.wrap.b32 %r2742, %r2741, %r2741, 24; + add.s32 %r2743, %r2742, %r2736; + xor.b32 %r2744, %r2743, %r2738; + shf.l.wrap.b32 %r2745, %r2744, %r2744, 25; + add.s32 %r2746, %r2698, %r2131; + add.s32 %r2747, %r2746, %r2661; + xor.b32 %r2748, %r2747, %r2686; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 16; + add.s32 %r2750, %r2749, %r2673; + xor.b32 %r2751, %r2750, %r2661; + shf.l.wrap.b32 %r2752, %r2751, %r2751, 20; + add.s32 %r2753, %r2747, %r2117; + add.s32 %r2754, %r2753, %r2752; + xor.b32 %r2755, %r2754, %r2749; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 24; + add.s32 %r2757, %r2756, %r2750; + xor.b32 %r2758, %r2757, %r2752; + shf.l.wrap.b32 %r2759, %r2758, %r2758, 25; + add.s32 %r2760, %r2712, %r2152; + add.s32 %r2761, %r2760, %r2759; + xor.b32 %r2762, %r2761, %r2728; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 16; + add.s32 %r2764, %r2763, %r2743; + xor.b32 %r2765, %r2764, %r2759; + shf.l.wrap.b32 %r2766, %r2765, %r2765, 20; + add.s32 %r2767, %r2761, %r2187; + add.s32 %r2768, %r2767, %r2766; + xor.b32 %r2769, %r2768, %r2763; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 24; + add.s32 %r2771, %r2770, %r2764; + xor.b32 %r2772, %r2771, %r2766; + shf.l.wrap.b32 %r2773, %r2772, %r2772, 25; + add.s32 %r2774, %r2726, %r2166; + add.s32 %r2775, %r2774, %r2717; + xor.b32 %r2776, %r2775, %r2742; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 16; + add.s32 %r2778, %r2777, %r2757; + xor.b32 %r2779, %r2778, %r2717; + shf.l.wrap.b32 %r2780, %r2779, %r2779, 20; + add.s32 %r2781, %r2775, %r2124; + add.s32 %r2782, %r2781, %r2780; + xor.b32 %r2783, %r2782, %r2777; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 24; + add.s32 %r2785, %r2784, %r2778; + xor.b32 %r2786, %r2785, %r2780; + shf.l.wrap.b32 %r2787, %r2786, %r2786, 25; + add.s32 %r2788, %r2740, %r2145; + add.s32 %r2789, %r2788, %r2731; + xor.b32 %r2790, %r2789, %r2756; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 16; + add.s32 %r2792, %r2791, %r2715; + xor.b32 %r2793, %r2792, %r2731; + shf.l.wrap.b32 %r2794, %r2793, %r2793, 20; + add.s32 %r2795, %r2789, %r2173; + add.s32 %r2796, %r2795, %r2794; + xor.b32 %r2797, %r2796, %r2791; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 24; + add.s32 %r2799, %r2798, %r2792; + xor.b32 %r2800, %r2799, %r2794; + shf.l.wrap.b32 %r2801, %r2800, %r2800, 25; + add.s32 %r2802, %r2754, %r2194; + add.s32 %r2803, %r2802, %r2745; + xor.b32 %r2804, %r2803, %r2714; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 16; + add.s32 %r2806, %r2805, %r2729; + xor.b32 %r2807, %r2806, %r2745; + shf.l.wrap.b32 %r2808, %r2807, %r2807, 20; + add.s32 %r2809, %r2803, %r2096; + add.s32 %r2810, %r2809, %r2808; + xor.b32 %r2811, %r2810, %r2805; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 24; + add.s32 %r2813, %r2812, %r2806; + xor.b32 %r2814, %r2813, %r2808; + shf.l.wrap.b32 %r2815, %r2814, %r2814, 25; + add.s32 %r2816, %r2768, %r2180; + add.s32 %r2817, %r2816, %r2787; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 16; + add.s32 %r2820, %r2819, %r2799; + xor.b32 %r2821, %r2820, %r2787; + shf.l.wrap.b32 %r2822, %r2821, %r2821, 20; + add.s32 %r2823, %r2817, %r2110; + add.s32 %r2824, %r2823, %r2822; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 24; + add.s32 %r2827, %r2826, %r2820; + xor.b32 %r2828, %r2827, %r2822; + shf.l.wrap.b32 %r2829, %r2828, %r2828, 25; + add.s32 %r2830, %r2782, %r2089; + add.s32 %r2831, %r2830, %r2801; + xor.b32 %r2832, %r2831, %r2770; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 16; + add.s32 %r2834, %r2833, %r2813; + xor.b32 %r2835, %r2834, %r2801; + shf.l.wrap.b32 %r2836, %r2835, %r2835, 20; + add.s32 %r2837, 
%r2831, %r2159; + add.s32 %r2838, %r2837, %r2836; + xor.b32 %r2839, %r2838, %r2833; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 24; + add.s32 %r2841, %r2840, %r2834; + xor.b32 %r2842, %r2841, %r2836; + shf.l.wrap.b32 %r2843, %r2842, %r2842, 25; + add.s32 %r2844, %r2796, %r2103; + add.s32 %r2845, %r2844, %r2815; + xor.b32 %r2846, %r2845, %r2784; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 16; + add.s32 %r2848, %r2847, %r2771; + xor.b32 %r2849, %r2848, %r2815; + shf.l.wrap.b32 %r2850, %r2849, %r2849, 20; + add.s32 %r2851, %r2845, %r2131; + add.s32 %r2852, %r2851, %r2850; + xor.b32 %r2853, %r2852, %r2847; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 24; + add.s32 %r2855, %r2854, %r2848; + xor.b32 %r2856, %r2855, %r2850; + shf.l.wrap.b32 %r2857, %r2856, %r2856, 25; + add.s32 %r2858, %r2810, %r2117; + add.s32 %r2859, %r2858, %r2773; + xor.b32 %r2860, %r2859, %r2798; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 16; + add.s32 %r2862, %r2861, %r2785; + xor.b32 %r2863, %r2862, %r2773; + shf.l.wrap.b32 %r2864, %r2863, %r2863, 20; + add.s32 %r2865, %r2859, %r2138; + add.s32 %r2866, %r2865, %r2864; + xor.b32 %r2867, %r2866, %r2861; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 24; + add.s32 %r2869, %r2868, %r2862; + xor.b32 %r2870, %r2869, %r2864; + shf.l.wrap.b32 %r2871, %r2870, %r2870, 25; + add.s32 %r2872, %r2824, %r2166; + add.s32 %r2873, %r2872, %r2871; + xor.b32 %r2874, %r2873, %r2840; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 16; + add.s32 %r2876, %r2875, %r2855; + xor.b32 %r2877, %r2876, %r2871; + shf.l.wrap.b32 %r2878, %r2877, %r2877, 20; + add.s32 %r2879, %r2873, %r2194; + add.s32 %r2880, %r2879, %r2878; + xor.b32 %r2881, %r2880, %r2875; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 24; + add.s32 %r2883, %r2882, %r2876; + xor.b32 %r2884, %r2883, %r2878; + shf.l.wrap.b32 %r2885, %r2884, %r2884, 25; + add.s32 %r2886, %r2838, %r2124; + add.s32 %r2887, %r2886, %r2829; + xor.b32 %r2888, %r2887, %r2854; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 16; + add.s32 %r2890, %r2889, %r2869; + xor.b32 %r2891, %r2890, %r2829; + shf.l.wrap.b32 %r2892, %r2891, %r2891, 20; + add.s32 %r2893, %r2887, %r2089; + add.s32 %r2894, %r2893, %r2892; + xor.b32 %r2895, %r2894, %r2889; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 24; + add.s32 %r2897, %r2896, %r2890; + xor.b32 %r2898, %r2897, %r2892; + shf.l.wrap.b32 %r2899, %r2898, %r2898, 25; + add.s32 %r2900, %r2852, %r2096; + add.s32 %r2901, %r2900, %r2843; + xor.b32 %r2902, %r2901, %r2868; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 16; + add.s32 %r2904, %r2903, %r2827; + xor.b32 %r2905, %r2904, %r2843; + shf.l.wrap.b32 %r2906, %r2905, %r2905, 20; + add.s32 %r2907, %r2901, %r2152; + add.s32 %r2908, %r2907, %r2906; + xor.b32 %r2909, %r2908, %r2903; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 24; + add.s32 %r2911, %r2910, %r2904; + xor.b32 %r2912, %r2911, %r2906; + shf.l.wrap.b32 %r2913, %r2912, %r2912, 25; + add.s32 %r2914, %r2866, %r2145; + add.s32 %r2915, %r2914, %r2857; + xor.b32 %r2916, %r2915, %r2826; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 16; + add.s32 %r2918, %r2917, %r2841; + xor.b32 %r2919, %r2918, %r2857; + shf.l.wrap.b32 %r2920, %r2919, %r2919, 20; + add.s32 %r2921, %r2915, %r2131; + add.s32 %r2922, %r2921, %r2920; + xor.b32 %r2923, %r2922, %r2917; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 24; + add.s32 %r2925, %r2924, %r2918; + xor.b32 %r2926, %r2925, %r2920; + shf.l.wrap.b32 %r2927, %r2926, %r2926, 25; + add.s32 %r2928, %r2880, %r2187; + add.s32 %r2929, %r2928, %r2899; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 16; + add.s32 %r2932, %r2931, %r2911; + xor.b32 
%r2933, %r2932, %r2899; + shf.l.wrap.b32 %r2934, %r2933, %r2933, 20; + add.s32 %r2935, %r2929, %r2159; + add.s32 %r2936, %r2935, %r2934; + xor.b32 %r2937, %r2936, %r2931; + shf.l.wrap.b32 %r2938, %r2937, %r2937, 24; + add.s32 %r2939, %r2938, %r2932; + xor.b32 %r2940, %r2939, %r2934; + shf.l.wrap.b32 %r2941, %r2940, %r2940, 25; + add.s32 %r2942, %r2894, %r2103; + add.s32 %r2943, %r2942, %r2913; + xor.b32 %r2944, %r2943, %r2882; + shf.l.wrap.b32 %r2945, %r2944, %r2944, 16; + add.s32 %r2946, %r2945, %r2925; + xor.b32 %r2947, %r2946, %r2913; + shf.l.wrap.b32 %r2948, %r2947, %r2947, 20; + add.s32 %r2949, %r2943, %r2173; + add.s32 %r2950, %r2949, %r2948; + xor.b32 %r2951, %r2950, %r2945; + shf.l.wrap.b32 %r2952, %r2951, %r2951, 24; + add.s32 %r2953, %r2952, %r2946; + xor.b32 %r2954, %r2953, %r2948; + shf.l.wrap.b32 %r2955, %r2954, %r2954, 25; + add.s32 %r2956, %r2908, %r2110; + add.s32 %r2957, %r2956, %r2927; + xor.b32 %r2958, %r2957, %r2896; + shf.l.wrap.b32 %r2959, %r2958, %r2958, 16; + add.s32 %r2960, %r2959, %r2883; + xor.b32 %r2961, %r2960, %r2927; + shf.l.wrap.b32 %r2962, %r2961, %r2961, 20; + add.s32 %r2963, %r2957, %r2117; + add.s32 %r2964, %r2963, %r2962; + xor.b32 %r2965, %r2964, %r2959; + shf.l.wrap.b32 %r2966, %r2965, %r2965, 24; + add.s32 %r2967, %r2966, %r2960; + xor.b32 %r2968, %r2967, %r2962; + shf.l.wrap.b32 %r2969, %r2968, %r2968, 25; + add.s32 %r2970, %r2922, %r2138; + add.s32 %r2971, %r2970, %r2885; + xor.b32 %r2972, %r2971, %r2910; + shf.l.wrap.b32 %r2973, %r2972, %r2972, 16; + add.s32 %r2974, %r2973, %r2897; + xor.b32 %r2975, %r2974, %r2885; + shf.l.wrap.b32 %r2976, %r2975, %r2975, 20; + add.s32 %r2977, %r2971, %r2180; + add.s32 %r2978, %r2977, %r2976; + xor.b32 %r2979, %r2978, %r2973; + shf.l.wrap.b32 %r2980, %r2979, %r2979, 24; + add.s32 %r2981, %r2980, %r2974; + xor.b32 %r2982, %r2981, %r2976; + shf.l.wrap.b32 %r2983, %r2982, %r2982, 25; + xor.b32 %r3964, %r2967, %r2936; + xor.b32 %r3963, %r2981, %r2950; + xor.b32 %r3962, %r2939, %r2964; + xor.b32 %r3961, %r2953, %r2978; + xor.b32 %r3960, %r2983, %r2952; + xor.b32 %r3959, %r2941, %r2966; + xor.b32 %r3958, %r2955, %r2980; + xor.b32 %r3957, %r2969, %r2938; + add.s16 %rs198, %rs198, 1; + st.local.u8 [%rd56+1], %rs198; + add.s64 %rd170, %rd170, 64; + add.s64 %rd171, %rd171, -64; + setp.gt.u64 %p24, %rd171, 64; + @%p24 bra $L__BB0_24; + +$L__BB0_25: + min.u64 %rd63, %rd171, 64; + setp.eq.s64 %p25, %rd63, 0; + mov.u16 %rs200, %rs199; + mov.u16 %rs201, %rs199; + mov.u16 %rs202, %rs199; + mov.u16 %rs203, %rs199; + mov.u16 %rs204, %rs199; + mov.u16 %rs205, %rs199; + mov.u16 %rs206, %rs199; + mov.u16 %rs207, %rs199; + mov.u16 %rs208, %rs199; + mov.u16 %rs209, %rs199; + mov.u16 %rs210, %rs199; + mov.u16 %rs211, %rs199; + mov.u16 %rs212, %rs199; + mov.u16 %rs213, %rs199; + mov.u16 %rs214, %rs199; + mov.u16 %rs215, %rs199; + mov.u16 %rs216, %rs199; + mov.u16 %rs217, %rs199; + mov.u16 %rs218, %rs199; + mov.u16 %rs219, %rs199; + mov.u16 %rs220, %rs199; + mov.u16 %rs221, %rs199; + mov.u16 %rs222, %rs199; + mov.u16 %rs223, %rs199; + mov.u16 %rs224, %rs199; + mov.u16 %rs225, %rs199; + mov.u16 %rs226, %rs199; + mov.u16 %rs227, %rs199; + mov.u16 %rs228, %rs199; + mov.u16 %rs229, %rs199; + mov.u16 %rs230, %rs199; + mov.u16 %rs231, %rs199; + mov.u16 %rs232, %rs199; + @%p25 bra $L__BB0_29; + + mov.u64 %rd172, 0; + +$L__BB0_27: + add.s64 %rd131, %rd170, %rd172; + ld.u8 %rs121, [%rd131]; + add.s64 %rd132, %rd53, %rd172; + st.local.u8 [%rd132], %rs121; + add.s64 %rd172, %rd172, 1; + setp.lt.u64 %p26, %rd172, %rd63; + @%p26 bra $L__BB0_27; + + 
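// ----------------------------------------------------------------------
// Note on the generated code above and below (names follow the public
// BLAKE3 reference, not identifiers from this file): the long
// add.s32 / xor.b32 / shf.l.wrap.b32 runs are fully unrolled rounds of
// the BLAKE3 compression function. shf.l.wrap.b32 by 16, 20, 24 and 25
// is a left rotate, i.e. BLAKE3's right rotates by 16, 12, 8 and 7.
// A minimal C sketch of one quarter-round (g), assuming the reference
// formulation:
//
//   #include <stdint.h>
//   static inline uint32_t rotr32(uint32_t w, uint32_t c) {
//       return (w >> c) | (w << (32 - c));
//   }
//   // s: 16-word state; a,b,c,d: lane indices; mx,my: message words
//   static void g(uint32_t s[16], int a, int b, int c, int d,
//                 uint32_t mx, uint32_t my) {
//       s[a] = s[a] + s[b] + mx;  s[d] = rotr32(s[d] ^ s[a], 16);
//       s[c] = s[c] + s[d];       s[b] = rotr32(s[b] ^ s[c], 12);
//       s[a] = s[a] + s[b] + my;  s[d] = rotr32(s[d] ^ s[a], 8);
//       s[c] = s[c] + s[d];       s[b] = rotr32(s[b] ^ s[c], 7);
//   }
//
// The eight xor.b32 results in %r3957..%r3964 are the feed-forward that
// folds the second half of the 16-word state into the first (the new
// chaining value); the add.s16/st.local.u8 pair bumps the per-chunk
// block counter; and $L__BB0_25/$L__BB0_27 copy the final partial block
// (at most 64 bytes) into a local buffer. In $L__BB0_29 below, the
// prmt.b32 chains with selectors 30212/28756/1620 (0x7604/0x7054/0x0654)
// are the usual nvcc idiom for packing four bytes into a little-endian
// u32, the `or.b16 ..., 2` sets what appears to be BLAKE3's CHUNK_END
// flag (with CHUNK_START = 1 selected when the block counter is zero),
// and the decimal constants 1779033703, -1150833019, 1013904242 and
// -1521486534 are the IV words 0x6A09E667, 0xBB67AE85, 0x3C6EF372 and
// 0xA54FF53A.
// ----------------------------------------------------------------------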
ld.local.v4.u16 {%rs229, %rs230, %rs231, %rs232}, [%rd53]; + ld.local.v4.u16 {%rs225, %rs226, %rs227, %rs228}, [%rd53+8]; + ld.local.v4.u16 {%rs221, %rs222, %rs223, %rs224}, [%rd53+16]; + ld.local.v4.u16 {%rs217, %rs218, %rs219, %rs220}, [%rd53+24]; + ld.local.v4.u16 {%rs213, %rs214, %rs215, %rs216}, [%rd53+32]; + ld.local.v4.u16 {%rs209, %rs210, %rs211, %rs212}, [%rd53+40]; + ld.local.v4.u16 {%rs205, %rs206, %rs207, %rs208}, [%rd53+48]; + ld.local.v4.u16 {%rs202, %rs203, %rs204, %rs153}, [%rd53+56]; + ld.local.u8 %rs201, [%rd53+61]; + ld.local.v2.u8 {%rs199, %rs200}, [%rd53+62]; + +$L__BB0_29: + ld.param.u64 %rd138, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd137, %rd138; + ld.local.v4.u8 {%rs156, %rs157, %rs158, %rs159}, [%rd53+64]; + cvt.u16.u64 %rs162, %rd63; + add.s16 %rs163, %rs156, %rs162; + st.local.u8 [%rd53+64], %rs163; + setp.eq.s16 %p27, %rs157, 0; + selp.u16 %rs164, 1, 0, %p27; + or.b16 %rs165, %rs158, %rs164; + or.b16 %rs166, %rs165, 2; + shr.u16 %rs167, %rs229, 8; + shr.u16 %rs168, %rs230, 8; + shr.u16 %rs169, %rs231, 8; + shr.u16 %rs170, %rs232, 8; + shr.u16 %rs171, %rs225, 8; + shr.u16 %rs172, %rs226, 8; + shr.u16 %rs173, %rs227, 8; + shr.u16 %rs174, %rs228, 8; + shr.u16 %rs175, %rs221, 8; + shr.u16 %rs176, %rs222, 8; + shr.u16 %rs177, %rs223, 8; + shr.u16 %rs178, %rs224, 8; + shr.u16 %rs179, %rs217, 8; + shr.u16 %rs180, %rs218, 8; + shr.u16 %rs181, %rs219, 8; + shr.u16 %rs182, %rs220, 8; + shr.u16 %rs183, %rs213, 8; + shr.u16 %rs184, %rs214, 8; + shr.u16 %rs185, %rs215, 8; + shr.u16 %rs186, %rs216, 8; + shr.u16 %rs187, %rs209, 8; + shr.u16 %rs188, %rs210, 8; + shr.u16 %rs189, %rs211, 8; + shr.u16 %rs190, %rs212, 8; + shr.u16 %rs191, %rs205, 8; + shr.u16 %rs192, %rs206, 8; + shr.u16 %rs193, %rs207, 8; + shr.u16 %rs194, %rs208, 8; + shr.u16 %rs195, %rs202, 8; + shr.u16 %rs196, %rs203, 8; + shl.b64 %rd133, %rd151, 5; + add.s64 %rd134, %rd137, %rd133; + cvt.u32.u16 %r2984, %rs229; + and.b32 %r2985, %r2984, 255; + cvt.u32.u16 %r2986, %rs167; + prmt.b32 %r2987, %r2986, %r2985, 30212; + cvt.u32.u16 %r2988, %rs230; + prmt.b32 %r2989, %r2988, %r2987, 28756; + cvt.u32.u16 %r2990, %rs168; + prmt.b32 %r2991, %r2990, %r2989, 1620; + cvt.u32.u16 %r2992, %rs231; + and.b32 %r2993, %r2992, 255; + cvt.u32.u16 %r2994, %rs169; + prmt.b32 %r2995, %r2994, %r2993, 30212; + cvt.u32.u16 %r2996, %rs232; + prmt.b32 %r2997, %r2996, %r2995, 28756; + cvt.u32.u16 %r2998, %rs170; + prmt.b32 %r2999, %r2998, %r2997, 1620; + cvt.u32.u16 %r3000, %rs225; + and.b32 %r3001, %r3000, 255; + cvt.u32.u16 %r3002, %rs171; + prmt.b32 %r3003, %r3002, %r3001, 30212; + cvt.u32.u16 %r3004, %rs226; + prmt.b32 %r3005, %r3004, %r3003, 28756; + cvt.u32.u16 %r3006, %rs172; + prmt.b32 %r3007, %r3006, %r3005, 1620; + cvt.u32.u16 %r3008, %rs227; + and.b32 %r3009, %r3008, 255; + cvt.u32.u16 %r3010, %rs173; + prmt.b32 %r3011, %r3010, %r3009, 30212; + cvt.u32.u16 %r3012, %rs228; + prmt.b32 %r3013, %r3012, %r3011, 28756; + cvt.u32.u16 %r3014, %rs174; + prmt.b32 %r3015, %r3014, %r3013, 1620; + cvt.u32.u16 %r3016, %rs221; + and.b32 %r3017, %r3016, 255; + cvt.u32.u16 %r3018, %rs175; + prmt.b32 %r3019, %r3018, %r3017, 30212; + cvt.u32.u16 %r3020, %rs222; + prmt.b32 %r3021, %r3020, %r3019, 28756; + cvt.u32.u16 %r3022, %rs176; + prmt.b32 %r3023, %r3022, %r3021, 1620; + cvt.u32.u16 %r3024, %rs223; + and.b32 %r3025, %r3024, 255; + cvt.u32.u16 %r3026, %rs177; + prmt.b32 %r3027, %r3026, %r3025, 30212; + cvt.u32.u16 %r3028, %rs224; + prmt.b32 %r3029, %r3028, 
%r3027, 28756; + cvt.u32.u16 %r3030, %rs178; + prmt.b32 %r3031, %r3030, %r3029, 1620; + cvt.u32.u16 %r3032, %rs217; + and.b32 %r3033, %r3032, 255; + cvt.u32.u16 %r3034, %rs179; + prmt.b32 %r3035, %r3034, %r3033, 30212; + cvt.u32.u16 %r3036, %rs218; + prmt.b32 %r3037, %r3036, %r3035, 28756; + cvt.u32.u16 %r3038, %rs180; + prmt.b32 %r3039, %r3038, %r3037, 1620; + cvt.u32.u16 %r3040, %rs219; + and.b32 %r3041, %r3040, 255; + cvt.u32.u16 %r3042, %rs181; + prmt.b32 %r3043, %r3042, %r3041, 30212; + cvt.u32.u16 %r3044, %rs220; + prmt.b32 %r3045, %r3044, %r3043, 28756; + cvt.u32.u16 %r3046, %rs182; + prmt.b32 %r3047, %r3046, %r3045, 1620; + cvt.u32.u16 %r3048, %rs213; + and.b32 %r3049, %r3048, 255; + cvt.u32.u16 %r3050, %rs183; + prmt.b32 %r3051, %r3050, %r3049, 30212; + cvt.u32.u16 %r3052, %rs214; + prmt.b32 %r3053, %r3052, %r3051, 28756; + cvt.u32.u16 %r3054, %rs184; + prmt.b32 %r3055, %r3054, %r3053, 1620; + cvt.u32.u16 %r3056, %rs215; + and.b32 %r3057, %r3056, 255; + cvt.u32.u16 %r3058, %rs185; + prmt.b32 %r3059, %r3058, %r3057, 30212; + cvt.u32.u16 %r3060, %rs216; + prmt.b32 %r3061, %r3060, %r3059, 28756; + cvt.u32.u16 %r3062, %rs186; + prmt.b32 %r3063, %r3062, %r3061, 1620; + cvt.u32.u16 %r3064, %rs209; + and.b32 %r3065, %r3064, 255; + cvt.u32.u16 %r3066, %rs187; + prmt.b32 %r3067, %r3066, %r3065, 30212; + cvt.u32.u16 %r3068, %rs210; + prmt.b32 %r3069, %r3068, %r3067, 28756; + cvt.u32.u16 %r3070, %rs188; + prmt.b32 %r3071, %r3070, %r3069, 1620; + cvt.u32.u16 %r3072, %rs211; + and.b32 %r3073, %r3072, 255; + cvt.u32.u16 %r3074, %rs189; + prmt.b32 %r3075, %r3074, %r3073, 30212; + cvt.u32.u16 %r3076, %rs212; + prmt.b32 %r3077, %r3076, %r3075, 28756; + cvt.u32.u16 %r3078, %rs190; + prmt.b32 %r3079, %r3078, %r3077, 1620; + cvt.u32.u16 %r3080, %rs205; + and.b32 %r3081, %r3080, 255; + cvt.u32.u16 %r3082, %rs191; + prmt.b32 %r3083, %r3082, %r3081, 30212; + cvt.u32.u16 %r3084, %rs206; + prmt.b32 %r3085, %r3084, %r3083, 28756; + cvt.u32.u16 %r3086, %rs192; + prmt.b32 %r3087, %r3086, %r3085, 1620; + cvt.u32.u16 %r3088, %rs207; + and.b32 %r3089, %r3088, 255; + cvt.u32.u16 %r3090, %rs193; + prmt.b32 %r3091, %r3090, %r3089, 30212; + cvt.u32.u16 %r3092, %rs208; + prmt.b32 %r3093, %r3092, %r3091, 28756; + cvt.u32.u16 %r3094, %rs194; + prmt.b32 %r3095, %r3094, %r3093, 1620; + cvt.u32.u16 %r3096, %rs202; + and.b32 %r3097, %r3096, 255; + cvt.u32.u16 %r3098, %rs195; + prmt.b32 %r3099, %r3098, %r3097, 30212; + cvt.u32.u16 %r3100, %rs203; + prmt.b32 %r3101, %r3100, %r3099, 28756; + cvt.u32.u16 %r3102, %rs196; + prmt.b32 %r3103, %r3102, %r3101, 1620; + cvt.u32.u16 %r3104, %rs204; + and.b32 %r3105, %r3104, 255; + cvt.u32.u16 %r3106, %rs201; + prmt.b32 %r3107, %r3106, %r3105, 30212; + cvt.u32.u16 %r3108, %rs199; + shl.b32 %r3109, %r3108, 16; + and.b32 %r3110, %r3109, 16711680; + or.b32 %r3111, %r3107, %r3110; + cvt.u32.u16 %r3112, %rs200; + shl.b32 %r3113, %r3112, 24; + or.b32 %r3114, %r3111, %r3113; + cvt.u32.u16 %r3115, %rs163; + and.b32 %r3116, %r3115, 255; + cvt.u32.u16 %r3117, %rs166; + and.b32 %r3118, %r3117, 255; + add.s32 %r3119, %r3960, %r3964; + add.s32 %r3120, %r3119, %r2991; + xor.b32 %r3121, %r3120, %r36; + shf.l.wrap.b32 %r3122, %r3121, %r3121, 16; + add.s32 %r3123, %r3122, 1779033703; + xor.b32 %r3124, %r3123, %r3960; + shf.l.wrap.b32 %r3125, %r3124, %r3124, 20; + add.s32 %r3126, %r2999, %r3120; + add.s32 %r3127, %r3126, %r3125; + xor.b32 %r3128, %r3127, %r3122; + shf.l.wrap.b32 %r3129, %r3128, %r3128, 24; + add.s32 %r3130, %r3129, %r3123; + xor.b32 %r3131, %r3130, %r3125; + shf.l.wrap.b32 %r3132, 
%r3131, %r3131, 25; + add.s32 %r3133, %r3959, %r3963; + add.s32 %r3134, %r3133, %r3007; + xor.b32 %r3135, %r3134, %r37; + shf.l.wrap.b32 %r3136, %r3135, %r3135, 16; + add.s32 %r3137, %r3136, -1150833019; + xor.b32 %r3138, %r3137, %r3959; + shf.l.wrap.b32 %r3139, %r3138, %r3138, 20; + add.s32 %r3140, %r3015, %r3134; + add.s32 %r3141, %r3140, %r3139; + xor.b32 %r3142, %r3141, %r3136; + shf.l.wrap.b32 %r3143, %r3142, %r3142, 24; + add.s32 %r3144, %r3143, %r3137; + xor.b32 %r3145, %r3144, %r3139; + shf.l.wrap.b32 %r3146, %r3145, %r3145, 25; + add.s32 %r3147, %r3958, %r3962; + add.s32 %r3148, %r3147, %r3023; + xor.b32 %r3149, %r3148, %r3116; + shr.u32 %r3150, %r3148, 16; + shl.b32 %r3151, %r3149, 16; + or.b32 %r3152, %r3151, %r3150; + add.s32 %r3153, %r3152, 1013904242; + xor.b32 %r3154, %r3153, %r3958; + shf.l.wrap.b32 %r3155, %r3154, %r3154, 20; + add.s32 %r3156, %r3031, %r3148; + add.s32 %r3157, %r3156, %r3155; + xor.b32 %r3158, %r3157, %r3152; + shf.l.wrap.b32 %r3159, %r3158, %r3158, 24; + add.s32 %r3160, %r3159, %r3153; + xor.b32 %r3161, %r3160, %r3155; + shf.l.wrap.b32 %r3162, %r3161, %r3161, 25; + add.s32 %r3163, %r3957, %r3961; + add.s32 %r3164, %r3163, %r3039; + xor.b32 %r3165, %r3164, %r3118; + shr.u32 %r3166, %r3164, 16; + shl.b32 %r3167, %r3165, 16; + or.b32 %r3168, %r3167, %r3166; + add.s32 %r3169, %r3168, -1521486534; + xor.b32 %r3170, %r3169, %r3957; + shf.l.wrap.b32 %r3171, %r3170, %r3170, 20; + add.s32 %r3172, %r3047, %r3164; + add.s32 %r3173, %r3172, %r3171; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 24; + add.s32 %r3176, %r3175, %r3169; + xor.b32 %r3177, %r3176, %r3171; + shf.l.wrap.b32 %r3178, %r3177, %r3177, 25; + add.s32 %r3179, %r3146, %r3127; + add.s32 %r3180, %r3179, %r3055; + xor.b32 %r3181, %r3175, %r3180; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 16; + add.s32 %r3183, %r3182, %r3160; + xor.b32 %r3184, %r3183, %r3146; + shf.l.wrap.b32 %r3185, %r3184, %r3184, 20; + add.s32 %r3186, %r3063, %r3180; + add.s32 %r3187, %r3186, %r3185; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 24; + add.s32 %r3190, %r3189, %r3183; + xor.b32 %r3191, %r3190, %r3185; + shf.l.wrap.b32 %r3192, %r3191, %r3191, 25; + add.s32 %r3193, %r3162, %r3141; + add.s32 %r3194, %r3193, %r3071; + xor.b32 %r3195, %r3194, %r3129; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 16; + add.s32 %r3197, %r3196, %r3176; + xor.b32 %r3198, %r3197, %r3162; + shf.l.wrap.b32 %r3199, %r3198, %r3198, 20; + add.s32 %r3200, %r3079, %r3194; + add.s32 %r3201, %r3200, %r3199; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 24; + add.s32 %r3204, %r3203, %r3197; + xor.b32 %r3205, %r3204, %r3199; + shf.l.wrap.b32 %r3206, %r3205, %r3205, 25; + add.s32 %r3207, %r3178, %r3157; + add.s32 %r3208, %r3207, %r3087; + xor.b32 %r3209, %r3208, %r3143; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 16; + add.s32 %r3211, %r3210, %r3130; + xor.b32 %r3212, %r3211, %r3178; + shf.l.wrap.b32 %r3213, %r3212, %r3212, 20; + add.s32 %r3214, %r3095, %r3208; + add.s32 %r3215, %r3214, %r3213; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 24; + add.s32 %r3218, %r3217, %r3211; + xor.b32 %r3219, %r3218, %r3213; + shf.l.wrap.b32 %r3220, %r3219, %r3219, 25; + add.s32 %r3221, %r3173, %r3132; + add.s32 %r3222, %r3221, %r3103; + xor.b32 %r3223, %r3222, %r3159; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 16; + add.s32 %r3225, %r3224, %r3144; + xor.b32 %r3226, %r3225, %r3132; + shf.l.wrap.b32 %r3227, %r3226, %r3226, 20; + add.s32 %r3228, %r3114, %r3222; + add.s32 
%r3229, %r3228, %r3227; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 24; + add.s32 %r3232, %r3231, %r3225; + xor.b32 %r3233, %r3232, %r3227; + shf.l.wrap.b32 %r3234, %r3233, %r3233, 25; + add.s32 %r3235, %r3187, %r3007; + add.s32 %r3236, %r3235, %r3234; + xor.b32 %r3237, %r3236, %r3203; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 16; + add.s32 %r3239, %r3238, %r3218; + xor.b32 %r3240, %r3239, %r3234; + shf.l.wrap.b32 %r3241, %r3240, %r3240, 20; + add.s32 %r3242, %r3236, %r3039; + add.s32 %r3243, %r3242, %r3241; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 24; + add.s32 %r3246, %r3245, %r3239; + xor.b32 %r3247, %r3246, %r3241; + shf.l.wrap.b32 %r3248, %r3247, %r3247, 25; + add.s32 %r3249, %r3201, %r3015; + add.s32 %r3250, %r3249, %r3192; + xor.b32 %r3251, %r3217, %r3250; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 16; + add.s32 %r3253, %r3232, %r3252; + xor.b32 %r3254, %r3253, %r3192; + shf.l.wrap.b32 %r3255, %r3254, %r3254, 20; + add.s32 %r3256, %r3250, %r3071; + add.s32 %r3257, %r3256, %r3255; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 24; + add.s32 %r3260, %r3259, %r3253; + xor.b32 %r3261, %r3260, %r3255; + shf.l.wrap.b32 %r3262, %r3261, %r3261, 25; + add.s32 %r3263, %r3206, %r3047; + add.s32 %r3264, %r3263, %r3215; + xor.b32 %r3265, %r3231, %r3264; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 16; + add.s32 %r3267, %r3266, %r3190; + xor.b32 %r3268, %r3267, %r3206; + shf.l.wrap.b32 %r3269, %r3268, %r3268, 20; + add.s32 %r3270, %r3264, %r2991; + add.s32 %r3271, %r3270, %r3269; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 24; + add.s32 %r3274, %r3273, %r3267; + xor.b32 %r3275, %r3274, %r3269; + shf.l.wrap.b32 %r3276, %r3275, %r3275, 25; + add.s32 %r3277, %r3220, %r3023; + add.s32 %r3278, %r3277, %r3229; + xor.b32 %r3279, %r3278, %r3189; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 16; + add.s32 %r3281, %r3280, %r3204; + xor.b32 %r3282, %r3281, %r3220; + shf.l.wrap.b32 %r3283, %r3282, %r3282, 20; + add.s32 %r3284, %r3278, %r3095; + add.s32 %r3285, %r3284, %r3283; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 24; + add.s32 %r3288, %r3287, %r3281; + xor.b32 %r3289, %r3288, %r3283; + shf.l.wrap.b32 %r3290, %r3289, %r3289, 25; + add.s32 %r3291, %r3262, %r2999; + add.s32 %r3292, %r3291, %r3243; + xor.b32 %r3293, %r3292, %r3287; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 16; + add.s32 %r3295, %r3294, %r3274; + xor.b32 %r3296, %r3295, %r3262; + shf.l.wrap.b32 %r3297, %r3296, %r3296, 20; + add.s32 %r3298, %r3292, %r3079; + add.s32 %r3299, %r3298, %r3297; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 24; + add.s32 %r3302, %r3301, %r3295; + xor.b32 %r3303, %r3302, %r3297; + shf.l.wrap.b32 %r3304, %r3303, %r3303, 25; + add.s32 %r3305, %r3257, %r3087; + add.s32 %r3306, %r3305, %r3276; + xor.b32 %r3307, %r3245, %r3306; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 16; + add.s32 %r3309, %r3308, %r3288; + xor.b32 %r3310, %r3309, %r3276; + shf.l.wrap.b32 %r3311, %r3310, %r3310, 20; + add.s32 %r3312, %r3306, %r3031; + add.s32 %r3313, %r3312, %r3311; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 24; + add.s32 %r3316, %r3315, %r3309; + xor.b32 %r3317, %r3316, %r3311; + shf.l.wrap.b32 %r3318, %r3317, %r3317, 25; + add.s32 %r3319, %r3271, %r3063; + add.s32 %r3320, %r3319, %r3290; + xor.b32 %r3321, %r3320, %r3259; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 16; + add.s32 %r3323, %r3322, %r3246; + xor.b32 %r3324, %r3323, %r3290; + 
shf.l.wrap.b32 %r3325, %r3324, %r3324, 20; + add.s32 %r3326, %r3320, %r3103; + add.s32 %r3327, %r3326, %r3325; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 24; + add.s32 %r3330, %r3329, %r3323; + xor.b32 %r3331, %r3330, %r3325; + shf.l.wrap.b32 %r3332, %r3331, %r3331, 25; + add.s32 %r3333, %r3285, %r3114; + add.s32 %r3334, %r3333, %r3248; + xor.b32 %r3335, %r3334, %r3273; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 16; + add.s32 %r3337, %r3336, %r3260; + xor.b32 %r3338, %r3337, %r3248; + shf.l.wrap.b32 %r3339, %r3338, %r3338, 20; + add.s32 %r3340, %r3334, %r3055; + add.s32 %r3341, %r3340, %r3339; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 24; + add.s32 %r3344, %r3343, %r3337; + xor.b32 %r3345, %r3344, %r3339; + shf.l.wrap.b32 %r3346, %r3345, %r3345, 25; + add.s32 %r3347, %r3299, %r3015; + add.s32 %r3348, %r3347, %r3346; + xor.b32 %r3349, %r3348, %r3315; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 16; + add.s32 %r3351, %r3350, %r3330; + xor.b32 %r3352, %r3351, %r3346; + shf.l.wrap.b32 %r3353, %r3352, %r3352, 20; + add.s32 %r3354, %r3348, %r3023; + add.s32 %r3355, %r3354, %r3353; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 24; + add.s32 %r3358, %r3357, %r3351; + xor.b32 %r3359, %r3358, %r3353; + shf.l.wrap.b32 %r3360, %r3359, %r3359, 25; + add.s32 %r3361, %r3313, %r3071; + add.s32 %r3362, %r3361, %r3304; + xor.b32 %r3363, %r3362, %r3329; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 16; + add.s32 %r3365, %r3364, %r3344; + xor.b32 %r3366, %r3365, %r3304; + shf.l.wrap.b32 %r3367, %r3366, %r3366, 20; + add.s32 %r3368, %r3362, %r3087; + add.s32 %r3369, %r3368, %r3367; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 24; + add.s32 %r3372, %r3371, %r3365; + xor.b32 %r3373, %r3372, %r3367; + shf.l.wrap.b32 %r3374, %r3373, %r3373, 25; + add.s32 %r3375, %r3327, %r3095; + add.s32 %r3376, %r3375, %r3318; + xor.b32 %r3377, %r3343, %r3376; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 16; + add.s32 %r3379, %r3378, %r3302; + xor.b32 %r3380, %r3379, %r3318; + shf.l.wrap.b32 %r3381, %r3380, %r3380, 20; + add.s32 %r3382, %r3376, %r3007; + add.s32 %r3383, %r3382, %r3381; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 24; + add.s32 %r3386, %r3385, %r3379; + xor.b32 %r3387, %r3386, %r3381; + shf.l.wrap.b32 %r3388, %r3387, %r3387, 25; + add.s32 %r3389, %r3332, %r3047; + add.s32 %r3390, %r3389, %r3341; + xor.b32 %r3391, %r3390, %r3301; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 16; + add.s32 %r3393, %r3392, %r3316; + xor.b32 %r3394, %r3393, %r3332; + shf.l.wrap.b32 %r3395, %r3394, %r3394, 20; + add.s32 %r3396, %r3390, %r3103; + add.s32 %r3397, %r3396, %r3395; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 24; + add.s32 %r3400, %r3399, %r3393; + xor.b32 %r3401, %r3400, %r3395; + shf.l.wrap.b32 %r3402, %r3401, %r3401, 25; + add.s32 %r3403, %r3374, %r3039; + add.s32 %r3404, %r3403, %r3355; + xor.b32 %r3405, %r3404, %r3399; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 16; + add.s32 %r3407, %r3406, %r3386; + xor.b32 %r3408, %r3407, %r3374; + shf.l.wrap.b32 %r3409, %r3408, %r3408, 20; + add.s32 %r3410, %r3404, %r3031; + add.s32 %r3411, %r3410, %r3409; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 24; + add.s32 %r3414, %r3413, %r3407; + xor.b32 %r3415, %r3414, %r3409; + shf.l.wrap.b32 %r3416, %r3415, %r3415, 25; + add.s32 %r3417, %r3369, %r3063; + add.s32 %r3418, %r3417, %r3388; + xor.b32 %r3419, %r3357, %r3418; + shf.l.wrap.b32 %r3420, 
%r3419, %r3419, 16; + add.s32 %r3421, %r3420, %r3400; + xor.b32 %r3422, %r3421, %r3388; + shf.l.wrap.b32 %r3423, %r3422, %r3422, 20; + add.s32 %r3424, %r3418, %r2991; + add.s32 %r3425, %r3424, %r3423; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 24; + add.s32 %r3428, %r3427, %r3421; + xor.b32 %r3429, %r3428, %r3423; + shf.l.wrap.b32 %r3430, %r3429, %r3429, 25; + add.s32 %r3431, %r3383, %r3079; + add.s32 %r3432, %r3431, %r3402; + xor.b32 %r3433, %r3432, %r3371; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 16; + add.s32 %r3435, %r3434, %r3358; + xor.b32 %r3436, %r3435, %r3402; + shf.l.wrap.b32 %r3437, %r3436, %r3436, 20; + add.s32 %r3438, %r3432, %r3114; + add.s32 %r3439, %r3438, %r3437; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 24; + add.s32 %r3442, %r3441, %r3435; + xor.b32 %r3443, %r3442, %r3437; + shf.l.wrap.b32 %r3444, %r3443, %r3443, 25; + add.s32 %r3445, %r3397, %r3055; + add.s32 %r3446, %r3445, %r3360; + xor.b32 %r3447, %r3446, %r3385; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 16; + add.s32 %r3449, %r3448, %r3372; + xor.b32 %r3450, %r3449, %r3360; + shf.l.wrap.b32 %r3451, %r3450, %r3450, 20; + add.s32 %r3452, %r3446, %r2999; + add.s32 %r3453, %r3452, %r3451; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 24; + add.s32 %r3456, %r3455, %r3449; + xor.b32 %r3457, %r3456, %r3451; + shf.l.wrap.b32 %r3458, %r3457, %r3457, 25; + add.s32 %r3459, %r3411, %r3071; + add.s32 %r3460, %r3459, %r3458; + xor.b32 %r3461, %r3460, %r3427; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 16; + add.s32 %r3463, %r3462, %r3442; + xor.b32 %r3464, %r3463, %r3458; + shf.l.wrap.b32 %r3465, %r3464, %r3464, 20; + add.s32 %r3466, %r3460, %r3047; + add.s32 %r3467, %r3466, %r3465; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 24; + add.s32 %r3470, %r3469, %r3463; + xor.b32 %r3471, %r3470, %r3465; + shf.l.wrap.b32 %r3472, %r3471, %r3471, 25; + add.s32 %r3473, %r3425, %r3087; + add.s32 %r3474, %r3473, %r3416; + xor.b32 %r3475, %r3474, %r3441; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 16; + add.s32 %r3477, %r3476, %r3456; + xor.b32 %r3478, %r3477, %r3416; + shf.l.wrap.b32 %r3479, %r3478, %r3478, 20; + add.s32 %r3480, %r3474, %r3063; + add.s32 %r3481, %r3480, %r3479; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 24; + add.s32 %r3484, %r3483, %r3477; + xor.b32 %r3485, %r3484, %r3479; + shf.l.wrap.b32 %r3486, %r3485, %r3485, 25; + add.s32 %r3487, %r3439, %r3103; + add.s32 %r3488, %r3487, %r3430; + xor.b32 %r3489, %r3455, %r3488; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 16; + add.s32 %r3491, %r3490, %r3414; + xor.b32 %r3492, %r3491, %r3430; + shf.l.wrap.b32 %r3493, %r3492, %r3492, 20; + add.s32 %r3494, %r3488, %r3015; + add.s32 %r3495, %r3494, %r3493; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 24; + add.s32 %r3498, %r3497, %r3491; + xor.b32 %r3499, %r3498, %r3493; + shf.l.wrap.b32 %r3500, %r3499, %r3499, 25; + add.s32 %r3501, %r3444, %r3095; + add.s32 %r3502, %r3501, %r3453; + xor.b32 %r3503, %r3502, %r3413; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 16; + add.s32 %r3505, %r3504, %r3428; + xor.b32 %r3506, %r3505, %r3444; + shf.l.wrap.b32 %r3507, %r3506, %r3506, 20; + add.s32 %r3508, %r3502, %r3114; + add.s32 %r3509, %r3508, %r3507; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 24; + add.s32 %r3512, %r3511, %r3505; + xor.b32 %r3513, %r3512, %r3507; + shf.l.wrap.b32 %r3514, %r3513, %r3513, 25; + add.s32 %r3515, %r3486, %r3023; + 
add.s32 %r3516, %r3515, %r3467; + xor.b32 %r3517, %r3516, %r3511; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 16; + add.s32 %r3519, %r3518, %r3498; + xor.b32 %r3520, %r3519, %r3486; + shf.l.wrap.b32 %r3521, %r3520, %r3520, 20; + add.s32 %r3522, %r3516, %r2991; + add.s32 %r3523, %r3522, %r3521; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 24; + add.s32 %r3526, %r3525, %r3519; + xor.b32 %r3527, %r3526, %r3521; + shf.l.wrap.b32 %r3528, %r3527, %r3527, 25; + add.s32 %r3529, %r3481, %r3079; + add.s32 %r3530, %r3529, %r3500; + xor.b32 %r3531, %r3469, %r3530; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 16; + add.s32 %r3533, %r3532, %r3512; + xor.b32 %r3534, %r3533, %r3500; + shf.l.wrap.b32 %r3535, %r3534, %r3534, 20; + add.s32 %r3536, %r3530, %r3007; + add.s32 %r3537, %r3536, %r3535; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 24; + add.s32 %r3540, %r3539, %r3533; + xor.b32 %r3541, %r3540, %r3535; + shf.l.wrap.b32 %r3542, %r3541, %r3541, 25; + add.s32 %r3543, %r3495, %r3031; + add.s32 %r3544, %r3543, %r3514; + xor.b32 %r3545, %r3544, %r3483; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 16; + add.s32 %r3547, %r3546, %r3470; + xor.b32 %r3548, %r3547, %r3514; + shf.l.wrap.b32 %r3549, %r3548, %r3548, 20; + add.s32 %r3550, %r3544, %r3055; + add.s32 %r3551, %r3550, %r3549; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 24; + add.s32 %r3554, %r3553, %r3547; + xor.b32 %r3555, %r3554, %r3549; + shf.l.wrap.b32 %r3556, %r3555, %r3555, 25; + add.s32 %r3557, %r3509, %r2999; + add.s32 %r3558, %r3557, %r3472; + xor.b32 %r3559, %r3558, %r3497; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 16; + add.s32 %r3561, %r3560, %r3484; + xor.b32 %r3562, %r3561, %r3472; + shf.l.wrap.b32 %r3563, %r3562, %r3562, 20; + add.s32 %r3564, %r3558, %r3039; + add.s32 %r3565, %r3564, %r3563; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 24; + add.s32 %r3568, %r3567, %r3561; + xor.b32 %r3569, %r3568, %r3563; + shf.l.wrap.b32 %r3570, %r3569, %r3569, 25; + add.s32 %r3571, %r3523, %r3087; + add.s32 %r3572, %r3571, %r3570; + xor.b32 %r3573, %r3572, %r3539; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 16; + add.s32 %r3575, %r3574, %r3554; + xor.b32 %r3576, %r3575, %r3570; + shf.l.wrap.b32 %r3577, %r3576, %r3576, 20; + add.s32 %r3578, %r3572, %r3095; + add.s32 %r3579, %r3578, %r3577; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 24; + add.s32 %r3582, %r3581, %r3575; + xor.b32 %r3583, %r3582, %r3577; + shf.l.wrap.b32 %r3584, %r3583, %r3583, 25; + add.s32 %r3585, %r3537, %r3063; + add.s32 %r3586, %r3585, %r3528; + xor.b32 %r3587, %r3586, %r3553; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 16; + add.s32 %r3589, %r3588, %r3568; + xor.b32 %r3590, %r3589, %r3528; + shf.l.wrap.b32 %r3591, %r3590, %r3590, 20; + add.s32 %r3592, %r3586, %r3079; + add.s32 %r3593, %r3592, %r3591; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 24; + add.s32 %r3596, %r3595, %r3589; + xor.b32 %r3597, %r3596, %r3591; + shf.l.wrap.b32 %r3598, %r3597, %r3597, 25; + add.s32 %r3599, %r3551, %r3114; + add.s32 %r3600, %r3599, %r3542; + xor.b32 %r3601, %r3567, %r3600; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 16; + add.s32 %r3603, %r3602, %r3526; + xor.b32 %r3604, %r3603, %r3542; + shf.l.wrap.b32 %r3605, %r3604, %r3604, 20; + add.s32 %r3606, %r3600, %r3071; + add.s32 %r3607, %r3606, %r3605; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 24; + add.s32 %r3610, %r3609, %r3603; + xor.b32 %r3611, %r3610, 
%r3605; + shf.l.wrap.b32 %r3612, %r3611, %r3611, 25; + add.s32 %r3613, %r3556, %r3103; + add.s32 %r3614, %r3613, %r3565; + xor.b32 %r3615, %r3614, %r3525; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 16; + add.s32 %r3617, %r3616, %r3540; + xor.b32 %r3618, %r3617, %r3556; + shf.l.wrap.b32 %r3619, %r3618, %r3618, 20; + add.s32 %r3620, %r3614, %r3055; + add.s32 %r3621, %r3620, %r3619; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 24; + add.s32 %r3624, %r3623, %r3617; + xor.b32 %r3625, %r3624, %r3619; + shf.l.wrap.b32 %r3626, %r3625, %r3625, 25; + add.s32 %r3627, %r3598, %r3047; + add.s32 %r3628, %r3627, %r3579; + xor.b32 %r3629, %r3628, %r3623; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 16; + add.s32 %r3631, %r3630, %r3610; + xor.b32 %r3632, %r3631, %r3598; + shf.l.wrap.b32 %r3633, %r3632, %r3632, 20; + add.s32 %r3634, %r3628, %r3007; + add.s32 %r3635, %r3634, %r3633; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 24; + add.s32 %r3638, %r3637, %r3631; + xor.b32 %r3639, %r3638, %r3633; + shf.l.wrap.b32 %r3640, %r3639, %r3639, 25; + add.s32 %r3641, %r3593, %r3031; + add.s32 %r3642, %r3641, %r3612; + xor.b32 %r3643, %r3581, %r3642; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 16; + add.s32 %r3645, %r3644, %r3624; + xor.b32 %r3646, %r3645, %r3612; + shf.l.wrap.b32 %r3647, %r3646, %r3646, 20; + add.s32 %r3648, %r3642, %r3015; + add.s32 %r3649, %r3648, %r3647; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 24; + add.s32 %r3652, %r3651, %r3645; + xor.b32 %r3653, %r3652, %r3647; + shf.l.wrap.b32 %r3654, %r3653, %r3653, 25; + add.s32 %r3655, %r3607, %r2991; + add.s32 %r3656, %r3655, %r3626; + xor.b32 %r3657, %r3656, %r3595; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 16; + add.s32 %r3659, %r3658, %r3582; + xor.b32 %r3660, %r3659, %r3626; + shf.l.wrap.b32 %r3661, %r3660, %r3660, 20; + add.s32 %r3662, %r3656, %r2999; + add.s32 %r3663, %r3662, %r3661; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 24; + add.s32 %r3666, %r3665, %r3659; + xor.b32 %r3667, %r3666, %r3661; + shf.l.wrap.b32 %r3668, %r3667, %r3667, 25; + add.s32 %r3669, %r3621, %r3039; + add.s32 %r3670, %r3669, %r3584; + xor.b32 %r3671, %r3670, %r3609; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 16; + add.s32 %r3673, %r3672, %r3596; + xor.b32 %r3674, %r3673, %r3584; + shf.l.wrap.b32 %r3675, %r3674, %r3674, 20; + add.s32 %r3676, %r3670, %r3023; + add.s32 %r3677, %r3676, %r3675; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 24; + add.s32 %r3680, %r3679, %r3673; + xor.b32 %r3681, %r3680, %r3675; + shf.l.wrap.b32 %r3682, %r3681, %r3681, 25; + add.s32 %r3683, %r3635, %r3063; + add.s32 %r3684, %r3683, %r3682; + xor.b32 %r3685, %r3684, %r3651; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 16; + add.s32 %r3687, %r3686, %r3666; + xor.b32 %r3688, %r3687, %r3682; + shf.l.wrap.b32 %r3689, %r3688, %r3688, 20; + add.s32 %r3690, %r3684, %r3103; + add.s32 %r3691, %r3690, %r3689; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 24; + add.s32 %r3694, %r3693, %r3687; + xor.b32 %r3695, %r3694, %r3689; + shf.l.wrap.b32 %r3696, %r3695, %r3695, 25; + add.s32 %r3697, %r3649, %r3079; + add.s32 %r3698, %r3697, %r3640; + xor.b32 %r3699, %r3698, %r3665; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 16; + add.s32 %r3701, %r3700, %r3680; + xor.b32 %r3702, %r3701, %r3640; + shf.l.wrap.b32 %r3703, %r3702, %r3702, 20; + add.s32 %r3704, %r3698, %r3031; + add.s32 %r3705, %r3704, %r3703; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 
%r3707, %r3706, %r3706, 24; + add.s32 %r3708, %r3707, %r3701; + xor.b32 %r3709, %r3708, %r3703; + shf.l.wrap.b32 %r3710, %r3709, %r3709, 25; + add.s32 %r3711, %r3663, %r3055; + add.s32 %r3712, %r3711, %r3654; + xor.b32 %r3713, %r3679, %r3712; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 16; + add.s32 %r3715, %r3714, %r3638; + xor.b32 %r3716, %r3715, %r3654; + shf.l.wrap.b32 %r3717, %r3716, %r3716, 20; + add.s32 %r3718, %r3712, %r3087; + add.s32 %r3719, %r3718, %r3717; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 24; + add.s32 %r3722, %r3721, %r3715; + xor.b32 %r3723, %r3722, %r3717; + shf.l.wrap.b32 %r3724, %r3723, %r3723, 25; + add.s32 %r3725, %r3668, %r3114; + add.s32 %r3726, %r3725, %r3677; + xor.b32 %r3727, %r3726, %r3637; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 16; + add.s32 %r3729, %r3728, %r3652; + xor.b32 %r3730, %r3729, %r3668; + shf.l.wrap.b32 %r3731, %r3730, %r3730, 20; + add.s32 %r3732, %r3726, %r2999; + add.s32 %r3733, %r3732, %r3731; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 24; + add.s32 %r3736, %r3735, %r3729; + xor.b32 %r3737, %r3736, %r3731; + shf.l.wrap.b32 %r3738, %r3737, %r3737, 25; + add.s32 %r3739, %r3710, %r3095; + add.s32 %r3740, %r3739, %r3691; + xor.b32 %r3741, %r3740, %r3735; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 16; + add.s32 %r3743, %r3742, %r3722; + xor.b32 %r3744, %r3743, %r3710; + shf.l.wrap.b32 %r3745, %r3744, %r3744, 20; + add.s32 %r3746, %r3740, %r3015; + add.s32 %r3747, %r3746, %r3745; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 24; + add.s32 %r3750, %r3749, %r3743; + xor.b32 %r3751, %r3750, %r3745; + shf.l.wrap.b32 %r3752, %r3751, %r3751, 25; + add.s32 %r3753, %r3705, %r2991; + add.s32 %r3754, %r3753, %r3724; + xor.b32 %r3755, %r3693, %r3754; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 16; + add.s32 %r3757, %r3756, %r3736; + xor.b32 %r3758, %r3757, %r3724; + shf.l.wrap.b32 %r3759, %r3758, %r3758, 20; + add.s32 %r3760, %r3754, %r3071; + add.s32 %r3761, %r3760, %r3759; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 24; + add.s32 %r3764, %r3763, %r3757; + xor.b32 %r3765, %r3764, %r3759; + shf.l.wrap.b32 %r3766, %r3765, %r3765, 25; + add.s32 %r3767, %r3719, %r3007; + add.s32 %r3768, %r3767, %r3738; + xor.b32 %r3769, %r3768, %r3707; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 16; + add.s32 %r3771, %r3770, %r3694; + xor.b32 %r3772, %r3771, %r3738; + shf.l.wrap.b32 %r3773, %r3772, %r3772, 20; + add.s32 %r3774, %r3768, %r3039; + add.s32 %r3775, %r3774, %r3773; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 24; + add.s32 %r3778, %r3777, %r3771; + xor.b32 %r3779, %r3778, %r3773; + shf.l.wrap.b32 %r3780, %r3779, %r3779, 25; + add.s32 %r3781, %r3733, %r3023; + add.s32 %r3782, %r3781, %r3696; + xor.b32 %r3783, %r3782, %r3721; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 16; + add.s32 %r3785, %r3784, %r3708; + xor.b32 %r3786, %r3785, %r3696; + shf.l.wrap.b32 %r3787, %r3786, %r3786, 20; + add.s32 %r3788, %r3782, %r3047; + add.s32 %r3789, %r3788, %r3787; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 24; + add.s32 %r3792, %r3791, %r3785; + xor.b32 %r3793, %r3792, %r3787; + shf.l.wrap.b32 %r3794, %r3793, %r3793, 25; + add.s32 %r3795, %r3747, %r3079; + add.s32 %r3796, %r3795, %r3794; + xor.b32 %r3797, %r3796, %r3763; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 16; + add.s32 %r3799, %r3798, %r3778; + xor.b32 %r3800, %r3799, %r3794; + shf.l.wrap.b32 %r3801, %r3800, %r3800, 20; + add.s32 %r3802, %r3796, %r3114; 
+ add.s32 %r3803, %r3802, %r3801; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 24; + add.s32 %r3806, %r3805, %r3799; + xor.b32 %r3807, %r3806, %r3801; + shf.l.wrap.b32 %r3808, %r3807, %r3807, 25; + add.s32 %r3809, %r3761, %r3031; + add.s32 %r3810, %r3809, %r3752; + xor.b32 %r3811, %r3810, %r3777; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 16; + add.s32 %r3813, %r3812, %r3792; + xor.b32 %r3814, %r3813, %r3752; + shf.l.wrap.b32 %r3815, %r3814, %r3814, 20; + add.s32 %r3816, %r3810, %r2991; + add.s32 %r3817, %r3816, %r3815; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 24; + add.s32 %r3820, %r3819, %r3813; + xor.b32 %r3821, %r3820, %r3815; + shf.l.wrap.b32 %r3822, %r3821, %r3821, 25; + add.s32 %r3823, %r3775, %r2999; + add.s32 %r3824, %r3823, %r3766; + xor.b32 %r3825, %r3791, %r3824; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 16; + add.s32 %r3827, %r3826, %r3750; + xor.b32 %r3828, %r3827, %r3766; + shf.l.wrap.b32 %r3829, %r3828, %r3828, 20; + add.s32 %r3830, %r3824, %r3063; + add.s32 %r3831, %r3830, %r3829; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 24; + add.s32 %r3834, %r3833, %r3827; + xor.b32 %r3835, %r3834, %r3829; + shf.l.wrap.b32 %r3836, %r3835, %r3835, 25; + add.s32 %r3837, %r3780, %r3055; + add.s32 %r3838, %r3837, %r3789; + xor.b32 %r3839, %r3838, %r3749; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 16; + add.s32 %r3841, %r3840, %r3764; + xor.b32 %r3842, %r3841, %r3780; + shf.l.wrap.b32 %r3843, %r3842, %r3842, 20; + add.s32 %r3844, %r3838, %r3039; + add.s32 %r3845, %r3844, %r3843; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 24; + add.s32 %r3848, %r3847, %r3841; + xor.b32 %r3849, %r3848, %r3843; + shf.l.wrap.b32 %r3850, %r3849, %r3849, 25; + add.s32 %r3851, %r3822, %r3103; + add.s32 %r3852, %r3851, %r3803; + xor.b32 %r3853, %r3852, %r3847; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 16; + add.s32 %r3855, %r3854, %r3834; + xor.b32 %r3856, %r3855, %r3822; + shf.l.wrap.b32 %r3857, %r3856, %r3856, 20; + add.s32 %r3858, %r3852, %r3071; + add.s32 %r3859, %r3858, %r3857; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 24; + add.s32 %r3862, %r3861, %r3855; + xor.b32 %r3863, %r3862, %r3857; + shf.l.wrap.b32 %r3864, %r3863, %r3863, 25; + add.s32 %r3865, %r3817, %r3007; + add.s32 %r3866, %r3865, %r3836; + xor.b32 %r3867, %r3805, %r3866; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 16; + add.s32 %r3869, %r3868, %r3848; + xor.b32 %r3870, %r3869, %r3836; + shf.l.wrap.b32 %r3871, %r3870, %r3870, 20; + add.s32 %r3872, %r3866, %r3087; + add.s32 %r3873, %r3872, %r3871; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 24; + add.s32 %r3876, %r3875, %r3869; + xor.b32 %r3877, %r3876, %r3871; + shf.l.wrap.b32 %r3878, %r3877, %r3877, 25; + add.s32 %r3879, %r3831, %r3015; + add.s32 %r3880, %r3879, %r3850; + xor.b32 %r3881, %r3880, %r3819; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 16; + add.s32 %r3883, %r3882, %r3806; + xor.b32 %r3884, %r3883, %r3850; + shf.l.wrap.b32 %r3885, %r3884, %r3884, 20; + add.s32 %r3886, %r3880, %r3023; + add.s32 %r3887, %r3886, %r3885; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 24; + add.s32 %r3890, %r3889, %r3883; + xor.b32 %r3891, %r3890, %r3885; + shf.l.wrap.b32 %r3892, %r3891, %r3891, 25; + add.s32 %r3893, %r3845, %r3047; + add.s32 %r3894, %r3893, %r3808; + xor.b32 %r3895, %r3894, %r3833; + shf.l.wrap.b32 %r3896, %r3895, %r3895, 16; + add.s32 %r3897, %r3896, %r3820; + xor.b32 %r3898, %r3897, 
%r3808; + shf.l.wrap.b32 %r3899, %r3898, %r3898, 20; + add.s32 %r3900, %r3894, %r3095; + add.s32 %r3901, %r3900, %r3899; + xor.b32 %r3902, %r3901, %r3896; + shf.l.wrap.b32 %r3903, %r3902, %r3902, 24; + add.s32 %r3904, %r3903, %r3897; + xor.b32 %r3905, %r3904, %r3899; + shf.l.wrap.b32 %r3906, %r3905, %r3905, 25; + xor.b32 %r3907, %r3890, %r3859; + xor.b32 %r3908, %r3904, %r3873; + xor.b32 %r3909, %r3862, %r3887; + xor.b32 %r3910, %r3901, %r3876; + xor.b32 %r3911, %r3906, %r3875; + xor.b32 %r3912, %r3864, %r3889; + xor.b32 %r3913, %r3903, %r3878; + xor.b32 %r3914, %r3892, %r3861; + st.local.u8 [%rd134], %r3907; + shr.u32 %r3915, %r3907, 8; + st.local.u8 [%rd134+1], %r3915; + shr.u32 %r3916, %r3907, 16; + st.local.u8 [%rd134+2], %r3916; + shr.u32 %r3917, %r3907, 24; + st.local.u8 [%rd134+3], %r3917; + st.local.u8 [%rd134+4], %r3908; + shr.u32 %r3918, %r3908, 8; + st.local.u8 [%rd134+5], %r3918; + shr.u32 %r3919, %r3908, 16; + st.local.u8 [%rd134+6], %r3919; + shr.u32 %r3920, %r3908, 24; + st.local.u8 [%rd134+7], %r3920; + st.local.u8 [%rd134+8], %r3909; + shr.u32 %r3921, %r3909, 8; + st.local.u8 [%rd134+9], %r3921; + shr.u32 %r3922, %r3909, 16; + st.local.u8 [%rd134+10], %r3922; + shr.u32 %r3923, %r3909, 24; + st.local.u8 [%rd134+11], %r3923; + st.local.u8 [%rd134+12], %r3910; + shr.u32 %r3924, %r3910, 8; + st.local.u8 [%rd134+13], %r3924; + shr.u32 %r3925, %r3910, 16; + st.local.u8 [%rd134+14], %r3925; + shr.u32 %r3926, %r3910, 24; + st.local.u8 [%rd134+15], %r3926; + st.local.u8 [%rd134+16], %r3911; + shr.u32 %r3927, %r3911, 8; + st.local.u8 [%rd134+17], %r3927; + shr.u32 %r3928, %r3911, 16; + st.local.u8 [%rd134+18], %r3928; + shr.u32 %r3929, %r3911, 24; + st.local.u8 [%rd134+19], %r3929; + st.local.u8 [%rd134+20], %r3912; + shr.u32 %r3930, %r3912, 8; + st.local.u8 [%rd134+21], %r3930; + shr.u32 %r3931, %r3912, 16; + st.local.u8 [%rd134+22], %r3931; + shr.u32 %r3932, %r3912, 24; + st.local.u8 [%rd134+23], %r3932; + st.local.u8 [%rd134+24], %r3913; + shr.u32 %r3933, %r3913, 8; + st.local.u8 [%rd134+25], %r3933; + shr.u32 %r3934, %r3913, 16; + st.local.u8 [%rd134+26], %r3934; + shr.u32 %r3935, %r3913, 24; + st.local.u8 [%rd134+27], %r3935; + st.local.u8 [%rd134+28], %r3914; + shr.u32 %r3936, %r3914, 8; + st.local.u8 [%rd134+29], %r3936; + shr.u32 %r3937, %r3914, 16; + st.local.u8 [%rd134+30], %r3937; + shr.u32 %r3938, %r3914, 24; + st.local.u8 [%rd134+31], %r3938; + add.s64 %rd151, %rd151, 1; + bra.uni $L__BB0_30; + +$L__BB0_1: + add.s64 %rd76, %rd171, -1; + shr.u64 %rd77, %rd76, 10; + or.b64 %rd78, %rd77, 1; + setp.gt.u64 %p2, %rd78, 4294967295; + shr.u64 %rd79, %rd76, 42; + selp.b64 %rd80, %rd79, %rd78, %p2; + selp.b32 %r62, 32, 0, %p2; + and.b64 %rd81, %rd80, 4294901760; + setp.ne.s64 %p3, %rd81, 0; + shr.u64 %rd82, %rd80, 16; + or.b32 %r63, %r62, 16; + selp.b64 %rd83, %rd82, %rd80, %p3; + selp.b32 %r64, %r63, %r62, %p3; + and.b64 %rd84, %rd83, 65280; + setp.ne.s64 %p4, %rd84, 0; + shr.u64 %rd85, %rd83, 8; + or.b32 %r65, %r64, 8; + selp.b64 %rd86, %rd85, %rd83, %p4; + selp.b32 %r66, %r65, %r64, %p4; + and.b64 %rd87, %rd86, 240; + setp.ne.s64 %p5, %rd87, 0; + shr.u64 %rd88, %rd86, 4; + or.b32 %r67, %r66, 4; + selp.b64 %rd89, %rd88, %rd86, %p5; + selp.b32 %r68, %r67, %r66, %p5; + and.b64 %rd90, %rd89, 12; + setp.ne.s64 %p6, %rd90, 0; + shr.u64 %rd91, %rd89, 2; + add.s32 %r69, %r68, 2; + selp.b64 %rd92, %rd91, %rd89, %p6; + selp.b32 %r70, %r69, %r68, %p6; + and.b64 %rd93, %rd92, 2; + shr.u64 %rd94, %rd93, 1; + cvt.u32.u64 %r71, %rd94; + add.s32 %r72, %r70, %r71; + mov.u64 %rd95, 1024; + 
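// ----------------------------------------------------------------------
// $L__BB0_1 splits the input for the recursive subtree hash: the
// selp/shr cascade above computes %r72 = floor(log2(((len - 1) >> 10) | 1)),
// so %rd96 = 1024 << %r72 below is the left-subtree byte count: the
// largest power-of-two number of full 1 KiB chunks strictly smaller
// than the input. A C sketch of the same computation, assuming it
// matches BLAKE3's left_subtree_len:
//
//   #include <stdint.h>
//   static uint64_t left_subtree_len(uint64_t input_len) {
//       // x = number of full chunks in (input_len - 1), forced nonzero
//       uint64_t x = ((input_len - 1) >> 10) | 1;
//       // round down to a power of two, then scale back to bytes
//       return 1024ull << (63 - __builtin_clzll(x));
//   }
//
// The two callseqs that follow recurse into blake3_compress_subtree_wide
// for the left and right halves, advancing the input pointer by %rd96
// and the chunk counter by %rd96 >> 10.
// ----------------------------------------------------------------------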
shl.b64 %rd96, %rd95, %r72; + sub.s64 %rd97, %rd171, %rd96; + add.s64 %rd98, %rd69, %rd96; + shr.u64 %rd99, %rd96, 10; + add.s64 %rd100, %rd99, %rd165; + setp.gt.u64 %p7, %rd96, 1024; + selp.b64 %rd101, 64, 32, %p7; + add.s64 %rd103, %rd149, %rd101; + cvt.u32.u16 %r73, %rs75; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd69; + .param .b64 param1; + st.param.b64 [param1+0], %rd96; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd165; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd149; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd5, [retval0+0]; + } // callseq 0 + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd98; + .param .b64 param1; + st.param.b64 [param1+0], %rd97; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd100; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd103; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd6, [retval0+0]; + } // callseq 1 + setp.eq.s64 %p8, %rd5, 1; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_2; + +$L__BB0_12: + mov.u64 %rd158, 0; + +$L__BB0_13: + add.s64 %rd117, %rd4, %rd158; + ld.local.u8 %rs78, [%rd117]; + add.s64 %rd118, %rd155, %rd158; + st.local.u8 [%rd118], %rs78; + add.s64 %rd158, %rd158, 1; + setp.lt.u64 %p15, %rd158, 64; + mov.u64 %rd151, 2; + @%p15 bra $L__BB0_13; + bra.uni $L__BB0_30; + +$L__BB0_2: + add.s64 %rd7, %rd6, %rd5; + setp.lt.u64 %p9, %rd7, 2; + mov.u64 %rd151, 0; + mov.u64 %rd152, %rd151; + @%p9 bra $L__BB0_5; + + mov.u64 %rd146, %rd153; + mov.u64 %rd147, %rd7; + +$L__BB0_4: + st.local.u64 [%rd146], %rd149; + add.s64 %rd151, %rd151, 1; + add.s64 %rd149, %rd149, 64; + add.s64 %rd152, %rd152, 2; + add.s64 %rd146, %rd146, 8; + add.s64 %rd147, %rd147, -2; + setp.gt.u64 %p10, %rd147, 1; + @%p10 bra $L__BB0_4; + +$L__BB0_5: + setp.eq.s64 %p11, %rd151, 0; + @%p11 bra $L__BB0_8; + + or.b16 %rs76, %rs75, 4; + cvt.u32.u16 %r1, %rs76; + mov.u64 %rd154, %rd151; + +$L__BB0_7: + ld.local.u64 %rd109, [%rd153]; + ld.u8 %r74, [%rd109]; + ld.u8 %r75, [%rd109+1]; + prmt.b32 %r76, %r75, %r74, 30212; + ld.u8 %r77, [%rd109+2]; + prmt.b32 %r78, %r77, %r76, 28756; + ld.u8 %r79, [%rd109+3]; + prmt.b32 %r80, %r79, %r78, 1620; + ld.u8 %r81, [%rd109+4]; + ld.u8 %r82, [%rd109+5]; + prmt.b32 %r83, %r82, %r81, 30212; + ld.u8 %r84, [%rd109+6]; + prmt.b32 %r85, %r84, %r83, 28756; + ld.u8 %r86, [%rd109+7]; + prmt.b32 %r87, %r86, %r85, 1620; + ld.u8 %r88, [%rd109+8]; + ld.u8 %r89, [%rd109+9]; + prmt.b32 %r90, %r89, %r88, 30212; + ld.u8 %r91, [%rd109+10]; + prmt.b32 %r92, %r91, %r90, 28756; + ld.u8 %r93, [%rd109+11]; + prmt.b32 %r94, %r93, %r92, 1620; + ld.u8 %r95, [%rd109+12]; + ld.u8 %r96, [%rd109+13]; + prmt.b32 %r97, %r96, %r95, 30212; + ld.u8 %r98, [%rd109+14]; + prmt.b32 %r99, %r98, %r97, 28756; + ld.u8 %r100, [%rd109+15]; + prmt.b32 %r101, %r100, %r99, 1620; + ld.u8 %r102, [%rd109+16]; + ld.u8 %r103, [%rd109+17]; + prmt.b32 %r104, %r103, %r102, 30212; + ld.u8 %r105, 
[%rd109+18]; + prmt.b32 %r106, %r105, %r104, 28756; + ld.u8 %r107, [%rd109+19]; + prmt.b32 %r108, %r107, %r106, 1620; + ld.u8 %r109, [%rd109+20]; + ld.u8 %r110, [%rd109+21]; + prmt.b32 %r111, %r110, %r109, 30212; + ld.u8 %r112, [%rd109+22]; + prmt.b32 %r113, %r112, %r111, 28756; + ld.u8 %r114, [%rd109+23]; + prmt.b32 %r115, %r114, %r113, 1620; + ld.u8 %r116, [%rd109+24]; + ld.u8 %r117, [%rd109+25]; + prmt.b32 %r118, %r117, %r116, 30212; + ld.u8 %r119, [%rd109+26]; + prmt.b32 %r120, %r119, %r118, 28756; + ld.u8 %r121, [%rd109+27]; + prmt.b32 %r122, %r121, %r120, 1620; + ld.u8 %r123, [%rd109+28]; + ld.u8 %r124, [%rd109+29]; + prmt.b32 %r125, %r124, %r123, 30212; + ld.u8 %r126, [%rd109+30]; + prmt.b32 %r127, %r126, %r125, 28756; + ld.u8 %r128, [%rd109+31]; + prmt.b32 %r129, %r128, %r127, 1620; + ld.u8 %r130, [%rd109+32]; + ld.u8 %r131, [%rd109+33]; + prmt.b32 %r132, %r131, %r130, 30212; + ld.u8 %r133, [%rd109+34]; + prmt.b32 %r134, %r133, %r132, 28756; + ld.u8 %r135, [%rd109+35]; + prmt.b32 %r136, %r135, %r134, 1620; + ld.u8 %r137, [%rd109+36]; + ld.u8 %r138, [%rd109+37]; + prmt.b32 %r139, %r138, %r137, 30212; + ld.u8 %r140, [%rd109+38]; + prmt.b32 %r141, %r140, %r139, 28756; + ld.u8 %r142, [%rd109+39]; + prmt.b32 %r143, %r142, %r141, 1620; + ld.u8 %r144, [%rd109+40]; + ld.u8 %r145, [%rd109+41]; + prmt.b32 %r146, %r145, %r144, 30212; + ld.u8 %r147, [%rd109+42]; + prmt.b32 %r148, %r147, %r146, 28756; + ld.u8 %r149, [%rd109+43]; + prmt.b32 %r150, %r149, %r148, 1620; + ld.u8 %r151, [%rd109+44]; + ld.u8 %r152, [%rd109+45]; + prmt.b32 %r153, %r152, %r151, 30212; + ld.u8 %r154, [%rd109+46]; + prmt.b32 %r155, %r154, %r153, 28756; + ld.u8 %r156, [%rd109+47]; + prmt.b32 %r157, %r156, %r155, 1620; + ld.u8 %r158, [%rd109+48]; + ld.u8 %r159, [%rd109+49]; + prmt.b32 %r160, %r159, %r158, 30212; + ld.u8 %r161, [%rd109+50]; + prmt.b32 %r162, %r161, %r160, 28756; + ld.u8 %r163, [%rd109+51]; + prmt.b32 %r164, %r163, %r162, 1620; + ld.u8 %r165, [%rd109+52]; + ld.u8 %r166, [%rd109+53]; + prmt.b32 %r167, %r166, %r165, 30212; + ld.u8 %r168, [%rd109+54]; + prmt.b32 %r169, %r168, %r167, 28756; + ld.u8 %r170, [%rd109+55]; + prmt.b32 %r171, %r170, %r169, 1620; + ld.u8 %r172, [%rd109+56]; + ld.u8 %r173, [%rd109+57]; + prmt.b32 %r174, %r173, %r172, 30212; + ld.u8 %r175, [%rd109+58]; + prmt.b32 %r176, %r175, %r174, 28756; + ld.u8 %r177, [%rd109+59]; + prmt.b32 %r178, %r177, %r176, 1620; + ld.u8 %r179, [%rd109+60]; + ld.u8 %r180, [%rd109+61]; + prmt.b32 %r181, %r180, %r179, 30212; + ld.u8 %r182, [%rd109+62]; + prmt.b32 %r183, %r182, %r181, 28756; + ld.u8 %r184, [%rd109+63]; + prmt.b32 %r185, %r184, %r183, 1620; + ld.local.u8 %r186, [%rd2+16]; + ld.local.u8 %r187, [%rd2+17]; + prmt.b32 %r188, %r187, %r186, 30212; + ld.local.u8 %r189, [%rd2+18]; + ld.local.u8 %r190, [%rd2+19]; + prmt.b32 %r191, %r190, %r189, 30212; + prmt.b32 %r192, %r191, %r188, 4180; + ld.local.u8 %r193, [%rd2]; + ld.local.u8 %r194, [%rd2+1]; + prmt.b32 %r195, %r194, %r193, 30212; + ld.local.u8 %r196, [%rd2+2]; + ld.local.u8 %r197, [%rd2+3]; + prmt.b32 %r198, %r197, %r196, 30212; + prmt.b32 %r199, %r198, %r195, 4180; + add.s32 %r200, %r192, %r199; + add.s32 %r201, %r200, %r80; + shf.l.wrap.b32 %r202, %r201, %r201, 16; + add.s32 %r203, %r202, 1779033703; + xor.b32 %r204, %r203, %r192; + shf.l.wrap.b32 %r205, %r204, %r204, 20; + add.s32 %r206, %r87, %r201; + add.s32 %r207, %r206, %r205; + xor.b32 %r208, %r207, %r202; + shf.l.wrap.b32 %r209, %r208, %r208, 24; + add.s32 %r210, %r209, %r203; + xor.b32 %r211, %r210, %r205; + shf.l.wrap.b32 %r212, %r211, %r211, 
25; + ld.local.u8 %r213, [%rd2+20]; + ld.local.u8 %r214, [%rd2+21]; + prmt.b32 %r215, %r214, %r213, 30212; + ld.local.u8 %r216, [%rd2+22]; + ld.local.u8 %r217, [%rd2+23]; + prmt.b32 %r218, %r217, %r216, 30212; + prmt.b32 %r219, %r218, %r215, 4180; + ld.local.u8 %r220, [%rd2+4]; + ld.local.u8 %r221, [%rd2+5]; + prmt.b32 %r222, %r221, %r220, 30212; + ld.local.u8 %r223, [%rd2+6]; + ld.local.u8 %r224, [%rd2+7]; + prmt.b32 %r225, %r224, %r223, 30212; + prmt.b32 %r226, %r225, %r222, 4180; + add.s32 %r227, %r219, %r226; + add.s32 %r228, %r227, %r94; + shf.l.wrap.b32 %r229, %r228, %r228, 16; + add.s32 %r230, %r229, -1150833019; + xor.b32 %r231, %r230, %r219; + shf.l.wrap.b32 %r232, %r231, %r231, 20; + add.s32 %r233, %r101, %r228; + add.s32 %r234, %r233, %r232; + xor.b32 %r235, %r234, %r229; + shf.l.wrap.b32 %r236, %r235, %r235, 24; + add.s32 %r237, %r236, %r230; + xor.b32 %r238, %r237, %r232; + shf.l.wrap.b32 %r239, %r238, %r238, 25; + ld.local.u8 %r240, [%rd2+24]; + ld.local.u8 %r241, [%rd2+25]; + prmt.b32 %r242, %r241, %r240, 30212; + ld.local.u8 %r243, [%rd2+26]; + ld.local.u8 %r244, [%rd2+27]; + prmt.b32 %r245, %r244, %r243, 30212; + prmt.b32 %r246, %r245, %r242, 4180; + ld.local.u8 %r247, [%rd2+8]; + ld.local.u8 %r248, [%rd2+9]; + prmt.b32 %r249, %r248, %r247, 30212; + ld.local.u8 %r250, [%rd2+10]; + ld.local.u8 %r251, [%rd2+11]; + prmt.b32 %r252, %r251, %r250, 30212; + prmt.b32 %r253, %r252, %r249, 4180; + add.s32 %r254, %r246, %r253; + add.s32 %r255, %r254, %r108; + shr.u32 %r256, %r255, 16; + shl.b32 %r257, %r255, 16; + xor.b32 %r258, %r257, 4194304; + or.b32 %r259, %r258, %r256; + add.s32 %r260, %r259, 1013904242; + xor.b32 %r261, %r260, %r246; + shf.l.wrap.b32 %r262, %r261, %r261, 20; + add.s32 %r263, %r115, %r255; + add.s32 %r264, %r263, %r262; + xor.b32 %r265, %r264, %r259; + shf.l.wrap.b32 %r266, %r265, %r265, 24; + add.s32 %r267, %r266, %r260; + xor.b32 %r268, %r267, %r262; + shf.l.wrap.b32 %r269, %r268, %r268, 25; + ld.local.u8 %r270, [%rd2+28]; + ld.local.u8 %r271, [%rd2+29]; + prmt.b32 %r272, %r271, %r270, 30212; + ld.local.u8 %r273, [%rd2+30]; + ld.local.u8 %r274, [%rd2+31]; + prmt.b32 %r275, %r274, %r273, 30212; + prmt.b32 %r276, %r275, %r272, 4180; + ld.local.u8 %r277, [%rd2+12]; + ld.local.u8 %r278, [%rd2+13]; + prmt.b32 %r279, %r278, %r277, 30212; + ld.local.u8 %r280, [%rd2+14]; + ld.local.u8 %r281, [%rd2+15]; + prmt.b32 %r282, %r281, %r280, 30212; + prmt.b32 %r283, %r282, %r279, 4180; + add.s32 %r284, %r276, %r283; + add.s32 %r285, %r284, %r122; + xor.b32 %r286, %r285, %r1; + shr.u32 %r287, %r285, 16; + shl.b32 %r288, %r286, 16; + or.b32 %r289, %r288, %r287; + add.s32 %r290, %r289, -1521486534; + xor.b32 %r291, %r290, %r276; + shf.l.wrap.b32 %r292, %r291, %r291, 20; + add.s32 %r293, %r129, %r285; + add.s32 %r294, %r293, %r292; + xor.b32 %r295, %r294, %r289; + shf.l.wrap.b32 %r296, %r295, %r295, 24; + add.s32 %r297, %r296, %r290; + xor.b32 %r298, %r297, %r292; + shf.l.wrap.b32 %r299, %r298, %r298, 25; + add.s32 %r300, %r239, %r207; + add.s32 %r301, %r300, %r136; + xor.b32 %r302, %r296, %r301; + shf.l.wrap.b32 %r303, %r302, %r302, 16; + add.s32 %r304, %r303, %r267; + xor.b32 %r305, %r304, %r239; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r143, %r301; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + add.s32 %r314, %r269, %r234; + add.s32 %r315, %r314, %r150; + xor.b32 %r316, %r315, %r209; + 
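+	// annotation (not in the generated PTX): the two callseq blocks above recurse into blake3_compress_subtree_wide on the left and right halves; the straight-line add/xor/shf.l.wrap stream here is one fully unrolled BLAKE3 compression — left-rotates by 16, 20, 24, 25 are the spec's rotr 16, 12, 8, 7, and the prmt selectors (30212/28756/1620) appear to gather four consecutive bytes into a little-endian u32 message word.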
shf.l.wrap.b32 %r317, %r316, %r316, 16; + add.s32 %r318, %r317, %r297; + xor.b32 %r319, %r318, %r269; + shf.l.wrap.b32 %r320, %r319, %r319, 20; + add.s32 %r321, %r157, %r315; + add.s32 %r322, %r321, %r320; + xor.b32 %r323, %r322, %r317; + shf.l.wrap.b32 %r324, %r323, %r323, 24; + add.s32 %r325, %r324, %r318; + xor.b32 %r326, %r325, %r320; + shf.l.wrap.b32 %r327, %r326, %r326, 25; + add.s32 %r328, %r299, %r264; + add.s32 %r329, %r328, %r164; + xor.b32 %r330, %r329, %r236; + shf.l.wrap.b32 %r331, %r330, %r330, 16; + add.s32 %r332, %r331, %r210; + xor.b32 %r333, %r332, %r299; + shf.l.wrap.b32 %r334, %r333, %r333, 20; + add.s32 %r335, %r171, %r329; + add.s32 %r336, %r335, %r334; + xor.b32 %r337, %r336, %r331; + shf.l.wrap.b32 %r338, %r337, %r337, 24; + add.s32 %r339, %r338, %r332; + xor.b32 %r340, %r339, %r334; + shf.l.wrap.b32 %r341, %r340, %r340, 25; + add.s32 %r342, %r294, %r212; + add.s32 %r343, %r342, %r178; + xor.b32 %r344, %r343, %r266; + shf.l.wrap.b32 %r345, %r344, %r344, 16; + add.s32 %r346, %r345, %r237; + xor.b32 %r347, %r346, %r212; + shf.l.wrap.b32 %r348, %r347, %r347, 20; + add.s32 %r349, %r185, %r343; + add.s32 %r350, %r349, %r348; + xor.b32 %r351, %r350, %r345; + shf.l.wrap.b32 %r352, %r351, %r351, 24; + add.s32 %r353, %r352, %r346; + xor.b32 %r354, %r353, %r348; + shf.l.wrap.b32 %r355, %r354, %r354, 25; + add.s32 %r356, %r308, %r94; + add.s32 %r357, %r356, %r355; + xor.b32 %r358, %r357, %r324; + shf.l.wrap.b32 %r359, %r358, %r358, 16; + add.s32 %r360, %r359, %r339; + xor.b32 %r361, %r360, %r355; + shf.l.wrap.b32 %r362, %r361, %r361, 20; + add.s32 %r363, %r357, %r122; + add.s32 %r364, %r363, %r362; + xor.b32 %r365, %r364, %r359; + shf.l.wrap.b32 %r366, %r365, %r365, 24; + add.s32 %r367, %r366, %r360; + xor.b32 %r368, %r367, %r362; + shf.l.wrap.b32 %r369, %r368, %r368, 25; + add.s32 %r370, %r322, %r101; + add.s32 %r371, %r370, %r313; + xor.b32 %r372, %r338, %r371; + shf.l.wrap.b32 %r373, %r372, %r372, 16; + add.s32 %r374, %r353, %r373; + xor.b32 %r375, %r374, %r313; + shf.l.wrap.b32 %r376, %r375, %r375, 20; + add.s32 %r377, %r371, %r150; + add.s32 %r378, %r377, %r376; + xor.b32 %r379, %r378, %r373; + shf.l.wrap.b32 %r380, %r379, %r379, 24; + add.s32 %r381, %r380, %r374; + xor.b32 %r382, %r381, %r376; + shf.l.wrap.b32 %r383, %r382, %r382, 25; + add.s32 %r384, %r327, %r129; + add.s32 %r385, %r384, %r336; + xor.b32 %r386, %r352, %r385; + shf.l.wrap.b32 %r387, %r386, %r386, 16; + add.s32 %r388, %r387, %r311; + xor.b32 %r389, %r388, %r327; + shf.l.wrap.b32 %r390, %r389, %r389, 20; + add.s32 %r391, %r385, %r80; + add.s32 %r392, %r391, %r390; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 24; + add.s32 %r395, %r394, %r388; + xor.b32 %r396, %r395, %r390; + shf.l.wrap.b32 %r397, %r396, %r396, 25; + add.s32 %r398, %r341, %r108; + add.s32 %r399, %r398, %r350; + xor.b32 %r400, %r399, %r310; + shf.l.wrap.b32 %r401, %r400, %r400, 16; + add.s32 %r402, %r401, %r325; + xor.b32 %r403, %r402, %r341; + shf.l.wrap.b32 %r404, %r403, %r403, 20; + add.s32 %r405, %r399, %r171; + add.s32 %r406, %r405, %r404; + xor.b32 %r407, %r406, %r401; + shf.l.wrap.b32 %r408, %r407, %r407, 24; + add.s32 %r409, %r408, %r402; + xor.b32 %r410, %r409, %r404; + shf.l.wrap.b32 %r411, %r410, %r410, 25; + add.s32 %r412, %r383, %r87; + add.s32 %r413, %r412, %r364; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 16; + add.s32 %r416, %r415, %r395; + xor.b32 %r417, %r416, %r383; + shf.l.wrap.b32 %r418, %r417, %r417, 20; + add.s32 %r419, %r413, %r157; + add.s32 %r420, %r419, %r418; + 
xor.b32 %r421, %r420, %r415; + shf.l.wrap.b32 %r422, %r421, %r421, 24; + add.s32 %r423, %r422, %r416; + xor.b32 %r424, %r423, %r418; + shf.l.wrap.b32 %r425, %r424, %r424, 25; + add.s32 %r426, %r378, %r164; + add.s32 %r427, %r426, %r397; + xor.b32 %r428, %r366, %r427; + shf.l.wrap.b32 %r429, %r428, %r428, 16; + add.s32 %r430, %r429, %r409; + xor.b32 %r431, %r430, %r397; + shf.l.wrap.b32 %r432, %r431, %r431, 20; + add.s32 %r433, %r427, %r115; + add.s32 %r434, %r433, %r432; + xor.b32 %r435, %r434, %r429; + shf.l.wrap.b32 %r436, %r435, %r435, 24; + add.s32 %r437, %r436, %r430; + xor.b32 %r438, %r437, %r432; + shf.l.wrap.b32 %r439, %r438, %r438, 25; + add.s32 %r440, %r392, %r143; + add.s32 %r441, %r440, %r411; + xor.b32 %r442, %r441, %r380; + shf.l.wrap.b32 %r443, %r442, %r442, 16; + add.s32 %r444, %r443, %r367; + xor.b32 %r445, %r444, %r411; + shf.l.wrap.b32 %r446, %r445, %r445, 20; + add.s32 %r447, %r441, %r178; + add.s32 %r448, %r447, %r446; + xor.b32 %r449, %r448, %r443; + shf.l.wrap.b32 %r450, %r449, %r449, 24; + add.s32 %r451, %r450, %r444; + xor.b32 %r452, %r451, %r446; + shf.l.wrap.b32 %r453, %r452, %r452, 25; + add.s32 %r454, %r406, %r185; + add.s32 %r455, %r454, %r369; + xor.b32 %r456, %r455, %r394; + shf.l.wrap.b32 %r457, %r456, %r456, 16; + add.s32 %r458, %r457, %r381; + xor.b32 %r459, %r458, %r369; + shf.l.wrap.b32 %r460, %r459, %r459, 20; + add.s32 %r461, %r455, %r136; + add.s32 %r462, %r461, %r460; + xor.b32 %r463, %r462, %r457; + shf.l.wrap.b32 %r464, %r463, %r463, 24; + add.s32 %r465, %r464, %r458; + xor.b32 %r466, %r465, %r460; + shf.l.wrap.b32 %r467, %r466, %r466, 25; + add.s32 %r468, %r420, %r101; + add.s32 %r469, %r468, %r467; + xor.b32 %r470, %r469, %r436; + shf.l.wrap.b32 %r471, %r470, %r470, 16; + add.s32 %r472, %r471, %r451; + xor.b32 %r473, %r472, %r467; + shf.l.wrap.b32 %r474, %r473, %r473, 20; + add.s32 %r475, %r469, %r108; + add.s32 %r476, %r475, %r474; + xor.b32 %r477, %r476, %r471; + shf.l.wrap.b32 %r478, %r477, %r477, 24; + add.s32 %r479, %r478, %r472; + xor.b32 %r480, %r479, %r474; + shf.l.wrap.b32 %r481, %r480, %r480, 25; + add.s32 %r482, %r434, %r150; + add.s32 %r483, %r482, %r425; + xor.b32 %r484, %r483, %r450; + shf.l.wrap.b32 %r485, %r484, %r484, 16; + add.s32 %r486, %r485, %r465; + xor.b32 %r487, %r486, %r425; + shf.l.wrap.b32 %r488, %r487, %r487, 20; + add.s32 %r489, %r483, %r164; + add.s32 %r490, %r489, %r488; + xor.b32 %r491, %r490, %r485; + shf.l.wrap.b32 %r492, %r491, %r491, 24; + add.s32 %r493, %r492, %r486; + xor.b32 %r494, %r493, %r488; + shf.l.wrap.b32 %r495, %r494, %r494, 25; + add.s32 %r496, %r448, %r171; + add.s32 %r497, %r496, %r439; + xor.b32 %r498, %r464, %r497; + shf.l.wrap.b32 %r499, %r498, %r498, 16; + add.s32 %r500, %r499, %r423; + xor.b32 %r501, %r500, %r439; + shf.l.wrap.b32 %r502, %r501, %r501, 20; + add.s32 %r503, %r497, %r94; + add.s32 %r504, %r503, %r502; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 24; + add.s32 %r507, %r506, %r500; + xor.b32 %r508, %r507, %r502; + shf.l.wrap.b32 %r509, %r508, %r508, 25; + add.s32 %r510, %r453, %r129; + add.s32 %r511, %r510, %r462; + xor.b32 %r512, %r511, %r422; + shf.l.wrap.b32 %r513, %r512, %r512, 16; + add.s32 %r514, %r513, %r437; + xor.b32 %r515, %r514, %r453; + shf.l.wrap.b32 %r516, %r515, %r515, 20; + add.s32 %r517, %r511, %r178; + add.s32 %r518, %r517, %r516; + xor.b32 %r519, %r518, %r513; + shf.l.wrap.b32 %r520, %r519, %r519, 24; + add.s32 %r521, %r520, %r514; + xor.b32 %r522, %r521, %r516; + shf.l.wrap.b32 %r523, %r522, %r522, 25; + add.s32 %r524, %r495, %r122; + 
add.s32 %r525, %r524, %r476; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 16; + add.s32 %r528, %r527, %r507; + xor.b32 %r529, %r528, %r495; + shf.l.wrap.b32 %r530, %r529, %r529, 20; + add.s32 %r531, %r525, %r115; + add.s32 %r532, %r531, %r530; + xor.b32 %r533, %r532, %r527; + shf.l.wrap.b32 %r534, %r533, %r533, 24; + add.s32 %r535, %r534, %r528; + xor.b32 %r536, %r535, %r530; + shf.l.wrap.b32 %r537, %r536, %r536, 25; + add.s32 %r538, %r490, %r143; + add.s32 %r539, %r538, %r509; + xor.b32 %r540, %r478, %r539; + shf.l.wrap.b32 %r541, %r540, %r540, 16; + add.s32 %r542, %r541, %r521; + xor.b32 %r543, %r542, %r509; + shf.l.wrap.b32 %r544, %r543, %r543, 20; + add.s32 %r545, %r539, %r80; + add.s32 %r546, %r545, %r544; + xor.b32 %r547, %r546, %r541; + shf.l.wrap.b32 %r548, %r547, %r547, 24; + add.s32 %r549, %r548, %r542; + xor.b32 %r550, %r549, %r544; + shf.l.wrap.b32 %r551, %r550, %r550, 25; + add.s32 %r552, %r504, %r157; + add.s32 %r553, %r552, %r523; + xor.b32 %r554, %r553, %r492; + shf.l.wrap.b32 %r555, %r554, %r554, 16; + add.s32 %r556, %r555, %r479; + xor.b32 %r557, %r556, %r523; + shf.l.wrap.b32 %r558, %r557, %r557, 20; + add.s32 %r559, %r553, %r185; + add.s32 %r560, %r559, %r558; + xor.b32 %r561, %r560, %r555; + shf.l.wrap.b32 %r562, %r561, %r561, 24; + add.s32 %r563, %r562, %r556; + xor.b32 %r564, %r563, %r558; + shf.l.wrap.b32 %r565, %r564, %r564, 25; + add.s32 %r566, %r518, %r136; + add.s32 %r567, %r566, %r481; + xor.b32 %r568, %r567, %r506; + shf.l.wrap.b32 %r569, %r568, %r568, 16; + add.s32 %r570, %r569, %r493; + xor.b32 %r571, %r570, %r481; + shf.l.wrap.b32 %r572, %r571, %r571, 20; + add.s32 %r573, %r567, %r87; + add.s32 %r574, %r573, %r572; + xor.b32 %r575, %r574, %r569; + shf.l.wrap.b32 %r576, %r575, %r575, 24; + add.s32 %r577, %r576, %r570; + xor.b32 %r578, %r577, %r572; + shf.l.wrap.b32 %r579, %r578, %r578, 25; + add.s32 %r580, %r532, %r150; + add.s32 %r581, %r580, %r579; + xor.b32 %r582, %r581, %r548; + shf.l.wrap.b32 %r583, %r582, %r582, 16; + add.s32 %r584, %r583, %r563; + xor.b32 %r585, %r584, %r579; + shf.l.wrap.b32 %r586, %r585, %r585, 20; + add.s32 %r587, %r581, %r129; + add.s32 %r588, %r587, %r586; + xor.b32 %r589, %r588, %r583; + shf.l.wrap.b32 %r590, %r589, %r589, 24; + add.s32 %r591, %r590, %r584; + xor.b32 %r592, %r591, %r586; + shf.l.wrap.b32 %r593, %r592, %r592, 25; + add.s32 %r594, %r546, %r164; + add.s32 %r595, %r594, %r537; + xor.b32 %r596, %r595, %r562; + shf.l.wrap.b32 %r597, %r596, %r596, 16; + add.s32 %r598, %r597, %r577; + xor.b32 %r599, %r598, %r537; + shf.l.wrap.b32 %r600, %r599, %r599, 20; + add.s32 %r601, %r595, %r143; + add.s32 %r602, %r601, %r600; + xor.b32 %r603, %r602, %r597; + shf.l.wrap.b32 %r604, %r603, %r603, 24; + add.s32 %r605, %r604, %r598; + xor.b32 %r606, %r605, %r600; + shf.l.wrap.b32 %r607, %r606, %r606, 25; + add.s32 %r608, %r560, %r178; + add.s32 %r609, %r608, %r551; + xor.b32 %r610, %r576, %r609; + shf.l.wrap.b32 %r611, %r610, %r610, 16; + add.s32 %r612, %r611, %r535; + xor.b32 %r613, %r612, %r551; + shf.l.wrap.b32 %r614, %r613, %r613, 20; + add.s32 %r615, %r609, %r101; + add.s32 %r616, %r615, %r614; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 24; + add.s32 %r619, %r618, %r612; + xor.b32 %r620, %r619, %r614; + shf.l.wrap.b32 %r621, %r620, %r620, 25; + add.s32 %r622, %r565, %r171; + add.s32 %r623, %r622, %r574; + xor.b32 %r624, %r623, %r534; + shf.l.wrap.b32 %r625, %r624, %r624, 16; + add.s32 %r626, %r625, %r549; + xor.b32 %r627, %r626, %r565; + shf.l.wrap.b32 %r628, %r627, %r627, 20; + 
add.s32 %r629, %r623, %r185; + add.s32 %r630, %r629, %r628; + xor.b32 %r631, %r630, %r625; + shf.l.wrap.b32 %r632, %r631, %r631, 24; + add.s32 %r633, %r632, %r626; + xor.b32 %r634, %r633, %r628; + shf.l.wrap.b32 %r635, %r634, %r634, 25; + add.s32 %r636, %r607, %r108; + add.s32 %r637, %r636, %r588; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 16; + add.s32 %r640, %r639, %r619; + xor.b32 %r641, %r640, %r607; + shf.l.wrap.b32 %r642, %r641, %r641, 20; + add.s32 %r643, %r637, %r80; + add.s32 %r644, %r643, %r642; + xor.b32 %r645, %r644, %r639; + shf.l.wrap.b32 %r646, %r645, %r645, 24; + add.s32 %r647, %r646, %r640; + xor.b32 %r648, %r647, %r642; + shf.l.wrap.b32 %r649, %r648, %r648, 25; + add.s32 %r650, %r602, %r157; + add.s32 %r651, %r650, %r621; + xor.b32 %r652, %r590, %r651; + shf.l.wrap.b32 %r653, %r652, %r652, 16; + add.s32 %r654, %r653, %r633; + xor.b32 %r655, %r654, %r621; + shf.l.wrap.b32 %r656, %r655, %r655, 20; + add.s32 %r657, %r651, %r94; + add.s32 %r658, %r657, %r656; + xor.b32 %r659, %r658, %r653; + shf.l.wrap.b32 %r660, %r659, %r659, 24; + add.s32 %r661, %r660, %r654; + xor.b32 %r662, %r661, %r656; + shf.l.wrap.b32 %r663, %r662, %r662, 25; + add.s32 %r664, %r616, %r115; + add.s32 %r665, %r664, %r635; + xor.b32 %r666, %r665, %r604; + shf.l.wrap.b32 %r667, %r666, %r666, 16; + add.s32 %r668, %r667, %r591; + xor.b32 %r669, %r668, %r635; + shf.l.wrap.b32 %r670, %r669, %r669, 20; + add.s32 %r671, %r665, %r136; + add.s32 %r672, %r671, %r670; + xor.b32 %r673, %r672, %r667; + shf.l.wrap.b32 %r674, %r673, %r673, 24; + add.s32 %r675, %r674, %r668; + xor.b32 %r676, %r675, %r670; + shf.l.wrap.b32 %r677, %r676, %r676, 25; + add.s32 %r678, %r630, %r87; + add.s32 %r679, %r678, %r593; + xor.b32 %r680, %r679, %r618; + shf.l.wrap.b32 %r681, %r680, %r680, 16; + add.s32 %r682, %r681, %r605; + xor.b32 %r683, %r682, %r593; + shf.l.wrap.b32 %r684, %r683, %r683, 20; + add.s32 %r685, %r679, %r122; + add.s32 %r686, %r685, %r684; + xor.b32 %r687, %r686, %r681; + shf.l.wrap.b32 %r688, %r687, %r687, 24; + add.s32 %r689, %r688, %r682; + xor.b32 %r690, %r689, %r684; + shf.l.wrap.b32 %r691, %r690, %r690, 25; + add.s32 %r692, %r644, %r164; + add.s32 %r693, %r692, %r691; + xor.b32 %r694, %r693, %r660; + shf.l.wrap.b32 %r695, %r694, %r694, 16; + add.s32 %r696, %r695, %r675; + xor.b32 %r697, %r696, %r691; + shf.l.wrap.b32 %r698, %r697, %r697, 20; + add.s32 %r699, %r693, %r171; + add.s32 %r700, %r699, %r698; + xor.b32 %r701, %r700, %r695; + shf.l.wrap.b32 %r702, %r701, %r701, 24; + add.s32 %r703, %r702, %r696; + xor.b32 %r704, %r703, %r698; + shf.l.wrap.b32 %r705, %r704, %r704, 25; + add.s32 %r706, %r658, %r143; + add.s32 %r707, %r706, %r649; + xor.b32 %r708, %r707, %r674; + shf.l.wrap.b32 %r709, %r708, %r708, 16; + add.s32 %r710, %r709, %r689; + xor.b32 %r711, %r710, %r649; + shf.l.wrap.b32 %r712, %r711, %r711, 20; + add.s32 %r713, %r707, %r157; + add.s32 %r714, %r713, %r712; + xor.b32 %r715, %r714, %r709; + shf.l.wrap.b32 %r716, %r715, %r715, 24; + add.s32 %r717, %r716, %r710; + xor.b32 %r718, %r717, %r712; + shf.l.wrap.b32 %r719, %r718, %r718, 25; + add.s32 %r720, %r672, %r185; + add.s32 %r721, %r720, %r663; + xor.b32 %r722, %r688, %r721; + shf.l.wrap.b32 %r723, %r722, %r722, 16; + add.s32 %r724, %r723, %r647; + xor.b32 %r725, %r724, %r663; + shf.l.wrap.b32 %r726, %r725, %r725, 20; + add.s32 %r727, %r721, %r150; + add.s32 %r728, %r727, %r726; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 24; + add.s32 %r731, %r730, %r724; + xor.b32 %r732, %r731, %r726; + shf.l.wrap.b32 
%r733, %r732, %r732, 25; + add.s32 %r734, %r677, %r178; + add.s32 %r735, %r734, %r686; + xor.b32 %r736, %r735, %r646; + shf.l.wrap.b32 %r737, %r736, %r736, 16; + add.s32 %r738, %r737, %r661; + xor.b32 %r739, %r738, %r677; + shf.l.wrap.b32 %r740, %r739, %r739, 20; + add.s32 %r741, %r735, %r136; + add.s32 %r742, %r741, %r740; + xor.b32 %r743, %r742, %r737; + shf.l.wrap.b32 %r744, %r743, %r743, 24; + add.s32 %r745, %r744, %r738; + xor.b32 %r746, %r745, %r740; + shf.l.wrap.b32 %r747, %r746, %r746, 25; + add.s32 %r748, %r719, %r129; + add.s32 %r749, %r748, %r700; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 16; + add.s32 %r752, %r751, %r731; + xor.b32 %r753, %r752, %r719; + shf.l.wrap.b32 %r754, %r753, %r753, 20; + add.s32 %r755, %r749, %r94; + add.s32 %r756, %r755, %r754; + xor.b32 %r757, %r756, %r751; + shf.l.wrap.b32 %r758, %r757, %r757, 24; + add.s32 %r759, %r758, %r752; + xor.b32 %r760, %r759, %r754; + shf.l.wrap.b32 %r761, %r760, %r760, 25; + add.s32 %r762, %r714, %r115; + add.s32 %r763, %r762, %r733; + xor.b32 %r764, %r702, %r763; + shf.l.wrap.b32 %r765, %r764, %r764, 16; + add.s32 %r766, %r765, %r745; + xor.b32 %r767, %r766, %r733; + shf.l.wrap.b32 %r768, %r767, %r767, 20; + add.s32 %r769, %r763, %r101; + add.s32 %r770, %r769, %r768; + xor.b32 %r771, %r770, %r765; + shf.l.wrap.b32 %r772, %r771, %r771, 24; + add.s32 %r773, %r772, %r766; + xor.b32 %r774, %r773, %r768; + shf.l.wrap.b32 %r775, %r774, %r774, 25; + add.s32 %r776, %r728, %r80; + add.s32 %r777, %r776, %r747; + xor.b32 %r778, %r777, %r716; + shf.l.wrap.b32 %r779, %r778, %r778, 16; + add.s32 %r780, %r779, %r703; + xor.b32 %r781, %r780, %r747; + shf.l.wrap.b32 %r782, %r781, %r781, 20; + add.s32 %r783, %r777, %r87; + add.s32 %r784, %r783, %r782; + xor.b32 %r785, %r784, %r779; + shf.l.wrap.b32 %r786, %r785, %r785, 24; + add.s32 %r787, %r786, %r780; + xor.b32 %r788, %r787, %r782; + shf.l.wrap.b32 %r789, %r788, %r788, 25; + add.s32 %r790, %r742, %r122; + add.s32 %r791, %r790, %r705; + xor.b32 %r792, %r791, %r730; + shf.l.wrap.b32 %r793, %r792, %r792, 16; + add.s32 %r794, %r793, %r717; + xor.b32 %r795, %r794, %r705; + shf.l.wrap.b32 %r796, %r795, %r795, 20; + add.s32 %r797, %r791, %r108; + add.s32 %r798, %r797, %r796; + xor.b32 %r799, %r798, %r793; + shf.l.wrap.b32 %r800, %r799, %r799, 24; + add.s32 %r801, %r800, %r794; + xor.b32 %r802, %r801, %r796; + shf.l.wrap.b32 %r803, %r802, %r802, 25; + add.s32 %r804, %r756, %r143; + add.s32 %r805, %r804, %r803; + xor.b32 %r806, %r805, %r772; + shf.l.wrap.b32 %r807, %r806, %r806, 16; + add.s32 %r808, %r807, %r787; + xor.b32 %r809, %r808, %r803; + shf.l.wrap.b32 %r810, %r809, %r809, 20; + add.s32 %r811, %r805, %r178; + add.s32 %r812, %r811, %r810; + xor.b32 %r813, %r812, %r807; + shf.l.wrap.b32 %r814, %r813, %r813, 24; + add.s32 %r815, %r814, %r808; + xor.b32 %r816, %r815, %r810; + shf.l.wrap.b32 %r817, %r816, %r816, 25; + add.s32 %r818, %r770, %r157; + add.s32 %r819, %r818, %r761; + xor.b32 %r820, %r819, %r786; + shf.l.wrap.b32 %r821, %r820, %r820, 16; + add.s32 %r822, %r821, %r801; + xor.b32 %r823, %r822, %r761; + shf.l.wrap.b32 %r824, %r823, %r823, 20; + add.s32 %r825, %r819, %r115; + add.s32 %r826, %r825, %r824; + xor.b32 %r827, %r826, %r821; + shf.l.wrap.b32 %r828, %r827, %r827, 24; + add.s32 %r829, %r828, %r822; + xor.b32 %r830, %r829, %r824; + shf.l.wrap.b32 %r831, %r830, %r830, 25; + add.s32 %r832, %r784, %r136; + add.s32 %r833, %r832, %r775; + xor.b32 %r834, %r800, %r833; + shf.l.wrap.b32 %r835, %r834, %r834, 16; + add.s32 %r836, %r835, %r759; + xor.b32 %r837, 
%r836, %r775; + shf.l.wrap.b32 %r838, %r837, %r837, 20; + add.s32 %r839, %r833, %r164; + add.s32 %r840, %r839, %r838; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 24; + add.s32 %r843, %r842, %r836; + xor.b32 %r844, %r843, %r838; + shf.l.wrap.b32 %r845, %r844, %r844, 25; + add.s32 %r846, %r789, %r185; + add.s32 %r847, %r846, %r798; + xor.b32 %r848, %r847, %r758; + shf.l.wrap.b32 %r849, %r848, %r848, 16; + add.s32 %r850, %r849, %r773; + xor.b32 %r851, %r850, %r789; + shf.l.wrap.b32 %r852, %r851, %r851, 20; + add.s32 %r853, %r847, %r87; + add.s32 %r854, %r853, %r852; + xor.b32 %r855, %r854, %r849; + shf.l.wrap.b32 %r856, %r855, %r855, 24; + add.s32 %r857, %r856, %r850; + xor.b32 %r858, %r857, %r852; + shf.l.wrap.b32 %r859, %r858, %r858, 25; + add.s32 %r860, %r831, %r171; + add.s32 %r861, %r860, %r812; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 16; + add.s32 %r864, %r863, %r843; + xor.b32 %r865, %r864, %r831; + shf.l.wrap.b32 %r866, %r865, %r865, 20; + add.s32 %r867, %r861, %r101; + add.s32 %r868, %r867, %r866; + xor.b32 %r869, %r868, %r863; + shf.l.wrap.b32 %r870, %r869, %r869, 24; + add.s32 %r871, %r870, %r864; + xor.b32 %r872, %r871, %r866; + shf.l.wrap.b32 %r873, %r872, %r872, 25; + add.s32 %r874, %r826, %r80; + add.s32 %r875, %r874, %r845; + xor.b32 %r876, %r814, %r875; + shf.l.wrap.b32 %r877, %r876, %r876, 16; + add.s32 %r878, %r877, %r857; + xor.b32 %r879, %r878, %r845; + shf.l.wrap.b32 %r880, %r879, %r879, 20; + add.s32 %r881, %r875, %r150; + add.s32 %r882, %r881, %r880; + xor.b32 %r883, %r882, %r877; + shf.l.wrap.b32 %r884, %r883, %r883, 24; + add.s32 %r885, %r884, %r878; + xor.b32 %r886, %r885, %r880; + shf.l.wrap.b32 %r887, %r886, %r886, 25; + add.s32 %r888, %r840, %r94; + add.s32 %r889, %r888, %r859; + xor.b32 %r890, %r889, %r828; + shf.l.wrap.b32 %r891, %r890, %r890, 16; + add.s32 %r892, %r891, %r815; + xor.b32 %r893, %r892, %r859; + shf.l.wrap.b32 %r894, %r893, %r893, 20; + add.s32 %r895, %r889, %r122; + add.s32 %r896, %r895, %r894; + xor.b32 %r897, %r896, %r891; + shf.l.wrap.b32 %r898, %r897, %r897, 24; + add.s32 %r899, %r898, %r892; + xor.b32 %r900, %r899, %r894; + shf.l.wrap.b32 %r901, %r900, %r900, 25; + add.s32 %r902, %r854, %r108; + add.s32 %r903, %r902, %r817; + xor.b32 %r904, %r903, %r842; + shf.l.wrap.b32 %r905, %r904, %r904, 16; + add.s32 %r906, %r905, %r829; + xor.b32 %r907, %r906, %r817; + shf.l.wrap.b32 %r908, %r907, %r907, 20; + add.s32 %r909, %r903, %r129; + add.s32 %r910, %r909, %r908; + xor.b32 %r911, %r910, %r905; + shf.l.wrap.b32 %r912, %r911, %r911, 24; + add.s32 %r913, %r912, %r906; + xor.b32 %r914, %r913, %r908; + shf.l.wrap.b32 %r915, %r914, %r914, 25; + add.s32 %r916, %r868, %r157; + add.s32 %r917, %r916, %r915; + xor.b32 %r918, %r917, %r884; + shf.l.wrap.b32 %r919, %r918, %r918, 16; + add.s32 %r920, %r919, %r899; + xor.b32 %r921, %r920, %r915; + shf.l.wrap.b32 %r922, %r921, %r921, 20; + add.s32 %r923, %r917, %r185; + add.s32 %r924, %r923, %r922; + xor.b32 %r925, %r924, %r919; + shf.l.wrap.b32 %r926, %r925, %r925, 24; + add.s32 %r927, %r926, %r920; + xor.b32 %r928, %r927, %r922; + shf.l.wrap.b32 %r929, %r928, %r928, 25; + add.s32 %r930, %r882, %r115; + add.s32 %r931, %r930, %r873; + xor.b32 %r932, %r931, %r898; + shf.l.wrap.b32 %r933, %r932, %r932, 16; + add.s32 %r934, %r933, %r913; + xor.b32 %r935, %r934, %r873; + shf.l.wrap.b32 %r936, %r935, %r935, 20; + add.s32 %r937, %r931, %r80; + add.s32 %r938, %r937, %r936; + xor.b32 %r939, %r938, %r933; + shf.l.wrap.b32 %r940, %r939, %r939, 24; + add.s32 %r941, 
%r940, %r934; + xor.b32 %r942, %r941, %r936; + shf.l.wrap.b32 %r943, %r942, %r942, 25; + add.s32 %r944, %r896, %r87; + add.s32 %r945, %r944, %r887; + xor.b32 %r946, %r912, %r945; + shf.l.wrap.b32 %r947, %r946, %r946, 16; + add.s32 %r948, %r947, %r871; + xor.b32 %r949, %r948, %r887; + shf.l.wrap.b32 %r950, %r949, %r949, 20; + add.s32 %r951, %r945, %r143; + add.s32 %r952, %r951, %r950; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 24; + add.s32 %r955, %r954, %r948; + xor.b32 %r956, %r955, %r950; + shf.l.wrap.b32 %r957, %r956, %r956, 25; + add.s32 %r958, %r901, %r136; + add.s32 %r959, %r958, %r910; + xor.b32 %r960, %r959, %r870; + shf.l.wrap.b32 %r961, %r960, %r960, 16; + add.s32 %r962, %r961, %r885; + xor.b32 %r963, %r962, %r901; + shf.l.wrap.b32 %r964, %r963, %r963, 20; + add.s32 %r965, %r959, %r122; + add.s32 %r966, %r965, %r964; + xor.b32 %r967, %r966, %r961; + shf.l.wrap.b32 %r968, %r967, %r967, 24; + add.s32 %r969, %r968, %r962; + xor.b32 %r970, %r969, %r964; + shf.l.wrap.b32 %r971, %r970, %r970, 25; + add.s32 %r972, %r943, %r178; + add.s32 %r973, %r972, %r924; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 16; + add.s32 %r976, %r975, %r955; + xor.b32 %r977, %r976, %r943; + shf.l.wrap.b32 %r978, %r977, %r977, 20; + add.s32 %r979, %r973, %r150; + add.s32 %r980, %r979, %r978; + xor.b32 %r981, %r980, %r975; + shf.l.wrap.b32 %r982, %r981, %r981, 24; + add.s32 %r983, %r982, %r976; + xor.b32 %r984, %r983, %r978; + shf.l.wrap.b32 %r985, %r984, %r984, 25; + add.s32 %r986, %r938, %r94; + add.s32 %r987, %r986, %r957; + xor.b32 %r988, %r926, %r987; + shf.l.wrap.b32 %r989, %r988, %r988, 16; + add.s32 %r990, %r989, %r969; + xor.b32 %r991, %r990, %r957; + shf.l.wrap.b32 %r992, %r991, %r991, 20; + add.s32 %r993, %r987, %r164; + add.s32 %r994, %r993, %r992; + xor.b32 %r995, %r994, %r989; + shf.l.wrap.b32 %r996, %r995, %r995, 24; + add.s32 %r997, %r996, %r990; + xor.b32 %r998, %r997, %r992; + shf.l.wrap.b32 %r999, %r998, %r998, 25; + add.s32 %r1000, %r952, %r101; + add.s32 %r1001, %r1000, %r971; + xor.b32 %r1002, %r1001, %r940; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 16; + add.s32 %r1004, %r1003, %r927; + xor.b32 %r1005, %r1004, %r971; + shf.l.wrap.b32 %r1006, %r1005, %r1005, 20; + add.s32 %r1007, %r1001, %r108; + add.s32 %r1008, %r1007, %r1006; + xor.b32 %r1009, %r1008, %r1003; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 24; + add.s32 %r1011, %r1010, %r1004; + xor.b32 %r1012, %r1011, %r1006; + shf.l.wrap.b32 %r1013, %r1012, %r1012, 25; + add.s32 %r1014, %r966, %r129; + add.s32 %r1015, %r1014, %r929; + xor.b32 %r1016, %r1015, %r954; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 16; + add.s32 %r1018, %r1017, %r941; + xor.b32 %r1019, %r1018, %r929; + shf.l.wrap.b32 %r1020, %r1019, %r1019, 20; + add.s32 %r1021, %r1015, %r171; + add.s32 %r1022, %r1021, %r1020; + xor.b32 %r1023, %r1022, %r1017; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 24; + add.s32 %r1025, %r1024, %r1018; + xor.b32 %r1026, %r1025, %r1020; + shf.l.wrap.b32 %r1027, %r1026, %r1026, 25; + xor.b32 %r1028, %r1011, %r980; + xor.b32 %r1029, %r1025, %r994; + xor.b32 %r1030, %r983, %r1008; + xor.b32 %r1031, %r1022, %r997; + xor.b32 %r1032, %r1027, %r996; + xor.b32 %r1033, %r985, %r1010; + xor.b32 %r1034, %r1024, %r999; + xor.b32 %r1035, %r1013, %r982; + st.local.u8 [%rd155], %r1028; + shr.u32 %r1036, %r1028, 8; + st.local.u8 [%rd155+1], %r1036; + shr.u32 %r1037, %r1028, 16; + st.local.u8 [%rd155+2], %r1037; + shr.u32 %r1038, %r1028, 24; + st.local.u8 [%rd155+3], %r1038; + st.local.u8 [%rd155+4], %r1029; + shr.u32 
%r1039, %r1029, 8; + st.local.u8 [%rd155+5], %r1039; + shr.u32 %r1040, %r1029, 16; + st.local.u8 [%rd155+6], %r1040; + shr.u32 %r1041, %r1029, 24; + st.local.u8 [%rd155+7], %r1041; + st.local.u8 [%rd155+8], %r1030; + shr.u32 %r1042, %r1030, 8; + st.local.u8 [%rd155+9], %r1042; + shr.u32 %r1043, %r1030, 16; + st.local.u8 [%rd155+10], %r1043; + shr.u32 %r1044, %r1030, 24; + st.local.u8 [%rd155+11], %r1044; + st.local.u8 [%rd155+12], %r1031; + shr.u32 %r1045, %r1031, 8; + st.local.u8 [%rd155+13], %r1045; + shr.u32 %r1046, %r1031, 16; + st.local.u8 [%rd155+14], %r1046; + shr.u32 %r1047, %r1031, 24; + st.local.u8 [%rd155+15], %r1047; + st.local.u8 [%rd155+16], %r1032; + shr.u32 %r1048, %r1032, 8; + st.local.u8 [%rd155+17], %r1048; + shr.u32 %r1049, %r1032, 16; + st.local.u8 [%rd155+18], %r1049; + shr.u32 %r1050, %r1032, 24; + st.local.u8 [%rd155+19], %r1050; + st.local.u8 [%rd155+20], %r1033; + shr.u32 %r1051, %r1033, 8; + st.local.u8 [%rd155+21], %r1051; + shr.u32 %r1052, %r1033, 16; + st.local.u8 [%rd155+22], %r1052; + shr.u32 %r1053, %r1033, 24; + st.local.u8 [%rd155+23], %r1053; + st.local.u8 [%rd155+24], %r1034; + shr.u32 %r1054, %r1034, 8; + st.local.u8 [%rd155+25], %r1054; + shr.u32 %r1055, %r1034, 16; + st.local.u8 [%rd155+26], %r1055; + shr.u32 %r1056, %r1034, 24; + st.local.u8 [%rd155+27], %r1056; + st.local.u8 [%rd155+28], %r1035; + shr.u32 %r1057, %r1035, 8; + st.local.u8 [%rd155+29], %r1057; + shr.u32 %r1058, %r1035, 16; + st.local.u8 [%rd155+30], %r1058; + shr.u32 %r1059, %r1035, 24; + st.local.u8 [%rd155+31], %r1059; + add.s64 %rd153, %rd153, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd154, %rd154, -1; + setp.ne.s64 %p12, %rd154, 0; + @%p12 bra $L__BB0_7; + +$L__BB0_8: + setp.le.u64 %p13, %rd7, %rd152; + @%p13 bra $L__BB0_30; + + add.u64 %rd144, %SPL, 96; + ld.param.u64 %rd142, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd141, %rd142; + shl.b64 %rd111, %rd151, 6; + shl.b64 %rd112, %rd151, 5; + add.s64 %rd27, %rd141, %rd112; + add.s64 %rd28, %rd144, %rd111; + mov.u64 %rd156, 0; + +$L__BB0_10: + add.s64 %rd113, %rd28, %rd156; + ld.local.u8 %rs77, [%rd113]; + add.s64 %rd114, %rd27, %rd156; + st.local.u8 [%rd114], %rs77; + add.s64 %rd156, %rd156, 1; + setp.lt.u64 %p14, %rd156, 32; + @%p14 bra $L__BB0_10; + + add.s64 %rd151, %rd151, 1; + +$L__BB0_30: + st.param.b64 [func_retval0+0], %rd151; + ret; + +} +.func _Z20blake3_hasher_updateP13blake3_hasherPKvy( + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0, + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1 +) +{ + .local .align 16 .b8 __local_depot1[144]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<54>; + .reg .b16 %rs<393>; + .reg .b32 %r<11690>; + .reg .b64 %rd<273>; + + + mov.u64 %SPL, __local_depot1; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd98, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + ld.param.u64 %rd254, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd261, %rd254; + cvta.to.local.u64 %rd2, %rd98; + add.s64 %rd3, %rd2, 136; + mov.u64 %rd262, 32; + ld.local.v2.u8 {%rs102, %rs103}, [%rd2+136]; + cvt.u64.u16 %rd4, %rs103; + cvt.u32.u16 %r144, %rs103; + mul.wide.u32 %rd101, %r144, 64; + cvt.u64.u16 %rd5, %rs102; + neg.s64 %rd102, %rd5; + setp.eq.s64 %p1, %rd101, %rd102; + @%p1 bra $L__BB1_24; + + shl.b64 %rd103, %rd4, 6; + mov.u64 %rd104, 1024; + sub.s64 %rd105, %rd104, %rd5; + sub.s64 %rd106, %rd105, %rd103; + min.u64 %rd6, %rd106, 32; + setp.eq.s16 %p2, 
%rs102, 0; + mov.u16 %rs351, 0; + mov.u64 %rd244, %rd6; + @%p2 bra $L__BB1_9; + + cvt.u32.u16 %r145, %rs102; + prmt.b32 %r147, %r144, %r145, 30212; + cvt.u16.u32 %rs350, %r147; + mov.u64 %rd107, 64; + sub.s64 %rd108, %rd107, %rd5; + min.u64 %rd7, %rd108, %rd6; + setp.eq.s64 %p3, %rd7, 0; + @%p3 bra $L__BB1_6; + + add.s64 %rd110, %rd2, %rd5; + add.s64 %rd8, %rd110, 72; + mov.u64 %rd237, 0; + +$L__BB1_4: + add.s64 %rd111, %rd261, %rd237; + ld.local.u8 %rs107, [%rd111]; + add.s64 %rd112, %rd8, %rd237; + st.local.u8 [%rd112], %rs107; + add.s64 %rd237, %rd237, 1; + setp.lt.u64 %p4, %rd237, %rd7; + @%p4 bra $L__BB1_4; + + ld.local.u8 %rs350, [%rd3]; + +$L__BB1_6: + cvt.u16.u64 %rs108, %rd7; + add.s16 %rs351, %rs350, %rs108; + mov.u64 %rd244, 0; + st.local.u8 [%rd3], %rs351; + add.s64 %rd261, %rd261, %rd7; + sub.s64 %rd12, %rd6, %rd7; + setp.eq.s64 %p5, %rd12, 0; + @%p5 bra $L__BB1_9; + + add.s64 %rd13, %rd2, 72; + ld.local.u8 %rs109, [%rd3+1]; + mov.u64 %rd238, 0; + setp.eq.s16 %p6, %rs109, 0; + mov.u16 %rs351, 0; + selp.u16 %rs111, 1, 0, %p6; + ld.local.u8 %rs112, [%rd3+2]; + or.b16 %rs113, %rs112, %rs111; + ld.local.u8 %r148, [%rd3+-64]; + ld.local.u8 %r149, [%rd3+-63]; + prmt.b32 %r150, %r149, %r148, 30212; + ld.local.u8 %r151, [%rd3+-62]; + prmt.b32 %r152, %r151, %r150, 28756; + ld.local.u8 %r153, [%rd3+-61]; + prmt.b32 %r154, %r153, %r152, 1620; + ld.local.u8 %r155, [%rd3+-60]; + ld.local.u8 %r156, [%rd3+-59]; + prmt.b32 %r157, %r156, %r155, 30212; + ld.local.u8 %r158, [%rd3+-58]; + prmt.b32 %r159, %r158, %r157, 28756; + ld.local.u8 %r160, [%rd3+-57]; + prmt.b32 %r161, %r160, %r159, 1620; + ld.local.u8 %r162, [%rd3+-56]; + ld.local.u8 %r163, [%rd3+-55]; + prmt.b32 %r164, %r163, %r162, 30212; + ld.local.u8 %r165, [%rd3+-54]; + prmt.b32 %r166, %r165, %r164, 28756; + ld.local.u8 %r167, [%rd3+-53]; + prmt.b32 %r168, %r167, %r166, 1620; + ld.local.u8 %r169, [%rd3+-52]; + ld.local.u8 %r170, [%rd3+-51]; + prmt.b32 %r171, %r170, %r169, 30212; + ld.local.u8 %r172, [%rd3+-50]; + prmt.b32 %r173, %r172, %r171, 28756; + ld.local.u8 %r174, [%rd3+-49]; + prmt.b32 %r175, %r174, %r173, 1620; + ld.local.u8 %r176, [%rd3+-48]; + ld.local.u8 %r177, [%rd3+-47]; + prmt.b32 %r178, %r177, %r176, 30212; + ld.local.u8 %r179, [%rd3+-46]; + prmt.b32 %r180, %r179, %r178, 28756; + ld.local.u8 %r181, [%rd3+-45]; + prmt.b32 %r182, %r181, %r180, 1620; + ld.local.u8 %r183, [%rd3+-44]; + ld.local.u8 %r184, [%rd3+-43]; + prmt.b32 %r185, %r184, %r183, 30212; + ld.local.u8 %r186, [%rd3+-42]; + prmt.b32 %r187, %r186, %r185, 28756; + ld.local.u8 %r188, [%rd3+-41]; + prmt.b32 %r189, %r188, %r187, 1620; + ld.local.u8 %r190, [%rd3+-40]; + ld.local.u8 %r191, [%rd3+-39]; + prmt.b32 %r192, %r191, %r190, 30212; + ld.local.u8 %r193, [%rd3+-38]; + prmt.b32 %r194, %r193, %r192, 28756; + ld.local.u8 %r195, [%rd3+-37]; + prmt.b32 %r196, %r195, %r194, 1620; + ld.local.u8 %r197, [%rd3+-36]; + ld.local.u8 %r198, [%rd3+-35]; + prmt.b32 %r199, %r198, %r197, 30212; + ld.local.u8 %r200, [%rd3+-34]; + prmt.b32 %r201, %r200, %r199, 28756; + ld.local.u8 %r202, [%rd3+-33]; + prmt.b32 %r203, %r202, %r201, 1620; + ld.local.u8 %r204, [%rd3+-32]; + ld.local.u8 %r205, [%rd3+-31]; + prmt.b32 %r206, %r205, %r204, 30212; + ld.local.u8 %r207, [%rd3+-30]; + prmt.b32 %r208, %r207, %r206, 28756; + ld.local.u8 %r209, [%rd3+-29]; + prmt.b32 %r210, %r209, %r208, 1620; + ld.local.u8 %r211, [%rd3+-28]; + ld.local.u8 %r212, [%rd3+-27]; + prmt.b32 %r213, %r212, %r211, 30212; + ld.local.u8 %r214, [%rd3+-26]; + prmt.b32 %r215, %r214, %r213, 28756; + ld.local.u8 %r216, 
[%rd3+-25]; + prmt.b32 %r217, %r216, %r215, 1620; + ld.local.u8 %r218, [%rd3+-24]; + ld.local.u8 %r219, [%rd3+-23]; + prmt.b32 %r220, %r219, %r218, 30212; + ld.local.u8 %r221, [%rd3+-22]; + prmt.b32 %r222, %r221, %r220, 28756; + ld.local.u8 %r223, [%rd3+-21]; + prmt.b32 %r224, %r223, %r222, 1620; + ld.local.u8 %r225, [%rd3+-20]; + ld.local.u8 %r226, [%rd3+-19]; + prmt.b32 %r227, %r226, %r225, 30212; + ld.local.u8 %r228, [%rd3+-18]; + prmt.b32 %r229, %r228, %r227, 28756; + ld.local.u8 %r230, [%rd3+-17]; + prmt.b32 %r231, %r230, %r229, 1620; + ld.local.u8 %r232, [%rd3+-16]; + ld.local.u8 %r233, [%rd3+-15]; + prmt.b32 %r234, %r233, %r232, 30212; + ld.local.u8 %r235, [%rd3+-14]; + prmt.b32 %r236, %r235, %r234, 28756; + ld.local.u8 %r237, [%rd3+-13]; + prmt.b32 %r238, %r237, %r236, 1620; + ld.local.u8 %r239, [%rd3+-12]; + ld.local.u8 %r240, [%rd3+-11]; + prmt.b32 %r241, %r240, %r239, 30212; + ld.local.u8 %r242, [%rd3+-10]; + prmt.b32 %r243, %r242, %r241, 28756; + ld.local.u8 %r244, [%rd3+-9]; + prmt.b32 %r245, %r244, %r243, 1620; + ld.local.u8 %r246, [%rd3+-8]; + ld.local.u8 %r247, [%rd3+-7]; + prmt.b32 %r248, %r247, %r246, 30212; + ld.local.u8 %r249, [%rd3+-6]; + prmt.b32 %r250, %r249, %r248, 28756; + ld.local.u8 %r251, [%rd3+-5]; + prmt.b32 %r252, %r251, %r250, 1620; + ld.local.u8 %r253, [%rd3+-4]; + ld.local.u8 %r254, [%rd3+-3]; + prmt.b32 %r255, %r254, %r253, 30212; + ld.local.u8 %r256, [%rd3+-2]; + prmt.b32 %r257, %r256, %r255, 28756; + ld.local.u8 %r258, [%rd3+-1]; + prmt.b32 %r259, %r258, %r257, 1620; + ld.local.u64 %rd115, [%rd3+-72]; + cvt.u32.u64 %r260, %rd115; + shr.u64 %rd116, %rd115, 32; + cvt.u32.u64 %r261, %rd116; + cvt.u32.u16 %r262, %rs113; + and.b32 %r263, %r262, 255; + ld.local.u32 %r264, [%rd3+-104]; + add.s32 %r265, %r264, %r154; + ld.local.u32 %r266, [%rd3+-88]; + add.s32 %r267, %r265, %r266; + xor.b32 %r268, %r267, %r260; + shf.l.wrap.b32 %r269, %r268, %r268, 16; + add.s32 %r270, %r269, 1779033703; + xor.b32 %r271, %r270, %r266; + shf.l.wrap.b32 %r272, %r271, %r271, 20; + add.s32 %r273, %r267, %r161; + add.s32 %r274, %r273, %r272; + xor.b32 %r275, %r274, %r269; + shf.l.wrap.b32 %r276, %r275, %r275, 24; + add.s32 %r277, %r276, %r270; + xor.b32 %r278, %r277, %r272; + shf.l.wrap.b32 %r279, %r278, %r278, 25; + ld.local.u32 %r280, [%rd3+-100]; + add.s32 %r281, %r280, %r168; + ld.local.u32 %r282, [%rd3+-84]; + add.s32 %r283, %r281, %r282; + xor.b32 %r284, %r283, %r261; + shf.l.wrap.b32 %r285, %r284, %r284, 16; + add.s32 %r286, %r285, -1150833019; + xor.b32 %r287, %r286, %r282; + shf.l.wrap.b32 %r288, %r287, %r287, 20; + add.s32 %r289, %r283, %r175; + add.s32 %r290, %r289, %r288; + xor.b32 %r291, %r290, %r285; + shf.l.wrap.b32 %r292, %r291, %r291, 24; + add.s32 %r293, %r292, %r286; + xor.b32 %r294, %r293, %r288; + shf.l.wrap.b32 %r295, %r294, %r294, 25; + ld.local.u32 %r296, [%rd3+-96]; + add.s32 %r297, %r296, %r182; + ld.local.u32 %r298, [%rd3+-80]; + add.s32 %r299, %r297, %r298; + shr.u32 %r300, %r299, 16; + shl.b32 %r301, %r299, 16; + xor.b32 %r302, %r301, 4194304; + or.b32 %r303, %r302, %r300; + add.s32 %r304, %r303, 1013904242; + xor.b32 %r305, %r304, %r298; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r299, %r189; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + ld.local.u32 %r314, [%rd3+-92]; + add.s32 %r315, %r314, %r196; + ld.local.u32 %r316, [%rd3+-76]; + add.s32 %r317, %r315, %r316; + xor.b32 
%r318, %r317, %r263; + shr.u32 %r319, %r317, 16; + shl.b32 %r320, %r318, 16; + or.b32 %r321, %r320, %r319; + add.s32 %r322, %r321, -1521486534; + xor.b32 %r323, %r322, %r316; + shf.l.wrap.b32 %r324, %r323, %r323, 20; + add.s32 %r325, %r317, %r203; + add.s32 %r326, %r325, %r324; + xor.b32 %r327, %r326, %r321; + shf.l.wrap.b32 %r328, %r327, %r327, 24; + add.s32 %r329, %r328, %r322; + xor.b32 %r330, %r329, %r324; + shf.l.wrap.b32 %r331, %r330, %r330, 25; + add.s32 %r332, %r274, %r210; + add.s32 %r333, %r332, %r295; + xor.b32 %r334, %r333, %r328; + shf.l.wrap.b32 %r335, %r334, %r334, 16; + add.s32 %r336, %r335, %r311; + xor.b32 %r337, %r336, %r295; + shf.l.wrap.b32 %r338, %r337, %r337, 20; + add.s32 %r339, %r333, %r217; + add.s32 %r340, %r339, %r338; + xor.b32 %r341, %r340, %r335; + shf.l.wrap.b32 %r342, %r341, %r341, 24; + add.s32 %r343, %r342, %r336; + xor.b32 %r344, %r343, %r338; + shf.l.wrap.b32 %r345, %r344, %r344, 25; + add.s32 %r346, %r290, %r224; + add.s32 %r347, %r346, %r313; + xor.b32 %r348, %r347, %r276; + shf.l.wrap.b32 %r349, %r348, %r348, 16; + add.s32 %r350, %r349, %r329; + xor.b32 %r351, %r350, %r313; + shf.l.wrap.b32 %r352, %r351, %r351, 20; + add.s32 %r353, %r347, %r231; + add.s32 %r354, %r353, %r352; + xor.b32 %r355, %r354, %r349; + shf.l.wrap.b32 %r356, %r355, %r355, 24; + add.s32 %r357, %r356, %r350; + xor.b32 %r358, %r357, %r352; + shf.l.wrap.b32 %r359, %r358, %r358, 25; + add.s32 %r360, %r308, %r238; + add.s32 %r361, %r360, %r331; + xor.b32 %r362, %r361, %r292; + shf.l.wrap.b32 %r363, %r362, %r362, 16; + add.s32 %r364, %r363, %r277; + xor.b32 %r365, %r364, %r331; + shf.l.wrap.b32 %r366, %r365, %r365, 20; + add.s32 %r367, %r361, %r245; + add.s32 %r368, %r367, %r366; + xor.b32 %r369, %r368, %r363; + shf.l.wrap.b32 %r370, %r369, %r369, 24; + add.s32 %r371, %r370, %r364; + xor.b32 %r372, %r371, %r366; + shf.l.wrap.b32 %r373, %r372, %r372, 25; + add.s32 %r374, %r326, %r252; + add.s32 %r375, %r374, %r279; + xor.b32 %r376, %r375, %r310; + shf.l.wrap.b32 %r377, %r376, %r376, 16; + add.s32 %r378, %r377, %r293; + xor.b32 %r379, %r378, %r279; + shf.l.wrap.b32 %r380, %r379, %r379, 20; + add.s32 %r381, %r375, %r259; + add.s32 %r382, %r381, %r380; + xor.b32 %r383, %r382, %r377; + shf.l.wrap.b32 %r384, %r383, %r383, 24; + add.s32 %r385, %r384, %r378; + xor.b32 %r386, %r385, %r380; + shf.l.wrap.b32 %r387, %r386, %r386, 25; + add.s32 %r388, %r340, %r168; + add.s32 %r389, %r388, %r387; + xor.b32 %r390, %r389, %r356; + shf.l.wrap.b32 %r391, %r390, %r390, 16; + add.s32 %r392, %r391, %r371; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 20; + add.s32 %r395, %r389, %r196; + add.s32 %r396, %r395, %r394; + xor.b32 %r397, %r396, %r391; + shf.l.wrap.b32 %r398, %r397, %r397, 24; + add.s32 %r399, %r398, %r392; + xor.b32 %r400, %r399, %r394; + shf.l.wrap.b32 %r401, %r400, %r400, 25; + add.s32 %r402, %r354, %r175; + add.s32 %r403, %r402, %r345; + xor.b32 %r404, %r403, %r370; + shf.l.wrap.b32 %r405, %r404, %r404, 16; + add.s32 %r406, %r405, %r385; + xor.b32 %r407, %r406, %r345; + shf.l.wrap.b32 %r408, %r407, %r407, 20; + add.s32 %r409, %r403, %r224; + add.s32 %r410, %r409, %r408; + xor.b32 %r411, %r410, %r405; + shf.l.wrap.b32 %r412, %r411, %r411, 24; + add.s32 %r413, %r412, %r406; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 25; + add.s32 %r416, %r368, %r203; + add.s32 %r417, %r416, %r359; + xor.b32 %r418, %r417, %r384; + shf.l.wrap.b32 %r419, %r418, %r418, 16; + add.s32 %r420, %r419, %r343; + xor.b32 %r421, %r420, %r359; + shf.l.wrap.b32 %r422, %r421, 
%r421, 20; + add.s32 %r423, %r417, %r154; + add.s32 %r424, %r423, %r422; + xor.b32 %r425, %r424, %r419; + shf.l.wrap.b32 %r426, %r425, %r425, 24; + add.s32 %r427, %r426, %r420; + xor.b32 %r428, %r427, %r422; + shf.l.wrap.b32 %r429, %r428, %r428, 25; + add.s32 %r430, %r382, %r182; + add.s32 %r431, %r430, %r373; + xor.b32 %r432, %r431, %r342; + shf.l.wrap.b32 %r433, %r432, %r432, 16; + add.s32 %r434, %r433, %r357; + xor.b32 %r435, %r434, %r373; + shf.l.wrap.b32 %r436, %r435, %r435, 20; + add.s32 %r437, %r431, %r245; + add.s32 %r438, %r437, %r436; + xor.b32 %r439, %r438, %r433; + shf.l.wrap.b32 %r440, %r439, %r439, 24; + add.s32 %r441, %r440, %r434; + xor.b32 %r442, %r441, %r436; + shf.l.wrap.b32 %r443, %r442, %r442, 25; + add.s32 %r444, %r396, %r161; + add.s32 %r445, %r444, %r415; + xor.b32 %r446, %r445, %r440; + shf.l.wrap.b32 %r447, %r446, %r446, 16; + add.s32 %r448, %r447, %r427; + xor.b32 %r449, %r448, %r415; + shf.l.wrap.b32 %r450, %r449, %r449, 20; + add.s32 %r451, %r445, %r231; + add.s32 %r452, %r451, %r450; + xor.b32 %r453, %r452, %r447; + shf.l.wrap.b32 %r454, %r453, %r453, 24; + add.s32 %r455, %r454, %r448; + xor.b32 %r456, %r455, %r450; + shf.l.wrap.b32 %r457, %r456, %r456, 25; + add.s32 %r458, %r410, %r238; + add.s32 %r459, %r458, %r429; + xor.b32 %r460, %r459, %r398; + shf.l.wrap.b32 %r461, %r460, %r460, 16; + add.s32 %r462, %r461, %r441; + xor.b32 %r463, %r462, %r429; + shf.l.wrap.b32 %r464, %r463, %r463, 20; + add.s32 %r465, %r459, %r189; + add.s32 %r466, %r465, %r464; + xor.b32 %r467, %r466, %r461; + shf.l.wrap.b32 %r468, %r467, %r467, 24; + add.s32 %r469, %r468, %r462; + xor.b32 %r470, %r469, %r464; + shf.l.wrap.b32 %r471, %r470, %r470, 25; + add.s32 %r472, %r424, %r217; + add.s32 %r473, %r472, %r443; + xor.b32 %r474, %r473, %r412; + shf.l.wrap.b32 %r475, %r474, %r474, 16; + add.s32 %r476, %r475, %r399; + xor.b32 %r477, %r476, %r443; + shf.l.wrap.b32 %r478, %r477, %r477, 20; + add.s32 %r479, %r473, %r252; + add.s32 %r480, %r479, %r478; + xor.b32 %r481, %r480, %r475; + shf.l.wrap.b32 %r482, %r481, %r481, 24; + add.s32 %r483, %r482, %r476; + xor.b32 %r484, %r483, %r478; + shf.l.wrap.b32 %r485, %r484, %r484, 25; + add.s32 %r486, %r438, %r259; + add.s32 %r487, %r486, %r401; + xor.b32 %r488, %r487, %r426; + shf.l.wrap.b32 %r489, %r488, %r488, 16; + add.s32 %r490, %r489, %r413; + xor.b32 %r491, %r490, %r401; + shf.l.wrap.b32 %r492, %r491, %r491, 20; + add.s32 %r493, %r487, %r210; + add.s32 %r494, %r493, %r492; + xor.b32 %r495, %r494, %r489; + shf.l.wrap.b32 %r496, %r495, %r495, 24; + add.s32 %r497, %r496, %r490; + xor.b32 %r498, %r497, %r492; + shf.l.wrap.b32 %r499, %r498, %r498, 25; + add.s32 %r500, %r452, %r175; + add.s32 %r501, %r500, %r499; + xor.b32 %r502, %r501, %r468; + shf.l.wrap.b32 %r503, %r502, %r502, 16; + add.s32 %r504, %r503, %r483; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 20; + add.s32 %r507, %r501, %r182; + add.s32 %r508, %r507, %r506; + xor.b32 %r509, %r508, %r503; + shf.l.wrap.b32 %r510, %r509, %r509, 24; + add.s32 %r511, %r510, %r504; + xor.b32 %r512, %r511, %r506; + shf.l.wrap.b32 %r513, %r512, %r512, 25; + add.s32 %r514, %r466, %r224; + add.s32 %r515, %r514, %r457; + xor.b32 %r516, %r515, %r482; + shf.l.wrap.b32 %r517, %r516, %r516, 16; + add.s32 %r518, %r517, %r497; + xor.b32 %r519, %r518, %r457; + shf.l.wrap.b32 %r520, %r519, %r519, 20; + add.s32 %r521, %r515, %r238; + add.s32 %r522, %r521, %r520; + xor.b32 %r523, %r522, %r517; + shf.l.wrap.b32 %r524, %r523, %r523, 24; + add.s32 %r525, %r524, %r518; + xor.b32 %r526, %r525, %r520; 
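+	// annotation (not in the generated PTX): inside _Z20blake3_hasher_update, this unrolled block looks like the same BLAKE3 compression applied to the hasher's 64-byte block buffer (loaded bytewise from [%rd3-64..]) keyed by the chaining-value words at [%rd3-104..].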
+ shf.l.wrap.b32 %r527, %r526, %r526, 25; + add.s32 %r528, %r480, %r245; + add.s32 %r529, %r528, %r471; + xor.b32 %r530, %r529, %r496; + shf.l.wrap.b32 %r531, %r530, %r530, 16; + add.s32 %r532, %r531, %r455; + xor.b32 %r533, %r532, %r471; + shf.l.wrap.b32 %r534, %r533, %r533, 20; + add.s32 %r535, %r529, %r168; + add.s32 %r536, %r535, %r534; + xor.b32 %r537, %r536, %r531; + shf.l.wrap.b32 %r538, %r537, %r537, 24; + add.s32 %r539, %r538, %r532; + xor.b32 %r540, %r539, %r534; + shf.l.wrap.b32 %r541, %r540, %r540, 25; + add.s32 %r542, %r494, %r203; + add.s32 %r543, %r542, %r485; + xor.b32 %r544, %r543, %r454; + shf.l.wrap.b32 %r545, %r544, %r544, 16; + add.s32 %r546, %r545, %r469; + xor.b32 %r547, %r546, %r485; + shf.l.wrap.b32 %r548, %r547, %r547, 20; + add.s32 %r549, %r543, %r252; + add.s32 %r550, %r549, %r548; + xor.b32 %r551, %r550, %r545; + shf.l.wrap.b32 %r552, %r551, %r551, 24; + add.s32 %r553, %r552, %r546; + xor.b32 %r554, %r553, %r548; + shf.l.wrap.b32 %r555, %r554, %r554, 25; + add.s32 %r556, %r508, %r196; + add.s32 %r557, %r556, %r527; + xor.b32 %r558, %r557, %r552; + shf.l.wrap.b32 %r559, %r558, %r558, 16; + add.s32 %r560, %r559, %r539; + xor.b32 %r561, %r560, %r527; + shf.l.wrap.b32 %r562, %r561, %r561, 20; + add.s32 %r563, %r557, %r189; + add.s32 %r564, %r563, %r562; + xor.b32 %r565, %r564, %r559; + shf.l.wrap.b32 %r566, %r565, %r565, 24; + add.s32 %r567, %r566, %r560; + xor.b32 %r568, %r567, %r562; + shf.l.wrap.b32 %r569, %r568, %r568, 25; + add.s32 %r570, %r522, %r217; + add.s32 %r571, %r570, %r541; + xor.b32 %r572, %r571, %r510; + shf.l.wrap.b32 %r573, %r572, %r572, 16; + add.s32 %r574, %r573, %r553; + xor.b32 %r575, %r574, %r541; + shf.l.wrap.b32 %r576, %r575, %r575, 20; + add.s32 %r577, %r571, %r154; + add.s32 %r578, %r577, %r576; + xor.b32 %r579, %r578, %r573; + shf.l.wrap.b32 %r580, %r579, %r579, 24; + add.s32 %r581, %r580, %r574; + xor.b32 %r582, %r581, %r576; + shf.l.wrap.b32 %r583, %r582, %r582, 25; + add.s32 %r584, %r536, %r231; + add.s32 %r585, %r584, %r555; + xor.b32 %r586, %r585, %r524; + shf.l.wrap.b32 %r587, %r586, %r586, 16; + add.s32 %r588, %r587, %r511; + xor.b32 %r589, %r588, %r555; + shf.l.wrap.b32 %r590, %r589, %r589, 20; + add.s32 %r591, %r585, %r259; + add.s32 %r592, %r591, %r590; + xor.b32 %r593, %r592, %r587; + shf.l.wrap.b32 %r594, %r593, %r593, 24; + add.s32 %r595, %r594, %r588; + xor.b32 %r596, %r595, %r590; + shf.l.wrap.b32 %r597, %r596, %r596, 25; + add.s32 %r598, %r550, %r210; + add.s32 %r599, %r598, %r513; + xor.b32 %r600, %r599, %r538; + shf.l.wrap.b32 %r601, %r600, %r600, 16; + add.s32 %r602, %r601, %r525; + xor.b32 %r603, %r602, %r513; + shf.l.wrap.b32 %r604, %r603, %r603, 20; + add.s32 %r605, %r599, %r161; + add.s32 %r606, %r605, %r604; + xor.b32 %r607, %r606, %r601; + shf.l.wrap.b32 %r608, %r607, %r607, 24; + add.s32 %r609, %r608, %r602; + xor.b32 %r610, %r609, %r604; + shf.l.wrap.b32 %r611, %r610, %r610, 25; + add.s32 %r612, %r564, %r224; + add.s32 %r613, %r612, %r611; + xor.b32 %r614, %r613, %r580; + shf.l.wrap.b32 %r615, %r614, %r614, 16; + add.s32 %r616, %r615, %r595; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 20; + add.s32 %r619, %r613, %r203; + add.s32 %r620, %r619, %r618; + xor.b32 %r621, %r620, %r615; + shf.l.wrap.b32 %r622, %r621, %r621, 24; + add.s32 %r623, %r622, %r616; + xor.b32 %r624, %r623, %r618; + shf.l.wrap.b32 %r625, %r624, %r624, 25; + add.s32 %r626, %r578, %r238; + add.s32 %r627, %r626, %r569; + xor.b32 %r628, %r627, %r594; + shf.l.wrap.b32 %r629, %r628, %r628, 16; + add.s32 %r630, %r629, %r609; 
+ xor.b32 %r631, %r630, %r569; + shf.l.wrap.b32 %r632, %r631, %r631, 20; + add.s32 %r633, %r627, %r217; + add.s32 %r634, %r633, %r632; + xor.b32 %r635, %r634, %r629; + shf.l.wrap.b32 %r636, %r635, %r635, 24; + add.s32 %r637, %r636, %r630; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 25; + add.s32 %r640, %r592, %r252; + add.s32 %r641, %r640, %r583; + xor.b32 %r642, %r641, %r608; + shf.l.wrap.b32 %r643, %r642, %r642, 16; + add.s32 %r644, %r643, %r567; + xor.b32 %r645, %r644, %r583; + shf.l.wrap.b32 %r646, %r645, %r645, 20; + add.s32 %r647, %r641, %r175; + add.s32 %r648, %r647, %r646; + xor.b32 %r649, %r648, %r643; + shf.l.wrap.b32 %r650, %r649, %r649, 24; + add.s32 %r651, %r650, %r644; + xor.b32 %r652, %r651, %r646; + shf.l.wrap.b32 %r653, %r652, %r652, 25; + add.s32 %r654, %r606, %r245; + add.s32 %r655, %r654, %r597; + xor.b32 %r656, %r655, %r566; + shf.l.wrap.b32 %r657, %r656, %r656, 16; + add.s32 %r658, %r657, %r581; + xor.b32 %r659, %r658, %r597; + shf.l.wrap.b32 %r660, %r659, %r659, 20; + add.s32 %r661, %r655, %r259; + add.s32 %r662, %r661, %r660; + xor.b32 %r663, %r662, %r657; + shf.l.wrap.b32 %r664, %r663, %r663, 24; + add.s32 %r665, %r664, %r658; + xor.b32 %r666, %r665, %r660; + shf.l.wrap.b32 %r667, %r666, %r666, 25; + add.s32 %r668, %r620, %r182; + add.s32 %r669, %r668, %r639; + xor.b32 %r670, %r669, %r664; + shf.l.wrap.b32 %r671, %r670, %r670, 16; + add.s32 %r672, %r671, %r651; + xor.b32 %r673, %r672, %r639; + shf.l.wrap.b32 %r674, %r673, %r673, 20; + add.s32 %r675, %r669, %r154; + add.s32 %r676, %r675, %r674; + xor.b32 %r677, %r676, %r671; + shf.l.wrap.b32 %r678, %r677, %r677, 24; + add.s32 %r679, %r678, %r672; + xor.b32 %r680, %r679, %r674; + shf.l.wrap.b32 %r681, %r680, %r680, 25; + add.s32 %r682, %r634, %r231; + add.s32 %r683, %r682, %r653; + xor.b32 %r684, %r683, %r622; + shf.l.wrap.b32 %r685, %r684, %r684, 16; + add.s32 %r686, %r685, %r665; + xor.b32 %r687, %r686, %r653; + shf.l.wrap.b32 %r688, %r687, %r687, 20; + add.s32 %r689, %r683, %r168; + add.s32 %r690, %r689, %r688; + xor.b32 %r691, %r690, %r685; + shf.l.wrap.b32 %r692, %r691, %r691, 24; + add.s32 %r693, %r692, %r686; + xor.b32 %r694, %r693, %r688; + shf.l.wrap.b32 %r695, %r694, %r694, 25; + add.s32 %r696, %r648, %r189; + add.s32 %r697, %r696, %r667; + xor.b32 %r698, %r697, %r636; + shf.l.wrap.b32 %r699, %r698, %r698, 16; + add.s32 %r700, %r699, %r623; + xor.b32 %r701, %r700, %r667; + shf.l.wrap.b32 %r702, %r701, %r701, 20; + add.s32 %r703, %r697, %r210; + add.s32 %r704, %r703, %r702; + xor.b32 %r705, %r704, %r699; + shf.l.wrap.b32 %r706, %r705, %r705, 24; + add.s32 %r707, %r706, %r700; + xor.b32 %r708, %r707, %r702; + shf.l.wrap.b32 %r709, %r708, %r708, 25; + add.s32 %r710, %r662, %r161; + add.s32 %r711, %r710, %r625; + xor.b32 %r712, %r711, %r650; + shf.l.wrap.b32 %r713, %r712, %r712, 16; + add.s32 %r714, %r713, %r637; + xor.b32 %r715, %r714, %r625; + shf.l.wrap.b32 %r716, %r715, %r715, 20; + add.s32 %r717, %r711, %r196; + add.s32 %r718, %r717, %r716; + xor.b32 %r719, %r718, %r713; + shf.l.wrap.b32 %r720, %r719, %r719, 24; + add.s32 %r721, %r720, %r714; + xor.b32 %r722, %r721, %r716; + shf.l.wrap.b32 %r723, %r722, %r722, 25; + add.s32 %r724, %r676, %r238; + add.s32 %r725, %r724, %r723; + xor.b32 %r726, %r725, %r692; + shf.l.wrap.b32 %r727, %r726, %r726, 16; + add.s32 %r728, %r727, %r707; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 20; + add.s32 %r731, %r725, %r245; + add.s32 %r732, %r731, %r730; + xor.b32 %r733, %r732, %r727; + shf.l.wrap.b32 %r734, %r733, %r733, 24; 
+ add.s32 %r735, %r734, %r728; + xor.b32 %r736, %r735, %r730; + shf.l.wrap.b32 %r737, %r736, %r736, 25; + add.s32 %r738, %r690, %r217; + add.s32 %r739, %r738, %r681; + xor.b32 %r740, %r739, %r706; + shf.l.wrap.b32 %r741, %r740, %r740, 16; + add.s32 %r742, %r741, %r721; + xor.b32 %r743, %r742, %r681; + shf.l.wrap.b32 %r744, %r743, %r743, 20; + add.s32 %r745, %r739, %r231; + add.s32 %r746, %r745, %r744; + xor.b32 %r747, %r746, %r741; + shf.l.wrap.b32 %r748, %r747, %r747, 24; + add.s32 %r749, %r748, %r742; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 25; + add.s32 %r752, %r704, %r259; + add.s32 %r753, %r752, %r695; + xor.b32 %r754, %r753, %r720; + shf.l.wrap.b32 %r755, %r754, %r754, 16; + add.s32 %r756, %r755, %r679; + xor.b32 %r757, %r756, %r695; + shf.l.wrap.b32 %r758, %r757, %r757, 20; + add.s32 %r759, %r753, %r224; + add.s32 %r760, %r759, %r758; + xor.b32 %r761, %r760, %r755; + shf.l.wrap.b32 %r762, %r761, %r761, 24; + add.s32 %r763, %r762, %r756; + xor.b32 %r764, %r763, %r758; + shf.l.wrap.b32 %r765, %r764, %r764, 25; + add.s32 %r766, %r718, %r252; + add.s32 %r767, %r766, %r709; + xor.b32 %r768, %r767, %r678; + shf.l.wrap.b32 %r769, %r768, %r768, 16; + add.s32 %r770, %r769, %r693; + xor.b32 %r771, %r770, %r709; + shf.l.wrap.b32 %r772, %r771, %r771, 20; + add.s32 %r773, %r767, %r210; + add.s32 %r774, %r773, %r772; + xor.b32 %r775, %r774, %r769; + shf.l.wrap.b32 %r776, %r775, %r775, 24; + add.s32 %r777, %r776, %r770; + xor.b32 %r778, %r777, %r772; + shf.l.wrap.b32 %r779, %r778, %r778, 25; + add.s32 %r780, %r732, %r203; + add.s32 %r781, %r780, %r751; + xor.b32 %r782, %r781, %r776; + shf.l.wrap.b32 %r783, %r782, %r782, 16; + add.s32 %r784, %r783, %r763; + xor.b32 %r785, %r784, %r751; + shf.l.wrap.b32 %r786, %r785, %r785, 20; + add.s32 %r787, %r781, %r168; + add.s32 %r788, %r787, %r786; + xor.b32 %r789, %r788, %r783; + shf.l.wrap.b32 %r790, %r789, %r789, 24; + add.s32 %r791, %r790, %r784; + xor.b32 %r792, %r791, %r786; + shf.l.wrap.b32 %r793, %r792, %r792, 25; + add.s32 %r794, %r746, %r189; + add.s32 %r795, %r794, %r765; + xor.b32 %r796, %r795, %r734; + shf.l.wrap.b32 %r797, %r796, %r796, 16; + add.s32 %r798, %r797, %r777; + xor.b32 %r799, %r798, %r765; + shf.l.wrap.b32 %r800, %r799, %r799, 20; + add.s32 %r801, %r795, %r175; + add.s32 %r802, %r801, %r800; + xor.b32 %r803, %r802, %r797; + shf.l.wrap.b32 %r804, %r803, %r803, 24; + add.s32 %r805, %r804, %r798; + xor.b32 %r806, %r805, %r800; + shf.l.wrap.b32 %r807, %r806, %r806, 25; + add.s32 %r808, %r760, %r154; + add.s32 %r809, %r808, %r779; + xor.b32 %r810, %r809, %r748; + shf.l.wrap.b32 %r811, %r810, %r810, 16; + add.s32 %r812, %r811, %r735; + xor.b32 %r813, %r812, %r779; + shf.l.wrap.b32 %r814, %r813, %r813, 20; + add.s32 %r815, %r809, %r161; + add.s32 %r816, %r815, %r814; + xor.b32 %r817, %r816, %r811; + shf.l.wrap.b32 %r818, %r817, %r817, 24; + add.s32 %r819, %r818, %r812; + xor.b32 %r820, %r819, %r814; + shf.l.wrap.b32 %r821, %r820, %r820, 25; + add.s32 %r822, %r774, %r196; + add.s32 %r823, %r822, %r737; + xor.b32 %r824, %r823, %r762; + shf.l.wrap.b32 %r825, %r824, %r824, 16; + add.s32 %r826, %r825, %r749; + xor.b32 %r827, %r826, %r737; + shf.l.wrap.b32 %r828, %r827, %r827, 20; + add.s32 %r829, %r823, %r182; + add.s32 %r830, %r829, %r828; + xor.b32 %r831, %r830, %r825; + shf.l.wrap.b32 %r832, %r831, %r831, 24; + add.s32 %r833, %r832, %r826; + xor.b32 %r834, %r833, %r828; + shf.l.wrap.b32 %r835, %r834, %r834, 25; + add.s32 %r836, %r788, %r217; + add.s32 %r837, %r836, %r835; + xor.b32 %r838, %r837, %r804; + 
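+	// annotation (not in the generated PTX): successive rounds reuse the same message registers in a different operand order — the BLAKE3 message-word permutation seems to have been folded into the instruction schedule by the compiler rather than materialized as loads.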
shf.l.wrap.b32 %r839, %r838, %r838, 16; + add.s32 %r840, %r839, %r819; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 20; + add.s32 %r843, %r837, %r252; + add.s32 %r844, %r843, %r842; + xor.b32 %r845, %r844, %r839; + shf.l.wrap.b32 %r846, %r845, %r845, 24; + add.s32 %r847, %r846, %r840; + xor.b32 %r848, %r847, %r842; + shf.l.wrap.b32 %r849, %r848, %r848, 25; + add.s32 %r850, %r802, %r231; + add.s32 %r851, %r850, %r793; + xor.b32 %r852, %r851, %r818; + shf.l.wrap.b32 %r853, %r852, %r852, 16; + add.s32 %r854, %r853, %r833; + xor.b32 %r855, %r854, %r793; + shf.l.wrap.b32 %r856, %r855, %r855, 20; + add.s32 %r857, %r851, %r189; + add.s32 %r858, %r857, %r856; + xor.b32 %r859, %r858, %r853; + shf.l.wrap.b32 %r860, %r859, %r859, 24; + add.s32 %r861, %r860, %r854; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 25; + add.s32 %r864, %r816, %r210; + add.s32 %r865, %r864, %r807; + xor.b32 %r866, %r865, %r832; + shf.l.wrap.b32 %r867, %r866, %r866, 16; + add.s32 %r868, %r867, %r791; + xor.b32 %r869, %r868, %r807; + shf.l.wrap.b32 %r870, %r869, %r869, 20; + add.s32 %r871, %r865, %r238; + add.s32 %r872, %r871, %r870; + xor.b32 %r873, %r872, %r867; + shf.l.wrap.b32 %r874, %r873, %r873, 24; + add.s32 %r875, %r874, %r868; + xor.b32 %r876, %r875, %r870; + shf.l.wrap.b32 %r877, %r876, %r876, 25; + add.s32 %r878, %r830, %r259; + add.s32 %r879, %r878, %r821; + xor.b32 %r880, %r879, %r790; + shf.l.wrap.b32 %r881, %r880, %r880, 16; + add.s32 %r882, %r881, %r805; + xor.b32 %r883, %r882, %r821; + shf.l.wrap.b32 %r884, %r883, %r883, 20; + add.s32 %r885, %r879, %r161; + add.s32 %r886, %r885, %r884; + xor.b32 %r887, %r886, %r881; + shf.l.wrap.b32 %r888, %r887, %r887, 24; + add.s32 %r889, %r888, %r882; + xor.b32 %r890, %r889, %r884; + shf.l.wrap.b32 %r891, %r890, %r890, 25; + add.s32 %r892, %r844, %r245; + add.s32 %r893, %r892, %r863; + xor.b32 %r894, %r893, %r888; + shf.l.wrap.b32 %r895, %r894, %r894, 16; + add.s32 %r896, %r895, %r875; + xor.b32 %r897, %r896, %r863; + shf.l.wrap.b32 %r898, %r897, %r897, 20; + add.s32 %r899, %r893, %r175; + add.s32 %r900, %r899, %r898; + xor.b32 %r901, %r900, %r895; + shf.l.wrap.b32 %r902, %r901, %r901, 24; + add.s32 %r903, %r902, %r896; + xor.b32 %r904, %r903, %r898; + shf.l.wrap.b32 %r905, %r904, %r904, 25; + add.s32 %r906, %r858, %r154; + add.s32 %r907, %r906, %r877; + xor.b32 %r908, %r907, %r846; + shf.l.wrap.b32 %r909, %r908, %r908, 16; + add.s32 %r910, %r909, %r889; + xor.b32 %r911, %r910, %r877; + shf.l.wrap.b32 %r912, %r911, %r911, 20; + add.s32 %r913, %r907, %r224; + add.s32 %r914, %r913, %r912; + xor.b32 %r915, %r914, %r909; + shf.l.wrap.b32 %r916, %r915, %r915, 24; + add.s32 %r917, %r916, %r910; + xor.b32 %r918, %r917, %r912; + shf.l.wrap.b32 %r919, %r918, %r918, 25; + add.s32 %r920, %r872, %r168; + add.s32 %r921, %r920, %r891; + xor.b32 %r922, %r921, %r860; + shf.l.wrap.b32 %r923, %r922, %r922, 16; + add.s32 %r924, %r923, %r847; + xor.b32 %r925, %r924, %r891; + shf.l.wrap.b32 %r926, %r925, %r925, 20; + add.s32 %r927, %r921, %r196; + add.s32 %r928, %r927, %r926; + xor.b32 %r929, %r928, %r923; + shf.l.wrap.b32 %r930, %r929, %r929, 24; + add.s32 %r931, %r930, %r924; + xor.b32 %r932, %r931, %r926; + shf.l.wrap.b32 %r933, %r932, %r932, 25; + add.s32 %r934, %r886, %r182; + add.s32 %r935, %r934, %r849; + xor.b32 %r936, %r935, %r874; + shf.l.wrap.b32 %r937, %r936, %r936, 16; + add.s32 %r938, %r937, %r861; + xor.b32 %r939, %r938, %r849; + shf.l.wrap.b32 %r940, %r939, %r939, 20; + add.s32 %r941, %r935, %r203; + add.s32 %r942, %r941, %r940; + 
xor.b32 %r943, %r942, %r937; + shf.l.wrap.b32 %r944, %r943, %r943, 24; + add.s32 %r945, %r944, %r938; + xor.b32 %r946, %r945, %r940; + shf.l.wrap.b32 %r947, %r946, %r946, 25; + add.s32 %r948, %r900, %r231; + add.s32 %r949, %r948, %r947; + xor.b32 %r950, %r949, %r916; + shf.l.wrap.b32 %r951, %r950, %r950, 16; + add.s32 %r952, %r951, %r931; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 20; + add.s32 %r955, %r949, %r259; + add.s32 %r956, %r955, %r954; + xor.b32 %r957, %r956, %r951; + shf.l.wrap.b32 %r958, %r957, %r957, 24; + add.s32 %r959, %r958, %r952; + xor.b32 %r960, %r959, %r954; + shf.l.wrap.b32 %r961, %r960, %r960, 25; + add.s32 %r962, %r914, %r189; + add.s32 %r963, %r962, %r905; + xor.b32 %r964, %r963, %r930; + shf.l.wrap.b32 %r965, %r964, %r964, 16; + add.s32 %r966, %r965, %r945; + xor.b32 %r967, %r966, %r905; + shf.l.wrap.b32 %r968, %r967, %r967, 20; + add.s32 %r969, %r963, %r154; + add.s32 %r970, %r969, %r968; + xor.b32 %r971, %r970, %r965; + shf.l.wrap.b32 %r972, %r971, %r971, 24; + add.s32 %r973, %r972, %r966; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 25; + add.s32 %r976, %r928, %r161; + add.s32 %r977, %r976, %r919; + xor.b32 %r978, %r977, %r944; + shf.l.wrap.b32 %r979, %r978, %r978, 16; + add.s32 %r980, %r979, %r903; + xor.b32 %r981, %r980, %r919; + shf.l.wrap.b32 %r982, %r981, %r981, 20; + add.s32 %r983, %r977, %r217; + add.s32 %r984, %r983, %r982; + xor.b32 %r985, %r984, %r979; + shf.l.wrap.b32 %r986, %r985, %r985, 24; + add.s32 %r987, %r986, %r980; + xor.b32 %r988, %r987, %r982; + shf.l.wrap.b32 %r989, %r988, %r988, 25; + add.s32 %r990, %r942, %r210; + add.s32 %r991, %r990, %r933; + xor.b32 %r992, %r991, %r902; + shf.l.wrap.b32 %r993, %r992, %r992, 16; + add.s32 %r994, %r993, %r917; + xor.b32 %r995, %r994, %r933; + shf.l.wrap.b32 %r996, %r995, %r995, 20; + add.s32 %r997, %r991, %r196; + add.s32 %r998, %r997, %r996; + xor.b32 %r999, %r998, %r993; + shf.l.wrap.b32 %r1000, %r999, %r999, 24; + add.s32 %r1001, %r1000, %r994; + xor.b32 %r1002, %r1001, %r996; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 25; + add.s32 %r1004, %r956, %r252; + add.s32 %r1005, %r1004, %r975; + xor.b32 %r1006, %r1005, %r1000; + shf.l.wrap.b32 %r1007, %r1006, %r1006, 16; + add.s32 %r1008, %r1007, %r987; + xor.b32 %r1009, %r1008, %r975; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 20; + add.s32 %r1011, %r1005, %r224; + add.s32 %r1012, %r1011, %r1010; + xor.b32 %r1013, %r1012, %r1007; + shf.l.wrap.b32 %r1014, %r1013, %r1013, 24; + add.s32 %r1015, %r1014, %r1008; + xor.b32 %r1016, %r1015, %r1010; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 25; + add.s32 %r1018, %r970, %r168; + add.s32 %r1019, %r1018, %r989; + xor.b32 %r1020, %r1019, %r958; + shf.l.wrap.b32 %r1021, %r1020, %r1020, 16; + add.s32 %r1022, %r1021, %r1001; + xor.b32 %r1023, %r1022, %r989; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 20; + add.s32 %r1025, %r1019, %r238; + add.s32 %r1026, %r1025, %r1024; + xor.b32 %r1027, %r1026, %r1021; + shf.l.wrap.b32 %r1028, %r1027, %r1027, 24; + add.s32 %r1029, %r1028, %r1022; + xor.b32 %r1030, %r1029, %r1024; + shf.l.wrap.b32 %r1031, %r1030, %r1030, 25; + add.s32 %r1032, %r984, %r175; + add.s32 %r1033, %r1032, %r1003; + xor.b32 %r1034, %r1033, %r972; + shf.l.wrap.b32 %r1035, %r1034, %r1034, 16; + add.s32 %r1036, %r1035, %r959; + xor.b32 %r1037, %r1036, %r1003; + shf.l.wrap.b32 %r1038, %r1037, %r1037, 20; + add.s32 %r1039, %r1033, %r182; + add.s32 %r1040, %r1039, %r1038; + xor.b32 %r1041, %r1040, %r1035; + shf.l.wrap.b32 %r1042, %r1041, %r1041, 24; + add.s32 %r1043, %r1042, 
%r1036; + xor.b32 %r1044, %r1043, %r1038; + shf.l.wrap.b32 %r1045, %r1044, %r1044, 25; + add.s32 %r1046, %r998, %r203; + add.s32 %r1047, %r1046, %r961; + xor.b32 %r1048, %r1047, %r986; + shf.l.wrap.b32 %r1049, %r1048, %r1048, 16; + add.s32 %r1050, %r1049, %r973; + xor.b32 %r1051, %r1050, %r961; + shf.l.wrap.b32 %r1052, %r1051, %r1051, 20; + add.s32 %r1053, %r1047, %r245; + add.s32 %r1054, %r1053, %r1052; + xor.b32 %r1055, %r1054, %r1049; + shf.l.wrap.b32 %r1056, %r1055, %r1055, 24; + add.s32 %r1057, %r1056, %r1050; + xor.b32 %r1058, %r1057, %r1052; + shf.l.wrap.b32 %r1059, %r1058, %r1058, 25; + xor.b32 %r1060, %r1043, %r1012; + st.local.u32 [%rd3+-104], %r1060; + xor.b32 %r1061, %r1057, %r1026; + st.local.u32 [%rd3+-100], %r1061; + xor.b32 %r1062, %r1015, %r1040; + st.local.u32 [%rd3+-96], %r1062; + xor.b32 %r1063, %r1029, %r1054; + st.local.u32 [%rd3+-92], %r1063; + xor.b32 %r1064, %r1059, %r1028; + st.local.u32 [%rd3+-88], %r1064; + xor.b32 %r1065, %r1017, %r1042; + st.local.u32 [%rd3+-84], %r1065; + xor.b32 %r1066, %r1031, %r1056; + st.local.u32 [%rd3+-80], %r1066; + xor.b32 %r1067, %r1045, %r1014; + st.local.u32 [%rd3+-76], %r1067; + add.s16 %rs114, %rs109, 1; + st.local.v2.u8 [%rd3], {%rs351, %rs114}; + +$L__BB1_8: + add.s64 %rd117, %rd13, %rd238; + st.local.u8 [%rd117], %rs351; + add.s64 %rd238, %rd238, 1; + setp.lt.u64 %p7, %rd238, 64; + mov.u64 %rd244, %rd12; + @%p7 bra $L__BB1_8; + +$L__BB1_9: + setp.lt.u64 %p8, %rd244, 65; + @%p8 bra $L__BB1_12; + + ld.local.u8 %rs9, [%rd3+2]; + ld.local.u8 %rs352, [%rd3+1]; + ld.local.u32 %r11657, [%rd3+-104]; + ld.local.u32 %r11656, [%rd3+-100]; + ld.local.u32 %r11655, [%rd3+-96]; + ld.local.u32 %r11654, [%rd3+-92]; + ld.local.u32 %r11653, [%rd3+-88]; + ld.local.u32 %r11652, [%rd3+-84]; + ld.local.u32 %r11651, [%rd3+-80]; + ld.local.u32 %r11650, [%rd3+-76]; + ld.local.u64 %rd118, [%rd3+-72]; + cvt.u32.u64 %r9, %rd118; + shr.u64 %rd119, %rd118, 32; + cvt.u32.u64 %r10, %rd119; + +$L__BB1_11: + and.b16 %rs116, %rs352, 255; + setp.eq.s16 %p9, %rs116, 0; + selp.u16 %rs117, 1, 0, %p9; + or.b16 %rs118, %rs9, %rs117; + ld.local.u8 %r1068, [%rd261]; + ld.local.u8 %r1069, [%rd261+1]; + prmt.b32 %r1070, %r1069, %r1068, 30212; + ld.local.u8 %r1071, [%rd261+2]; + prmt.b32 %r1072, %r1071, %r1070, 28756; + ld.local.u8 %r1073, [%rd261+3]; + prmt.b32 %r1074, %r1073, %r1072, 1620; + ld.local.u8 %r1075, [%rd261+4]; + ld.local.u8 %r1076, [%rd261+5]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + ld.local.u8 %r1078, [%rd261+6]; + prmt.b32 %r1079, %r1078, %r1077, 28756; + ld.local.u8 %r1080, [%rd261+7]; + prmt.b32 %r1081, %r1080, %r1079, 1620; + ld.local.u8 %r1082, [%rd261+8]; + ld.local.u8 %r1083, [%rd261+9]; + prmt.b32 %r1084, %r1083, %r1082, 30212; + ld.local.u8 %r1085, [%rd261+10]; + prmt.b32 %r1086, %r1085, %r1084, 28756; + ld.local.u8 %r1087, [%rd261+11]; + prmt.b32 %r1088, %r1087, %r1086, 1620; + ld.local.u8 %r1089, [%rd261+12]; + ld.local.u8 %r1090, [%rd261+13]; + prmt.b32 %r1091, %r1090, %r1089, 30212; + ld.local.u8 %r1092, [%rd261+14]; + prmt.b32 %r1093, %r1092, %r1091, 28756; + ld.local.u8 %r1094, [%rd261+15]; + prmt.b32 %r1095, %r1094, %r1093, 1620; + ld.local.u8 %r1096, [%rd261+16]; + ld.local.u8 %r1097, [%rd261+17]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd261+18]; + prmt.b32 %r1100, %r1099, %r1098, 28756; + ld.local.u8 %r1101, [%rd261+19]; + prmt.b32 %r1102, %r1101, %r1100, 1620; + ld.local.u8 %r1103, [%rd261+20]; + ld.local.u8 %r1104, [%rd261+21]; + prmt.b32 %r1105, %r1104, %r1103, 30212; + ld.local.u8 %r1106, [%rd261+22]; + 
prmt.b32 %r1107, %r1106, %r1105, 28756; + ld.local.u8 %r1108, [%rd261+23]; + prmt.b32 %r1109, %r1108, %r1107, 1620; + ld.local.u8 %r1110, [%rd261+24]; + ld.local.u8 %r1111, [%rd261+25]; + prmt.b32 %r1112, %r1111, %r1110, 30212; + ld.local.u8 %r1113, [%rd261+26]; + prmt.b32 %r1114, %r1113, %r1112, 28756; + ld.local.u8 %r1115, [%rd261+27]; + prmt.b32 %r1116, %r1115, %r1114, 1620; + ld.local.u8 %r1117, [%rd261+28]; + ld.local.u8 %r1118, [%rd261+29]; + prmt.b32 %r1119, %r1118, %r1117, 30212; + ld.local.u8 %r1120, [%rd261+30]; + prmt.b32 %r1121, %r1120, %r1119, 28756; + ld.local.u8 %r1122, [%rd261+31]; + prmt.b32 %r1123, %r1122, %r1121, 1620; + ld.local.u8 %r1124, [%rd261+32]; + ld.local.u8 %r1125, [%rd261+33]; + prmt.b32 %r1126, %r1125, %r1124, 30212; + ld.local.u8 %r1127, [%rd261+34]; + prmt.b32 %r1128, %r1127, %r1126, 28756; + ld.local.u8 %r1129, [%rd261+35]; + prmt.b32 %r1130, %r1129, %r1128, 1620; + ld.local.u8 %r1131, [%rd261+36]; + ld.local.u8 %r1132, [%rd261+37]; + prmt.b32 %r1133, %r1132, %r1131, 30212; + ld.local.u8 %r1134, [%rd261+38]; + prmt.b32 %r1135, %r1134, %r1133, 28756; + ld.local.u8 %r1136, [%rd261+39]; + prmt.b32 %r1137, %r1136, %r1135, 1620; + ld.local.u8 %r1138, [%rd261+40]; + ld.local.u8 %r1139, [%rd261+41]; + prmt.b32 %r1140, %r1139, %r1138, 30212; + ld.local.u8 %r1141, [%rd261+42]; + prmt.b32 %r1142, %r1141, %r1140, 28756; + ld.local.u8 %r1143, [%rd261+43]; + prmt.b32 %r1144, %r1143, %r1142, 1620; + ld.local.u8 %r1145, [%rd261+44]; + ld.local.u8 %r1146, [%rd261+45]; + prmt.b32 %r1147, %r1146, %r1145, 30212; + ld.local.u8 %r1148, [%rd261+46]; + prmt.b32 %r1149, %r1148, %r1147, 28756; + ld.local.u8 %r1150, [%rd261+47]; + prmt.b32 %r1151, %r1150, %r1149, 1620; + ld.local.u8 %r1152, [%rd261+48]; + ld.local.u8 %r1153, [%rd261+49]; + prmt.b32 %r1154, %r1153, %r1152, 30212; + ld.local.u8 %r1155, [%rd261+50]; + prmt.b32 %r1156, %r1155, %r1154, 28756; + ld.local.u8 %r1157, [%rd261+51]; + prmt.b32 %r1158, %r1157, %r1156, 1620; + ld.local.u8 %r1159, [%rd261+52]; + ld.local.u8 %r1160, [%rd261+53]; + prmt.b32 %r1161, %r1160, %r1159, 30212; + ld.local.u8 %r1162, [%rd261+54]; + prmt.b32 %r1163, %r1162, %r1161, 28756; + ld.local.u8 %r1164, [%rd261+55]; + prmt.b32 %r1165, %r1164, %r1163, 1620; + ld.local.u8 %r1166, [%rd261+56]; + ld.local.u8 %r1167, [%rd261+57]; + prmt.b32 %r1168, %r1167, %r1166, 30212; + ld.local.u8 %r1169, [%rd261+58]; + prmt.b32 %r1170, %r1169, %r1168, 28756; + ld.local.u8 %r1171, [%rd261+59]; + prmt.b32 %r1172, %r1171, %r1170, 1620; + ld.local.u8 %r1173, [%rd261+60]; + ld.local.u8 %r1174, [%rd261+61]; + prmt.b32 %r1175, %r1174, %r1173, 30212; + ld.local.u8 %r1176, [%rd261+62]; + prmt.b32 %r1177, %r1176, %r1175, 28756; + ld.local.u8 %r1178, [%rd261+63]; + prmt.b32 %r1179, %r1178, %r1177, 1620; + cvt.u32.u16 %r1180, %rs118; + and.b32 %r1181, %r1180, 255; + add.s32 %r1182, %r11657, %r1074; + add.s32 %r1183, %r1182, %r11653; + xor.b32 %r1184, %r1183, %r9; + shf.l.wrap.b32 %r1185, %r1184, %r1184, 16; + add.s32 %r1186, %r1185, 1779033703; + xor.b32 %r1187, %r1186, %r11653; + shf.l.wrap.b32 %r1188, %r1187, %r1187, 20; + add.s32 %r1189, %r1183, %r1081; + add.s32 %r1190, %r1189, %r1188; + xor.b32 %r1191, %r1190, %r1185; + shf.l.wrap.b32 %r1192, %r1191, %r1191, 24; + add.s32 %r1193, %r1192, %r1186; + xor.b32 %r1194, %r1193, %r1188; + shf.l.wrap.b32 %r1195, %r1194, %r1194, 25; + add.s32 %r1196, %r11656, %r1088; + add.s32 %r1197, %r1196, %r11652; + xor.b32 %r1198, %r1197, %r10; + shf.l.wrap.b32 %r1199, %r1198, %r1198, 16; + add.s32 %r1200, %r1199, -1150833019; + xor.b32 %r1201, 
%r1200, %r11652; + shf.l.wrap.b32 %r1202, %r1201, %r1201, 20; + add.s32 %r1203, %r1197, %r1095; + add.s32 %r1204, %r1203, %r1202; + xor.b32 %r1205, %r1204, %r1199; + shf.l.wrap.b32 %r1206, %r1205, %r1205, 24; + add.s32 %r1207, %r1206, %r1200; + xor.b32 %r1208, %r1207, %r1202; + shf.l.wrap.b32 %r1209, %r1208, %r1208, 25; + add.s32 %r1210, %r11655, %r1102; + add.s32 %r1211, %r1210, %r11651; + shr.u32 %r1212, %r1211, 16; + shl.b32 %r1213, %r1211, 16; + xor.b32 %r1214, %r1213, 4194304; + or.b32 %r1215, %r1214, %r1212; + add.s32 %r1216, %r1215, 1013904242; + xor.b32 %r1217, %r1216, %r11651; + shf.l.wrap.b32 %r1218, %r1217, %r1217, 20; + add.s32 %r1219, %r1211, %r1109; + add.s32 %r1220, %r1219, %r1218; + xor.b32 %r1221, %r1220, %r1215; + shf.l.wrap.b32 %r1222, %r1221, %r1221, 24; + add.s32 %r1223, %r1222, %r1216; + xor.b32 %r1224, %r1223, %r1218; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 25; + add.s32 %r1226, %r11654, %r1116; + add.s32 %r1227, %r1226, %r11650; + xor.b32 %r1228, %r1227, %r1181; + shr.u32 %r1229, %r1227, 16; + shl.b32 %r1230, %r1228, 16; + or.b32 %r1231, %r1230, %r1229; + add.s32 %r1232, %r1231, -1521486534; + xor.b32 %r1233, %r1232, %r11650; + shf.l.wrap.b32 %r1234, %r1233, %r1233, 20; + add.s32 %r1235, %r1227, %r1123; + add.s32 %r1236, %r1235, %r1234; + xor.b32 %r1237, %r1236, %r1231; + shf.l.wrap.b32 %r1238, %r1237, %r1237, 24; + add.s32 %r1239, %r1238, %r1232; + xor.b32 %r1240, %r1239, %r1234; + shf.l.wrap.b32 %r1241, %r1240, %r1240, 25; + add.s32 %r1242, %r1190, %r1130; + add.s32 %r1243, %r1242, %r1209; + xor.b32 %r1244, %r1243, %r1238; + shf.l.wrap.b32 %r1245, %r1244, %r1244, 16; + add.s32 %r1246, %r1245, %r1223; + xor.b32 %r1247, %r1246, %r1209; + shf.l.wrap.b32 %r1248, %r1247, %r1247, 20; + add.s32 %r1249, %r1243, %r1137; + add.s32 %r1250, %r1249, %r1248; + xor.b32 %r1251, %r1250, %r1245; + shf.l.wrap.b32 %r1252, %r1251, %r1251, 24; + add.s32 %r1253, %r1252, %r1246; + xor.b32 %r1254, %r1253, %r1248; + shf.l.wrap.b32 %r1255, %r1254, %r1254, 25; + add.s32 %r1256, %r1204, %r1144; + add.s32 %r1257, %r1256, %r1225; + xor.b32 %r1258, %r1257, %r1192; + shf.l.wrap.b32 %r1259, %r1258, %r1258, 16; + add.s32 %r1260, %r1259, %r1239; + xor.b32 %r1261, %r1260, %r1225; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 20; + add.s32 %r1263, %r1257, %r1151; + add.s32 %r1264, %r1263, %r1262; + xor.b32 %r1265, %r1264, %r1259; + shf.l.wrap.b32 %r1266, %r1265, %r1265, 24; + add.s32 %r1267, %r1266, %r1260; + xor.b32 %r1268, %r1267, %r1262; + shf.l.wrap.b32 %r1269, %r1268, %r1268, 25; + add.s32 %r1270, %r1220, %r1158; + add.s32 %r1271, %r1270, %r1241; + xor.b32 %r1272, %r1271, %r1206; + shf.l.wrap.b32 %r1273, %r1272, %r1272, 16; + add.s32 %r1274, %r1273, %r1193; + xor.b32 %r1275, %r1274, %r1241; + shf.l.wrap.b32 %r1276, %r1275, %r1275, 20; + add.s32 %r1277, %r1271, %r1165; + add.s32 %r1278, %r1277, %r1276; + xor.b32 %r1279, %r1278, %r1273; + shf.l.wrap.b32 %r1280, %r1279, %r1279, 24; + add.s32 %r1281, %r1280, %r1274; + xor.b32 %r1282, %r1281, %r1276; + shf.l.wrap.b32 %r1283, %r1282, %r1282, 25; + add.s32 %r1284, %r1236, %r1172; + add.s32 %r1285, %r1284, %r1195; + xor.b32 %r1286, %r1285, %r1222; + shf.l.wrap.b32 %r1287, %r1286, %r1286, 16; + add.s32 %r1288, %r1287, %r1207; + xor.b32 %r1289, %r1288, %r1195; + shf.l.wrap.b32 %r1290, %r1289, %r1289, 20; + add.s32 %r1291, %r1285, %r1179; + add.s32 %r1292, %r1291, %r1290; + xor.b32 %r1293, %r1292, %r1287; + shf.l.wrap.b32 %r1294, %r1293, %r1293, 24; + add.s32 %r1295, %r1294, %r1288; + xor.b32 %r1296, %r1295, %r1290; + shf.l.wrap.b32 %r1297, %r1296, %r1296, 25; 
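+ // editor annotation: the prmt.b32 sequences with selectors 30212/28756/1620
+ // appear to assemble four ld.local.u8 bytes into one little-endian 32-bit
+ // message word (sixteen words per 64-byte block). The constants 1779033703,
+ // -1150833019, 1013904242 and -1521486534 are 0x6A09E667, 0xBB67AE85,
+ // 0x3C6EF372 and 0xA54FF53A: the first four IV words shared by BLAKE3 and
+ // SHA-256, used here to seed the third row of the 4x4 compression state.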
+ add.s32 %r1298, %r1250, %r1088; + add.s32 %r1299, %r1298, %r1297; + xor.b32 %r1300, %r1299, %r1266; + shf.l.wrap.b32 %r1301, %r1300, %r1300, 16; + add.s32 %r1302, %r1301, %r1281; + xor.b32 %r1303, %r1302, %r1297; + shf.l.wrap.b32 %r1304, %r1303, %r1303, 20; + add.s32 %r1305, %r1299, %r1116; + add.s32 %r1306, %r1305, %r1304; + xor.b32 %r1307, %r1306, %r1301; + shf.l.wrap.b32 %r1308, %r1307, %r1307, 24; + add.s32 %r1309, %r1308, %r1302; + xor.b32 %r1310, %r1309, %r1304; + shf.l.wrap.b32 %r1311, %r1310, %r1310, 25; + add.s32 %r1312, %r1264, %r1095; + add.s32 %r1313, %r1312, %r1255; + xor.b32 %r1314, %r1313, %r1280; + shf.l.wrap.b32 %r1315, %r1314, %r1314, 16; + add.s32 %r1316, %r1315, %r1295; + xor.b32 %r1317, %r1316, %r1255; + shf.l.wrap.b32 %r1318, %r1317, %r1317, 20; + add.s32 %r1319, %r1313, %r1144; + add.s32 %r1320, %r1319, %r1318; + xor.b32 %r1321, %r1320, %r1315; + shf.l.wrap.b32 %r1322, %r1321, %r1321, 24; + add.s32 %r1323, %r1322, %r1316; + xor.b32 %r1324, %r1323, %r1318; + shf.l.wrap.b32 %r1325, %r1324, %r1324, 25; + add.s32 %r1326, %r1278, %r1123; + add.s32 %r1327, %r1326, %r1269; + xor.b32 %r1328, %r1327, %r1294; + shf.l.wrap.b32 %r1329, %r1328, %r1328, 16; + add.s32 %r1330, %r1329, %r1253; + xor.b32 %r1331, %r1330, %r1269; + shf.l.wrap.b32 %r1332, %r1331, %r1331, 20; + add.s32 %r1333, %r1327, %r1074; + add.s32 %r1334, %r1333, %r1332; + xor.b32 %r1335, %r1334, %r1329; + shf.l.wrap.b32 %r1336, %r1335, %r1335, 24; + add.s32 %r1337, %r1336, %r1330; + xor.b32 %r1338, %r1337, %r1332; + shf.l.wrap.b32 %r1339, %r1338, %r1338, 25; + add.s32 %r1340, %r1292, %r1102; + add.s32 %r1341, %r1340, %r1283; + xor.b32 %r1342, %r1341, %r1252; + shf.l.wrap.b32 %r1343, %r1342, %r1342, 16; + add.s32 %r1344, %r1343, %r1267; + xor.b32 %r1345, %r1344, %r1283; + shf.l.wrap.b32 %r1346, %r1345, %r1345, 20; + add.s32 %r1347, %r1341, %r1165; + add.s32 %r1348, %r1347, %r1346; + xor.b32 %r1349, %r1348, %r1343; + shf.l.wrap.b32 %r1350, %r1349, %r1349, 24; + add.s32 %r1351, %r1350, %r1344; + xor.b32 %r1352, %r1351, %r1346; + shf.l.wrap.b32 %r1353, %r1352, %r1352, 25; + add.s32 %r1354, %r1306, %r1081; + add.s32 %r1355, %r1354, %r1325; + xor.b32 %r1356, %r1355, %r1350; + shf.l.wrap.b32 %r1357, %r1356, %r1356, 16; + add.s32 %r1358, %r1357, %r1337; + xor.b32 %r1359, %r1358, %r1325; + shf.l.wrap.b32 %r1360, %r1359, %r1359, 20; + add.s32 %r1361, %r1355, %r1151; + add.s32 %r1362, %r1361, %r1360; + xor.b32 %r1363, %r1362, %r1357; + shf.l.wrap.b32 %r1364, %r1363, %r1363, 24; + add.s32 %r1365, %r1364, %r1358; + xor.b32 %r1366, %r1365, %r1360; + shf.l.wrap.b32 %r1367, %r1366, %r1366, 25; + add.s32 %r1368, %r1320, %r1158; + add.s32 %r1369, %r1368, %r1339; + xor.b32 %r1370, %r1369, %r1308; + shf.l.wrap.b32 %r1371, %r1370, %r1370, 16; + add.s32 %r1372, %r1371, %r1351; + xor.b32 %r1373, %r1372, %r1339; + shf.l.wrap.b32 %r1374, %r1373, %r1373, 20; + add.s32 %r1375, %r1369, %r1109; + add.s32 %r1376, %r1375, %r1374; + xor.b32 %r1377, %r1376, %r1371; + shf.l.wrap.b32 %r1378, %r1377, %r1377, 24; + add.s32 %r1379, %r1378, %r1372; + xor.b32 %r1380, %r1379, %r1374; + shf.l.wrap.b32 %r1381, %r1380, %r1380, 25; + add.s32 %r1382, %r1334, %r1137; + add.s32 %r1383, %r1382, %r1353; + xor.b32 %r1384, %r1383, %r1322; + shf.l.wrap.b32 %r1385, %r1384, %r1384, 16; + add.s32 %r1386, %r1385, %r1309; + xor.b32 %r1387, %r1386, %r1353; + shf.l.wrap.b32 %r1388, %r1387, %r1387, 20; + add.s32 %r1389, %r1383, %r1172; + add.s32 %r1390, %r1389, %r1388; + xor.b32 %r1391, %r1390, %r1385; + shf.l.wrap.b32 %r1392, %r1391, %r1391, 24; + add.s32 %r1393, %r1392, 
%r1386; + xor.b32 %r1394, %r1393, %r1388; + shf.l.wrap.b32 %r1395, %r1394, %r1394, 25; + add.s32 %r1396, %r1348, %r1179; + add.s32 %r1397, %r1396, %r1311; + xor.b32 %r1398, %r1397, %r1336; + shf.l.wrap.b32 %r1399, %r1398, %r1398, 16; + add.s32 %r1400, %r1399, %r1323; + xor.b32 %r1401, %r1400, %r1311; + shf.l.wrap.b32 %r1402, %r1401, %r1401, 20; + add.s32 %r1403, %r1397, %r1130; + add.s32 %r1404, %r1403, %r1402; + xor.b32 %r1405, %r1404, %r1399; + shf.l.wrap.b32 %r1406, %r1405, %r1405, 24; + add.s32 %r1407, %r1406, %r1400; + xor.b32 %r1408, %r1407, %r1402; + shf.l.wrap.b32 %r1409, %r1408, %r1408, 25; + add.s32 %r1410, %r1362, %r1095; + add.s32 %r1411, %r1410, %r1409; + xor.b32 %r1412, %r1411, %r1378; + shf.l.wrap.b32 %r1413, %r1412, %r1412, 16; + add.s32 %r1414, %r1413, %r1393; + xor.b32 %r1415, %r1414, %r1409; + shf.l.wrap.b32 %r1416, %r1415, %r1415, 20; + add.s32 %r1417, %r1411, %r1102; + add.s32 %r1418, %r1417, %r1416; + xor.b32 %r1419, %r1418, %r1413; + shf.l.wrap.b32 %r1420, %r1419, %r1419, 24; + add.s32 %r1421, %r1420, %r1414; + xor.b32 %r1422, %r1421, %r1416; + shf.l.wrap.b32 %r1423, %r1422, %r1422, 25; + add.s32 %r1424, %r1376, %r1144; + add.s32 %r1425, %r1424, %r1367; + xor.b32 %r1426, %r1425, %r1392; + shf.l.wrap.b32 %r1427, %r1426, %r1426, 16; + add.s32 %r1428, %r1427, %r1407; + xor.b32 %r1429, %r1428, %r1367; + shf.l.wrap.b32 %r1430, %r1429, %r1429, 20; + add.s32 %r1431, %r1425, %r1158; + add.s32 %r1432, %r1431, %r1430; + xor.b32 %r1433, %r1432, %r1427; + shf.l.wrap.b32 %r1434, %r1433, %r1433, 24; + add.s32 %r1435, %r1434, %r1428; + xor.b32 %r1436, %r1435, %r1430; + shf.l.wrap.b32 %r1437, %r1436, %r1436, 25; + add.s32 %r1438, %r1390, %r1165; + add.s32 %r1439, %r1438, %r1381; + xor.b32 %r1440, %r1439, %r1406; + shf.l.wrap.b32 %r1441, %r1440, %r1440, 16; + add.s32 %r1442, %r1441, %r1365; + xor.b32 %r1443, %r1442, %r1381; + shf.l.wrap.b32 %r1444, %r1443, %r1443, 20; + add.s32 %r1445, %r1439, %r1088; + add.s32 %r1446, %r1445, %r1444; + xor.b32 %r1447, %r1446, %r1441; + shf.l.wrap.b32 %r1448, %r1447, %r1447, 24; + add.s32 %r1449, %r1448, %r1442; + xor.b32 %r1450, %r1449, %r1444; + shf.l.wrap.b32 %r1451, %r1450, %r1450, 25; + add.s32 %r1452, %r1404, %r1123; + add.s32 %r1453, %r1452, %r1395; + xor.b32 %r1454, %r1453, %r1364; + shf.l.wrap.b32 %r1455, %r1454, %r1454, 16; + add.s32 %r1456, %r1455, %r1379; + xor.b32 %r1457, %r1456, %r1395; + shf.l.wrap.b32 %r1458, %r1457, %r1457, 20; + add.s32 %r1459, %r1453, %r1172; + add.s32 %r1460, %r1459, %r1458; + xor.b32 %r1461, %r1460, %r1455; + shf.l.wrap.b32 %r1462, %r1461, %r1461, 24; + add.s32 %r1463, %r1462, %r1456; + xor.b32 %r1464, %r1463, %r1458; + shf.l.wrap.b32 %r1465, %r1464, %r1464, 25; + add.s32 %r1466, %r1418, %r1116; + add.s32 %r1467, %r1466, %r1437; + xor.b32 %r1468, %r1467, %r1462; + shf.l.wrap.b32 %r1469, %r1468, %r1468, 16; + add.s32 %r1470, %r1469, %r1449; + xor.b32 %r1471, %r1470, %r1437; + shf.l.wrap.b32 %r1472, %r1471, %r1471, 20; + add.s32 %r1473, %r1467, %r1109; + add.s32 %r1474, %r1473, %r1472; + xor.b32 %r1475, %r1474, %r1469; + shf.l.wrap.b32 %r1476, %r1475, %r1475, 24; + add.s32 %r1477, %r1476, %r1470; + xor.b32 %r1478, %r1477, %r1472; + shf.l.wrap.b32 %r1479, %r1478, %r1478, 25; + add.s32 %r1480, %r1432, %r1137; + add.s32 %r1481, %r1480, %r1451; + xor.b32 %r1482, %r1481, %r1420; + shf.l.wrap.b32 %r1483, %r1482, %r1482, 16; + add.s32 %r1484, %r1483, %r1463; + xor.b32 %r1485, %r1484, %r1451; + shf.l.wrap.b32 %r1486, %r1485, %r1485, 20; + add.s32 %r1487, %r1481, %r1074; + add.s32 %r1488, %r1487, %r1486; + xor.b32 %r1489, 
%r1488, %r1483; + shf.l.wrap.b32 %r1490, %r1489, %r1489, 24; + add.s32 %r1491, %r1490, %r1484; + xor.b32 %r1492, %r1491, %r1486; + shf.l.wrap.b32 %r1493, %r1492, %r1492, 25; + add.s32 %r1494, %r1446, %r1151; + add.s32 %r1495, %r1494, %r1465; + xor.b32 %r1496, %r1495, %r1434; + shf.l.wrap.b32 %r1497, %r1496, %r1496, 16; + add.s32 %r1498, %r1497, %r1421; + xor.b32 %r1499, %r1498, %r1465; + shf.l.wrap.b32 %r1500, %r1499, %r1499, 20; + add.s32 %r1501, %r1495, %r1179; + add.s32 %r1502, %r1501, %r1500; + xor.b32 %r1503, %r1502, %r1497; + shf.l.wrap.b32 %r1504, %r1503, %r1503, 24; + add.s32 %r1505, %r1504, %r1498; + xor.b32 %r1506, %r1505, %r1500; + shf.l.wrap.b32 %r1507, %r1506, %r1506, 25; + add.s32 %r1508, %r1460, %r1130; + add.s32 %r1509, %r1508, %r1423; + xor.b32 %r1510, %r1509, %r1448; + shf.l.wrap.b32 %r1511, %r1510, %r1510, 16; + add.s32 %r1512, %r1511, %r1435; + xor.b32 %r1513, %r1512, %r1423; + shf.l.wrap.b32 %r1514, %r1513, %r1513, 20; + add.s32 %r1515, %r1509, %r1081; + add.s32 %r1516, %r1515, %r1514; + xor.b32 %r1517, %r1516, %r1511; + shf.l.wrap.b32 %r1518, %r1517, %r1517, 24; + add.s32 %r1519, %r1518, %r1512; + xor.b32 %r1520, %r1519, %r1514; + shf.l.wrap.b32 %r1521, %r1520, %r1520, 25; + add.s32 %r1522, %r1474, %r1144; + add.s32 %r1523, %r1522, %r1521; + xor.b32 %r1524, %r1523, %r1490; + shf.l.wrap.b32 %r1525, %r1524, %r1524, 16; + add.s32 %r1526, %r1525, %r1505; + xor.b32 %r1527, %r1526, %r1521; + shf.l.wrap.b32 %r1528, %r1527, %r1527, 20; + add.s32 %r1529, %r1523, %r1123; + add.s32 %r1530, %r1529, %r1528; + xor.b32 %r1531, %r1530, %r1525; + shf.l.wrap.b32 %r1532, %r1531, %r1531, 24; + add.s32 %r1533, %r1532, %r1526; + xor.b32 %r1534, %r1533, %r1528; + shf.l.wrap.b32 %r1535, %r1534, %r1534, 25; + add.s32 %r1536, %r1488, %r1158; + add.s32 %r1537, %r1536, %r1479; + xor.b32 %r1538, %r1537, %r1504; + shf.l.wrap.b32 %r1539, %r1538, %r1538, 16; + add.s32 %r1540, %r1539, %r1519; + xor.b32 %r1541, %r1540, %r1479; + shf.l.wrap.b32 %r1542, %r1541, %r1541, 20; + add.s32 %r1543, %r1537, %r1137; + add.s32 %r1544, %r1543, %r1542; + xor.b32 %r1545, %r1544, %r1539; + shf.l.wrap.b32 %r1546, %r1545, %r1545, 24; + add.s32 %r1547, %r1546, %r1540; + xor.b32 %r1548, %r1547, %r1542; + shf.l.wrap.b32 %r1549, %r1548, %r1548, 25; + add.s32 %r1550, %r1502, %r1172; + add.s32 %r1551, %r1550, %r1493; + xor.b32 %r1552, %r1551, %r1518; + shf.l.wrap.b32 %r1553, %r1552, %r1552, 16; + add.s32 %r1554, %r1553, %r1477; + xor.b32 %r1555, %r1554, %r1493; + shf.l.wrap.b32 %r1556, %r1555, %r1555, 20; + add.s32 %r1557, %r1551, %r1095; + add.s32 %r1558, %r1557, %r1556; + xor.b32 %r1559, %r1558, %r1553; + shf.l.wrap.b32 %r1560, %r1559, %r1559, 24; + add.s32 %r1561, %r1560, %r1554; + xor.b32 %r1562, %r1561, %r1556; + shf.l.wrap.b32 %r1563, %r1562, %r1562, 25; + add.s32 %r1564, %r1516, %r1165; + add.s32 %r1565, %r1564, %r1507; + xor.b32 %r1566, %r1565, %r1476; + shf.l.wrap.b32 %r1567, %r1566, %r1566, 16; + add.s32 %r1568, %r1567, %r1491; + xor.b32 %r1569, %r1568, %r1507; + shf.l.wrap.b32 %r1570, %r1569, %r1569, 20; + add.s32 %r1571, %r1565, %r1179; + add.s32 %r1572, %r1571, %r1570; + xor.b32 %r1573, %r1572, %r1567; + shf.l.wrap.b32 %r1574, %r1573, %r1573, 24; + add.s32 %r1575, %r1574, %r1568; + xor.b32 %r1576, %r1575, %r1570; + shf.l.wrap.b32 %r1577, %r1576, %r1576, 25; + add.s32 %r1578, %r1530, %r1102; + add.s32 %r1579, %r1578, %r1549; + xor.b32 %r1580, %r1579, %r1574; + shf.l.wrap.b32 %r1581, %r1580, %r1580, 16; + add.s32 %r1582, %r1581, %r1561; + xor.b32 %r1583, %r1582, %r1549; + shf.l.wrap.b32 %r1584, %r1583, %r1583, 20; 
+ add.s32 %r1585, %r1579, %r1074; + add.s32 %r1586, %r1585, %r1584; + xor.b32 %r1587, %r1586, %r1581; + shf.l.wrap.b32 %r1588, %r1587, %r1587, 24; + add.s32 %r1589, %r1588, %r1582; + xor.b32 %r1590, %r1589, %r1584; + shf.l.wrap.b32 %r1591, %r1590, %r1590, 25; + add.s32 %r1592, %r1544, %r1151; + add.s32 %r1593, %r1592, %r1563; + xor.b32 %r1594, %r1593, %r1532; + shf.l.wrap.b32 %r1595, %r1594, %r1594, 16; + add.s32 %r1596, %r1595, %r1575; + xor.b32 %r1597, %r1596, %r1563; + shf.l.wrap.b32 %r1598, %r1597, %r1597, 20; + add.s32 %r1599, %r1593, %r1088; + add.s32 %r1600, %r1599, %r1598; + xor.b32 %r1601, %r1600, %r1595; + shf.l.wrap.b32 %r1602, %r1601, %r1601, 24; + add.s32 %r1603, %r1602, %r1596; + xor.b32 %r1604, %r1603, %r1598; + shf.l.wrap.b32 %r1605, %r1604, %r1604, 25; + add.s32 %r1606, %r1558, %r1109; + add.s32 %r1607, %r1606, %r1577; + xor.b32 %r1608, %r1607, %r1546; + shf.l.wrap.b32 %r1609, %r1608, %r1608, 16; + add.s32 %r1610, %r1609, %r1533; + xor.b32 %r1611, %r1610, %r1577; + shf.l.wrap.b32 %r1612, %r1611, %r1611, 20; + add.s32 %r1613, %r1607, %r1130; + add.s32 %r1614, %r1613, %r1612; + xor.b32 %r1615, %r1614, %r1609; + shf.l.wrap.b32 %r1616, %r1615, %r1615, 24; + add.s32 %r1617, %r1616, %r1610; + xor.b32 %r1618, %r1617, %r1612; + shf.l.wrap.b32 %r1619, %r1618, %r1618, 25; + add.s32 %r1620, %r1572, %r1081; + add.s32 %r1621, %r1620, %r1535; + xor.b32 %r1622, %r1621, %r1560; + shf.l.wrap.b32 %r1623, %r1622, %r1622, 16; + add.s32 %r1624, %r1623, %r1547; + xor.b32 %r1625, %r1624, %r1535; + shf.l.wrap.b32 %r1626, %r1625, %r1625, 20; + add.s32 %r1627, %r1621, %r1116; + add.s32 %r1628, %r1627, %r1626; + xor.b32 %r1629, %r1628, %r1623; + shf.l.wrap.b32 %r1630, %r1629, %r1629, 24; + add.s32 %r1631, %r1630, %r1624; + xor.b32 %r1632, %r1631, %r1626; + shf.l.wrap.b32 %r1633, %r1632, %r1632, 25; + add.s32 %r1634, %r1586, %r1158; + add.s32 %r1635, %r1634, %r1633; + xor.b32 %r1636, %r1635, %r1602; + shf.l.wrap.b32 %r1637, %r1636, %r1636, 16; + add.s32 %r1638, %r1637, %r1617; + xor.b32 %r1639, %r1638, %r1633; + shf.l.wrap.b32 %r1640, %r1639, %r1639, 20; + add.s32 %r1641, %r1635, %r1165; + add.s32 %r1642, %r1641, %r1640; + xor.b32 %r1643, %r1642, %r1637; + shf.l.wrap.b32 %r1644, %r1643, %r1643, 24; + add.s32 %r1645, %r1644, %r1638; + xor.b32 %r1646, %r1645, %r1640; + shf.l.wrap.b32 %r1647, %r1646, %r1646, 25; + add.s32 %r1648, %r1600, %r1137; + add.s32 %r1649, %r1648, %r1591; + xor.b32 %r1650, %r1649, %r1616; + shf.l.wrap.b32 %r1651, %r1650, %r1650, 16; + add.s32 %r1652, %r1651, %r1631; + xor.b32 %r1653, %r1652, %r1591; + shf.l.wrap.b32 %r1654, %r1653, %r1653, 20; + add.s32 %r1655, %r1649, %r1151; + add.s32 %r1656, %r1655, %r1654; + xor.b32 %r1657, %r1656, %r1651; + shf.l.wrap.b32 %r1658, %r1657, %r1657, 24; + add.s32 %r1659, %r1658, %r1652; + xor.b32 %r1660, %r1659, %r1654; + shf.l.wrap.b32 %r1661, %r1660, %r1660, 25; + add.s32 %r1662, %r1614, %r1179; + add.s32 %r1663, %r1662, %r1605; + xor.b32 %r1664, %r1663, %r1630; + shf.l.wrap.b32 %r1665, %r1664, %r1664, 16; + add.s32 %r1666, %r1665, %r1589; + xor.b32 %r1667, %r1666, %r1605; + shf.l.wrap.b32 %r1668, %r1667, %r1667, 20; + add.s32 %r1669, %r1663, %r1144; + add.s32 %r1670, %r1669, %r1668; + xor.b32 %r1671, %r1670, %r1665; + shf.l.wrap.b32 %r1672, %r1671, %r1671, 24; + add.s32 %r1673, %r1672, %r1666; + xor.b32 %r1674, %r1673, %r1668; + shf.l.wrap.b32 %r1675, %r1674, %r1674, 25; + add.s32 %r1676, %r1628, %r1172; + add.s32 %r1677, %r1676, %r1619; + xor.b32 %r1678, %r1677, %r1588; + shf.l.wrap.b32 %r1679, %r1678, %r1678, 16; + add.s32 %r1680, %r1679, 
%r1603; + xor.b32 %r1681, %r1680, %r1619; + shf.l.wrap.b32 %r1682, %r1681, %r1681, 20; + add.s32 %r1683, %r1677, %r1130; + add.s32 %r1684, %r1683, %r1682; + xor.b32 %r1685, %r1684, %r1679; + shf.l.wrap.b32 %r1686, %r1685, %r1685, 24; + add.s32 %r1687, %r1686, %r1680; + xor.b32 %r1688, %r1687, %r1682; + shf.l.wrap.b32 %r1689, %r1688, %r1688, 25; + add.s32 %r1690, %r1642, %r1123; + add.s32 %r1691, %r1690, %r1661; + xor.b32 %r1692, %r1691, %r1686; + shf.l.wrap.b32 %r1693, %r1692, %r1692, 16; + add.s32 %r1694, %r1693, %r1673; + xor.b32 %r1695, %r1694, %r1661; + shf.l.wrap.b32 %r1696, %r1695, %r1695, 20; + add.s32 %r1697, %r1691, %r1088; + add.s32 %r1698, %r1697, %r1696; + xor.b32 %r1699, %r1698, %r1693; + shf.l.wrap.b32 %r1700, %r1699, %r1699, 24; + add.s32 %r1701, %r1700, %r1694; + xor.b32 %r1702, %r1701, %r1696; + shf.l.wrap.b32 %r1703, %r1702, %r1702, 25; + add.s32 %r1704, %r1656, %r1109; + add.s32 %r1705, %r1704, %r1675; + xor.b32 %r1706, %r1705, %r1644; + shf.l.wrap.b32 %r1707, %r1706, %r1706, 16; + add.s32 %r1708, %r1707, %r1687; + xor.b32 %r1709, %r1708, %r1675; + shf.l.wrap.b32 %r1710, %r1709, %r1709, 20; + add.s32 %r1711, %r1705, %r1095; + add.s32 %r1712, %r1711, %r1710; + xor.b32 %r1713, %r1712, %r1707; + shf.l.wrap.b32 %r1714, %r1713, %r1713, 24; + add.s32 %r1715, %r1714, %r1708; + xor.b32 %r1716, %r1715, %r1710; + shf.l.wrap.b32 %r1717, %r1716, %r1716, 25; + add.s32 %r1718, %r1670, %r1074; + add.s32 %r1719, %r1718, %r1689; + xor.b32 %r1720, %r1719, %r1658; + shf.l.wrap.b32 %r1721, %r1720, %r1720, 16; + add.s32 %r1722, %r1721, %r1645; + xor.b32 %r1723, %r1722, %r1689; + shf.l.wrap.b32 %r1724, %r1723, %r1723, 20; + add.s32 %r1725, %r1719, %r1081; + add.s32 %r1726, %r1725, %r1724; + xor.b32 %r1727, %r1726, %r1721; + shf.l.wrap.b32 %r1728, %r1727, %r1727, 24; + add.s32 %r1729, %r1728, %r1722; + xor.b32 %r1730, %r1729, %r1724; + shf.l.wrap.b32 %r1731, %r1730, %r1730, 25; + add.s32 %r1732, %r1684, %r1116; + add.s32 %r1733, %r1732, %r1647; + xor.b32 %r1734, %r1733, %r1672; + shf.l.wrap.b32 %r1735, %r1734, %r1734, 16; + add.s32 %r1736, %r1735, %r1659; + xor.b32 %r1737, %r1736, %r1647; + shf.l.wrap.b32 %r1738, %r1737, %r1737, 20; + add.s32 %r1739, %r1733, %r1102; + add.s32 %r1740, %r1739, %r1738; + xor.b32 %r1741, %r1740, %r1735; + shf.l.wrap.b32 %r1742, %r1741, %r1741, 24; + add.s32 %r1743, %r1742, %r1736; + xor.b32 %r1744, %r1743, %r1738; + shf.l.wrap.b32 %r1745, %r1744, %r1744, 25; + add.s32 %r1746, %r1698, %r1137; + add.s32 %r1747, %r1746, %r1745; + xor.b32 %r1748, %r1747, %r1714; + shf.l.wrap.b32 %r1749, %r1748, %r1748, 16; + add.s32 %r1750, %r1749, %r1729; + xor.b32 %r1751, %r1750, %r1745; + shf.l.wrap.b32 %r1752, %r1751, %r1751, 20; + add.s32 %r1753, %r1747, %r1172; + add.s32 %r1754, %r1753, %r1752; + xor.b32 %r1755, %r1754, %r1749; + shf.l.wrap.b32 %r1756, %r1755, %r1755, 24; + add.s32 %r1757, %r1756, %r1750; + xor.b32 %r1758, %r1757, %r1752; + shf.l.wrap.b32 %r1759, %r1758, %r1758, 25; + add.s32 %r1760, %r1712, %r1151; + add.s32 %r1761, %r1760, %r1703; + xor.b32 %r1762, %r1761, %r1728; + shf.l.wrap.b32 %r1763, %r1762, %r1762, 16; + add.s32 %r1764, %r1763, %r1743; + xor.b32 %r1765, %r1764, %r1703; + shf.l.wrap.b32 %r1766, %r1765, %r1765, 20; + add.s32 %r1767, %r1761, %r1109; + add.s32 %r1768, %r1767, %r1766; + xor.b32 %r1769, %r1768, %r1763; + shf.l.wrap.b32 %r1770, %r1769, %r1769, 24; + add.s32 %r1771, %r1770, %r1764; + xor.b32 %r1772, %r1771, %r1766; + shf.l.wrap.b32 %r1773, %r1772, %r1772, 25; + add.s32 %r1774, %r1726, %r1130; + add.s32 %r1775, %r1774, %r1717; + xor.b32 %r1776, 
%r1775, %r1742; + shf.l.wrap.b32 %r1777, %r1776, %r1776, 16; + add.s32 %r1778, %r1777, %r1701; + xor.b32 %r1779, %r1778, %r1717; + shf.l.wrap.b32 %r1780, %r1779, %r1779, 20; + add.s32 %r1781, %r1775, %r1158; + add.s32 %r1782, %r1781, %r1780; + xor.b32 %r1783, %r1782, %r1777; + shf.l.wrap.b32 %r1784, %r1783, %r1783, 24; + add.s32 %r1785, %r1784, %r1778; + xor.b32 %r1786, %r1785, %r1780; + shf.l.wrap.b32 %r1787, %r1786, %r1786, 25; + add.s32 %r1788, %r1740, %r1179; + add.s32 %r1789, %r1788, %r1731; + xor.b32 %r1790, %r1789, %r1700; + shf.l.wrap.b32 %r1791, %r1790, %r1790, 16; + add.s32 %r1792, %r1791, %r1715; + xor.b32 %r1793, %r1792, %r1731; + shf.l.wrap.b32 %r1794, %r1793, %r1793, 20; + add.s32 %r1795, %r1789, %r1081; + add.s32 %r1796, %r1795, %r1794; + xor.b32 %r1797, %r1796, %r1791; + shf.l.wrap.b32 %r1798, %r1797, %r1797, 24; + add.s32 %r1799, %r1798, %r1792; + xor.b32 %r1800, %r1799, %r1794; + shf.l.wrap.b32 %r1801, %r1800, %r1800, 25; + add.s32 %r1802, %r1754, %r1165; + add.s32 %r1803, %r1802, %r1773; + xor.b32 %r1804, %r1803, %r1798; + shf.l.wrap.b32 %r1805, %r1804, %r1804, 16; + add.s32 %r1806, %r1805, %r1785; + xor.b32 %r1807, %r1806, %r1773; + shf.l.wrap.b32 %r1808, %r1807, %r1807, 20; + add.s32 %r1809, %r1803, %r1095; + add.s32 %r1810, %r1809, %r1808; + xor.b32 %r1811, %r1810, %r1805; + shf.l.wrap.b32 %r1812, %r1811, %r1811, 24; + add.s32 %r1813, %r1812, %r1806; + xor.b32 %r1814, %r1813, %r1808; + shf.l.wrap.b32 %r1815, %r1814, %r1814, 25; + add.s32 %r1816, %r1768, %r1074; + add.s32 %r1817, %r1816, %r1787; + xor.b32 %r1818, %r1817, %r1756; + shf.l.wrap.b32 %r1819, %r1818, %r1818, 16; + add.s32 %r1820, %r1819, %r1799; + xor.b32 %r1821, %r1820, %r1787; + shf.l.wrap.b32 %r1822, %r1821, %r1821, 20; + add.s32 %r1823, %r1817, %r1144; + add.s32 %r1824, %r1823, %r1822; + xor.b32 %r1825, %r1824, %r1819; + shf.l.wrap.b32 %r1826, %r1825, %r1825, 24; + add.s32 %r1827, %r1826, %r1820; + xor.b32 %r1828, %r1827, %r1822; + shf.l.wrap.b32 %r1829, %r1828, %r1828, 25; + add.s32 %r1830, %r1782, %r1088; + add.s32 %r1831, %r1830, %r1801; + xor.b32 %r1832, %r1831, %r1770; + shf.l.wrap.b32 %r1833, %r1832, %r1832, 16; + add.s32 %r1834, %r1833, %r1757; + xor.b32 %r1835, %r1834, %r1801; + shf.l.wrap.b32 %r1836, %r1835, %r1835, 20; + add.s32 %r1837, %r1831, %r1116; + add.s32 %r1838, %r1837, %r1836; + xor.b32 %r1839, %r1838, %r1833; + shf.l.wrap.b32 %r1840, %r1839, %r1839, 24; + add.s32 %r1841, %r1840, %r1834; + xor.b32 %r1842, %r1841, %r1836; + shf.l.wrap.b32 %r1843, %r1842, %r1842, 25; + add.s32 %r1844, %r1796, %r1102; + add.s32 %r1845, %r1844, %r1759; + xor.b32 %r1846, %r1845, %r1784; + shf.l.wrap.b32 %r1847, %r1846, %r1846, 16; + add.s32 %r1848, %r1847, %r1771; + xor.b32 %r1849, %r1848, %r1759; + shf.l.wrap.b32 %r1850, %r1849, %r1849, 20; + add.s32 %r1851, %r1845, %r1123; + add.s32 %r1852, %r1851, %r1850; + xor.b32 %r1853, %r1852, %r1847; + shf.l.wrap.b32 %r1854, %r1853, %r1853, 24; + add.s32 %r1855, %r1854, %r1848; + xor.b32 %r1856, %r1855, %r1850; + shf.l.wrap.b32 %r1857, %r1856, %r1856, 25; + add.s32 %r1858, %r1810, %r1151; + add.s32 %r1859, %r1858, %r1857; + xor.b32 %r1860, %r1859, %r1826; + shf.l.wrap.b32 %r1861, %r1860, %r1860, 16; + add.s32 %r1862, %r1861, %r1841; + xor.b32 %r1863, %r1862, %r1857; + shf.l.wrap.b32 %r1864, %r1863, %r1863, 20; + add.s32 %r1865, %r1859, %r1179; + add.s32 %r1866, %r1865, %r1864; + xor.b32 %r1867, %r1866, %r1861; + shf.l.wrap.b32 %r1868, %r1867, %r1867, 24; + add.s32 %r1869, %r1868, %r1862; + xor.b32 %r1870, %r1869, %r1864; + shf.l.wrap.b32 %r1871, %r1870, %r1870, 25; 
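+ // editor annotation: assuming this is BLAKE3, seven such rounds are applied
+ // per 64-byte block (the message words are permuted between rounds), and the
+ // instructions here belong to the final round. The xor.b32 / st.local.u32
+ // stores to [%rd3+-104]..[%rd3+-76] that follow fold the two halves of the
+ // 16-word state together to form the next 8-word chaining value.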
+ add.s32 %r1872, %r1824, %r1109; + add.s32 %r1873, %r1872, %r1815; + xor.b32 %r1874, %r1873, %r1840; + shf.l.wrap.b32 %r1875, %r1874, %r1874, 16; + add.s32 %r1876, %r1875, %r1855; + xor.b32 %r1877, %r1876, %r1815; + shf.l.wrap.b32 %r1878, %r1877, %r1877, 20; + add.s32 %r1879, %r1873, %r1074; + add.s32 %r1880, %r1879, %r1878; + xor.b32 %r1881, %r1880, %r1875; + shf.l.wrap.b32 %r1882, %r1881, %r1881, 24; + add.s32 %r1883, %r1882, %r1876; + xor.b32 %r1884, %r1883, %r1878; + shf.l.wrap.b32 %r1885, %r1884, %r1884, 25; + add.s32 %r1886, %r1838, %r1081; + add.s32 %r1887, %r1886, %r1829; + xor.b32 %r1888, %r1887, %r1854; + shf.l.wrap.b32 %r1889, %r1888, %r1888, 16; + add.s32 %r1890, %r1889, %r1813; + xor.b32 %r1891, %r1890, %r1829; + shf.l.wrap.b32 %r1892, %r1891, %r1891, 20; + add.s32 %r1893, %r1887, %r1137; + add.s32 %r1894, %r1893, %r1892; + xor.b32 %r1895, %r1894, %r1889; + shf.l.wrap.b32 %r1896, %r1895, %r1895, 24; + add.s32 %r1897, %r1896, %r1890; + xor.b32 %r1898, %r1897, %r1892; + shf.l.wrap.b32 %r1899, %r1898, %r1898, 25; + add.s32 %r1900, %r1852, %r1130; + add.s32 %r1901, %r1900, %r1843; + xor.b32 %r1902, %r1901, %r1812; + shf.l.wrap.b32 %r1903, %r1902, %r1902, 16; + add.s32 %r1904, %r1903, %r1827; + xor.b32 %r1905, %r1904, %r1843; + shf.l.wrap.b32 %r1906, %r1905, %r1905, 20; + add.s32 %r1907, %r1901, %r1116; + add.s32 %r1908, %r1907, %r1906; + xor.b32 %r1909, %r1908, %r1903; + shf.l.wrap.b32 %r1910, %r1909, %r1909, 24; + add.s32 %r1911, %r1910, %r1904; + xor.b32 %r1912, %r1911, %r1906; + shf.l.wrap.b32 %r1913, %r1912, %r1912, 25; + add.s32 %r1914, %r1866, %r1172; + add.s32 %r1915, %r1914, %r1885; + xor.b32 %r1916, %r1915, %r1910; + shf.l.wrap.b32 %r1917, %r1916, %r1916, 16; + add.s32 %r1918, %r1917, %r1897; + xor.b32 %r1919, %r1918, %r1885; + shf.l.wrap.b32 %r1920, %r1919, %r1919, 20; + add.s32 %r1921, %r1915, %r1144; + add.s32 %r1922, %r1921, %r1920; + xor.b32 %r1923, %r1922, %r1917; + shf.l.wrap.b32 %r1924, %r1923, %r1923, 24; + add.s32 %r1925, %r1924, %r1918; + xor.b32 %r1926, %r1925, %r1920; + shf.l.wrap.b32 %r1927, %r1926, %r1926, 25; + add.s32 %r1928, %r1880, %r1088; + add.s32 %r1929, %r1928, %r1899; + xor.b32 %r1930, %r1929, %r1868; + shf.l.wrap.b32 %r1931, %r1930, %r1930, 16; + add.s32 %r1932, %r1931, %r1911; + xor.b32 %r1933, %r1932, %r1899; + shf.l.wrap.b32 %r1934, %r1933, %r1933, 20; + add.s32 %r1935, %r1929, %r1158; + add.s32 %r1936, %r1935, %r1934; + xor.b32 %r1937, %r1936, %r1931; + shf.l.wrap.b32 %r1938, %r1937, %r1937, 24; + add.s32 %r1939, %r1938, %r1932; + xor.b32 %r1940, %r1939, %r1934; + shf.l.wrap.b32 %r1941, %r1940, %r1940, 25; + add.s32 %r1942, %r1894, %r1095; + add.s32 %r1943, %r1942, %r1913; + xor.b32 %r1944, %r1943, %r1882; + shf.l.wrap.b32 %r1945, %r1944, %r1944, 16; + add.s32 %r1946, %r1945, %r1869; + xor.b32 %r1947, %r1946, %r1913; + shf.l.wrap.b32 %r1948, %r1947, %r1947, 20; + add.s32 %r1949, %r1943, %r1102; + add.s32 %r1950, %r1949, %r1948; + xor.b32 %r1951, %r1950, %r1945; + shf.l.wrap.b32 %r1952, %r1951, %r1951, 24; + add.s32 %r1953, %r1952, %r1946; + xor.b32 %r1954, %r1953, %r1948; + shf.l.wrap.b32 %r1955, %r1954, %r1954, 25; + add.s32 %r1956, %r1908, %r1123; + add.s32 %r1957, %r1956, %r1871; + xor.b32 %r1958, %r1957, %r1896; + shf.l.wrap.b32 %r1959, %r1958, %r1958, 16; + add.s32 %r1960, %r1959, %r1883; + xor.b32 %r1961, %r1960, %r1871; + shf.l.wrap.b32 %r1962, %r1961, %r1961, 20; + add.s32 %r1963, %r1957, %r1165; + add.s32 %r1964, %r1963, %r1962; + xor.b32 %r1965, %r1964, %r1959; + shf.l.wrap.b32 %r1966, %r1965, %r1965, 24; + add.s32 %r1967, %r1966, 
%r1960; + xor.b32 %r1968, %r1967, %r1962; + shf.l.wrap.b32 %r1969, %r1968, %r1968, 25; + xor.b32 %r11657, %r1953, %r1922; + st.local.u32 [%rd3+-104], %r11657; + xor.b32 %r11656, %r1967, %r1936; + st.local.u32 [%rd3+-100], %r11656; + xor.b32 %r11655, %r1925, %r1950; + st.local.u32 [%rd3+-96], %r11655; + xor.b32 %r11654, %r1939, %r1964; + st.local.u32 [%rd3+-92], %r11654; + xor.b32 %r11653, %r1969, %r1938; + st.local.u32 [%rd3+-88], %r11653; + xor.b32 %r11652, %r1927, %r1952; + st.local.u32 [%rd3+-84], %r11652; + xor.b32 %r11651, %r1941, %r1966; + st.local.u32 [%rd3+-80], %r11651; + xor.b32 %r11650, %r1955, %r1924; + st.local.u32 [%rd3+-76], %r11650; + add.s16 %rs352, %rs352, 1; + st.local.u8 [%rd3+1], %rs352; + add.s64 %rd261, %rd261, 64; + add.s64 %rd244, %rd244, -64; + setp.gt.u64 %p10, %rd244, 64; + @%p10 bra $L__BB1_11; + +$L__BB1_12: + cvt.u64.u16 %rd120, %rs351; + and.b64 %rd24, %rd120, 255; + mov.u64 %rd121, 64; + sub.s64 %rd122, %rd121, %rd24; + min.u64 %rd25, %rd122, %rd244; + setp.eq.s64 %p11, %rd25, 0; + @%p11 bra $L__BB1_15; + + add.s64 %rd124, %rd2, %rd24; + add.s64 %rd26, %rd124, 72; + mov.u64 %rd245, 0; + +$L__BB1_14: + add.s64 %rd125, %rd261, %rd245; + ld.local.u8 %rs119, [%rd125]; + add.s64 %rd126, %rd26, %rd245; + st.local.u8 [%rd126], %rs119; + add.s64 %rd245, %rd245, 1; + setp.lt.u64 %p12, %rd245, %rd25; + @%p12 bra $L__BB1_14; + +$L__BB1_15: + cvt.u16.u64 %rs120, %rd25; + ld.local.u8 %rs121, [%rd3]; + add.s16 %rs13, %rs121, %rs120; + st.local.u8 [%rd3], %rs13; + mov.u64 %rd127, 32; + sub.s64 %rd29, %rd127, %rd6; + setp.eq.s64 %p13, %rd29, 0; + @%p13 bra $L__BB1_68; + + ld.local.u8 %rs122, [%rd3+1]; + setp.eq.s16 %p14, %rs122, 0; + selp.u16 %rs123, 1, 0, %p14; + ld.local.u8 %rs124, [%rd3+2]; + or.b16 %rs125, %rs124, %rs123; + or.b16 %rs126, %rs125, 2; + ld.local.u8 %r1970, [%rd3+-64]; + ld.local.u8 %r1971, [%rd3+-63]; + prmt.b32 %r1972, %r1971, %r1970, 30212; + ld.local.u8 %r1973, [%rd3+-62]; + prmt.b32 %r1974, %r1973, %r1972, 28756; + ld.local.u8 %r1975, [%rd3+-61]; + prmt.b32 %r1976, %r1975, %r1974, 1620; + ld.local.u8 %r1977, [%rd3+-60]; + ld.local.u8 %r1978, [%rd3+-59]; + prmt.b32 %r1979, %r1978, %r1977, 30212; + ld.local.u8 %r1980, [%rd3+-58]; + prmt.b32 %r1981, %r1980, %r1979, 28756; + ld.local.u8 %r1982, [%rd3+-57]; + prmt.b32 %r1983, %r1982, %r1981, 1620; + ld.local.u8 %r1984, [%rd3+-56]; + ld.local.u8 %r1985, [%rd3+-55]; + prmt.b32 %r1986, %r1985, %r1984, 30212; + ld.local.u8 %r1987, [%rd3+-54]; + prmt.b32 %r1988, %r1987, %r1986, 28756; + ld.local.u8 %r1989, [%rd3+-53]; + prmt.b32 %r1990, %r1989, %r1988, 1620; + ld.local.u8 %r1991, [%rd3+-52]; + ld.local.u8 %r1992, [%rd3+-51]; + prmt.b32 %r1993, %r1992, %r1991, 30212; + ld.local.u8 %r1994, [%rd3+-50]; + prmt.b32 %r1995, %r1994, %r1993, 28756; + ld.local.u8 %r1996, [%rd3+-49]; + prmt.b32 %r1997, %r1996, %r1995, 1620; + ld.local.u8 %r1998, [%rd3+-48]; + ld.local.u8 %r1999, [%rd3+-47]; + prmt.b32 %r2000, %r1999, %r1998, 30212; + ld.local.u8 %r2001, [%rd3+-46]; + prmt.b32 %r2002, %r2001, %r2000, 28756; + ld.local.u8 %r2003, [%rd3+-45]; + prmt.b32 %r2004, %r2003, %r2002, 1620; + ld.local.u8 %r2005, [%rd3+-44]; + ld.local.u8 %r2006, [%rd3+-43]; + prmt.b32 %r2007, %r2006, %r2005, 30212; + ld.local.u8 %r2008, [%rd3+-42]; + prmt.b32 %r2009, %r2008, %r2007, 28756; + ld.local.u8 %r2010, [%rd3+-41]; + prmt.b32 %r2011, %r2010, %r2009, 1620; + ld.local.u8 %r2012, [%rd3+-40]; + ld.local.u8 %r2013, [%rd3+-39]; + prmt.b32 %r2014, %r2013, %r2012, 30212; + ld.local.u8 %r2015, [%rd3+-38]; + prmt.b32 %r2016, %r2015, %r2014, 28756; + 
ld.local.u8 %r2017, [%rd3+-37]; + prmt.b32 %r2018, %r2017, %r2016, 1620; + ld.local.u8 %r2019, [%rd3+-36]; + ld.local.u8 %r2020, [%rd3+-35]; + prmt.b32 %r2021, %r2020, %r2019, 30212; + ld.local.u8 %r2022, [%rd3+-34]; + prmt.b32 %r2023, %r2022, %r2021, 28756; + ld.local.u8 %r2024, [%rd3+-33]; + prmt.b32 %r2025, %r2024, %r2023, 1620; + ld.local.u8 %r2026, [%rd3+-32]; + ld.local.u8 %r2027, [%rd3+-31]; + prmt.b32 %r2028, %r2027, %r2026, 30212; + ld.local.u8 %r2029, [%rd3+-30]; + prmt.b32 %r2030, %r2029, %r2028, 28756; + ld.local.u8 %r2031, [%rd3+-29]; + prmt.b32 %r2032, %r2031, %r2030, 1620; + ld.local.u8 %r2033, [%rd3+-28]; + ld.local.u8 %r2034, [%rd3+-27]; + prmt.b32 %r2035, %r2034, %r2033, 30212; + ld.local.u8 %r2036, [%rd3+-26]; + prmt.b32 %r2037, %r2036, %r2035, 28756; + ld.local.u8 %r2038, [%rd3+-25]; + prmt.b32 %r2039, %r2038, %r2037, 1620; + ld.local.u8 %r2040, [%rd3+-24]; + ld.local.u8 %r2041, [%rd3+-23]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd3+-22]; + prmt.b32 %r2044, %r2043, %r2042, 28756; + ld.local.u8 %r2045, [%rd3+-21]; + prmt.b32 %r2046, %r2045, %r2044, 1620; + ld.local.u8 %r2047, [%rd3+-20]; + ld.local.u8 %r2048, [%rd3+-19]; + prmt.b32 %r2049, %r2048, %r2047, 30212; + ld.local.u8 %r2050, [%rd3+-18]; + prmt.b32 %r2051, %r2050, %r2049, 28756; + ld.local.u8 %r2052, [%rd3+-17]; + prmt.b32 %r2053, %r2052, %r2051, 1620; + ld.local.u8 %r2054, [%rd3+-16]; + ld.local.u8 %r2055, [%rd3+-15]; + prmt.b32 %r2056, %r2055, %r2054, 30212; + ld.local.u8 %r2057, [%rd3+-14]; + prmt.b32 %r2058, %r2057, %r2056, 28756; + ld.local.u8 %r2059, [%rd3+-13]; + prmt.b32 %r2060, %r2059, %r2058, 1620; + ld.local.u8 %r2061, [%rd3+-12]; + ld.local.u8 %r2062, [%rd3+-11]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + ld.local.u8 %r2064, [%rd3+-10]; + prmt.b32 %r2065, %r2064, %r2063, 28756; + ld.local.u8 %r2066, [%rd3+-9]; + prmt.b32 %r2067, %r2066, %r2065, 1620; + ld.local.u8 %r2068, [%rd3+-8]; + ld.local.u8 %r2069, [%rd3+-7]; + prmt.b32 %r2070, %r2069, %r2068, 30212; + ld.local.u8 %r2071, [%rd3+-6]; + prmt.b32 %r2072, %r2071, %r2070, 28756; + ld.local.u8 %r2073, [%rd3+-5]; + prmt.b32 %r2074, %r2073, %r2072, 1620; + ld.local.u8 %r2075, [%rd3+-4]; + ld.local.u8 %r2076, [%rd3+-3]; + prmt.b32 %r2077, %r2076, %r2075, 30212; + ld.local.u8 %r2078, [%rd3+-2]; + prmt.b32 %r2079, %r2078, %r2077, 28756; + ld.local.u8 %r2080, [%rd3+-1]; + prmt.b32 %r2081, %r2080, %r2079, 1620; + ld.local.u64 %rd128, [%rd3+-72]; + cvt.u32.u64 %r2082, %rd128; + shr.u64 %rd129, %rd128, 32; + cvt.u32.u64 %r2083, %rd129; + cvt.u32.u16 %r2084, %rs126; + and.b32 %r2085, %r2084, 255; + ld.local.u8 %r2086, [%rd3+-88]; + ld.local.u8 %r2087, [%rd3+-87]; + prmt.b32 %r2088, %r2087, %r2086, 30212; + ld.local.u8 %r2089, [%rd3+-86]; + ld.local.u8 %r2090, [%rd3+-85]; + prmt.b32 %r2091, %r2090, %r2089, 30212; + prmt.b32 %r2092, %r2091, %r2088, 4180; + ld.local.u8 %r2093, [%rd3+-104]; + ld.local.u8 %r2094, [%rd3+-103]; + prmt.b32 %r2095, %r2094, %r2093, 30212; + ld.local.u8 %r2096, [%rd3+-102]; + ld.local.u8 %r2097, [%rd3+-101]; + prmt.b32 %r2098, %r2097, %r2096, 30212; + prmt.b32 %r2099, %r2098, %r2095, 4180; + add.s32 %r2100, %r2092, %r2099; + add.s32 %r2101, %r2100, %r1976; + xor.b32 %r2102, %r2101, %r2082; + shf.l.wrap.b32 %r2103, %r2102, %r2102, 16; + add.s32 %r2104, %r2103, 1779033703; + xor.b32 %r2105, %r2104, %r2092; + shf.l.wrap.b32 %r2106, %r2105, %r2105, 20; + add.s32 %r2107, %r1983, %r2101; + add.s32 %r2108, %r2107, %r2106; + xor.b32 %r2109, %r2108, %r2103; + shf.l.wrap.b32 %r2110, %r2109, %r2109, 24; + add.s32 %r2111, 
%r2110, %r2104; + xor.b32 %r2112, %r2111, %r2106; + shf.l.wrap.b32 %r2113, %r2112, %r2112, 25; + ld.local.u8 %r2114, [%rd3+-84]; + ld.local.u8 %r2115, [%rd3+-83]; + prmt.b32 %r2116, %r2115, %r2114, 30212; + ld.local.u8 %r2117, [%rd3+-82]; + ld.local.u8 %r2118, [%rd3+-81]; + prmt.b32 %r2119, %r2118, %r2117, 30212; + prmt.b32 %r2120, %r2119, %r2116, 4180; + ld.local.u8 %r2121, [%rd3+-100]; + ld.local.u8 %r2122, [%rd3+-99]; + prmt.b32 %r2123, %r2122, %r2121, 30212; + ld.local.u8 %r2124, [%rd3+-98]; + ld.local.u8 %r2125, [%rd3+-97]; + prmt.b32 %r2126, %r2125, %r2124, 30212; + prmt.b32 %r2127, %r2126, %r2123, 4180; + add.s32 %r2128, %r2120, %r2127; + add.s32 %r2129, %r2128, %r1990; + xor.b32 %r2130, %r2129, %r2083; + shf.l.wrap.b32 %r2131, %r2130, %r2130, 16; + add.s32 %r2132, %r2131, -1150833019; + xor.b32 %r2133, %r2132, %r2120; + shf.l.wrap.b32 %r2134, %r2133, %r2133, 20; + add.s32 %r2135, %r1997, %r2129; + add.s32 %r2136, %r2135, %r2134; + xor.b32 %r2137, %r2136, %r2131; + shf.l.wrap.b32 %r2138, %r2137, %r2137, 24; + add.s32 %r2139, %r2138, %r2132; + xor.b32 %r2140, %r2139, %r2134; + shf.l.wrap.b32 %r2141, %r2140, %r2140, 25; + ld.local.u8 %r2142, [%rd3+-80]; + ld.local.u8 %r2143, [%rd3+-79]; + prmt.b32 %r2144, %r2143, %r2142, 30212; + ld.local.u8 %r2145, [%rd3+-78]; + ld.local.u8 %r2146, [%rd3+-77]; + prmt.b32 %r2147, %r2146, %r2145, 30212; + prmt.b32 %r2148, %r2147, %r2144, 4180; + ld.local.u8 %r2149, [%rd3+-96]; + ld.local.u8 %r2150, [%rd3+-95]; + prmt.b32 %r2151, %r2150, %r2149, 30212; + ld.local.u8 %r2152, [%rd3+-94]; + ld.local.u8 %r2153, [%rd3+-93]; + prmt.b32 %r2154, %r2153, %r2152, 30212; + prmt.b32 %r2155, %r2154, %r2151, 4180; + add.s32 %r2156, %r2148, %r2155; + add.s32 %r2157, %r2156, %r2004; + cvt.u32.u16 %r2158, %rs13; + and.b32 %r2159, %r2158, 255; + xor.b32 %r2160, %r2157, %r2159; + shr.u32 %r2161, %r2157, 16; + shl.b32 %r2162, %r2160, 16; + or.b32 %r2163, %r2162, %r2161; + add.s32 %r2164, %r2163, 1013904242; + xor.b32 %r2165, %r2164, %r2148; + shf.l.wrap.b32 %r2166, %r2165, %r2165, 20; + add.s32 %r2167, %r2011, %r2157; + add.s32 %r2168, %r2167, %r2166; + xor.b32 %r2169, %r2168, %r2163; + shf.l.wrap.b32 %r2170, %r2169, %r2169, 24; + add.s32 %r2171, %r2170, %r2164; + xor.b32 %r2172, %r2171, %r2166; + shf.l.wrap.b32 %r2173, %r2172, %r2172, 25; + ld.local.u8 %r2174, [%rd3+-76]; + ld.local.u8 %r2175, [%rd3+-75]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.local.u8 %r2177, [%rd3+-74]; + ld.local.u8 %r2178, [%rd3+-73]; + prmt.b32 %r2179, %r2178, %r2177, 30212; + prmt.b32 %r2180, %r2179, %r2176, 4180; + ld.local.u8 %r2181, [%rd3+-92]; + ld.local.u8 %r2182, [%rd3+-91]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.local.u8 %r2184, [%rd3+-90]; + ld.local.u8 %r2185, [%rd3+-89]; + prmt.b32 %r2186, %r2185, %r2184, 30212; + prmt.b32 %r2187, %r2186, %r2183, 4180; + add.s32 %r2188, %r2180, %r2187; + add.s32 %r2189, %r2188, %r2018; + xor.b32 %r2190, %r2189, %r2085; + shr.u32 %r2191, %r2189, 16; + shl.b32 %r2192, %r2190, 16; + or.b32 %r2193, %r2192, %r2191; + add.s32 %r2194, %r2193, -1521486534; + xor.b32 %r2195, %r2194, %r2180; + shf.l.wrap.b32 %r2196, %r2195, %r2195, 20; + add.s32 %r2197, %r2025, %r2189; + add.s32 %r2198, %r2197, %r2196; + xor.b32 %r2199, %r2198, %r2193; + shf.l.wrap.b32 %r2200, %r2199, %r2199, 24; + add.s32 %r2201, %r2200, %r2194; + xor.b32 %r2202, %r2201, %r2196; + shf.l.wrap.b32 %r2203, %r2202, %r2202, 25; + add.s32 %r2204, %r2141, %r2108; + add.s32 %r2205, %r2204, %r2032; + xor.b32 %r2206, %r2200, %r2205; + shf.l.wrap.b32 %r2207, %r2206, %r2206, 16; + add.s32 
%r2208, %r2207, %r2171; + xor.b32 %r2209, %r2208, %r2141; + shf.l.wrap.b32 %r2210, %r2209, %r2209, 20; + add.s32 %r2211, %r2039, %r2205; + add.s32 %r2212, %r2211, %r2210; + xor.b32 %r2213, %r2212, %r2207; + shf.l.wrap.b32 %r2214, %r2213, %r2213, 24; + add.s32 %r2215, %r2214, %r2208; + xor.b32 %r2216, %r2215, %r2210; + shf.l.wrap.b32 %r2217, %r2216, %r2216, 25; + add.s32 %r2218, %r2173, %r2136; + add.s32 %r2219, %r2218, %r2046; + xor.b32 %r2220, %r2219, %r2110; + shf.l.wrap.b32 %r2221, %r2220, %r2220, 16; + add.s32 %r2222, %r2221, %r2201; + xor.b32 %r2223, %r2222, %r2173; + shf.l.wrap.b32 %r2224, %r2223, %r2223, 20; + add.s32 %r2225, %r2053, %r2219; + add.s32 %r2226, %r2225, %r2224; + xor.b32 %r2227, %r2226, %r2221; + shf.l.wrap.b32 %r2228, %r2227, %r2227, 24; + add.s32 %r2229, %r2228, %r2222; + xor.b32 %r2230, %r2229, %r2224; + shf.l.wrap.b32 %r2231, %r2230, %r2230, 25; + add.s32 %r2232, %r2203, %r2168; + add.s32 %r2233, %r2232, %r2060; + xor.b32 %r2234, %r2233, %r2138; + shf.l.wrap.b32 %r2235, %r2234, %r2234, 16; + add.s32 %r2236, %r2235, %r2111; + xor.b32 %r2237, %r2236, %r2203; + shf.l.wrap.b32 %r2238, %r2237, %r2237, 20; + add.s32 %r2239, %r2067, %r2233; + add.s32 %r2240, %r2239, %r2238; + xor.b32 %r2241, %r2240, %r2235; + shf.l.wrap.b32 %r2242, %r2241, %r2241, 24; + add.s32 %r2243, %r2242, %r2236; + xor.b32 %r2244, %r2243, %r2238; + shf.l.wrap.b32 %r2245, %r2244, %r2244, 25; + add.s32 %r2246, %r2198, %r2113; + add.s32 %r2247, %r2246, %r2074; + xor.b32 %r2248, %r2247, %r2170; + shf.l.wrap.b32 %r2249, %r2248, %r2248, 16; + add.s32 %r2250, %r2249, %r2139; + xor.b32 %r2251, %r2250, %r2113; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 20; + add.s32 %r2253, %r2081, %r2247; + add.s32 %r2254, %r2253, %r2252; + xor.b32 %r2255, %r2254, %r2249; + shf.l.wrap.b32 %r2256, %r2255, %r2255, 24; + add.s32 %r2257, %r2256, %r2250; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 25; + add.s32 %r2260, %r2212, %r1990; + add.s32 %r2261, %r2260, %r2259; + xor.b32 %r2262, %r2261, %r2228; + shf.l.wrap.b32 %r2263, %r2262, %r2262, 16; + add.s32 %r2264, %r2263, %r2243; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 20; + add.s32 %r2267, %r2261, %r2018; + add.s32 %r2268, %r2267, %r2266; + xor.b32 %r2269, %r2268, %r2263; + shf.l.wrap.b32 %r2270, %r2269, %r2269, 24; + add.s32 %r2271, %r2270, %r2264; + xor.b32 %r2272, %r2271, %r2266; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 25; + add.s32 %r2274, %r2226, %r1997; + add.s32 %r2275, %r2274, %r2217; + xor.b32 %r2276, %r2242, %r2275; + shf.l.wrap.b32 %r2277, %r2276, %r2276, 16; + add.s32 %r2278, %r2257, %r2277; + xor.b32 %r2279, %r2278, %r2217; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 20; + add.s32 %r2281, %r2275, %r2046; + add.s32 %r2282, %r2281, %r2280; + xor.b32 %r2283, %r2282, %r2277; + shf.l.wrap.b32 %r2284, %r2283, %r2283, 24; + add.s32 %r2285, %r2284, %r2278; + xor.b32 %r2286, %r2285, %r2280; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 25; + add.s32 %r2288, %r2231, %r2025; + add.s32 %r2289, %r2288, %r2240; + xor.b32 %r2290, %r2256, %r2289; + shf.l.wrap.b32 %r2291, %r2290, %r2290, 16; + add.s32 %r2292, %r2291, %r2215; + xor.b32 %r2293, %r2292, %r2231; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 20; + add.s32 %r2295, %r2289, %r1976; + add.s32 %r2296, %r2295, %r2294; + xor.b32 %r2297, %r2296, %r2291; + shf.l.wrap.b32 %r2298, %r2297, %r2297, 24; + add.s32 %r2299, %r2298, %r2292; + xor.b32 %r2300, %r2299, %r2294; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 25; + add.s32 %r2302, %r2245, %r2004; + add.s32 %r2303, %r2302, %r2254; + 
xor.b32 %r2304, %r2303, %r2214; + shf.l.wrap.b32 %r2305, %r2304, %r2304, 16; + add.s32 %r2306, %r2305, %r2229; + xor.b32 %r2307, %r2306, %r2245; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 20; + add.s32 %r2309, %r2303, %r2067; + add.s32 %r2310, %r2309, %r2308; + xor.b32 %r2311, %r2310, %r2305; + shf.l.wrap.b32 %r2312, %r2311, %r2311, 24; + add.s32 %r2313, %r2312, %r2306; + xor.b32 %r2314, %r2313, %r2308; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 25; + add.s32 %r2316, %r2287, %r1983; + add.s32 %r2317, %r2316, %r2268; + xor.b32 %r2318, %r2317, %r2312; + shf.l.wrap.b32 %r2319, %r2318, %r2318, 16; + add.s32 %r2320, %r2319, %r2299; + xor.b32 %r2321, %r2320, %r2287; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 20; + add.s32 %r2323, %r2317, %r2053; + add.s32 %r2324, %r2323, %r2322; + xor.b32 %r2325, %r2324, %r2319; + shf.l.wrap.b32 %r2326, %r2325, %r2325, 24; + add.s32 %r2327, %r2326, %r2320; + xor.b32 %r2328, %r2327, %r2322; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 25; + add.s32 %r2330, %r2282, %r2060; + add.s32 %r2331, %r2330, %r2301; + xor.b32 %r2332, %r2270, %r2331; + shf.l.wrap.b32 %r2333, %r2332, %r2332, 16; + add.s32 %r2334, %r2333, %r2313; + xor.b32 %r2335, %r2334, %r2301; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 20; + add.s32 %r2337, %r2331, %r2011; + add.s32 %r2338, %r2337, %r2336; + xor.b32 %r2339, %r2338, %r2333; + shf.l.wrap.b32 %r2340, %r2339, %r2339, 24; + add.s32 %r2341, %r2340, %r2334; + xor.b32 %r2342, %r2341, %r2336; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 25; + add.s32 %r2344, %r2296, %r2039; + add.s32 %r2345, %r2344, %r2315; + xor.b32 %r2346, %r2345, %r2284; + shf.l.wrap.b32 %r2347, %r2346, %r2346, 16; + add.s32 %r2348, %r2347, %r2271; + xor.b32 %r2349, %r2348, %r2315; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 20; + add.s32 %r2351, %r2345, %r2074; + add.s32 %r2352, %r2351, %r2350; + xor.b32 %r2353, %r2352, %r2347; + shf.l.wrap.b32 %r2354, %r2353, %r2353, 24; + add.s32 %r2355, %r2354, %r2348; + xor.b32 %r2356, %r2355, %r2350; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 25; + add.s32 %r2358, %r2310, %r2081; + add.s32 %r2359, %r2358, %r2273; + xor.b32 %r2360, %r2359, %r2298; + shf.l.wrap.b32 %r2361, %r2360, %r2360, 16; + add.s32 %r2362, %r2361, %r2285; + xor.b32 %r2363, %r2362, %r2273; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 20; + add.s32 %r2365, %r2359, %r2032; + add.s32 %r2366, %r2365, %r2364; + xor.b32 %r2367, %r2366, %r2361; + shf.l.wrap.b32 %r2368, %r2367, %r2367, 24; + add.s32 %r2369, %r2368, %r2362; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 25; + add.s32 %r2372, %r2324, %r1997; + add.s32 %r2373, %r2372, %r2371; + xor.b32 %r2374, %r2373, %r2340; + shf.l.wrap.b32 %r2375, %r2374, %r2374, 16; + add.s32 %r2376, %r2375, %r2355; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 20; + add.s32 %r2379, %r2373, %r2004; + add.s32 %r2380, %r2379, %r2378; + xor.b32 %r2381, %r2380, %r2375; + shf.l.wrap.b32 %r2382, %r2381, %r2381, 24; + add.s32 %r2383, %r2382, %r2376; + xor.b32 %r2384, %r2383, %r2378; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 25; + add.s32 %r2386, %r2338, %r2046; + add.s32 %r2387, %r2386, %r2329; + xor.b32 %r2388, %r2387, %r2354; + shf.l.wrap.b32 %r2389, %r2388, %r2388, 16; + add.s32 %r2390, %r2389, %r2369; + xor.b32 %r2391, %r2390, %r2329; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 20; + add.s32 %r2393, %r2387, %r2060; + add.s32 %r2394, %r2393, %r2392; + xor.b32 %r2395, %r2394, %r2389; + shf.l.wrap.b32 %r2396, %r2395, %r2395, 24; + add.s32 %r2397, %r2396, %r2390; + xor.b32 %r2398, %r2397, %r2392; + shf.l.wrap.b32 %r2399, 
%r2398, %r2398, 25; + add.s32 %r2400, %r2352, %r2067; + add.s32 %r2401, %r2400, %r2343; + xor.b32 %r2402, %r2368, %r2401; + shf.l.wrap.b32 %r2403, %r2402, %r2402, 16; + add.s32 %r2404, %r2403, %r2327; + xor.b32 %r2405, %r2404, %r2343; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 20; + add.s32 %r2407, %r2401, %r1990; + add.s32 %r2408, %r2407, %r2406; + xor.b32 %r2409, %r2408, %r2403; + shf.l.wrap.b32 %r2410, %r2409, %r2409, 24; + add.s32 %r2411, %r2410, %r2404; + xor.b32 %r2412, %r2411, %r2406; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 25; + add.s32 %r2414, %r2357, %r2025; + add.s32 %r2415, %r2414, %r2366; + xor.b32 %r2416, %r2415, %r2326; + shf.l.wrap.b32 %r2417, %r2416, %r2416, 16; + add.s32 %r2418, %r2417, %r2341; + xor.b32 %r2419, %r2418, %r2357; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 20; + add.s32 %r2421, %r2415, %r2074; + add.s32 %r2422, %r2421, %r2420; + xor.b32 %r2423, %r2422, %r2417; + shf.l.wrap.b32 %r2424, %r2423, %r2423, 24; + add.s32 %r2425, %r2424, %r2418; + xor.b32 %r2426, %r2425, %r2420; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 25; + add.s32 %r2428, %r2399, %r2018; + add.s32 %r2429, %r2428, %r2380; + xor.b32 %r2430, %r2429, %r2424; + shf.l.wrap.b32 %r2431, %r2430, %r2430, 16; + add.s32 %r2432, %r2431, %r2411; + xor.b32 %r2433, %r2432, %r2399; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 20; + add.s32 %r2435, %r2429, %r2011; + add.s32 %r2436, %r2435, %r2434; + xor.b32 %r2437, %r2436, %r2431; + shf.l.wrap.b32 %r2438, %r2437, %r2437, 24; + add.s32 %r2439, %r2438, %r2432; + xor.b32 %r2440, %r2439, %r2434; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 25; + add.s32 %r2442, %r2394, %r2039; + add.s32 %r2443, %r2442, %r2413; + xor.b32 %r2444, %r2382, %r2443; + shf.l.wrap.b32 %r2445, %r2444, %r2444, 16; + add.s32 %r2446, %r2445, %r2425; + xor.b32 %r2447, %r2446, %r2413; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 20; + add.s32 %r2449, %r2443, %r1976; + add.s32 %r2450, %r2449, %r2448; + xor.b32 %r2451, %r2450, %r2445; + shf.l.wrap.b32 %r2452, %r2451, %r2451, 24; + add.s32 %r2453, %r2452, %r2446; + xor.b32 %r2454, %r2453, %r2448; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 25; + add.s32 %r2456, %r2408, %r2053; + add.s32 %r2457, %r2456, %r2427; + xor.b32 %r2458, %r2457, %r2396; + shf.l.wrap.b32 %r2459, %r2458, %r2458, 16; + add.s32 %r2460, %r2459, %r2383; + xor.b32 %r2461, %r2460, %r2427; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 20; + add.s32 %r2463, %r2457, %r2081; + add.s32 %r2464, %r2463, %r2462; + xor.b32 %r2465, %r2464, %r2459; + shf.l.wrap.b32 %r2466, %r2465, %r2465, 24; + add.s32 %r2467, %r2466, %r2460; + xor.b32 %r2468, %r2467, %r2462; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 25; + add.s32 %r2470, %r2422, %r2032; + add.s32 %r2471, %r2470, %r2385; + xor.b32 %r2472, %r2471, %r2410; + shf.l.wrap.b32 %r2473, %r2472, %r2472, 16; + add.s32 %r2474, %r2473, %r2397; + xor.b32 %r2475, %r2474, %r2385; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 20; + add.s32 %r2477, %r2471, %r1983; + add.s32 %r2478, %r2477, %r2476; + xor.b32 %r2479, %r2478, %r2473; + shf.l.wrap.b32 %r2480, %r2479, %r2479, 24; + add.s32 %r2481, %r2480, %r2474; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 25; + add.s32 %r2484, %r2436, %r2046; + add.s32 %r2485, %r2484, %r2483; + xor.b32 %r2486, %r2485, %r2452; + shf.l.wrap.b32 %r2487, %r2486, %r2486, 16; + add.s32 %r2488, %r2487, %r2467; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 20; + add.s32 %r2491, %r2485, %r2025; + add.s32 %r2492, %r2491, %r2490; + xor.b32 %r2493, %r2492, %r2487; + shf.l.wrap.b32 %r2494, %r2493, %r2493, 24; + 
add.s32 %r2495, %r2494, %r2488; + xor.b32 %r2496, %r2495, %r2490; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 25; + add.s32 %r2498, %r2450, %r2060; + add.s32 %r2499, %r2498, %r2441; + xor.b32 %r2500, %r2499, %r2466; + shf.l.wrap.b32 %r2501, %r2500, %r2500, 16; + add.s32 %r2502, %r2501, %r2481; + xor.b32 %r2503, %r2502, %r2441; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 20; + add.s32 %r2505, %r2499, %r2039; + add.s32 %r2506, %r2505, %r2504; + xor.b32 %r2507, %r2506, %r2501; + shf.l.wrap.b32 %r2508, %r2507, %r2507, 24; + add.s32 %r2509, %r2508, %r2502; + xor.b32 %r2510, %r2509, %r2504; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 25; + add.s32 %r2512, %r2464, %r2074; + add.s32 %r2513, %r2512, %r2455; + xor.b32 %r2514, %r2480, %r2513; + shf.l.wrap.b32 %r2515, %r2514, %r2514, 16; + add.s32 %r2516, %r2515, %r2439; + xor.b32 %r2517, %r2516, %r2455; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 20; + add.s32 %r2519, %r2513, %r1997; + add.s32 %r2520, %r2519, %r2518; + xor.b32 %r2521, %r2520, %r2515; + shf.l.wrap.b32 %r2522, %r2521, %r2521, 24; + add.s32 %r2523, %r2522, %r2516; + xor.b32 %r2524, %r2523, %r2518; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 25; + add.s32 %r2526, %r2469, %r2067; + add.s32 %r2527, %r2526, %r2478; + xor.b32 %r2528, %r2527, %r2438; + shf.l.wrap.b32 %r2529, %r2528, %r2528, 16; + add.s32 %r2530, %r2529, %r2453; + xor.b32 %r2531, %r2530, %r2469; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 20; + add.s32 %r2533, %r2527, %r2081; + add.s32 %r2534, %r2533, %r2532; + xor.b32 %r2535, %r2534, %r2529; + shf.l.wrap.b32 %r2536, %r2535, %r2535, 24; + add.s32 %r2537, %r2536, %r2530; + xor.b32 %r2538, %r2537, %r2532; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 25; + add.s32 %r2540, %r2511, %r2004; + add.s32 %r2541, %r2540, %r2492; + xor.b32 %r2542, %r2541, %r2536; + shf.l.wrap.b32 %r2543, %r2542, %r2542, 16; + add.s32 %r2544, %r2543, %r2523; + xor.b32 %r2545, %r2544, %r2511; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 20; + add.s32 %r2547, %r2541, %r1976; + add.s32 %r2548, %r2547, %r2546; + xor.b32 %r2549, %r2548, %r2543; + shf.l.wrap.b32 %r2550, %r2549, %r2549, 24; + add.s32 %r2551, %r2550, %r2544; + xor.b32 %r2552, %r2551, %r2546; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 25; + add.s32 %r2554, %r2506, %r2053; + add.s32 %r2555, %r2554, %r2525; + xor.b32 %r2556, %r2494, %r2555; + shf.l.wrap.b32 %r2557, %r2556, %r2556, 16; + add.s32 %r2558, %r2557, %r2537; + xor.b32 %r2559, %r2558, %r2525; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 20; + add.s32 %r2561, %r2555, %r1990; + add.s32 %r2562, %r2561, %r2560; + xor.b32 %r2563, %r2562, %r2557; + shf.l.wrap.b32 %r2564, %r2563, %r2563, 24; + add.s32 %r2565, %r2564, %r2558; + xor.b32 %r2566, %r2565, %r2560; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 25; + add.s32 %r2568, %r2520, %r2011; + add.s32 %r2569, %r2568, %r2539; + xor.b32 %r2570, %r2569, %r2508; + shf.l.wrap.b32 %r2571, %r2570, %r2570, 16; + add.s32 %r2572, %r2571, %r2495; + xor.b32 %r2573, %r2572, %r2539; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 20; + add.s32 %r2575, %r2569, %r2032; + add.s32 %r2576, %r2575, %r2574; + xor.b32 %r2577, %r2576, %r2571; + shf.l.wrap.b32 %r2578, %r2577, %r2577, 24; + add.s32 %r2579, %r2578, %r2572; + xor.b32 %r2580, %r2579, %r2574; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 25; + add.s32 %r2582, %r2534, %r1983; + add.s32 %r2583, %r2582, %r2497; + xor.b32 %r2584, %r2583, %r2522; + shf.l.wrap.b32 %r2585, %r2584, %r2584, 16; + add.s32 %r2586, %r2585, %r2509; + xor.b32 %r2587, %r2586, %r2497; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 20; + add.s32 %r2589, %r2583, %r2018; + add.s32 %r2590, %r2589, 
%r2588; + xor.b32 %r2591, %r2590, %r2585; + shf.l.wrap.b32 %r2592, %r2591, %r2591, 24; + add.s32 %r2593, %r2592, %r2586; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 25; + add.s32 %r2596, %r2548, %r2060; + add.s32 %r2597, %r2596, %r2595; + xor.b32 %r2598, %r2597, %r2564; + shf.l.wrap.b32 %r2599, %r2598, %r2598, 16; + add.s32 %r2600, %r2599, %r2579; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 20; + add.s32 %r2603, %r2597, %r2067; + add.s32 %r2604, %r2603, %r2602; + xor.b32 %r2605, %r2604, %r2599; + shf.l.wrap.b32 %r2606, %r2605, %r2605, 24; + add.s32 %r2607, %r2606, %r2600; + xor.b32 %r2608, %r2607, %r2602; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 25; + add.s32 %r2610, %r2562, %r2039; + add.s32 %r2611, %r2610, %r2553; + xor.b32 %r2612, %r2611, %r2578; + shf.l.wrap.b32 %r2613, %r2612, %r2612, 16; + add.s32 %r2614, %r2613, %r2593; + xor.b32 %r2615, %r2614, %r2553; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 20; + add.s32 %r2617, %r2611, %r2053; + add.s32 %r2618, %r2617, %r2616; + xor.b32 %r2619, %r2618, %r2613; + shf.l.wrap.b32 %r2620, %r2619, %r2619, 24; + add.s32 %r2621, %r2620, %r2614; + xor.b32 %r2622, %r2621, %r2616; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 25; + add.s32 %r2624, %r2576, %r2081; + add.s32 %r2625, %r2624, %r2567; + xor.b32 %r2626, %r2592, %r2625; + shf.l.wrap.b32 %r2627, %r2626, %r2626, 16; + add.s32 %r2628, %r2627, %r2551; + xor.b32 %r2629, %r2628, %r2567; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 20; + add.s32 %r2631, %r2625, %r2046; + add.s32 %r2632, %r2631, %r2630; + xor.b32 %r2633, %r2632, %r2627; + shf.l.wrap.b32 %r2634, %r2633, %r2633, 24; + add.s32 %r2635, %r2634, %r2628; + xor.b32 %r2636, %r2635, %r2630; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 25; + add.s32 %r2638, %r2581, %r2074; + add.s32 %r2639, %r2638, %r2590; + xor.b32 %r2640, %r2639, %r2550; + shf.l.wrap.b32 %r2641, %r2640, %r2640, 16; + add.s32 %r2642, %r2641, %r2565; + xor.b32 %r2643, %r2642, %r2581; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 20; + add.s32 %r2645, %r2639, %r2032; + add.s32 %r2646, %r2645, %r2644; + xor.b32 %r2647, %r2646, %r2641; + shf.l.wrap.b32 %r2648, %r2647, %r2647, 24; + add.s32 %r2649, %r2648, %r2642; + xor.b32 %r2650, %r2649, %r2644; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 25; + add.s32 %r2652, %r2623, %r2025; + add.s32 %r2653, %r2652, %r2604; + xor.b32 %r2654, %r2653, %r2648; + shf.l.wrap.b32 %r2655, %r2654, %r2654, 16; + add.s32 %r2656, %r2655, %r2635; + xor.b32 %r2657, %r2656, %r2623; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 20; + add.s32 %r2659, %r2653, %r1990; + add.s32 %r2660, %r2659, %r2658; + xor.b32 %r2661, %r2660, %r2655; + shf.l.wrap.b32 %r2662, %r2661, %r2661, 24; + add.s32 %r2663, %r2662, %r2656; + xor.b32 %r2664, %r2663, %r2658; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 25; + add.s32 %r2666, %r2618, %r2011; + add.s32 %r2667, %r2666, %r2637; + xor.b32 %r2668, %r2606, %r2667; + shf.l.wrap.b32 %r2669, %r2668, %r2668, 16; + add.s32 %r2670, %r2669, %r2649; + xor.b32 %r2671, %r2670, %r2637; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 20; + add.s32 %r2673, %r2667, %r1997; + add.s32 %r2674, %r2673, %r2672; + xor.b32 %r2675, %r2674, %r2669; + shf.l.wrap.b32 %r2676, %r2675, %r2675, 24; + add.s32 %r2677, %r2676, %r2670; + xor.b32 %r2678, %r2677, %r2672; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 25; + add.s32 %r2680, %r2632, %r1976; + add.s32 %r2681, %r2680, %r2651; + xor.b32 %r2682, %r2681, %r2620; + shf.l.wrap.b32 %r2683, %r2682, %r2682, 16; + add.s32 %r2684, %r2683, %r2607; + xor.b32 %r2685, %r2684, %r2651; + shf.l.wrap.b32 
%r2686, %r2685, %r2685, 20; + add.s32 %r2687, %r2681, %r1983; + add.s32 %r2688, %r2687, %r2686; + xor.b32 %r2689, %r2688, %r2683; + shf.l.wrap.b32 %r2690, %r2689, %r2689, 24; + add.s32 %r2691, %r2690, %r2684; + xor.b32 %r2692, %r2691, %r2686; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 25; + add.s32 %r2694, %r2646, %r2018; + add.s32 %r2695, %r2694, %r2609; + xor.b32 %r2696, %r2695, %r2634; + shf.l.wrap.b32 %r2697, %r2696, %r2696, 16; + add.s32 %r2698, %r2697, %r2621; + xor.b32 %r2699, %r2698, %r2609; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 20; + add.s32 %r2701, %r2695, %r2004; + add.s32 %r2702, %r2701, %r2700; + xor.b32 %r2703, %r2702, %r2697; + shf.l.wrap.b32 %r2704, %r2703, %r2703, 24; + add.s32 %r2705, %r2704, %r2698; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 25; + add.s32 %r2708, %r2660, %r2039; + add.s32 %r2709, %r2708, %r2707; + xor.b32 %r2710, %r2709, %r2676; + shf.l.wrap.b32 %r2711, %r2710, %r2710, 16; + add.s32 %r2712, %r2711, %r2691; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 20; + add.s32 %r2715, %r2709, %r2074; + add.s32 %r2716, %r2715, %r2714; + xor.b32 %r2717, %r2716, %r2711; + shf.l.wrap.b32 %r2718, %r2717, %r2717, 24; + add.s32 %r2719, %r2718, %r2712; + xor.b32 %r2720, %r2719, %r2714; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 25; + add.s32 %r2722, %r2674, %r2053; + add.s32 %r2723, %r2722, %r2665; + xor.b32 %r2724, %r2723, %r2690; + shf.l.wrap.b32 %r2725, %r2724, %r2724, 16; + add.s32 %r2726, %r2725, %r2705; + xor.b32 %r2727, %r2726, %r2665; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 20; + add.s32 %r2729, %r2723, %r2011; + add.s32 %r2730, %r2729, %r2728; + xor.b32 %r2731, %r2730, %r2725; + shf.l.wrap.b32 %r2732, %r2731, %r2731, 24; + add.s32 %r2733, %r2732, %r2726; + xor.b32 %r2734, %r2733, %r2728; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 25; + add.s32 %r2736, %r2688, %r2032; + add.s32 %r2737, %r2736, %r2679; + xor.b32 %r2738, %r2704, %r2737; + shf.l.wrap.b32 %r2739, %r2738, %r2738, 16; + add.s32 %r2740, %r2739, %r2663; + xor.b32 %r2741, %r2740, %r2679; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 20; + add.s32 %r2743, %r2737, %r2060; + add.s32 %r2744, %r2743, %r2742; + xor.b32 %r2745, %r2744, %r2739; + shf.l.wrap.b32 %r2746, %r2745, %r2745, 24; + add.s32 %r2747, %r2746, %r2740; + xor.b32 %r2748, %r2747, %r2742; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 25; + add.s32 %r2750, %r2693, %r2081; + add.s32 %r2751, %r2750, %r2702; + xor.b32 %r2752, %r2751, %r2662; + shf.l.wrap.b32 %r2753, %r2752, %r2752, 16; + add.s32 %r2754, %r2753, %r2677; + xor.b32 %r2755, %r2754, %r2693; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 20; + add.s32 %r2757, %r2751, %r1983; + add.s32 %r2758, %r2757, %r2756; + xor.b32 %r2759, %r2758, %r2753; + shf.l.wrap.b32 %r2760, %r2759, %r2759, 24; + add.s32 %r2761, %r2760, %r2754; + xor.b32 %r2762, %r2761, %r2756; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 25; + add.s32 %r2764, %r2735, %r2067; + add.s32 %r2765, %r2764, %r2716; + xor.b32 %r2766, %r2765, %r2760; + shf.l.wrap.b32 %r2767, %r2766, %r2766, 16; + add.s32 %r2768, %r2767, %r2747; + xor.b32 %r2769, %r2768, %r2735; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 20; + add.s32 %r2771, %r2765, %r1997; + add.s32 %r2772, %r2771, %r2770; + xor.b32 %r2773, %r2772, %r2767; + shf.l.wrap.b32 %r2774, %r2773, %r2773, 24; + add.s32 %r2775, %r2774, %r2768; + xor.b32 %r2776, %r2775, %r2770; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 25; + add.s32 %r2778, %r2730, %r1976; + add.s32 %r2779, %r2778, %r2749; + xor.b32 %r2780, %r2718, %r2779; + shf.l.wrap.b32 %r2781, %r2780, %r2780, 16; 
+ add.s32 %r2782, %r2781, %r2761; + xor.b32 %r2783, %r2782, %r2749; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 20; + add.s32 %r2785, %r2779, %r2046; + add.s32 %r2786, %r2785, %r2784; + xor.b32 %r2787, %r2786, %r2781; + shf.l.wrap.b32 %r2788, %r2787, %r2787, 24; + add.s32 %r2789, %r2788, %r2782; + xor.b32 %r2790, %r2789, %r2784; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 25; + add.s32 %r2792, %r2744, %r1990; + add.s32 %r2793, %r2792, %r2763; + xor.b32 %r2794, %r2793, %r2732; + shf.l.wrap.b32 %r2795, %r2794, %r2794, 16; + add.s32 %r2796, %r2795, %r2719; + xor.b32 %r2797, %r2796, %r2763; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 20; + add.s32 %r2799, %r2793, %r2018; + add.s32 %r2800, %r2799, %r2798; + xor.b32 %r2801, %r2800, %r2795; + shf.l.wrap.b32 %r2802, %r2801, %r2801, 24; + add.s32 %r2803, %r2802, %r2796; + xor.b32 %r2804, %r2803, %r2798; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 25; + add.s32 %r2806, %r2758, %r2004; + add.s32 %r2807, %r2806, %r2721; + xor.b32 %r2808, %r2807, %r2746; + shf.l.wrap.b32 %r2809, %r2808, %r2808, 16; + add.s32 %r2810, %r2809, %r2733; + xor.b32 %r2811, %r2810, %r2721; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 20; + add.s32 %r2813, %r2807, %r2025; + add.s32 %r2814, %r2813, %r2812; + xor.b32 %r2815, %r2814, %r2809; + shf.l.wrap.b32 %r2816, %r2815, %r2815, 24; + add.s32 %r2817, %r2816, %r2810; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 25; + add.s32 %r2820, %r2772, %r2053; + add.s32 %r2821, %r2820, %r2819; + xor.b32 %r2822, %r2821, %r2788; + shf.l.wrap.b32 %r2823, %r2822, %r2822, 16; + add.s32 %r2824, %r2823, %r2803; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 20; + add.s32 %r2827, %r2821, %r2081; + add.s32 %r2828, %r2827, %r2826; + xor.b32 %r2829, %r2828, %r2823; + shf.l.wrap.b32 %r2830, %r2829, %r2829, 24; + add.s32 %r2831, %r2830, %r2824; + xor.b32 %r2832, %r2831, %r2826; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 25; + add.s32 %r2834, %r2786, %r2011; + add.s32 %r2835, %r2834, %r2777; + xor.b32 %r2836, %r2835, %r2802; + shf.l.wrap.b32 %r2837, %r2836, %r2836, 16; + add.s32 %r2838, %r2837, %r2817; + xor.b32 %r2839, %r2838, %r2777; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 20; + add.s32 %r2841, %r2835, %r1976; + add.s32 %r2842, %r2841, %r2840; + xor.b32 %r2843, %r2842, %r2837; + shf.l.wrap.b32 %r2844, %r2843, %r2843, 24; + add.s32 %r2845, %r2844, %r2838; + xor.b32 %r2846, %r2845, %r2840; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 25; + add.s32 %r2848, %r2800, %r1983; + add.s32 %r2849, %r2848, %r2791; + xor.b32 %r2850, %r2816, %r2849; + shf.l.wrap.b32 %r2851, %r2850, %r2850, 16; + add.s32 %r2852, %r2851, %r2775; + xor.b32 %r2853, %r2852, %r2791; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 20; + add.s32 %r2855, %r2849, %r2039; + add.s32 %r2856, %r2855, %r2854; + xor.b32 %r2857, %r2856, %r2851; + shf.l.wrap.b32 %r2858, %r2857, %r2857, 24; + add.s32 %r2859, %r2858, %r2852; + xor.b32 %r2860, %r2859, %r2854; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 25; + add.s32 %r2862, %r2805, %r2032; + add.s32 %r2863, %r2862, %r2814; + xor.b32 %r2864, %r2863, %r2774; + shf.l.wrap.b32 %r2865, %r2864, %r2864, 16; + add.s32 %r2866, %r2865, %r2789; + xor.b32 %r2867, %r2866, %r2805; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 20; + add.s32 %r2869, %r2863, %r2018; + add.s32 %r2870, %r2869, %r2868; + xor.b32 %r2871, %r2870, %r2865; + shf.l.wrap.b32 %r2872, %r2871, %r2871, 24; + add.s32 %r2873, %r2872, %r2866; + xor.b32 %r2874, %r2873, %r2868; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 25; + add.s32 %r2876, %r2847, %r2074; + add.s32 %r2877, %r2876, 
%r2828; + xor.b32 %r2878, %r2877, %r2872; + shf.l.wrap.b32 %r2879, %r2878, %r2878, 16; + add.s32 %r2880, %r2879, %r2859; + xor.b32 %r2881, %r2880, %r2847; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 20; + add.s32 %r2883, %r2877, %r2046; + add.s32 %r2884, %r2883, %r2882; + xor.b32 %r2885, %r2884, %r2879; + shf.l.wrap.b32 %r2886, %r2885, %r2885, 24; + add.s32 %r2887, %r2886, %r2880; + xor.b32 %r2888, %r2887, %r2882; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 25; + add.s32 %r2890, %r2842, %r1990; + add.s32 %r2891, %r2890, %r2861; + xor.b32 %r2892, %r2830, %r2891; + shf.l.wrap.b32 %r2893, %r2892, %r2892, 16; + add.s32 %r2894, %r2893, %r2873; + xor.b32 %r2895, %r2894, %r2861; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 20; + add.s32 %r2897, %r2891, %r2060; + add.s32 %r2898, %r2897, %r2896; + xor.b32 %r2899, %r2898, %r2893; + shf.l.wrap.b32 %r2900, %r2899, %r2899, 24; + add.s32 %r2901, %r2900, %r2894; + xor.b32 %r2902, %r2901, %r2896; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 25; + add.s32 %r2904, %r2856, %r1997; + add.s32 %r2905, %r2904, %r2875; + xor.b32 %r2906, %r2905, %r2844; + shf.l.wrap.b32 %r2907, %r2906, %r2906, 16; + add.s32 %r2908, %r2907, %r2831; + xor.b32 %r2909, %r2908, %r2875; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 20; + add.s32 %r2911, %r2905, %r2004; + add.s32 %r2912, %r2911, %r2910; + xor.b32 %r2913, %r2912, %r2907; + shf.l.wrap.b32 %r2914, %r2913, %r2913, 24; + add.s32 %r2915, %r2914, %r2908; + xor.b32 %r2916, %r2915, %r2910; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 25; + add.s32 %r2918, %r2870, %r2025; + add.s32 %r2919, %r2918, %r2833; + xor.b32 %r2920, %r2919, %r2858; + shf.l.wrap.b32 %r2921, %r2920, %r2920, 16; + add.s32 %r2922, %r2921, %r2845; + xor.b32 %r2923, %r2922, %r2833; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 20; + add.s32 %r2925, %r2919, %r2067; + add.s32 %r2926, %r2925, %r2924; + xor.b32 %r2927, %r2926, %r2921; + shf.l.wrap.b32 %r2928, %r2927, %r2927, 24; + add.s32 %r2929, %r2928, %r2922; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 25; + xor.b32 %r27, %r2915, %r2884; + xor.b32 %r28, %r2929, %r2898; + xor.b32 %r29, %r2887, %r2912; + xor.b32 %r30, %r2926, %r2901; + xor.b32 %r31, %r2931, %r2900; + xor.b32 %r32, %r2889, %r2914; + xor.b32 %r33, %r2928, %r2903; + xor.b32 %r34, %r2917, %r2886; + popc.b64 %r2932, %rd128; + cvt.u64.u32 %rd30, %r2932; + ld.local.u8 %rs127, [%rd3+8]; + cvt.u64.u16 %rd130, %rs127; + setp.ge.u64 %p15, %rd30, %rd130; + mul.wide.u16 %r11659, %rs127, 32; + @%p15 bra $L__BB1_19; + +$L__BB1_18: + add.s32 %r2933, %r11659, -64; + cvt.s64.s32 %rd131, %r2933; + add.s64 %rd132, %rd2, %rd131; + ld.local.u8 %r2934, [%rd3+2]; + ld.local.u8 %r2935, [%rd132+145]; + ld.local.u8 %r2936, [%rd132+146]; + prmt.b32 %r2937, %r2936, %r2935, 30212; + ld.local.u8 %r2938, [%rd132+147]; + prmt.b32 %r2939, %r2938, %r2937, 28756; + ld.local.u8 %r2940, [%rd132+148]; + prmt.b32 %r2941, %r2940, %r2939, 1620; + ld.local.u8 %r2942, [%rd132+149]; + ld.local.u8 %r2943, [%rd132+150]; + prmt.b32 %r2944, %r2943, %r2942, 30212; + ld.local.u8 %r2945, [%rd132+151]; + prmt.b32 %r2946, %r2945, %r2944, 28756; + ld.local.u8 %r2947, [%rd132+152]; + prmt.b32 %r2948, %r2947, %r2946, 1620; + ld.local.u8 %r2949, [%rd132+153]; + ld.local.u8 %r2950, [%rd132+154]; + prmt.b32 %r2951, %r2950, %r2949, 30212; + ld.local.u8 %r2952, [%rd132+155]; + prmt.b32 %r2953, %r2952, %r2951, 28756; + ld.local.u8 %r2954, [%rd132+156]; + prmt.b32 %r2955, %r2954, %r2953, 1620; + ld.local.u8 %r2956, [%rd132+157]; + ld.local.u8 %r2957, [%rd132+158]; + prmt.b32 %r2958, %r2957, %r2956, 30212; + 
ld.local.u8 %r2959, [%rd132+159]; + prmt.b32 %r2960, %r2959, %r2958, 28756; + ld.local.u8 %r2961, [%rd132+160]; + prmt.b32 %r2962, %r2961, %r2960, 1620; + ld.local.u8 %r2963, [%rd132+161]; + ld.local.u8 %r2964, [%rd132+162]; + prmt.b32 %r2965, %r2964, %r2963, 30212; + ld.local.u8 %r2966, [%rd132+163]; + prmt.b32 %r2967, %r2966, %r2965, 28756; + ld.local.u8 %r2968, [%rd132+164]; + prmt.b32 %r2969, %r2968, %r2967, 1620; + ld.local.u8 %r2970, [%rd132+165]; + ld.local.u8 %r2971, [%rd132+166]; + prmt.b32 %r2972, %r2971, %r2970, 30212; + ld.local.u8 %r2973, [%rd132+167]; + prmt.b32 %r2974, %r2973, %r2972, 28756; + ld.local.u8 %r2975, [%rd132+168]; + prmt.b32 %r2976, %r2975, %r2974, 1620; + ld.local.u8 %r2977, [%rd132+169]; + ld.local.u8 %r2978, [%rd132+170]; + prmt.b32 %r2979, %r2978, %r2977, 30212; + ld.local.u8 %r2980, [%rd132+171]; + prmt.b32 %r2981, %r2980, %r2979, 28756; + ld.local.u8 %r2982, [%rd132+172]; + prmt.b32 %r2983, %r2982, %r2981, 1620; + ld.local.u8 %r2984, [%rd132+173]; + ld.local.u8 %r2985, [%rd132+174]; + prmt.b32 %r2986, %r2985, %r2984, 30212; + ld.local.u8 %r2987, [%rd132+175]; + prmt.b32 %r2988, %r2987, %r2986, 28756; + ld.local.u8 %r2989, [%rd132+176]; + prmt.b32 %r2990, %r2989, %r2988, 1620; + ld.local.u8 %r2991, [%rd132+177]; + ld.local.u8 %r2992, [%rd132+178]; + prmt.b32 %r2993, %r2992, %r2991, 30212; + ld.local.u8 %r2994, [%rd132+179]; + prmt.b32 %r2995, %r2994, %r2993, 28756; + ld.local.u8 %r2996, [%rd132+180]; + prmt.b32 %r2997, %r2996, %r2995, 1620; + ld.local.u8 %r2998, [%rd132+181]; + ld.local.u8 %r2999, [%rd132+182]; + prmt.b32 %r3000, %r2999, %r2998, 30212; + ld.local.u8 %r3001, [%rd132+183]; + prmt.b32 %r3002, %r3001, %r3000, 28756; + ld.local.u8 %r3003, [%rd132+184]; + prmt.b32 %r3004, %r3003, %r3002, 1620; + ld.local.u8 %r3005, [%rd132+185]; + ld.local.u8 %r3006, [%rd132+186]; + prmt.b32 %r3007, %r3006, %r3005, 30212; + ld.local.u8 %r3008, [%rd132+187]; + prmt.b32 %r3009, %r3008, %r3007, 28756; + ld.local.u8 %r3010, [%rd132+188]; + prmt.b32 %r3011, %r3010, %r3009, 1620; + ld.local.u8 %r3012, [%rd132+189]; + ld.local.u8 %r3013, [%rd132+190]; + prmt.b32 %r3014, %r3013, %r3012, 30212; + ld.local.u8 %r3015, [%rd132+191]; + prmt.b32 %r3016, %r3015, %r3014, 28756; + ld.local.u8 %r3017, [%rd132+192]; + prmt.b32 %r3018, %r3017, %r3016, 1620; + ld.local.u8 %r3019, [%rd132+193]; + ld.local.u8 %r3020, [%rd132+194]; + prmt.b32 %r3021, %r3020, %r3019, 30212; + ld.local.u8 %r3022, [%rd132+195]; + prmt.b32 %r3023, %r3022, %r3021, 28756; + ld.local.u8 %r3024, [%rd132+196]; + prmt.b32 %r3025, %r3024, %r3023, 1620; + ld.local.u8 %r3026, [%rd132+197]; + ld.local.u8 %r3027, [%rd132+198]; + prmt.b32 %r3028, %r3027, %r3026, 30212; + ld.local.u8 %r3029, [%rd132+199]; + prmt.b32 %r3030, %r3029, %r3028, 28756; + ld.local.u8 %r3031, [%rd132+200]; + prmt.b32 %r3032, %r3031, %r3030, 1620; + ld.local.u8 %r3033, [%rd132+201]; + ld.local.u8 %r3034, [%rd132+202]; + prmt.b32 %r3035, %r3034, %r3033, 30212; + ld.local.u8 %r3036, [%rd132+203]; + prmt.b32 %r3037, %r3036, %r3035, 28756; + ld.local.u8 %r3038, [%rd132+204]; + prmt.b32 %r3039, %r3038, %r3037, 1620; + ld.local.u8 %r3040, [%rd132+205]; + ld.local.u8 %r3041, [%rd132+206]; + prmt.b32 %r3042, %r3041, %r3040, 30212; + ld.local.u8 %r3043, [%rd132+207]; + prmt.b32 %r3044, %r3043, %r3042, 28756; + ld.local.u8 %r3045, [%rd132+208]; + prmt.b32 %r3046, %r3045, %r3044, 1620; + or.b32 %r3047, %r2934, 4; + ld.local.u8 %r3048, [%rd3+-120]; + ld.local.u8 %r3049, [%rd3+-119]; + prmt.b32 %r3050, %r3049, %r3048, 30212; + ld.local.u8 %r3051, 
[%rd3+-118]; + ld.local.u8 %r3052, [%rd3+-117]; + prmt.b32 %r3053, %r3052, %r3051, 30212; + prmt.b32 %r3054, %r3053, %r3050, 4180; + ld.local.u8 %r3055, [%rd3+-136]; + ld.local.u8 %r3056, [%rd3+-135]; + prmt.b32 %r3057, %r3056, %r3055, 30212; + ld.local.u8 %r3058, [%rd3+-134]; + ld.local.u8 %r3059, [%rd3+-133]; + prmt.b32 %r3060, %r3059, %r3058, 30212; + prmt.b32 %r3061, %r3060, %r3057, 4180; + add.s32 %r3062, %r3054, %r3061; + add.s32 %r3063, %r3062, %r2941; + shf.l.wrap.b32 %r3064, %r3063, %r3063, 16; + add.s32 %r3065, %r3064, 1779033703; + xor.b32 %r3066, %r3065, %r3054; + shf.l.wrap.b32 %r3067, %r3066, %r3066, 20; + add.s32 %r3068, %r2948, %r3063; + add.s32 %r3069, %r3068, %r3067; + xor.b32 %r3070, %r3069, %r3064; + shf.l.wrap.b32 %r3071, %r3070, %r3070, 24; + add.s32 %r3072, %r3071, %r3065; + xor.b32 %r3073, %r3072, %r3067; + shf.l.wrap.b32 %r3074, %r3073, %r3073, 25; + ld.local.u8 %r3075, [%rd3+-116]; + ld.local.u8 %r3076, [%rd3+-115]; + prmt.b32 %r3077, %r3076, %r3075, 30212; + ld.local.u8 %r3078, [%rd3+-114]; + ld.local.u8 %r3079, [%rd3+-113]; + prmt.b32 %r3080, %r3079, %r3078, 30212; + prmt.b32 %r3081, %r3080, %r3077, 4180; + ld.local.u8 %r3082, [%rd3+-132]; + ld.local.u8 %r3083, [%rd3+-131]; + prmt.b32 %r3084, %r3083, %r3082, 30212; + ld.local.u8 %r3085, [%rd3+-130]; + ld.local.u8 %r3086, [%rd3+-129]; + prmt.b32 %r3087, %r3086, %r3085, 30212; + prmt.b32 %r3088, %r3087, %r3084, 4180; + add.s32 %r3089, %r3081, %r3088; + add.s32 %r3090, %r3089, %r2955; + shf.l.wrap.b32 %r3091, %r3090, %r3090, 16; + add.s32 %r3092, %r3091, -1150833019; + xor.b32 %r3093, %r3092, %r3081; + shf.l.wrap.b32 %r3094, %r3093, %r3093, 20; + add.s32 %r3095, %r2962, %r3090; + add.s32 %r3096, %r3095, %r3094; + xor.b32 %r3097, %r3096, %r3091; + shf.l.wrap.b32 %r3098, %r3097, %r3097, 24; + add.s32 %r3099, %r3098, %r3092; + xor.b32 %r3100, %r3099, %r3094; + shf.l.wrap.b32 %r3101, %r3100, %r3100, 25; + ld.local.u8 %r3102, [%rd3+-112]; + ld.local.u8 %r3103, [%rd3+-111]; + prmt.b32 %r3104, %r3103, %r3102, 30212; + ld.local.u8 %r3105, [%rd3+-110]; + ld.local.u8 %r3106, [%rd3+-109]; + prmt.b32 %r3107, %r3106, %r3105, 30212; + prmt.b32 %r3108, %r3107, %r3104, 4180; + ld.local.u8 %r3109, [%rd3+-128]; + ld.local.u8 %r3110, [%rd3+-127]; + prmt.b32 %r3111, %r3110, %r3109, 30212; + ld.local.u8 %r3112, [%rd3+-126]; + ld.local.u8 %r3113, [%rd3+-125]; + prmt.b32 %r3114, %r3113, %r3112, 30212; + prmt.b32 %r3115, %r3114, %r3111, 4180; + add.s32 %r3116, %r3108, %r3115; + add.s32 %r3117, %r3116, %r2969; + shr.u32 %r3118, %r3117, 16; + shl.b32 %r3119, %r3117, 16; + xor.b32 %r3120, %r3119, 4194304; + or.b32 %r3121, %r3120, %r3118; + add.s32 %r3122, %r3121, 1013904242; + xor.b32 %r3123, %r3122, %r3108; + shf.l.wrap.b32 %r3124, %r3123, %r3123, 20; + add.s32 %r3125, %r2976, %r3117; + add.s32 %r3126, %r3125, %r3124; + xor.b32 %r3127, %r3126, %r3121; + shf.l.wrap.b32 %r3128, %r3127, %r3127, 24; + add.s32 %r3129, %r3128, %r3122; + xor.b32 %r3130, %r3129, %r3124; + shf.l.wrap.b32 %r3131, %r3130, %r3130, 25; + ld.local.u8 %r3132, [%rd3+-108]; + ld.local.u8 %r3133, [%rd3+-107]; + prmt.b32 %r3134, %r3133, %r3132, 30212; + ld.local.u8 %r3135, [%rd3+-106]; + ld.local.u8 %r3136, [%rd3+-105]; + prmt.b32 %r3137, %r3136, %r3135, 30212; + prmt.b32 %r3138, %r3137, %r3134, 4180; + ld.local.u8 %r3139, [%rd3+-124]; + ld.local.u8 %r3140, [%rd3+-123]; + prmt.b32 %r3141, %r3140, %r3139, 30212; + ld.local.u8 %r3142, [%rd3+-122]; + ld.local.u8 %r3143, [%rd3+-121]; + prmt.b32 %r3144, %r3143, %r3142, 30212; + prmt.b32 %r3145, %r3144, %r3141, 4180; + add.s32 
%r3146, %r3138, %r3145; + add.s32 %r3147, %r3146, %r2983; + xor.b32 %r3148, %r3147, %r3047; + shr.u32 %r3149, %r3147, 16; + shl.b32 %r3150, %r3148, 16; + or.b32 %r3151, %r3150, %r3149; + add.s32 %r3152, %r3151, -1521486534; + xor.b32 %r3153, %r3152, %r3138; + shf.l.wrap.b32 %r3154, %r3153, %r3153, 20; + add.s32 %r3155, %r2990, %r3147; + add.s32 %r3156, %r3155, %r3154; + xor.b32 %r3157, %r3156, %r3151; + shf.l.wrap.b32 %r3158, %r3157, %r3157, 24; + add.s32 %r3159, %r3158, %r3152; + xor.b32 %r3160, %r3159, %r3154; + shf.l.wrap.b32 %r3161, %r3160, %r3160, 25; + add.s32 %r3162, %r3101, %r3069; + add.s32 %r3163, %r3162, %r2997; + xor.b32 %r3164, %r3158, %r3163; + shf.l.wrap.b32 %r3165, %r3164, %r3164, 16; + add.s32 %r3166, %r3165, %r3129; + xor.b32 %r3167, %r3166, %r3101; + shf.l.wrap.b32 %r3168, %r3167, %r3167, 20; + add.s32 %r3169, %r3004, %r3163; + add.s32 %r3170, %r3169, %r3168; + xor.b32 %r3171, %r3170, %r3165; + shf.l.wrap.b32 %r3172, %r3171, %r3171, 24; + add.s32 %r3173, %r3172, %r3166; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 25; + add.s32 %r3176, %r3131, %r3096; + add.s32 %r3177, %r3176, %r3011; + xor.b32 %r3178, %r3177, %r3071; + shf.l.wrap.b32 %r3179, %r3178, %r3178, 16; + add.s32 %r3180, %r3179, %r3159; + xor.b32 %r3181, %r3180, %r3131; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 20; + add.s32 %r3183, %r3018, %r3177; + add.s32 %r3184, %r3183, %r3182; + xor.b32 %r3185, %r3184, %r3179; + shf.l.wrap.b32 %r3186, %r3185, %r3185, 24; + add.s32 %r3187, %r3186, %r3180; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 25; + add.s32 %r3190, %r3161, %r3126; + add.s32 %r3191, %r3190, %r3025; + xor.b32 %r3192, %r3191, %r3098; + shf.l.wrap.b32 %r3193, %r3192, %r3192, 16; + add.s32 %r3194, %r3193, %r3072; + xor.b32 %r3195, %r3194, %r3161; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 20; + add.s32 %r3197, %r3032, %r3191; + add.s32 %r3198, %r3197, %r3196; + xor.b32 %r3199, %r3198, %r3193; + shf.l.wrap.b32 %r3200, %r3199, %r3199, 24; + add.s32 %r3201, %r3200, %r3194; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 25; + add.s32 %r3204, %r3156, %r3074; + add.s32 %r3205, %r3204, %r3039; + xor.b32 %r3206, %r3205, %r3128; + shf.l.wrap.b32 %r3207, %r3206, %r3206, 16; + add.s32 %r3208, %r3207, %r3099; + xor.b32 %r3209, %r3208, %r3074; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 20; + add.s32 %r3211, %r3046, %r3205; + add.s32 %r3212, %r3211, %r3210; + xor.b32 %r3213, %r3212, %r3207; + shf.l.wrap.b32 %r3214, %r3213, %r3213, 24; + add.s32 %r3215, %r3214, %r3208; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 25; + add.s32 %r3218, %r3170, %r2955; + add.s32 %r3219, %r3218, %r3217; + xor.b32 %r3220, %r3219, %r3186; + shf.l.wrap.b32 %r3221, %r3220, %r3220, 16; + add.s32 %r3222, %r3221, %r3201; + xor.b32 %r3223, %r3222, %r3217; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 20; + add.s32 %r3225, %r3219, %r2983; + add.s32 %r3226, %r3225, %r3224; + xor.b32 %r3227, %r3226, %r3221; + shf.l.wrap.b32 %r3228, %r3227, %r3227, 24; + add.s32 %r3229, %r3228, %r3222; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 25; + add.s32 %r3232, %r3184, %r2962; + add.s32 %r3233, %r3232, %r3175; + xor.b32 %r3234, %r3200, %r3233; + shf.l.wrap.b32 %r3235, %r3234, %r3234, 16; + add.s32 %r3236, %r3215, %r3235; + xor.b32 %r3237, %r3236, %r3175; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 20; + add.s32 %r3239, %r3233, %r3011; + add.s32 %r3240, %r3239, %r3238; + xor.b32 %r3241, %r3240, %r3235; + shf.l.wrap.b32 %r3242, 
%r3241, %r3241, 24; + add.s32 %r3243, %r3242, %r3236; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 25; + add.s32 %r3246, %r3189, %r2990; + add.s32 %r3247, %r3246, %r3198; + xor.b32 %r3248, %r3214, %r3247; + shf.l.wrap.b32 %r3249, %r3248, %r3248, 16; + add.s32 %r3250, %r3249, %r3173; + xor.b32 %r3251, %r3250, %r3189; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 20; + add.s32 %r3253, %r3247, %r2941; + add.s32 %r3254, %r3253, %r3252; + xor.b32 %r3255, %r3254, %r3249; + shf.l.wrap.b32 %r3256, %r3255, %r3255, 24; + add.s32 %r3257, %r3256, %r3250; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 25; + add.s32 %r3260, %r3203, %r2969; + add.s32 %r3261, %r3260, %r3212; + xor.b32 %r3262, %r3261, %r3172; + shf.l.wrap.b32 %r3263, %r3262, %r3262, 16; + add.s32 %r3264, %r3263, %r3187; + xor.b32 %r3265, %r3264, %r3203; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 20; + add.s32 %r3267, %r3261, %r3032; + add.s32 %r3268, %r3267, %r3266; + xor.b32 %r3269, %r3268, %r3263; + shf.l.wrap.b32 %r3270, %r3269, %r3269, 24; + add.s32 %r3271, %r3270, %r3264; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 25; + add.s32 %r3274, %r3245, %r2948; + add.s32 %r3275, %r3274, %r3226; + xor.b32 %r3276, %r3275, %r3270; + shf.l.wrap.b32 %r3277, %r3276, %r3276, 16; + add.s32 %r3278, %r3277, %r3257; + xor.b32 %r3279, %r3278, %r3245; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 20; + add.s32 %r3281, %r3275, %r3018; + add.s32 %r3282, %r3281, %r3280; + xor.b32 %r3283, %r3282, %r3277; + shf.l.wrap.b32 %r3284, %r3283, %r3283, 24; + add.s32 %r3285, %r3284, %r3278; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 25; + add.s32 %r3288, %r3240, %r3025; + add.s32 %r3289, %r3288, %r3259; + xor.b32 %r3290, %r3228, %r3289; + shf.l.wrap.b32 %r3291, %r3290, %r3290, 16; + add.s32 %r3292, %r3291, %r3271; + xor.b32 %r3293, %r3292, %r3259; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 20; + add.s32 %r3295, %r3289, %r2976; + add.s32 %r3296, %r3295, %r3294; + xor.b32 %r3297, %r3296, %r3291; + shf.l.wrap.b32 %r3298, %r3297, %r3297, 24; + add.s32 %r3299, %r3298, %r3292; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 25; + add.s32 %r3302, %r3254, %r3004; + add.s32 %r3303, %r3302, %r3273; + xor.b32 %r3304, %r3303, %r3242; + shf.l.wrap.b32 %r3305, %r3304, %r3304, 16; + add.s32 %r3306, %r3305, %r3229; + xor.b32 %r3307, %r3306, %r3273; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 20; + add.s32 %r3309, %r3303, %r3039; + add.s32 %r3310, %r3309, %r3308; + xor.b32 %r3311, %r3310, %r3305; + shf.l.wrap.b32 %r3312, %r3311, %r3311, 24; + add.s32 %r3313, %r3312, %r3306; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 25; + add.s32 %r3316, %r3268, %r3046; + add.s32 %r3317, %r3316, %r3231; + xor.b32 %r3318, %r3317, %r3256; + shf.l.wrap.b32 %r3319, %r3318, %r3318, 16; + add.s32 %r3320, %r3319, %r3243; + xor.b32 %r3321, %r3320, %r3231; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 20; + add.s32 %r3323, %r3317, %r2997; + add.s32 %r3324, %r3323, %r3322; + xor.b32 %r3325, %r3324, %r3319; + shf.l.wrap.b32 %r3326, %r3325, %r3325, 24; + add.s32 %r3327, %r3326, %r3320; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 25; + add.s32 %r3330, %r3282, %r2962; + add.s32 %r3331, %r3330, %r3329; + xor.b32 %r3332, %r3331, %r3298; + shf.l.wrap.b32 %r3333, %r3332, %r3332, 16; + add.s32 %r3334, %r3333, %r3313; + xor.b32 %r3335, %r3334, %r3329; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 20; + add.s32 %r3337, %r3331, %r2969; + 
add.s32 %r3338, %r3337, %r3336; + xor.b32 %r3339, %r3338, %r3333; + shf.l.wrap.b32 %r3340, %r3339, %r3339, 24; + add.s32 %r3341, %r3340, %r3334; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 25; + add.s32 %r3344, %r3296, %r3011; + add.s32 %r3345, %r3344, %r3287; + xor.b32 %r3346, %r3345, %r3312; + shf.l.wrap.b32 %r3347, %r3346, %r3346, 16; + add.s32 %r3348, %r3347, %r3327; + xor.b32 %r3349, %r3348, %r3287; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 20; + add.s32 %r3351, %r3345, %r3025; + add.s32 %r3352, %r3351, %r3350; + xor.b32 %r3353, %r3352, %r3347; + shf.l.wrap.b32 %r3354, %r3353, %r3353, 24; + add.s32 %r3355, %r3354, %r3348; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 25; + add.s32 %r3358, %r3310, %r3032; + add.s32 %r3359, %r3358, %r3301; + xor.b32 %r3360, %r3326, %r3359; + shf.l.wrap.b32 %r3361, %r3360, %r3360, 16; + add.s32 %r3362, %r3361, %r3285; + xor.b32 %r3363, %r3362, %r3301; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 20; + add.s32 %r3365, %r3359, %r2955; + add.s32 %r3366, %r3365, %r3364; + xor.b32 %r3367, %r3366, %r3361; + shf.l.wrap.b32 %r3368, %r3367, %r3367, 24; + add.s32 %r3369, %r3368, %r3362; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 25; + add.s32 %r3372, %r3315, %r2990; + add.s32 %r3373, %r3372, %r3324; + xor.b32 %r3374, %r3373, %r3284; + shf.l.wrap.b32 %r3375, %r3374, %r3374, 16; + add.s32 %r3376, %r3375, %r3299; + xor.b32 %r3377, %r3376, %r3315; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 20; + add.s32 %r3379, %r3373, %r3039; + add.s32 %r3380, %r3379, %r3378; + xor.b32 %r3381, %r3380, %r3375; + shf.l.wrap.b32 %r3382, %r3381, %r3381, 24; + add.s32 %r3383, %r3382, %r3376; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 25; + add.s32 %r3386, %r3357, %r2983; + add.s32 %r3387, %r3386, %r3338; + xor.b32 %r3388, %r3387, %r3382; + shf.l.wrap.b32 %r3389, %r3388, %r3388, 16; + add.s32 %r3390, %r3389, %r3369; + xor.b32 %r3391, %r3390, %r3357; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 20; + add.s32 %r3393, %r3387, %r2976; + add.s32 %r3394, %r3393, %r3392; + xor.b32 %r3395, %r3394, %r3389; + shf.l.wrap.b32 %r3396, %r3395, %r3395, 24; + add.s32 %r3397, %r3396, %r3390; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 25; + add.s32 %r3400, %r3352, %r3004; + add.s32 %r3401, %r3400, %r3371; + xor.b32 %r3402, %r3340, %r3401; + shf.l.wrap.b32 %r3403, %r3402, %r3402, 16; + add.s32 %r3404, %r3403, %r3383; + xor.b32 %r3405, %r3404, %r3371; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 20; + add.s32 %r3407, %r3401, %r2941; + add.s32 %r3408, %r3407, %r3406; + xor.b32 %r3409, %r3408, %r3403; + shf.l.wrap.b32 %r3410, %r3409, %r3409, 24; + add.s32 %r3411, %r3410, %r3404; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 25; + add.s32 %r3414, %r3366, %r3018; + add.s32 %r3415, %r3414, %r3385; + xor.b32 %r3416, %r3415, %r3354; + shf.l.wrap.b32 %r3417, %r3416, %r3416, 16; + add.s32 %r3418, %r3417, %r3341; + xor.b32 %r3419, %r3418, %r3385; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 20; + add.s32 %r3421, %r3415, %r3046; + add.s32 %r3422, %r3421, %r3420; + xor.b32 %r3423, %r3422, %r3417; + shf.l.wrap.b32 %r3424, %r3423, %r3423, 24; + add.s32 %r3425, %r3424, %r3418; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 25; + add.s32 %r3428, %r3380, %r2997; + add.s32 %r3429, %r3428, %r3343; + xor.b32 %r3430, %r3429, %r3368; + shf.l.wrap.b32 %r3431, %r3430, %r3430, 16; + add.s32 %r3432, %r3431, %r3355; + xor.b32 %r3433, %r3432, 
%r3343; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 20; + add.s32 %r3435, %r3429, %r2948; + add.s32 %r3436, %r3435, %r3434; + xor.b32 %r3437, %r3436, %r3431; + shf.l.wrap.b32 %r3438, %r3437, %r3437, 24; + add.s32 %r3439, %r3438, %r3432; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 25; + add.s32 %r3442, %r3394, %r3011; + add.s32 %r3443, %r3442, %r3441; + xor.b32 %r3444, %r3443, %r3410; + shf.l.wrap.b32 %r3445, %r3444, %r3444, 16; + add.s32 %r3446, %r3445, %r3425; + xor.b32 %r3447, %r3446, %r3441; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 20; + add.s32 %r3449, %r3443, %r2990; + add.s32 %r3450, %r3449, %r3448; + xor.b32 %r3451, %r3450, %r3445; + shf.l.wrap.b32 %r3452, %r3451, %r3451, 24; + add.s32 %r3453, %r3452, %r3446; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 25; + add.s32 %r3456, %r3408, %r3025; + add.s32 %r3457, %r3456, %r3399; + xor.b32 %r3458, %r3457, %r3424; + shf.l.wrap.b32 %r3459, %r3458, %r3458, 16; + add.s32 %r3460, %r3459, %r3439; + xor.b32 %r3461, %r3460, %r3399; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 20; + add.s32 %r3463, %r3457, %r3004; + add.s32 %r3464, %r3463, %r3462; + xor.b32 %r3465, %r3464, %r3459; + shf.l.wrap.b32 %r3466, %r3465, %r3465, 24; + add.s32 %r3467, %r3466, %r3460; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 25; + add.s32 %r3470, %r3422, %r3039; + add.s32 %r3471, %r3470, %r3413; + xor.b32 %r3472, %r3438, %r3471; + shf.l.wrap.b32 %r3473, %r3472, %r3472, 16; + add.s32 %r3474, %r3473, %r3397; + xor.b32 %r3475, %r3474, %r3413; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 20; + add.s32 %r3477, %r3471, %r2962; + add.s32 %r3478, %r3477, %r3476; + xor.b32 %r3479, %r3478, %r3473; + shf.l.wrap.b32 %r3480, %r3479, %r3479, 24; + add.s32 %r3481, %r3480, %r3474; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 25; + add.s32 %r3484, %r3427, %r3032; + add.s32 %r3485, %r3484, %r3436; + xor.b32 %r3486, %r3485, %r3396; + shf.l.wrap.b32 %r3487, %r3486, %r3486, 16; + add.s32 %r3488, %r3487, %r3411; + xor.b32 %r3489, %r3488, %r3427; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 20; + add.s32 %r3491, %r3485, %r3046; + add.s32 %r3492, %r3491, %r3490; + xor.b32 %r3493, %r3492, %r3487; + shf.l.wrap.b32 %r3494, %r3493, %r3493, 24; + add.s32 %r3495, %r3494, %r3488; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 25; + add.s32 %r3498, %r3469, %r2969; + add.s32 %r3499, %r3498, %r3450; + xor.b32 %r3500, %r3499, %r3494; + shf.l.wrap.b32 %r3501, %r3500, %r3500, 16; + add.s32 %r3502, %r3501, %r3481; + xor.b32 %r3503, %r3502, %r3469; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 20; + add.s32 %r3505, %r3499, %r2941; + add.s32 %r3506, %r3505, %r3504; + xor.b32 %r3507, %r3506, %r3501; + shf.l.wrap.b32 %r3508, %r3507, %r3507, 24; + add.s32 %r3509, %r3508, %r3502; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 25; + add.s32 %r3512, %r3464, %r3018; + add.s32 %r3513, %r3512, %r3483; + xor.b32 %r3514, %r3452, %r3513; + shf.l.wrap.b32 %r3515, %r3514, %r3514, 16; + add.s32 %r3516, %r3515, %r3495; + xor.b32 %r3517, %r3516, %r3483; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 20; + add.s32 %r3519, %r3513, %r2955; + add.s32 %r3520, %r3519, %r3518; + xor.b32 %r3521, %r3520, %r3515; + shf.l.wrap.b32 %r3522, %r3521, %r3521, 24; + add.s32 %r3523, %r3522, %r3516; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 25; + add.s32 %r3526, %r3478, %r2976; + add.s32 %r3527, %r3526, %r3497; + xor.b32 %r3528, %r3527, %r3466; + shf.l.wrap.b32 
%r3529, %r3528, %r3528, 16; + add.s32 %r3530, %r3529, %r3453; + xor.b32 %r3531, %r3530, %r3497; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 20; + add.s32 %r3533, %r3527, %r2997; + add.s32 %r3534, %r3533, %r3532; + xor.b32 %r3535, %r3534, %r3529; + shf.l.wrap.b32 %r3536, %r3535, %r3535, 24; + add.s32 %r3537, %r3536, %r3530; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 25; + add.s32 %r3540, %r3492, %r2948; + add.s32 %r3541, %r3540, %r3455; + xor.b32 %r3542, %r3541, %r3480; + shf.l.wrap.b32 %r3543, %r3542, %r3542, 16; + add.s32 %r3544, %r3543, %r3467; + xor.b32 %r3545, %r3544, %r3455; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 20; + add.s32 %r3547, %r3541, %r2983; + add.s32 %r3548, %r3547, %r3546; + xor.b32 %r3549, %r3548, %r3543; + shf.l.wrap.b32 %r3550, %r3549, %r3549, 24; + add.s32 %r3551, %r3550, %r3544; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 25; + add.s32 %r3554, %r3506, %r3025; + add.s32 %r3555, %r3554, %r3553; + xor.b32 %r3556, %r3555, %r3522; + shf.l.wrap.b32 %r3557, %r3556, %r3556, 16; + add.s32 %r3558, %r3557, %r3537; + xor.b32 %r3559, %r3558, %r3553; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 20; + add.s32 %r3561, %r3555, %r3032; + add.s32 %r3562, %r3561, %r3560; + xor.b32 %r3563, %r3562, %r3557; + shf.l.wrap.b32 %r3564, %r3563, %r3563, 24; + add.s32 %r3565, %r3564, %r3558; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 25; + add.s32 %r3568, %r3520, %r3004; + add.s32 %r3569, %r3568, %r3511; + xor.b32 %r3570, %r3569, %r3536; + shf.l.wrap.b32 %r3571, %r3570, %r3570, 16; + add.s32 %r3572, %r3571, %r3551; + xor.b32 %r3573, %r3572, %r3511; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 20; + add.s32 %r3575, %r3569, %r3018; + add.s32 %r3576, %r3575, %r3574; + xor.b32 %r3577, %r3576, %r3571; + shf.l.wrap.b32 %r3578, %r3577, %r3577, 24; + add.s32 %r3579, %r3578, %r3572; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 25; + add.s32 %r3582, %r3534, %r3046; + add.s32 %r3583, %r3582, %r3525; + xor.b32 %r3584, %r3550, %r3583; + shf.l.wrap.b32 %r3585, %r3584, %r3584, 16; + add.s32 %r3586, %r3585, %r3509; + xor.b32 %r3587, %r3586, %r3525; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 20; + add.s32 %r3589, %r3583, %r3011; + add.s32 %r3590, %r3589, %r3588; + xor.b32 %r3591, %r3590, %r3585; + shf.l.wrap.b32 %r3592, %r3591, %r3591, 24; + add.s32 %r3593, %r3592, %r3586; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 25; + add.s32 %r3596, %r3539, %r3039; + add.s32 %r3597, %r3596, %r3548; + xor.b32 %r3598, %r3597, %r3508; + shf.l.wrap.b32 %r3599, %r3598, %r3598, 16; + add.s32 %r3600, %r3599, %r3523; + xor.b32 %r3601, %r3600, %r3539; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 20; + add.s32 %r3603, %r3597, %r2997; + add.s32 %r3604, %r3603, %r3602; + xor.b32 %r3605, %r3604, %r3599; + shf.l.wrap.b32 %r3606, %r3605, %r3605, 24; + add.s32 %r3607, %r3606, %r3600; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 25; + add.s32 %r3610, %r3581, %r2990; + add.s32 %r3611, %r3610, %r3562; + xor.b32 %r3612, %r3611, %r3606; + shf.l.wrap.b32 %r3613, %r3612, %r3612, 16; + add.s32 %r3614, %r3613, %r3593; + xor.b32 %r3615, %r3614, %r3581; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 20; + add.s32 %r3617, %r3611, %r2955; + add.s32 %r3618, %r3617, %r3616; + xor.b32 %r3619, %r3618, %r3613; + shf.l.wrap.b32 %r3620, %r3619, %r3619, 24; + add.s32 %r3621, %r3620, %r3614; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 25; + add.s32 %r3624, %r3576, %r2976; 
+ add.s32 %r3625, %r3624, %r3595; + xor.b32 %r3626, %r3564, %r3625; + shf.l.wrap.b32 %r3627, %r3626, %r3626, 16; + add.s32 %r3628, %r3627, %r3607; + xor.b32 %r3629, %r3628, %r3595; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 20; + add.s32 %r3631, %r3625, %r2962; + add.s32 %r3632, %r3631, %r3630; + xor.b32 %r3633, %r3632, %r3627; + shf.l.wrap.b32 %r3634, %r3633, %r3633, 24; + add.s32 %r3635, %r3634, %r3628; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 25; + add.s32 %r3638, %r3590, %r2941; + add.s32 %r3639, %r3638, %r3609; + xor.b32 %r3640, %r3639, %r3578; + shf.l.wrap.b32 %r3641, %r3640, %r3640, 16; + add.s32 %r3642, %r3641, %r3565; + xor.b32 %r3643, %r3642, %r3609; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 20; + add.s32 %r3645, %r3639, %r2948; + add.s32 %r3646, %r3645, %r3644; + xor.b32 %r3647, %r3646, %r3641; + shf.l.wrap.b32 %r3648, %r3647, %r3647, 24; + add.s32 %r3649, %r3648, %r3642; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 25; + add.s32 %r3652, %r3604, %r2983; + add.s32 %r3653, %r3652, %r3567; + xor.b32 %r3654, %r3653, %r3592; + shf.l.wrap.b32 %r3655, %r3654, %r3654, 16; + add.s32 %r3656, %r3655, %r3579; + xor.b32 %r3657, %r3656, %r3567; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 20; + add.s32 %r3659, %r3653, %r2969; + add.s32 %r3660, %r3659, %r3658; + xor.b32 %r3661, %r3660, %r3655; + shf.l.wrap.b32 %r3662, %r3661, %r3661, 24; + add.s32 %r3663, %r3662, %r3656; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 25; + add.s32 %r3666, %r3618, %r3004; + add.s32 %r3667, %r3666, %r3665; + xor.b32 %r3668, %r3667, %r3634; + shf.l.wrap.b32 %r3669, %r3668, %r3668, 16; + add.s32 %r3670, %r3669, %r3649; + xor.b32 %r3671, %r3670, %r3665; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 20; + add.s32 %r3673, %r3667, %r3039; + add.s32 %r3674, %r3673, %r3672; + xor.b32 %r3675, %r3674, %r3669; + shf.l.wrap.b32 %r3676, %r3675, %r3675, 24; + add.s32 %r3677, %r3676, %r3670; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 25; + add.s32 %r3680, %r3632, %r3018; + add.s32 %r3681, %r3680, %r3623; + xor.b32 %r3682, %r3681, %r3648; + shf.l.wrap.b32 %r3683, %r3682, %r3682, 16; + add.s32 %r3684, %r3683, %r3663; + xor.b32 %r3685, %r3684, %r3623; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 20; + add.s32 %r3687, %r3681, %r2976; + add.s32 %r3688, %r3687, %r3686; + xor.b32 %r3689, %r3688, %r3683; + shf.l.wrap.b32 %r3690, %r3689, %r3689, 24; + add.s32 %r3691, %r3690, %r3684; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 25; + add.s32 %r3694, %r3646, %r2997; + add.s32 %r3695, %r3694, %r3637; + xor.b32 %r3696, %r3662, %r3695; + shf.l.wrap.b32 %r3697, %r3696, %r3696, 16; + add.s32 %r3698, %r3697, %r3621; + xor.b32 %r3699, %r3698, %r3637; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 20; + add.s32 %r3701, %r3695, %r3025; + add.s32 %r3702, %r3701, %r3700; + xor.b32 %r3703, %r3702, %r3697; + shf.l.wrap.b32 %r3704, %r3703, %r3703, 24; + add.s32 %r3705, %r3704, %r3698; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 25; + add.s32 %r3708, %r3651, %r3046; + add.s32 %r3709, %r3708, %r3660; + xor.b32 %r3710, %r3709, %r3620; + shf.l.wrap.b32 %r3711, %r3710, %r3710, 16; + add.s32 %r3712, %r3711, %r3635; + xor.b32 %r3713, %r3712, %r3651; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 20; + add.s32 %r3715, %r3709, %r2948; + add.s32 %r3716, %r3715, %r3714; + xor.b32 %r3717, %r3716, %r3711; + shf.l.wrap.b32 %r3718, %r3717, %r3717, 24; + add.s32 %r3719, %r3718, %r3712; + xor.b32 %r3720, %r3719, 
%r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 25; + add.s32 %r3722, %r3693, %r3032; + add.s32 %r3723, %r3722, %r3674; + xor.b32 %r3724, %r3723, %r3718; + shf.l.wrap.b32 %r3725, %r3724, %r3724, 16; + add.s32 %r3726, %r3725, %r3705; + xor.b32 %r3727, %r3726, %r3693; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 20; + add.s32 %r3729, %r3723, %r2962; + add.s32 %r3730, %r3729, %r3728; + xor.b32 %r3731, %r3730, %r3725; + shf.l.wrap.b32 %r3732, %r3731, %r3731, 24; + add.s32 %r3733, %r3732, %r3726; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 25; + add.s32 %r3736, %r3688, %r2941; + add.s32 %r3737, %r3736, %r3707; + xor.b32 %r3738, %r3676, %r3737; + shf.l.wrap.b32 %r3739, %r3738, %r3738, 16; + add.s32 %r3740, %r3739, %r3719; + xor.b32 %r3741, %r3740, %r3707; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 20; + add.s32 %r3743, %r3737, %r3011; + add.s32 %r3744, %r3743, %r3742; + xor.b32 %r3745, %r3744, %r3739; + shf.l.wrap.b32 %r3746, %r3745, %r3745, 24; + add.s32 %r3747, %r3746, %r3740; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 25; + add.s32 %r3750, %r3702, %r2955; + add.s32 %r3751, %r3750, %r3721; + xor.b32 %r3752, %r3751, %r3690; + shf.l.wrap.b32 %r3753, %r3752, %r3752, 16; + add.s32 %r3754, %r3753, %r3677; + xor.b32 %r3755, %r3754, %r3721; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 20; + add.s32 %r3757, %r3751, %r2983; + add.s32 %r3758, %r3757, %r3756; + xor.b32 %r3759, %r3758, %r3753; + shf.l.wrap.b32 %r3760, %r3759, %r3759, 24; + add.s32 %r3761, %r3760, %r3754; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 25; + add.s32 %r3764, %r3716, %r2969; + add.s32 %r3765, %r3764, %r3679; + xor.b32 %r3766, %r3765, %r3704; + shf.l.wrap.b32 %r3767, %r3766, %r3766, 16; + add.s32 %r3768, %r3767, %r3691; + xor.b32 %r3769, %r3768, %r3679; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 20; + add.s32 %r3771, %r3765, %r2990; + add.s32 %r3772, %r3771, %r3770; + xor.b32 %r3773, %r3772, %r3767; + shf.l.wrap.b32 %r3774, %r3773, %r3773, 24; + add.s32 %r3775, %r3774, %r3768; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 25; + add.s32 %r3778, %r3730, %r3018; + add.s32 %r3779, %r3778, %r3777; + xor.b32 %r3780, %r3779, %r3746; + shf.l.wrap.b32 %r3781, %r3780, %r3780, 16; + add.s32 %r3782, %r3781, %r3761; + xor.b32 %r3783, %r3782, %r3777; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 20; + add.s32 %r3785, %r3779, %r3046; + add.s32 %r3786, %r3785, %r3784; + xor.b32 %r3787, %r3786, %r3781; + shf.l.wrap.b32 %r3788, %r3787, %r3787, 24; + add.s32 %r3789, %r3788, %r3782; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 25; + add.s32 %r3792, %r3744, %r2976; + add.s32 %r3793, %r3792, %r3735; + xor.b32 %r3794, %r3793, %r3760; + shf.l.wrap.b32 %r3795, %r3794, %r3794, 16; + add.s32 %r3796, %r3795, %r3775; + xor.b32 %r3797, %r3796, %r3735; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 20; + add.s32 %r3799, %r3793, %r2941; + add.s32 %r3800, %r3799, %r3798; + xor.b32 %r3801, %r3800, %r3795; + shf.l.wrap.b32 %r3802, %r3801, %r3801, 24; + add.s32 %r3803, %r3802, %r3796; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 25; + add.s32 %r3806, %r3758, %r2948; + add.s32 %r3807, %r3806, %r3749; + xor.b32 %r3808, %r3774, %r3807; + shf.l.wrap.b32 %r3809, %r3808, %r3808, 16; + add.s32 %r3810, %r3809, %r3733; + xor.b32 %r3811, %r3810, %r3749; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 20; + add.s32 %r3813, %r3807, %r3004; + add.s32 %r3814, %r3813, %r3812; + xor.b32 %r3815, %r3814, %r3809; + shf.l.wrap.b32 
%r3816, %r3815, %r3815, 24; + add.s32 %r3817, %r3816, %r3810; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 25; + add.s32 %r3820, %r3763, %r2997; + add.s32 %r3821, %r3820, %r3772; + xor.b32 %r3822, %r3821, %r3732; + shf.l.wrap.b32 %r3823, %r3822, %r3822, 16; + add.s32 %r3824, %r3823, %r3747; + xor.b32 %r3825, %r3824, %r3763; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 20; + add.s32 %r3827, %r3821, %r2983; + add.s32 %r3828, %r3827, %r3826; + xor.b32 %r3829, %r3828, %r3823; + shf.l.wrap.b32 %r3830, %r3829, %r3829, 24; + add.s32 %r3831, %r3830, %r3824; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 25; + add.s32 %r3834, %r3805, %r3039; + add.s32 %r3835, %r3834, %r3786; + xor.b32 %r3836, %r3835, %r3830; + shf.l.wrap.b32 %r3837, %r3836, %r3836, 16; + add.s32 %r3838, %r3837, %r3817; + xor.b32 %r3839, %r3838, %r3805; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 20; + add.s32 %r3841, %r3835, %r3011; + add.s32 %r3842, %r3841, %r3840; + xor.b32 %r3843, %r3842, %r3837; + shf.l.wrap.b32 %r3844, %r3843, %r3843, 24; + add.s32 %r3845, %r3844, %r3838; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 25; + add.s32 %r3848, %r3800, %r2955; + add.s32 %r3849, %r3848, %r3819; + xor.b32 %r3850, %r3788, %r3849; + shf.l.wrap.b32 %r3851, %r3850, %r3850, 16; + add.s32 %r3852, %r3851, %r3831; + xor.b32 %r3853, %r3852, %r3819; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 20; + add.s32 %r3855, %r3849, %r3025; + add.s32 %r3856, %r3855, %r3854; + xor.b32 %r3857, %r3856, %r3851; + shf.l.wrap.b32 %r3858, %r3857, %r3857, 24; + add.s32 %r3859, %r3858, %r3852; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 25; + add.s32 %r3862, %r3814, %r2962; + add.s32 %r3863, %r3862, %r3833; + xor.b32 %r3864, %r3863, %r3802; + shf.l.wrap.b32 %r3865, %r3864, %r3864, 16; + add.s32 %r3866, %r3865, %r3789; + xor.b32 %r3867, %r3866, %r3833; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 20; + add.s32 %r3869, %r3863, %r2969; + add.s32 %r3870, %r3869, %r3868; + xor.b32 %r3871, %r3870, %r3865; + shf.l.wrap.b32 %r3872, %r3871, %r3871, 24; + add.s32 %r3873, %r3872, %r3866; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 25; + add.s32 %r3876, %r3828, %r2990; + add.s32 %r3877, %r3876, %r3791; + xor.b32 %r3878, %r3877, %r3816; + shf.l.wrap.b32 %r3879, %r3878, %r3878, 16; + add.s32 %r3880, %r3879, %r3803; + xor.b32 %r3881, %r3880, %r3791; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 20; + add.s32 %r3883, %r3877, %r3032; + add.s32 %r3884, %r3883, %r3882; + xor.b32 %r3885, %r3884, %r3879; + shf.l.wrap.b32 %r3886, %r3885, %r3885, 24; + add.s32 %r3887, %r3886, %r3880; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 25; + xor.b32 %r3890, %r3873, %r3842; + xor.b32 %r3891, %r3887, %r3856; + xor.b32 %r3892, %r3845, %r3870; + xor.b32 %r3893, %r3884, %r3859; + xor.b32 %r3894, %r3889, %r3858; + xor.b32 %r3895, %r3847, %r3872; + xor.b32 %r3896, %r3886, %r3861; + xor.b32 %r3897, %r3875, %r3844; + st.local.u8 [%rd132+145], %r3890; + shr.u32 %r3898, %r3890, 8; + st.local.u8 [%rd132+146], %r3898; + shr.u32 %r3899, %r3890, 16; + st.local.u8 [%rd132+147], %r3899; + shr.u32 %r3900, %r3890, 24; + st.local.u8 [%rd132+148], %r3900; + st.local.u8 [%rd132+149], %r3891; + shr.u32 %r3901, %r3891, 8; + st.local.u8 [%rd132+150], %r3901; + shr.u32 %r3902, %r3891, 16; + st.local.u8 [%rd132+151], %r3902; + shr.u32 %r3903, %r3891, 24; + st.local.u8 [%rd132+152], %r3903; + st.local.u8 [%rd132+153], %r3892; + shr.u32 %r3904, %r3892, 8; + 
st.local.u8 [%rd132+154], %r3904; + shr.u32 %r3905, %r3892, 16; + st.local.u8 [%rd132+155], %r3905; + shr.u32 %r3906, %r3892, 24; + st.local.u8 [%rd132+156], %r3906; + st.local.u8 [%rd132+157], %r3893; + shr.u32 %r3907, %r3893, 8; + st.local.u8 [%rd132+158], %r3907; + shr.u32 %r3908, %r3893, 16; + st.local.u8 [%rd132+159], %r3908; + shr.u32 %r3909, %r3893, 24; + st.local.u8 [%rd132+160], %r3909; + st.local.u8 [%rd132+161], %r3894; + shr.u32 %r3910, %r3894, 8; + st.local.u8 [%rd132+162], %r3910; + shr.u32 %r3911, %r3894, 16; + st.local.u8 [%rd132+163], %r3911; + shr.u32 %r3912, %r3894, 24; + st.local.u8 [%rd132+164], %r3912; + st.local.u8 [%rd132+165], %r3895; + shr.u32 %r3913, %r3895, 8; + st.local.u8 [%rd132+166], %r3913; + shr.u32 %r3914, %r3895, 16; + st.local.u8 [%rd132+167], %r3914; + shr.u32 %r3915, %r3895, 24; + st.local.u8 [%rd132+168], %r3915; + st.local.u8 [%rd132+169], %r3896; + shr.u32 %r3916, %r3896, 8; + st.local.u8 [%rd132+170], %r3916; + shr.u32 %r3917, %r3896, 16; + st.local.u8 [%rd132+171], %r3917; + shr.u32 %r3918, %r3896, 24; + st.local.u8 [%rd132+172], %r3918; + st.local.u8 [%rd132+173], %r3897; + shr.u32 %r3919, %r3897, 8; + st.local.u8 [%rd132+174], %r3919; + shr.u32 %r3920, %r3897, 16; + st.local.u8 [%rd132+175], %r3920; + shr.u32 %r3921, %r3897, 24; + st.local.u8 [%rd132+176], %r3921; + ld.local.u8 %rs128, [%rd3+8]; + add.s16 %rs129, %rs128, -1; + st.local.u8 [%rd3+8], %rs129; + cvt.u64.u16 %rd133, %rs129; + and.b64 %rd134, %rd133, 255; + setp.lt.u64 %p16, %rd30, %rd134; + and.b16 %rs130, %rs129, 255; + mul.wide.u16 %r11659, %rs130, 32; + @%p16 bra $L__BB1_18; + +$L__BB1_19: + ld.param.u64 %rd223, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvt.s64.s32 %rd136, %r11659; + add.s64 %rd137, %rd2, %rd136; + mov.u64 %rd246, 0; + st.local.u8 [%rd137+145], %r27; + shr.u32 %r3922, %r27, 8; + st.local.u8 [%rd137+146], %r3922; + shr.u32 %r3923, %r27, 16; + st.local.u8 [%rd137+147], %r3923; + shr.u32 %r3924, %r27, 24; + st.local.u8 [%rd137+148], %r3924; + st.local.u8 [%rd137+149], %r28; + shr.u32 %r3925, %r28, 8; + st.local.u8 [%rd137+150], %r3925; + shr.u32 %r3926, %r28, 16; + st.local.u8 [%rd137+151], %r3926; + shr.u32 %r3927, %r28, 24; + st.local.u8 [%rd137+152], %r3927; + st.local.u8 [%rd137+153], %r29; + shr.u32 %r3928, %r29, 8; + st.local.u8 [%rd137+154], %r3928; + shr.u32 %r3929, %r29, 16; + st.local.u8 [%rd137+155], %r3929; + shr.u32 %r3930, %r29, 24; + st.local.u8 [%rd137+156], %r3930; + st.local.u8 [%rd137+157], %r30; + shr.u32 %r3931, %r30, 8; + st.local.u8 [%rd137+158], %r3931; + shr.u32 %r3932, %r30, 16; + st.local.u8 [%rd137+159], %r3932; + shr.u32 %r3933, %r30, 24; + st.local.u8 [%rd137+160], %r3933; + st.local.u8 [%rd137+161], %r31; + shr.u32 %r3934, %r31, 8; + st.local.u8 [%rd137+162], %r3934; + shr.u32 %r3935, %r31, 16; + st.local.u8 [%rd137+163], %r3935; + shr.u32 %r3936, %r31, 24; + st.local.u8 [%rd137+164], %r3936; + st.local.u8 [%rd137+165], %r32; + shr.u32 %r3937, %r32, 8; + st.local.u8 [%rd137+166], %r3937; + shr.u32 %r3938, %r32, 16; + st.local.u8 [%rd137+167], %r3938; + shr.u32 %r3939, %r32, 24; + st.local.u8 [%rd137+168], %r3939; + st.local.u8 [%rd137+169], %r33; + shr.u32 %r3940, %r33, 8; + st.local.u8 [%rd137+170], %r3940; + shr.u32 %r3941, %r33, 16; + st.local.u8 [%rd137+171], %r3941; + shr.u32 %r3942, %r33, 24; + st.local.u8 [%rd137+172], %r3942; + st.local.u8 [%rd137+173], %r34; + shr.u32 %r3943, %r34, 8; + st.local.u8 [%rd137+174], %r3943; + shr.u32 %r3944, %r34, 16; + st.local.u8 [%rd137+175], %r3944; + shr.u32 %r3945, %r34, 24; 
+ st.local.u8 [%rd137+176], %r3945; + ld.local.u8 %rs131, [%rd3+8]; + add.s16 %rs132, %rs131, 1; + st.local.u8 [%rd3+8], %rs132; + ld.local.u64 %rd138, [%rd3+-72]; + add.s64 %rd32, %rd138, 1; + add.s64 %rd254, %rd223, %rd6; + +$L__BB1_20: + add.s64 %rd139, %rd2, %rd246; + ld.local.u8 %rs133, [%rd139]; + st.local.u8 [%rd139+32], %rs133; + add.s64 %rd246, %rd246, 1; + setp.lt.u64 %p17, %rd246, 32; + @%p17 bra $L__BB1_20; + + mov.u64 %rd247, 0; + st.local.u64 [%rd3+-72], %rd32; + mov.u16 %rs134, 0; + st.local.u8 [%rd3+1], %rs134; + +$L__BB1_22: + add.s64 %rd141, %rd2, %rd247; + st.local.u8 [%rd141+72], %rs134; + add.s64 %rd247, %rd247, 1; + setp.lt.u64 %p18, %rd247, 64; + @%p18 bra $L__BB1_22; + + ld.param.u64 %rd236, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd235, %rd236; + add.s64 %rd261, %rd235, %rd6; + mov.u64 %rd225, 32; + sub.s64 %rd262, %rd225, %rd6; + mov.u16 %rs136, 0; + st.local.u8 [%rd3], %rs136; + +$L__BB1_24: + setp.lt.u64 %p19, %rd262, 1025; + @%p19 bra $L__BB1_48; + + ld.local.u64 %rd251, [%rd3+-72]; + add.u64 %rd142, %SP, 0; + add.u64 %rd42, %SPL, 0; + +$L__BB1_26: + or.b64 %rd143, %rd262, 1; + mov.u64 %rd144, 1; + setp.gt.u64 %p20, %rd143, 4294967295; + shr.u64 %rd145, %rd262, 32; + selp.b64 %rd146, %rd145, %rd143, %p20; + selp.b32 %r3946, 32, 0, %p20; + and.b64 %rd147, %rd146, 4294901760; + setp.ne.s64 %p21, %rd147, 0; + shr.u64 %rd148, %rd146, 16; + or.b32 %r3947, %r3946, 16; + selp.b64 %rd149, %rd148, %rd146, %p21; + selp.b32 %r3948, %r3947, %r3946, %p21; + and.b64 %rd150, %rd149, 65280; + setp.ne.s64 %p22, %rd150, 0; + shr.u64 %rd151, %rd149, 8; + or.b32 %r3949, %r3948, 8; + selp.b64 %rd152, %rd151, %rd149, %p22; + selp.b32 %r3950, %r3949, %r3948, %p22; + and.b64 %rd153, %rd152, 240; + setp.ne.s64 %p23, %rd153, 0; + shr.u64 %rd154, %rd152, 4; + or.b32 %r3951, %r3950, 4; + selp.b64 %rd155, %rd154, %rd152, %p23; + selp.b32 %r3952, %r3951, %r3950, %p23; + and.b64 %rd156, %rd155, 12; + setp.ne.s64 %p24, %rd156, 0; + shr.u64 %rd157, %rd155, 2; + add.s32 %r3953, %r3952, 2; + selp.b64 %rd158, %rd157, %rd155, %p24; + selp.b32 %r3954, %r3953, %r3952, %p24; + and.b64 %rd159, %rd158, 2; + shr.u64 %rd160, %rd159, 1; + cvt.u32.u64 %r3955, %rd160; + add.s32 %r3956, %r3954, %r3955; + shl.b64 %rd255, %rd144, %r3956; + shl.b64 %rd48, %rd251, 10; + +$L__BB1_27: + mov.u64 %rd49, %rd255; + add.s64 %rd161, %rd49, -1; + and.b64 %rd162, %rd161, %rd48; + setp.ne.s64 %p25, %rd162, 0; + shr.u64 %rd255, %rd49, 1; + @%p25 bra $L__BB1_27; + + ld.local.u8 %rs14, [%rd3+2]; + setp.lt.u64 %p26, %rd49, 1025; + @%p26 bra $L__BB1_36; + bra.uni $L__BB1_29; + +$L__BB1_36: + ld.local.u8 %r5955, [%rd3+-136]; + ld.local.u8 %r5956, [%rd3+-135]; + prmt.b32 %r5957, %r5956, %r5955, 30212; + ld.local.u8 %r5958, [%rd3+-134]; + ld.local.u8 %r5959, [%rd3+-133]; + prmt.b32 %r5960, %r5959, %r5958, 30212; + prmt.b32 %r11679, %r5960, %r5957, 4180; + ld.local.u8 %r5961, [%rd3+-132]; + ld.local.u8 %r5962, [%rd3+-131]; + prmt.b32 %r5963, %r5962, %r5961, 30212; + ld.local.u8 %r5964, [%rd3+-130]; + ld.local.u8 %r5965, [%rd3+-129]; + prmt.b32 %r5966, %r5965, %r5964, 30212; + prmt.b32 %r11678, %r5966, %r5963, 4180; + ld.local.u8 %r5967, [%rd3+-128]; + ld.local.u8 %r5968, [%rd3+-127]; + prmt.b32 %r5969, %r5968, %r5967, 30212; + ld.local.u8 %r5970, [%rd3+-126]; + ld.local.u8 %r5971, [%rd3+-125]; + prmt.b32 %r5972, %r5971, %r5970, 30212; + prmt.b32 %r11677, %r5972, %r5969, 4180; + ld.local.u8 %r5973, [%rd3+-124]; + ld.local.u8 %r5974, [%rd3+-123]; + prmt.b32 %r5975, %r5974, %r5973, 30212; + 
ld.local.u8 %r5976, [%rd3+-122]; + ld.local.u8 %r5977, [%rd3+-121]; + prmt.b32 %r5978, %r5977, %r5976, 30212; + prmt.b32 %r11676, %r5978, %r5975, 4180; + ld.local.u8 %r5979, [%rd3+-120]; + ld.local.u8 %r5980, [%rd3+-119]; + prmt.b32 %r5981, %r5980, %r5979, 30212; + ld.local.u8 %r5982, [%rd3+-118]; + ld.local.u8 %r5983, [%rd3+-117]; + prmt.b32 %r5984, %r5983, %r5982, 30212; + prmt.b32 %r11675, %r5984, %r5981, 4180; + ld.local.u8 %r5985, [%rd3+-116]; + ld.local.u8 %r5986, [%rd3+-115]; + prmt.b32 %r5987, %r5986, %r5985, 30212; + ld.local.u8 %r5988, [%rd3+-114]; + ld.local.u8 %r5989, [%rd3+-113]; + prmt.b32 %r5990, %r5989, %r5988, 30212; + prmt.b32 %r11674, %r5990, %r5987, 4180; + ld.local.u8 %r5991, [%rd3+-112]; + ld.local.u8 %r5992, [%rd3+-111]; + prmt.b32 %r5993, %r5992, %r5991, 30212; + ld.local.u8 %r5994, [%rd3+-110]; + ld.local.u8 %r5995, [%rd3+-109]; + prmt.b32 %r5996, %r5995, %r5994, 30212; + prmt.b32 %r11673, %r5996, %r5993, 4180; + ld.local.u8 %r5997, [%rd3+-108]; + ld.local.u8 %r5998, [%rd3+-107]; + prmt.b32 %r5999, %r5998, %r5997, 30212; + ld.local.u8 %r6000, [%rd3+-106]; + ld.local.u8 %r6001, [%rd3+-105]; + prmt.b32 %r6002, %r6001, %r6000, 30212; + prmt.b32 %r11672, %r6002, %r5999, 4180; + add.u64 %rd53, %SPL, 64; + mov.u32 %r6003, 0; + st.local.v2.u32 [%rd53], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+8], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+16], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+24], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+32], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+40], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+48], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+56], {%r6003, %r6003}; + mov.u16 %rs354, 0; + st.local.v2.u8 [%rd53+64], {%rs354, %rs354}; + st.local.u8 [%rd53+66], %rs14; + cvt.u32.u64 %r71, %rd251; + shr.u64 %rd185, %rd251, 32; + cvt.u32.u64 %r72, %rd185; + setp.lt.u64 %p31, %rd49, 65; + mov.u64 %rd258, %rd261; + mov.u64 %rd259, %rd49; + @%p31 bra $L__BB1_39; + + add.s64 %rd54, %rd53, 64; + mov.u16 %rs353, 0; + mov.u64 %rd259, %rd49; + mov.u64 %rd258, %rd261; + +$L__BB1_38: + and.b16 %rs213, %rs353, 255; + setp.eq.s16 %p32, %rs213, 0; + selp.u16 %rs214, 1, 0, %p32; + or.b16 %rs215, %rs14, %rs214; + ld.local.u8 %r6004, [%rd258]; + ld.local.u8 %r6005, [%rd258+1]; + prmt.b32 %r6006, %r6005, %r6004, 30212; + ld.local.u8 %r6007, [%rd258+2]; + prmt.b32 %r6008, %r6007, %r6006, 28756; + ld.local.u8 %r6009, [%rd258+3]; + prmt.b32 %r6010, %r6009, %r6008, 1620; + ld.local.u8 %r6011, [%rd258+4]; + ld.local.u8 %r6012, [%rd258+5]; + prmt.b32 %r6013, %r6012, %r6011, 30212; + ld.local.u8 %r6014, [%rd258+6]; + prmt.b32 %r6015, %r6014, %r6013, 28756; + ld.local.u8 %r6016, [%rd258+7]; + prmt.b32 %r6017, %r6016, %r6015, 1620; + ld.local.u8 %r6018, [%rd258+8]; + ld.local.u8 %r6019, [%rd258+9]; + prmt.b32 %r6020, %r6019, %r6018, 30212; + ld.local.u8 %r6021, [%rd258+10]; + prmt.b32 %r6022, %r6021, %r6020, 28756; + ld.local.u8 %r6023, [%rd258+11]; + prmt.b32 %r6024, %r6023, %r6022, 1620; + ld.local.u8 %r6025, [%rd258+12]; + ld.local.u8 %r6026, [%rd258+13]; + prmt.b32 %r6027, %r6026, %r6025, 30212; + ld.local.u8 %r6028, [%rd258+14]; + prmt.b32 %r6029, %r6028, %r6027, 28756; + ld.local.u8 %r6030, [%rd258+15]; + prmt.b32 %r6031, %r6030, %r6029, 1620; + ld.local.u8 %r6032, [%rd258+16]; + ld.local.u8 %r6033, [%rd258+17]; + prmt.b32 %r6034, %r6033, %r6032, 30212; + ld.local.u8 %r6035, [%rd258+18]; + prmt.b32 %r6036, %r6035, %r6034, 28756; + ld.local.u8 %r6037, [%rd258+19]; + prmt.b32 %r6038, %r6037, %r6036, 1620; + ld.local.u8 %r6039, [%rd258+20]; + ld.local.u8 %r6040, 
[%rd258+21]; + prmt.b32 %r6041, %r6040, %r6039, 30212; + ld.local.u8 %r6042, [%rd258+22]; + prmt.b32 %r6043, %r6042, %r6041, 28756; + ld.local.u8 %r6044, [%rd258+23]; + prmt.b32 %r6045, %r6044, %r6043, 1620; + ld.local.u8 %r6046, [%rd258+24]; + ld.local.u8 %r6047, [%rd258+25]; + prmt.b32 %r6048, %r6047, %r6046, 30212; + ld.local.u8 %r6049, [%rd258+26]; + prmt.b32 %r6050, %r6049, %r6048, 28756; + ld.local.u8 %r6051, [%rd258+27]; + prmt.b32 %r6052, %r6051, %r6050, 1620; + ld.local.u8 %r6053, [%rd258+28]; + ld.local.u8 %r6054, [%rd258+29]; + prmt.b32 %r6055, %r6054, %r6053, 30212; + ld.local.u8 %r6056, [%rd258+30]; + prmt.b32 %r6057, %r6056, %r6055, 28756; + ld.local.u8 %r6058, [%rd258+31]; + prmt.b32 %r6059, %r6058, %r6057, 1620; + ld.local.u8 %r6060, [%rd258+32]; + ld.local.u8 %r6061, [%rd258+33]; + prmt.b32 %r6062, %r6061, %r6060, 30212; + ld.local.u8 %r6063, [%rd258+34]; + prmt.b32 %r6064, %r6063, %r6062, 28756; + ld.local.u8 %r6065, [%rd258+35]; + prmt.b32 %r6066, %r6065, %r6064, 1620; + ld.local.u8 %r6067, [%rd258+36]; + ld.local.u8 %r6068, [%rd258+37]; + prmt.b32 %r6069, %r6068, %r6067, 30212; + ld.local.u8 %r6070, [%rd258+38]; + prmt.b32 %r6071, %r6070, %r6069, 28756; + ld.local.u8 %r6072, [%rd258+39]; + prmt.b32 %r6073, %r6072, %r6071, 1620; + ld.local.u8 %r6074, [%rd258+40]; + ld.local.u8 %r6075, [%rd258+41]; + prmt.b32 %r6076, %r6075, %r6074, 30212; + ld.local.u8 %r6077, [%rd258+42]; + prmt.b32 %r6078, %r6077, %r6076, 28756; + ld.local.u8 %r6079, [%rd258+43]; + prmt.b32 %r6080, %r6079, %r6078, 1620; + ld.local.u8 %r6081, [%rd258+44]; + ld.local.u8 %r6082, [%rd258+45]; + prmt.b32 %r6083, %r6082, %r6081, 30212; + ld.local.u8 %r6084, [%rd258+46]; + prmt.b32 %r6085, %r6084, %r6083, 28756; + ld.local.u8 %r6086, [%rd258+47]; + prmt.b32 %r6087, %r6086, %r6085, 1620; + ld.local.u8 %r6088, [%rd258+48]; + ld.local.u8 %r6089, [%rd258+49]; + prmt.b32 %r6090, %r6089, %r6088, 30212; + ld.local.u8 %r6091, [%rd258+50]; + prmt.b32 %r6092, %r6091, %r6090, 28756; + ld.local.u8 %r6093, [%rd258+51]; + prmt.b32 %r6094, %r6093, %r6092, 1620; + ld.local.u8 %r6095, [%rd258+52]; + ld.local.u8 %r6096, [%rd258+53]; + prmt.b32 %r6097, %r6096, %r6095, 30212; + ld.local.u8 %r6098, [%rd258+54]; + prmt.b32 %r6099, %r6098, %r6097, 28756; + ld.local.u8 %r6100, [%rd258+55]; + prmt.b32 %r6101, %r6100, %r6099, 1620; + ld.local.u8 %r6102, [%rd258+56]; + ld.local.u8 %r6103, [%rd258+57]; + prmt.b32 %r6104, %r6103, %r6102, 30212; + ld.local.u8 %r6105, [%rd258+58]; + prmt.b32 %r6106, %r6105, %r6104, 28756; + ld.local.u8 %r6107, [%rd258+59]; + prmt.b32 %r6108, %r6107, %r6106, 1620; + ld.local.u8 %r6109, [%rd258+60]; + ld.local.u8 %r6110, [%rd258+61]; + prmt.b32 %r6111, %r6110, %r6109, 30212; + ld.local.u8 %r6112, [%rd258+62]; + prmt.b32 %r6113, %r6112, %r6111, 28756; + ld.local.u8 %r6114, [%rd258+63]; + prmt.b32 %r6115, %r6114, %r6113, 1620; + cvt.u32.u16 %r6116, %rs215; + and.b32 %r6117, %r6116, 255; + add.s32 %r6118, %r11679, %r11675; + add.s32 %r6119, %r6118, %r6010; + xor.b32 %r6120, %r6119, %r71; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 16; + add.s32 %r6122, %r6121, 1779033703; + xor.b32 %r6123, %r6122, %r11675; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 20; + add.s32 %r6125, %r6017, %r6119; + add.s32 %r6126, %r6125, %r6124; + xor.b32 %r6127, %r6126, %r6121; + shf.l.wrap.b32 %r6128, %r6127, %r6127, 24; + add.s32 %r6129, %r6128, %r6122; + xor.b32 %r6130, %r6129, %r6124; + shf.l.wrap.b32 %r6131, %r6130, %r6130, 25; + add.s32 %r6132, %r11678, %r11674; + add.s32 %r6133, %r6132, %r6024; + xor.b32 %r6134, %r6133, %r72; + 
shf.l.wrap.b32 %r6135, %r6134, %r6134, 16; + add.s32 %r6136, %r6135, -1150833019; + xor.b32 %r6137, %r6136, %r11674; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 20; + add.s32 %r6139, %r6031, %r6133; + add.s32 %r6140, %r6139, %r6138; + xor.b32 %r6141, %r6140, %r6135; + shf.l.wrap.b32 %r6142, %r6141, %r6141, 24; + add.s32 %r6143, %r6142, %r6136; + xor.b32 %r6144, %r6143, %r6138; + shf.l.wrap.b32 %r6145, %r6144, %r6144, 25; + add.s32 %r6146, %r11677, %r11673; + add.s32 %r6147, %r6146, %r6038; + shr.u32 %r6148, %r6147, 16; + shl.b32 %r6149, %r6147, 16; + xor.b32 %r6150, %r6149, 4194304; + or.b32 %r6151, %r6150, %r6148; + add.s32 %r6152, %r6151, 1013904242; + xor.b32 %r6153, %r6152, %r11673; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6045, %r6147; + add.s32 %r6156, %r6155, %r6154; + xor.b32 %r6157, %r6156, %r6151; + shf.l.wrap.b32 %r6158, %r6157, %r6157, 24; + add.s32 %r6159, %r6158, %r6152; + xor.b32 %r6160, %r6159, %r6154; + shf.l.wrap.b32 %r6161, %r6160, %r6160, 25; + add.s32 %r6162, %r11676, %r11672; + add.s32 %r6163, %r6162, %r6052; + xor.b32 %r6164, %r6163, %r6117; + shr.u32 %r6165, %r6163, 16; + shl.b32 %r6166, %r6164, 16; + or.b32 %r6167, %r6166, %r6165; + add.s32 %r6168, %r6167, -1521486534; + xor.b32 %r6169, %r6168, %r11672; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6059, %r6163; + add.s32 %r6172, %r6171, %r6170; + xor.b32 %r6173, %r6172, %r6167; + shf.l.wrap.b32 %r6174, %r6173, %r6173, 24; + add.s32 %r6175, %r6174, %r6168; + xor.b32 %r6176, %r6175, %r6170; + shf.l.wrap.b32 %r6177, %r6176, %r6176, 25; + add.s32 %r6178, %r6145, %r6126; + add.s32 %r6179, %r6178, %r6066; + xor.b32 %r6180, %r6174, %r6179; + shf.l.wrap.b32 %r6181, %r6180, %r6180, 16; + add.s32 %r6182, %r6181, %r6159; + xor.b32 %r6183, %r6182, %r6145; + shf.l.wrap.b32 %r6184, %r6183, %r6183, 20; + add.s32 %r6185, %r6073, %r6179; + add.s32 %r6186, %r6185, %r6184; + xor.b32 %r6187, %r6186, %r6181; + shf.l.wrap.b32 %r6188, %r6187, %r6187, 24; + add.s32 %r6189, %r6188, %r6182; + xor.b32 %r6190, %r6189, %r6184; + shf.l.wrap.b32 %r6191, %r6190, %r6190, 25; + add.s32 %r6192, %r6161, %r6140; + add.s32 %r6193, %r6192, %r6080; + xor.b32 %r6194, %r6193, %r6128; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 16; + add.s32 %r6196, %r6195, %r6175; + xor.b32 %r6197, %r6196, %r6161; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 20; + add.s32 %r6199, %r6087, %r6193; + add.s32 %r6200, %r6199, %r6198; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 24; + add.s32 %r6203, %r6202, %r6196; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 25; + add.s32 %r6206, %r6177, %r6156; + add.s32 %r6207, %r6206, %r6094; + xor.b32 %r6208, %r6207, %r6142; + shf.l.wrap.b32 %r6209, %r6208, %r6208, 16; + add.s32 %r6210, %r6209, %r6129; + xor.b32 %r6211, %r6210, %r6177; + shf.l.wrap.b32 %r6212, %r6211, %r6211, 20; + add.s32 %r6213, %r6101, %r6207; + add.s32 %r6214, %r6213, %r6212; + xor.b32 %r6215, %r6214, %r6209; + shf.l.wrap.b32 %r6216, %r6215, %r6215, 24; + add.s32 %r6217, %r6216, %r6210; + xor.b32 %r6218, %r6217, %r6212; + shf.l.wrap.b32 %r6219, %r6218, %r6218, 25; + add.s32 %r6220, %r6172, %r6131; + add.s32 %r6221, %r6220, %r6108; + xor.b32 %r6222, %r6221, %r6158; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 16; + add.s32 %r6224, %r6223, %r6143; + xor.b32 %r6225, %r6224, %r6131; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 20; + add.s32 %r6227, %r6115, %r6221; + add.s32 %r6228, %r6227, %r6226; + xor.b32 %r6229, %r6228, %r6223; + shf.l.wrap.b32 %r6230, %r6229, %r6229, 24; + add.s32 
%r6231, %r6230, %r6224; + xor.b32 %r6232, %r6231, %r6226; + shf.l.wrap.b32 %r6233, %r6232, %r6232, 25; + add.s32 %r6234, %r6186, %r6024; + add.s32 %r6235, %r6234, %r6233; + xor.b32 %r6236, %r6235, %r6202; + shf.l.wrap.b32 %r6237, %r6236, %r6236, 16; + add.s32 %r6238, %r6237, %r6217; + xor.b32 %r6239, %r6238, %r6233; + shf.l.wrap.b32 %r6240, %r6239, %r6239, 20; + add.s32 %r6241, %r6235, %r6052; + add.s32 %r6242, %r6241, %r6240; + xor.b32 %r6243, %r6242, %r6237; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6238; + xor.b32 %r6246, %r6245, %r6240; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6200, %r6031; + add.s32 %r6249, %r6248, %r6191; + xor.b32 %r6250, %r6216, %r6249; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6231, %r6251; + xor.b32 %r6253, %r6252, %r6191; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6080; + add.s32 %r6256, %r6255, %r6254; + xor.b32 %r6257, %r6256, %r6251; + shf.l.wrap.b32 %r6258, %r6257, %r6257, 24; + add.s32 %r6259, %r6258, %r6252; + xor.b32 %r6260, %r6259, %r6254; + shf.l.wrap.b32 %r6261, %r6260, %r6260, 25; + add.s32 %r6262, %r6205, %r6059; + add.s32 %r6263, %r6262, %r6214; + xor.b32 %r6264, %r6230, %r6263; + shf.l.wrap.b32 %r6265, %r6264, %r6264, 16; + add.s32 %r6266, %r6265, %r6189; + xor.b32 %r6267, %r6266, %r6205; + shf.l.wrap.b32 %r6268, %r6267, %r6267, 20; + add.s32 %r6269, %r6263, %r6010; + add.s32 %r6270, %r6269, %r6268; + xor.b32 %r6271, %r6270, %r6265; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6266; + xor.b32 %r6274, %r6273, %r6268; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6219, %r6038; + add.s32 %r6277, %r6276, %r6228; + xor.b32 %r6278, %r6277, %r6188; + shf.l.wrap.b32 %r6279, %r6278, %r6278, 16; + add.s32 %r6280, %r6279, %r6203; + xor.b32 %r6281, %r6280, %r6219; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 20; + add.s32 %r6283, %r6277, %r6101; + add.s32 %r6284, %r6283, %r6282; + xor.b32 %r6285, %r6284, %r6279; + shf.l.wrap.b32 %r6286, %r6285, %r6285, 24; + add.s32 %r6287, %r6286, %r6280; + xor.b32 %r6288, %r6287, %r6282; + shf.l.wrap.b32 %r6289, %r6288, %r6288, 25; + add.s32 %r6290, %r6242, %r6017; + add.s32 %r6291, %r6290, %r6261; + xor.b32 %r6292, %r6291, %r6286; + shf.l.wrap.b32 %r6293, %r6292, %r6292, 16; + add.s32 %r6294, %r6293, %r6273; + xor.b32 %r6295, %r6294, %r6261; + shf.l.wrap.b32 %r6296, %r6295, %r6295, 20; + add.s32 %r6297, %r6291, %r6087; + add.s32 %r6298, %r6297, %r6296; + xor.b32 %r6299, %r6298, %r6293; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 24; + add.s32 %r6301, %r6300, %r6294; + xor.b32 %r6302, %r6301, %r6296; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 25; + add.s32 %r6304, %r6256, %r6094; + add.s32 %r6305, %r6304, %r6275; + xor.b32 %r6306, %r6305, %r6244; + shf.l.wrap.b32 %r6307, %r6306, %r6306, 16; + add.s32 %r6308, %r6307, %r6287; + xor.b32 %r6309, %r6308, %r6275; + shf.l.wrap.b32 %r6310, %r6309, %r6309, 20; + add.s32 %r6311, %r6305, %r6045; + add.s32 %r6312, %r6311, %r6310; + xor.b32 %r6313, %r6312, %r6307; + shf.l.wrap.b32 %r6314, %r6313, %r6313, 24; + add.s32 %r6315, %r6314, %r6308; + xor.b32 %r6316, %r6315, %r6310; + shf.l.wrap.b32 %r6317, %r6316, %r6316, 25; + add.s32 %r6318, %r6270, %r6073; + add.s32 %r6319, %r6318, %r6289; + xor.b32 %r6320, %r6319, %r6258; + shf.l.wrap.b32 %r6321, %r6320, %r6320, 16; + add.s32 %r6322, %r6321, %r6245; + xor.b32 %r6323, %r6322, %r6289; + shf.l.wrap.b32 %r6324, %r6323, %r6323, 20; + add.s32 %r6325, %r6319, %r6108; + add.s32 %r6326, %r6325, %r6324; + 
xor.b32 %r6327, %r6326, %r6321; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 24; + add.s32 %r6329, %r6328, %r6322; + xor.b32 %r6330, %r6329, %r6324; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 25; + add.s32 %r6332, %r6284, %r6115; + add.s32 %r6333, %r6332, %r6247; + xor.b32 %r6334, %r6333, %r6272; + shf.l.wrap.b32 %r6335, %r6334, %r6334, 16; + add.s32 %r6336, %r6335, %r6259; + xor.b32 %r6337, %r6336, %r6247; + shf.l.wrap.b32 %r6338, %r6337, %r6337, 20; + add.s32 %r6339, %r6333, %r6066; + add.s32 %r6340, %r6339, %r6338; + xor.b32 %r6341, %r6340, %r6335; + shf.l.wrap.b32 %r6342, %r6341, %r6341, 24; + add.s32 %r6343, %r6342, %r6336; + xor.b32 %r6344, %r6343, %r6338; + shf.l.wrap.b32 %r6345, %r6344, %r6344, 25; + add.s32 %r6346, %r6298, %r6031; + add.s32 %r6347, %r6346, %r6345; + xor.b32 %r6348, %r6347, %r6314; + shf.l.wrap.b32 %r6349, %r6348, %r6348, 16; + add.s32 %r6350, %r6349, %r6329; + xor.b32 %r6351, %r6350, %r6345; + shf.l.wrap.b32 %r6352, %r6351, %r6351, 20; + add.s32 %r6353, %r6347, %r6038; + add.s32 %r6354, %r6353, %r6352; + xor.b32 %r6355, %r6354, %r6349; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6350; + xor.b32 %r6358, %r6357, %r6352; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6312, %r6080; + add.s32 %r6361, %r6360, %r6303; + xor.b32 %r6362, %r6361, %r6328; + shf.l.wrap.b32 %r6363, %r6362, %r6362, 16; + add.s32 %r6364, %r6363, %r6343; + xor.b32 %r6365, %r6364, %r6303; + shf.l.wrap.b32 %r6366, %r6365, %r6365, 20; + add.s32 %r6367, %r6361, %r6094; + add.s32 %r6368, %r6367, %r6366; + xor.b32 %r6369, %r6368, %r6363; + shf.l.wrap.b32 %r6370, %r6369, %r6369, 24; + add.s32 %r6371, %r6370, %r6364; + xor.b32 %r6372, %r6371, %r6366; + shf.l.wrap.b32 %r6373, %r6372, %r6372, 25; + add.s32 %r6374, %r6326, %r6101; + add.s32 %r6375, %r6374, %r6317; + xor.b32 %r6376, %r6375, %r6342; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6301; + xor.b32 %r6379, %r6378, %r6317; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6024; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6340, %r6059; + add.s32 %r6389, %r6388, %r6331; + xor.b32 %r6390, %r6389, %r6300; + shf.l.wrap.b32 %r6391, %r6390, %r6390, 16; + add.s32 %r6392, %r6391, %r6315; + xor.b32 %r6393, %r6392, %r6331; + shf.l.wrap.b32 %r6394, %r6393, %r6393, 20; + add.s32 %r6395, %r6389, %r6108; + add.s32 %r6396, %r6395, %r6394; + xor.b32 %r6397, %r6396, %r6391; + shf.l.wrap.b32 %r6398, %r6397, %r6397, 24; + add.s32 %r6399, %r6398, %r6392; + xor.b32 %r6400, %r6399, %r6394; + shf.l.wrap.b32 %r6401, %r6400, %r6400, 25; + add.s32 %r6402, %r6354, %r6052; + add.s32 %r6403, %r6402, %r6373; + xor.b32 %r6404, %r6403, %r6398; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 16; + add.s32 %r6406, %r6405, %r6385; + xor.b32 %r6407, %r6406, %r6373; + shf.l.wrap.b32 %r6408, %r6407, %r6407, 20; + add.s32 %r6409, %r6403, %r6045; + add.s32 %r6410, %r6409, %r6408; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 24; + add.s32 %r6413, %r6412, %r6406; + xor.b32 %r6414, %r6413, %r6408; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 25; + add.s32 %r6416, %r6368, %r6073; + add.s32 %r6417, %r6416, %r6387; + xor.b32 %r6418, %r6417, %r6356; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 16; + add.s32 %r6420, %r6419, %r6399; + xor.b32 %r6421, %r6420, %r6387; + shf.l.wrap.b32 %r6422, 
%r6421, %r6421, 20; + add.s32 %r6423, %r6417, %r6010; + add.s32 %r6424, %r6423, %r6422; + xor.b32 %r6425, %r6424, %r6419; + shf.l.wrap.b32 %r6426, %r6425, %r6425, 24; + add.s32 %r6427, %r6426, %r6420; + xor.b32 %r6428, %r6427, %r6422; + shf.l.wrap.b32 %r6429, %r6428, %r6428, 25; + add.s32 %r6430, %r6382, %r6087; + add.s32 %r6431, %r6430, %r6401; + xor.b32 %r6432, %r6431, %r6370; + shf.l.wrap.b32 %r6433, %r6432, %r6432, 16; + add.s32 %r6434, %r6433, %r6357; + xor.b32 %r6435, %r6434, %r6401; + shf.l.wrap.b32 %r6436, %r6435, %r6435, 20; + add.s32 %r6437, %r6431, %r6115; + add.s32 %r6438, %r6437, %r6436; + xor.b32 %r6439, %r6438, %r6433; + shf.l.wrap.b32 %r6440, %r6439, %r6439, 24; + add.s32 %r6441, %r6440, %r6434; + xor.b32 %r6442, %r6441, %r6436; + shf.l.wrap.b32 %r6443, %r6442, %r6442, 25; + add.s32 %r6444, %r6396, %r6066; + add.s32 %r6445, %r6444, %r6359; + xor.b32 %r6446, %r6445, %r6384; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 16; + add.s32 %r6448, %r6447, %r6371; + xor.b32 %r6449, %r6448, %r6359; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 20; + add.s32 %r6451, %r6445, %r6017; + add.s32 %r6452, %r6451, %r6450; + xor.b32 %r6453, %r6452, %r6447; + shf.l.wrap.b32 %r6454, %r6453, %r6453, 24; + add.s32 %r6455, %r6454, %r6448; + xor.b32 %r6456, %r6455, %r6450; + shf.l.wrap.b32 %r6457, %r6456, %r6456, 25; + add.s32 %r6458, %r6410, %r6080; + add.s32 %r6459, %r6458, %r6457; + xor.b32 %r6460, %r6459, %r6426; + shf.l.wrap.b32 %r6461, %r6460, %r6460, 16; + add.s32 %r6462, %r6461, %r6441; + xor.b32 %r6463, %r6462, %r6457; + shf.l.wrap.b32 %r6464, %r6463, %r6463, 20; + add.s32 %r6465, %r6459, %r6059; + add.s32 %r6466, %r6465, %r6464; + xor.b32 %r6467, %r6466, %r6461; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6462; + xor.b32 %r6470, %r6469, %r6464; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6424, %r6094; + add.s32 %r6473, %r6472, %r6415; + xor.b32 %r6474, %r6473, %r6440; + shf.l.wrap.b32 %r6475, %r6474, %r6474, 16; + add.s32 %r6476, %r6475, %r6455; + xor.b32 %r6477, %r6476, %r6415; + shf.l.wrap.b32 %r6478, %r6477, %r6477, 20; + add.s32 %r6479, %r6473, %r6073; + add.s32 %r6480, %r6479, %r6478; + xor.b32 %r6481, %r6480, %r6475; + shf.l.wrap.b32 %r6482, %r6481, %r6481, 24; + add.s32 %r6483, %r6482, %r6476; + xor.b32 %r6484, %r6483, %r6478; + shf.l.wrap.b32 %r6485, %r6484, %r6484, 25; + add.s32 %r6486, %r6438, %r6108; + add.s32 %r6487, %r6486, %r6429; + xor.b32 %r6488, %r6487, %r6454; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6413; + xor.b32 %r6491, %r6490, %r6429; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6031; + add.s32 %r6494, %r6493, %r6492; + xor.b32 %r6495, %r6494, %r6489; + shf.l.wrap.b32 %r6496, %r6495, %r6495, 24; + add.s32 %r6497, %r6496, %r6490; + xor.b32 %r6498, %r6497, %r6492; + shf.l.wrap.b32 %r6499, %r6498, %r6498, 25; + add.s32 %r6500, %r6452, %r6101; + add.s32 %r6501, %r6500, %r6443; + xor.b32 %r6502, %r6501, %r6412; + shf.l.wrap.b32 %r6503, %r6502, %r6502, 16; + add.s32 %r6504, %r6503, %r6427; + xor.b32 %r6505, %r6504, %r6443; + shf.l.wrap.b32 %r6506, %r6505, %r6505, 20; + add.s32 %r6507, %r6501, %r6115; + add.s32 %r6508, %r6507, %r6506; + xor.b32 %r6509, %r6508, %r6503; + shf.l.wrap.b32 %r6510, %r6509, %r6509, 24; + add.s32 %r6511, %r6510, %r6504; + xor.b32 %r6512, %r6511, %r6506; + shf.l.wrap.b32 %r6513, %r6512, %r6512, 25; + add.s32 %r6514, %r6466, %r6038; + add.s32 %r6515, %r6514, %r6485; + xor.b32 %r6516, %r6515, %r6510; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 16; + 
add.s32 %r6518, %r6517, %r6497; + xor.b32 %r6519, %r6518, %r6485; + shf.l.wrap.b32 %r6520, %r6519, %r6519, 20; + add.s32 %r6521, %r6515, %r6010; + add.s32 %r6522, %r6521, %r6520; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 24; + add.s32 %r6525, %r6524, %r6518; + xor.b32 %r6526, %r6525, %r6520; + shf.l.wrap.b32 %r6527, %r6526, %r6526, 25; + add.s32 %r6528, %r6480, %r6087; + add.s32 %r6529, %r6528, %r6499; + xor.b32 %r6530, %r6529, %r6468; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 16; + add.s32 %r6532, %r6531, %r6511; + xor.b32 %r6533, %r6532, %r6499; + shf.l.wrap.b32 %r6534, %r6533, %r6533, 20; + add.s32 %r6535, %r6529, %r6024; + add.s32 %r6536, %r6535, %r6534; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 24; + add.s32 %r6539, %r6538, %r6532; + xor.b32 %r6540, %r6539, %r6534; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 25; + add.s32 %r6542, %r6494, %r6045; + add.s32 %r6543, %r6542, %r6513; + xor.b32 %r6544, %r6543, %r6482; + shf.l.wrap.b32 %r6545, %r6544, %r6544, 16; + add.s32 %r6546, %r6545, %r6469; + xor.b32 %r6547, %r6546, %r6513; + shf.l.wrap.b32 %r6548, %r6547, %r6547, 20; + add.s32 %r6549, %r6543, %r6066; + add.s32 %r6550, %r6549, %r6548; + xor.b32 %r6551, %r6550, %r6545; + shf.l.wrap.b32 %r6552, %r6551, %r6551, 24; + add.s32 %r6553, %r6552, %r6546; + xor.b32 %r6554, %r6553, %r6548; + shf.l.wrap.b32 %r6555, %r6554, %r6554, 25; + add.s32 %r6556, %r6508, %r6017; + add.s32 %r6557, %r6556, %r6471; + xor.b32 %r6558, %r6557, %r6496; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 16; + add.s32 %r6560, %r6559, %r6483; + xor.b32 %r6561, %r6560, %r6471; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 20; + add.s32 %r6563, %r6557, %r6052; + add.s32 %r6564, %r6563, %r6562; + xor.b32 %r6565, %r6564, %r6559; + shf.l.wrap.b32 %r6566, %r6565, %r6565, 24; + add.s32 %r6567, %r6566, %r6560; + xor.b32 %r6568, %r6567, %r6562; + shf.l.wrap.b32 %r6569, %r6568, %r6568, 25; + add.s32 %r6570, %r6522, %r6094; + add.s32 %r6571, %r6570, %r6569; + xor.b32 %r6572, %r6571, %r6538; + shf.l.wrap.b32 %r6573, %r6572, %r6572, 16; + add.s32 %r6574, %r6573, %r6553; + xor.b32 %r6575, %r6574, %r6569; + shf.l.wrap.b32 %r6576, %r6575, %r6575, 20; + add.s32 %r6577, %r6571, %r6101; + add.s32 %r6578, %r6577, %r6576; + xor.b32 %r6579, %r6578, %r6573; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6574; + xor.b32 %r6582, %r6581, %r6576; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6536, %r6073; + add.s32 %r6585, %r6584, %r6527; + xor.b32 %r6586, %r6585, %r6552; + shf.l.wrap.b32 %r6587, %r6586, %r6586, 16; + add.s32 %r6588, %r6587, %r6567; + xor.b32 %r6589, %r6588, %r6527; + shf.l.wrap.b32 %r6590, %r6589, %r6589, 20; + add.s32 %r6591, %r6585, %r6087; + add.s32 %r6592, %r6591, %r6590; + xor.b32 %r6593, %r6592, %r6587; + shf.l.wrap.b32 %r6594, %r6593, %r6593, 24; + add.s32 %r6595, %r6594, %r6588; + xor.b32 %r6596, %r6595, %r6590; + shf.l.wrap.b32 %r6597, %r6596, %r6596, 25; + add.s32 %r6598, %r6550, %r6115; + add.s32 %r6599, %r6598, %r6541; + xor.b32 %r6600, %r6599, %r6566; + shf.l.wrap.b32 %r6601, %r6600, %r6600, 16; + add.s32 %r6602, %r6601, %r6525; + xor.b32 %r6603, %r6602, %r6541; + shf.l.wrap.b32 %r6604, %r6603, %r6603, 20; + add.s32 %r6605, %r6599, %r6080; + add.s32 %r6606, %r6605, %r6604; + xor.b32 %r6607, %r6606, %r6601; + shf.l.wrap.b32 %r6608, %r6607, %r6607, 24; + add.s32 %r6609, %r6608, %r6602; + xor.b32 %r6610, %r6609, %r6604; + shf.l.wrap.b32 %r6611, %r6610, %r6610, 25; + add.s32 %r6612, %r6564, %r6108; + add.s32 %r6613, %r6612, 
%r6555; + xor.b32 %r6614, %r6613, %r6524; + shf.l.wrap.b32 %r6615, %r6614, %r6614, 16; + add.s32 %r6616, %r6615, %r6539; + xor.b32 %r6617, %r6616, %r6555; + shf.l.wrap.b32 %r6618, %r6617, %r6617, 20; + add.s32 %r6619, %r6613, %r6066; + add.s32 %r6620, %r6619, %r6618; + xor.b32 %r6621, %r6620, %r6615; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6616; + xor.b32 %r6624, %r6623, %r6618; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6578, %r6059; + add.s32 %r6627, %r6626, %r6597; + xor.b32 %r6628, %r6627, %r6622; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6609; + xor.b32 %r6631, %r6630, %r6597; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6024; + add.s32 %r6634, %r6633, %r6632; + xor.b32 %r6635, %r6634, %r6629; + shf.l.wrap.b32 %r6636, %r6635, %r6635, 24; + add.s32 %r6637, %r6636, %r6630; + xor.b32 %r6638, %r6637, %r6632; + shf.l.wrap.b32 %r6639, %r6638, %r6638, 25; + add.s32 %r6640, %r6592, %r6045; + add.s32 %r6641, %r6640, %r6611; + xor.b32 %r6642, %r6641, %r6580; + shf.l.wrap.b32 %r6643, %r6642, %r6642, 16; + add.s32 %r6644, %r6643, %r6623; + xor.b32 %r6645, %r6644, %r6611; + shf.l.wrap.b32 %r6646, %r6645, %r6645, 20; + add.s32 %r6647, %r6641, %r6031; + add.s32 %r6648, %r6647, %r6646; + xor.b32 %r6649, %r6648, %r6643; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 24; + add.s32 %r6651, %r6650, %r6644; + xor.b32 %r6652, %r6651, %r6646; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 25; + add.s32 %r6654, %r6606, %r6010; + add.s32 %r6655, %r6654, %r6625; + xor.b32 %r6656, %r6655, %r6594; + shf.l.wrap.b32 %r6657, %r6656, %r6656, 16; + add.s32 %r6658, %r6657, %r6581; + xor.b32 %r6659, %r6658, %r6625; + shf.l.wrap.b32 %r6660, %r6659, %r6659, 20; + add.s32 %r6661, %r6655, %r6017; + add.s32 %r6662, %r6661, %r6660; + xor.b32 %r6663, %r6662, %r6657; + shf.l.wrap.b32 %r6664, %r6663, %r6663, 24; + add.s32 %r6665, %r6664, %r6658; + xor.b32 %r6666, %r6665, %r6660; + shf.l.wrap.b32 %r6667, %r6666, %r6666, 25; + add.s32 %r6668, %r6620, %r6052; + add.s32 %r6669, %r6668, %r6583; + xor.b32 %r6670, %r6669, %r6608; + shf.l.wrap.b32 %r6671, %r6670, %r6670, 16; + add.s32 %r6672, %r6671, %r6595; + xor.b32 %r6673, %r6672, %r6583; + shf.l.wrap.b32 %r6674, %r6673, %r6673, 20; + add.s32 %r6675, %r6669, %r6038; + add.s32 %r6676, %r6675, %r6674; + xor.b32 %r6677, %r6676, %r6671; + shf.l.wrap.b32 %r6678, %r6677, %r6677, 24; + add.s32 %r6679, %r6678, %r6672; + xor.b32 %r6680, %r6679, %r6674; + shf.l.wrap.b32 %r6681, %r6680, %r6680, 25; + add.s32 %r6682, %r6634, %r6073; + add.s32 %r6683, %r6682, %r6681; + xor.b32 %r6684, %r6683, %r6650; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 16; + add.s32 %r6686, %r6685, %r6665; + xor.b32 %r6687, %r6686, %r6681; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 20; + add.s32 %r6689, %r6683, %r6108; + add.s32 %r6690, %r6689, %r6688; + xor.b32 %r6691, %r6690, %r6685; + shf.l.wrap.b32 %r6692, %r6691, %r6691, 24; + add.s32 %r6693, %r6692, %r6686; + xor.b32 %r6694, %r6693, %r6688; + shf.l.wrap.b32 %r6695, %r6694, %r6694, 25; + add.s32 %r6696, %r6648, %r6087; + add.s32 %r6697, %r6696, %r6639; + xor.b32 %r6698, %r6697, %r6664; + shf.l.wrap.b32 %r6699, %r6698, %r6698, 16; + add.s32 %r6700, %r6699, %r6679; + xor.b32 %r6701, %r6700, %r6639; + shf.l.wrap.b32 %r6702, %r6701, %r6701, 20; + add.s32 %r6703, %r6697, %r6045; + add.s32 %r6704, %r6703, %r6702; + xor.b32 %r6705, %r6704, %r6699; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6700; + xor.b32 %r6708, %r6707, %r6702; + shf.l.wrap.b32 
%r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6662, %r6066; + add.s32 %r6711, %r6710, %r6653; + xor.b32 %r6712, %r6711, %r6678; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6637; + xor.b32 %r6715, %r6714, %r6653; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6094; + add.s32 %r6718, %r6717, %r6716; + xor.b32 %r6719, %r6718, %r6713; + shf.l.wrap.b32 %r6720, %r6719, %r6719, 24; + add.s32 %r6721, %r6720, %r6714; + xor.b32 %r6722, %r6721, %r6716; + shf.l.wrap.b32 %r6723, %r6722, %r6722, 25; + add.s32 %r6724, %r6676, %r6115; + add.s32 %r6725, %r6724, %r6667; + xor.b32 %r6726, %r6725, %r6636; + shf.l.wrap.b32 %r6727, %r6726, %r6726, 16; + add.s32 %r6728, %r6727, %r6651; + xor.b32 %r6729, %r6728, %r6667; + shf.l.wrap.b32 %r6730, %r6729, %r6729, 20; + add.s32 %r6731, %r6725, %r6017; + add.s32 %r6732, %r6731, %r6730; + xor.b32 %r6733, %r6732, %r6727; + shf.l.wrap.b32 %r6734, %r6733, %r6733, 24; + add.s32 %r6735, %r6734, %r6728; + xor.b32 %r6736, %r6735, %r6730; + shf.l.wrap.b32 %r6737, %r6736, %r6736, 25; + add.s32 %r6738, %r6690, %r6101; + add.s32 %r6739, %r6738, %r6709; + xor.b32 %r6740, %r6739, %r6734; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6721; + xor.b32 %r6743, %r6742, %r6709; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6031; + add.s32 %r6746, %r6745, %r6744; + xor.b32 %r6747, %r6746, %r6741; + shf.l.wrap.b32 %r6748, %r6747, %r6747, 24; + add.s32 %r6749, %r6748, %r6742; + xor.b32 %r6750, %r6749, %r6744; + shf.l.wrap.b32 %r6751, %r6750, %r6750, 25; + add.s32 %r6752, %r6704, %r6010; + add.s32 %r6753, %r6752, %r6723; + xor.b32 %r6754, %r6753, %r6692; + shf.l.wrap.b32 %r6755, %r6754, %r6754, 16; + add.s32 %r6756, %r6755, %r6735; + xor.b32 %r6757, %r6756, %r6723; + shf.l.wrap.b32 %r6758, %r6757, %r6757, 20; + add.s32 %r6759, %r6753, %r6080; + add.s32 %r6760, %r6759, %r6758; + xor.b32 %r6761, %r6760, %r6755; + shf.l.wrap.b32 %r6762, %r6761, %r6761, 24; + add.s32 %r6763, %r6762, %r6756; + xor.b32 %r6764, %r6763, %r6758; + shf.l.wrap.b32 %r6765, %r6764, %r6764, 25; + add.s32 %r6766, %r6718, %r6024; + add.s32 %r6767, %r6766, %r6737; + xor.b32 %r6768, %r6767, %r6706; + shf.l.wrap.b32 %r6769, %r6768, %r6768, 16; + add.s32 %r6770, %r6769, %r6693; + xor.b32 %r6771, %r6770, %r6737; + shf.l.wrap.b32 %r6772, %r6771, %r6771, 20; + add.s32 %r6773, %r6767, %r6052; + add.s32 %r6774, %r6773, %r6772; + xor.b32 %r6775, %r6774, %r6769; + shf.l.wrap.b32 %r6776, %r6775, %r6775, 24; + add.s32 %r6777, %r6776, %r6770; + xor.b32 %r6778, %r6777, %r6772; + shf.l.wrap.b32 %r6779, %r6778, %r6778, 25; + add.s32 %r6780, %r6732, %r6038; + add.s32 %r6781, %r6780, %r6695; + xor.b32 %r6782, %r6781, %r6720; + shf.l.wrap.b32 %r6783, %r6782, %r6782, 16; + add.s32 %r6784, %r6783, %r6707; + xor.b32 %r6785, %r6784, %r6695; + shf.l.wrap.b32 %r6786, %r6785, %r6785, 20; + add.s32 %r6787, %r6781, %r6059; + add.s32 %r6788, %r6787, %r6786; + xor.b32 %r6789, %r6788, %r6783; + shf.l.wrap.b32 %r6790, %r6789, %r6789, 24; + add.s32 %r6791, %r6790, %r6784; + xor.b32 %r6792, %r6791, %r6786; + shf.l.wrap.b32 %r6793, %r6792, %r6792, 25; + add.s32 %r6794, %r6746, %r6087; + add.s32 %r6795, %r6794, %r6793; + xor.b32 %r6796, %r6795, %r6762; + shf.l.wrap.b32 %r6797, %r6796, %r6796, 16; + add.s32 %r6798, %r6797, %r6777; + xor.b32 %r6799, %r6798, %r6793; + shf.l.wrap.b32 %r6800, %r6799, %r6799, 20; + add.s32 %r6801, %r6795, %r6115; + add.s32 %r6802, %r6801, %r6800; + xor.b32 %r6803, %r6802, %r6797; + shf.l.wrap.b32 %r6804, %r6803, %r6803, 24; 
+ add.s32 %r6805, %r6804, %r6798; + xor.b32 %r6806, %r6805, %r6800; + shf.l.wrap.b32 %r6807, %r6806, %r6806, 25; + add.s32 %r6808, %r6760, %r6045; + add.s32 %r6809, %r6808, %r6751; + xor.b32 %r6810, %r6809, %r6776; + shf.l.wrap.b32 %r6811, %r6810, %r6810, 16; + add.s32 %r6812, %r6811, %r6791; + xor.b32 %r6813, %r6812, %r6751; + shf.l.wrap.b32 %r6814, %r6813, %r6813, 20; + add.s32 %r6815, %r6809, %r6010; + add.s32 %r6816, %r6815, %r6814; + xor.b32 %r6817, %r6816, %r6811; + shf.l.wrap.b32 %r6818, %r6817, %r6817, 24; + add.s32 %r6819, %r6818, %r6812; + xor.b32 %r6820, %r6819, %r6814; + shf.l.wrap.b32 %r6821, %r6820, %r6820, 25; + add.s32 %r6822, %r6774, %r6017; + add.s32 %r6823, %r6822, %r6765; + xor.b32 %r6824, %r6823, %r6790; + shf.l.wrap.b32 %r6825, %r6824, %r6824, 16; + add.s32 %r6826, %r6825, %r6749; + xor.b32 %r6827, %r6826, %r6765; + shf.l.wrap.b32 %r6828, %r6827, %r6827, 20; + add.s32 %r6829, %r6823, %r6073; + add.s32 %r6830, %r6829, %r6828; + xor.b32 %r6831, %r6830, %r6825; + shf.l.wrap.b32 %r6832, %r6831, %r6831, 24; + add.s32 %r6833, %r6832, %r6826; + xor.b32 %r6834, %r6833, %r6828; + shf.l.wrap.b32 %r6835, %r6834, %r6834, 25; + add.s32 %r6836, %r6788, %r6066; + add.s32 %r6837, %r6836, %r6779; + xor.b32 %r6838, %r6837, %r6748; + shf.l.wrap.b32 %r6839, %r6838, %r6838, 16; + add.s32 %r6840, %r6839, %r6763; + xor.b32 %r6841, %r6840, %r6779; + shf.l.wrap.b32 %r6842, %r6841, %r6841, 20; + add.s32 %r6843, %r6837, %r6052; + add.s32 %r6844, %r6843, %r6842; + xor.b32 %r6845, %r6844, %r6839; + shf.l.wrap.b32 %r6846, %r6845, %r6845, 24; + add.s32 %r6847, %r6846, %r6840; + xor.b32 %r6848, %r6847, %r6842; + shf.l.wrap.b32 %r6849, %r6848, %r6848, 25; + add.s32 %r6850, %r6802, %r6108; + add.s32 %r6851, %r6850, %r6821; + xor.b32 %r6852, %r6851, %r6846; + shf.l.wrap.b32 %r6853, %r6852, %r6852, 16; + add.s32 %r6854, %r6853, %r6833; + xor.b32 %r6855, %r6854, %r6821; + shf.l.wrap.b32 %r6856, %r6855, %r6855, 20; + add.s32 %r6857, %r6851, %r6080; + add.s32 %r6858, %r6857, %r6856; + xor.b32 %r6859, %r6858, %r6853; + shf.l.wrap.b32 %r6860, %r6859, %r6859, 24; + add.s32 %r6861, %r6860, %r6854; + xor.b32 %r6862, %r6861, %r6856; + shf.l.wrap.b32 %r6863, %r6862, %r6862, 25; + add.s32 %r6864, %r6816, %r6024; + add.s32 %r6865, %r6864, %r6835; + xor.b32 %r6866, %r6865, %r6804; + shf.l.wrap.b32 %r6867, %r6866, %r6866, 16; + add.s32 %r6868, %r6867, %r6847; + xor.b32 %r6869, %r6868, %r6835; + shf.l.wrap.b32 %r6870, %r6869, %r6869, 20; + add.s32 %r6871, %r6865, %r6094; + add.s32 %r6872, %r6871, %r6870; + xor.b32 %r6873, %r6872, %r6867; + shf.l.wrap.b32 %r6874, %r6873, %r6873, 24; + add.s32 %r6875, %r6874, %r6868; + xor.b32 %r6876, %r6875, %r6870; + shf.l.wrap.b32 %r6877, %r6876, %r6876, 25; + add.s32 %r6878, %r6830, %r6031; + add.s32 %r6879, %r6878, %r6849; + xor.b32 %r6880, %r6879, %r6818; + shf.l.wrap.b32 %r6881, %r6880, %r6880, 16; + add.s32 %r6882, %r6881, %r6805; + xor.b32 %r6883, %r6882, %r6849; + shf.l.wrap.b32 %r6884, %r6883, %r6883, 20; + add.s32 %r6885, %r6879, %r6038; + add.s32 %r6886, %r6885, %r6884; + xor.b32 %r6887, %r6886, %r6881; + shf.l.wrap.b32 %r6888, %r6887, %r6887, 24; + add.s32 %r6889, %r6888, %r6882; + xor.b32 %r6890, %r6889, %r6884; + shf.l.wrap.b32 %r6891, %r6890, %r6890, 25; + add.s32 %r6892, %r6844, %r6059; + add.s32 %r6893, %r6892, %r6807; + xor.b32 %r6894, %r6893, %r6832; + shf.l.wrap.b32 %r6895, %r6894, %r6894, 16; + add.s32 %r6896, %r6895, %r6819; + xor.b32 %r6897, %r6896, %r6807; + shf.l.wrap.b32 %r6898, %r6897, %r6897, 20; + add.s32 %r6899, %r6893, %r6101; + add.s32 %r6900, %r6899, 
%r6898; + xor.b32 %r6901, %r6900, %r6895; + shf.l.wrap.b32 %r6902, %r6901, %r6901, 24; + add.s32 %r6903, %r6902, %r6896; + xor.b32 %r6904, %r6903, %r6898; + shf.l.wrap.b32 %r6905, %r6904, %r6904, 25; + xor.b32 %r11679, %r6889, %r6858; + xor.b32 %r11678, %r6903, %r6872; + xor.b32 %r11677, %r6861, %r6886; + xor.b32 %r11676, %r6900, %r6875; + xor.b32 %r11675, %r6905, %r6874; + xor.b32 %r11674, %r6863, %r6888; + xor.b32 %r11673, %r6902, %r6877; + xor.b32 %r11672, %r6891, %r6860; + add.s16 %rs353, %rs353, 1; + st.local.u8 [%rd54+1], %rs353; + add.s64 %rd258, %rd258, 64; + add.s64 %rd259, %rd259, -64; + setp.gt.u64 %p33, %rd259, 64; + @%p33 bra $L__BB1_38; + +$L__BB1_39: + min.u64 %rd61, %rd259, 64; + setp.eq.s64 %p34, %rd61, 0; + mov.u16 %rs355, %rs354; + mov.u16 %rs356, %rs354; + mov.u16 %rs357, %rs354; + mov.u16 %rs358, %rs354; + mov.u16 %rs359, %rs354; + mov.u16 %rs360, %rs354; + mov.u16 %rs361, %rs354; + mov.u16 %rs362, %rs354; + mov.u16 %rs363, %rs354; + mov.u16 %rs364, %rs354; + mov.u16 %rs365, %rs354; + mov.u16 %rs366, %rs354; + mov.u16 %rs367, %rs354; + mov.u16 %rs368, %rs354; + mov.u16 %rs369, %rs354; + mov.u16 %rs370, %rs354; + mov.u16 %rs371, %rs354; + mov.u16 %rs372, %rs354; + mov.u16 %rs373, %rs354; + mov.u16 %rs374, %rs354; + mov.u16 %rs375, %rs354; + mov.u16 %rs376, %rs354; + mov.u16 %rs377, %rs354; + mov.u16 %rs378, %rs354; + mov.u16 %rs379, %rs354; + mov.u16 %rs380, %rs354; + mov.u16 %rs381, %rs354; + mov.u16 %rs382, %rs354; + mov.u16 %rs383, %rs354; + mov.u16 %rs384, %rs354; + mov.u16 %rs385, %rs354; + mov.u16 %rs386, %rs354; + mov.u16 %rs387, %rs354; + @%p34 bra $L__BB1_43; + + mov.u64 %rd260, 0; + +$L__BB1_41: + add.s64 %rd187, %rd258, %rd260; + ld.local.u8 %rs251, [%rd187]; + add.s64 %rd188, %rd53, %rd260; + st.local.u8 [%rd188], %rs251; + add.s64 %rd260, %rd260, 1; + setp.lt.u64 %p35, %rd260, %rd61; + @%p35 bra $L__BB1_41; + + ld.local.v4.u16 {%rs384, %rs385, %rs386, %rs387}, [%rd53]; + ld.local.v4.u16 {%rs380, %rs381, %rs382, %rs383}, [%rd53+8]; + ld.local.v4.u16 {%rs376, %rs377, %rs378, %rs379}, [%rd53+16]; + ld.local.v4.u16 {%rs372, %rs373, %rs374, %rs375}, [%rd53+24]; + ld.local.v4.u16 {%rs368, %rs369, %rs370, %rs371}, [%rd53+32]; + ld.local.v4.u16 {%rs364, %rs365, %rs366, %rs367}, [%rd53+40]; + ld.local.v4.u16 {%rs360, %rs361, %rs362, %rs363}, [%rd53+48]; + ld.local.v4.u16 {%rs357, %rs358, %rs359, %rs283}, [%rd53+56]; + ld.local.u8 %rs356, [%rd53+61]; + ld.local.v2.u8 {%rs354, %rs355}, [%rd53+62]; + +$L__BB1_43: + ld.local.v4.u8 {%rs286, %rs287, %rs288, %rs289}, [%rd53+64]; + cvt.u16.u64 %rs292, %rd61; + add.s16 %rs293, %rs286, %rs292; + st.local.u8 [%rd53+64], %rs293; + setp.eq.s16 %p36, %rs287, 0; + selp.u16 %rs294, 1, 0, %p36; + or.b16 %rs295, %rs288, %rs294; + or.b16 %rs296, %rs295, 2; + shr.u16 %rs297, %rs384, 8; + shr.u16 %rs298, %rs385, 8; + shr.u16 %rs299, %rs386, 8; + shr.u16 %rs300, %rs387, 8; + shr.u16 %rs301, %rs380, 8; + shr.u16 %rs302, %rs381, 8; + shr.u16 %rs303, %rs382, 8; + shr.u16 %rs304, %rs383, 8; + shr.u16 %rs305, %rs376, 8; + shr.u16 %rs306, %rs377, 8; + shr.u16 %rs307, %rs378, 8; + shr.u16 %rs308, %rs379, 8; + shr.u16 %rs309, %rs372, 8; + shr.u16 %rs310, %rs373, 8; + shr.u16 %rs311, %rs374, 8; + shr.u16 %rs312, %rs375, 8; + shr.u16 %rs313, %rs368, 8; + shr.u16 %rs314, %rs369, 8; + shr.u16 %rs315, %rs370, 8; + shr.u16 %rs316, %rs371, 8; + shr.u16 %rs317, %rs364, 8; + shr.u16 %rs318, %rs365, 8; + shr.u16 %rs319, %rs366, 8; + shr.u16 %rs320, %rs367, 8; + shr.u16 %rs321, %rs360, 8; + shr.u16 %rs322, %rs361, 8; + shr.u16 %rs323, %rs362, 8; + shr.u16 
%rs324, %rs363, 8; + shr.u16 %rs325, %rs357, 8; + shr.u16 %rs326, %rs358, 8; + cvt.u32.u16 %r6906, %rs384; + and.b32 %r6907, %r6906, 255; + cvt.u32.u16 %r6908, %rs297; + prmt.b32 %r6909, %r6908, %r6907, 30212; + cvt.u32.u16 %r6910, %rs385; + prmt.b32 %r6911, %r6910, %r6909, 28756; + cvt.u32.u16 %r6912, %rs298; + prmt.b32 %r6913, %r6912, %r6911, 1620; + cvt.u32.u16 %r6914, %rs386; + and.b32 %r6915, %r6914, 255; + cvt.u32.u16 %r6916, %rs299; + prmt.b32 %r6917, %r6916, %r6915, 30212; + cvt.u32.u16 %r6918, %rs387; + prmt.b32 %r6919, %r6918, %r6917, 28756; + cvt.u32.u16 %r6920, %rs300; + prmt.b32 %r6921, %r6920, %r6919, 1620; + cvt.u32.u16 %r6922, %rs380; + and.b32 %r6923, %r6922, 255; + cvt.u32.u16 %r6924, %rs301; + prmt.b32 %r6925, %r6924, %r6923, 30212; + cvt.u32.u16 %r6926, %rs381; + prmt.b32 %r6927, %r6926, %r6925, 28756; + cvt.u32.u16 %r6928, %rs302; + prmt.b32 %r6929, %r6928, %r6927, 1620; + cvt.u32.u16 %r6930, %rs382; + and.b32 %r6931, %r6930, 255; + cvt.u32.u16 %r6932, %rs303; + prmt.b32 %r6933, %r6932, %r6931, 30212; + cvt.u32.u16 %r6934, %rs383; + prmt.b32 %r6935, %r6934, %r6933, 28756; + cvt.u32.u16 %r6936, %rs304; + prmt.b32 %r6937, %r6936, %r6935, 1620; + cvt.u32.u16 %r6938, %rs376; + and.b32 %r6939, %r6938, 255; + cvt.u32.u16 %r6940, %rs305; + prmt.b32 %r6941, %r6940, %r6939, 30212; + cvt.u32.u16 %r6942, %rs377; + prmt.b32 %r6943, %r6942, %r6941, 28756; + cvt.u32.u16 %r6944, %rs306; + prmt.b32 %r6945, %r6944, %r6943, 1620; + cvt.u32.u16 %r6946, %rs378; + and.b32 %r6947, %r6946, 255; + cvt.u32.u16 %r6948, %rs307; + prmt.b32 %r6949, %r6948, %r6947, 30212; + cvt.u32.u16 %r6950, %rs379; + prmt.b32 %r6951, %r6950, %r6949, 28756; + cvt.u32.u16 %r6952, %rs308; + prmt.b32 %r6953, %r6952, %r6951, 1620; + cvt.u32.u16 %r6954, %rs372; + and.b32 %r6955, %r6954, 255; + cvt.u32.u16 %r6956, %rs309; + prmt.b32 %r6957, %r6956, %r6955, 30212; + cvt.u32.u16 %r6958, %rs373; + prmt.b32 %r6959, %r6958, %r6957, 28756; + cvt.u32.u16 %r6960, %rs310; + prmt.b32 %r6961, %r6960, %r6959, 1620; + cvt.u32.u16 %r6962, %rs374; + and.b32 %r6963, %r6962, 255; + cvt.u32.u16 %r6964, %rs311; + prmt.b32 %r6965, %r6964, %r6963, 30212; + cvt.u32.u16 %r6966, %rs375; + prmt.b32 %r6967, %r6966, %r6965, 28756; + cvt.u32.u16 %r6968, %rs312; + prmt.b32 %r6969, %r6968, %r6967, 1620; + cvt.u32.u16 %r6970, %rs368; + and.b32 %r6971, %r6970, 255; + cvt.u32.u16 %r6972, %rs313; + prmt.b32 %r6973, %r6972, %r6971, 30212; + cvt.u32.u16 %r6974, %rs369; + prmt.b32 %r6975, %r6974, %r6973, 28756; + cvt.u32.u16 %r6976, %rs314; + prmt.b32 %r6977, %r6976, %r6975, 1620; + cvt.u32.u16 %r6978, %rs370; + and.b32 %r6979, %r6978, 255; + cvt.u32.u16 %r6980, %rs315; + prmt.b32 %r6981, %r6980, %r6979, 30212; + cvt.u32.u16 %r6982, %rs371; + prmt.b32 %r6983, %r6982, %r6981, 28756; + cvt.u32.u16 %r6984, %rs316; + prmt.b32 %r6985, %r6984, %r6983, 1620; + cvt.u32.u16 %r6986, %rs364; + and.b32 %r6987, %r6986, 255; + cvt.u32.u16 %r6988, %rs317; + prmt.b32 %r6989, %r6988, %r6987, 30212; + cvt.u32.u16 %r6990, %rs365; + prmt.b32 %r6991, %r6990, %r6989, 28756; + cvt.u32.u16 %r6992, %rs318; + prmt.b32 %r6993, %r6992, %r6991, 1620; + cvt.u32.u16 %r6994, %rs366; + and.b32 %r6995, %r6994, 255; + cvt.u32.u16 %r6996, %rs319; + prmt.b32 %r6997, %r6996, %r6995, 30212; + cvt.u32.u16 %r6998, %rs367; + prmt.b32 %r6999, %r6998, %r6997, 28756; + cvt.u32.u16 %r7000, %rs320; + prmt.b32 %r7001, %r7000, %r6999, 1620; + cvt.u32.u16 %r7002, %rs360; + and.b32 %r7003, %r7002, 255; + cvt.u32.u16 %r7004, %rs321; + prmt.b32 %r7005, %r7004, %r7003, 30212; + cvt.u32.u16 %r7006, %rs361; + 
prmt.b32 %r7007, %r7006, %r7005, 28756; + cvt.u32.u16 %r7008, %rs322; + prmt.b32 %r7009, %r7008, %r7007, 1620; + cvt.u32.u16 %r7010, %rs362; + and.b32 %r7011, %r7010, 255; + cvt.u32.u16 %r7012, %rs323; + prmt.b32 %r7013, %r7012, %r7011, 30212; + cvt.u32.u16 %r7014, %rs363; + prmt.b32 %r7015, %r7014, %r7013, 28756; + cvt.u32.u16 %r7016, %rs324; + prmt.b32 %r7017, %r7016, %r7015, 1620; + cvt.u32.u16 %r7018, %rs357; + and.b32 %r7019, %r7018, 255; + cvt.u32.u16 %r7020, %rs325; + prmt.b32 %r7021, %r7020, %r7019, 30212; + cvt.u32.u16 %r7022, %rs358; + prmt.b32 %r7023, %r7022, %r7021, 28756; + cvt.u32.u16 %r7024, %rs326; + prmt.b32 %r7025, %r7024, %r7023, 1620; + cvt.u32.u16 %r7026, %rs359; + and.b32 %r7027, %r7026, 255; + cvt.u32.u16 %r7028, %rs356; + prmt.b32 %r7029, %r7028, %r7027, 30212; + cvt.u32.u16 %r7030, %rs354; + shl.b32 %r7031, %r7030, 16; + and.b32 %r7032, %r7031, 16711680; + or.b32 %r7033, %r7029, %r7032; + cvt.u32.u16 %r7034, %rs355; + shl.b32 %r7035, %r7034, 24; + or.b32 %r7036, %r7033, %r7035; + cvt.u32.u16 %r7037, %rs293; + and.b32 %r7038, %r7037, 255; + cvt.u32.u16 %r7039, %rs296; + and.b32 %r7040, %r7039, 255; + add.s32 %r7041, %r11675, %r11679; + add.s32 %r7042, %r7041, %r6913; + xor.b32 %r7043, %r7042, %r71; + shf.l.wrap.b32 %r7044, %r7043, %r7043, 16; + add.s32 %r7045, %r7044, 1779033703; + xor.b32 %r7046, %r7045, %r11675; + shf.l.wrap.b32 %r7047, %r7046, %r7046, 20; + add.s32 %r7048, %r6921, %r7042; + add.s32 %r7049, %r7048, %r7047; + xor.b32 %r7050, %r7049, %r7044; + shf.l.wrap.b32 %r7051, %r7050, %r7050, 24; + add.s32 %r7052, %r7051, %r7045; + xor.b32 %r7053, %r7052, %r7047; + shf.l.wrap.b32 %r7054, %r7053, %r7053, 25; + add.s32 %r7055, %r11674, %r11678; + add.s32 %r7056, %r7055, %r6929; + xor.b32 %r7057, %r7056, %r72; + shf.l.wrap.b32 %r7058, %r7057, %r7057, 16; + add.s32 %r7059, %r7058, -1150833019; + xor.b32 %r7060, %r7059, %r11674; + shf.l.wrap.b32 %r7061, %r7060, %r7060, 20; + add.s32 %r7062, %r6937, %r7056; + add.s32 %r7063, %r7062, %r7061; + xor.b32 %r7064, %r7063, %r7058; + shf.l.wrap.b32 %r7065, %r7064, %r7064, 24; + add.s32 %r7066, %r7065, %r7059; + xor.b32 %r7067, %r7066, %r7061; + shf.l.wrap.b32 %r7068, %r7067, %r7067, 25; + add.s32 %r7069, %r11673, %r11677; + add.s32 %r7070, %r7069, %r6945; + xor.b32 %r7071, %r7070, %r7038; + shr.u32 %r7072, %r7070, 16; + shl.b32 %r7073, %r7071, 16; + or.b32 %r7074, %r7073, %r7072; + add.s32 %r7075, %r7074, 1013904242; + xor.b32 %r7076, %r7075, %r11673; + shf.l.wrap.b32 %r7077, %r7076, %r7076, 20; + add.s32 %r7078, %r6953, %r7070; + add.s32 %r7079, %r7078, %r7077; + xor.b32 %r7080, %r7079, %r7074; + shf.l.wrap.b32 %r7081, %r7080, %r7080, 24; + add.s32 %r7082, %r7081, %r7075; + xor.b32 %r7083, %r7082, %r7077; + shf.l.wrap.b32 %r7084, %r7083, %r7083, 25; + add.s32 %r7085, %r11672, %r11676; + add.s32 %r7086, %r7085, %r6961; + xor.b32 %r7087, %r7086, %r7040; + shr.u32 %r7088, %r7086, 16; + shl.b32 %r7089, %r7087, 16; + or.b32 %r7090, %r7089, %r7088; + add.s32 %r7091, %r7090, -1521486534; + xor.b32 %r7092, %r7091, %r11672; + shf.l.wrap.b32 %r7093, %r7092, %r7092, 20; + add.s32 %r7094, %r6969, %r7086; + add.s32 %r7095, %r7094, %r7093; + xor.b32 %r7096, %r7095, %r7090; + shf.l.wrap.b32 %r7097, %r7096, %r7096, 24; + add.s32 %r7098, %r7097, %r7091; + xor.b32 %r7099, %r7098, %r7093; + shf.l.wrap.b32 %r7100, %r7099, %r7099, 25; + add.s32 %r7101, %r7068, %r7049; + add.s32 %r7102, %r7101, %r6977; + xor.b32 %r7103, %r7097, %r7102; + shf.l.wrap.b32 %r7104, %r7103, %r7103, 16; + add.s32 %r7105, %r7104, %r7082; + xor.b32 %r7106, %r7105, 
%r7068; + shf.l.wrap.b32 %r7107, %r7106, %r7106, 20; + add.s32 %r7108, %r6985, %r7102; + add.s32 %r7109, %r7108, %r7107; + xor.b32 %r7110, %r7109, %r7104; + shf.l.wrap.b32 %r7111, %r7110, %r7110, 24; + add.s32 %r7112, %r7111, %r7105; + xor.b32 %r7113, %r7112, %r7107; + shf.l.wrap.b32 %r7114, %r7113, %r7113, 25; + add.s32 %r7115, %r7084, %r7063; + add.s32 %r7116, %r7115, %r6993; + xor.b32 %r7117, %r7116, %r7051; + shf.l.wrap.b32 %r7118, %r7117, %r7117, 16; + add.s32 %r7119, %r7118, %r7098; + xor.b32 %r7120, %r7119, %r7084; + shf.l.wrap.b32 %r7121, %r7120, %r7120, 20; + add.s32 %r7122, %r7001, %r7116; + add.s32 %r7123, %r7122, %r7121; + xor.b32 %r7124, %r7123, %r7118; + shf.l.wrap.b32 %r7125, %r7124, %r7124, 24; + add.s32 %r7126, %r7125, %r7119; + xor.b32 %r7127, %r7126, %r7121; + shf.l.wrap.b32 %r7128, %r7127, %r7127, 25; + add.s32 %r7129, %r7100, %r7079; + add.s32 %r7130, %r7129, %r7009; + xor.b32 %r7131, %r7130, %r7065; + shf.l.wrap.b32 %r7132, %r7131, %r7131, 16; + add.s32 %r7133, %r7132, %r7052; + xor.b32 %r7134, %r7133, %r7100; + shf.l.wrap.b32 %r7135, %r7134, %r7134, 20; + add.s32 %r7136, %r7017, %r7130; + add.s32 %r7137, %r7136, %r7135; + xor.b32 %r7138, %r7137, %r7132; + shf.l.wrap.b32 %r7139, %r7138, %r7138, 24; + add.s32 %r7140, %r7139, %r7133; + xor.b32 %r7141, %r7140, %r7135; + shf.l.wrap.b32 %r7142, %r7141, %r7141, 25; + add.s32 %r7143, %r7095, %r7054; + add.s32 %r7144, %r7143, %r7025; + xor.b32 %r7145, %r7144, %r7081; + shf.l.wrap.b32 %r7146, %r7145, %r7145, 16; + add.s32 %r7147, %r7146, %r7066; + xor.b32 %r7148, %r7147, %r7054; + shf.l.wrap.b32 %r7149, %r7148, %r7148, 20; + add.s32 %r7150, %r7036, %r7144; + add.s32 %r7151, %r7150, %r7149; + xor.b32 %r7152, %r7151, %r7146; + shf.l.wrap.b32 %r7153, %r7152, %r7152, 24; + add.s32 %r7154, %r7153, %r7147; + xor.b32 %r7155, %r7154, %r7149; + shf.l.wrap.b32 %r7156, %r7155, %r7155, 25; + add.s32 %r7157, %r7109, %r6929; + add.s32 %r7158, %r7157, %r7156; + xor.b32 %r7159, %r7158, %r7125; + shf.l.wrap.b32 %r7160, %r7159, %r7159, 16; + add.s32 %r7161, %r7160, %r7140; + xor.b32 %r7162, %r7161, %r7156; + shf.l.wrap.b32 %r7163, %r7162, %r7162, 20; + add.s32 %r7164, %r7158, %r6961; + add.s32 %r7165, %r7164, %r7163; + xor.b32 %r7166, %r7165, %r7160; + shf.l.wrap.b32 %r7167, %r7166, %r7166, 24; + add.s32 %r7168, %r7167, %r7161; + xor.b32 %r7169, %r7168, %r7163; + shf.l.wrap.b32 %r7170, %r7169, %r7169, 25; + add.s32 %r7171, %r7123, %r6937; + add.s32 %r7172, %r7171, %r7114; + xor.b32 %r7173, %r7139, %r7172; + shf.l.wrap.b32 %r7174, %r7173, %r7173, 16; + add.s32 %r7175, %r7154, %r7174; + xor.b32 %r7176, %r7175, %r7114; + shf.l.wrap.b32 %r7177, %r7176, %r7176, 20; + add.s32 %r7178, %r7172, %r6993; + add.s32 %r7179, %r7178, %r7177; + xor.b32 %r7180, %r7179, %r7174; + shf.l.wrap.b32 %r7181, %r7180, %r7180, 24; + add.s32 %r7182, %r7181, %r7175; + xor.b32 %r7183, %r7182, %r7177; + shf.l.wrap.b32 %r7184, %r7183, %r7183, 25; + add.s32 %r7185, %r7128, %r6969; + add.s32 %r7186, %r7185, %r7137; + xor.b32 %r7187, %r7153, %r7186; + shf.l.wrap.b32 %r7188, %r7187, %r7187, 16; + add.s32 %r7189, %r7188, %r7112; + xor.b32 %r7190, %r7189, %r7128; + shf.l.wrap.b32 %r7191, %r7190, %r7190, 20; + add.s32 %r7192, %r7186, %r6913; + add.s32 %r7193, %r7192, %r7191; + xor.b32 %r7194, %r7193, %r7188; + shf.l.wrap.b32 %r7195, %r7194, %r7194, 24; + add.s32 %r7196, %r7195, %r7189; + xor.b32 %r7197, %r7196, %r7191; + shf.l.wrap.b32 %r7198, %r7197, %r7197, 25; + add.s32 %r7199, %r7142, %r6945; + add.s32 %r7200, %r7199, %r7151; + xor.b32 %r7201, %r7200, %r7111; + shf.l.wrap.b32 
%r7202, %r7201, %r7201, 16; + add.s32 %r7203, %r7202, %r7126; + xor.b32 %r7204, %r7203, %r7142; + shf.l.wrap.b32 %r7205, %r7204, %r7204, 20; + add.s32 %r7206, %r7200, %r7017; + add.s32 %r7207, %r7206, %r7205; + xor.b32 %r7208, %r7207, %r7202; + shf.l.wrap.b32 %r7209, %r7208, %r7208, 24; + add.s32 %r7210, %r7209, %r7203; + xor.b32 %r7211, %r7210, %r7205; + shf.l.wrap.b32 %r7212, %r7211, %r7211, 25; + add.s32 %r7213, %r7184, %r6921; + add.s32 %r7214, %r7213, %r7165; + xor.b32 %r7215, %r7214, %r7209; + shf.l.wrap.b32 %r7216, %r7215, %r7215, 16; + add.s32 %r7217, %r7216, %r7196; + xor.b32 %r7218, %r7217, %r7184; + shf.l.wrap.b32 %r7219, %r7218, %r7218, 20; + add.s32 %r7220, %r7214, %r7001; + add.s32 %r7221, %r7220, %r7219; + xor.b32 %r7222, %r7221, %r7216; + shf.l.wrap.b32 %r7223, %r7222, %r7222, 24; + add.s32 %r7224, %r7223, %r7217; + xor.b32 %r7225, %r7224, %r7219; + shf.l.wrap.b32 %r7226, %r7225, %r7225, 25; + add.s32 %r7227, %r7179, %r7009; + add.s32 %r7228, %r7227, %r7198; + xor.b32 %r7229, %r7167, %r7228; + shf.l.wrap.b32 %r7230, %r7229, %r7229, 16; + add.s32 %r7231, %r7230, %r7210; + xor.b32 %r7232, %r7231, %r7198; + shf.l.wrap.b32 %r7233, %r7232, %r7232, 20; + add.s32 %r7234, %r7228, %r6953; + add.s32 %r7235, %r7234, %r7233; + xor.b32 %r7236, %r7235, %r7230; + shf.l.wrap.b32 %r7237, %r7236, %r7236, 24; + add.s32 %r7238, %r7237, %r7231; + xor.b32 %r7239, %r7238, %r7233; + shf.l.wrap.b32 %r7240, %r7239, %r7239, 25; + add.s32 %r7241, %r7193, %r6985; + add.s32 %r7242, %r7241, %r7212; + xor.b32 %r7243, %r7242, %r7181; + shf.l.wrap.b32 %r7244, %r7243, %r7243, 16; + add.s32 %r7245, %r7244, %r7168; + xor.b32 %r7246, %r7245, %r7212; + shf.l.wrap.b32 %r7247, %r7246, %r7246, 20; + add.s32 %r7248, %r7242, %r7025; + add.s32 %r7249, %r7248, %r7247; + xor.b32 %r7250, %r7249, %r7244; + shf.l.wrap.b32 %r7251, %r7250, %r7250, 24; + add.s32 %r7252, %r7251, %r7245; + xor.b32 %r7253, %r7252, %r7247; + shf.l.wrap.b32 %r7254, %r7253, %r7253, 25; + add.s32 %r7255, %r7207, %r7036; + add.s32 %r7256, %r7255, %r7170; + xor.b32 %r7257, %r7256, %r7195; + shf.l.wrap.b32 %r7258, %r7257, %r7257, 16; + add.s32 %r7259, %r7258, %r7182; + xor.b32 %r7260, %r7259, %r7170; + shf.l.wrap.b32 %r7261, %r7260, %r7260, 20; + add.s32 %r7262, %r7256, %r6977; + add.s32 %r7263, %r7262, %r7261; + xor.b32 %r7264, %r7263, %r7258; + shf.l.wrap.b32 %r7265, %r7264, %r7264, 24; + add.s32 %r7266, %r7265, %r7259; + xor.b32 %r7267, %r7266, %r7261; + shf.l.wrap.b32 %r7268, %r7267, %r7267, 25; + add.s32 %r7269, %r7221, %r6937; + add.s32 %r7270, %r7269, %r7268; + xor.b32 %r7271, %r7270, %r7237; + shf.l.wrap.b32 %r7272, %r7271, %r7271, 16; + add.s32 %r7273, %r7272, %r7252; + xor.b32 %r7274, %r7273, %r7268; + shf.l.wrap.b32 %r7275, %r7274, %r7274, 20; + add.s32 %r7276, %r7270, %r6945; + add.s32 %r7277, %r7276, %r7275; + xor.b32 %r7278, %r7277, %r7272; + shf.l.wrap.b32 %r7279, %r7278, %r7278, 24; + add.s32 %r7280, %r7279, %r7273; + xor.b32 %r7281, %r7280, %r7275; + shf.l.wrap.b32 %r7282, %r7281, %r7281, 25; + add.s32 %r7283, %r7235, %r6993; + add.s32 %r7284, %r7283, %r7226; + xor.b32 %r7285, %r7284, %r7251; + shf.l.wrap.b32 %r7286, %r7285, %r7285, 16; + add.s32 %r7287, %r7286, %r7266; + xor.b32 %r7288, %r7287, %r7226; + shf.l.wrap.b32 %r7289, %r7288, %r7288, 20; + add.s32 %r7290, %r7284, %r7009; + add.s32 %r7291, %r7290, %r7289; + xor.b32 %r7292, %r7291, %r7286; + shf.l.wrap.b32 %r7293, %r7292, %r7292, 24; + add.s32 %r7294, %r7293, %r7287; + xor.b32 %r7295, %r7294, %r7289; + shf.l.wrap.b32 %r7296, %r7295, %r7295, 25; + add.s32 %r7297, %r7249, %r7017; 
+ add.s32 %r7298, %r7297, %r7240; + xor.b32 %r7299, %r7265, %r7298; + shf.l.wrap.b32 %r7300, %r7299, %r7299, 16; + add.s32 %r7301, %r7300, %r7224; + xor.b32 %r7302, %r7301, %r7240; + shf.l.wrap.b32 %r7303, %r7302, %r7302, 20; + add.s32 %r7304, %r7298, %r6929; + add.s32 %r7305, %r7304, %r7303; + xor.b32 %r7306, %r7305, %r7300; + shf.l.wrap.b32 %r7307, %r7306, %r7306, 24; + add.s32 %r7308, %r7307, %r7301; + xor.b32 %r7309, %r7308, %r7303; + shf.l.wrap.b32 %r7310, %r7309, %r7309, 25; + add.s32 %r7311, %r7254, %r6969; + add.s32 %r7312, %r7311, %r7263; + xor.b32 %r7313, %r7312, %r7223; + shf.l.wrap.b32 %r7314, %r7313, %r7313, 16; + add.s32 %r7315, %r7314, %r7238; + xor.b32 %r7316, %r7315, %r7254; + shf.l.wrap.b32 %r7317, %r7316, %r7316, 20; + add.s32 %r7318, %r7312, %r7025; + add.s32 %r7319, %r7318, %r7317; + xor.b32 %r7320, %r7319, %r7314; + shf.l.wrap.b32 %r7321, %r7320, %r7320, 24; + add.s32 %r7322, %r7321, %r7315; + xor.b32 %r7323, %r7322, %r7317; + shf.l.wrap.b32 %r7324, %r7323, %r7323, 25; + add.s32 %r7325, %r7277, %r6961; + add.s32 %r7326, %r7325, %r7296; + xor.b32 %r7327, %r7326, %r7321; + shf.l.wrap.b32 %r7328, %r7327, %r7327, 16; + add.s32 %r7329, %r7328, %r7308; + xor.b32 %r7330, %r7329, %r7296; + shf.l.wrap.b32 %r7331, %r7330, %r7330, 20; + add.s32 %r7332, %r7326, %r6953; + add.s32 %r7333, %r7332, %r7331; + xor.b32 %r7334, %r7333, %r7328; + shf.l.wrap.b32 %r7335, %r7334, %r7334, 24; + add.s32 %r7336, %r7335, %r7329; + xor.b32 %r7337, %r7336, %r7331; + shf.l.wrap.b32 %r7338, %r7337, %r7337, 25; + add.s32 %r7339, %r7291, %r6985; + add.s32 %r7340, %r7339, %r7310; + xor.b32 %r7341, %r7279, %r7340; + shf.l.wrap.b32 %r7342, %r7341, %r7341, 16; + add.s32 %r7343, %r7342, %r7322; + xor.b32 %r7344, %r7343, %r7310; + shf.l.wrap.b32 %r7345, %r7344, %r7344, 20; + add.s32 %r7346, %r7340, %r6913; + add.s32 %r7347, %r7346, %r7345; + xor.b32 %r7348, %r7347, %r7342; + shf.l.wrap.b32 %r7349, %r7348, %r7348, 24; + add.s32 %r7350, %r7349, %r7343; + xor.b32 %r7351, %r7350, %r7345; + shf.l.wrap.b32 %r7352, %r7351, %r7351, 25; + add.s32 %r7353, %r7305, %r7001; + add.s32 %r7354, %r7353, %r7324; + xor.b32 %r7355, %r7354, %r7293; + shf.l.wrap.b32 %r7356, %r7355, %r7355, 16; + add.s32 %r7357, %r7356, %r7280; + xor.b32 %r7358, %r7357, %r7324; + shf.l.wrap.b32 %r7359, %r7358, %r7358, 20; + add.s32 %r7360, %r7354, %r7036; + add.s32 %r7361, %r7360, %r7359; + xor.b32 %r7362, %r7361, %r7356; + shf.l.wrap.b32 %r7363, %r7362, %r7362, 24; + add.s32 %r7364, %r7363, %r7357; + xor.b32 %r7365, %r7364, %r7359; + shf.l.wrap.b32 %r7366, %r7365, %r7365, 25; + add.s32 %r7367, %r7319, %r6977; + add.s32 %r7368, %r7367, %r7282; + xor.b32 %r7369, %r7368, %r7307; + shf.l.wrap.b32 %r7370, %r7369, %r7369, 16; + add.s32 %r7371, %r7370, %r7294; + xor.b32 %r7372, %r7371, %r7282; + shf.l.wrap.b32 %r7373, %r7372, %r7372, 20; + add.s32 %r7374, %r7368, %r6921; + add.s32 %r7375, %r7374, %r7373; + xor.b32 %r7376, %r7375, %r7370; + shf.l.wrap.b32 %r7377, %r7376, %r7376, 24; + add.s32 %r7378, %r7377, %r7371; + xor.b32 %r7379, %r7378, %r7373; + shf.l.wrap.b32 %r7380, %r7379, %r7379, 25; + add.s32 %r7381, %r7333, %r6993; + add.s32 %r7382, %r7381, %r7380; + xor.b32 %r7383, %r7382, %r7349; + shf.l.wrap.b32 %r7384, %r7383, %r7383, 16; + add.s32 %r7385, %r7384, %r7364; + xor.b32 %r7386, %r7385, %r7380; + shf.l.wrap.b32 %r7387, %r7386, %r7386, 20; + add.s32 %r7388, %r7382, %r6969; + add.s32 %r7389, %r7388, %r7387; + xor.b32 %r7390, %r7389, %r7384; + shf.l.wrap.b32 %r7391, %r7390, %r7390, 24; + add.s32 %r7392, %r7391, %r7385; + xor.b32 %r7393, %r7392, 
%r7387; + shf.l.wrap.b32 %r7394, %r7393, %r7393, 25; + add.s32 %r7395, %r7347, %r7009; + add.s32 %r7396, %r7395, %r7338; + xor.b32 %r7397, %r7396, %r7363; + shf.l.wrap.b32 %r7398, %r7397, %r7397, 16; + add.s32 %r7399, %r7398, %r7378; + xor.b32 %r7400, %r7399, %r7338; + shf.l.wrap.b32 %r7401, %r7400, %r7400, 20; + add.s32 %r7402, %r7396, %r6985; + add.s32 %r7403, %r7402, %r7401; + xor.b32 %r7404, %r7403, %r7398; + shf.l.wrap.b32 %r7405, %r7404, %r7404, 24; + add.s32 %r7406, %r7405, %r7399; + xor.b32 %r7407, %r7406, %r7401; + shf.l.wrap.b32 %r7408, %r7407, %r7407, 25; + add.s32 %r7409, %r7361, %r7025; + add.s32 %r7410, %r7409, %r7352; + xor.b32 %r7411, %r7377, %r7410; + shf.l.wrap.b32 %r7412, %r7411, %r7411, 16; + add.s32 %r7413, %r7412, %r7336; + xor.b32 %r7414, %r7413, %r7352; + shf.l.wrap.b32 %r7415, %r7414, %r7414, 20; + add.s32 %r7416, %r7410, %r6937; + add.s32 %r7417, %r7416, %r7415; + xor.b32 %r7418, %r7417, %r7412; + shf.l.wrap.b32 %r7419, %r7418, %r7418, 24; + add.s32 %r7420, %r7419, %r7413; + xor.b32 %r7421, %r7420, %r7415; + shf.l.wrap.b32 %r7422, %r7421, %r7421, 25; + add.s32 %r7423, %r7375, %r7017; + add.s32 %r7424, %r7423, %r7366; + xor.b32 %r7425, %r7424, %r7335; + shf.l.wrap.b32 %r7426, %r7425, %r7425, 16; + add.s32 %r7427, %r7426, %r7350; + xor.b32 %r7428, %r7427, %r7366; + shf.l.wrap.b32 %r7429, %r7428, %r7428, 20; + add.s32 %r7430, %r7424, %r7036; + add.s32 %r7431, %r7430, %r7429; + xor.b32 %r7432, %r7431, %r7426; + shf.l.wrap.b32 %r7433, %r7432, %r7432, 24; + add.s32 %r7434, %r7433, %r7427; + xor.b32 %r7435, %r7434, %r7429; + shf.l.wrap.b32 %r7436, %r7435, %r7435, 25; + add.s32 %r7437, %r7389, %r6945; + add.s32 %r7438, %r7437, %r7408; + xor.b32 %r7439, %r7438, %r7433; + shf.l.wrap.b32 %r7440, %r7439, %r7439, 16; + add.s32 %r7441, %r7440, %r7420; + xor.b32 %r7442, %r7441, %r7408; + shf.l.wrap.b32 %r7443, %r7442, %r7442, 20; + add.s32 %r7444, %r7438, %r6913; + add.s32 %r7445, %r7444, %r7443; + xor.b32 %r7446, %r7445, %r7440; + shf.l.wrap.b32 %r7447, %r7446, %r7446, 24; + add.s32 %r7448, %r7447, %r7441; + xor.b32 %r7449, %r7448, %r7443; + shf.l.wrap.b32 %r7450, %r7449, %r7449, 25; + add.s32 %r7451, %r7403, %r7001; + add.s32 %r7452, %r7451, %r7422; + xor.b32 %r7453, %r7391, %r7452; + shf.l.wrap.b32 %r7454, %r7453, %r7453, 16; + add.s32 %r7455, %r7454, %r7434; + xor.b32 %r7456, %r7455, %r7422; + shf.l.wrap.b32 %r7457, %r7456, %r7456, 20; + add.s32 %r7458, %r7452, %r6929; + add.s32 %r7459, %r7458, %r7457; + xor.b32 %r7460, %r7459, %r7454; + shf.l.wrap.b32 %r7461, %r7460, %r7460, 24; + add.s32 %r7462, %r7461, %r7455; + xor.b32 %r7463, %r7462, %r7457; + shf.l.wrap.b32 %r7464, %r7463, %r7463, 25; + add.s32 %r7465, %r7417, %r6953; + add.s32 %r7466, %r7465, %r7436; + xor.b32 %r7467, %r7466, %r7405; + shf.l.wrap.b32 %r7468, %r7467, %r7467, 16; + add.s32 %r7469, %r7468, %r7392; + xor.b32 %r7470, %r7469, %r7436; + shf.l.wrap.b32 %r7471, %r7470, %r7470, 20; + add.s32 %r7472, %r7466, %r6977; + add.s32 %r7473, %r7472, %r7471; + xor.b32 %r7474, %r7473, %r7468; + shf.l.wrap.b32 %r7475, %r7474, %r7474, 24; + add.s32 %r7476, %r7475, %r7469; + xor.b32 %r7477, %r7476, %r7471; + shf.l.wrap.b32 %r7478, %r7477, %r7477, 25; + add.s32 %r7479, %r7431, %r6921; + add.s32 %r7480, %r7479, %r7394; + xor.b32 %r7481, %r7480, %r7419; + shf.l.wrap.b32 %r7482, %r7481, %r7481, 16; + add.s32 %r7483, %r7482, %r7406; + xor.b32 %r7484, %r7483, %r7394; + shf.l.wrap.b32 %r7485, %r7484, %r7484, 20; + add.s32 %r7486, %r7480, %r6961; + add.s32 %r7487, %r7486, %r7485; + xor.b32 %r7488, %r7487, %r7482; + shf.l.wrap.b32 
%r7489, %r7488, %r7488, 24; + add.s32 %r7490, %r7489, %r7483; + xor.b32 %r7491, %r7490, %r7485; + shf.l.wrap.b32 %r7492, %r7491, %r7491, 25; + add.s32 %r7493, %r7445, %r7009; + add.s32 %r7494, %r7493, %r7492; + xor.b32 %r7495, %r7494, %r7461; + shf.l.wrap.b32 %r7496, %r7495, %r7495, 16; + add.s32 %r7497, %r7496, %r7476; + xor.b32 %r7498, %r7497, %r7492; + shf.l.wrap.b32 %r7499, %r7498, %r7498, 20; + add.s32 %r7500, %r7494, %r7017; + add.s32 %r7501, %r7500, %r7499; + xor.b32 %r7502, %r7501, %r7496; + shf.l.wrap.b32 %r7503, %r7502, %r7502, 24; + add.s32 %r7504, %r7503, %r7497; + xor.b32 %r7505, %r7504, %r7499; + shf.l.wrap.b32 %r7506, %r7505, %r7505, 25; + add.s32 %r7507, %r7459, %r6985; + add.s32 %r7508, %r7507, %r7450; + xor.b32 %r7509, %r7508, %r7475; + shf.l.wrap.b32 %r7510, %r7509, %r7509, 16; + add.s32 %r7511, %r7510, %r7490; + xor.b32 %r7512, %r7511, %r7450; + shf.l.wrap.b32 %r7513, %r7512, %r7512, 20; + add.s32 %r7514, %r7508, %r7001; + add.s32 %r7515, %r7514, %r7513; + xor.b32 %r7516, %r7515, %r7510; + shf.l.wrap.b32 %r7517, %r7516, %r7516, 24; + add.s32 %r7518, %r7517, %r7511; + xor.b32 %r7519, %r7518, %r7513; + shf.l.wrap.b32 %r7520, %r7519, %r7519, 25; + add.s32 %r7521, %r7473, %r7036; + add.s32 %r7522, %r7521, %r7464; + xor.b32 %r7523, %r7489, %r7522; + shf.l.wrap.b32 %r7524, %r7523, %r7523, 16; + add.s32 %r7525, %r7524, %r7448; + xor.b32 %r7526, %r7525, %r7464; + shf.l.wrap.b32 %r7527, %r7526, %r7526, 20; + add.s32 %r7528, %r7522, %r6993; + add.s32 %r7529, %r7528, %r7527; + xor.b32 %r7530, %r7529, %r7524; + shf.l.wrap.b32 %r7531, %r7530, %r7530, 24; + add.s32 %r7532, %r7531, %r7525; + xor.b32 %r7533, %r7532, %r7527; + shf.l.wrap.b32 %r7534, %r7533, %r7533, 25; + add.s32 %r7535, %r7487, %r7025; + add.s32 %r7536, %r7535, %r7478; + xor.b32 %r7537, %r7536, %r7447; + shf.l.wrap.b32 %r7538, %r7537, %r7537, 16; + add.s32 %r7539, %r7538, %r7462; + xor.b32 %r7540, %r7539, %r7478; + shf.l.wrap.b32 %r7541, %r7540, %r7540, 20; + add.s32 %r7542, %r7536, %r6977; + add.s32 %r7543, %r7542, %r7541; + xor.b32 %r7544, %r7543, %r7538; + shf.l.wrap.b32 %r7545, %r7544, %r7544, 24; + add.s32 %r7546, %r7545, %r7539; + xor.b32 %r7547, %r7546, %r7541; + shf.l.wrap.b32 %r7548, %r7547, %r7547, 25; + add.s32 %r7549, %r7501, %r6969; + add.s32 %r7550, %r7549, %r7520; + xor.b32 %r7551, %r7550, %r7545; + shf.l.wrap.b32 %r7552, %r7551, %r7551, 16; + add.s32 %r7553, %r7552, %r7532; + xor.b32 %r7554, %r7553, %r7520; + shf.l.wrap.b32 %r7555, %r7554, %r7554, 20; + add.s32 %r7556, %r7550, %r6929; + add.s32 %r7557, %r7556, %r7555; + xor.b32 %r7558, %r7557, %r7552; + shf.l.wrap.b32 %r7559, %r7558, %r7558, 24; + add.s32 %r7560, %r7559, %r7553; + xor.b32 %r7561, %r7560, %r7555; + shf.l.wrap.b32 %r7562, %r7561, %r7561, 25; + add.s32 %r7563, %r7515, %r6953; + add.s32 %r7564, %r7563, %r7534; + xor.b32 %r7565, %r7503, %r7564; + shf.l.wrap.b32 %r7566, %r7565, %r7565, 16; + add.s32 %r7567, %r7566, %r7546; + xor.b32 %r7568, %r7567, %r7534; + shf.l.wrap.b32 %r7569, %r7568, %r7568, 20; + add.s32 %r7570, %r7564, %r6937; + add.s32 %r7571, %r7570, %r7569; + xor.b32 %r7572, %r7571, %r7566; + shf.l.wrap.b32 %r7573, %r7572, %r7572, 24; + add.s32 %r7574, %r7573, %r7567; + xor.b32 %r7575, %r7574, %r7569; + shf.l.wrap.b32 %r7576, %r7575, %r7575, 25; + add.s32 %r7577, %r7529, %r6913; + add.s32 %r7578, %r7577, %r7548; + xor.b32 %r7579, %r7578, %r7517; + shf.l.wrap.b32 %r7580, %r7579, %r7579, 16; + add.s32 %r7581, %r7580, %r7504; + xor.b32 %r7582, %r7581, %r7548; + shf.l.wrap.b32 %r7583, %r7582, %r7582, 20; + add.s32 %r7584, %r7578, %r6921; 
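+ // [editor's annotation] The %r6913–%r7036 operands fed into the add.s32 steps are the
+ // sixteen 32-bit message words; the same registers reappear in a different order in
+ // each unrolled round, consistent with BLAKE3's per-round message-word permutation.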
+ add.s32 %r7585, %r7584, %r7583; + xor.b32 %r7586, %r7585, %r7580; + shf.l.wrap.b32 %r7587, %r7586, %r7586, 24; + add.s32 %r7588, %r7587, %r7581; + xor.b32 %r7589, %r7588, %r7583; + shf.l.wrap.b32 %r7590, %r7589, %r7589, 25; + add.s32 %r7591, %r7543, %r6961; + add.s32 %r7592, %r7591, %r7506; + xor.b32 %r7593, %r7592, %r7531; + shf.l.wrap.b32 %r7594, %r7593, %r7593, 16; + add.s32 %r7595, %r7594, %r7518; + xor.b32 %r7596, %r7595, %r7506; + shf.l.wrap.b32 %r7597, %r7596, %r7596, 20; + add.s32 %r7598, %r7592, %r6945; + add.s32 %r7599, %r7598, %r7597; + xor.b32 %r7600, %r7599, %r7594; + shf.l.wrap.b32 %r7601, %r7600, %r7600, 24; + add.s32 %r7602, %r7601, %r7595; + xor.b32 %r7603, %r7602, %r7597; + shf.l.wrap.b32 %r7604, %r7603, %r7603, 25; + add.s32 %r7605, %r7557, %r6985; + add.s32 %r7606, %r7605, %r7604; + xor.b32 %r7607, %r7606, %r7573; + shf.l.wrap.b32 %r7608, %r7607, %r7607, 16; + add.s32 %r7609, %r7608, %r7588; + xor.b32 %r7610, %r7609, %r7604; + shf.l.wrap.b32 %r7611, %r7610, %r7610, 20; + add.s32 %r7612, %r7606, %r7025; + add.s32 %r7613, %r7612, %r7611; + xor.b32 %r7614, %r7613, %r7608; + shf.l.wrap.b32 %r7615, %r7614, %r7614, 24; + add.s32 %r7616, %r7615, %r7609; + xor.b32 %r7617, %r7616, %r7611; + shf.l.wrap.b32 %r7618, %r7617, %r7617, 25; + add.s32 %r7619, %r7571, %r7001; + add.s32 %r7620, %r7619, %r7562; + xor.b32 %r7621, %r7620, %r7587; + shf.l.wrap.b32 %r7622, %r7621, %r7621, 16; + add.s32 %r7623, %r7622, %r7602; + xor.b32 %r7624, %r7623, %r7562; + shf.l.wrap.b32 %r7625, %r7624, %r7624, 20; + add.s32 %r7626, %r7620, %r6953; + add.s32 %r7627, %r7626, %r7625; + xor.b32 %r7628, %r7627, %r7622; + shf.l.wrap.b32 %r7629, %r7628, %r7628, 24; + add.s32 %r7630, %r7629, %r7623; + xor.b32 %r7631, %r7630, %r7625; + shf.l.wrap.b32 %r7632, %r7631, %r7631, 25; + add.s32 %r7633, %r7585, %r6977; + add.s32 %r7634, %r7633, %r7576; + xor.b32 %r7635, %r7601, %r7634; + shf.l.wrap.b32 %r7636, %r7635, %r7635, 16; + add.s32 %r7637, %r7636, %r7560; + xor.b32 %r7638, %r7637, %r7576; + shf.l.wrap.b32 %r7639, %r7638, %r7638, 20; + add.s32 %r7640, %r7634, %r7009; + add.s32 %r7641, %r7640, %r7639; + xor.b32 %r7642, %r7641, %r7636; + shf.l.wrap.b32 %r7643, %r7642, %r7642, 24; + add.s32 %r7644, %r7643, %r7637; + xor.b32 %r7645, %r7644, %r7639; + shf.l.wrap.b32 %r7646, %r7645, %r7645, 25; + add.s32 %r7647, %r7599, %r7036; + add.s32 %r7648, %r7647, %r7590; + xor.b32 %r7649, %r7648, %r7559; + shf.l.wrap.b32 %r7650, %r7649, %r7649, 16; + add.s32 %r7651, %r7650, %r7574; + xor.b32 %r7652, %r7651, %r7590; + shf.l.wrap.b32 %r7653, %r7652, %r7652, 20; + add.s32 %r7654, %r7648, %r6921; + add.s32 %r7655, %r7654, %r7653; + xor.b32 %r7656, %r7655, %r7650; + shf.l.wrap.b32 %r7657, %r7656, %r7656, 24; + add.s32 %r7658, %r7657, %r7651; + xor.b32 %r7659, %r7658, %r7653; + shf.l.wrap.b32 %r7660, %r7659, %r7659, 25; + add.s32 %r7661, %r7613, %r7017; + add.s32 %r7662, %r7661, %r7632; + xor.b32 %r7663, %r7662, %r7657; + shf.l.wrap.b32 %r7664, %r7663, %r7663, 16; + add.s32 %r7665, %r7664, %r7644; + xor.b32 %r7666, %r7665, %r7632; + shf.l.wrap.b32 %r7667, %r7666, %r7666, 20; + add.s32 %r7668, %r7662, %r6937; + add.s32 %r7669, %r7668, %r7667; + xor.b32 %r7670, %r7669, %r7664; + shf.l.wrap.b32 %r7671, %r7670, %r7670, 24; + add.s32 %r7672, %r7671, %r7665; + xor.b32 %r7673, %r7672, %r7667; + shf.l.wrap.b32 %r7674, %r7673, %r7673, 25; + add.s32 %r7675, %r7627, %r6913; + add.s32 %r7676, %r7675, %r7646; + xor.b32 %r7677, %r7615, %r7676; + shf.l.wrap.b32 %r7678, %r7677, %r7677, 16; + add.s32 %r7679, %r7678, %r7658; + xor.b32 %r7680, %r7679, 
%r7646; + shf.l.wrap.b32 %r7681, %r7680, %r7680, 20; + add.s32 %r7682, %r7676, %r6993; + add.s32 %r7683, %r7682, %r7681; + xor.b32 %r7684, %r7683, %r7678; + shf.l.wrap.b32 %r7685, %r7684, %r7684, 24; + add.s32 %r7686, %r7685, %r7679; + xor.b32 %r7687, %r7686, %r7681; + shf.l.wrap.b32 %r7688, %r7687, %r7687, 25; + add.s32 %r7689, %r7641, %r6929; + add.s32 %r7690, %r7689, %r7660; + xor.b32 %r7691, %r7690, %r7629; + shf.l.wrap.b32 %r7692, %r7691, %r7691, 16; + add.s32 %r7693, %r7692, %r7616; + xor.b32 %r7694, %r7693, %r7660; + shf.l.wrap.b32 %r7695, %r7694, %r7694, 20; + add.s32 %r7696, %r7690, %r6961; + add.s32 %r7697, %r7696, %r7695; + xor.b32 %r7698, %r7697, %r7692; + shf.l.wrap.b32 %r7699, %r7698, %r7698, 24; + add.s32 %r7700, %r7699, %r7693; + xor.b32 %r7701, %r7700, %r7695; + shf.l.wrap.b32 %r7702, %r7701, %r7701, 25; + add.s32 %r7703, %r7655, %r6945; + add.s32 %r7704, %r7703, %r7618; + xor.b32 %r7705, %r7704, %r7643; + shf.l.wrap.b32 %r7706, %r7705, %r7705, 16; + add.s32 %r7707, %r7706, %r7630; + xor.b32 %r7708, %r7707, %r7618; + shf.l.wrap.b32 %r7709, %r7708, %r7708, 20; + add.s32 %r7710, %r7704, %r6969; + add.s32 %r7711, %r7710, %r7709; + xor.b32 %r7712, %r7711, %r7706; + shf.l.wrap.b32 %r7713, %r7712, %r7712, 24; + add.s32 %r7714, %r7713, %r7707; + xor.b32 %r7715, %r7714, %r7709; + shf.l.wrap.b32 %r7716, %r7715, %r7715, 25; + add.s32 %r7717, %r7669, %r7001; + add.s32 %r7718, %r7717, %r7716; + xor.b32 %r7719, %r7718, %r7685; + shf.l.wrap.b32 %r7720, %r7719, %r7719, 16; + add.s32 %r7721, %r7720, %r7700; + xor.b32 %r7722, %r7721, %r7716; + shf.l.wrap.b32 %r7723, %r7722, %r7722, 20; + add.s32 %r7724, %r7718, %r7036; + add.s32 %r7725, %r7724, %r7723; + xor.b32 %r7726, %r7725, %r7720; + shf.l.wrap.b32 %r7727, %r7726, %r7726, 24; + add.s32 %r7728, %r7727, %r7721; + xor.b32 %r7729, %r7728, %r7723; + shf.l.wrap.b32 %r7730, %r7729, %r7729, 25; + add.s32 %r7731, %r7683, %r6953; + add.s32 %r7732, %r7731, %r7674; + xor.b32 %r7733, %r7732, %r7699; + shf.l.wrap.b32 %r7734, %r7733, %r7733, 16; + add.s32 %r7735, %r7734, %r7714; + xor.b32 %r7736, %r7735, %r7674; + shf.l.wrap.b32 %r7737, %r7736, %r7736, 20; + add.s32 %r7738, %r7732, %r6913; + add.s32 %r7739, %r7738, %r7737; + xor.b32 %r7740, %r7739, %r7734; + shf.l.wrap.b32 %r7741, %r7740, %r7740, 24; + add.s32 %r7742, %r7741, %r7735; + xor.b32 %r7743, %r7742, %r7737; + shf.l.wrap.b32 %r7744, %r7743, %r7743, 25; + add.s32 %r7745, %r7697, %r6921; + add.s32 %r7746, %r7745, %r7688; + xor.b32 %r7747, %r7713, %r7746; + shf.l.wrap.b32 %r7748, %r7747, %r7747, 16; + add.s32 %r7749, %r7748, %r7672; + xor.b32 %r7750, %r7749, %r7688; + shf.l.wrap.b32 %r7751, %r7750, %r7750, 20; + add.s32 %r7752, %r7746, %r6985; + add.s32 %r7753, %r7752, %r7751; + xor.b32 %r7754, %r7753, %r7748; + shf.l.wrap.b32 %r7755, %r7754, %r7754, 24; + add.s32 %r7756, %r7755, %r7749; + xor.b32 %r7757, %r7756, %r7751; + shf.l.wrap.b32 %r7758, %r7757, %r7757, 25; + add.s32 %r7759, %r7711, %r6977; + add.s32 %r7760, %r7759, %r7702; + xor.b32 %r7761, %r7760, %r7671; + shf.l.wrap.b32 %r7762, %r7761, %r7761, 16; + add.s32 %r7763, %r7762, %r7686; + xor.b32 %r7764, %r7763, %r7702; + shf.l.wrap.b32 %r7765, %r7764, %r7764, 20; + add.s32 %r7766, %r7760, %r6961; + add.s32 %r7767, %r7766, %r7765; + xor.b32 %r7768, %r7767, %r7762; + shf.l.wrap.b32 %r7769, %r7768, %r7768, 24; + add.s32 %r7770, %r7769, %r7763; + xor.b32 %r7771, %r7770, %r7765; + shf.l.wrap.b32 %r7772, %r7771, %r7771, 25; + add.s32 %r7773, %r7725, %r7025; + add.s32 %r7774, %r7773, %r7744; + xor.b32 %r7775, %r7774, %r7769; + shf.l.wrap.b32 
%r7776, %r7775, %r7775, 16; + add.s32 %r7777, %r7776, %r7756; + xor.b32 %r7778, %r7777, %r7744; + shf.l.wrap.b32 %r7779, %r7778, %r7778, 20; + add.s32 %r7780, %r7774, %r6993; + add.s32 %r7781, %r7780, %r7779; + xor.b32 %r7782, %r7781, %r7776; + shf.l.wrap.b32 %r7783, %r7782, %r7782, 24; + add.s32 %r7784, %r7783, %r7777; + xor.b32 %r7785, %r7784, %r7779; + shf.l.wrap.b32 %r7786, %r7785, %r7785, 25; + add.s32 %r7787, %r7739, %r6929; + add.s32 %r7788, %r7787, %r7758; + xor.b32 %r7789, %r7727, %r7788; + shf.l.wrap.b32 %r7790, %r7789, %r7789, 16; + add.s32 %r7791, %r7790, %r7770; + xor.b32 %r7792, %r7791, %r7758; + shf.l.wrap.b32 %r7793, %r7792, %r7792, 20; + add.s32 %r7794, %r7788, %r7009; + add.s32 %r7795, %r7794, %r7793; + xor.b32 %r7796, %r7795, %r7790; + shf.l.wrap.b32 %r7797, %r7796, %r7796, 24; + add.s32 %r7798, %r7797, %r7791; + xor.b32 %r7799, %r7798, %r7793; + shf.l.wrap.b32 %r7800, %r7799, %r7799, 25; + add.s32 %r7801, %r7753, %r6937; + add.s32 %r7802, %r7801, %r7772; + xor.b32 %r7803, %r7802, %r7741; + shf.l.wrap.b32 %r7804, %r7803, %r7803, 16; + add.s32 %r7805, %r7804, %r7728; + xor.b32 %r7806, %r7805, %r7772; + shf.l.wrap.b32 %r7807, %r7806, %r7806, 20; + add.s32 %r7808, %r7802, %r6945; + add.s32 %r7809, %r7808, %r7807; + xor.b32 %r7810, %r7809, %r7804; + shf.l.wrap.b32 %r7811, %r7810, %r7810, 24; + add.s32 %r7812, %r7811, %r7805; + xor.b32 %r7813, %r7812, %r7807; + shf.l.wrap.b32 %r7814, %r7813, %r7813, 25; + add.s32 %r7815, %r7767, %r6969; + add.s32 %r7816, %r7815, %r7730; + xor.b32 %r7817, %r7816, %r7755; + shf.l.wrap.b32 %r7818, %r7817, %r7817, 16; + add.s32 %r7819, %r7818, %r7742; + xor.b32 %r7820, %r7819, %r7730; + shf.l.wrap.b32 %r7821, %r7820, %r7820, 20; + add.s32 %r7822, %r7816, %r7017; + add.s32 %r7823, %r7822, %r7821; + xor.b32 %r7824, %r7823, %r7818; + shf.l.wrap.b32 %r7825, %r7824, %r7824, 24; + add.s32 %r7826, %r7825, %r7819; + xor.b32 %r7827, %r7826, %r7821; + shf.l.wrap.b32 %r7828, %r7827, %r7827, 25; + xor.b32 %r97, %r7812, %r7781; + xor.b32 %r98, %r7826, %r7795; + xor.b32 %r99, %r7784, %r7809; + xor.b32 %r100, %r7823, %r7798; + xor.b32 %r101, %r7828, %r7797; + xor.b32 %r102, %r7786, %r7811; + xor.b32 %r103, %r7825, %r7800; + xor.b32 %r104, %r7814, %r7783; + ld.local.u8 %rs327, [%rd3+8]; + cvt.u64.u16 %rd189, %rs327; + popc.b64 %r7829, %rd251; + cvt.u64.u32 %rd64, %r7829; + setp.ge.u64 %p37, %rd64, %rd189; + mul.wide.u16 %r11681, %rs327, 32; + @%p37 bra $L__BB1_46; + +$L__BB1_45: + popc.b64 %r11649, %rd251; + cvt.u64.u32 %rd230, %r11649; + add.s32 %r7830, %r11681, -64; + cvt.s64.s32 %rd190, %r7830; + add.s64 %rd191, %rd2, %rd190; + ld.local.u8 %r7831, [%rd3+2]; + ld.local.u8 %r7832, [%rd191+145]; + ld.local.u8 %r7833, [%rd191+146]; + prmt.b32 %r7834, %r7833, %r7832, 30212; + ld.local.u8 %r7835, [%rd191+147]; + prmt.b32 %r7836, %r7835, %r7834, 28756; + ld.local.u8 %r7837, [%rd191+148]; + prmt.b32 %r7838, %r7837, %r7836, 1620; + ld.local.u8 %r7839, [%rd191+149]; + ld.local.u8 %r7840, [%rd191+150]; + prmt.b32 %r7841, %r7840, %r7839, 30212; + ld.local.u8 %r7842, [%rd191+151]; + prmt.b32 %r7843, %r7842, %r7841, 28756; + ld.local.u8 %r7844, [%rd191+152]; + prmt.b32 %r7845, %r7844, %r7843, 1620; + ld.local.u8 %r7846, [%rd191+153]; + ld.local.u8 %r7847, [%rd191+154]; + prmt.b32 %r7848, %r7847, %r7846, 30212; + ld.local.u8 %r7849, [%rd191+155]; + prmt.b32 %r7850, %r7849, %r7848, 28756; + ld.local.u8 %r7851, [%rd191+156]; + prmt.b32 %r7852, %r7851, %r7850, 1620; + ld.local.u8 %r7853, [%rd191+157]; + ld.local.u8 %r7854, [%rd191+158]; + prmt.b32 %r7855, %r7854, %r7853, 
30212; + ld.local.u8 %r7856, [%rd191+159]; + prmt.b32 %r7857, %r7856, %r7855, 28756; + ld.local.u8 %r7858, [%rd191+160]; + prmt.b32 %r7859, %r7858, %r7857, 1620; + ld.local.u8 %r7860, [%rd191+161]; + ld.local.u8 %r7861, [%rd191+162]; + prmt.b32 %r7862, %r7861, %r7860, 30212; + ld.local.u8 %r7863, [%rd191+163]; + prmt.b32 %r7864, %r7863, %r7862, 28756; + ld.local.u8 %r7865, [%rd191+164]; + prmt.b32 %r7866, %r7865, %r7864, 1620; + ld.local.u8 %r7867, [%rd191+165]; + ld.local.u8 %r7868, [%rd191+166]; + prmt.b32 %r7869, %r7868, %r7867, 30212; + ld.local.u8 %r7870, [%rd191+167]; + prmt.b32 %r7871, %r7870, %r7869, 28756; + ld.local.u8 %r7872, [%rd191+168]; + prmt.b32 %r7873, %r7872, %r7871, 1620; + ld.local.u8 %r7874, [%rd191+169]; + ld.local.u8 %r7875, [%rd191+170]; + prmt.b32 %r7876, %r7875, %r7874, 30212; + ld.local.u8 %r7877, [%rd191+171]; + prmt.b32 %r7878, %r7877, %r7876, 28756; + ld.local.u8 %r7879, [%rd191+172]; + prmt.b32 %r7880, %r7879, %r7878, 1620; + ld.local.u8 %r7881, [%rd191+173]; + ld.local.u8 %r7882, [%rd191+174]; + prmt.b32 %r7883, %r7882, %r7881, 30212; + ld.local.u8 %r7884, [%rd191+175]; + prmt.b32 %r7885, %r7884, %r7883, 28756; + ld.local.u8 %r7886, [%rd191+176]; + prmt.b32 %r7887, %r7886, %r7885, 1620; + ld.local.u8 %r7888, [%rd191+177]; + ld.local.u8 %r7889, [%rd191+178]; + prmt.b32 %r7890, %r7889, %r7888, 30212; + ld.local.u8 %r7891, [%rd191+179]; + prmt.b32 %r7892, %r7891, %r7890, 28756; + ld.local.u8 %r7893, [%rd191+180]; + prmt.b32 %r7894, %r7893, %r7892, 1620; + ld.local.u8 %r7895, [%rd191+181]; + ld.local.u8 %r7896, [%rd191+182]; + prmt.b32 %r7897, %r7896, %r7895, 30212; + ld.local.u8 %r7898, [%rd191+183]; + prmt.b32 %r7899, %r7898, %r7897, 28756; + ld.local.u8 %r7900, [%rd191+184]; + prmt.b32 %r7901, %r7900, %r7899, 1620; + ld.local.u8 %r7902, [%rd191+185]; + ld.local.u8 %r7903, [%rd191+186]; + prmt.b32 %r7904, %r7903, %r7902, 30212; + ld.local.u8 %r7905, [%rd191+187]; + prmt.b32 %r7906, %r7905, %r7904, 28756; + ld.local.u8 %r7907, [%rd191+188]; + prmt.b32 %r7908, %r7907, %r7906, 1620; + ld.local.u8 %r7909, [%rd191+189]; + ld.local.u8 %r7910, [%rd191+190]; + prmt.b32 %r7911, %r7910, %r7909, 30212; + ld.local.u8 %r7912, [%rd191+191]; + prmt.b32 %r7913, %r7912, %r7911, 28756; + ld.local.u8 %r7914, [%rd191+192]; + prmt.b32 %r7915, %r7914, %r7913, 1620; + ld.local.u8 %r7916, [%rd191+193]; + ld.local.u8 %r7917, [%rd191+194]; + prmt.b32 %r7918, %r7917, %r7916, 30212; + ld.local.u8 %r7919, [%rd191+195]; + prmt.b32 %r7920, %r7919, %r7918, 28756; + ld.local.u8 %r7921, [%rd191+196]; + prmt.b32 %r7922, %r7921, %r7920, 1620; + ld.local.u8 %r7923, [%rd191+197]; + ld.local.u8 %r7924, [%rd191+198]; + prmt.b32 %r7925, %r7924, %r7923, 30212; + ld.local.u8 %r7926, [%rd191+199]; + prmt.b32 %r7927, %r7926, %r7925, 28756; + ld.local.u8 %r7928, [%rd191+200]; + prmt.b32 %r7929, %r7928, %r7927, 1620; + ld.local.u8 %r7930, [%rd191+201]; + ld.local.u8 %r7931, [%rd191+202]; + prmt.b32 %r7932, %r7931, %r7930, 30212; + ld.local.u8 %r7933, [%rd191+203]; + prmt.b32 %r7934, %r7933, %r7932, 28756; + ld.local.u8 %r7935, [%rd191+204]; + prmt.b32 %r7936, %r7935, %r7934, 1620; + ld.local.u8 %r7937, [%rd191+205]; + ld.local.u8 %r7938, [%rd191+206]; + prmt.b32 %r7939, %r7938, %r7937, 30212; + ld.local.u8 %r7940, [%rd191+207]; + prmt.b32 %r7941, %r7940, %r7939, 28756; + ld.local.u8 %r7942, [%rd191+208]; + prmt.b32 %r7943, %r7942, %r7941, 1620; + or.b32 %r7944, %r7831, 4; + ld.local.u8 %r7945, [%rd3+-120]; + ld.local.u8 %r7946, [%rd3+-119]; + prmt.b32 %r7947, %r7946, %r7945, 30212; + ld.local.u8 %r7948, 
[%rd3+-118]; + ld.local.u8 %r7949, [%rd3+-117]; + prmt.b32 %r7950, %r7949, %r7948, 30212; + prmt.b32 %r7951, %r7950, %r7947, 4180; + ld.local.u8 %r7952, [%rd3+-136]; + ld.local.u8 %r7953, [%rd3+-135]; + prmt.b32 %r7954, %r7953, %r7952, 30212; + ld.local.u8 %r7955, [%rd3+-134]; + ld.local.u8 %r7956, [%rd3+-133]; + prmt.b32 %r7957, %r7956, %r7955, 30212; + prmt.b32 %r7958, %r7957, %r7954, 4180; + add.s32 %r7959, %r7951, %r7958; + add.s32 %r7960, %r7959, %r7838; + shf.l.wrap.b32 %r7961, %r7960, %r7960, 16; + add.s32 %r7962, %r7961, 1779033703; + xor.b32 %r7963, %r7962, %r7951; + shf.l.wrap.b32 %r7964, %r7963, %r7963, 20; + add.s32 %r7965, %r7845, %r7960; + add.s32 %r7966, %r7965, %r7964; + xor.b32 %r7967, %r7966, %r7961; + shf.l.wrap.b32 %r7968, %r7967, %r7967, 24; + add.s32 %r7969, %r7968, %r7962; + xor.b32 %r7970, %r7969, %r7964; + shf.l.wrap.b32 %r7971, %r7970, %r7970, 25; + ld.local.u8 %r7972, [%rd3+-116]; + ld.local.u8 %r7973, [%rd3+-115]; + prmt.b32 %r7974, %r7973, %r7972, 30212; + ld.local.u8 %r7975, [%rd3+-114]; + ld.local.u8 %r7976, [%rd3+-113]; + prmt.b32 %r7977, %r7976, %r7975, 30212; + prmt.b32 %r7978, %r7977, %r7974, 4180; + ld.local.u8 %r7979, [%rd3+-132]; + ld.local.u8 %r7980, [%rd3+-131]; + prmt.b32 %r7981, %r7980, %r7979, 30212; + ld.local.u8 %r7982, [%rd3+-130]; + ld.local.u8 %r7983, [%rd3+-129]; + prmt.b32 %r7984, %r7983, %r7982, 30212; + prmt.b32 %r7985, %r7984, %r7981, 4180; + add.s32 %r7986, %r7978, %r7985; + add.s32 %r7987, %r7986, %r7852; + shf.l.wrap.b32 %r7988, %r7987, %r7987, 16; + add.s32 %r7989, %r7988, -1150833019; + xor.b32 %r7990, %r7989, %r7978; + shf.l.wrap.b32 %r7991, %r7990, %r7990, 20; + add.s32 %r7992, %r7859, %r7987; + add.s32 %r7993, %r7992, %r7991; + xor.b32 %r7994, %r7993, %r7988; + shf.l.wrap.b32 %r7995, %r7994, %r7994, 24; + add.s32 %r7996, %r7995, %r7989; + xor.b32 %r7997, %r7996, %r7991; + shf.l.wrap.b32 %r7998, %r7997, %r7997, 25; + ld.local.u8 %r7999, [%rd3+-112]; + ld.local.u8 %r8000, [%rd3+-111]; + prmt.b32 %r8001, %r8000, %r7999, 30212; + ld.local.u8 %r8002, [%rd3+-110]; + ld.local.u8 %r8003, [%rd3+-109]; + prmt.b32 %r8004, %r8003, %r8002, 30212; + prmt.b32 %r8005, %r8004, %r8001, 4180; + ld.local.u8 %r8006, [%rd3+-128]; + ld.local.u8 %r8007, [%rd3+-127]; + prmt.b32 %r8008, %r8007, %r8006, 30212; + ld.local.u8 %r8009, [%rd3+-126]; + ld.local.u8 %r8010, [%rd3+-125]; + prmt.b32 %r8011, %r8010, %r8009, 30212; + prmt.b32 %r8012, %r8011, %r8008, 4180; + add.s32 %r8013, %r8005, %r8012; + add.s32 %r8014, %r8013, %r7866; + shr.u32 %r8015, %r8014, 16; + shl.b32 %r8016, %r8014, 16; + xor.b32 %r8017, %r8016, 4194304; + or.b32 %r8018, %r8017, %r8015; + add.s32 %r8019, %r8018, 1013904242; + xor.b32 %r8020, %r8019, %r8005; + shf.l.wrap.b32 %r8021, %r8020, %r8020, 20; + add.s32 %r8022, %r7873, %r8014; + add.s32 %r8023, %r8022, %r8021; + xor.b32 %r8024, %r8023, %r8018; + shf.l.wrap.b32 %r8025, %r8024, %r8024, 24; + add.s32 %r8026, %r8025, %r8019; + xor.b32 %r8027, %r8026, %r8021; + shf.l.wrap.b32 %r8028, %r8027, %r8027, 25; + ld.local.u8 %r8029, [%rd3+-108]; + ld.local.u8 %r8030, [%rd3+-107]; + prmt.b32 %r8031, %r8030, %r8029, 30212; + ld.local.u8 %r8032, [%rd3+-106]; + ld.local.u8 %r8033, [%rd3+-105]; + prmt.b32 %r8034, %r8033, %r8032, 30212; + prmt.b32 %r8035, %r8034, %r8031, 4180; + ld.local.u8 %r8036, [%rd3+-124]; + ld.local.u8 %r8037, [%rd3+-123]; + prmt.b32 %r8038, %r8037, %r8036, 30212; + ld.local.u8 %r8039, [%rd3+-122]; + ld.local.u8 %r8040, [%rd3+-121]; + prmt.b32 %r8041, %r8040, %r8039, 30212; + prmt.b32 %r8042, %r8041, %r8038, 4180; + add.s32 
%r8043, %r8035, %r8042; + add.s32 %r8044, %r8043, %r7880; + xor.b32 %r8045, %r8044, %r7944; + shr.u32 %r8046, %r8044, 16; + shl.b32 %r8047, %r8045, 16; + or.b32 %r8048, %r8047, %r8046; + add.s32 %r8049, %r8048, -1521486534; + xor.b32 %r8050, %r8049, %r8035; + shf.l.wrap.b32 %r8051, %r8050, %r8050, 20; + add.s32 %r8052, %r7887, %r8044; + add.s32 %r8053, %r8052, %r8051; + xor.b32 %r8054, %r8053, %r8048; + shf.l.wrap.b32 %r8055, %r8054, %r8054, 24; + add.s32 %r8056, %r8055, %r8049; + xor.b32 %r8057, %r8056, %r8051; + shf.l.wrap.b32 %r8058, %r8057, %r8057, 25; + add.s32 %r8059, %r7998, %r7966; + add.s32 %r8060, %r8059, %r7894; + xor.b32 %r8061, %r8055, %r8060; + shf.l.wrap.b32 %r8062, %r8061, %r8061, 16; + add.s32 %r8063, %r8062, %r8026; + xor.b32 %r8064, %r8063, %r7998; + shf.l.wrap.b32 %r8065, %r8064, %r8064, 20; + add.s32 %r8066, %r7901, %r8060; + add.s32 %r8067, %r8066, %r8065; + xor.b32 %r8068, %r8067, %r8062; + shf.l.wrap.b32 %r8069, %r8068, %r8068, 24; + add.s32 %r8070, %r8069, %r8063; + xor.b32 %r8071, %r8070, %r8065; + shf.l.wrap.b32 %r8072, %r8071, %r8071, 25; + add.s32 %r8073, %r8028, %r7993; + add.s32 %r8074, %r8073, %r7908; + xor.b32 %r8075, %r8074, %r7968; + shf.l.wrap.b32 %r8076, %r8075, %r8075, 16; + add.s32 %r8077, %r8076, %r8056; + xor.b32 %r8078, %r8077, %r8028; + shf.l.wrap.b32 %r8079, %r8078, %r8078, 20; + add.s32 %r8080, %r7915, %r8074; + add.s32 %r8081, %r8080, %r8079; + xor.b32 %r8082, %r8081, %r8076; + shf.l.wrap.b32 %r8083, %r8082, %r8082, 24; + add.s32 %r8084, %r8083, %r8077; + xor.b32 %r8085, %r8084, %r8079; + shf.l.wrap.b32 %r8086, %r8085, %r8085, 25; + add.s32 %r8087, %r8058, %r8023; + add.s32 %r8088, %r8087, %r7922; + xor.b32 %r8089, %r8088, %r7995; + shf.l.wrap.b32 %r8090, %r8089, %r8089, 16; + add.s32 %r8091, %r8090, %r7969; + xor.b32 %r8092, %r8091, %r8058; + shf.l.wrap.b32 %r8093, %r8092, %r8092, 20; + add.s32 %r8094, %r7929, %r8088; + add.s32 %r8095, %r8094, %r8093; + xor.b32 %r8096, %r8095, %r8090; + shf.l.wrap.b32 %r8097, %r8096, %r8096, 24; + add.s32 %r8098, %r8097, %r8091; + xor.b32 %r8099, %r8098, %r8093; + shf.l.wrap.b32 %r8100, %r8099, %r8099, 25; + add.s32 %r8101, %r8053, %r7971; + add.s32 %r8102, %r8101, %r7936; + xor.b32 %r8103, %r8102, %r8025; + shf.l.wrap.b32 %r8104, %r8103, %r8103, 16; + add.s32 %r8105, %r8104, %r7996; + xor.b32 %r8106, %r8105, %r7971; + shf.l.wrap.b32 %r8107, %r8106, %r8106, 20; + add.s32 %r8108, %r7943, %r8102; + add.s32 %r8109, %r8108, %r8107; + xor.b32 %r8110, %r8109, %r8104; + shf.l.wrap.b32 %r8111, %r8110, %r8110, 24; + add.s32 %r8112, %r8111, %r8105; + xor.b32 %r8113, %r8112, %r8107; + shf.l.wrap.b32 %r8114, %r8113, %r8113, 25; + add.s32 %r8115, %r8067, %r7852; + add.s32 %r8116, %r8115, %r8114; + xor.b32 %r8117, %r8116, %r8083; + shf.l.wrap.b32 %r8118, %r8117, %r8117, 16; + add.s32 %r8119, %r8118, %r8098; + xor.b32 %r8120, %r8119, %r8114; + shf.l.wrap.b32 %r8121, %r8120, %r8120, 20; + add.s32 %r8122, %r8116, %r7880; + add.s32 %r8123, %r8122, %r8121; + xor.b32 %r8124, %r8123, %r8118; + shf.l.wrap.b32 %r8125, %r8124, %r8124, 24; + add.s32 %r8126, %r8125, %r8119; + xor.b32 %r8127, %r8126, %r8121; + shf.l.wrap.b32 %r8128, %r8127, %r8127, 25; + add.s32 %r8129, %r8081, %r7859; + add.s32 %r8130, %r8129, %r8072; + xor.b32 %r8131, %r8097, %r8130; + shf.l.wrap.b32 %r8132, %r8131, %r8131, 16; + add.s32 %r8133, %r8112, %r8132; + xor.b32 %r8134, %r8133, %r8072; + shf.l.wrap.b32 %r8135, %r8134, %r8134, 20; + add.s32 %r8136, %r8130, %r7908; + add.s32 %r8137, %r8136, %r8135; + xor.b32 %r8138, %r8137, %r8132; + shf.l.wrap.b32 %r8139, 
%r8138, %r8138, 24; + add.s32 %r8140, %r8139, %r8133; + xor.b32 %r8141, %r8140, %r8135; + shf.l.wrap.b32 %r8142, %r8141, %r8141, 25; + add.s32 %r8143, %r8086, %r7887; + add.s32 %r8144, %r8143, %r8095; + xor.b32 %r8145, %r8111, %r8144; + shf.l.wrap.b32 %r8146, %r8145, %r8145, 16; + add.s32 %r8147, %r8146, %r8070; + xor.b32 %r8148, %r8147, %r8086; + shf.l.wrap.b32 %r8149, %r8148, %r8148, 20; + add.s32 %r8150, %r8144, %r7838; + add.s32 %r8151, %r8150, %r8149; + xor.b32 %r8152, %r8151, %r8146; + shf.l.wrap.b32 %r8153, %r8152, %r8152, 24; + add.s32 %r8154, %r8153, %r8147; + xor.b32 %r8155, %r8154, %r8149; + shf.l.wrap.b32 %r8156, %r8155, %r8155, 25; + add.s32 %r8157, %r8100, %r7866; + add.s32 %r8158, %r8157, %r8109; + xor.b32 %r8159, %r8158, %r8069; + shf.l.wrap.b32 %r8160, %r8159, %r8159, 16; + add.s32 %r8161, %r8160, %r8084; + xor.b32 %r8162, %r8161, %r8100; + shf.l.wrap.b32 %r8163, %r8162, %r8162, 20; + add.s32 %r8164, %r8158, %r7929; + add.s32 %r8165, %r8164, %r8163; + xor.b32 %r8166, %r8165, %r8160; + shf.l.wrap.b32 %r8167, %r8166, %r8166, 24; + add.s32 %r8168, %r8167, %r8161; + xor.b32 %r8169, %r8168, %r8163; + shf.l.wrap.b32 %r8170, %r8169, %r8169, 25; + add.s32 %r8171, %r8142, %r7845; + add.s32 %r8172, %r8171, %r8123; + xor.b32 %r8173, %r8172, %r8167; + shf.l.wrap.b32 %r8174, %r8173, %r8173, 16; + add.s32 %r8175, %r8174, %r8154; + xor.b32 %r8176, %r8175, %r8142; + shf.l.wrap.b32 %r8177, %r8176, %r8176, 20; + add.s32 %r8178, %r8172, %r7915; + add.s32 %r8179, %r8178, %r8177; + xor.b32 %r8180, %r8179, %r8174; + shf.l.wrap.b32 %r8181, %r8180, %r8180, 24; + add.s32 %r8182, %r8181, %r8175; + xor.b32 %r8183, %r8182, %r8177; + shf.l.wrap.b32 %r8184, %r8183, %r8183, 25; + add.s32 %r8185, %r8137, %r7922; + add.s32 %r8186, %r8185, %r8156; + xor.b32 %r8187, %r8125, %r8186; + shf.l.wrap.b32 %r8188, %r8187, %r8187, 16; + add.s32 %r8189, %r8188, %r8168; + xor.b32 %r8190, %r8189, %r8156; + shf.l.wrap.b32 %r8191, %r8190, %r8190, 20; + add.s32 %r8192, %r8186, %r7873; + add.s32 %r8193, %r8192, %r8191; + xor.b32 %r8194, %r8193, %r8188; + shf.l.wrap.b32 %r8195, %r8194, %r8194, 24; + add.s32 %r8196, %r8195, %r8189; + xor.b32 %r8197, %r8196, %r8191; + shf.l.wrap.b32 %r8198, %r8197, %r8197, 25; + add.s32 %r8199, %r8151, %r7901; + add.s32 %r8200, %r8199, %r8170; + xor.b32 %r8201, %r8200, %r8139; + shf.l.wrap.b32 %r8202, %r8201, %r8201, 16; + add.s32 %r8203, %r8202, %r8126; + xor.b32 %r8204, %r8203, %r8170; + shf.l.wrap.b32 %r8205, %r8204, %r8204, 20; + add.s32 %r8206, %r8200, %r7936; + add.s32 %r8207, %r8206, %r8205; + xor.b32 %r8208, %r8207, %r8202; + shf.l.wrap.b32 %r8209, %r8208, %r8208, 24; + add.s32 %r8210, %r8209, %r8203; + xor.b32 %r8211, %r8210, %r8205; + shf.l.wrap.b32 %r8212, %r8211, %r8211, 25; + add.s32 %r8213, %r8165, %r7943; + add.s32 %r8214, %r8213, %r8128; + xor.b32 %r8215, %r8214, %r8153; + shf.l.wrap.b32 %r8216, %r8215, %r8215, 16; + add.s32 %r8217, %r8216, %r8140; + xor.b32 %r8218, %r8217, %r8128; + shf.l.wrap.b32 %r8219, %r8218, %r8218, 20; + add.s32 %r8220, %r8214, %r7894; + add.s32 %r8221, %r8220, %r8219; + xor.b32 %r8222, %r8221, %r8216; + shf.l.wrap.b32 %r8223, %r8222, %r8222, 24; + add.s32 %r8224, %r8223, %r8217; + xor.b32 %r8225, %r8224, %r8219; + shf.l.wrap.b32 %r8226, %r8225, %r8225, 25; + add.s32 %r8227, %r8179, %r7859; + add.s32 %r8228, %r8227, %r8226; + xor.b32 %r8229, %r8228, %r8195; + shf.l.wrap.b32 %r8230, %r8229, %r8229, 16; + add.s32 %r8231, %r8230, %r8210; + xor.b32 %r8232, %r8231, %r8226; + shf.l.wrap.b32 %r8233, %r8232, %r8232, 20; + add.s32 %r8234, %r8228, %r7866; + 
add.s32 %r8235, %r8234, %r8233; + xor.b32 %r8236, %r8235, %r8230; + shf.l.wrap.b32 %r8237, %r8236, %r8236, 24; + add.s32 %r8238, %r8237, %r8231; + xor.b32 %r8239, %r8238, %r8233; + shf.l.wrap.b32 %r8240, %r8239, %r8239, 25; + add.s32 %r8241, %r8193, %r7908; + add.s32 %r8242, %r8241, %r8184; + xor.b32 %r8243, %r8242, %r8209; + shf.l.wrap.b32 %r8244, %r8243, %r8243, 16; + add.s32 %r8245, %r8244, %r8224; + xor.b32 %r8246, %r8245, %r8184; + shf.l.wrap.b32 %r8247, %r8246, %r8246, 20; + add.s32 %r8248, %r8242, %r7922; + add.s32 %r8249, %r8248, %r8247; + xor.b32 %r8250, %r8249, %r8244; + shf.l.wrap.b32 %r8251, %r8250, %r8250, 24; + add.s32 %r8252, %r8251, %r8245; + xor.b32 %r8253, %r8252, %r8247; + shf.l.wrap.b32 %r8254, %r8253, %r8253, 25; + add.s32 %r8255, %r8207, %r7929; + add.s32 %r8256, %r8255, %r8198; + xor.b32 %r8257, %r8223, %r8256; + shf.l.wrap.b32 %r8258, %r8257, %r8257, 16; + add.s32 %r8259, %r8258, %r8182; + xor.b32 %r8260, %r8259, %r8198; + shf.l.wrap.b32 %r8261, %r8260, %r8260, 20; + add.s32 %r8262, %r8256, %r7852; + add.s32 %r8263, %r8262, %r8261; + xor.b32 %r8264, %r8263, %r8258; + shf.l.wrap.b32 %r8265, %r8264, %r8264, 24; + add.s32 %r8266, %r8265, %r8259; + xor.b32 %r8267, %r8266, %r8261; + shf.l.wrap.b32 %r8268, %r8267, %r8267, 25; + add.s32 %r8269, %r8212, %r7887; + add.s32 %r8270, %r8269, %r8221; + xor.b32 %r8271, %r8270, %r8181; + shf.l.wrap.b32 %r8272, %r8271, %r8271, 16; + add.s32 %r8273, %r8272, %r8196; + xor.b32 %r8274, %r8273, %r8212; + shf.l.wrap.b32 %r8275, %r8274, %r8274, 20; + add.s32 %r8276, %r8270, %r7936; + add.s32 %r8277, %r8276, %r8275; + xor.b32 %r8278, %r8277, %r8272; + shf.l.wrap.b32 %r8279, %r8278, %r8278, 24; + add.s32 %r8280, %r8279, %r8273; + xor.b32 %r8281, %r8280, %r8275; + shf.l.wrap.b32 %r8282, %r8281, %r8281, 25; + add.s32 %r8283, %r8235, %r7880; + add.s32 %r8284, %r8283, %r8254; + xor.b32 %r8285, %r8284, %r8279; + shf.l.wrap.b32 %r8286, %r8285, %r8285, 16; + add.s32 %r8287, %r8286, %r8266; + xor.b32 %r8288, %r8287, %r8254; + shf.l.wrap.b32 %r8289, %r8288, %r8288, 20; + add.s32 %r8290, %r8284, %r7873; + add.s32 %r8291, %r8290, %r8289; + xor.b32 %r8292, %r8291, %r8286; + shf.l.wrap.b32 %r8293, %r8292, %r8292, 24; + add.s32 %r8294, %r8293, %r8287; + xor.b32 %r8295, %r8294, %r8289; + shf.l.wrap.b32 %r8296, %r8295, %r8295, 25; + add.s32 %r8297, %r8249, %r7901; + add.s32 %r8298, %r8297, %r8268; + xor.b32 %r8299, %r8237, %r8298; + shf.l.wrap.b32 %r8300, %r8299, %r8299, 16; + add.s32 %r8301, %r8300, %r8280; + xor.b32 %r8302, %r8301, %r8268; + shf.l.wrap.b32 %r8303, %r8302, %r8302, 20; + add.s32 %r8304, %r8298, %r7838; + add.s32 %r8305, %r8304, %r8303; + xor.b32 %r8306, %r8305, %r8300; + shf.l.wrap.b32 %r8307, %r8306, %r8306, 24; + add.s32 %r8308, %r8307, %r8301; + xor.b32 %r8309, %r8308, %r8303; + shf.l.wrap.b32 %r8310, %r8309, %r8309, 25; + add.s32 %r8311, %r8263, %r7915; + add.s32 %r8312, %r8311, %r8282; + xor.b32 %r8313, %r8312, %r8251; + shf.l.wrap.b32 %r8314, %r8313, %r8313, 16; + add.s32 %r8315, %r8314, %r8238; + xor.b32 %r8316, %r8315, %r8282; + shf.l.wrap.b32 %r8317, %r8316, %r8316, 20; + add.s32 %r8318, %r8312, %r7943; + add.s32 %r8319, %r8318, %r8317; + xor.b32 %r8320, %r8319, %r8314; + shf.l.wrap.b32 %r8321, %r8320, %r8320, 24; + add.s32 %r8322, %r8321, %r8315; + xor.b32 %r8323, %r8322, %r8317; + shf.l.wrap.b32 %r8324, %r8323, %r8323, 25; + add.s32 %r8325, %r8277, %r7894; + add.s32 %r8326, %r8325, %r8240; + xor.b32 %r8327, %r8326, %r8265; + shf.l.wrap.b32 %r8328, %r8327, %r8327, 16; + add.s32 %r8329, %r8328, %r8252; + xor.b32 %r8330, %r8329, 
%r8240; + shf.l.wrap.b32 %r8331, %r8330, %r8330, 20; + add.s32 %r8332, %r8326, %r7845; + add.s32 %r8333, %r8332, %r8331; + xor.b32 %r8334, %r8333, %r8328; + shf.l.wrap.b32 %r8335, %r8334, %r8334, 24; + add.s32 %r8336, %r8335, %r8329; + xor.b32 %r8337, %r8336, %r8331; + shf.l.wrap.b32 %r8338, %r8337, %r8337, 25; + add.s32 %r8339, %r8291, %r7908; + add.s32 %r8340, %r8339, %r8338; + xor.b32 %r8341, %r8340, %r8307; + shf.l.wrap.b32 %r8342, %r8341, %r8341, 16; + add.s32 %r8343, %r8342, %r8322; + xor.b32 %r8344, %r8343, %r8338; + shf.l.wrap.b32 %r8345, %r8344, %r8344, 20; + add.s32 %r8346, %r8340, %r7887; + add.s32 %r8347, %r8346, %r8345; + xor.b32 %r8348, %r8347, %r8342; + shf.l.wrap.b32 %r8349, %r8348, %r8348, 24; + add.s32 %r8350, %r8349, %r8343; + xor.b32 %r8351, %r8350, %r8345; + shf.l.wrap.b32 %r8352, %r8351, %r8351, 25; + add.s32 %r8353, %r8305, %r7922; + add.s32 %r8354, %r8353, %r8296; + xor.b32 %r8355, %r8354, %r8321; + shf.l.wrap.b32 %r8356, %r8355, %r8355, 16; + add.s32 %r8357, %r8356, %r8336; + xor.b32 %r8358, %r8357, %r8296; + shf.l.wrap.b32 %r8359, %r8358, %r8358, 20; + add.s32 %r8360, %r8354, %r7901; + add.s32 %r8361, %r8360, %r8359; + xor.b32 %r8362, %r8361, %r8356; + shf.l.wrap.b32 %r8363, %r8362, %r8362, 24; + add.s32 %r8364, %r8363, %r8357; + xor.b32 %r8365, %r8364, %r8359; + shf.l.wrap.b32 %r8366, %r8365, %r8365, 25; + add.s32 %r8367, %r8319, %r7936; + add.s32 %r8368, %r8367, %r8310; + xor.b32 %r8369, %r8335, %r8368; + shf.l.wrap.b32 %r8370, %r8369, %r8369, 16; + add.s32 %r8371, %r8370, %r8294; + xor.b32 %r8372, %r8371, %r8310; + shf.l.wrap.b32 %r8373, %r8372, %r8372, 20; + add.s32 %r8374, %r8368, %r7859; + add.s32 %r8375, %r8374, %r8373; + xor.b32 %r8376, %r8375, %r8370; + shf.l.wrap.b32 %r8377, %r8376, %r8376, 24; + add.s32 %r8378, %r8377, %r8371; + xor.b32 %r8379, %r8378, %r8373; + shf.l.wrap.b32 %r8380, %r8379, %r8379, 25; + add.s32 %r8381, %r8333, %r7929; + add.s32 %r8382, %r8381, %r8324; + xor.b32 %r8383, %r8382, %r8293; + shf.l.wrap.b32 %r8384, %r8383, %r8383, 16; + add.s32 %r8385, %r8384, %r8308; + xor.b32 %r8386, %r8385, %r8324; + shf.l.wrap.b32 %r8387, %r8386, %r8386, 20; + add.s32 %r8388, %r8382, %r7943; + add.s32 %r8389, %r8388, %r8387; + xor.b32 %r8390, %r8389, %r8384; + shf.l.wrap.b32 %r8391, %r8390, %r8390, 24; + add.s32 %r8392, %r8391, %r8385; + xor.b32 %r8393, %r8392, %r8387; + shf.l.wrap.b32 %r8394, %r8393, %r8393, 25; + add.s32 %r8395, %r8347, %r7866; + add.s32 %r8396, %r8395, %r8366; + xor.b32 %r8397, %r8396, %r8391; + shf.l.wrap.b32 %r8398, %r8397, %r8397, 16; + add.s32 %r8399, %r8398, %r8378; + xor.b32 %r8400, %r8399, %r8366; + shf.l.wrap.b32 %r8401, %r8400, %r8400, 20; + add.s32 %r8402, %r8396, %r7838; + add.s32 %r8403, %r8402, %r8401; + xor.b32 %r8404, %r8403, %r8398; + shf.l.wrap.b32 %r8405, %r8404, %r8404, 24; + add.s32 %r8406, %r8405, %r8399; + xor.b32 %r8407, %r8406, %r8401; + shf.l.wrap.b32 %r8408, %r8407, %r8407, 25; + add.s32 %r8409, %r8361, %r7915; + add.s32 %r8410, %r8409, %r8380; + xor.b32 %r8411, %r8349, %r8410; + shf.l.wrap.b32 %r8412, %r8411, %r8411, 16; + add.s32 %r8413, %r8412, %r8392; + xor.b32 %r8414, %r8413, %r8380; + shf.l.wrap.b32 %r8415, %r8414, %r8414, 20; + add.s32 %r8416, %r8410, %r7852; + add.s32 %r8417, %r8416, %r8415; + xor.b32 %r8418, %r8417, %r8412; + shf.l.wrap.b32 %r8419, %r8418, %r8418, 24; + add.s32 %r8420, %r8419, %r8413; + xor.b32 %r8421, %r8420, %r8415; + shf.l.wrap.b32 %r8422, %r8421, %r8421, 25; + add.s32 %r8423, %r8375, %r7873; + add.s32 %r8424, %r8423, %r8394; + xor.b32 %r8425, %r8424, %r8363; + shf.l.wrap.b32 
%r8426, %r8425, %r8425, 16; + add.s32 %r8427, %r8426, %r8350; + xor.b32 %r8428, %r8427, %r8394; + shf.l.wrap.b32 %r8429, %r8428, %r8428, 20; + add.s32 %r8430, %r8424, %r7894; + add.s32 %r8431, %r8430, %r8429; + xor.b32 %r8432, %r8431, %r8426; + shf.l.wrap.b32 %r8433, %r8432, %r8432, 24; + add.s32 %r8434, %r8433, %r8427; + xor.b32 %r8435, %r8434, %r8429; + shf.l.wrap.b32 %r8436, %r8435, %r8435, 25; + add.s32 %r8437, %r8389, %r7845; + add.s32 %r8438, %r8437, %r8352; + xor.b32 %r8439, %r8438, %r8377; + shf.l.wrap.b32 %r8440, %r8439, %r8439, 16; + add.s32 %r8441, %r8440, %r8364; + xor.b32 %r8442, %r8441, %r8352; + shf.l.wrap.b32 %r8443, %r8442, %r8442, 20; + add.s32 %r8444, %r8438, %r7880; + add.s32 %r8445, %r8444, %r8443; + xor.b32 %r8446, %r8445, %r8440; + shf.l.wrap.b32 %r8447, %r8446, %r8446, 24; + add.s32 %r8448, %r8447, %r8441; + xor.b32 %r8449, %r8448, %r8443; + shf.l.wrap.b32 %r8450, %r8449, %r8449, 25; + add.s32 %r8451, %r8403, %r7922; + add.s32 %r8452, %r8451, %r8450; + xor.b32 %r8453, %r8452, %r8419; + shf.l.wrap.b32 %r8454, %r8453, %r8453, 16; + add.s32 %r8455, %r8454, %r8434; + xor.b32 %r8456, %r8455, %r8450; + shf.l.wrap.b32 %r8457, %r8456, %r8456, 20; + add.s32 %r8458, %r8452, %r7929; + add.s32 %r8459, %r8458, %r8457; + xor.b32 %r8460, %r8459, %r8454; + shf.l.wrap.b32 %r8461, %r8460, %r8460, 24; + add.s32 %r8462, %r8461, %r8455; + xor.b32 %r8463, %r8462, %r8457; + shf.l.wrap.b32 %r8464, %r8463, %r8463, 25; + add.s32 %r8465, %r8417, %r7901; + add.s32 %r8466, %r8465, %r8408; + xor.b32 %r8467, %r8466, %r8433; + shf.l.wrap.b32 %r8468, %r8467, %r8467, 16; + add.s32 %r8469, %r8468, %r8448; + xor.b32 %r8470, %r8469, %r8408; + shf.l.wrap.b32 %r8471, %r8470, %r8470, 20; + add.s32 %r8472, %r8466, %r7915; + add.s32 %r8473, %r8472, %r8471; + xor.b32 %r8474, %r8473, %r8468; + shf.l.wrap.b32 %r8475, %r8474, %r8474, 24; + add.s32 %r8476, %r8475, %r8469; + xor.b32 %r8477, %r8476, %r8471; + shf.l.wrap.b32 %r8478, %r8477, %r8477, 25; + add.s32 %r8479, %r8431, %r7943; + add.s32 %r8480, %r8479, %r8422; + xor.b32 %r8481, %r8447, %r8480; + shf.l.wrap.b32 %r8482, %r8481, %r8481, 16; + add.s32 %r8483, %r8482, %r8406; + xor.b32 %r8484, %r8483, %r8422; + shf.l.wrap.b32 %r8485, %r8484, %r8484, 20; + add.s32 %r8486, %r8480, %r7908; + add.s32 %r8487, %r8486, %r8485; + xor.b32 %r8488, %r8487, %r8482; + shf.l.wrap.b32 %r8489, %r8488, %r8488, 24; + add.s32 %r8490, %r8489, %r8483; + xor.b32 %r8491, %r8490, %r8485; + shf.l.wrap.b32 %r8492, %r8491, %r8491, 25; + add.s32 %r8493, %r8445, %r7936; + add.s32 %r8494, %r8493, %r8436; + xor.b32 %r8495, %r8494, %r8405; + shf.l.wrap.b32 %r8496, %r8495, %r8495, 16; + add.s32 %r8497, %r8496, %r8420; + xor.b32 %r8498, %r8497, %r8436; + shf.l.wrap.b32 %r8499, %r8498, %r8498, 20; + add.s32 %r8500, %r8494, %r7894; + add.s32 %r8501, %r8500, %r8499; + xor.b32 %r8502, %r8501, %r8496; + shf.l.wrap.b32 %r8503, %r8502, %r8502, 24; + add.s32 %r8504, %r8503, %r8497; + xor.b32 %r8505, %r8504, %r8499; + shf.l.wrap.b32 %r8506, %r8505, %r8505, 25; + add.s32 %r8507, %r8459, %r7887; + add.s32 %r8508, %r8507, %r8478; + xor.b32 %r8509, %r8508, %r8503; + shf.l.wrap.b32 %r8510, %r8509, %r8509, 16; + add.s32 %r8511, %r8510, %r8490; + xor.b32 %r8512, %r8511, %r8478; + shf.l.wrap.b32 %r8513, %r8512, %r8512, 20; + add.s32 %r8514, %r8508, %r7852; + add.s32 %r8515, %r8514, %r8513; + xor.b32 %r8516, %r8515, %r8510; + shf.l.wrap.b32 %r8517, %r8516, %r8516, 24; + add.s32 %r8518, %r8517, %r8511; + xor.b32 %r8519, %r8518, %r8513; + shf.l.wrap.b32 %r8520, %r8519, %r8519, 25; + add.s32 %r8521, %r8473, %r7873; 
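+ // [editor's annotation] These rounds sit inside the $L__BB1_45 loop: each iteration
+ // gathers 64 bytes with byte-wise ld.local.u8 + prmt.b32, compresses them against the
+ // BLAKE3 IV (1779033703 = 0x6A09E667, -1150833019 = 0xBB67AE85, 1013904242 = 0x3C6EF372,
+ // -1521486534 = 0xA54FF53A), and appears to store the resulting 32-byte chaining value
+ // back over its input — folding one level of the hash tree per pass.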
+ add.s32 %r8522, %r8521, %r8492; + xor.b32 %r8523, %r8461, %r8522; + shf.l.wrap.b32 %r8524, %r8523, %r8523, 16; + add.s32 %r8525, %r8524, %r8504; + xor.b32 %r8526, %r8525, %r8492; + shf.l.wrap.b32 %r8527, %r8526, %r8526, 20; + add.s32 %r8528, %r8522, %r7859; + add.s32 %r8529, %r8528, %r8527; + xor.b32 %r8530, %r8529, %r8524; + shf.l.wrap.b32 %r8531, %r8530, %r8530, 24; + add.s32 %r8532, %r8531, %r8525; + xor.b32 %r8533, %r8532, %r8527; + shf.l.wrap.b32 %r8534, %r8533, %r8533, 25; + add.s32 %r8535, %r8487, %r7838; + add.s32 %r8536, %r8535, %r8506; + xor.b32 %r8537, %r8536, %r8475; + shf.l.wrap.b32 %r8538, %r8537, %r8537, 16; + add.s32 %r8539, %r8538, %r8462; + xor.b32 %r8540, %r8539, %r8506; + shf.l.wrap.b32 %r8541, %r8540, %r8540, 20; + add.s32 %r8542, %r8536, %r7845; + add.s32 %r8543, %r8542, %r8541; + xor.b32 %r8544, %r8543, %r8538; + shf.l.wrap.b32 %r8545, %r8544, %r8544, 24; + add.s32 %r8546, %r8545, %r8539; + xor.b32 %r8547, %r8546, %r8541; + shf.l.wrap.b32 %r8548, %r8547, %r8547, 25; + add.s32 %r8549, %r8501, %r7880; + add.s32 %r8550, %r8549, %r8464; + xor.b32 %r8551, %r8550, %r8489; + shf.l.wrap.b32 %r8552, %r8551, %r8551, 16; + add.s32 %r8553, %r8552, %r8476; + xor.b32 %r8554, %r8553, %r8464; + shf.l.wrap.b32 %r8555, %r8554, %r8554, 20; + add.s32 %r8556, %r8550, %r7866; + add.s32 %r8557, %r8556, %r8555; + xor.b32 %r8558, %r8557, %r8552; + shf.l.wrap.b32 %r8559, %r8558, %r8558, 24; + add.s32 %r8560, %r8559, %r8553; + xor.b32 %r8561, %r8560, %r8555; + shf.l.wrap.b32 %r8562, %r8561, %r8561, 25; + add.s32 %r8563, %r8515, %r7901; + add.s32 %r8564, %r8563, %r8562; + xor.b32 %r8565, %r8564, %r8531; + shf.l.wrap.b32 %r8566, %r8565, %r8565, 16; + add.s32 %r8567, %r8566, %r8546; + xor.b32 %r8568, %r8567, %r8562; + shf.l.wrap.b32 %r8569, %r8568, %r8568, 20; + add.s32 %r8570, %r8564, %r7936; + add.s32 %r8571, %r8570, %r8569; + xor.b32 %r8572, %r8571, %r8566; + shf.l.wrap.b32 %r8573, %r8572, %r8572, 24; + add.s32 %r8574, %r8573, %r8567; + xor.b32 %r8575, %r8574, %r8569; + shf.l.wrap.b32 %r8576, %r8575, %r8575, 25; + add.s32 %r8577, %r8529, %r7915; + add.s32 %r8578, %r8577, %r8520; + xor.b32 %r8579, %r8578, %r8545; + shf.l.wrap.b32 %r8580, %r8579, %r8579, 16; + add.s32 %r8581, %r8580, %r8560; + xor.b32 %r8582, %r8581, %r8520; + shf.l.wrap.b32 %r8583, %r8582, %r8582, 20; + add.s32 %r8584, %r8578, %r7873; + add.s32 %r8585, %r8584, %r8583; + xor.b32 %r8586, %r8585, %r8580; + shf.l.wrap.b32 %r8587, %r8586, %r8586, 24; + add.s32 %r8588, %r8587, %r8581; + xor.b32 %r8589, %r8588, %r8583; + shf.l.wrap.b32 %r8590, %r8589, %r8589, 25; + add.s32 %r8591, %r8543, %r7894; + add.s32 %r8592, %r8591, %r8534; + xor.b32 %r8593, %r8559, %r8592; + shf.l.wrap.b32 %r8594, %r8593, %r8593, 16; + add.s32 %r8595, %r8594, %r8518; + xor.b32 %r8596, %r8595, %r8534; + shf.l.wrap.b32 %r8597, %r8596, %r8596, 20; + add.s32 %r8598, %r8592, %r7922; + add.s32 %r8599, %r8598, %r8597; + xor.b32 %r8600, %r8599, %r8594; + shf.l.wrap.b32 %r8601, %r8600, %r8600, 24; + add.s32 %r8602, %r8601, %r8595; + xor.b32 %r8603, %r8602, %r8597; + shf.l.wrap.b32 %r8604, %r8603, %r8603, 25; + add.s32 %r8605, %r8557, %r7943; + add.s32 %r8606, %r8605, %r8548; + xor.b32 %r8607, %r8606, %r8517; + shf.l.wrap.b32 %r8608, %r8607, %r8607, 16; + add.s32 %r8609, %r8608, %r8532; + xor.b32 %r8610, %r8609, %r8548; + shf.l.wrap.b32 %r8611, %r8610, %r8610, 20; + add.s32 %r8612, %r8606, %r7845; + add.s32 %r8613, %r8612, %r8611; + xor.b32 %r8614, %r8613, %r8608; + shf.l.wrap.b32 %r8615, %r8614, %r8614, 24; + add.s32 %r8616, %r8615, %r8609; + xor.b32 %r8617, %r8616, 
%r8611; + shf.l.wrap.b32 %r8618, %r8617, %r8617, 25; + add.s32 %r8619, %r8571, %r7929; + add.s32 %r8620, %r8619, %r8590; + xor.b32 %r8621, %r8620, %r8615; + shf.l.wrap.b32 %r8622, %r8621, %r8621, 16; + add.s32 %r8623, %r8622, %r8602; + xor.b32 %r8624, %r8623, %r8590; + shf.l.wrap.b32 %r8625, %r8624, %r8624, 20; + add.s32 %r8626, %r8620, %r7859; + add.s32 %r8627, %r8626, %r8625; + xor.b32 %r8628, %r8627, %r8622; + shf.l.wrap.b32 %r8629, %r8628, %r8628, 24; + add.s32 %r8630, %r8629, %r8623; + xor.b32 %r8631, %r8630, %r8625; + shf.l.wrap.b32 %r8632, %r8631, %r8631, 25; + add.s32 %r8633, %r8585, %r7838; + add.s32 %r8634, %r8633, %r8604; + xor.b32 %r8635, %r8573, %r8634; + shf.l.wrap.b32 %r8636, %r8635, %r8635, 16; + add.s32 %r8637, %r8636, %r8616; + xor.b32 %r8638, %r8637, %r8604; + shf.l.wrap.b32 %r8639, %r8638, %r8638, 20; + add.s32 %r8640, %r8634, %r7908; + add.s32 %r8641, %r8640, %r8639; + xor.b32 %r8642, %r8641, %r8636; + shf.l.wrap.b32 %r8643, %r8642, %r8642, 24; + add.s32 %r8644, %r8643, %r8637; + xor.b32 %r8645, %r8644, %r8639; + shf.l.wrap.b32 %r8646, %r8645, %r8645, 25; + add.s32 %r8647, %r8599, %r7852; + add.s32 %r8648, %r8647, %r8618; + xor.b32 %r8649, %r8648, %r8587; + shf.l.wrap.b32 %r8650, %r8649, %r8649, 16; + add.s32 %r8651, %r8650, %r8574; + xor.b32 %r8652, %r8651, %r8618; + shf.l.wrap.b32 %r8653, %r8652, %r8652, 20; + add.s32 %r8654, %r8648, %r7880; + add.s32 %r8655, %r8654, %r8653; + xor.b32 %r8656, %r8655, %r8650; + shf.l.wrap.b32 %r8657, %r8656, %r8656, 24; + add.s32 %r8658, %r8657, %r8651; + xor.b32 %r8659, %r8658, %r8653; + shf.l.wrap.b32 %r8660, %r8659, %r8659, 25; + add.s32 %r8661, %r8613, %r7866; + add.s32 %r8662, %r8661, %r8576; + xor.b32 %r8663, %r8662, %r8601; + shf.l.wrap.b32 %r8664, %r8663, %r8663, 16; + add.s32 %r8665, %r8664, %r8588; + xor.b32 %r8666, %r8665, %r8576; + shf.l.wrap.b32 %r8667, %r8666, %r8666, 20; + add.s32 %r8668, %r8662, %r7887; + add.s32 %r8669, %r8668, %r8667; + xor.b32 %r8670, %r8669, %r8664; + shf.l.wrap.b32 %r8671, %r8670, %r8670, 24; + add.s32 %r8672, %r8671, %r8665; + xor.b32 %r8673, %r8672, %r8667; + shf.l.wrap.b32 %r8674, %r8673, %r8673, 25; + add.s32 %r8675, %r8627, %r7915; + add.s32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r8676, %r8643; + shf.l.wrap.b32 %r8678, %r8677, %r8677, 16; + add.s32 %r8679, %r8678, %r8658; + xor.b32 %r8680, %r8679, %r8674; + shf.l.wrap.b32 %r8681, %r8680, %r8680, 20; + add.s32 %r8682, %r8676, %r7943; + add.s32 %r8683, %r8682, %r8681; + xor.b32 %r8684, %r8683, %r8678; + shf.l.wrap.b32 %r8685, %r8684, %r8684, 24; + add.s32 %r8686, %r8685, %r8679; + xor.b32 %r8687, %r8686, %r8681; + shf.l.wrap.b32 %r8688, %r8687, %r8687, 25; + add.s32 %r8689, %r8641, %r7873; + add.s32 %r8690, %r8689, %r8632; + xor.b32 %r8691, %r8690, %r8657; + shf.l.wrap.b32 %r8692, %r8691, %r8691, 16; + add.s32 %r8693, %r8692, %r8672; + xor.b32 %r8694, %r8693, %r8632; + shf.l.wrap.b32 %r8695, %r8694, %r8694, 20; + add.s32 %r8696, %r8690, %r7838; + add.s32 %r8697, %r8696, %r8695; + xor.b32 %r8698, %r8697, %r8692; + shf.l.wrap.b32 %r8699, %r8698, %r8698, 24; + add.s32 %r8700, %r8699, %r8693; + xor.b32 %r8701, %r8700, %r8695; + shf.l.wrap.b32 %r8702, %r8701, %r8701, 25; + add.s32 %r8703, %r8655, %r7845; + add.s32 %r8704, %r8703, %r8646; + xor.b32 %r8705, %r8671, %r8704; + shf.l.wrap.b32 %r8706, %r8705, %r8705, 16; + add.s32 %r8707, %r8706, %r8630; + xor.b32 %r8708, %r8707, %r8646; + shf.l.wrap.b32 %r8709, %r8708, %r8708, 20; + add.s32 %r8710, %r8704, %r7901; + add.s32 %r8711, %r8710, %r8709; + xor.b32 %r8712, %r8711, %r8706; + shf.l.wrap.b32 
%r8713, %r8712, %r8712, 24; + add.s32 %r8714, %r8713, %r8707; + xor.b32 %r8715, %r8714, %r8709; + shf.l.wrap.b32 %r8716, %r8715, %r8715, 25; + add.s32 %r8717, %r8669, %r7894; + add.s32 %r8718, %r8717, %r8660; + xor.b32 %r8719, %r8718, %r8629; + shf.l.wrap.b32 %r8720, %r8719, %r8719, 16; + add.s32 %r8721, %r8720, %r8644; + xor.b32 %r8722, %r8721, %r8660; + shf.l.wrap.b32 %r8723, %r8722, %r8722, 20; + add.s32 %r8724, %r8718, %r7880; + add.s32 %r8725, %r8724, %r8723; + xor.b32 %r8726, %r8725, %r8720; + shf.l.wrap.b32 %r8727, %r8726, %r8726, 24; + add.s32 %r8728, %r8727, %r8721; + xor.b32 %r8729, %r8728, %r8723; + shf.l.wrap.b32 %r8730, %r8729, %r8729, 25; + add.s32 %r8731, %r8683, %r7936; + add.s32 %r8732, %r8731, %r8702; + xor.b32 %r8733, %r8732, %r8727; + shf.l.wrap.b32 %r8734, %r8733, %r8733, 16; + add.s32 %r8735, %r8734, %r8714; + xor.b32 %r8736, %r8735, %r8702; + shf.l.wrap.b32 %r8737, %r8736, %r8736, 20; + add.s32 %r8738, %r8732, %r7908; + add.s32 %r8739, %r8738, %r8737; + xor.b32 %r8740, %r8739, %r8734; + shf.l.wrap.b32 %r8741, %r8740, %r8740, 24; + add.s32 %r8742, %r8741, %r8735; + xor.b32 %r8743, %r8742, %r8737; + shf.l.wrap.b32 %r8744, %r8743, %r8743, 25; + add.s32 %r8745, %r8697, %r7852; + add.s32 %r8746, %r8745, %r8716; + xor.b32 %r8747, %r8685, %r8746; + shf.l.wrap.b32 %r8748, %r8747, %r8747, 16; + add.s32 %r8749, %r8748, %r8728; + xor.b32 %r8750, %r8749, %r8716; + shf.l.wrap.b32 %r8751, %r8750, %r8750, 20; + add.s32 %r8752, %r8746, %r7922; + add.s32 %r8753, %r8752, %r8751; + xor.b32 %r8754, %r8753, %r8748; + shf.l.wrap.b32 %r8755, %r8754, %r8754, 24; + add.s32 %r8756, %r8755, %r8749; + xor.b32 %r8757, %r8756, %r8751; + shf.l.wrap.b32 %r8758, %r8757, %r8757, 25; + add.s32 %r8759, %r8711, %r7859; + add.s32 %r8760, %r8759, %r8730; + xor.b32 %r8761, %r8760, %r8699; + shf.l.wrap.b32 %r8762, %r8761, %r8761, 16; + add.s32 %r8763, %r8762, %r8686; + xor.b32 %r8764, %r8763, %r8730; + shf.l.wrap.b32 %r8765, %r8764, %r8764, 20; + add.s32 %r8766, %r8760, %r7866; + add.s32 %r8767, %r8766, %r8765; + xor.b32 %r8768, %r8767, %r8762; + shf.l.wrap.b32 %r8769, %r8768, %r8768, 24; + add.s32 %r8770, %r8769, %r8763; + xor.b32 %r8771, %r8770, %r8765; + shf.l.wrap.b32 %r8772, %r8771, %r8771, 25; + add.s32 %r8773, %r8725, %r7887; + add.s32 %r8774, %r8773, %r8688; + xor.b32 %r8775, %r8774, %r8713; + shf.l.wrap.b32 %r8776, %r8775, %r8775, 16; + add.s32 %r8777, %r8776, %r8700; + xor.b32 %r8778, %r8777, %r8688; + shf.l.wrap.b32 %r8779, %r8778, %r8778, 20; + add.s32 %r8780, %r8774, %r7929; + add.s32 %r8781, %r8780, %r8779; + xor.b32 %r8782, %r8781, %r8776; + shf.l.wrap.b32 %r8783, %r8782, %r8782, 24; + add.s32 %r8784, %r8783, %r8777; + xor.b32 %r8785, %r8784, %r8779; + shf.l.wrap.b32 %r8786, %r8785, %r8785, 25; + xor.b32 %r8787, %r8770, %r8739; + xor.b32 %r8788, %r8784, %r8753; + xor.b32 %r8789, %r8742, %r8767; + xor.b32 %r8790, %r8781, %r8756; + xor.b32 %r8791, %r8786, %r8755; + xor.b32 %r8792, %r8744, %r8769; + xor.b32 %r8793, %r8783, %r8758; + xor.b32 %r8794, %r8772, %r8741; + st.local.u8 [%rd191+145], %r8787; + shr.u32 %r8795, %r8787, 8; + st.local.u8 [%rd191+146], %r8795; + shr.u32 %r8796, %r8787, 16; + st.local.u8 [%rd191+147], %r8796; + shr.u32 %r8797, %r8787, 24; + st.local.u8 [%rd191+148], %r8797; + st.local.u8 [%rd191+149], %r8788; + shr.u32 %r8798, %r8788, 8; + st.local.u8 [%rd191+150], %r8798; + shr.u32 %r8799, %r8788, 16; + st.local.u8 [%rd191+151], %r8799; + shr.u32 %r8800, %r8788, 24; + st.local.u8 [%rd191+152], %r8800; + st.local.u8 [%rd191+153], %r8789; + shr.u32 %r8801, %r8789, 8; + 
st.local.u8 [%rd191+154], %r8801; + shr.u32 %r8802, %r8789, 16; + st.local.u8 [%rd191+155], %r8802; + shr.u32 %r8803, %r8789, 24; + st.local.u8 [%rd191+156], %r8803; + st.local.u8 [%rd191+157], %r8790; + shr.u32 %r8804, %r8790, 8; + st.local.u8 [%rd191+158], %r8804; + shr.u32 %r8805, %r8790, 16; + st.local.u8 [%rd191+159], %r8805; + shr.u32 %r8806, %r8790, 24; + st.local.u8 [%rd191+160], %r8806; + st.local.u8 [%rd191+161], %r8791; + shr.u32 %r8807, %r8791, 8; + st.local.u8 [%rd191+162], %r8807; + shr.u32 %r8808, %r8791, 16; + st.local.u8 [%rd191+163], %r8808; + shr.u32 %r8809, %r8791, 24; + st.local.u8 [%rd191+164], %r8809; + st.local.u8 [%rd191+165], %r8792; + shr.u32 %r8810, %r8792, 8; + st.local.u8 [%rd191+166], %r8810; + shr.u32 %r8811, %r8792, 16; + st.local.u8 [%rd191+167], %r8811; + shr.u32 %r8812, %r8792, 24; + st.local.u8 [%rd191+168], %r8812; + st.local.u8 [%rd191+169], %r8793; + shr.u32 %r8813, %r8793, 8; + st.local.u8 [%rd191+170], %r8813; + shr.u32 %r8814, %r8793, 16; + st.local.u8 [%rd191+171], %r8814; + shr.u32 %r8815, %r8793, 24; + st.local.u8 [%rd191+172], %r8815; + st.local.u8 [%rd191+173], %r8794; + shr.u32 %r8816, %r8794, 8; + st.local.u8 [%rd191+174], %r8816; + shr.u32 %r8817, %r8794, 16; + st.local.u8 [%rd191+175], %r8817; + shr.u32 %r8818, %r8794, 24; + st.local.u8 [%rd191+176], %r8818; + ld.local.u8 %rs328, [%rd3+8]; + add.s16 %rs329, %rs328, -1; + st.local.u8 [%rd3+8], %rs329; + cvt.u64.u16 %rd192, %rs329; + and.b64 %rd193, %rd192, 255; + setp.lt.u64 %p38, %rd230, %rd193; + and.b16 %rs330, %rs329, 255; + mul.wide.u16 %r11681, %rs330, 32; + @%p38 bra $L__BB1_45; + +$L__BB1_46: + cvt.s64.s32 %rd194, %r11681; + add.s64 %rd195, %rd2, %rd194; + st.local.u8 [%rd195+145], %r97; + shr.u32 %r8819, %r97, 8; + st.local.u8 [%rd195+146], %r8819; + shr.u32 %r8820, %r97, 16; + st.local.u8 [%rd195+147], %r8820; + shr.u32 %r8821, %r97, 24; + st.local.u8 [%rd195+148], %r8821; + st.local.u8 [%rd195+149], %r98; + shr.u32 %r8822, %r98, 8; + st.local.u8 [%rd195+150], %r8822; + shr.u32 %r8823, %r98, 16; + st.local.u8 [%rd195+151], %r8823; + shr.u32 %r8824, %r98, 24; + st.local.u8 [%rd195+152], %r8824; + st.local.u8 [%rd195+153], %r99; + shr.u32 %r8825, %r99, 8; + st.local.u8 [%rd195+154], %r8825; + shr.u32 %r8826, %r99, 16; + st.local.u8 [%rd195+155], %r8826; + shr.u32 %r8827, %r99, 24; + st.local.u8 [%rd195+156], %r8827; + st.local.u8 [%rd195+157], %r100; + shr.u32 %r8828, %r100, 8; + st.local.u8 [%rd195+158], %r8828; + shr.u32 %r8829, %r100, 16; + st.local.u8 [%rd195+159], %r8829; + shr.u32 %r8830, %r100, 24; + st.local.u8 [%rd195+160], %r8830; + st.local.u8 [%rd195+161], %r101; + shr.u32 %r8831, %r101, 8; + st.local.u8 [%rd195+162], %r8831; + shr.u32 %r8832, %r101, 16; + st.local.u8 [%rd195+163], %r8832; + shr.u32 %r8833, %r101, 24; + st.local.u8 [%rd195+164], %r8833; + st.local.u8 [%rd195+165], %r102; + shr.u32 %r8834, %r102, 8; + st.local.u8 [%rd195+166], %r8834; + shr.u32 %r8835, %r102, 16; + st.local.u8 [%rd195+167], %r8835; + shr.u32 %r8836, %r102, 24; + st.local.u8 [%rd195+168], %r8836; + st.local.u8 [%rd195+169], %r103; + shr.u32 %r8837, %r103, 8; + st.local.u8 [%rd195+170], %r8837; + shr.u32 %r8838, %r103, 16; + st.local.u8 [%rd195+171], %r8838; + shr.u32 %r8839, %r103, 24; + st.local.u8 [%rd195+172], %r8839; + st.local.u8 [%rd195+173], %r104; + shr.u32 %r8840, %r104, 8; + st.local.u8 [%rd195+174], %r8840; + shr.u32 %r8841, %r104, 16; + st.local.u8 [%rd195+175], %r8841; + shr.u32 %r8842, %r104, 24; + st.local.u8 [%rd195+176], %r8842; + ld.local.u8 %rs388, [%rd3+8]; + bra.uni 
$L__BB1_47; + +$L__BB1_29: + cvt.u32.u16 %r3957, %rs14; + and.b32 %r3958, %r3957, 255; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd254; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b64 param2; + st.param.b64 [param2+0], %rd98; + .param .b64 param3; + st.param.b64 [param3+0], %rd251; + .param .b32 param4; + st.param.b32 [param4+0], %r3958; + .param .b64 param5; + st.param.b64 [param5+0], %rd142; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd164, [retval0+0]; + } // callseq 2 + ld.local.v4.u32 {%r3959, %r3960, %r3961, %r3962}, [%rd42]; + ld.local.v4.u32 {%r3963, %r3964, %r3965, %r3966}, [%rd42+16]; + ld.local.v4.u32 {%r3967, %r3968, %r3969, %r3970}, [%rd42+32]; + ld.local.v4.u32 {%r3971, %r3972, %r3973, %r3974}, [%rd42+48]; + ld.local.u64 %rd165, [%rd3+-72]; + popc.b64 %r3975, %rd165; + cvt.u64.u32 %rd51, %r3975; + ld.local.u8 %rs137, [%rd3+8]; + cvt.u64.u16 %rd166, %rs137; + setp.ge.u64 %p27, %rd51, %rd166; + mul.wide.u16 %r11661, %rs137, 32; + @%p27 bra $L__BB1_32; + +$L__BB1_31: + popc.b64 %r11647, %rd165; + cvt.u64.u32 %rd226, %r11647; + add.s32 %r3976, %r11661, -64; + cvt.s64.s32 %rd167, %r3976; + add.s64 %rd168, %rd2, %rd167; + ld.local.u8 %r3977, [%rd3+2]; + ld.local.u8 %r3978, [%rd168+145]; + ld.local.u8 %r3979, [%rd168+146]; + prmt.b32 %r3980, %r3979, %r3978, 30212; + ld.local.u8 %r3981, [%rd168+147]; + prmt.b32 %r3982, %r3981, %r3980, 28756; + ld.local.u8 %r3983, [%rd168+148]; + prmt.b32 %r3984, %r3983, %r3982, 1620; + ld.local.u8 %r3985, [%rd168+149]; + ld.local.u8 %r3986, [%rd168+150]; + prmt.b32 %r3987, %r3986, %r3985, 30212; + ld.local.u8 %r3988, [%rd168+151]; + prmt.b32 %r3989, %r3988, %r3987, 28756; + ld.local.u8 %r3990, [%rd168+152]; + prmt.b32 %r3991, %r3990, %r3989, 1620; + ld.local.u8 %r3992, [%rd168+153]; + ld.local.u8 %r3993, [%rd168+154]; + prmt.b32 %r3994, %r3993, %r3992, 30212; + ld.local.u8 %r3995, [%rd168+155]; + prmt.b32 %r3996, %r3995, %r3994, 28756; + ld.local.u8 %r3997, [%rd168+156]; + prmt.b32 %r3998, %r3997, %r3996, 1620; + ld.local.u8 %r3999, [%rd168+157]; + ld.local.u8 %r4000, [%rd168+158]; + prmt.b32 %r4001, %r4000, %r3999, 30212; + ld.local.u8 %r4002, [%rd168+159]; + prmt.b32 %r4003, %r4002, %r4001, 28756; + ld.local.u8 %r4004, [%rd168+160]; + prmt.b32 %r4005, %r4004, %r4003, 1620; + ld.local.u8 %r4006, [%rd168+161]; + ld.local.u8 %r4007, [%rd168+162]; + prmt.b32 %r4008, %r4007, %r4006, 30212; + ld.local.u8 %r4009, [%rd168+163]; + prmt.b32 %r4010, %r4009, %r4008, 28756; + ld.local.u8 %r4011, [%rd168+164]; + prmt.b32 %r4012, %r4011, %r4010, 1620; + ld.local.u8 %r4013, [%rd168+165]; + ld.local.u8 %r4014, [%rd168+166]; + prmt.b32 %r4015, %r4014, %r4013, 30212; + ld.local.u8 %r4016, [%rd168+167]; + prmt.b32 %r4017, %r4016, %r4015, 28756; + ld.local.u8 %r4018, [%rd168+168]; + prmt.b32 %r4019, %r4018, %r4017, 1620; + ld.local.u8 %r4020, [%rd168+169]; + ld.local.u8 %r4021, [%rd168+170]; + prmt.b32 %r4022, %r4021, %r4020, 30212; + ld.local.u8 %r4023, [%rd168+171]; + prmt.b32 %r4024, %r4023, %r4022, 28756; + ld.local.u8 %r4025, [%rd168+172]; + prmt.b32 %r4026, %r4025, %r4024, 1620; + ld.local.u8 %r4027, [%rd168+173]; + ld.local.u8 %r4028, [%rd168+174]; + prmt.b32 %r4029, %r4028, %r4027, 30212; + ld.local.u8 %r4030, [%rd168+175]; + prmt.b32 %r4031, %r4030, %r4029, 28756; + ld.local.u8 %r4032, [%rd168+176]; + 
prmt.b32 %r4033, %r4032, %r4031, 1620; + ld.local.u8 %r4034, [%rd168+177]; + ld.local.u8 %r4035, [%rd168+178]; + prmt.b32 %r4036, %r4035, %r4034, 30212; + ld.local.u8 %r4037, [%rd168+179]; + prmt.b32 %r4038, %r4037, %r4036, 28756; + ld.local.u8 %r4039, [%rd168+180]; + prmt.b32 %r4040, %r4039, %r4038, 1620; + ld.local.u8 %r4041, [%rd168+181]; + ld.local.u8 %r4042, [%rd168+182]; + prmt.b32 %r4043, %r4042, %r4041, 30212; + ld.local.u8 %r4044, [%rd168+183]; + prmt.b32 %r4045, %r4044, %r4043, 28756; + ld.local.u8 %r4046, [%rd168+184]; + prmt.b32 %r4047, %r4046, %r4045, 1620; + ld.local.u8 %r4048, [%rd168+185]; + ld.local.u8 %r4049, [%rd168+186]; + prmt.b32 %r4050, %r4049, %r4048, 30212; + ld.local.u8 %r4051, [%rd168+187]; + prmt.b32 %r4052, %r4051, %r4050, 28756; + ld.local.u8 %r4053, [%rd168+188]; + prmt.b32 %r4054, %r4053, %r4052, 1620; + ld.local.u8 %r4055, [%rd168+189]; + ld.local.u8 %r4056, [%rd168+190]; + prmt.b32 %r4057, %r4056, %r4055, 30212; + ld.local.u8 %r4058, [%rd168+191]; + prmt.b32 %r4059, %r4058, %r4057, 28756; + ld.local.u8 %r4060, [%rd168+192]; + prmt.b32 %r4061, %r4060, %r4059, 1620; + ld.local.u8 %r4062, [%rd168+193]; + ld.local.u8 %r4063, [%rd168+194]; + prmt.b32 %r4064, %r4063, %r4062, 30212; + ld.local.u8 %r4065, [%rd168+195]; + prmt.b32 %r4066, %r4065, %r4064, 28756; + ld.local.u8 %r4067, [%rd168+196]; + prmt.b32 %r4068, %r4067, %r4066, 1620; + ld.local.u8 %r4069, [%rd168+197]; + ld.local.u8 %r4070, [%rd168+198]; + prmt.b32 %r4071, %r4070, %r4069, 30212; + ld.local.u8 %r4072, [%rd168+199]; + prmt.b32 %r4073, %r4072, %r4071, 28756; + ld.local.u8 %r4074, [%rd168+200]; + prmt.b32 %r4075, %r4074, %r4073, 1620; + ld.local.u8 %r4076, [%rd168+201]; + ld.local.u8 %r4077, [%rd168+202]; + prmt.b32 %r4078, %r4077, %r4076, 30212; + ld.local.u8 %r4079, [%rd168+203]; + prmt.b32 %r4080, %r4079, %r4078, 28756; + ld.local.u8 %r4081, [%rd168+204]; + prmt.b32 %r4082, %r4081, %r4080, 1620; + ld.local.u8 %r4083, [%rd168+205]; + ld.local.u8 %r4084, [%rd168+206]; + prmt.b32 %r4085, %r4084, %r4083, 30212; + ld.local.u8 %r4086, [%rd168+207]; + prmt.b32 %r4087, %r4086, %r4085, 28756; + ld.local.u8 %r4088, [%rd168+208]; + prmt.b32 %r4089, %r4088, %r4087, 1620; + or.b32 %r4090, %r3977, 4; + ld.local.u8 %r4091, [%rd3+-120]; + ld.local.u8 %r4092, [%rd3+-119]; + prmt.b32 %r4093, %r4092, %r4091, 30212; + ld.local.u8 %r4094, [%rd3+-118]; + ld.local.u8 %r4095, [%rd3+-117]; + prmt.b32 %r4096, %r4095, %r4094, 30212; + prmt.b32 %r4097, %r4096, %r4093, 4180; + ld.local.u8 %r4098, [%rd3+-136]; + ld.local.u8 %r4099, [%rd3+-135]; + prmt.b32 %r4100, %r4099, %r4098, 30212; + ld.local.u8 %r4101, [%rd3+-134]; + ld.local.u8 %r4102, [%rd3+-133]; + prmt.b32 %r4103, %r4102, %r4101, 30212; + prmt.b32 %r4104, %r4103, %r4100, 4180; + add.s32 %r4105, %r4097, %r4104; + add.s32 %r4106, %r4105, %r3984; + shf.l.wrap.b32 %r4107, %r4106, %r4106, 16; + add.s32 %r4108, %r4107, 1779033703; + xor.b32 %r4109, %r4108, %r4097; + shf.l.wrap.b32 %r4110, %r4109, %r4109, 20; + add.s32 %r4111, %r3991, %r4106; + add.s32 %r4112, %r4111, %r4110; + xor.b32 %r4113, %r4112, %r4107; + shf.l.wrap.b32 %r4114, %r4113, %r4113, 24; + add.s32 %r4115, %r4114, %r4108; + xor.b32 %r4116, %r4115, %r4110; + shf.l.wrap.b32 %r4117, %r4116, %r4116, 25; + ld.local.u8 %r4118, [%rd3+-116]; + ld.local.u8 %r4119, [%rd3+-115]; + prmt.b32 %r4120, %r4119, %r4118, 30212; + ld.local.u8 %r4121, [%rd3+-114]; + ld.local.u8 %r4122, [%rd3+-113]; + prmt.b32 %r4123, %r4122, %r4121, 30212; + prmt.b32 %r4124, %r4123, %r4120, 4180; + ld.local.u8 %r4125, [%rd3+-132]; + ld.local.u8 
%r4126, [%rd3+-131]; + prmt.b32 %r4127, %r4126, %r4125, 30212; + ld.local.u8 %r4128, [%rd3+-130]; + ld.local.u8 %r4129, [%rd3+-129]; + prmt.b32 %r4130, %r4129, %r4128, 30212; + prmt.b32 %r4131, %r4130, %r4127, 4180; + add.s32 %r4132, %r4124, %r4131; + add.s32 %r4133, %r4132, %r3998; + shf.l.wrap.b32 %r4134, %r4133, %r4133, 16; + add.s32 %r4135, %r4134, -1150833019; + xor.b32 %r4136, %r4135, %r4124; + shf.l.wrap.b32 %r4137, %r4136, %r4136, 20; + add.s32 %r4138, %r4005, %r4133; + add.s32 %r4139, %r4138, %r4137; + xor.b32 %r4140, %r4139, %r4134; + shf.l.wrap.b32 %r4141, %r4140, %r4140, 24; + add.s32 %r4142, %r4141, %r4135; + xor.b32 %r4143, %r4142, %r4137; + shf.l.wrap.b32 %r4144, %r4143, %r4143, 25; + ld.local.u8 %r4145, [%rd3+-112]; + ld.local.u8 %r4146, [%rd3+-111]; + prmt.b32 %r4147, %r4146, %r4145, 30212; + ld.local.u8 %r4148, [%rd3+-110]; + ld.local.u8 %r4149, [%rd3+-109]; + prmt.b32 %r4150, %r4149, %r4148, 30212; + prmt.b32 %r4151, %r4150, %r4147, 4180; + ld.local.u8 %r4152, [%rd3+-128]; + ld.local.u8 %r4153, [%rd3+-127]; + prmt.b32 %r4154, %r4153, %r4152, 30212; + ld.local.u8 %r4155, [%rd3+-126]; + ld.local.u8 %r4156, [%rd3+-125]; + prmt.b32 %r4157, %r4156, %r4155, 30212; + prmt.b32 %r4158, %r4157, %r4154, 4180; + add.s32 %r4159, %r4151, %r4158; + add.s32 %r4160, %r4159, %r4012; + shr.u32 %r4161, %r4160, 16; + shl.b32 %r4162, %r4160, 16; + xor.b32 %r4163, %r4162, 4194304; + or.b32 %r4164, %r4163, %r4161; + add.s32 %r4165, %r4164, 1013904242; + xor.b32 %r4166, %r4165, %r4151; + shf.l.wrap.b32 %r4167, %r4166, %r4166, 20; + add.s32 %r4168, %r4019, %r4160; + add.s32 %r4169, %r4168, %r4167; + xor.b32 %r4170, %r4169, %r4164; + shf.l.wrap.b32 %r4171, %r4170, %r4170, 24; + add.s32 %r4172, %r4171, %r4165; + xor.b32 %r4173, %r4172, %r4167; + shf.l.wrap.b32 %r4174, %r4173, %r4173, 25; + ld.local.u8 %r4175, [%rd3+-108]; + ld.local.u8 %r4176, [%rd3+-107]; + prmt.b32 %r4177, %r4176, %r4175, 30212; + ld.local.u8 %r4178, [%rd3+-106]; + ld.local.u8 %r4179, [%rd3+-105]; + prmt.b32 %r4180, %r4179, %r4178, 30212; + prmt.b32 %r4181, %r4180, %r4177, 4180; + ld.local.u8 %r4182, [%rd3+-124]; + ld.local.u8 %r4183, [%rd3+-123]; + prmt.b32 %r4184, %r4183, %r4182, 30212; + ld.local.u8 %r4185, [%rd3+-122]; + ld.local.u8 %r4186, [%rd3+-121]; + prmt.b32 %r4187, %r4186, %r4185, 30212; + prmt.b32 %r4188, %r4187, %r4184, 4180; + add.s32 %r4189, %r4181, %r4188; + add.s32 %r4190, %r4189, %r4026; + xor.b32 %r4191, %r4190, %r4090; + shr.u32 %r4192, %r4190, 16; + shl.b32 %r4193, %r4191, 16; + or.b32 %r4194, %r4193, %r4192; + add.s32 %r4195, %r4194, -1521486534; + xor.b32 %r4196, %r4195, %r4181; + shf.l.wrap.b32 %r4197, %r4196, %r4196, 20; + add.s32 %r4198, %r4033, %r4190; + add.s32 %r4199, %r4198, %r4197; + xor.b32 %r4200, %r4199, %r4194; + shf.l.wrap.b32 %r4201, %r4200, %r4200, 24; + add.s32 %r4202, %r4201, %r4195; + xor.b32 %r4203, %r4202, %r4197; + shf.l.wrap.b32 %r4204, %r4203, %r4203, 25; + add.s32 %r4205, %r4144, %r4112; + add.s32 %r4206, %r4205, %r4040; + xor.b32 %r4207, %r4201, %r4206; + shf.l.wrap.b32 %r4208, %r4207, %r4207, 16; + add.s32 %r4209, %r4208, %r4172; + xor.b32 %r4210, %r4209, %r4144; + shf.l.wrap.b32 %r4211, %r4210, %r4210, 20; + add.s32 %r4212, %r4047, %r4206; + add.s32 %r4213, %r4212, %r4211; + xor.b32 %r4214, %r4213, %r4208; + shf.l.wrap.b32 %r4215, %r4214, %r4214, 24; + add.s32 %r4216, %r4215, %r4209; + xor.b32 %r4217, %r4216, %r4211; + shf.l.wrap.b32 %r4218, %r4217, %r4217, 25; + add.s32 %r4219, %r4174, %r4139; + add.s32 %r4220, %r4219, %r4054; + xor.b32 %r4221, %r4220, %r4114; + shf.l.wrap.b32 
%r4222, %r4221, %r4221, 16; + add.s32 %r4223, %r4222, %r4202; + xor.b32 %r4224, %r4223, %r4174; + shf.l.wrap.b32 %r4225, %r4224, %r4224, 20; + add.s32 %r4226, %r4061, %r4220; + add.s32 %r4227, %r4226, %r4225; + xor.b32 %r4228, %r4227, %r4222; + shf.l.wrap.b32 %r4229, %r4228, %r4228, 24; + add.s32 %r4230, %r4229, %r4223; + xor.b32 %r4231, %r4230, %r4225; + shf.l.wrap.b32 %r4232, %r4231, %r4231, 25; + add.s32 %r4233, %r4204, %r4169; + add.s32 %r4234, %r4233, %r4068; + xor.b32 %r4235, %r4234, %r4141; + shf.l.wrap.b32 %r4236, %r4235, %r4235, 16; + add.s32 %r4237, %r4236, %r4115; + xor.b32 %r4238, %r4237, %r4204; + shf.l.wrap.b32 %r4239, %r4238, %r4238, 20; + add.s32 %r4240, %r4075, %r4234; + add.s32 %r4241, %r4240, %r4239; + xor.b32 %r4242, %r4241, %r4236; + shf.l.wrap.b32 %r4243, %r4242, %r4242, 24; + add.s32 %r4244, %r4243, %r4237; + xor.b32 %r4245, %r4244, %r4239; + shf.l.wrap.b32 %r4246, %r4245, %r4245, 25; + add.s32 %r4247, %r4199, %r4117; + add.s32 %r4248, %r4247, %r4082; + xor.b32 %r4249, %r4248, %r4171; + shf.l.wrap.b32 %r4250, %r4249, %r4249, 16; + add.s32 %r4251, %r4250, %r4142; + xor.b32 %r4252, %r4251, %r4117; + shf.l.wrap.b32 %r4253, %r4252, %r4252, 20; + add.s32 %r4254, %r4089, %r4248; + add.s32 %r4255, %r4254, %r4253; + xor.b32 %r4256, %r4255, %r4250; + shf.l.wrap.b32 %r4257, %r4256, %r4256, 24; + add.s32 %r4258, %r4257, %r4251; + xor.b32 %r4259, %r4258, %r4253; + shf.l.wrap.b32 %r4260, %r4259, %r4259, 25; + add.s32 %r4261, %r4213, %r3998; + add.s32 %r4262, %r4261, %r4260; + xor.b32 %r4263, %r4262, %r4229; + shf.l.wrap.b32 %r4264, %r4263, %r4263, 16; + add.s32 %r4265, %r4264, %r4244; + xor.b32 %r4266, %r4265, %r4260; + shf.l.wrap.b32 %r4267, %r4266, %r4266, 20; + add.s32 %r4268, %r4262, %r4026; + add.s32 %r4269, %r4268, %r4267; + xor.b32 %r4270, %r4269, %r4264; + shf.l.wrap.b32 %r4271, %r4270, %r4270, 24; + add.s32 %r4272, %r4271, %r4265; + xor.b32 %r4273, %r4272, %r4267; + shf.l.wrap.b32 %r4274, %r4273, %r4273, 25; + add.s32 %r4275, %r4227, %r4005; + add.s32 %r4276, %r4275, %r4218; + xor.b32 %r4277, %r4243, %r4276; + shf.l.wrap.b32 %r4278, %r4277, %r4277, 16; + add.s32 %r4279, %r4258, %r4278; + xor.b32 %r4280, %r4279, %r4218; + shf.l.wrap.b32 %r4281, %r4280, %r4280, 20; + add.s32 %r4282, %r4276, %r4054; + add.s32 %r4283, %r4282, %r4281; + xor.b32 %r4284, %r4283, %r4278; + shf.l.wrap.b32 %r4285, %r4284, %r4284, 24; + add.s32 %r4286, %r4285, %r4279; + xor.b32 %r4287, %r4286, %r4281; + shf.l.wrap.b32 %r4288, %r4287, %r4287, 25; + add.s32 %r4289, %r4232, %r4033; + add.s32 %r4290, %r4289, %r4241; + xor.b32 %r4291, %r4257, %r4290; + shf.l.wrap.b32 %r4292, %r4291, %r4291, 16; + add.s32 %r4293, %r4292, %r4216; + xor.b32 %r4294, %r4293, %r4232; + shf.l.wrap.b32 %r4295, %r4294, %r4294, 20; + add.s32 %r4296, %r4290, %r3984; + add.s32 %r4297, %r4296, %r4295; + xor.b32 %r4298, %r4297, %r4292; + shf.l.wrap.b32 %r4299, %r4298, %r4298, 24; + add.s32 %r4300, %r4299, %r4293; + xor.b32 %r4301, %r4300, %r4295; + shf.l.wrap.b32 %r4302, %r4301, %r4301, 25; + add.s32 %r4303, %r4246, %r4012; + add.s32 %r4304, %r4303, %r4255; + xor.b32 %r4305, %r4304, %r4215; + shf.l.wrap.b32 %r4306, %r4305, %r4305, 16; + add.s32 %r4307, %r4306, %r4230; + xor.b32 %r4308, %r4307, %r4246; + shf.l.wrap.b32 %r4309, %r4308, %r4308, 20; + add.s32 %r4310, %r4304, %r4075; + add.s32 %r4311, %r4310, %r4309; + xor.b32 %r4312, %r4311, %r4306; + shf.l.wrap.b32 %r4313, %r4312, %r4312, 24; + add.s32 %r4314, %r4313, %r4307; + xor.b32 %r4315, %r4314, %r4309; + shf.l.wrap.b32 %r4316, %r4315, %r4315, 25; + add.s32 %r4317, %r4288, %r3991; 
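// NOTE: the add.s32/xor.b32/shf.l.wrap.b32 pattern in these blocks is BLAKE3's
// G quarter-round. A funnel shift left by 16/20/24/25 with both source
// operands equal is a rotate left, i.e. the rotate-rights by 16/12/8/7 from
// the BLAKE3 spec, and the immediates 1779033703, -1150833019, 1013904242,
// -1521486534 folded into the first round are IV[0..3] (0x6A09E667,
// 0xBB67AE85, 0x3C6EF372, 0xA54FF53A). The earlier `or.b32 ..., 4` matches
// BLAKE3's PARENT domain flag for merging two child chaining values.
// Illustrative C sketch of what one inlined G computes (not part of the
// generated PTX):
//
//   static inline uint32_t rotr32(uint32_t x, int n) {
//       return (x >> n) | (x << (32 - n));
//   }
//   /* mix one column or diagonal (a,b,c,d) with message words mx, my */
//   static void g(uint32_t v[16], int a, int b, int c, int d,
//                 uint32_t mx, uint32_t my) {
//       v[a] += v[b] + mx;  v[d] = rotr32(v[d] ^ v[a], 16);
//       v[c] += v[d];       v[b] = rotr32(v[b] ^ v[c], 12);
//       v[a] += v[b] + my;  v[d] = rotr32(v[d] ^ v[a], 8);
//       v[c] += v[d];       v[b] = rotr32(v[b] ^ v[c], 7);
//   }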
+ add.s32 %r4318, %r4317, %r4269; + xor.b32 %r4319, %r4318, %r4313; + shf.l.wrap.b32 %r4320, %r4319, %r4319, 16; + add.s32 %r4321, %r4320, %r4300; + xor.b32 %r4322, %r4321, %r4288; + shf.l.wrap.b32 %r4323, %r4322, %r4322, 20; + add.s32 %r4324, %r4318, %r4061; + add.s32 %r4325, %r4324, %r4323; + xor.b32 %r4326, %r4325, %r4320; + shf.l.wrap.b32 %r4327, %r4326, %r4326, 24; + add.s32 %r4328, %r4327, %r4321; + xor.b32 %r4329, %r4328, %r4323; + shf.l.wrap.b32 %r4330, %r4329, %r4329, 25; + add.s32 %r4331, %r4283, %r4068; + add.s32 %r4332, %r4331, %r4302; + xor.b32 %r4333, %r4271, %r4332; + shf.l.wrap.b32 %r4334, %r4333, %r4333, 16; + add.s32 %r4335, %r4334, %r4314; + xor.b32 %r4336, %r4335, %r4302; + shf.l.wrap.b32 %r4337, %r4336, %r4336, 20; + add.s32 %r4338, %r4332, %r4019; + add.s32 %r4339, %r4338, %r4337; + xor.b32 %r4340, %r4339, %r4334; + shf.l.wrap.b32 %r4341, %r4340, %r4340, 24; + add.s32 %r4342, %r4341, %r4335; + xor.b32 %r4343, %r4342, %r4337; + shf.l.wrap.b32 %r4344, %r4343, %r4343, 25; + add.s32 %r4345, %r4297, %r4047; + add.s32 %r4346, %r4345, %r4316; + xor.b32 %r4347, %r4346, %r4285; + shf.l.wrap.b32 %r4348, %r4347, %r4347, 16; + add.s32 %r4349, %r4348, %r4272; + xor.b32 %r4350, %r4349, %r4316; + shf.l.wrap.b32 %r4351, %r4350, %r4350, 20; + add.s32 %r4352, %r4346, %r4082; + add.s32 %r4353, %r4352, %r4351; + xor.b32 %r4354, %r4353, %r4348; + shf.l.wrap.b32 %r4355, %r4354, %r4354, 24; + add.s32 %r4356, %r4355, %r4349; + xor.b32 %r4357, %r4356, %r4351; + shf.l.wrap.b32 %r4358, %r4357, %r4357, 25; + add.s32 %r4359, %r4311, %r4089; + add.s32 %r4360, %r4359, %r4274; + xor.b32 %r4361, %r4360, %r4299; + shf.l.wrap.b32 %r4362, %r4361, %r4361, 16; + add.s32 %r4363, %r4362, %r4286; + xor.b32 %r4364, %r4363, %r4274; + shf.l.wrap.b32 %r4365, %r4364, %r4364, 20; + add.s32 %r4366, %r4360, %r4040; + add.s32 %r4367, %r4366, %r4365; + xor.b32 %r4368, %r4367, %r4362; + shf.l.wrap.b32 %r4369, %r4368, %r4368, 24; + add.s32 %r4370, %r4369, %r4363; + xor.b32 %r4371, %r4370, %r4365; + shf.l.wrap.b32 %r4372, %r4371, %r4371, 25; + add.s32 %r4373, %r4325, %r4005; + add.s32 %r4374, %r4373, %r4372; + xor.b32 %r4375, %r4374, %r4341; + shf.l.wrap.b32 %r4376, %r4375, %r4375, 16; + add.s32 %r4377, %r4376, %r4356; + xor.b32 %r4378, %r4377, %r4372; + shf.l.wrap.b32 %r4379, %r4378, %r4378, 20; + add.s32 %r4380, %r4374, %r4012; + add.s32 %r4381, %r4380, %r4379; + xor.b32 %r4382, %r4381, %r4376; + shf.l.wrap.b32 %r4383, %r4382, %r4382, 24; + add.s32 %r4384, %r4383, %r4377; + xor.b32 %r4385, %r4384, %r4379; + shf.l.wrap.b32 %r4386, %r4385, %r4385, 25; + add.s32 %r4387, %r4339, %r4054; + add.s32 %r4388, %r4387, %r4330; + xor.b32 %r4389, %r4388, %r4355; + shf.l.wrap.b32 %r4390, %r4389, %r4389, 16; + add.s32 %r4391, %r4390, %r4370; + xor.b32 %r4392, %r4391, %r4330; + shf.l.wrap.b32 %r4393, %r4392, %r4392, 20; + add.s32 %r4394, %r4388, %r4068; + add.s32 %r4395, %r4394, %r4393; + xor.b32 %r4396, %r4395, %r4390; + shf.l.wrap.b32 %r4397, %r4396, %r4396, 24; + add.s32 %r4398, %r4397, %r4391; + xor.b32 %r4399, %r4398, %r4393; + shf.l.wrap.b32 %r4400, %r4399, %r4399, 25; + add.s32 %r4401, %r4353, %r4075; + add.s32 %r4402, %r4401, %r4344; + xor.b32 %r4403, %r4369, %r4402; + shf.l.wrap.b32 %r4404, %r4403, %r4403, 16; + add.s32 %r4405, %r4404, %r4328; + xor.b32 %r4406, %r4405, %r4344; + shf.l.wrap.b32 %r4407, %r4406, %r4406, 20; + add.s32 %r4408, %r4402, %r3998; + add.s32 %r4409, %r4408, %r4407; + xor.b32 %r4410, %r4409, %r4404; + shf.l.wrap.b32 %r4411, %r4410, %r4410, 24; + add.s32 %r4412, %r4411, %r4405; + xor.b32 %r4413, %r4412, 
%r4407; + shf.l.wrap.b32 %r4414, %r4413, %r4413, 25; + add.s32 %r4415, %r4358, %r4033; + add.s32 %r4416, %r4415, %r4367; + xor.b32 %r4417, %r4416, %r4327; + shf.l.wrap.b32 %r4418, %r4417, %r4417, 16; + add.s32 %r4419, %r4418, %r4342; + xor.b32 %r4420, %r4419, %r4358; + shf.l.wrap.b32 %r4421, %r4420, %r4420, 20; + add.s32 %r4422, %r4416, %r4082; + add.s32 %r4423, %r4422, %r4421; + xor.b32 %r4424, %r4423, %r4418; + shf.l.wrap.b32 %r4425, %r4424, %r4424, 24; + add.s32 %r4426, %r4425, %r4419; + xor.b32 %r4427, %r4426, %r4421; + shf.l.wrap.b32 %r4428, %r4427, %r4427, 25; + add.s32 %r4429, %r4381, %r4026; + add.s32 %r4430, %r4429, %r4400; + xor.b32 %r4431, %r4430, %r4425; + shf.l.wrap.b32 %r4432, %r4431, %r4431, 16; + add.s32 %r4433, %r4432, %r4412; + xor.b32 %r4434, %r4433, %r4400; + shf.l.wrap.b32 %r4435, %r4434, %r4434, 20; + add.s32 %r4436, %r4430, %r4019; + add.s32 %r4437, %r4436, %r4435; + xor.b32 %r4438, %r4437, %r4432; + shf.l.wrap.b32 %r4439, %r4438, %r4438, 24; + add.s32 %r4440, %r4439, %r4433; + xor.b32 %r4441, %r4440, %r4435; + shf.l.wrap.b32 %r4442, %r4441, %r4441, 25; + add.s32 %r4443, %r4395, %r4047; + add.s32 %r4444, %r4443, %r4414; + xor.b32 %r4445, %r4383, %r4444; + shf.l.wrap.b32 %r4446, %r4445, %r4445, 16; + add.s32 %r4447, %r4446, %r4426; + xor.b32 %r4448, %r4447, %r4414; + shf.l.wrap.b32 %r4449, %r4448, %r4448, 20; + add.s32 %r4450, %r4444, %r3984; + add.s32 %r4451, %r4450, %r4449; + xor.b32 %r4452, %r4451, %r4446; + shf.l.wrap.b32 %r4453, %r4452, %r4452, 24; + add.s32 %r4454, %r4453, %r4447; + xor.b32 %r4455, %r4454, %r4449; + shf.l.wrap.b32 %r4456, %r4455, %r4455, 25; + add.s32 %r4457, %r4409, %r4061; + add.s32 %r4458, %r4457, %r4428; + xor.b32 %r4459, %r4458, %r4397; + shf.l.wrap.b32 %r4460, %r4459, %r4459, 16; + add.s32 %r4461, %r4460, %r4384; + xor.b32 %r4462, %r4461, %r4428; + shf.l.wrap.b32 %r4463, %r4462, %r4462, 20; + add.s32 %r4464, %r4458, %r4089; + add.s32 %r4465, %r4464, %r4463; + xor.b32 %r4466, %r4465, %r4460; + shf.l.wrap.b32 %r4467, %r4466, %r4466, 24; + add.s32 %r4468, %r4467, %r4461; + xor.b32 %r4469, %r4468, %r4463; + shf.l.wrap.b32 %r4470, %r4469, %r4469, 25; + add.s32 %r4471, %r4423, %r4040; + add.s32 %r4472, %r4471, %r4386; + xor.b32 %r4473, %r4472, %r4411; + shf.l.wrap.b32 %r4474, %r4473, %r4473, 16; + add.s32 %r4475, %r4474, %r4398; + xor.b32 %r4476, %r4475, %r4386; + shf.l.wrap.b32 %r4477, %r4476, %r4476, 20; + add.s32 %r4478, %r4472, %r3991; + add.s32 %r4479, %r4478, %r4477; + xor.b32 %r4480, %r4479, %r4474; + shf.l.wrap.b32 %r4481, %r4480, %r4480, 24; + add.s32 %r4482, %r4481, %r4475; + xor.b32 %r4483, %r4482, %r4477; + shf.l.wrap.b32 %r4484, %r4483, %r4483, 25; + add.s32 %r4485, %r4437, %r4054; + add.s32 %r4486, %r4485, %r4484; + xor.b32 %r4487, %r4486, %r4453; + shf.l.wrap.b32 %r4488, %r4487, %r4487, 16; + add.s32 %r4489, %r4488, %r4468; + xor.b32 %r4490, %r4489, %r4484; + shf.l.wrap.b32 %r4491, %r4490, %r4490, 20; + add.s32 %r4492, %r4486, %r4033; + add.s32 %r4493, %r4492, %r4491; + xor.b32 %r4494, %r4493, %r4488; + shf.l.wrap.b32 %r4495, %r4494, %r4494, 24; + add.s32 %r4496, %r4495, %r4489; + xor.b32 %r4497, %r4496, %r4491; + shf.l.wrap.b32 %r4498, %r4497, %r4497, 25; + add.s32 %r4499, %r4451, %r4068; + add.s32 %r4500, %r4499, %r4442; + xor.b32 %r4501, %r4500, %r4467; + shf.l.wrap.b32 %r4502, %r4501, %r4501, 16; + add.s32 %r4503, %r4502, %r4482; + xor.b32 %r4504, %r4503, %r4442; + shf.l.wrap.b32 %r4505, %r4504, %r4504, 20; + add.s32 %r4506, %r4500, %r4047; + add.s32 %r4507, %r4506, %r4505; + xor.b32 %r4508, %r4507, %r4502; + shf.l.wrap.b32 
%r4509, %r4508, %r4508, 24; + add.s32 %r4510, %r4509, %r4503; + xor.b32 %r4511, %r4510, %r4505; + shf.l.wrap.b32 %r4512, %r4511, %r4511, 25; + add.s32 %r4513, %r4465, %r4082; + add.s32 %r4514, %r4513, %r4456; + xor.b32 %r4515, %r4481, %r4514; + shf.l.wrap.b32 %r4516, %r4515, %r4515, 16; + add.s32 %r4517, %r4516, %r4440; + xor.b32 %r4518, %r4517, %r4456; + shf.l.wrap.b32 %r4519, %r4518, %r4518, 20; + add.s32 %r4520, %r4514, %r4005; + add.s32 %r4521, %r4520, %r4519; + xor.b32 %r4522, %r4521, %r4516; + shf.l.wrap.b32 %r4523, %r4522, %r4522, 24; + add.s32 %r4524, %r4523, %r4517; + xor.b32 %r4525, %r4524, %r4519; + shf.l.wrap.b32 %r4526, %r4525, %r4525, 25; + add.s32 %r4527, %r4479, %r4075; + add.s32 %r4528, %r4527, %r4470; + xor.b32 %r4529, %r4528, %r4439; + shf.l.wrap.b32 %r4530, %r4529, %r4529, 16; + add.s32 %r4531, %r4530, %r4454; + xor.b32 %r4532, %r4531, %r4470; + shf.l.wrap.b32 %r4533, %r4532, %r4532, 20; + add.s32 %r4534, %r4528, %r4089; + add.s32 %r4535, %r4534, %r4533; + xor.b32 %r4536, %r4535, %r4530; + shf.l.wrap.b32 %r4537, %r4536, %r4536, 24; + add.s32 %r4538, %r4537, %r4531; + xor.b32 %r4539, %r4538, %r4533; + shf.l.wrap.b32 %r4540, %r4539, %r4539, 25; + add.s32 %r4541, %r4493, %r4012; + add.s32 %r4542, %r4541, %r4512; + xor.b32 %r4543, %r4542, %r4537; + shf.l.wrap.b32 %r4544, %r4543, %r4543, 16; + add.s32 %r4545, %r4544, %r4524; + xor.b32 %r4546, %r4545, %r4512; + shf.l.wrap.b32 %r4547, %r4546, %r4546, 20; + add.s32 %r4548, %r4542, %r3984; + add.s32 %r4549, %r4548, %r4547; + xor.b32 %r4550, %r4549, %r4544; + shf.l.wrap.b32 %r4551, %r4550, %r4550, 24; + add.s32 %r4552, %r4551, %r4545; + xor.b32 %r4553, %r4552, %r4547; + shf.l.wrap.b32 %r4554, %r4553, %r4553, 25; + add.s32 %r4555, %r4507, %r4061; + add.s32 %r4556, %r4555, %r4526; + xor.b32 %r4557, %r4495, %r4556; + shf.l.wrap.b32 %r4558, %r4557, %r4557, 16; + add.s32 %r4559, %r4558, %r4538; + xor.b32 %r4560, %r4559, %r4526; + shf.l.wrap.b32 %r4561, %r4560, %r4560, 20; + add.s32 %r4562, %r4556, %r3998; + add.s32 %r4563, %r4562, %r4561; + xor.b32 %r4564, %r4563, %r4558; + shf.l.wrap.b32 %r4565, %r4564, %r4564, 24; + add.s32 %r4566, %r4565, %r4559; + xor.b32 %r4567, %r4566, %r4561; + shf.l.wrap.b32 %r4568, %r4567, %r4567, 25; + add.s32 %r4569, %r4521, %r4019; + add.s32 %r4570, %r4569, %r4540; + xor.b32 %r4571, %r4570, %r4509; + shf.l.wrap.b32 %r4572, %r4571, %r4571, 16; + add.s32 %r4573, %r4572, %r4496; + xor.b32 %r4574, %r4573, %r4540; + shf.l.wrap.b32 %r4575, %r4574, %r4574, 20; + add.s32 %r4576, %r4570, %r4040; + add.s32 %r4577, %r4576, %r4575; + xor.b32 %r4578, %r4577, %r4572; + shf.l.wrap.b32 %r4579, %r4578, %r4578, 24; + add.s32 %r4580, %r4579, %r4573; + xor.b32 %r4581, %r4580, %r4575; + shf.l.wrap.b32 %r4582, %r4581, %r4581, 25; + add.s32 %r4583, %r4535, %r3991; + add.s32 %r4584, %r4583, %r4498; + xor.b32 %r4585, %r4584, %r4523; + shf.l.wrap.b32 %r4586, %r4585, %r4585, 16; + add.s32 %r4587, %r4586, %r4510; + xor.b32 %r4588, %r4587, %r4498; + shf.l.wrap.b32 %r4589, %r4588, %r4588, 20; + add.s32 %r4590, %r4584, %r4026; + add.s32 %r4591, %r4590, %r4589; + xor.b32 %r4592, %r4591, %r4586; + shf.l.wrap.b32 %r4593, %r4592, %r4592, 24; + add.s32 %r4594, %r4593, %r4587; + xor.b32 %r4595, %r4594, %r4589; + shf.l.wrap.b32 %r4596, %r4595, %r4595, 25; + add.s32 %r4597, %r4549, %r4068; + add.s32 %r4598, %r4597, %r4596; + xor.b32 %r4599, %r4598, %r4565; + shf.l.wrap.b32 %r4600, %r4599, %r4599, 16; + add.s32 %r4601, %r4600, %r4580; + xor.b32 %r4602, %r4601, %r4596; + shf.l.wrap.b32 %r4603, %r4602, %r4602, 20; + add.s32 %r4604, %r4598, %r4075; 
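// NOTE: each BLAKE3 round applies g() to the four columns and then the four
// diagonals of the 4x4 state (eight G calls per round, seven rounds per
// compression). The way the %r message operands are re-ordered from round to
// round is consistent with the fixed schedule permutation
// [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8] having been folded
// in at compile time rather than applied in a loop.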
+ add.s32 %r4605, %r4604, %r4603; + xor.b32 %r4606, %r4605, %r4600; + shf.l.wrap.b32 %r4607, %r4606, %r4606, 24; + add.s32 %r4608, %r4607, %r4601; + xor.b32 %r4609, %r4608, %r4603; + shf.l.wrap.b32 %r4610, %r4609, %r4609, 25; + add.s32 %r4611, %r4563, %r4047; + add.s32 %r4612, %r4611, %r4554; + xor.b32 %r4613, %r4612, %r4579; + shf.l.wrap.b32 %r4614, %r4613, %r4613, 16; + add.s32 %r4615, %r4614, %r4594; + xor.b32 %r4616, %r4615, %r4554; + shf.l.wrap.b32 %r4617, %r4616, %r4616, 20; + add.s32 %r4618, %r4612, %r4061; + add.s32 %r4619, %r4618, %r4617; + xor.b32 %r4620, %r4619, %r4614; + shf.l.wrap.b32 %r4621, %r4620, %r4620, 24; + add.s32 %r4622, %r4621, %r4615; + xor.b32 %r4623, %r4622, %r4617; + shf.l.wrap.b32 %r4624, %r4623, %r4623, 25; + add.s32 %r4625, %r4577, %r4089; + add.s32 %r4626, %r4625, %r4568; + xor.b32 %r4627, %r4593, %r4626; + shf.l.wrap.b32 %r4628, %r4627, %r4627, 16; + add.s32 %r4629, %r4628, %r4552; + xor.b32 %r4630, %r4629, %r4568; + shf.l.wrap.b32 %r4631, %r4630, %r4630, 20; + add.s32 %r4632, %r4626, %r4054; + add.s32 %r4633, %r4632, %r4631; + xor.b32 %r4634, %r4633, %r4628; + shf.l.wrap.b32 %r4635, %r4634, %r4634, 24; + add.s32 %r4636, %r4635, %r4629; + xor.b32 %r4637, %r4636, %r4631; + shf.l.wrap.b32 %r4638, %r4637, %r4637, 25; + add.s32 %r4639, %r4591, %r4082; + add.s32 %r4640, %r4639, %r4582; + xor.b32 %r4641, %r4640, %r4551; + shf.l.wrap.b32 %r4642, %r4641, %r4641, 16; + add.s32 %r4643, %r4642, %r4566; + xor.b32 %r4644, %r4643, %r4582; + shf.l.wrap.b32 %r4645, %r4644, %r4644, 20; + add.s32 %r4646, %r4640, %r4040; + add.s32 %r4647, %r4646, %r4645; + xor.b32 %r4648, %r4647, %r4642; + shf.l.wrap.b32 %r4649, %r4648, %r4648, 24; + add.s32 %r4650, %r4649, %r4643; + xor.b32 %r4651, %r4650, %r4645; + shf.l.wrap.b32 %r4652, %r4651, %r4651, 25; + add.s32 %r4653, %r4605, %r4033; + add.s32 %r4654, %r4653, %r4624; + xor.b32 %r4655, %r4654, %r4649; + shf.l.wrap.b32 %r4656, %r4655, %r4655, 16; + add.s32 %r4657, %r4656, %r4636; + xor.b32 %r4658, %r4657, %r4624; + shf.l.wrap.b32 %r4659, %r4658, %r4658, 20; + add.s32 %r4660, %r4654, %r3998; + add.s32 %r4661, %r4660, %r4659; + xor.b32 %r4662, %r4661, %r4656; + shf.l.wrap.b32 %r4663, %r4662, %r4662, 24; + add.s32 %r4664, %r4663, %r4657; + xor.b32 %r4665, %r4664, %r4659; + shf.l.wrap.b32 %r4666, %r4665, %r4665, 25; + add.s32 %r4667, %r4619, %r4019; + add.s32 %r4668, %r4667, %r4638; + xor.b32 %r4669, %r4607, %r4668; + shf.l.wrap.b32 %r4670, %r4669, %r4669, 16; + add.s32 %r4671, %r4670, %r4650; + xor.b32 %r4672, %r4671, %r4638; + shf.l.wrap.b32 %r4673, %r4672, %r4672, 20; + add.s32 %r4674, %r4668, %r4005; + add.s32 %r4675, %r4674, %r4673; + xor.b32 %r4676, %r4675, %r4670; + shf.l.wrap.b32 %r4677, %r4676, %r4676, 24; + add.s32 %r4678, %r4677, %r4671; + xor.b32 %r4679, %r4678, %r4673; + shf.l.wrap.b32 %r4680, %r4679, %r4679, 25; + add.s32 %r4681, %r4633, %r3984; + add.s32 %r4682, %r4681, %r4652; + xor.b32 %r4683, %r4682, %r4621; + shf.l.wrap.b32 %r4684, %r4683, %r4683, 16; + add.s32 %r4685, %r4684, %r4608; + xor.b32 %r4686, %r4685, %r4652; + shf.l.wrap.b32 %r4687, %r4686, %r4686, 20; + add.s32 %r4688, %r4682, %r3991; + add.s32 %r4689, %r4688, %r4687; + xor.b32 %r4690, %r4689, %r4684; + shf.l.wrap.b32 %r4691, %r4690, %r4690, 24; + add.s32 %r4692, %r4691, %r4685; + xor.b32 %r4693, %r4692, %r4687; + shf.l.wrap.b32 %r4694, %r4693, %r4693, 25; + add.s32 %r4695, %r4647, %r4026; + add.s32 %r4696, %r4695, %r4610; + xor.b32 %r4697, %r4696, %r4635; + shf.l.wrap.b32 %r4698, %r4697, %r4697, 16; + add.s32 %r4699, %r4698, %r4622; + xor.b32 %r4700, %r4699, 
%r4610; + shf.l.wrap.b32 %r4701, %r4700, %r4700, 20; + add.s32 %r4702, %r4696, %r4012; + add.s32 %r4703, %r4702, %r4701; + xor.b32 %r4704, %r4703, %r4698; + shf.l.wrap.b32 %r4705, %r4704, %r4704, 24; + add.s32 %r4706, %r4705, %r4699; + xor.b32 %r4707, %r4706, %r4701; + shf.l.wrap.b32 %r4708, %r4707, %r4707, 25; + add.s32 %r4709, %r4661, %r4047; + add.s32 %r4710, %r4709, %r4708; + xor.b32 %r4711, %r4710, %r4677; + shf.l.wrap.b32 %r4712, %r4711, %r4711, 16; + add.s32 %r4713, %r4712, %r4692; + xor.b32 %r4714, %r4713, %r4708; + shf.l.wrap.b32 %r4715, %r4714, %r4714, 20; + add.s32 %r4716, %r4710, %r4082; + add.s32 %r4717, %r4716, %r4715; + xor.b32 %r4718, %r4717, %r4712; + shf.l.wrap.b32 %r4719, %r4718, %r4718, 24; + add.s32 %r4720, %r4719, %r4713; + xor.b32 %r4721, %r4720, %r4715; + shf.l.wrap.b32 %r4722, %r4721, %r4721, 25; + add.s32 %r4723, %r4675, %r4061; + add.s32 %r4724, %r4723, %r4666; + xor.b32 %r4725, %r4724, %r4691; + shf.l.wrap.b32 %r4726, %r4725, %r4725, 16; + add.s32 %r4727, %r4726, %r4706; + xor.b32 %r4728, %r4727, %r4666; + shf.l.wrap.b32 %r4729, %r4728, %r4728, 20; + add.s32 %r4730, %r4724, %r4019; + add.s32 %r4731, %r4730, %r4729; + xor.b32 %r4732, %r4731, %r4726; + shf.l.wrap.b32 %r4733, %r4732, %r4732, 24; + add.s32 %r4734, %r4733, %r4727; + xor.b32 %r4735, %r4734, %r4729; + shf.l.wrap.b32 %r4736, %r4735, %r4735, 25; + add.s32 %r4737, %r4689, %r4040; + add.s32 %r4738, %r4737, %r4680; + xor.b32 %r4739, %r4705, %r4738; + shf.l.wrap.b32 %r4740, %r4739, %r4739, 16; + add.s32 %r4741, %r4740, %r4664; + xor.b32 %r4742, %r4741, %r4680; + shf.l.wrap.b32 %r4743, %r4742, %r4742, 20; + add.s32 %r4744, %r4738, %r4068; + add.s32 %r4745, %r4744, %r4743; + xor.b32 %r4746, %r4745, %r4740; + shf.l.wrap.b32 %r4747, %r4746, %r4746, 24; + add.s32 %r4748, %r4747, %r4741; + xor.b32 %r4749, %r4748, %r4743; + shf.l.wrap.b32 %r4750, %r4749, %r4749, 25; + add.s32 %r4751, %r4703, %r4089; + add.s32 %r4752, %r4751, %r4694; + xor.b32 %r4753, %r4752, %r4663; + shf.l.wrap.b32 %r4754, %r4753, %r4753, 16; + add.s32 %r4755, %r4754, %r4678; + xor.b32 %r4756, %r4755, %r4694; + shf.l.wrap.b32 %r4757, %r4756, %r4756, 20; + add.s32 %r4758, %r4752, %r3991; + add.s32 %r4759, %r4758, %r4757; + xor.b32 %r4760, %r4759, %r4754; + shf.l.wrap.b32 %r4761, %r4760, %r4760, 24; + add.s32 %r4762, %r4761, %r4755; + xor.b32 %r4763, %r4762, %r4757; + shf.l.wrap.b32 %r4764, %r4763, %r4763, 25; + add.s32 %r4765, %r4717, %r4075; + add.s32 %r4766, %r4765, %r4736; + xor.b32 %r4767, %r4766, %r4761; + shf.l.wrap.b32 %r4768, %r4767, %r4767, 16; + add.s32 %r4769, %r4768, %r4748; + xor.b32 %r4770, %r4769, %r4736; + shf.l.wrap.b32 %r4771, %r4770, %r4770, 20; + add.s32 %r4772, %r4766, %r4005; + add.s32 %r4773, %r4772, %r4771; + xor.b32 %r4774, %r4773, %r4768; + shf.l.wrap.b32 %r4775, %r4774, %r4774, 24; + add.s32 %r4776, %r4775, %r4769; + xor.b32 %r4777, %r4776, %r4771; + shf.l.wrap.b32 %r4778, %r4777, %r4777, 25; + add.s32 %r4779, %r4731, %r3984; + add.s32 %r4780, %r4779, %r4750; + xor.b32 %r4781, %r4719, %r4780; + shf.l.wrap.b32 %r4782, %r4781, %r4781, 16; + add.s32 %r4783, %r4782, %r4762; + xor.b32 %r4784, %r4783, %r4750; + shf.l.wrap.b32 %r4785, %r4784, %r4784, 20; + add.s32 %r4786, %r4780, %r4054; + add.s32 %r4787, %r4786, %r4785; + xor.b32 %r4788, %r4787, %r4782; + shf.l.wrap.b32 %r4789, %r4788, %r4788, 24; + add.s32 %r4790, %r4789, %r4783; + xor.b32 %r4791, %r4790, %r4785; + shf.l.wrap.b32 %r4792, %r4791, %r4791, 25; + add.s32 %r4793, %r4745, %r3998; + add.s32 %r4794, %r4793, %r4764; + xor.b32 %r4795, %r4794, %r4733; + shf.l.wrap.b32 
%r4796, %r4795, %r4795, 16; + add.s32 %r4797, %r4796, %r4720; + xor.b32 %r4798, %r4797, %r4764; + shf.l.wrap.b32 %r4799, %r4798, %r4798, 20; + add.s32 %r4800, %r4794, %r4026; + add.s32 %r4801, %r4800, %r4799; + xor.b32 %r4802, %r4801, %r4796; + shf.l.wrap.b32 %r4803, %r4802, %r4802, 24; + add.s32 %r4804, %r4803, %r4797; + xor.b32 %r4805, %r4804, %r4799; + shf.l.wrap.b32 %r4806, %r4805, %r4805, 25; + add.s32 %r4807, %r4759, %r4012; + add.s32 %r4808, %r4807, %r4722; + xor.b32 %r4809, %r4808, %r4747; + shf.l.wrap.b32 %r4810, %r4809, %r4809, 16; + add.s32 %r4811, %r4810, %r4734; + xor.b32 %r4812, %r4811, %r4722; + shf.l.wrap.b32 %r4813, %r4812, %r4812, 20; + add.s32 %r4814, %r4808, %r4033; + add.s32 %r4815, %r4814, %r4813; + xor.b32 %r4816, %r4815, %r4810; + shf.l.wrap.b32 %r4817, %r4816, %r4816, 24; + add.s32 %r4818, %r4817, %r4811; + xor.b32 %r4819, %r4818, %r4813; + shf.l.wrap.b32 %r4820, %r4819, %r4819, 25; + add.s32 %r4821, %r4773, %r4061; + add.s32 %r4822, %r4821, %r4820; + xor.b32 %r4823, %r4822, %r4789; + shf.l.wrap.b32 %r4824, %r4823, %r4823, 16; + add.s32 %r4825, %r4824, %r4804; + xor.b32 %r4826, %r4825, %r4820; + shf.l.wrap.b32 %r4827, %r4826, %r4826, 20; + add.s32 %r4828, %r4822, %r4089; + add.s32 %r4829, %r4828, %r4827; + xor.b32 %r4830, %r4829, %r4824; + shf.l.wrap.b32 %r4831, %r4830, %r4830, 24; + add.s32 %r4832, %r4831, %r4825; + xor.b32 %r4833, %r4832, %r4827; + shf.l.wrap.b32 %r4834, %r4833, %r4833, 25; + add.s32 %r4835, %r4787, %r4019; + add.s32 %r4836, %r4835, %r4778; + xor.b32 %r4837, %r4836, %r4803; + shf.l.wrap.b32 %r4838, %r4837, %r4837, 16; + add.s32 %r4839, %r4838, %r4818; + xor.b32 %r4840, %r4839, %r4778; + shf.l.wrap.b32 %r4841, %r4840, %r4840, 20; + add.s32 %r4842, %r4836, %r3984; + add.s32 %r4843, %r4842, %r4841; + xor.b32 %r4844, %r4843, %r4838; + shf.l.wrap.b32 %r4845, %r4844, %r4844, 24; + add.s32 %r4846, %r4845, %r4839; + xor.b32 %r4847, %r4846, %r4841; + shf.l.wrap.b32 %r4848, %r4847, %r4847, 25; + add.s32 %r4849, %r4801, %r3991; + add.s32 %r4850, %r4849, %r4792; + xor.b32 %r4851, %r4817, %r4850; + shf.l.wrap.b32 %r4852, %r4851, %r4851, 16; + add.s32 %r4853, %r4852, %r4776; + xor.b32 %r4854, %r4853, %r4792; + shf.l.wrap.b32 %r4855, %r4854, %r4854, 20; + add.s32 %r4856, %r4850, %r4047; + add.s32 %r4857, %r4856, %r4855; + xor.b32 %r4858, %r4857, %r4852; + shf.l.wrap.b32 %r4859, %r4858, %r4858, 24; + add.s32 %r4860, %r4859, %r4853; + xor.b32 %r4861, %r4860, %r4855; + shf.l.wrap.b32 %r4862, %r4861, %r4861, 25; + add.s32 %r4863, %r4815, %r4040; + add.s32 %r4864, %r4863, %r4806; + xor.b32 %r4865, %r4864, %r4775; + shf.l.wrap.b32 %r4866, %r4865, %r4865, 16; + add.s32 %r4867, %r4866, %r4790; + xor.b32 %r4868, %r4867, %r4806; + shf.l.wrap.b32 %r4869, %r4868, %r4868, 20; + add.s32 %r4870, %r4864, %r4026; + add.s32 %r4871, %r4870, %r4869; + xor.b32 %r4872, %r4871, %r4866; + shf.l.wrap.b32 %r4873, %r4872, %r4872, 24; + add.s32 %r4874, %r4873, %r4867; + xor.b32 %r4875, %r4874, %r4869; + shf.l.wrap.b32 %r4876, %r4875, %r4875, 25; + add.s32 %r4877, %r4829, %r4082; + add.s32 %r4878, %r4877, %r4848; + xor.b32 %r4879, %r4878, %r4873; + shf.l.wrap.b32 %r4880, %r4879, %r4879, 16; + add.s32 %r4881, %r4880, %r4860; + xor.b32 %r4882, %r4881, %r4848; + shf.l.wrap.b32 %r4883, %r4882, %r4882, 20; + add.s32 %r4884, %r4878, %r4054; + add.s32 %r4885, %r4884, %r4883; + xor.b32 %r4886, %r4885, %r4880; + shf.l.wrap.b32 %r4887, %r4886, %r4886, 24; + add.s32 %r4888, %r4887, %r4881; + xor.b32 %r4889, %r4888, %r4883; + shf.l.wrap.b32 %r4890, %r4889, %r4889, 25; + add.s32 %r4891, %r4843, %r3998; 
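// NOTE: once the final round below completes, eight xor.b32 instructions fold
// the 16-word working state into the 8-word output (out[i] = v[i] ^ v[i + 8]);
// the st.local.u8/shr.u32 ladder that follows spills that 32-byte chaining
// value back to stack offsets +145..+176 one byte at a time.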
+ add.s32 %r4892, %r4891, %r4862; + xor.b32 %r4893, %r4831, %r4892; + shf.l.wrap.b32 %r4894, %r4893, %r4893, 16; + add.s32 %r4895, %r4894, %r4874; + xor.b32 %r4896, %r4895, %r4862; + shf.l.wrap.b32 %r4897, %r4896, %r4896, 20; + add.s32 %r4898, %r4892, %r4068; + add.s32 %r4899, %r4898, %r4897; + xor.b32 %r4900, %r4899, %r4894; + shf.l.wrap.b32 %r4901, %r4900, %r4900, 24; + add.s32 %r4902, %r4901, %r4895; + xor.b32 %r4903, %r4902, %r4897; + shf.l.wrap.b32 %r4904, %r4903, %r4903, 25; + add.s32 %r4905, %r4857, %r4005; + add.s32 %r4906, %r4905, %r4876; + xor.b32 %r4907, %r4906, %r4845; + shf.l.wrap.b32 %r4908, %r4907, %r4907, 16; + add.s32 %r4909, %r4908, %r4832; + xor.b32 %r4910, %r4909, %r4876; + shf.l.wrap.b32 %r4911, %r4910, %r4910, 20; + add.s32 %r4912, %r4906, %r4012; + add.s32 %r4913, %r4912, %r4911; + xor.b32 %r4914, %r4913, %r4908; + shf.l.wrap.b32 %r4915, %r4914, %r4914, 24; + add.s32 %r4916, %r4915, %r4909; + xor.b32 %r4917, %r4916, %r4911; + shf.l.wrap.b32 %r4918, %r4917, %r4917, 25; + add.s32 %r4919, %r4871, %r4033; + add.s32 %r4920, %r4919, %r4834; + xor.b32 %r4921, %r4920, %r4859; + shf.l.wrap.b32 %r4922, %r4921, %r4921, 16; + add.s32 %r4923, %r4922, %r4846; + xor.b32 %r4924, %r4923, %r4834; + shf.l.wrap.b32 %r4925, %r4924, %r4924, 20; + add.s32 %r4926, %r4920, %r4075; + add.s32 %r4927, %r4926, %r4925; + xor.b32 %r4928, %r4927, %r4922; + shf.l.wrap.b32 %r4929, %r4928, %r4928, 24; + add.s32 %r4930, %r4929, %r4923; + xor.b32 %r4931, %r4930, %r4925; + shf.l.wrap.b32 %r4932, %r4931, %r4931, 25; + xor.b32 %r4933, %r4916, %r4885; + xor.b32 %r4934, %r4930, %r4899; + xor.b32 %r4935, %r4888, %r4913; + xor.b32 %r4936, %r4927, %r4902; + xor.b32 %r4937, %r4932, %r4901; + xor.b32 %r4938, %r4890, %r4915; + xor.b32 %r4939, %r4929, %r4904; + xor.b32 %r4940, %r4918, %r4887; + st.local.u8 [%rd168+145], %r4933; + shr.u32 %r4941, %r4933, 8; + st.local.u8 [%rd168+146], %r4941; + shr.u32 %r4942, %r4933, 16; + st.local.u8 [%rd168+147], %r4942; + shr.u32 %r4943, %r4933, 24; + st.local.u8 [%rd168+148], %r4943; + st.local.u8 [%rd168+149], %r4934; + shr.u32 %r4944, %r4934, 8; + st.local.u8 [%rd168+150], %r4944; + shr.u32 %r4945, %r4934, 16; + st.local.u8 [%rd168+151], %r4945; + shr.u32 %r4946, %r4934, 24; + st.local.u8 [%rd168+152], %r4946; + st.local.u8 [%rd168+153], %r4935; + shr.u32 %r4947, %r4935, 8; + st.local.u8 [%rd168+154], %r4947; + shr.u32 %r4948, %r4935, 16; + st.local.u8 [%rd168+155], %r4948; + shr.u32 %r4949, %r4935, 24; + st.local.u8 [%rd168+156], %r4949; + st.local.u8 [%rd168+157], %r4936; + shr.u32 %r4950, %r4936, 8; + st.local.u8 [%rd168+158], %r4950; + shr.u32 %r4951, %r4936, 16; + st.local.u8 [%rd168+159], %r4951; + shr.u32 %r4952, %r4936, 24; + st.local.u8 [%rd168+160], %r4952; + st.local.u8 [%rd168+161], %r4937; + shr.u32 %r4953, %r4937, 8; + st.local.u8 [%rd168+162], %r4953; + shr.u32 %r4954, %r4937, 16; + st.local.u8 [%rd168+163], %r4954; + shr.u32 %r4955, %r4937, 24; + st.local.u8 [%rd168+164], %r4955; + st.local.u8 [%rd168+165], %r4938; + shr.u32 %r4956, %r4938, 8; + st.local.u8 [%rd168+166], %r4956; + shr.u32 %r4957, %r4938, 16; + st.local.u8 [%rd168+167], %r4957; + shr.u32 %r4958, %r4938, 24; + st.local.u8 [%rd168+168], %r4958; + st.local.u8 [%rd168+169], %r4939; + shr.u32 %r4959, %r4939, 8; + st.local.u8 [%rd168+170], %r4959; + shr.u32 %r4960, %r4939, 16; + st.local.u8 [%rd168+171], %r4960; + shr.u32 %r4961, %r4939, 24; + st.local.u8 [%rd168+172], %r4961; + st.local.u8 [%rd168+173], %r4940; + shr.u32 %r4962, %r4940, 8; + st.local.u8 [%rd168+174], %r4962; + shr.u32 %r4963, 
%r4940, 16; + st.local.u8 [%rd168+175], %r4963; + shr.u32 %r4964, %r4940, 24; + st.local.u8 [%rd168+176], %r4964; + ld.local.u8 %rs138, [%rd3+8]; + add.s16 %rs139, %rs138, -1; + st.local.u8 [%rd3+8], %rs139; + cvt.u64.u16 %rd169, %rs139; + and.b64 %rd170, %rd169, 255; + setp.lt.u64 %p28, %rd226, %rd170; + and.b16 %rs140, %rs139, 255; + mul.wide.u16 %r11661, %rs140, 32; + @%p28 bra $L__BB1_31; + +$L__BB1_32: + cvt.s64.s32 %rd171, %r11661; + add.s64 %rd172, %rd2, %rd171; + mov.b32 {%rs141, %rs142}, %r3959; + st.local.u8 [%rd172+145], %rs141; + shr.u16 %rs143, %rs141, 8; + st.local.u8 [%rd172+146], %rs143; + st.local.u8 [%rd172+147], %rs142; + shr.u16 %rs144, %rs142, 8; + st.local.u8 [%rd172+148], %rs144; + mov.b32 {%rs145, %rs146}, %r3960; + st.local.u8 [%rd172+149], %rs145; + shr.u16 %rs147, %rs145, 8; + st.local.u8 [%rd172+150], %rs147; + st.local.u8 [%rd172+151], %rs146; + shr.u16 %rs148, %rs146, 8; + st.local.u8 [%rd172+152], %rs148; + mov.b32 {%rs149, %rs150}, %r3961; + st.local.u8 [%rd172+153], %rs149; + shr.u16 %rs151, %rs149, 8; + st.local.u8 [%rd172+154], %rs151; + st.local.u8 [%rd172+155], %rs150; + shr.u16 %rs152, %rs150, 8; + st.local.u8 [%rd172+156], %rs152; + mov.b32 {%rs153, %rs154}, %r3962; + st.local.u8 [%rd172+157], %rs153; + shr.u16 %rs155, %rs153, 8; + st.local.u8 [%rd172+158], %rs155; + st.local.u8 [%rd172+159], %rs154; + shr.u16 %rs156, %rs154, 8; + st.local.u8 [%rd172+160], %rs156; + mov.b32 {%rs157, %rs158}, %r3963; + st.local.u8 [%rd172+161], %rs157; + shr.u16 %rs159, %rs157, 8; + st.local.u8 [%rd172+162], %rs159; + st.local.u8 [%rd172+163], %rs158; + shr.u16 %rs160, %rs158, 8; + st.local.u8 [%rd172+164], %rs160; + mov.b32 {%rs161, %rs162}, %r3964; + st.local.u8 [%rd172+165], %rs161; + shr.u16 %rs163, %rs161, 8; + st.local.u8 [%rd172+166], %rs163; + st.local.u8 [%rd172+167], %rs162; + shr.u16 %rs164, %rs162, 8; + st.local.u8 [%rd172+168], %rs164; + mov.b32 {%rs165, %rs166}, %r3965; + st.local.u8 [%rd172+169], %rs165; + shr.u16 %rs167, %rs165, 8; + st.local.u8 [%rd172+170], %rs167; + st.local.u8 [%rd172+171], %rs166; + shr.u16 %rs168, %rs166, 8; + st.local.u8 [%rd172+172], %rs168; + mov.b32 {%rs169, %rs170}, %r3966; + st.local.u8 [%rd172+173], %rs169; + shr.u16 %rs171, %rs169, 8; + st.local.u8 [%rd172+174], %rs171; + st.local.u8 [%rd172+175], %rs170; + shr.u16 %rs172, %rs170, 8; + st.local.u8 [%rd172+176], %rs172; + ld.local.u8 %rs173, [%rd3+8]; + add.s16 %rs174, %rs173, 1; + st.local.u8 [%rd3+8], %rs174; + shr.u64 %rd173, %rd49, 11; + ld.local.u64 %rd174, [%rd3+-72]; + add.s64 %rd175, %rd174, %rd173; + popc.b64 %r4965, %rd175; + cvt.u64.u32 %rd52, %r4965; + cvt.u64.u16 %rd176, %rs174; + and.b64 %rd177, %rd176, 255; + setp.ge.u64 %p29, %rd52, %rd177; + and.b16 %rs175, %rs174, 255; + mul.wide.u16 %r11663, %rs175, 32; + @%p29 bra $L__BB1_35; + +$L__BB1_34: + shr.u64 %rd229, %rd49, 11; + add.s64 %rd228, %rd174, %rd229; + popc.b64 %r11648, %rd228; + cvt.u64.u32 %rd227, %r11648; + add.s32 %r4966, %r11663, -64; + cvt.s64.s32 %rd178, %r4966; + add.s64 %rd179, %rd2, %rd178; + ld.local.u8 %r4967, [%rd3+2]; + ld.local.u8 %r4968, [%rd179+145]; + ld.local.u8 %r4969, [%rd179+146]; + prmt.b32 %r4970, %r4969, %r4968, 30212; + ld.local.u8 %r4971, [%rd179+147]; + prmt.b32 %r4972, %r4971, %r4970, 28756; + ld.local.u8 %r4973, [%rd179+148]; + prmt.b32 %r4974, %r4973, %r4972, 1620; + ld.local.u8 %r4975, [%rd179+149]; + ld.local.u8 %r4976, [%rd179+150]; + prmt.b32 %r4977, %r4976, %r4975, 30212; + ld.local.u8 %r4978, [%rd179+151]; + prmt.b32 %r4979, %r4978, %r4977, 28756; + ld.local.u8 
%r4980, [%rd179+152]; + prmt.b32 %r4981, %r4980, %r4979, 1620; + ld.local.u8 %r4982, [%rd179+153]; + ld.local.u8 %r4983, [%rd179+154]; + prmt.b32 %r4984, %r4983, %r4982, 30212; + ld.local.u8 %r4985, [%rd179+155]; + prmt.b32 %r4986, %r4985, %r4984, 28756; + ld.local.u8 %r4987, [%rd179+156]; + prmt.b32 %r4988, %r4987, %r4986, 1620; + ld.local.u8 %r4989, [%rd179+157]; + ld.local.u8 %r4990, [%rd179+158]; + prmt.b32 %r4991, %r4990, %r4989, 30212; + ld.local.u8 %r4992, [%rd179+159]; + prmt.b32 %r4993, %r4992, %r4991, 28756; + ld.local.u8 %r4994, [%rd179+160]; + prmt.b32 %r4995, %r4994, %r4993, 1620; + ld.local.u8 %r4996, [%rd179+161]; + ld.local.u8 %r4997, [%rd179+162]; + prmt.b32 %r4998, %r4997, %r4996, 30212; + ld.local.u8 %r4999, [%rd179+163]; + prmt.b32 %r5000, %r4999, %r4998, 28756; + ld.local.u8 %r5001, [%rd179+164]; + prmt.b32 %r5002, %r5001, %r5000, 1620; + ld.local.u8 %r5003, [%rd179+165]; + ld.local.u8 %r5004, [%rd179+166]; + prmt.b32 %r5005, %r5004, %r5003, 30212; + ld.local.u8 %r5006, [%rd179+167]; + prmt.b32 %r5007, %r5006, %r5005, 28756; + ld.local.u8 %r5008, [%rd179+168]; + prmt.b32 %r5009, %r5008, %r5007, 1620; + ld.local.u8 %r5010, [%rd179+169]; + ld.local.u8 %r5011, [%rd179+170]; + prmt.b32 %r5012, %r5011, %r5010, 30212; + ld.local.u8 %r5013, [%rd179+171]; + prmt.b32 %r5014, %r5013, %r5012, 28756; + ld.local.u8 %r5015, [%rd179+172]; + prmt.b32 %r5016, %r5015, %r5014, 1620; + ld.local.u8 %r5017, [%rd179+173]; + ld.local.u8 %r5018, [%rd179+174]; + prmt.b32 %r5019, %r5018, %r5017, 30212; + ld.local.u8 %r5020, [%rd179+175]; + prmt.b32 %r5021, %r5020, %r5019, 28756; + ld.local.u8 %r5022, [%rd179+176]; + prmt.b32 %r5023, %r5022, %r5021, 1620; + ld.local.u8 %r5024, [%rd179+177]; + ld.local.u8 %r5025, [%rd179+178]; + prmt.b32 %r5026, %r5025, %r5024, 30212; + ld.local.u8 %r5027, [%rd179+179]; + prmt.b32 %r5028, %r5027, %r5026, 28756; + ld.local.u8 %r5029, [%rd179+180]; + prmt.b32 %r5030, %r5029, %r5028, 1620; + ld.local.u8 %r5031, [%rd179+181]; + ld.local.u8 %r5032, [%rd179+182]; + prmt.b32 %r5033, %r5032, %r5031, 30212; + ld.local.u8 %r5034, [%rd179+183]; + prmt.b32 %r5035, %r5034, %r5033, 28756; + ld.local.u8 %r5036, [%rd179+184]; + prmt.b32 %r5037, %r5036, %r5035, 1620; + ld.local.u8 %r5038, [%rd179+185]; + ld.local.u8 %r5039, [%rd179+186]; + prmt.b32 %r5040, %r5039, %r5038, 30212; + ld.local.u8 %r5041, [%rd179+187]; + prmt.b32 %r5042, %r5041, %r5040, 28756; + ld.local.u8 %r5043, [%rd179+188]; + prmt.b32 %r5044, %r5043, %r5042, 1620; + ld.local.u8 %r5045, [%rd179+189]; + ld.local.u8 %r5046, [%rd179+190]; + prmt.b32 %r5047, %r5046, %r5045, 30212; + ld.local.u8 %r5048, [%rd179+191]; + prmt.b32 %r5049, %r5048, %r5047, 28756; + ld.local.u8 %r5050, [%rd179+192]; + prmt.b32 %r5051, %r5050, %r5049, 1620; + ld.local.u8 %r5052, [%rd179+193]; + ld.local.u8 %r5053, [%rd179+194]; + prmt.b32 %r5054, %r5053, %r5052, 30212; + ld.local.u8 %r5055, [%rd179+195]; + prmt.b32 %r5056, %r5055, %r5054, 28756; + ld.local.u8 %r5057, [%rd179+196]; + prmt.b32 %r5058, %r5057, %r5056, 1620; + ld.local.u8 %r5059, [%rd179+197]; + ld.local.u8 %r5060, [%rd179+198]; + prmt.b32 %r5061, %r5060, %r5059, 30212; + ld.local.u8 %r5062, [%rd179+199]; + prmt.b32 %r5063, %r5062, %r5061, 28756; + ld.local.u8 %r5064, [%rd179+200]; + prmt.b32 %r5065, %r5064, %r5063, 1620; + ld.local.u8 %r5066, [%rd179+201]; + ld.local.u8 %r5067, [%rd179+202]; + prmt.b32 %r5068, %r5067, %r5066, 30212; + ld.local.u8 %r5069, [%rd179+203]; + prmt.b32 %r5070, %r5069, %r5068, 28756; + ld.local.u8 %r5071, [%rd179+204]; + prmt.b32 %r5072, %r5071, %r5070, 
1620; + ld.local.u8 %r5073, [%rd179+205]; + ld.local.u8 %r5074, [%rd179+206]; + prmt.b32 %r5075, %r5074, %r5073, 30212; + ld.local.u8 %r5076, [%rd179+207]; + prmt.b32 %r5077, %r5076, %r5075, 28756; + ld.local.u8 %r5078, [%rd179+208]; + prmt.b32 %r5079, %r5078, %r5077, 1620; + or.b32 %r5080, %r4967, 4; + ld.local.u8 %r5081, [%rd3+-120]; + ld.local.u8 %r5082, [%rd3+-119]; + prmt.b32 %r5083, %r5082, %r5081, 30212; + ld.local.u8 %r5084, [%rd3+-118]; + ld.local.u8 %r5085, [%rd3+-117]; + prmt.b32 %r5086, %r5085, %r5084, 30212; + prmt.b32 %r5087, %r5086, %r5083, 4180; + ld.local.u8 %r5088, [%rd3+-136]; + ld.local.u8 %r5089, [%rd3+-135]; + prmt.b32 %r5090, %r5089, %r5088, 30212; + ld.local.u8 %r5091, [%rd3+-134]; + ld.local.u8 %r5092, [%rd3+-133]; + prmt.b32 %r5093, %r5092, %r5091, 30212; + prmt.b32 %r5094, %r5093, %r5090, 4180; + add.s32 %r5095, %r5087, %r5094; + add.s32 %r5096, %r5095, %r4974; + shf.l.wrap.b32 %r5097, %r5096, %r5096, 16; + add.s32 %r5098, %r5097, 1779033703; + xor.b32 %r5099, %r5098, %r5087; + shf.l.wrap.b32 %r5100, %r5099, %r5099, 20; + add.s32 %r5101, %r4981, %r5096; + add.s32 %r5102, %r5101, %r5100; + xor.b32 %r5103, %r5102, %r5097; + shf.l.wrap.b32 %r5104, %r5103, %r5103, 24; + add.s32 %r5105, %r5104, %r5098; + xor.b32 %r5106, %r5105, %r5100; + shf.l.wrap.b32 %r5107, %r5106, %r5106, 25; + ld.local.u8 %r5108, [%rd3+-116]; + ld.local.u8 %r5109, [%rd3+-115]; + prmt.b32 %r5110, %r5109, %r5108, 30212; + ld.local.u8 %r5111, [%rd3+-114]; + ld.local.u8 %r5112, [%rd3+-113]; + prmt.b32 %r5113, %r5112, %r5111, 30212; + prmt.b32 %r5114, %r5113, %r5110, 4180; + ld.local.u8 %r5115, [%rd3+-132]; + ld.local.u8 %r5116, [%rd3+-131]; + prmt.b32 %r5117, %r5116, %r5115, 30212; + ld.local.u8 %r5118, [%rd3+-130]; + ld.local.u8 %r5119, [%rd3+-129]; + prmt.b32 %r5120, %r5119, %r5118, 30212; + prmt.b32 %r5121, %r5120, %r5117, 4180; + add.s32 %r5122, %r5114, %r5121; + add.s32 %r5123, %r5122, %r4988; + shf.l.wrap.b32 %r5124, %r5123, %r5123, 16; + add.s32 %r5125, %r5124, -1150833019; + xor.b32 %r5126, %r5125, %r5114; + shf.l.wrap.b32 %r5127, %r5126, %r5126, 20; + add.s32 %r5128, %r4995, %r5123; + add.s32 %r5129, %r5128, %r5127; + xor.b32 %r5130, %r5129, %r5124; + shf.l.wrap.b32 %r5131, %r5130, %r5130, 24; + add.s32 %r5132, %r5131, %r5125; + xor.b32 %r5133, %r5132, %r5127; + shf.l.wrap.b32 %r5134, %r5133, %r5133, 25; + ld.local.u8 %r5135, [%rd3+-112]; + ld.local.u8 %r5136, [%rd3+-111]; + prmt.b32 %r5137, %r5136, %r5135, 30212; + ld.local.u8 %r5138, [%rd3+-110]; + ld.local.u8 %r5139, [%rd3+-109]; + prmt.b32 %r5140, %r5139, %r5138, 30212; + prmt.b32 %r5141, %r5140, %r5137, 4180; + ld.local.u8 %r5142, [%rd3+-128]; + ld.local.u8 %r5143, [%rd3+-127]; + prmt.b32 %r5144, %r5143, %r5142, 30212; + ld.local.u8 %r5145, [%rd3+-126]; + ld.local.u8 %r5146, [%rd3+-125]; + prmt.b32 %r5147, %r5146, %r5145, 30212; + prmt.b32 %r5148, %r5147, %r5144, 4180; + add.s32 %r5149, %r5141, %r5148; + add.s32 %r5150, %r5149, %r5002; + shr.u32 %r5151, %r5150, 16; + shl.b32 %r5152, %r5150, 16; + xor.b32 %r5153, %r5152, 4194304; + or.b32 %r5154, %r5153, %r5151; + add.s32 %r5155, %r5154, 1013904242; + xor.b32 %r5156, %r5155, %r5141; + shf.l.wrap.b32 %r5157, %r5156, %r5156, 20; + add.s32 %r5158, %r5009, %r5150; + add.s32 %r5159, %r5158, %r5157; + xor.b32 %r5160, %r5159, %r5154; + shf.l.wrap.b32 %r5161, %r5160, %r5160, 24; + add.s32 %r5162, %r5161, %r5155; + xor.b32 %r5163, %r5162, %r5157; + shf.l.wrap.b32 %r5164, %r5163, %r5163, 25; + ld.local.u8 %r5165, [%rd3+-108]; + ld.local.u8 %r5166, [%rd3+-107]; + prmt.b32 %r5167, %r5166, %r5165, 
30212; + ld.local.u8 %r5168, [%rd3+-106]; + ld.local.u8 %r5169, [%rd3+-105]; + prmt.b32 %r5170, %r5169, %r5168, 30212; + prmt.b32 %r5171, %r5170, %r5167, 4180; + ld.local.u8 %r5172, [%rd3+-124]; + ld.local.u8 %r5173, [%rd3+-123]; + prmt.b32 %r5174, %r5173, %r5172, 30212; + ld.local.u8 %r5175, [%rd3+-122]; + ld.local.u8 %r5176, [%rd3+-121]; + prmt.b32 %r5177, %r5176, %r5175, 30212; + prmt.b32 %r5178, %r5177, %r5174, 4180; + add.s32 %r5179, %r5171, %r5178; + add.s32 %r5180, %r5179, %r5016; + xor.b32 %r5181, %r5180, %r5080; + shr.u32 %r5182, %r5180, 16; + shl.b32 %r5183, %r5181, 16; + or.b32 %r5184, %r5183, %r5182; + add.s32 %r5185, %r5184, -1521486534; + xor.b32 %r5186, %r5185, %r5171; + shf.l.wrap.b32 %r5187, %r5186, %r5186, 20; + add.s32 %r5188, %r5023, %r5180; + add.s32 %r5189, %r5188, %r5187; + xor.b32 %r5190, %r5189, %r5184; + shf.l.wrap.b32 %r5191, %r5190, %r5190, 24; + add.s32 %r5192, %r5191, %r5185; + xor.b32 %r5193, %r5192, %r5187; + shf.l.wrap.b32 %r5194, %r5193, %r5193, 25; + add.s32 %r5195, %r5134, %r5102; + add.s32 %r5196, %r5195, %r5030; + xor.b32 %r5197, %r5191, %r5196; + shf.l.wrap.b32 %r5198, %r5197, %r5197, 16; + add.s32 %r5199, %r5198, %r5162; + xor.b32 %r5200, %r5199, %r5134; + shf.l.wrap.b32 %r5201, %r5200, %r5200, 20; + add.s32 %r5202, %r5037, %r5196; + add.s32 %r5203, %r5202, %r5201; + xor.b32 %r5204, %r5203, %r5198; + shf.l.wrap.b32 %r5205, %r5204, %r5204, 24; + add.s32 %r5206, %r5205, %r5199; + xor.b32 %r5207, %r5206, %r5201; + shf.l.wrap.b32 %r5208, %r5207, %r5207, 25; + add.s32 %r5209, %r5164, %r5129; + add.s32 %r5210, %r5209, %r5044; + xor.b32 %r5211, %r5210, %r5104; + shf.l.wrap.b32 %r5212, %r5211, %r5211, 16; + add.s32 %r5213, %r5212, %r5192; + xor.b32 %r5214, %r5213, %r5164; + shf.l.wrap.b32 %r5215, %r5214, %r5214, 20; + add.s32 %r5216, %r5051, %r5210; + add.s32 %r5217, %r5216, %r5215; + xor.b32 %r5218, %r5217, %r5212; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 24; + add.s32 %r5220, %r5219, %r5213; + xor.b32 %r5221, %r5220, %r5215; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 25; + add.s32 %r5223, %r5194, %r5159; + add.s32 %r5224, %r5223, %r5058; + xor.b32 %r5225, %r5224, %r5131; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 16; + add.s32 %r5227, %r5226, %r5105; + xor.b32 %r5228, %r5227, %r5194; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 20; + add.s32 %r5230, %r5065, %r5224; + add.s32 %r5231, %r5230, %r5229; + xor.b32 %r5232, %r5231, %r5226; + shf.l.wrap.b32 %r5233, %r5232, %r5232, 24; + add.s32 %r5234, %r5233, %r5227; + xor.b32 %r5235, %r5234, %r5229; + shf.l.wrap.b32 %r5236, %r5235, %r5235, 25; + add.s32 %r5237, %r5189, %r5107; + add.s32 %r5238, %r5237, %r5072; + xor.b32 %r5239, %r5238, %r5161; + shf.l.wrap.b32 %r5240, %r5239, %r5239, 16; + add.s32 %r5241, %r5240, %r5132; + xor.b32 %r5242, %r5241, %r5107; + shf.l.wrap.b32 %r5243, %r5242, %r5242, 20; + add.s32 %r5244, %r5079, %r5238; + add.s32 %r5245, %r5244, %r5243; + xor.b32 %r5246, %r5245, %r5240; + shf.l.wrap.b32 %r5247, %r5246, %r5246, 24; + add.s32 %r5248, %r5247, %r5241; + xor.b32 %r5249, %r5248, %r5243; + shf.l.wrap.b32 %r5250, %r5249, %r5249, 25; + add.s32 %r5251, %r5203, %r4988; + add.s32 %r5252, %r5251, %r5250; + xor.b32 %r5253, %r5252, %r5219; + shf.l.wrap.b32 %r5254, %r5253, %r5253, 16; + add.s32 %r5255, %r5254, %r5234; + xor.b32 %r5256, %r5255, %r5250; + shf.l.wrap.b32 %r5257, %r5256, %r5256, 20; + add.s32 %r5258, %r5252, %r5016; + add.s32 %r5259, %r5258, %r5257; + xor.b32 %r5260, %r5259, %r5254; + shf.l.wrap.b32 %r5261, %r5260, %r5260, 24; + add.s32 %r5262, %r5261, %r5255; + xor.b32 %r5263, %r5262, %r5257; + 
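// NOTE: still inside $L__BB1_34: the compiler appears to have inlined the same
// seven-round parent compression a second time, for the merge pass that runs
// after a fresh chaining value is pushed. The loop condition above compares
// the stack length against popc.b64 of the chunk counter advanced by
// %rd49 >> 11, the usual BLAKE3 merge rule. Illustrative sketch (names
// hypothetical, not from this codebase):
//
//   while (cv_stack_len > popcount64(total_chunks)) {
//       cv_stack_len -= 1;  /* pop two child CVs, push their parent in place */
//       compress_parent(&cv_stack[(cv_stack_len - 1) * 32]);
//   }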
shf.l.wrap.b32 %r5264, %r5263, %r5263, 25; + add.s32 %r5265, %r5217, %r4995; + add.s32 %r5266, %r5265, %r5208; + xor.b32 %r5267, %r5233, %r5266; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 16; + add.s32 %r5269, %r5248, %r5268; + xor.b32 %r5270, %r5269, %r5208; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 20; + add.s32 %r5272, %r5266, %r5044; + add.s32 %r5273, %r5272, %r5271; + xor.b32 %r5274, %r5273, %r5268; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 24; + add.s32 %r5276, %r5275, %r5269; + xor.b32 %r5277, %r5276, %r5271; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 25; + add.s32 %r5279, %r5222, %r5023; + add.s32 %r5280, %r5279, %r5231; + xor.b32 %r5281, %r5247, %r5280; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 16; + add.s32 %r5283, %r5282, %r5206; + xor.b32 %r5284, %r5283, %r5222; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 20; + add.s32 %r5286, %r5280, %r4974; + add.s32 %r5287, %r5286, %r5285; + xor.b32 %r5288, %r5287, %r5282; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 24; + add.s32 %r5290, %r5289, %r5283; + xor.b32 %r5291, %r5290, %r5285; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 25; + add.s32 %r5293, %r5236, %r5002; + add.s32 %r5294, %r5293, %r5245; + xor.b32 %r5295, %r5294, %r5205; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 16; + add.s32 %r5297, %r5296, %r5220; + xor.b32 %r5298, %r5297, %r5236; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 20; + add.s32 %r5300, %r5294, %r5065; + add.s32 %r5301, %r5300, %r5299; + xor.b32 %r5302, %r5301, %r5296; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 24; + add.s32 %r5304, %r5303, %r5297; + xor.b32 %r5305, %r5304, %r5299; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 25; + add.s32 %r5307, %r5278, %r4981; + add.s32 %r5308, %r5307, %r5259; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 16; + add.s32 %r5311, %r5310, %r5290; + xor.b32 %r5312, %r5311, %r5278; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 20; + add.s32 %r5314, %r5308, %r5051; + add.s32 %r5315, %r5314, %r5313; + xor.b32 %r5316, %r5315, %r5310; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 24; + add.s32 %r5318, %r5317, %r5311; + xor.b32 %r5319, %r5318, %r5313; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 25; + add.s32 %r5321, %r5273, %r5058; + add.s32 %r5322, %r5321, %r5292; + xor.b32 %r5323, %r5261, %r5322; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 16; + add.s32 %r5325, %r5324, %r5304; + xor.b32 %r5326, %r5325, %r5292; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 20; + add.s32 %r5328, %r5322, %r5009; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5324; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 24; + add.s32 %r5332, %r5331, %r5325; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 25; + add.s32 %r5335, %r5287, %r5037; + add.s32 %r5336, %r5335, %r5306; + xor.b32 %r5337, %r5336, %r5275; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 16; + add.s32 %r5339, %r5338, %r5262; + xor.b32 %r5340, %r5339, %r5306; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 20; + add.s32 %r5342, %r5336, %r5072; + add.s32 %r5343, %r5342, %r5341; + xor.b32 %r5344, %r5343, %r5338; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 24; + add.s32 %r5346, %r5345, %r5339; + xor.b32 %r5347, %r5346, %r5341; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 25; + add.s32 %r5349, %r5301, %r5079; + add.s32 %r5350, %r5349, %r5264; + xor.b32 %r5351, %r5350, %r5289; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 16; + add.s32 %r5353, %r5352, %r5276; + xor.b32 %r5354, %r5353, %r5264; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 20; + add.s32 %r5356, %r5350, %r5030; + add.s32 %r5357, %r5356, %r5355; + xor.b32 %r5358, %r5357, %r5352; + shf.l.wrap.b32 %r5359, 
%r5358, %r5358, 24; + add.s32 %r5360, %r5359, %r5353; + xor.b32 %r5361, %r5360, %r5355; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 25; + add.s32 %r5363, %r5315, %r4995; + add.s32 %r5364, %r5363, %r5362; + xor.b32 %r5365, %r5364, %r5331; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 16; + add.s32 %r5367, %r5366, %r5346; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 20; + add.s32 %r5370, %r5364, %r5002; + add.s32 %r5371, %r5370, %r5369; + xor.b32 %r5372, %r5371, %r5366; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 24; + add.s32 %r5374, %r5373, %r5367; + xor.b32 %r5375, %r5374, %r5369; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 25; + add.s32 %r5377, %r5329, %r5044; + add.s32 %r5378, %r5377, %r5320; + xor.b32 %r5379, %r5378, %r5345; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 16; + add.s32 %r5381, %r5380, %r5360; + xor.b32 %r5382, %r5381, %r5320; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 20; + add.s32 %r5384, %r5378, %r5058; + add.s32 %r5385, %r5384, %r5383; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 24; + add.s32 %r5388, %r5387, %r5381; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 25; + add.s32 %r5391, %r5343, %r5065; + add.s32 %r5392, %r5391, %r5334; + xor.b32 %r5393, %r5359, %r5392; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 16; + add.s32 %r5395, %r5394, %r5318; + xor.b32 %r5396, %r5395, %r5334; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 20; + add.s32 %r5398, %r5392, %r4988; + add.s32 %r5399, %r5398, %r5397; + xor.b32 %r5400, %r5399, %r5394; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 24; + add.s32 %r5402, %r5401, %r5395; + xor.b32 %r5403, %r5402, %r5397; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 25; + add.s32 %r5405, %r5348, %r5023; + add.s32 %r5406, %r5405, %r5357; + xor.b32 %r5407, %r5406, %r5317; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 16; + add.s32 %r5409, %r5408, %r5332; + xor.b32 %r5410, %r5409, %r5348; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 20; + add.s32 %r5412, %r5406, %r5072; + add.s32 %r5413, %r5412, %r5411; + xor.b32 %r5414, %r5413, %r5408; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 24; + add.s32 %r5416, %r5415, %r5409; + xor.b32 %r5417, %r5416, %r5411; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 25; + add.s32 %r5419, %r5371, %r5016; + add.s32 %r5420, %r5419, %r5390; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 16; + add.s32 %r5423, %r5422, %r5402; + xor.b32 %r5424, %r5423, %r5390; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 20; + add.s32 %r5426, %r5420, %r5009; + add.s32 %r5427, %r5426, %r5425; + xor.b32 %r5428, %r5427, %r5422; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 24; + add.s32 %r5430, %r5429, %r5423; + xor.b32 %r5431, %r5430, %r5425; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 25; + add.s32 %r5433, %r5385, %r5037; + add.s32 %r5434, %r5433, %r5404; + xor.b32 %r5435, %r5373, %r5434; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 16; + add.s32 %r5437, %r5436, %r5416; + xor.b32 %r5438, %r5437, %r5404; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 20; + add.s32 %r5440, %r5434, %r4974; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5436; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 24; + add.s32 %r5444, %r5443, %r5437; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 25; + add.s32 %r5447, %r5399, %r5051; + add.s32 %r5448, %r5447, %r5418; + xor.b32 %r5449, %r5448, %r5387; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 16; + add.s32 %r5451, %r5450, %r5374; + xor.b32 %r5452, %r5451, %r5418; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 20; + add.s32 %r5454, %r5448, %r5079; + 
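// NOTE: the single-byte ld.local.u8/st.local.u8 traffic throughout this hunk
// is presumably a consequence of the chaining-value stack living in
// byte-addressed local memory with no alignment guarantee; keeping the stack
// 4-byte aligned in the CUDA source would likely let nvcc emit 32-bit (or
// vectorized) local accesses instead of these byte ladders.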
add.s32 %r5455, %r5454, %r5453; + xor.b32 %r5456, %r5455, %r5450; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 24; + add.s32 %r5458, %r5457, %r5451; + xor.b32 %r5459, %r5458, %r5453; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 25; + add.s32 %r5461, %r5413, %r5030; + add.s32 %r5462, %r5461, %r5376; + xor.b32 %r5463, %r5462, %r5401; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 16; + add.s32 %r5465, %r5464, %r5388; + xor.b32 %r5466, %r5465, %r5376; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 20; + add.s32 %r5468, %r5462, %r4981; + add.s32 %r5469, %r5468, %r5467; + xor.b32 %r5470, %r5469, %r5464; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 24; + add.s32 %r5472, %r5471, %r5465; + xor.b32 %r5473, %r5472, %r5467; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 25; + add.s32 %r5475, %r5427, %r5044; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5443; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 16; + add.s32 %r5479, %r5478, %r5458; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 20; + add.s32 %r5482, %r5476, %r5023; + add.s32 %r5483, %r5482, %r5481; + xor.b32 %r5484, %r5483, %r5478; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 24; + add.s32 %r5486, %r5485, %r5479; + xor.b32 %r5487, %r5486, %r5481; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 25; + add.s32 %r5489, %r5441, %r5058; + add.s32 %r5490, %r5489, %r5432; + xor.b32 %r5491, %r5490, %r5457; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 16; + add.s32 %r5493, %r5492, %r5472; + xor.b32 %r5494, %r5493, %r5432; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 20; + add.s32 %r5496, %r5490, %r5037; + add.s32 %r5497, %r5496, %r5495; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 24; + add.s32 %r5500, %r5499, %r5493; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 25; + add.s32 %r5503, %r5455, %r5072; + add.s32 %r5504, %r5503, %r5446; + xor.b32 %r5505, %r5471, %r5504; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 16; + add.s32 %r5507, %r5506, %r5430; + xor.b32 %r5508, %r5507, %r5446; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 20; + add.s32 %r5510, %r5504, %r4995; + add.s32 %r5511, %r5510, %r5509; + xor.b32 %r5512, %r5511, %r5506; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 24; + add.s32 %r5514, %r5513, %r5507; + xor.b32 %r5515, %r5514, %r5509; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 25; + add.s32 %r5517, %r5469, %r5065; + add.s32 %r5518, %r5517, %r5460; + xor.b32 %r5519, %r5518, %r5429; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 16; + add.s32 %r5521, %r5520, %r5444; + xor.b32 %r5522, %r5521, %r5460; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 20; + add.s32 %r5524, %r5518, %r5079; + add.s32 %r5525, %r5524, %r5523; + xor.b32 %r5526, %r5525, %r5520; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 24; + add.s32 %r5528, %r5527, %r5521; + xor.b32 %r5529, %r5528, %r5523; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 25; + add.s32 %r5531, %r5483, %r5002; + add.s32 %r5532, %r5531, %r5502; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 16; + add.s32 %r5535, %r5534, %r5514; + xor.b32 %r5536, %r5535, %r5502; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 20; + add.s32 %r5538, %r5532, %r4974; + add.s32 %r5539, %r5538, %r5537; + xor.b32 %r5540, %r5539, %r5534; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 24; + add.s32 %r5542, %r5541, %r5535; + xor.b32 %r5543, %r5542, %r5537; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 25; + add.s32 %r5545, %r5497, %r5051; + add.s32 %r5546, %r5545, %r5516; + xor.b32 %r5547, %r5485, %r5546; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 16; + add.s32 %r5549, %r5548, %r5528; + xor.b32 %r5550, %r5549, 
%r5516; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 20; + add.s32 %r5552, %r5546, %r4988; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5548; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 24; + add.s32 %r5556, %r5555, %r5549; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 25; + add.s32 %r5559, %r5511, %r5009; + add.s32 %r5560, %r5559, %r5530; + xor.b32 %r5561, %r5560, %r5499; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 16; + add.s32 %r5563, %r5562, %r5486; + xor.b32 %r5564, %r5563, %r5530; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 20; + add.s32 %r5566, %r5560, %r5030; + add.s32 %r5567, %r5566, %r5565; + xor.b32 %r5568, %r5567, %r5562; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 24; + add.s32 %r5570, %r5569, %r5563; + xor.b32 %r5571, %r5570, %r5565; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 25; + add.s32 %r5573, %r5525, %r4981; + add.s32 %r5574, %r5573, %r5488; + xor.b32 %r5575, %r5574, %r5513; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 16; + add.s32 %r5577, %r5576, %r5500; + xor.b32 %r5578, %r5577, %r5488; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 20; + add.s32 %r5580, %r5574, %r5016; + add.s32 %r5581, %r5580, %r5579; + xor.b32 %r5582, %r5581, %r5576; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 24; + add.s32 %r5584, %r5583, %r5577; + xor.b32 %r5585, %r5584, %r5579; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 25; + add.s32 %r5587, %r5539, %r5058; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5555; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 16; + add.s32 %r5591, %r5590, %r5570; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 20; + add.s32 %r5594, %r5588, %r5065; + add.s32 %r5595, %r5594, %r5593; + xor.b32 %r5596, %r5595, %r5590; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 24; + add.s32 %r5598, %r5597, %r5591; + xor.b32 %r5599, %r5598, %r5593; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 25; + add.s32 %r5601, %r5553, %r5037; + add.s32 %r5602, %r5601, %r5544; + xor.b32 %r5603, %r5602, %r5569; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 16; + add.s32 %r5605, %r5604, %r5584; + xor.b32 %r5606, %r5605, %r5544; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 20; + add.s32 %r5608, %r5602, %r5051; + add.s32 %r5609, %r5608, %r5607; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 24; + add.s32 %r5612, %r5611, %r5605; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 25; + add.s32 %r5615, %r5567, %r5079; + add.s32 %r5616, %r5615, %r5558; + xor.b32 %r5617, %r5583, %r5616; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 16; + add.s32 %r5619, %r5618, %r5542; + xor.b32 %r5620, %r5619, %r5558; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 20; + add.s32 %r5622, %r5616, %r5044; + add.s32 %r5623, %r5622, %r5621; + xor.b32 %r5624, %r5623, %r5618; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 24; + add.s32 %r5626, %r5625, %r5619; + xor.b32 %r5627, %r5626, %r5621; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 25; + add.s32 %r5629, %r5581, %r5072; + add.s32 %r5630, %r5629, %r5572; + xor.b32 %r5631, %r5630, %r5541; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 16; + add.s32 %r5633, %r5632, %r5556; + xor.b32 %r5634, %r5633, %r5572; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 20; + add.s32 %r5636, %r5630, %r5030; + add.s32 %r5637, %r5636, %r5635; + xor.b32 %r5638, %r5637, %r5632; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 24; + add.s32 %r5640, %r5639, %r5633; + xor.b32 %r5641, %r5640, %r5635; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 25; + add.s32 %r5643, %r5595, %r5023; + add.s32 %r5644, %r5643, %r5614; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 
%r5646, %r5645, %r5645, 16; + add.s32 %r5647, %r5646, %r5626; + xor.b32 %r5648, %r5647, %r5614; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 20; + add.s32 %r5650, %r5644, %r4988; + add.s32 %r5651, %r5650, %r5649; + xor.b32 %r5652, %r5651, %r5646; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 24; + add.s32 %r5654, %r5653, %r5647; + xor.b32 %r5655, %r5654, %r5649; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 25; + add.s32 %r5657, %r5609, %r5009; + add.s32 %r5658, %r5657, %r5628; + xor.b32 %r5659, %r5597, %r5658; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 16; + add.s32 %r5661, %r5660, %r5640; + xor.b32 %r5662, %r5661, %r5628; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 20; + add.s32 %r5664, %r5658, %r4995; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5660; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 24; + add.s32 %r5668, %r5667, %r5661; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 25; + add.s32 %r5671, %r5623, %r4974; + add.s32 %r5672, %r5671, %r5642; + xor.b32 %r5673, %r5672, %r5611; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 16; + add.s32 %r5675, %r5674, %r5598; + xor.b32 %r5676, %r5675, %r5642; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 20; + add.s32 %r5678, %r5672, %r4981; + add.s32 %r5679, %r5678, %r5677; + xor.b32 %r5680, %r5679, %r5674; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 24; + add.s32 %r5682, %r5681, %r5675; + xor.b32 %r5683, %r5682, %r5677; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 25; + add.s32 %r5685, %r5637, %r5016; + add.s32 %r5686, %r5685, %r5600; + xor.b32 %r5687, %r5686, %r5625; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 16; + add.s32 %r5689, %r5688, %r5612; + xor.b32 %r5690, %r5689, %r5600; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 20; + add.s32 %r5692, %r5686, %r5002; + add.s32 %r5693, %r5692, %r5691; + xor.b32 %r5694, %r5693, %r5688; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 24; + add.s32 %r5696, %r5695, %r5689; + xor.b32 %r5697, %r5696, %r5691; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 25; + add.s32 %r5699, %r5651, %r5037; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5667; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 16; + add.s32 %r5703, %r5702, %r5682; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 20; + add.s32 %r5706, %r5700, %r5072; + add.s32 %r5707, %r5706, %r5705; + xor.b32 %r5708, %r5707, %r5702; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 24; + add.s32 %r5710, %r5709, %r5703; + xor.b32 %r5711, %r5710, %r5705; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 25; + add.s32 %r5713, %r5665, %r5051; + add.s32 %r5714, %r5713, %r5656; + xor.b32 %r5715, %r5714, %r5681; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 16; + add.s32 %r5717, %r5716, %r5696; + xor.b32 %r5718, %r5717, %r5656; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 20; + add.s32 %r5720, %r5714, %r5009; + add.s32 %r5721, %r5720, %r5719; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 24; + add.s32 %r5724, %r5723, %r5717; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 25; + add.s32 %r5727, %r5679, %r5030; + add.s32 %r5728, %r5727, %r5670; + xor.b32 %r5729, %r5695, %r5728; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 16; + add.s32 %r5731, %r5730, %r5654; + xor.b32 %r5732, %r5731, %r5670; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 20; + add.s32 %r5734, %r5728, %r5058; + add.s32 %r5735, %r5734, %r5733; + xor.b32 %r5736, %r5735, %r5730; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 24; + add.s32 %r5738, %r5737, %r5731; + xor.b32 %r5739, %r5738, %r5733; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 25; + add.s32 %r5741, %r5693, %r5079; 
+ add.s32 %r5742, %r5741, %r5684; + xor.b32 %r5743, %r5742, %r5653; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 16; + add.s32 %r5745, %r5744, %r5668; + xor.b32 %r5746, %r5745, %r5684; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 20; + add.s32 %r5748, %r5742, %r4981; + add.s32 %r5749, %r5748, %r5747; + xor.b32 %r5750, %r5749, %r5744; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 24; + add.s32 %r5752, %r5751, %r5745; + xor.b32 %r5753, %r5752, %r5747; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 25; + add.s32 %r5755, %r5707, %r5065; + add.s32 %r5756, %r5755, %r5726; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 16; + add.s32 %r5759, %r5758, %r5738; + xor.b32 %r5760, %r5759, %r5726; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 20; + add.s32 %r5762, %r5756, %r4995; + add.s32 %r5763, %r5762, %r5761; + xor.b32 %r5764, %r5763, %r5758; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 24; + add.s32 %r5766, %r5765, %r5759; + xor.b32 %r5767, %r5766, %r5761; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 25; + add.s32 %r5769, %r5721, %r4974; + add.s32 %r5770, %r5769, %r5740; + xor.b32 %r5771, %r5709, %r5770; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 16; + add.s32 %r5773, %r5772, %r5752; + xor.b32 %r5774, %r5773, %r5740; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 20; + add.s32 %r5776, %r5770, %r5044; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5772; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 24; + add.s32 %r5780, %r5779, %r5773; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 25; + add.s32 %r5783, %r5735, %r4988; + add.s32 %r5784, %r5783, %r5754; + xor.b32 %r5785, %r5784, %r5723; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 16; + add.s32 %r5787, %r5786, %r5710; + xor.b32 %r5788, %r5787, %r5754; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 20; + add.s32 %r5790, %r5784, %r5016; + add.s32 %r5791, %r5790, %r5789; + xor.b32 %r5792, %r5791, %r5786; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 24; + add.s32 %r5794, %r5793, %r5787; + xor.b32 %r5795, %r5794, %r5789; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 25; + add.s32 %r5797, %r5749, %r5002; + add.s32 %r5798, %r5797, %r5712; + xor.b32 %r5799, %r5798, %r5737; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 16; + add.s32 %r5801, %r5800, %r5724; + xor.b32 %r5802, %r5801, %r5712; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 20; + add.s32 %r5804, %r5798, %r5023; + add.s32 %r5805, %r5804, %r5803; + xor.b32 %r5806, %r5805, %r5800; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 24; + add.s32 %r5808, %r5807, %r5801; + xor.b32 %r5809, %r5808, %r5803; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 25; + add.s32 %r5811, %r5763, %r5051; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5779; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 16; + add.s32 %r5815, %r5814, %r5794; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 20; + add.s32 %r5818, %r5812, %r5079; + add.s32 %r5819, %r5818, %r5817; + xor.b32 %r5820, %r5819, %r5814; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 24; + add.s32 %r5822, %r5821, %r5815; + xor.b32 %r5823, %r5822, %r5817; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 25; + add.s32 %r5825, %r5777, %r5009; + add.s32 %r5826, %r5825, %r5768; + xor.b32 %r5827, %r5826, %r5793; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 16; + add.s32 %r5829, %r5828, %r5808; + xor.b32 %r5830, %r5829, %r5768; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 20; + add.s32 %r5832, %r5826, %r4974; + add.s32 %r5833, %r5832, %r5831; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 24; + add.s32 %r5836, %r5835, %r5829; + xor.b32 %r5837, %r5836, 
%r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 25; + add.s32 %r5839, %r5791, %r4981; + add.s32 %r5840, %r5839, %r5782; + xor.b32 %r5841, %r5807, %r5840; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 16; + add.s32 %r5843, %r5842, %r5766; + xor.b32 %r5844, %r5843, %r5782; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 20; + add.s32 %r5846, %r5840, %r5037; + add.s32 %r5847, %r5846, %r5845; + xor.b32 %r5848, %r5847, %r5842; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 24; + add.s32 %r5850, %r5849, %r5843; + xor.b32 %r5851, %r5850, %r5845; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 25; + add.s32 %r5853, %r5805, %r5030; + add.s32 %r5854, %r5853, %r5796; + xor.b32 %r5855, %r5854, %r5765; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 16; + add.s32 %r5857, %r5856, %r5780; + xor.b32 %r5858, %r5857, %r5796; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 20; + add.s32 %r5860, %r5854, %r5016; + add.s32 %r5861, %r5860, %r5859; + xor.b32 %r5862, %r5861, %r5856; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 24; + add.s32 %r5864, %r5863, %r5857; + xor.b32 %r5865, %r5864, %r5859; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 25; + add.s32 %r5867, %r5819, %r5072; + add.s32 %r5868, %r5867, %r5838; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 16; + add.s32 %r5871, %r5870, %r5850; + xor.b32 %r5872, %r5871, %r5838; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 20; + add.s32 %r5874, %r5868, %r5044; + add.s32 %r5875, %r5874, %r5873; + xor.b32 %r5876, %r5875, %r5870; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 24; + add.s32 %r5878, %r5877, %r5871; + xor.b32 %r5879, %r5878, %r5873; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 25; + add.s32 %r5881, %r5833, %r4988; + add.s32 %r5882, %r5881, %r5852; + xor.b32 %r5883, %r5821, %r5882; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 16; + add.s32 %r5885, %r5884, %r5864; + xor.b32 %r5886, %r5885, %r5852; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 20; + add.s32 %r5888, %r5882, %r5058; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5884; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 24; + add.s32 %r5892, %r5891, %r5885; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 25; + add.s32 %r5895, %r5847, %r4995; + add.s32 %r5896, %r5895, %r5866; + xor.b32 %r5897, %r5896, %r5835; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 16; + add.s32 %r5899, %r5898, %r5822; + xor.b32 %r5900, %r5899, %r5866; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 20; + add.s32 %r5902, %r5896, %r5002; + add.s32 %r5903, %r5902, %r5901; + xor.b32 %r5904, %r5903, %r5898; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 24; + add.s32 %r5906, %r5905, %r5899; + xor.b32 %r5907, %r5906, %r5901; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 25; + add.s32 %r5909, %r5861, %r5023; + add.s32 %r5910, %r5909, %r5824; + xor.b32 %r5911, %r5910, %r5849; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 16; + add.s32 %r5913, %r5912, %r5836; + xor.b32 %r5914, %r5913, %r5824; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 20; + add.s32 %r5916, %r5910, %r5065; + add.s32 %r5917, %r5916, %r5915; + xor.b32 %r5918, %r5917, %r5912; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 24; + add.s32 %r5920, %r5919, %r5913; + xor.b32 %r5921, %r5920, %r5915; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 25; + xor.b32 %r5923, %r5906, %r5875; + xor.b32 %r5924, %r5920, %r5889; + xor.b32 %r5925, %r5878, %r5903; + xor.b32 %r5926, %r5917, %r5892; + xor.b32 %r5927, %r5922, %r5891; + xor.b32 %r5928, %r5880, %r5905; + xor.b32 %r5929, %r5919, %r5894; + xor.b32 %r5930, %r5908, %r5877; + st.local.u8 [%rd179+145], %r5923; + shr.u32 %r5931, %r5923, 8; + st.local.u8 [%rd179+146], %r5931; + 
shr.u32 %r5932, %r5923, 16; + st.local.u8 [%rd179+147], %r5932; + shr.u32 %r5933, %r5923, 24; + st.local.u8 [%rd179+148], %r5933; + st.local.u8 [%rd179+149], %r5924; + shr.u32 %r5934, %r5924, 8; + st.local.u8 [%rd179+150], %r5934; + shr.u32 %r5935, %r5924, 16; + st.local.u8 [%rd179+151], %r5935; + shr.u32 %r5936, %r5924, 24; + st.local.u8 [%rd179+152], %r5936; + st.local.u8 [%rd179+153], %r5925; + shr.u32 %r5937, %r5925, 8; + st.local.u8 [%rd179+154], %r5937; + shr.u32 %r5938, %r5925, 16; + st.local.u8 [%rd179+155], %r5938; + shr.u32 %r5939, %r5925, 24; + st.local.u8 [%rd179+156], %r5939; + st.local.u8 [%rd179+157], %r5926; + shr.u32 %r5940, %r5926, 8; + st.local.u8 [%rd179+158], %r5940; + shr.u32 %r5941, %r5926, 16; + st.local.u8 [%rd179+159], %r5941; + shr.u32 %r5942, %r5926, 24; + st.local.u8 [%rd179+160], %r5942; + st.local.u8 [%rd179+161], %r5927; + shr.u32 %r5943, %r5927, 8; + st.local.u8 [%rd179+162], %r5943; + shr.u32 %r5944, %r5927, 16; + st.local.u8 [%rd179+163], %r5944; + shr.u32 %r5945, %r5927, 24; + st.local.u8 [%rd179+164], %r5945; + st.local.u8 [%rd179+165], %r5928; + shr.u32 %r5946, %r5928, 8; + st.local.u8 [%rd179+166], %r5946; + shr.u32 %r5947, %r5928, 16; + st.local.u8 [%rd179+167], %r5947; + shr.u32 %r5948, %r5928, 24; + st.local.u8 [%rd179+168], %r5948; + st.local.u8 [%rd179+169], %r5929; + shr.u32 %r5949, %r5929, 8; + st.local.u8 [%rd179+170], %r5949; + shr.u32 %r5950, %r5929, 16; + st.local.u8 [%rd179+171], %r5950; + shr.u32 %r5951, %r5929, 24; + st.local.u8 [%rd179+172], %r5951; + st.local.u8 [%rd179+173], %r5930; + shr.u32 %r5952, %r5930, 8; + st.local.u8 [%rd179+174], %r5952; + shr.u32 %r5953, %r5930, 16; + st.local.u8 [%rd179+175], %r5953; + shr.u32 %r5954, %r5930, 24; + st.local.u8 [%rd179+176], %r5954; + ld.local.u8 %rs176, [%rd3+8]; + add.s16 %rs177, %rs176, -1; + st.local.u8 [%rd3+8], %rs177; + cvt.u64.u16 %rd180, %rs177; + and.b64 %rd181, %rd180, 255; + setp.lt.u64 %p30, %rd227, %rd181; + and.b16 %rs178, %rs177, 255; + mul.wide.u16 %r11663, %rs178, 32; + @%p30 bra $L__BB1_34; + +$L__BB1_35: + cvt.s64.s32 %rd182, %r11663; + add.s64 %rd183, %rd2, %rd182; + mov.b32 {%rs179, %rs180}, %r3967; + st.local.u8 [%rd183+145], %rs179; + shr.u16 %rs181, %rs179, 8; + st.local.u8 [%rd183+146], %rs181; + st.local.u8 [%rd183+147], %rs180; + shr.u16 %rs182, %rs180, 8; + st.local.u8 [%rd183+148], %rs182; + mov.b32 {%rs183, %rs184}, %r3968; + st.local.u8 [%rd183+149], %rs183; + shr.u16 %rs185, %rs183, 8; + st.local.u8 [%rd183+150], %rs185; + st.local.u8 [%rd183+151], %rs184; + shr.u16 %rs186, %rs184, 8; + st.local.u8 [%rd183+152], %rs186; + mov.b32 {%rs187, %rs188}, %r3969; + st.local.u8 [%rd183+153], %rs187; + shr.u16 %rs189, %rs187, 8; + st.local.u8 [%rd183+154], %rs189; + st.local.u8 [%rd183+155], %rs188; + shr.u16 %rs190, %rs188, 8; + st.local.u8 [%rd183+156], %rs190; + mov.b32 {%rs191, %rs192}, %r3970; + st.local.u8 [%rd183+157], %rs191; + shr.u16 %rs193, %rs191, 8; + st.local.u8 [%rd183+158], %rs193; + st.local.u8 [%rd183+159], %rs192; + shr.u16 %rs194, %rs192, 8; + st.local.u8 [%rd183+160], %rs194; + mov.b32 {%rs195, %rs196}, %r3971; + st.local.u8 [%rd183+161], %rs195; + shr.u16 %rs197, %rs195, 8; + st.local.u8 [%rd183+162], %rs197; + st.local.u8 [%rd183+163], %rs196; + shr.u16 %rs198, %rs196, 8; + st.local.u8 [%rd183+164], %rs198; + mov.b32 {%rs199, %rs200}, %r3972; + st.local.u8 [%rd183+165], %rs199; + shr.u16 %rs201, %rs199, 8; + st.local.u8 [%rd183+166], %rs201; + st.local.u8 [%rd183+167], %rs200; + shr.u16 %rs202, %rs200, 8; + st.local.u8 [%rd183+168], %rs202; + 
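+ // The st.local.u8 runs here appear to spill a 32-byte chaining value (eight
+ // little-endian u32 words, written one byte at a time) into the local CV buffer.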
mov.b32 {%rs203, %rs204}, %r3973; + st.local.u8 [%rd183+169], %rs203; + shr.u16 %rs205, %rs203, 8; + st.local.u8 [%rd183+170], %rs205; + st.local.u8 [%rd183+171], %rs204; + shr.u16 %rs206, %rs204, 8; + st.local.u8 [%rd183+172], %rs206; + mov.b32 {%rs207, %rs208}, %r3974; + st.local.u8 [%rd183+173], %rs207; + shr.u16 %rs209, %rs207, 8; + st.local.u8 [%rd183+174], %rs209; + st.local.u8 [%rd183+175], %rs208; + shr.u16 %rs210, %rs208, 8; + st.local.u8 [%rd183+176], %rs210; + ld.local.u8 %rs388, [%rd3+8]; + +$L__BB1_47: + add.s16 %rs331, %rs388, 1; + st.local.u8 [%rd3+8], %rs331; + ld.local.u64 %rd196, [%rd3+-72]; + shr.u64 %rd197, %rd49, 10; + add.s64 %rd251, %rd196, %rd197; + st.local.u64 [%rd3+-72], %rd251; + add.s64 %rd261, %rd261, %rd49; + add.s64 %rd254, %rd254, %rd49; + sub.s64 %rd262, %rd262, %rd49; + setp.gt.u64 %p39, %rd262, 1024; + @%p39 bra $L__BB1_26; + +$L__BB1_48: + setp.eq.s64 %p40, %rd262, 0; + @%p40 bra $L__BB1_68; + + ld.local.u8 %rs389, [%rd3]; + cvt.u64.u16 %rd71, %rs389; + setp.eq.s16 %p41, %rs389, 0; + mov.u16 %rs390, 0; + mov.u64 %rd271, %rd262; + @%p41 bra $L__BB1_57; + + mov.u64 %rd198, 64; + sub.s64 %rd199, %rd198, %rd71; + min.u64 %rd72, %rd199, %rd262; + setp.eq.s64 %p42, %rd72, 0; + @%p42 bra $L__BB1_54; + + add.s64 %rd201, %rd2, %rd71; + add.s64 %rd73, %rd201, 72; + mov.u64 %rd263, 0; + +$L__BB1_52: + add.s64 %rd202, %rd261, %rd263; + ld.local.u8 %rs333, [%rd202]; + add.s64 %rd203, %rd73, %rd263; + st.local.u8 [%rd203], %rs333; + add.s64 %rd263, %rd263, 1; + setp.lt.u64 %p43, %rd263, %rd72; + @%p43 bra $L__BB1_52; + + ld.local.u8 %rs389, [%rd3]; + +$L__BB1_54: + cvt.u16.u64 %rs334, %rd72; + add.s16 %rs390, %rs389, %rs334; + mov.u64 %rd271, 0; + st.local.u8 [%rd3], %rs390; + add.s64 %rd261, %rd261, %rd72; + sub.s64 %rd77, %rd262, %rd72; + setp.eq.s64 %p44, %rd77, 0; + @%p44 bra $L__BB1_57; + + add.s64 %rd78, %rd2, 72; + ld.local.u8 %rs335, [%rd3+1]; + mov.u64 %rd264, 0; + setp.eq.s16 %p45, %rs335, 0; + mov.u16 %rs390, 0; + selp.u16 %rs337, 1, 0, %p45; + ld.local.u8 %rs338, [%rd3+2]; + or.b16 %rs339, %rs338, %rs337; + ld.local.u8 %r8843, [%rd3+-64]; + ld.local.u8 %r8844, [%rd3+-63]; + prmt.b32 %r8845, %r8844, %r8843, 30212; + ld.local.u8 %r8846, [%rd3+-62]; + prmt.b32 %r8847, %r8846, %r8845, 28756; + ld.local.u8 %r8848, [%rd3+-61]; + prmt.b32 %r8849, %r8848, %r8847, 1620; + ld.local.u8 %r8850, [%rd3+-60]; + ld.local.u8 %r8851, [%rd3+-59]; + prmt.b32 %r8852, %r8851, %r8850, 30212; + ld.local.u8 %r8853, [%rd3+-58]; + prmt.b32 %r8854, %r8853, %r8852, 28756; + ld.local.u8 %r8855, [%rd3+-57]; + prmt.b32 %r8856, %r8855, %r8854, 1620; + ld.local.u8 %r8857, [%rd3+-56]; + ld.local.u8 %r8858, [%rd3+-55]; + prmt.b32 %r8859, %r8858, %r8857, 30212; + ld.local.u8 %r8860, [%rd3+-54]; + prmt.b32 %r8861, %r8860, %r8859, 28756; + ld.local.u8 %r8862, [%rd3+-53]; + prmt.b32 %r8863, %r8862, %r8861, 1620; + ld.local.u8 %r8864, [%rd3+-52]; + ld.local.u8 %r8865, [%rd3+-51]; + prmt.b32 %r8866, %r8865, %r8864, 30212; + ld.local.u8 %r8867, [%rd3+-50]; + prmt.b32 %r8868, %r8867, %r8866, 28756; + ld.local.u8 %r8869, [%rd3+-49]; + prmt.b32 %r8870, %r8869, %r8868, 1620; + ld.local.u8 %r8871, [%rd3+-48]; + ld.local.u8 %r8872, [%rd3+-47]; + prmt.b32 %r8873, %r8872, %r8871, 30212; + ld.local.u8 %r8874, [%rd3+-46]; + prmt.b32 %r8875, %r8874, %r8873, 28756; + ld.local.u8 %r8876, [%rd3+-45]; + prmt.b32 %r8877, %r8876, %r8875, 1620; + ld.local.u8 %r8878, [%rd3+-44]; + ld.local.u8 %r8879, [%rd3+-43]; + prmt.b32 %r8880, %r8879, %r8878, 30212; + ld.local.u8 %r8881, [%rd3+-42]; + prmt.b32 %r8882, %r8881, 
%r8880, 28756; + ld.local.u8 %r8883, [%rd3+-41]; + prmt.b32 %r8884, %r8883, %r8882, 1620; + ld.local.u8 %r8885, [%rd3+-40]; + ld.local.u8 %r8886, [%rd3+-39]; + prmt.b32 %r8887, %r8886, %r8885, 30212; + ld.local.u8 %r8888, [%rd3+-38]; + prmt.b32 %r8889, %r8888, %r8887, 28756; + ld.local.u8 %r8890, [%rd3+-37]; + prmt.b32 %r8891, %r8890, %r8889, 1620; + ld.local.u8 %r8892, [%rd3+-36]; + ld.local.u8 %r8893, [%rd3+-35]; + prmt.b32 %r8894, %r8893, %r8892, 30212; + ld.local.u8 %r8895, [%rd3+-34]; + prmt.b32 %r8896, %r8895, %r8894, 28756; + ld.local.u8 %r8897, [%rd3+-33]; + prmt.b32 %r8898, %r8897, %r8896, 1620; + ld.local.u8 %r8899, [%rd3+-32]; + ld.local.u8 %r8900, [%rd3+-31]; + prmt.b32 %r8901, %r8900, %r8899, 30212; + ld.local.u8 %r8902, [%rd3+-30]; + prmt.b32 %r8903, %r8902, %r8901, 28756; + ld.local.u8 %r8904, [%rd3+-29]; + prmt.b32 %r8905, %r8904, %r8903, 1620; + ld.local.u8 %r8906, [%rd3+-28]; + ld.local.u8 %r8907, [%rd3+-27]; + prmt.b32 %r8908, %r8907, %r8906, 30212; + ld.local.u8 %r8909, [%rd3+-26]; + prmt.b32 %r8910, %r8909, %r8908, 28756; + ld.local.u8 %r8911, [%rd3+-25]; + prmt.b32 %r8912, %r8911, %r8910, 1620; + ld.local.u8 %r8913, [%rd3+-24]; + ld.local.u8 %r8914, [%rd3+-23]; + prmt.b32 %r8915, %r8914, %r8913, 30212; + ld.local.u8 %r8916, [%rd3+-22]; + prmt.b32 %r8917, %r8916, %r8915, 28756; + ld.local.u8 %r8918, [%rd3+-21]; + prmt.b32 %r8919, %r8918, %r8917, 1620; + ld.local.u8 %r8920, [%rd3+-20]; + ld.local.u8 %r8921, [%rd3+-19]; + prmt.b32 %r8922, %r8921, %r8920, 30212; + ld.local.u8 %r8923, [%rd3+-18]; + prmt.b32 %r8924, %r8923, %r8922, 28756; + ld.local.u8 %r8925, [%rd3+-17]; + prmt.b32 %r8926, %r8925, %r8924, 1620; + ld.local.u8 %r8927, [%rd3+-16]; + ld.local.u8 %r8928, [%rd3+-15]; + prmt.b32 %r8929, %r8928, %r8927, 30212; + ld.local.u8 %r8930, [%rd3+-14]; + prmt.b32 %r8931, %r8930, %r8929, 28756; + ld.local.u8 %r8932, [%rd3+-13]; + prmt.b32 %r8933, %r8932, %r8931, 1620; + ld.local.u8 %r8934, [%rd3+-12]; + ld.local.u8 %r8935, [%rd3+-11]; + prmt.b32 %r8936, %r8935, %r8934, 30212; + ld.local.u8 %r8937, [%rd3+-10]; + prmt.b32 %r8938, %r8937, %r8936, 28756; + ld.local.u8 %r8939, [%rd3+-9]; + prmt.b32 %r8940, %r8939, %r8938, 1620; + ld.local.u8 %r8941, [%rd3+-8]; + ld.local.u8 %r8942, [%rd3+-7]; + prmt.b32 %r8943, %r8942, %r8941, 30212; + ld.local.u8 %r8944, [%rd3+-6]; + prmt.b32 %r8945, %r8944, %r8943, 28756; + ld.local.u8 %r8946, [%rd3+-5]; + prmt.b32 %r8947, %r8946, %r8945, 1620; + ld.local.u8 %r8948, [%rd3+-4]; + ld.local.u8 %r8949, [%rd3+-3]; + prmt.b32 %r8950, %r8949, %r8948, 30212; + ld.local.u8 %r8951, [%rd3+-2]; + prmt.b32 %r8952, %r8951, %r8950, 28756; + ld.local.u8 %r8953, [%rd3+-1]; + prmt.b32 %r8954, %r8953, %r8952, 1620; + ld.local.u64 %rd206, [%rd3+-72]; + cvt.u32.u64 %r8955, %rd206; + shr.u64 %rd207, %rd206, 32; + cvt.u32.u64 %r8956, %rd207; + cvt.u32.u16 %r8957, %rs339; + and.b32 %r8958, %r8957, 255; + ld.local.u32 %r8959, [%rd3+-104]; + add.s32 %r8960, %r8959, %r8849; + ld.local.u32 %r8961, [%rd3+-88]; + add.s32 %r8962, %r8960, %r8961; + xor.b32 %r8963, %r8962, %r8955; + shf.l.wrap.b32 %r8964, %r8963, %r8963, 16; + add.s32 %r8965, %r8964, 1779033703; + xor.b32 %r8966, %r8965, %r8961; + shf.l.wrap.b32 %r8967, %r8966, %r8966, 20; + add.s32 %r8968, %r8962, %r8856; + add.s32 %r8969, %r8968, %r8967; + xor.b32 %r8970, %r8969, %r8964; + shf.l.wrap.b32 %r8971, %r8970, %r8970, 24; + add.s32 %r8972, %r8971, %r8965; + xor.b32 %r8973, %r8972, %r8967; + shf.l.wrap.b32 %r8974, %r8973, %r8973, 25; + ld.local.u32 %r8975, [%rd3+-100]; + add.s32 %r8976, %r8975, %r8863; + 
ld.local.u32 %r8977, [%rd3+-84]; + add.s32 %r8978, %r8976, %r8977; + xor.b32 %r8979, %r8978, %r8956; + shf.l.wrap.b32 %r8980, %r8979, %r8979, 16; + add.s32 %r8981, %r8980, -1150833019; + xor.b32 %r8982, %r8981, %r8977; + shf.l.wrap.b32 %r8983, %r8982, %r8982, 20; + add.s32 %r8984, %r8978, %r8870; + add.s32 %r8985, %r8984, %r8983; + xor.b32 %r8986, %r8985, %r8980; + shf.l.wrap.b32 %r8987, %r8986, %r8986, 24; + add.s32 %r8988, %r8987, %r8981; + xor.b32 %r8989, %r8988, %r8983; + shf.l.wrap.b32 %r8990, %r8989, %r8989, 25; + ld.local.u32 %r8991, [%rd3+-96]; + add.s32 %r8992, %r8991, %r8877; + ld.local.u32 %r8993, [%rd3+-80]; + add.s32 %r8994, %r8992, %r8993; + shr.u32 %r8995, %r8994, 16; + shl.b32 %r8996, %r8994, 16; + xor.b32 %r8997, %r8996, 4194304; + or.b32 %r8998, %r8997, %r8995; + add.s32 %r8999, %r8998, 1013904242; + xor.b32 %r9000, %r8999, %r8993; + shf.l.wrap.b32 %r9001, %r9000, %r9000, 20; + add.s32 %r9002, %r8994, %r8884; + add.s32 %r9003, %r9002, %r9001; + xor.b32 %r9004, %r9003, %r8998; + shf.l.wrap.b32 %r9005, %r9004, %r9004, 24; + add.s32 %r9006, %r9005, %r8999; + xor.b32 %r9007, %r9006, %r9001; + shf.l.wrap.b32 %r9008, %r9007, %r9007, 25; + ld.local.u32 %r9009, [%rd3+-92]; + add.s32 %r9010, %r9009, %r8891; + ld.local.u32 %r9011, [%rd3+-76]; + add.s32 %r9012, %r9010, %r9011; + xor.b32 %r9013, %r9012, %r8958; + shr.u32 %r9014, %r9012, 16; + shl.b32 %r9015, %r9013, 16; + or.b32 %r9016, %r9015, %r9014; + add.s32 %r9017, %r9016, -1521486534; + xor.b32 %r9018, %r9017, %r9011; + shf.l.wrap.b32 %r9019, %r9018, %r9018, 20; + add.s32 %r9020, %r9012, %r8898; + add.s32 %r9021, %r9020, %r9019; + xor.b32 %r9022, %r9021, %r9016; + shf.l.wrap.b32 %r9023, %r9022, %r9022, 24; + add.s32 %r9024, %r9023, %r9017; + xor.b32 %r9025, %r9024, %r9019; + shf.l.wrap.b32 %r9026, %r9025, %r9025, 25; + add.s32 %r9027, %r8969, %r8905; + add.s32 %r9028, %r9027, %r8990; + xor.b32 %r9029, %r9028, %r9023; + shf.l.wrap.b32 %r9030, %r9029, %r9029, 16; + add.s32 %r9031, %r9030, %r9006; + xor.b32 %r9032, %r9031, %r8990; + shf.l.wrap.b32 %r9033, %r9032, %r9032, 20; + add.s32 %r9034, %r9028, %r8912; + add.s32 %r9035, %r9034, %r9033; + xor.b32 %r9036, %r9035, %r9030; + shf.l.wrap.b32 %r9037, %r9036, %r9036, 24; + add.s32 %r9038, %r9037, %r9031; + xor.b32 %r9039, %r9038, %r9033; + shf.l.wrap.b32 %r9040, %r9039, %r9039, 25; + add.s32 %r9041, %r8985, %r8919; + add.s32 %r9042, %r9041, %r9008; + xor.b32 %r9043, %r9042, %r8971; + shf.l.wrap.b32 %r9044, %r9043, %r9043, 16; + add.s32 %r9045, %r9044, %r9024; + xor.b32 %r9046, %r9045, %r9008; + shf.l.wrap.b32 %r9047, %r9046, %r9046, 20; + add.s32 %r9048, %r9042, %r8926; + add.s32 %r9049, %r9048, %r9047; + xor.b32 %r9050, %r9049, %r9044; + shf.l.wrap.b32 %r9051, %r9050, %r9050, 24; + add.s32 %r9052, %r9051, %r9045; + xor.b32 %r9053, %r9052, %r9047; + shf.l.wrap.b32 %r9054, %r9053, %r9053, 25; + add.s32 %r9055, %r9003, %r8933; + add.s32 %r9056, %r9055, %r9026; + xor.b32 %r9057, %r9056, %r8987; + shf.l.wrap.b32 %r9058, %r9057, %r9057, 16; + add.s32 %r9059, %r9058, %r8972; + xor.b32 %r9060, %r9059, %r9026; + shf.l.wrap.b32 %r9061, %r9060, %r9060, 20; + add.s32 %r9062, %r9056, %r8940; + add.s32 %r9063, %r9062, %r9061; + xor.b32 %r9064, %r9063, %r9058; + shf.l.wrap.b32 %r9065, %r9064, %r9064, 24; + add.s32 %r9066, %r9065, %r9059; + xor.b32 %r9067, %r9066, %r9061; + shf.l.wrap.b32 %r9068, %r9067, %r9067, 25; + add.s32 %r9069, %r9021, %r8947; + add.s32 %r9070, %r9069, %r8974; + xor.b32 %r9071, %r9070, %r9005; + shf.l.wrap.b32 %r9072, %r9071, %r9071, 16; + add.s32 %r9073, %r9072, %r8988; + 
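+ // The ld.local.u8 + prmt.b32 sequences (selectors 30212/28756/1620) above pack four
+ // byte loads into one little-endian u32 message word. The literals 1779033703,
+ // -1150833019, 1013904242 and -1521486534 are 0x6A09E667, 0xBB67AE85, 0x3C6EF372
+ // and 0xA54FF53A, the IV words shared by BLAKE3 and SHA-256.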
xor.b32 %r9074, %r9073, %r8974; + shf.l.wrap.b32 %r9075, %r9074, %r9074, 20; + add.s32 %r9076, %r9070, %r8954; + add.s32 %r9077, %r9076, %r9075; + xor.b32 %r9078, %r9077, %r9072; + shf.l.wrap.b32 %r9079, %r9078, %r9078, 24; + add.s32 %r9080, %r9079, %r9073; + xor.b32 %r9081, %r9080, %r9075; + shf.l.wrap.b32 %r9082, %r9081, %r9081, 25; + add.s32 %r9083, %r9035, %r8863; + add.s32 %r9084, %r9083, %r9082; + xor.b32 %r9085, %r9084, %r9051; + shf.l.wrap.b32 %r9086, %r9085, %r9085, 16; + add.s32 %r9087, %r9086, %r9066; + xor.b32 %r9088, %r9087, %r9082; + shf.l.wrap.b32 %r9089, %r9088, %r9088, 20; + add.s32 %r9090, %r9084, %r8891; + add.s32 %r9091, %r9090, %r9089; + xor.b32 %r9092, %r9091, %r9086; + shf.l.wrap.b32 %r9093, %r9092, %r9092, 24; + add.s32 %r9094, %r9093, %r9087; + xor.b32 %r9095, %r9094, %r9089; + shf.l.wrap.b32 %r9096, %r9095, %r9095, 25; + add.s32 %r9097, %r9049, %r8870; + add.s32 %r9098, %r9097, %r9040; + xor.b32 %r9099, %r9098, %r9065; + shf.l.wrap.b32 %r9100, %r9099, %r9099, 16; + add.s32 %r9101, %r9100, %r9080; + xor.b32 %r9102, %r9101, %r9040; + shf.l.wrap.b32 %r9103, %r9102, %r9102, 20; + add.s32 %r9104, %r9098, %r8919; + add.s32 %r9105, %r9104, %r9103; + xor.b32 %r9106, %r9105, %r9100; + shf.l.wrap.b32 %r9107, %r9106, %r9106, 24; + add.s32 %r9108, %r9107, %r9101; + xor.b32 %r9109, %r9108, %r9103; + shf.l.wrap.b32 %r9110, %r9109, %r9109, 25; + add.s32 %r9111, %r9063, %r8898; + add.s32 %r9112, %r9111, %r9054; + xor.b32 %r9113, %r9112, %r9079; + shf.l.wrap.b32 %r9114, %r9113, %r9113, 16; + add.s32 %r9115, %r9114, %r9038; + xor.b32 %r9116, %r9115, %r9054; + shf.l.wrap.b32 %r9117, %r9116, %r9116, 20; + add.s32 %r9118, %r9112, %r8849; + add.s32 %r9119, %r9118, %r9117; + xor.b32 %r9120, %r9119, %r9114; + shf.l.wrap.b32 %r9121, %r9120, %r9120, 24; + add.s32 %r9122, %r9121, %r9115; + xor.b32 %r9123, %r9122, %r9117; + shf.l.wrap.b32 %r9124, %r9123, %r9123, 25; + add.s32 %r9125, %r9077, %r8877; + add.s32 %r9126, %r9125, %r9068; + xor.b32 %r9127, %r9126, %r9037; + shf.l.wrap.b32 %r9128, %r9127, %r9127, 16; + add.s32 %r9129, %r9128, %r9052; + xor.b32 %r9130, %r9129, %r9068; + shf.l.wrap.b32 %r9131, %r9130, %r9130, 20; + add.s32 %r9132, %r9126, %r8940; + add.s32 %r9133, %r9132, %r9131; + xor.b32 %r9134, %r9133, %r9128; + shf.l.wrap.b32 %r9135, %r9134, %r9134, 24; + add.s32 %r9136, %r9135, %r9129; + xor.b32 %r9137, %r9136, %r9131; + shf.l.wrap.b32 %r9138, %r9137, %r9137, 25; + add.s32 %r9139, %r9091, %r8856; + add.s32 %r9140, %r9139, %r9110; + xor.b32 %r9141, %r9140, %r9135; + shf.l.wrap.b32 %r9142, %r9141, %r9141, 16; + add.s32 %r9143, %r9142, %r9122; + xor.b32 %r9144, %r9143, %r9110; + shf.l.wrap.b32 %r9145, %r9144, %r9144, 20; + add.s32 %r9146, %r9140, %r8926; + add.s32 %r9147, %r9146, %r9145; + xor.b32 %r9148, %r9147, %r9142; + shf.l.wrap.b32 %r9149, %r9148, %r9148, 24; + add.s32 %r9150, %r9149, %r9143; + xor.b32 %r9151, %r9150, %r9145; + shf.l.wrap.b32 %r9152, %r9151, %r9151, 25; + add.s32 %r9153, %r9105, %r8933; + add.s32 %r9154, %r9153, %r9124; + xor.b32 %r9155, %r9154, %r9093; + shf.l.wrap.b32 %r9156, %r9155, %r9155, 16; + add.s32 %r9157, %r9156, %r9136; + xor.b32 %r9158, %r9157, %r9124; + shf.l.wrap.b32 %r9159, %r9158, %r9158, 20; + add.s32 %r9160, %r9154, %r8884; + add.s32 %r9161, %r9160, %r9159; + xor.b32 %r9162, %r9161, %r9156; + shf.l.wrap.b32 %r9163, %r9162, %r9162, 24; + add.s32 %r9164, %r9163, %r9157; + xor.b32 %r9165, %r9164, %r9159; + shf.l.wrap.b32 %r9166, %r9165, %r9165, 25; + add.s32 %r9167, %r9119, %r8912; + add.s32 %r9168, %r9167, %r9138; + xor.b32 %r9169, %r9168, 
%r9107; + shf.l.wrap.b32 %r9170, %r9169, %r9169, 16; + add.s32 %r9171, %r9170, %r9094; + xor.b32 %r9172, %r9171, %r9138; + shf.l.wrap.b32 %r9173, %r9172, %r9172, 20; + add.s32 %r9174, %r9168, %r8947; + add.s32 %r9175, %r9174, %r9173; + xor.b32 %r9176, %r9175, %r9170; + shf.l.wrap.b32 %r9177, %r9176, %r9176, 24; + add.s32 %r9178, %r9177, %r9171; + xor.b32 %r9179, %r9178, %r9173; + shf.l.wrap.b32 %r9180, %r9179, %r9179, 25; + add.s32 %r9181, %r9133, %r8954; + add.s32 %r9182, %r9181, %r9096; + xor.b32 %r9183, %r9182, %r9121; + shf.l.wrap.b32 %r9184, %r9183, %r9183, 16; + add.s32 %r9185, %r9184, %r9108; + xor.b32 %r9186, %r9185, %r9096; + shf.l.wrap.b32 %r9187, %r9186, %r9186, 20; + add.s32 %r9188, %r9182, %r8905; + add.s32 %r9189, %r9188, %r9187; + xor.b32 %r9190, %r9189, %r9184; + shf.l.wrap.b32 %r9191, %r9190, %r9190, 24; + add.s32 %r9192, %r9191, %r9185; + xor.b32 %r9193, %r9192, %r9187; + shf.l.wrap.b32 %r9194, %r9193, %r9193, 25; + add.s32 %r9195, %r9147, %r8870; + add.s32 %r9196, %r9195, %r9194; + xor.b32 %r9197, %r9196, %r9163; + shf.l.wrap.b32 %r9198, %r9197, %r9197, 16; + add.s32 %r9199, %r9198, %r9178; + xor.b32 %r9200, %r9199, %r9194; + shf.l.wrap.b32 %r9201, %r9200, %r9200, 20; + add.s32 %r9202, %r9196, %r8877; + add.s32 %r9203, %r9202, %r9201; + xor.b32 %r9204, %r9203, %r9198; + shf.l.wrap.b32 %r9205, %r9204, %r9204, 24; + add.s32 %r9206, %r9205, %r9199; + xor.b32 %r9207, %r9206, %r9201; + shf.l.wrap.b32 %r9208, %r9207, %r9207, 25; + add.s32 %r9209, %r9161, %r8919; + add.s32 %r9210, %r9209, %r9152; + xor.b32 %r9211, %r9210, %r9177; + shf.l.wrap.b32 %r9212, %r9211, %r9211, 16; + add.s32 %r9213, %r9212, %r9192; + xor.b32 %r9214, %r9213, %r9152; + shf.l.wrap.b32 %r9215, %r9214, %r9214, 20; + add.s32 %r9216, %r9210, %r8933; + add.s32 %r9217, %r9216, %r9215; + xor.b32 %r9218, %r9217, %r9212; + shf.l.wrap.b32 %r9219, %r9218, %r9218, 24; + add.s32 %r9220, %r9219, %r9213; + xor.b32 %r9221, %r9220, %r9215; + shf.l.wrap.b32 %r9222, %r9221, %r9221, 25; + add.s32 %r9223, %r9175, %r8940; + add.s32 %r9224, %r9223, %r9166; + xor.b32 %r9225, %r9224, %r9191; + shf.l.wrap.b32 %r9226, %r9225, %r9225, 16; + add.s32 %r9227, %r9226, %r9150; + xor.b32 %r9228, %r9227, %r9166; + shf.l.wrap.b32 %r9229, %r9228, %r9228, 20; + add.s32 %r9230, %r9224, %r8863; + add.s32 %r9231, %r9230, %r9229; + xor.b32 %r9232, %r9231, %r9226; + shf.l.wrap.b32 %r9233, %r9232, %r9232, 24; + add.s32 %r9234, %r9233, %r9227; + xor.b32 %r9235, %r9234, %r9229; + shf.l.wrap.b32 %r9236, %r9235, %r9235, 25; + add.s32 %r9237, %r9189, %r8898; + add.s32 %r9238, %r9237, %r9180; + xor.b32 %r9239, %r9238, %r9149; + shf.l.wrap.b32 %r9240, %r9239, %r9239, 16; + add.s32 %r9241, %r9240, %r9164; + xor.b32 %r9242, %r9241, %r9180; + shf.l.wrap.b32 %r9243, %r9242, %r9242, 20; + add.s32 %r9244, %r9238, %r8947; + add.s32 %r9245, %r9244, %r9243; + xor.b32 %r9246, %r9245, %r9240; + shf.l.wrap.b32 %r9247, %r9246, %r9246, 24; + add.s32 %r9248, %r9247, %r9241; + xor.b32 %r9249, %r9248, %r9243; + shf.l.wrap.b32 %r9250, %r9249, %r9249, 25; + add.s32 %r9251, %r9203, %r8891; + add.s32 %r9252, %r9251, %r9222; + xor.b32 %r9253, %r9252, %r9247; + shf.l.wrap.b32 %r9254, %r9253, %r9253, 16; + add.s32 %r9255, %r9254, %r9234; + xor.b32 %r9256, %r9255, %r9222; + shf.l.wrap.b32 %r9257, %r9256, %r9256, 20; + add.s32 %r9258, %r9252, %r8884; + add.s32 %r9259, %r9258, %r9257; + xor.b32 %r9260, %r9259, %r9254; + shf.l.wrap.b32 %r9261, %r9260, %r9260, 24; + add.s32 %r9262, %r9261, %r9255; + xor.b32 %r9263, %r9262, %r9257; + shf.l.wrap.b32 %r9264, %r9263, %r9263, 25; + 
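+ // Each unrolled round applies eight such quarter-rounds: four column mixes followed
+ // by four diagonal mixes over the 4x4 state of u32 words.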
add.s32 %r9265, %r9217, %r8912; + add.s32 %r9266, %r9265, %r9236; + xor.b32 %r9267, %r9266, %r9205; + shf.l.wrap.b32 %r9268, %r9267, %r9267, 16; + add.s32 %r9269, %r9268, %r9248; + xor.b32 %r9270, %r9269, %r9236; + shf.l.wrap.b32 %r9271, %r9270, %r9270, 20; + add.s32 %r9272, %r9266, %r8849; + add.s32 %r9273, %r9272, %r9271; + xor.b32 %r9274, %r9273, %r9268; + shf.l.wrap.b32 %r9275, %r9274, %r9274, 24; + add.s32 %r9276, %r9275, %r9269; + xor.b32 %r9277, %r9276, %r9271; + shf.l.wrap.b32 %r9278, %r9277, %r9277, 25; + add.s32 %r9279, %r9231, %r8926; + add.s32 %r9280, %r9279, %r9250; + xor.b32 %r9281, %r9280, %r9219; + shf.l.wrap.b32 %r9282, %r9281, %r9281, 16; + add.s32 %r9283, %r9282, %r9206; + xor.b32 %r9284, %r9283, %r9250; + shf.l.wrap.b32 %r9285, %r9284, %r9284, 20; + add.s32 %r9286, %r9280, %r8954; + add.s32 %r9287, %r9286, %r9285; + xor.b32 %r9288, %r9287, %r9282; + shf.l.wrap.b32 %r9289, %r9288, %r9288, 24; + add.s32 %r9290, %r9289, %r9283; + xor.b32 %r9291, %r9290, %r9285; + shf.l.wrap.b32 %r9292, %r9291, %r9291, 25; + add.s32 %r9293, %r9245, %r8905; + add.s32 %r9294, %r9293, %r9208; + xor.b32 %r9295, %r9294, %r9233; + shf.l.wrap.b32 %r9296, %r9295, %r9295, 16; + add.s32 %r9297, %r9296, %r9220; + xor.b32 %r9298, %r9297, %r9208; + shf.l.wrap.b32 %r9299, %r9298, %r9298, 20; + add.s32 %r9300, %r9294, %r8856; + add.s32 %r9301, %r9300, %r9299; + xor.b32 %r9302, %r9301, %r9296; + shf.l.wrap.b32 %r9303, %r9302, %r9302, 24; + add.s32 %r9304, %r9303, %r9297; + xor.b32 %r9305, %r9304, %r9299; + shf.l.wrap.b32 %r9306, %r9305, %r9305, 25; + add.s32 %r9307, %r9259, %r8919; + add.s32 %r9308, %r9307, %r9306; + xor.b32 %r9309, %r9308, %r9275; + shf.l.wrap.b32 %r9310, %r9309, %r9309, 16; + add.s32 %r9311, %r9310, %r9290; + xor.b32 %r9312, %r9311, %r9306; + shf.l.wrap.b32 %r9313, %r9312, %r9312, 20; + add.s32 %r9314, %r9308, %r8898; + add.s32 %r9315, %r9314, %r9313; + xor.b32 %r9316, %r9315, %r9310; + shf.l.wrap.b32 %r9317, %r9316, %r9316, 24; + add.s32 %r9318, %r9317, %r9311; + xor.b32 %r9319, %r9318, %r9313; + shf.l.wrap.b32 %r9320, %r9319, %r9319, 25; + add.s32 %r9321, %r9273, %r8933; + add.s32 %r9322, %r9321, %r9264; + xor.b32 %r9323, %r9322, %r9289; + shf.l.wrap.b32 %r9324, %r9323, %r9323, 16; + add.s32 %r9325, %r9324, %r9304; + xor.b32 %r9326, %r9325, %r9264; + shf.l.wrap.b32 %r9327, %r9326, %r9326, 20; + add.s32 %r9328, %r9322, %r8912; + add.s32 %r9329, %r9328, %r9327; + xor.b32 %r9330, %r9329, %r9324; + shf.l.wrap.b32 %r9331, %r9330, %r9330, 24; + add.s32 %r9332, %r9331, %r9325; + xor.b32 %r9333, %r9332, %r9327; + shf.l.wrap.b32 %r9334, %r9333, %r9333, 25; + add.s32 %r9335, %r9287, %r8947; + add.s32 %r9336, %r9335, %r9278; + xor.b32 %r9337, %r9336, %r9303; + shf.l.wrap.b32 %r9338, %r9337, %r9337, 16; + add.s32 %r9339, %r9338, %r9262; + xor.b32 %r9340, %r9339, %r9278; + shf.l.wrap.b32 %r9341, %r9340, %r9340, 20; + add.s32 %r9342, %r9336, %r8870; + add.s32 %r9343, %r9342, %r9341; + xor.b32 %r9344, %r9343, %r9338; + shf.l.wrap.b32 %r9345, %r9344, %r9344, 24; + add.s32 %r9346, %r9345, %r9339; + xor.b32 %r9347, %r9346, %r9341; + shf.l.wrap.b32 %r9348, %r9347, %r9347, 25; + add.s32 %r9349, %r9301, %r8940; + add.s32 %r9350, %r9349, %r9292; + xor.b32 %r9351, %r9350, %r9261; + shf.l.wrap.b32 %r9352, %r9351, %r9351, 16; + add.s32 %r9353, %r9352, %r9276; + xor.b32 %r9354, %r9353, %r9292; + shf.l.wrap.b32 %r9355, %r9354, %r9354, 20; + add.s32 %r9356, %r9350, %r8954; + add.s32 %r9357, %r9356, %r9355; + xor.b32 %r9358, %r9357, %r9352; + shf.l.wrap.b32 %r9359, %r9358, %r9358, 24; + add.s32 %r9360, %r9359, 
%r9353; + xor.b32 %r9361, %r9360, %r9355; + shf.l.wrap.b32 %r9362, %r9361, %r9361, 25; + add.s32 %r9363, %r9315, %r8877; + add.s32 %r9364, %r9363, %r9334; + xor.b32 %r9365, %r9364, %r9359; + shf.l.wrap.b32 %r9366, %r9365, %r9365, 16; + add.s32 %r9367, %r9366, %r9346; + xor.b32 %r9368, %r9367, %r9334; + shf.l.wrap.b32 %r9369, %r9368, %r9368, 20; + add.s32 %r9370, %r9364, %r8849; + add.s32 %r9371, %r9370, %r9369; + xor.b32 %r9372, %r9371, %r9366; + shf.l.wrap.b32 %r9373, %r9372, %r9372, 24; + add.s32 %r9374, %r9373, %r9367; + xor.b32 %r9375, %r9374, %r9369; + shf.l.wrap.b32 %r9376, %r9375, %r9375, 25; + add.s32 %r9377, %r9329, %r8926; + add.s32 %r9378, %r9377, %r9348; + xor.b32 %r9379, %r9378, %r9317; + shf.l.wrap.b32 %r9380, %r9379, %r9379, 16; + add.s32 %r9381, %r9380, %r9360; + xor.b32 %r9382, %r9381, %r9348; + shf.l.wrap.b32 %r9383, %r9382, %r9382, 20; + add.s32 %r9384, %r9378, %r8863; + add.s32 %r9385, %r9384, %r9383; + xor.b32 %r9386, %r9385, %r9380; + shf.l.wrap.b32 %r9387, %r9386, %r9386, 24; + add.s32 %r9388, %r9387, %r9381; + xor.b32 %r9389, %r9388, %r9383; + shf.l.wrap.b32 %r9390, %r9389, %r9389, 25; + add.s32 %r9391, %r9343, %r8884; + add.s32 %r9392, %r9391, %r9362; + xor.b32 %r9393, %r9392, %r9331; + shf.l.wrap.b32 %r9394, %r9393, %r9393, 16; + add.s32 %r9395, %r9394, %r9318; + xor.b32 %r9396, %r9395, %r9362; + shf.l.wrap.b32 %r9397, %r9396, %r9396, 20; + add.s32 %r9398, %r9392, %r8905; + add.s32 %r9399, %r9398, %r9397; + xor.b32 %r9400, %r9399, %r9394; + shf.l.wrap.b32 %r9401, %r9400, %r9400, 24; + add.s32 %r9402, %r9401, %r9395; + xor.b32 %r9403, %r9402, %r9397; + shf.l.wrap.b32 %r9404, %r9403, %r9403, 25; + add.s32 %r9405, %r9357, %r8856; + add.s32 %r9406, %r9405, %r9320; + xor.b32 %r9407, %r9406, %r9345; + shf.l.wrap.b32 %r9408, %r9407, %r9407, 16; + add.s32 %r9409, %r9408, %r9332; + xor.b32 %r9410, %r9409, %r9320; + shf.l.wrap.b32 %r9411, %r9410, %r9410, 20; + add.s32 %r9412, %r9406, %r8891; + add.s32 %r9413, %r9412, %r9411; + xor.b32 %r9414, %r9413, %r9408; + shf.l.wrap.b32 %r9415, %r9414, %r9414, 24; + add.s32 %r9416, %r9415, %r9409; + xor.b32 %r9417, %r9416, %r9411; + shf.l.wrap.b32 %r9418, %r9417, %r9417, 25; + add.s32 %r9419, %r9371, %r8933; + add.s32 %r9420, %r9419, %r9418; + xor.b32 %r9421, %r9420, %r9387; + shf.l.wrap.b32 %r9422, %r9421, %r9421, 16; + add.s32 %r9423, %r9422, %r9402; + xor.b32 %r9424, %r9423, %r9418; + shf.l.wrap.b32 %r9425, %r9424, %r9424, 20; + add.s32 %r9426, %r9420, %r8940; + add.s32 %r9427, %r9426, %r9425; + xor.b32 %r9428, %r9427, %r9422; + shf.l.wrap.b32 %r9429, %r9428, %r9428, 24; + add.s32 %r9430, %r9429, %r9423; + xor.b32 %r9431, %r9430, %r9425; + shf.l.wrap.b32 %r9432, %r9431, %r9431, 25; + add.s32 %r9433, %r9385, %r8912; + add.s32 %r9434, %r9433, %r9376; + xor.b32 %r9435, %r9434, %r9401; + shf.l.wrap.b32 %r9436, %r9435, %r9435, 16; + add.s32 %r9437, %r9436, %r9416; + xor.b32 %r9438, %r9437, %r9376; + shf.l.wrap.b32 %r9439, %r9438, %r9438, 20; + add.s32 %r9440, %r9434, %r8926; + add.s32 %r9441, %r9440, %r9439; + xor.b32 %r9442, %r9441, %r9436; + shf.l.wrap.b32 %r9443, %r9442, %r9442, 24; + add.s32 %r9444, %r9443, %r9437; + xor.b32 %r9445, %r9444, %r9439; + shf.l.wrap.b32 %r9446, %r9445, %r9445, 25; + add.s32 %r9447, %r9399, %r8954; + add.s32 %r9448, %r9447, %r9390; + xor.b32 %r9449, %r9448, %r9415; + shf.l.wrap.b32 %r9450, %r9449, %r9449, 16; + add.s32 %r9451, %r9450, %r9374; + xor.b32 %r9452, %r9451, %r9390; + shf.l.wrap.b32 %r9453, %r9452, %r9452, 20; + add.s32 %r9454, %r9448, %r8919; + add.s32 %r9455, %r9454, %r9453; + xor.b32 %r9456, 
%r9455, %r9450; + shf.l.wrap.b32 %r9457, %r9456, %r9456, 24; + add.s32 %r9458, %r9457, %r9451; + xor.b32 %r9459, %r9458, %r9453; + shf.l.wrap.b32 %r9460, %r9459, %r9459, 25; + add.s32 %r9461, %r9413, %r8947; + add.s32 %r9462, %r9461, %r9404; + xor.b32 %r9463, %r9462, %r9373; + shf.l.wrap.b32 %r9464, %r9463, %r9463, 16; + add.s32 %r9465, %r9464, %r9388; + xor.b32 %r9466, %r9465, %r9404; + shf.l.wrap.b32 %r9467, %r9466, %r9466, 20; + add.s32 %r9468, %r9462, %r8905; + add.s32 %r9469, %r9468, %r9467; + xor.b32 %r9470, %r9469, %r9464; + shf.l.wrap.b32 %r9471, %r9470, %r9470, 24; + add.s32 %r9472, %r9471, %r9465; + xor.b32 %r9473, %r9472, %r9467; + shf.l.wrap.b32 %r9474, %r9473, %r9473, 25; + add.s32 %r9475, %r9427, %r8898; + add.s32 %r9476, %r9475, %r9446; + xor.b32 %r9477, %r9476, %r9471; + shf.l.wrap.b32 %r9478, %r9477, %r9477, 16; + add.s32 %r9479, %r9478, %r9458; + xor.b32 %r9480, %r9479, %r9446; + shf.l.wrap.b32 %r9481, %r9480, %r9480, 20; + add.s32 %r9482, %r9476, %r8863; + add.s32 %r9483, %r9482, %r9481; + xor.b32 %r9484, %r9483, %r9478; + shf.l.wrap.b32 %r9485, %r9484, %r9484, 24; + add.s32 %r9486, %r9485, %r9479; + xor.b32 %r9487, %r9486, %r9481; + shf.l.wrap.b32 %r9488, %r9487, %r9487, 25; + add.s32 %r9489, %r9441, %r8884; + add.s32 %r9490, %r9489, %r9460; + xor.b32 %r9491, %r9490, %r9429; + shf.l.wrap.b32 %r9492, %r9491, %r9491, 16; + add.s32 %r9493, %r9492, %r9472; + xor.b32 %r9494, %r9493, %r9460; + shf.l.wrap.b32 %r9495, %r9494, %r9494, 20; + add.s32 %r9496, %r9490, %r8870; + add.s32 %r9497, %r9496, %r9495; + xor.b32 %r9498, %r9497, %r9492; + shf.l.wrap.b32 %r9499, %r9498, %r9498, 24; + add.s32 %r9500, %r9499, %r9493; + xor.b32 %r9501, %r9500, %r9495; + shf.l.wrap.b32 %r9502, %r9501, %r9501, 25; + add.s32 %r9503, %r9455, %r8849; + add.s32 %r9504, %r9503, %r9474; + xor.b32 %r9505, %r9504, %r9443; + shf.l.wrap.b32 %r9506, %r9505, %r9505, 16; + add.s32 %r9507, %r9506, %r9430; + xor.b32 %r9508, %r9507, %r9474; + shf.l.wrap.b32 %r9509, %r9508, %r9508, 20; + add.s32 %r9510, %r9504, %r8856; + add.s32 %r9511, %r9510, %r9509; + xor.b32 %r9512, %r9511, %r9506; + shf.l.wrap.b32 %r9513, %r9512, %r9512, 24; + add.s32 %r9514, %r9513, %r9507; + xor.b32 %r9515, %r9514, %r9509; + shf.l.wrap.b32 %r9516, %r9515, %r9515, 25; + add.s32 %r9517, %r9469, %r8891; + add.s32 %r9518, %r9517, %r9432; + xor.b32 %r9519, %r9518, %r9457; + shf.l.wrap.b32 %r9520, %r9519, %r9519, 16; + add.s32 %r9521, %r9520, %r9444; + xor.b32 %r9522, %r9521, %r9432; + shf.l.wrap.b32 %r9523, %r9522, %r9522, 20; + add.s32 %r9524, %r9518, %r8877; + add.s32 %r9525, %r9524, %r9523; + xor.b32 %r9526, %r9525, %r9520; + shf.l.wrap.b32 %r9527, %r9526, %r9526, 24; + add.s32 %r9528, %r9527, %r9521; + xor.b32 %r9529, %r9528, %r9523; + shf.l.wrap.b32 %r9530, %r9529, %r9529, 25; + add.s32 %r9531, %r9483, %r8912; + add.s32 %r9532, %r9531, %r9530; + xor.b32 %r9533, %r9532, %r9499; + shf.l.wrap.b32 %r9534, %r9533, %r9533, 16; + add.s32 %r9535, %r9534, %r9514; + xor.b32 %r9536, %r9535, %r9530; + shf.l.wrap.b32 %r9537, %r9536, %r9536, 20; + add.s32 %r9538, %r9532, %r8947; + add.s32 %r9539, %r9538, %r9537; + xor.b32 %r9540, %r9539, %r9534; + shf.l.wrap.b32 %r9541, %r9540, %r9540, 24; + add.s32 %r9542, %r9541, %r9535; + xor.b32 %r9543, %r9542, %r9537; + shf.l.wrap.b32 %r9544, %r9543, %r9543, 25; + add.s32 %r9545, %r9497, %r8926; + add.s32 %r9546, %r9545, %r9488; + xor.b32 %r9547, %r9546, %r9513; + shf.l.wrap.b32 %r9548, %r9547, %r9547, 16; + add.s32 %r9549, %r9548, %r9528; + xor.b32 %r9550, %r9549, %r9488; + shf.l.wrap.b32 %r9551, %r9550, %r9550, 20; 
+ add.s32 %r9552, %r9546, %r8884; + add.s32 %r9553, %r9552, %r9551; + xor.b32 %r9554, %r9553, %r9548; + shf.l.wrap.b32 %r9555, %r9554, %r9554, 24; + add.s32 %r9556, %r9555, %r9549; + xor.b32 %r9557, %r9556, %r9551; + shf.l.wrap.b32 %r9558, %r9557, %r9557, 25; + add.s32 %r9559, %r9511, %r8905; + add.s32 %r9560, %r9559, %r9502; + xor.b32 %r9561, %r9560, %r9527; + shf.l.wrap.b32 %r9562, %r9561, %r9561, 16; + add.s32 %r9563, %r9562, %r9486; + xor.b32 %r9564, %r9563, %r9502; + shf.l.wrap.b32 %r9565, %r9564, %r9564, 20; + add.s32 %r9566, %r9560, %r8933; + add.s32 %r9567, %r9566, %r9565; + xor.b32 %r9568, %r9567, %r9562; + shf.l.wrap.b32 %r9569, %r9568, %r9568, 24; + add.s32 %r9570, %r9569, %r9563; + xor.b32 %r9571, %r9570, %r9565; + shf.l.wrap.b32 %r9572, %r9571, %r9571, 25; + add.s32 %r9573, %r9525, %r8954; + add.s32 %r9574, %r9573, %r9516; + xor.b32 %r9575, %r9574, %r9485; + shf.l.wrap.b32 %r9576, %r9575, %r9575, 16; + add.s32 %r9577, %r9576, %r9500; + xor.b32 %r9578, %r9577, %r9516; + shf.l.wrap.b32 %r9579, %r9578, %r9578, 20; + add.s32 %r9580, %r9574, %r8856; + add.s32 %r9581, %r9580, %r9579; + xor.b32 %r9582, %r9581, %r9576; + shf.l.wrap.b32 %r9583, %r9582, %r9582, 24; + add.s32 %r9584, %r9583, %r9577; + xor.b32 %r9585, %r9584, %r9579; + shf.l.wrap.b32 %r9586, %r9585, %r9585, 25; + add.s32 %r9587, %r9539, %r8940; + add.s32 %r9588, %r9587, %r9558; + xor.b32 %r9589, %r9588, %r9583; + shf.l.wrap.b32 %r9590, %r9589, %r9589, 16; + add.s32 %r9591, %r9590, %r9570; + xor.b32 %r9592, %r9591, %r9558; + shf.l.wrap.b32 %r9593, %r9592, %r9592, 20; + add.s32 %r9594, %r9588, %r8870; + add.s32 %r9595, %r9594, %r9593; + xor.b32 %r9596, %r9595, %r9590; + shf.l.wrap.b32 %r9597, %r9596, %r9596, 24; + add.s32 %r9598, %r9597, %r9591; + xor.b32 %r9599, %r9598, %r9593; + shf.l.wrap.b32 %r9600, %r9599, %r9599, 25; + add.s32 %r9601, %r9553, %r8849; + add.s32 %r9602, %r9601, %r9572; + xor.b32 %r9603, %r9602, %r9541; + shf.l.wrap.b32 %r9604, %r9603, %r9603, 16; + add.s32 %r9605, %r9604, %r9584; + xor.b32 %r9606, %r9605, %r9572; + shf.l.wrap.b32 %r9607, %r9606, %r9606, 20; + add.s32 %r9608, %r9602, %r8919; + add.s32 %r9609, %r9608, %r9607; + xor.b32 %r9610, %r9609, %r9604; + shf.l.wrap.b32 %r9611, %r9610, %r9610, 24; + add.s32 %r9612, %r9611, %r9605; + xor.b32 %r9613, %r9612, %r9607; + shf.l.wrap.b32 %r9614, %r9613, %r9613, 25; + add.s32 %r9615, %r9567, %r8863; + add.s32 %r9616, %r9615, %r9586; + xor.b32 %r9617, %r9616, %r9555; + shf.l.wrap.b32 %r9618, %r9617, %r9617, 16; + add.s32 %r9619, %r9618, %r9542; + xor.b32 %r9620, %r9619, %r9586; + shf.l.wrap.b32 %r9621, %r9620, %r9620, 20; + add.s32 %r9622, %r9616, %r8891; + add.s32 %r9623, %r9622, %r9621; + xor.b32 %r9624, %r9623, %r9618; + shf.l.wrap.b32 %r9625, %r9624, %r9624, 24; + add.s32 %r9626, %r9625, %r9619; + xor.b32 %r9627, %r9626, %r9621; + shf.l.wrap.b32 %r9628, %r9627, %r9627, 25; + add.s32 %r9629, %r9581, %r8877; + add.s32 %r9630, %r9629, %r9544; + xor.b32 %r9631, %r9630, %r9569; + shf.l.wrap.b32 %r9632, %r9631, %r9631, 16; + add.s32 %r9633, %r9632, %r9556; + xor.b32 %r9634, %r9633, %r9544; + shf.l.wrap.b32 %r9635, %r9634, %r9634, 20; + add.s32 %r9636, %r9630, %r8898; + add.s32 %r9637, %r9636, %r9635; + xor.b32 %r9638, %r9637, %r9632; + shf.l.wrap.b32 %r9639, %r9638, %r9638, 24; + add.s32 %r9640, %r9639, %r9633; + xor.b32 %r9641, %r9640, %r9635; + shf.l.wrap.b32 %r9642, %r9641, %r9641, 25; + add.s32 %r9643, %r9595, %r8926; + add.s32 %r9644, %r9643, %r9642; + xor.b32 %r9645, %r9644, %r9611; + shf.l.wrap.b32 %r9646, %r9645, %r9645, 16; + add.s32 %r9647, %r9646, 
%r9626; + xor.b32 %r9648, %r9647, %r9642; + shf.l.wrap.b32 %r9649, %r9648, %r9648, 20; + add.s32 %r9650, %r9644, %r8954; + add.s32 %r9651, %r9650, %r9649; + xor.b32 %r9652, %r9651, %r9646; + shf.l.wrap.b32 %r9653, %r9652, %r9652, 24; + add.s32 %r9654, %r9653, %r9647; + xor.b32 %r9655, %r9654, %r9649; + shf.l.wrap.b32 %r9656, %r9655, %r9655, 25; + add.s32 %r9657, %r9609, %r8884; + add.s32 %r9658, %r9657, %r9600; + xor.b32 %r9659, %r9658, %r9625; + shf.l.wrap.b32 %r9660, %r9659, %r9659, 16; + add.s32 %r9661, %r9660, %r9640; + xor.b32 %r9662, %r9661, %r9600; + shf.l.wrap.b32 %r9663, %r9662, %r9662, 20; + add.s32 %r9664, %r9658, %r8849; + add.s32 %r9665, %r9664, %r9663; + xor.b32 %r9666, %r9665, %r9660; + shf.l.wrap.b32 %r9667, %r9666, %r9666, 24; + add.s32 %r9668, %r9667, %r9661; + xor.b32 %r9669, %r9668, %r9663; + shf.l.wrap.b32 %r9670, %r9669, %r9669, 25; + add.s32 %r9671, %r9623, %r8856; + add.s32 %r9672, %r9671, %r9614; + xor.b32 %r9673, %r9672, %r9639; + shf.l.wrap.b32 %r9674, %r9673, %r9673, 16; + add.s32 %r9675, %r9674, %r9598; + xor.b32 %r9676, %r9675, %r9614; + shf.l.wrap.b32 %r9677, %r9676, %r9676, 20; + add.s32 %r9678, %r9672, %r8912; + add.s32 %r9679, %r9678, %r9677; + xor.b32 %r9680, %r9679, %r9674; + shf.l.wrap.b32 %r9681, %r9680, %r9680, 24; + add.s32 %r9682, %r9681, %r9675; + xor.b32 %r9683, %r9682, %r9677; + shf.l.wrap.b32 %r9684, %r9683, %r9683, 25; + add.s32 %r9685, %r9637, %r8905; + add.s32 %r9686, %r9685, %r9628; + xor.b32 %r9687, %r9686, %r9597; + shf.l.wrap.b32 %r9688, %r9687, %r9687, 16; + add.s32 %r9689, %r9688, %r9612; + xor.b32 %r9690, %r9689, %r9628; + shf.l.wrap.b32 %r9691, %r9690, %r9690, 20; + add.s32 %r9692, %r9686, %r8891; + add.s32 %r9693, %r9692, %r9691; + xor.b32 %r9694, %r9693, %r9688; + shf.l.wrap.b32 %r9695, %r9694, %r9694, 24; + add.s32 %r9696, %r9695, %r9689; + xor.b32 %r9697, %r9696, %r9691; + shf.l.wrap.b32 %r9698, %r9697, %r9697, 25; + add.s32 %r9699, %r9651, %r8947; + add.s32 %r9700, %r9699, %r9670; + xor.b32 %r9701, %r9700, %r9695; + shf.l.wrap.b32 %r9702, %r9701, %r9701, 16; + add.s32 %r9703, %r9702, %r9682; + xor.b32 %r9704, %r9703, %r9670; + shf.l.wrap.b32 %r9705, %r9704, %r9704, 20; + add.s32 %r9706, %r9700, %r8919; + add.s32 %r9707, %r9706, %r9705; + xor.b32 %r9708, %r9707, %r9702; + shf.l.wrap.b32 %r9709, %r9708, %r9708, 24; + add.s32 %r9710, %r9709, %r9703; + xor.b32 %r9711, %r9710, %r9705; + shf.l.wrap.b32 %r9712, %r9711, %r9711, 25; + add.s32 %r9713, %r9665, %r8863; + add.s32 %r9714, %r9713, %r9684; + xor.b32 %r9715, %r9714, %r9653; + shf.l.wrap.b32 %r9716, %r9715, %r9715, 16; + add.s32 %r9717, %r9716, %r9696; + xor.b32 %r9718, %r9717, %r9684; + shf.l.wrap.b32 %r9719, %r9718, %r9718, 20; + add.s32 %r9720, %r9714, %r8933; + add.s32 %r9721, %r9720, %r9719; + xor.b32 %r9722, %r9721, %r9716; + shf.l.wrap.b32 %r9723, %r9722, %r9722, 24; + add.s32 %r9724, %r9723, %r9717; + xor.b32 %r9725, %r9724, %r9719; + shf.l.wrap.b32 %r9726, %r9725, %r9725, 25; + add.s32 %r9727, %r9679, %r8870; + add.s32 %r9728, %r9727, %r9698; + xor.b32 %r9729, %r9728, %r9667; + shf.l.wrap.b32 %r9730, %r9729, %r9729, 16; + add.s32 %r9731, %r9730, %r9654; + xor.b32 %r9732, %r9731, %r9698; + shf.l.wrap.b32 %r9733, %r9732, %r9732, 20; + add.s32 %r9734, %r9728, %r8877; + add.s32 %r9735, %r9734, %r9733; + xor.b32 %r9736, %r9735, %r9730; + shf.l.wrap.b32 %r9737, %r9736, %r9736, 24; + add.s32 %r9738, %r9737, %r9731; + xor.b32 %r9739, %r9738, %r9733; + shf.l.wrap.b32 %r9740, %r9739, %r9739, 25; + add.s32 %r9741, %r9693, %r8898; + add.s32 %r9742, %r9741, %r9656; + xor.b32 %r9743, 
%r9742, %r9681; + shf.l.wrap.b32 %r9744, %r9743, %r9743, 16; + add.s32 %r9745, %r9744, %r9668; + xor.b32 %r9746, %r9745, %r9656; + shf.l.wrap.b32 %r9747, %r9746, %r9746, 20; + add.s32 %r9748, %r9742, %r8940; + add.s32 %r9749, %r9748, %r9747; + xor.b32 %r9750, %r9749, %r9744; + shf.l.wrap.b32 %r9751, %r9750, %r9750, 24; + add.s32 %r9752, %r9751, %r9745; + xor.b32 %r9753, %r9752, %r9747; + shf.l.wrap.b32 %r9754, %r9753, %r9753, 25; + xor.b32 %r9755, %r9738, %r9707; + st.local.u32 [%rd3+-104], %r9755; + xor.b32 %r9756, %r9752, %r9721; + st.local.u32 [%rd3+-100], %r9756; + xor.b32 %r9757, %r9710, %r9735; + st.local.u32 [%rd3+-96], %r9757; + xor.b32 %r9758, %r9724, %r9749; + st.local.u32 [%rd3+-92], %r9758; + xor.b32 %r9759, %r9754, %r9723; + st.local.u32 [%rd3+-88], %r9759; + xor.b32 %r9760, %r9712, %r9737; + st.local.u32 [%rd3+-84], %r9760; + xor.b32 %r9761, %r9726, %r9751; + st.local.u32 [%rd3+-80], %r9761; + xor.b32 %r9762, %r9740, %r9709; + st.local.u32 [%rd3+-76], %r9762; + add.s16 %rs340, %rs335, 1; + st.local.v2.u8 [%rd3], {%rs390, %rs340}; + +$L__BB1_56: + add.s64 %rd208, %rd78, %rd264; + st.local.u8 [%rd208], %rs390; + add.s64 %rd264, %rd264, 1; + setp.lt.u64 %p46, %rd264, 64; + mov.u64 %rd271, %rd77; + @%p46 bra $L__BB1_56; + +$L__BB1_57: + setp.gt.u64 %p47, %rd271, 64; + @%p47 bra $L__BB1_59; + bra.uni $L__BB1_58; + +$L__BB1_59: + ld.local.u8 %rs95, [%rd3+2]; + ld.local.u8 %rs391, [%rd3+1]; + ld.local.u32 %r11689, [%rd3+-104]; + ld.local.u32 %r11688, [%rd3+-100]; + ld.local.u32 %r11687, [%rd3+-96]; + ld.local.u32 %r11686, [%rd3+-92]; + ld.local.u32 %r11685, [%rd3+-88]; + ld.local.u32 %r11684, [%rd3+-84]; + ld.local.u32 %r11683, [%rd3+-80]; + ld.local.u32 %r11682, [%rd3+-76]; + ld.local.u64 %rd269, [%rd3+-72]; + cvt.u32.u64 %r117, %rd269; + shr.u64 %rd209, %rd269, 32; + cvt.u32.u64 %r118, %rd209; + +$L__BB1_60: + and.b16 %rs342, %rs391, 255; + setp.eq.s16 %p48, %rs342, 0; + selp.u16 %rs343, 1, 0, %p48; + or.b16 %rs344, %rs95, %rs343; + ld.local.u8 %r9763, [%rd261]; + ld.local.u8 %r9764, [%rd261+1]; + prmt.b32 %r9765, %r9764, %r9763, 30212; + ld.local.u8 %r9766, [%rd261+2]; + prmt.b32 %r9767, %r9766, %r9765, 28756; + ld.local.u8 %r9768, [%rd261+3]; + prmt.b32 %r9769, %r9768, %r9767, 1620; + ld.local.u8 %r9770, [%rd261+4]; + ld.local.u8 %r9771, [%rd261+5]; + prmt.b32 %r9772, %r9771, %r9770, 30212; + ld.local.u8 %r9773, [%rd261+6]; + prmt.b32 %r9774, %r9773, %r9772, 28756; + ld.local.u8 %r9775, [%rd261+7]; + prmt.b32 %r9776, %r9775, %r9774, 1620; + ld.local.u8 %r9777, [%rd261+8]; + ld.local.u8 %r9778, [%rd261+9]; + prmt.b32 %r9779, %r9778, %r9777, 30212; + ld.local.u8 %r9780, [%rd261+10]; + prmt.b32 %r9781, %r9780, %r9779, 28756; + ld.local.u8 %r9782, [%rd261+11]; + prmt.b32 %r9783, %r9782, %r9781, 1620; + ld.local.u8 %r9784, [%rd261+12]; + ld.local.u8 %r9785, [%rd261+13]; + prmt.b32 %r9786, %r9785, %r9784, 30212; + ld.local.u8 %r9787, [%rd261+14]; + prmt.b32 %r9788, %r9787, %r9786, 28756; + ld.local.u8 %r9789, [%rd261+15]; + prmt.b32 %r9790, %r9789, %r9788, 1620; + ld.local.u8 %r9791, [%rd261+16]; + ld.local.u8 %r9792, [%rd261+17]; + prmt.b32 %r9793, %r9792, %r9791, 30212; + ld.local.u8 %r9794, [%rd261+18]; + prmt.b32 %r9795, %r9794, %r9793, 28756; + ld.local.u8 %r9796, [%rd261+19]; + prmt.b32 %r9797, %r9796, %r9795, 1620; + ld.local.u8 %r9798, [%rd261+20]; + ld.local.u8 %r9799, [%rd261+21]; + prmt.b32 %r9800, %r9799, %r9798, 30212; + ld.local.u8 %r9801, [%rd261+22]; + prmt.b32 %r9802, %r9801, %r9800, 28756; + ld.local.u8 %r9803, [%rd261+23]; + prmt.b32 %r9804, %r9803, %r9802, 1620; + 
[Roughly 1,500 added `+` lines of nvcc-generated PTX, flattened by extraction, elided from this hunk. They are the body of the hashing kernel's per-block loop: each 64-byte message block is read byte-by-byte from local memory (`ld.local.u8` quadruples folded into words by `prmt.b32` chains), mixed through seven rounds of an add/xor/rotate pattern (`add.s32` / `xor.b32` / `shf.l.wrap.b32` by 16, 20, 24 and 25, i.e. right-rotations by 16, 12, 8 and 7) seeded with the constants 1779033703, -1150833019, 1013904242 and -1521486534 (0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A), after which the two halves of the 16-word working state are XORed and stored back to the chaining value at `[%rd3-104..-76]`. The block-counter byte `%rs391` is incremented, the input pointer `%rd261` advances by 64, the remaining length `%rd271` drops by 64, and the loop repeats via `$L__BB1_60`/`$L__BB1_61` while more than 64 bytes remain. A tail path (`$L__BB1_63`/`$L__BB1_64`) byte-copies the final partial block into the buffer and bumps the buffered-length byte at `[%rd3]`; finally `popc.b64` of the 64-bit counter loaded from `[%rd3-72]` is compared against the stack-depth byte at `[%rd3+8]` (`setp.ge.u64 %p52`, branch to `$L__BB1_68`) to decide whether buffered chaining values must be merged.]
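For review purposes it helps to decode the elided rounds once: PTX has no rotate-right instruction, so nvcc emits `shf.l.wrap.b32` left-rotations by 16, 20, 24 and 25, which are right-rotations by 16, 12, 8 and 7, and together with the four IV constants above this is the BLAKE2s/BLAKE3-style quarter-round. A minimal Rust sketch of that mixing step, written here purely as an annotation (the name `g` and the scalar formulation are illustrative, not code from this PR):

    // One quarter-round over the 16-word working state `v`, mixing two
    // message words `mx`/`my` into lanes (a, b, c, d). The rotation
    // amounts 16/12/8/7 correspond to the shf.l.wrap.b32 25/24/20/16
    // pattern visible throughout the generated PTX.
    fn g(v: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) {
        v[a] = v[a].wrapping_add(v[b]).wrapping_add(mx);
        v[d] = (v[d] ^ v[a]).rotate_right(16);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] = (v[b] ^ v[c]).rotate_right(12);
        v[a] = v[a].wrapping_add(v[b]).wrapping_add(my);
        v[d] = (v[d] ^ v[a]).rotate_right(8);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] = (v[b] ^ v[c]).rotate_right(7);
    }

Each round in the assembly is eight such calls (four "column" then four "diagonal" mixes), which is why the register pressure climbs by roughly 112 virtual registers per round.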
[A further ~1,500 added `+` lines of the same generated PTX elided. This is the merge path taken when the popcount test fails: the flag byte at `[%rd3+2]` is OR'd with 4 (`%r135`), the current 8-word chaining value is reassembled from the bytes at `[%rd3-136..-105]` via the same `prmt.b32` little-endian loads, and the pairwise sums `%r137`, `%r139`, `%r141`, `%r143` pre-compute the first column additions. The `$L__BB1_66` loop then indexes the stack of buffered values at 32 bytes per entry (`mul.wide.u16 %rs349, 32`), loads a 64-byte parent block, two 32-byte entries, from `[%rd220+145..208]`, and runs it through the identical seven-round compression against the same four IV constants. The hunk ends mid-round; the instruction pattern continues unchanged.]
shf.l.wrap.b32 %r11541, %r11540, %r11540, 24; + add.s32 %r11542, %r11541, %r11535; + xor.b32 %r11543, %r11542, %r11537; + shf.l.wrap.b32 %r11544, %r11543, %r11543, 25; + add.s32 %r11545, %r11488, %r10783; + add.s32 %r11546, %r11545, %r11497; + xor.b32 %r11547, %r11546, %r11457; + shf.l.wrap.b32 %r11548, %r11547, %r11547, 16; + add.s32 %r11549, %r11548, %r11472; + xor.b32 %r11550, %r11549, %r11488; + shf.l.wrap.b32 %r11551, %r11550, %r11550, 20; + add.s32 %r11552, %r11546, %r10769; + add.s32 %r11553, %r11552, %r11551; + xor.b32 %r11554, %r11553, %r11548; + shf.l.wrap.b32 %r11555, %r11554, %r11554, 24; + add.s32 %r11556, %r11555, %r11549; + xor.b32 %r11557, %r11556, %r11551; + shf.l.wrap.b32 %r11558, %r11557, %r11557, 25; + add.s32 %r11559, %r11530, %r10825; + add.s32 %r11560, %r11559, %r11511; + xor.b32 %r11561, %r11560, %r11555; + shf.l.wrap.b32 %r11562, %r11561, %r11561, 16; + add.s32 %r11563, %r11562, %r11542; + xor.b32 %r11564, %r11563, %r11530; + shf.l.wrap.b32 %r11565, %r11564, %r11564, 20; + add.s32 %r11566, %r11560, %r10797; + add.s32 %r11567, %r11566, %r11565; + xor.b32 %r11568, %r11567, %r11562; + shf.l.wrap.b32 %r11569, %r11568, %r11568, 24; + add.s32 %r11570, %r11569, %r11563; + xor.b32 %r11571, %r11570, %r11565; + shf.l.wrap.b32 %r11572, %r11571, %r11571, 25; + add.s32 %r11573, %r11525, %r10741; + add.s32 %r11574, %r11573, %r11544; + xor.b32 %r11575, %r11513, %r11574; + shf.l.wrap.b32 %r11576, %r11575, %r11575, 16; + add.s32 %r11577, %r11576, %r11556; + xor.b32 %r11578, %r11577, %r11544; + shf.l.wrap.b32 %r11579, %r11578, %r11578, 20; + add.s32 %r11580, %r11574, %r10811; + add.s32 %r11581, %r11580, %r11579; + xor.b32 %r11582, %r11581, %r11576; + shf.l.wrap.b32 %r11583, %r11582, %r11582, 24; + add.s32 %r11584, %r11583, %r11577; + xor.b32 %r11585, %r11584, %r11579; + shf.l.wrap.b32 %r11586, %r11585, %r11585, 25; + add.s32 %r11587, %r11539, %r10748; + add.s32 %r11588, %r11587, %r11558; + xor.b32 %r11589, %r11588, %r11527; + shf.l.wrap.b32 %r11590, %r11589, %r11589, 16; + add.s32 %r11591, %r11590, %r11514; + xor.b32 %r11592, %r11591, %r11558; + shf.l.wrap.b32 %r11593, %r11592, %r11592, 20; + add.s32 %r11594, %r11588, %r10755; + add.s32 %r11595, %r11594, %r11593; + xor.b32 %r11596, %r11595, %r11590; + shf.l.wrap.b32 %r11597, %r11596, %r11596, 24; + add.s32 %r11598, %r11597, %r11591; + xor.b32 %r11599, %r11598, %r11593; + shf.l.wrap.b32 %r11600, %r11599, %r11599, 25; + add.s32 %r11601, %r11553, %r10776; + add.s32 %r11602, %r11601, %r11516; + xor.b32 %r11603, %r11602, %r11541; + shf.l.wrap.b32 %r11604, %r11603, %r11603, 16; + add.s32 %r11605, %r11604, %r11528; + xor.b32 %r11606, %r11605, %r11516; + shf.l.wrap.b32 %r11607, %r11606, %r11606, 20; + add.s32 %r11608, %r11602, %r10818; + add.s32 %r11609, %r11608, %r11607; + xor.b32 %r11610, %r11609, %r11604; + shf.l.wrap.b32 %r11611, %r11610, %r11610, 24; + add.s32 %r11612, %r11611, %r11605; + xor.b32 %r11613, %r11612, %r11607; + shf.l.wrap.b32 %r11614, %r11613, %r11613, 25; + xor.b32 %r11615, %r11598, %r11567; + xor.b32 %r11616, %r11612, %r11581; + xor.b32 %r11617, %r11570, %r11595; + xor.b32 %r11618, %r11609, %r11584; + xor.b32 %r11619, %r11614, %r11583; + xor.b32 %r11620, %r11572, %r11597; + xor.b32 %r11621, %r11611, %r11586; + xor.b32 %r11622, %r11600, %r11569; + st.local.u8 [%rd220+145], %r11615; + shr.u32 %r11623, %r11615, 8; + st.local.u8 [%rd220+146], %r11623; + shr.u32 %r11624, %r11615, 16; + st.local.u8 [%rd220+147], %r11624; + shr.u32 %r11625, %r11615, 24; + st.local.u8 [%rd220+148], %r11625; + st.local.u8 [%rd220+149], %r11616; 
+ shr.u32 %r11626, %r11616, 8; + st.local.u8 [%rd220+150], %r11626; + shr.u32 %r11627, %r11616, 16; + st.local.u8 [%rd220+151], %r11627; + shr.u32 %r11628, %r11616, 24; + st.local.u8 [%rd220+152], %r11628; + st.local.u8 [%rd220+153], %r11617; + shr.u32 %r11629, %r11617, 8; + st.local.u8 [%rd220+154], %r11629; + shr.u32 %r11630, %r11617, 16; + st.local.u8 [%rd220+155], %r11630; + shr.u32 %r11631, %r11617, 24; + st.local.u8 [%rd220+156], %r11631; + st.local.u8 [%rd220+157], %r11618; + shr.u32 %r11632, %r11618, 8; + st.local.u8 [%rd220+158], %r11632; + shr.u32 %r11633, %r11618, 16; + st.local.u8 [%rd220+159], %r11633; + shr.u32 %r11634, %r11618, 24; + st.local.u8 [%rd220+160], %r11634; + st.local.u8 [%rd220+161], %r11619; + shr.u32 %r11635, %r11619, 8; + st.local.u8 [%rd220+162], %r11635; + shr.u32 %r11636, %r11619, 16; + st.local.u8 [%rd220+163], %r11636; + shr.u32 %r11637, %r11619, 24; + st.local.u8 [%rd220+164], %r11637; + st.local.u8 [%rd220+165], %r11620; + shr.u32 %r11638, %r11620, 8; + st.local.u8 [%rd220+166], %r11638; + shr.u32 %r11639, %r11620, 16; + st.local.u8 [%rd220+167], %r11639; + shr.u32 %r11640, %r11620, 24; + st.local.u8 [%rd220+168], %r11640; + st.local.u8 [%rd220+169], %r11621; + shr.u32 %r11641, %r11621, 8; + st.local.u8 [%rd220+170], %r11641; + shr.u32 %r11642, %r11621, 16; + st.local.u8 [%rd220+171], %r11642; + shr.u32 %r11643, %r11621, 24; + st.local.u8 [%rd220+172], %r11643; + st.local.u8 [%rd220+173], %r11622; + shr.u32 %r11644, %r11622, 8; + st.local.u8 [%rd220+174], %r11644; + shr.u32 %r11645, %r11622, 16; + st.local.u8 [%rd220+175], %r11645; + shr.u32 %r11646, %r11622, 24; + st.local.u8 [%rd220+176], %r11646; + add.s16 %rs392, %rs392, -1; + cvt.u64.u16 %rd221, %rs392; + and.b64 %rd222, %rd221, 255; + setp.lt.u64 %p53, %rd97, %rd222; + @%p53 bra $L__BB1_66; + + ld.param.u64 %rd233, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + cvta.to.local.u64 %rd232, %rd233; + add.s64 %rd231, %rd232, 136; + st.local.u8 [%rd231+8], %rs392; + +$L__BB1_68: + ret; + +} + // .globl heavy_hash +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5, + .param .u64 heavy_hash_param_6, + .param .u64 heavy_hash_param_7 +) +{ + .local .align 16 .b8 __local_depot2[2080]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<60>; + .reg .b16 %rs<864>; + .reg .b32 %r<31266>; + .reg .b64 %rd<1373>; + + + mov.u64 %SPL, __local_depot2; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs409, [heavy_hash_param_3]; + ld.param.u64 %rd357, [heavy_hash_param_0]; + ld.param.u64 %rd358, [heavy_hash_param_1]; + ld.param.u64 %rd362, [heavy_hash_param_2]; + ld.param.u64 %rd363, [heavy_hash_param_4]; + ld.param.u64 %rd359, [heavy_hash_param_5]; + ld.param.u64 %rd360, [heavy_hash_param_6]; + ld.param.u64 %rd361, [heavy_hash_param_7]; + cvta.to.global.u64 %rd1, %rd363; + add.u64 %rd2, %SPL, 0; + add.u64 %rd3, %SPL, 2000; + mov.u32 %r5040, %ntid.x; + mov.u32 %r5041, %ctaid.x; + mov.u32 %r5042, %tid.x; + mad.lo.s32 %r5043, %r5041, %r5040, %r5042; + cvt.s64.s32 %rd4, %r5043; + setp.ge.u64 %p6, %rd4, %rd362; + @%p6 bra $L__BB2_105; + + cvt.u32.u64 %r5044, %rd4; + setp.ne.s32 %p7, %r5044, 0; + @%p7 bra $L__BB2_3; + + cvta.to.global.u64 %rd366, %rd359; + mov.u64 %rd367, 0; + st.global.u64 [%rd366], %rd367; + +$L__BB2_3: + setp.eq.s16 %p8, %rs409, 0; + @%p8 bra $L__BB2_5; + + shl.b64 %rd368, %rd4, 5; + add.s64 %rd369, %rd1, %rd368; + 
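+ // Prologue of heavy_hash above: each thread derives its global index via
+ // mad.lo.s32 (blockIdx.x * blockDim.x + threadIdx.x) and returns once it is
+ // >= param_2; thread 0 zeroes the 64-bit slot at param_5 (plausibly the
+ // "found nonce" cell). When the param_3 byte is set, the thread steps a
+ // per-thread 4x64-bit state at param_4 + tid*32 -- the mul-by-5 / rotl-7 /
+ // mul-by-9 output and shl-17 state mix below match xoshiro256** -- else it
+ // xors a global seed with tid. The result is masked with param_0 and or'ed
+ // with param_1 to form the candidate nonce. Hedged CUDA sketch (parameter
+ // and helper names are assumptions, not taken from this file):
+ //
+ //   __global__ void heavy_hash(uint64_t nonce_mask, uint64_t nonce_fixed,
+ //                              uint64_t nonce_count, uint8_t use_prng,
+ //                              uint64_t *states, uint64_t *final_nonce,
+ //                              const uint64_t *seed) {
+ //       uint64_t tid = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
+ //       if (tid >= nonce_count) return;
+ //       if (tid == 0) *final_nonce = 0;
+ //       uint64_t raw = use_prng ? xoshiro256ss_next(states + tid * 4)
+ //                               : (*seed ^ tid);
+ //       uint64_t nonce = (raw & nonce_mask) | nonce_fixed;
+ //       // ... hash the 72-byte header+nonce below ...
+ //   }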
ld.global.v2.u64 {%rd370, %rd371}, [%rd369]; + mul.lo.s64 %rd374, %rd371, 5; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd374, 7; + shr.b64 %rhs, %rd374, 57; + add.u64 %rd375, %lhs, %rhs; + } + mul.lo.s64 %rd1299, %rd375, 9; + shl.b64 %rd376, %rd371, 17; + ld.global.v2.u64 {%rd377, %rd378}, [%rd369+16]; + xor.b64 %rd381, %rd377, %rd370; + xor.b64 %rd382, %rd378, %rd371; + xor.b64 %rd383, %rd371, %rd381; + xor.b64 %rd384, %rd370, %rd382; + st.global.v2.u64 [%rd369], {%rd384, %rd383}; + { + .reg .b32 %dummy; + mov.b64 {%r5045,%dummy}, %rd382; + } + { + .reg .b32 %dummy; + mov.b64 {%dummy,%r5046}, %rd382; + } + shf.r.wrap.b32 %r5047, %r5046, %r5045, 19; + shf.r.wrap.b32 %r5048, %r5045, %r5046, 19; + mov.b64 %rd385, {%r5048, %r5047}; + xor.b64 %rd386, %rd381, %rd376; + st.global.v2.u64 [%rd369+16], {%rd386, %rd385}; + bra.uni $L__BB2_6; + +$L__BB2_5: + ld.global.u64 %rd387, [%rd1]; + xor.b64 %rd1299, %rd387, %rd4; + +$L__BB2_6: + and.b64 %rd389, %rd1299, %rd357; + or.b64 %rd8, %rd389, %rd358; + mov.u64 %rd1300, 0; + mov.u32 %r29818, 0; + mov.u64 %rd390, hash_header; + +$L__BB2_7: + add.s64 %rd391, %rd390, %rd1300; + ld.const.u8 %rs410, [%rd391]; + add.s64 %rd392, %rd3, %rd1300; + st.local.u8 [%rd392], %rs410; + add.s64 %rd1300, %rd1300, 1; + add.s32 %r29818, %r29818, 1; + setp.lt.u32 %p9, %r29818, 72; + @%p9 bra $L__BB2_7; + + ld.local.v4.u32 {%r5050, %r5051, %r5052, %r5053}, [%rd3]; + mov.u64 %rd393, 0; + ld.local.v4.u32 {%r5054, %r5055, %r5056, %r5057}, [%rd3+16]; + ld.local.v4.u32 {%r5058, %r5059, %r5060, %r5061}, [%rd3+32]; + ld.local.v4.u32 {%r5062, %r5063, %r5064, %r5065}, [%rd3+48]; + st.local.u64 [%rd3+72], %rd8; + mov.u32 %r5066, -1150833019; + mov.u32 %r5067, 1779033703; + st.local.v2.u32 [%rd2], {%r5067, %r5066}; + mov.u32 %r5068, -1521486534; + mov.u32 %r5069, 1013904242; + st.local.v2.u32 [%rd2+8], {%r5069, %r5068}; + mov.u32 %r5070, -1694144372; + mov.u32 %r5071, 1359893119; + st.local.v2.u32 [%rd2+16], {%r5071, %r5070}; + mov.u32 %r5072, 1541459225; + mov.u32 %r5073, 528734635; + st.local.v2.u32 [%rd2+24], {%r5073, %r5072}; + st.local.u64 [%rd2+64], %rd393; + mov.u32 %r5074, 0; + st.local.v2.u32 [%rd2+72], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+80], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+88], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+96], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+104], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+112], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+120], {%r5074, %r5074}; + st.local.v2.u32 [%rd2+128], {%r5074, %r5074}; + mov.u16 %rs411, 0; + st.local.v2.u8 [%rd2+136], {%rs411, %rs411}; + st.local.u8 [%rd2+138], %rs411; + st.local.v2.u32 [%rd2+32], {%r5067, %r5066}; + st.local.v2.u32 [%rd2+40], {%r5069, %r5068}; + st.local.v2.u32 [%rd2+48], {%r5071, %r5070}; + st.local.v2.u32 [%rd2+56], {%r5073, %r5072}; + st.local.u8 [%rd2+144], %rs411; + ld.local.v4.u8 {%rs412, %rs413, %rs414, %rs415}, [%rd2+136]; + setp.eq.s16 %p10, %rs413, 0; + selp.u16 %rs419, 1, 0, %p10; + or.b16 %rs420, %rs414, %rs419; + mov.b32 {%rs421, %rs422}, %r5050; + shr.u16 %rs423, %rs421, 8; + shr.u16 %rs424, %rs422, 8; + mov.b32 {%rs425, %rs426}, %r5051; + shr.u16 %rs427, %rs425, 8; + shr.u16 %rs428, %rs426, 8; + mov.b32 {%rs429, %rs430}, %r5052; + shr.u16 %rs431, %rs429, 8; + shr.u16 %rs432, %rs430, 8; + mov.b32 {%rs433, %rs434}, %r5053; + shr.u16 %rs435, %rs433, 8; + shr.u16 %rs436, %rs434, 8; + cvt.u32.u16 %r5079, %rs421; + and.b32 %r5080, %r5079, 255; + cvt.u32.u16 %r5081, %rs423; + prmt.b32 %r5082, %r5081, %r5080, 30212; + cvt.u32.u16 %r5083, %rs422; + prmt.b32 
%r5084, %r5083, %r5082, 28756; + cvt.u32.u16 %r5085, %rs424; + prmt.b32 %r5086, %r5085, %r5084, 1620; + cvt.u32.u16 %r5087, %rs425; + and.b32 %r5088, %r5087, 255; + cvt.u32.u16 %r5089, %rs427; + prmt.b32 %r5090, %r5089, %r5088, 30212; + cvt.u32.u16 %r5091, %rs426; + prmt.b32 %r5092, %r5091, %r5090, 28756; + cvt.u32.u16 %r5093, %rs428; + prmt.b32 %r5094, %r5093, %r5092, 1620; + cvt.u32.u16 %r5095, %rs429; + and.b32 %r5096, %r5095, 255; + cvt.u32.u16 %r5097, %rs431; + prmt.b32 %r5098, %r5097, %r5096, 30212; + cvt.u32.u16 %r5099, %rs430; + prmt.b32 %r5100, %r5099, %r5098, 28756; + cvt.u32.u16 %r5101, %rs432; + prmt.b32 %r5102, %r5101, %r5100, 1620; + cvt.u32.u16 %r5103, %rs433; + and.b32 %r5104, %r5103, 255; + cvt.u32.u16 %r5105, %rs435; + prmt.b32 %r5106, %r5105, %r5104, 30212; + cvt.u32.u16 %r5107, %rs434; + prmt.b32 %r5108, %r5107, %r5106, 28756; + cvt.u32.u16 %r5109, %rs436; + prmt.b32 %r5110, %r5109, %r5108, 1620; + mov.b32 {%rs437, %rs438}, %r5054; + shr.u16 %rs439, %rs437, 8; + shr.u16 %rs440, %rs438, 8; + mov.b32 {%rs441, %rs442}, %r5055; + shr.u16 %rs443, %rs441, 8; + shr.u16 %rs444, %rs442, 8; + mov.b32 {%rs445, %rs446}, %r5056; + shr.u16 %rs447, %rs445, 8; + shr.u16 %rs448, %rs446, 8; + mov.b32 {%rs449, %rs450}, %r5057; + shr.u16 %rs451, %rs449, 8; + shr.u16 %rs452, %rs450, 8; + cvt.u32.u16 %r5115, %rs437; + and.b32 %r5116, %r5115, 255; + cvt.u32.u16 %r5117, %rs439; + prmt.b32 %r5118, %r5117, %r5116, 30212; + cvt.u32.u16 %r5119, %rs438; + prmt.b32 %r5120, %r5119, %r5118, 28756; + cvt.u32.u16 %r5121, %rs440; + prmt.b32 %r5122, %r5121, %r5120, 1620; + cvt.u32.u16 %r5123, %rs441; + and.b32 %r5124, %r5123, 255; + cvt.u32.u16 %r5125, %rs443; + prmt.b32 %r5126, %r5125, %r5124, 30212; + cvt.u32.u16 %r5127, %rs442; + prmt.b32 %r5128, %r5127, %r5126, 28756; + cvt.u32.u16 %r5129, %rs444; + prmt.b32 %r5130, %r5129, %r5128, 1620; + cvt.u32.u16 %r5131, %rs445; + and.b32 %r5132, %r5131, 255; + cvt.u32.u16 %r5133, %rs447; + prmt.b32 %r5134, %r5133, %r5132, 30212; + cvt.u32.u16 %r5135, %rs446; + prmt.b32 %r5136, %r5135, %r5134, 28756; + cvt.u32.u16 %r5137, %rs448; + prmt.b32 %r5138, %r5137, %r5136, 1620; + cvt.u32.u16 %r5139, %rs449; + and.b32 %r5140, %r5139, 255; + cvt.u32.u16 %r5141, %rs451; + prmt.b32 %r5142, %r5141, %r5140, 30212; + cvt.u32.u16 %r5143, %rs450; + prmt.b32 %r5144, %r5143, %r5142, 28756; + cvt.u32.u16 %r5145, %rs452; + prmt.b32 %r5146, %r5145, %r5144, 1620; + mov.b32 {%rs453, %rs454}, %r5058; + shr.u16 %rs455, %rs453, 8; + shr.u16 %rs456, %rs454, 8; + mov.b32 {%rs457, %rs458}, %r5059; + shr.u16 %rs459, %rs457, 8; + shr.u16 %rs460, %rs458, 8; + mov.b32 {%rs461, %rs462}, %r5060; + shr.u16 %rs463, %rs461, 8; + shr.u16 %rs464, %rs462, 8; + mov.b32 {%rs465, %rs466}, %r5061; + shr.u16 %rs467, %rs465, 8; + shr.u16 %rs468, %rs466, 8; + cvt.u32.u16 %r5151, %rs453; + and.b32 %r5152, %r5151, 255; + cvt.u32.u16 %r5153, %rs455; + prmt.b32 %r5154, %r5153, %r5152, 30212; + cvt.u32.u16 %r5155, %rs454; + prmt.b32 %r5156, %r5155, %r5154, 28756; + cvt.u32.u16 %r5157, %rs456; + prmt.b32 %r5158, %r5157, %r5156, 1620; + cvt.u32.u16 %r5159, %rs457; + and.b32 %r5160, %r5159, 255; + cvt.u32.u16 %r5161, %rs459; + prmt.b32 %r5162, %r5161, %r5160, 30212; + cvt.u32.u16 %r5163, %rs458; + prmt.b32 %r5164, %r5163, %r5162, 28756; + cvt.u32.u16 %r5165, %rs460; + prmt.b32 %r5166, %r5165, %r5164, 1620; + cvt.u32.u16 %r5167, %rs461; + and.b32 %r5168, %r5167, 255; + cvt.u32.u16 %r5169, %rs463; + prmt.b32 %r5170, %r5169, %r5168, 30212; + cvt.u32.u16 %r5171, %rs462; + prmt.b32 %r5172, %r5171, %r5170, 28756; + 
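+ // The cvt.u32.u16 / shr.u16 / prmt.b32 chains in this stretch repack the
+ // 64-byte input block byte-by-byte into sixteen little-endian 32-bit
+ // message words; each prmt.b32 selector (30212, 28756, 1620) inserts one
+ // more byte into the word being assembled. Equivalent CUDA (illustrative):
+ //
+ //   __device__ __forceinline__ uint32_t load_le32(const uint8_t *p) {
+ //       return (uint32_t)p[0]         | ((uint32_t)p[1] << 8) |
+ //              ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
+ //   }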
cvt.u32.u16 %r5173, %rs464; + prmt.b32 %r5174, %r5173, %r5172, 1620; + cvt.u32.u16 %r5175, %rs465; + and.b32 %r5176, %r5175, 255; + cvt.u32.u16 %r5177, %rs467; + prmt.b32 %r5178, %r5177, %r5176, 30212; + cvt.u32.u16 %r5179, %rs466; + prmt.b32 %r5180, %r5179, %r5178, 28756; + cvt.u32.u16 %r5181, %rs468; + prmt.b32 %r5182, %r5181, %r5180, 1620; + mov.b32 {%rs469, %rs470}, %r5062; + shr.u16 %rs471, %rs469, 8; + shr.u16 %rs472, %rs470, 8; + mov.b32 {%rs473, %rs474}, %r5063; + shr.u16 %rs475, %rs473, 8; + shr.u16 %rs476, %rs474, 8; + mov.b32 {%rs477, %rs478}, %r5064; + shr.u16 %rs479, %rs477, 8; + shr.u16 %rs480, %rs478, 8; + mov.b32 {%rs481, %rs482}, %r5065; + shr.u16 %rs483, %rs481, 8; + shr.u16 %rs484, %rs482, 8; + cvt.u32.u16 %r5187, %rs469; + and.b32 %r5188, %r5187, 255; + cvt.u32.u16 %r5189, %rs471; + prmt.b32 %r5190, %r5189, %r5188, 30212; + cvt.u32.u16 %r5191, %rs470; + prmt.b32 %r5192, %r5191, %r5190, 28756; + cvt.u32.u16 %r5193, %rs472; + prmt.b32 %r5194, %r5193, %r5192, 1620; + cvt.u32.u16 %r5195, %rs473; + and.b32 %r5196, %r5195, 255; + cvt.u32.u16 %r5197, %rs475; + prmt.b32 %r5198, %r5197, %r5196, 30212; + cvt.u32.u16 %r5199, %rs474; + prmt.b32 %r5200, %r5199, %r5198, 28756; + cvt.u32.u16 %r5201, %rs476; + prmt.b32 %r5202, %r5201, %r5200, 1620; + cvt.u32.u16 %r5203, %rs477; + and.b32 %r5204, %r5203, 255; + cvt.u32.u16 %r5205, %rs479; + prmt.b32 %r5206, %r5205, %r5204, 30212; + cvt.u32.u16 %r5207, %rs478; + prmt.b32 %r5208, %r5207, %r5206, 28756; + cvt.u32.u16 %r5209, %rs480; + prmt.b32 %r5210, %r5209, %r5208, 1620; + cvt.u32.u16 %r5211, %rs481; + and.b32 %r5212, %r5211, 255; + cvt.u32.u16 %r5213, %rs483; + prmt.b32 %r5214, %r5213, %r5212, 30212; + cvt.u32.u16 %r5215, %rs482; + prmt.b32 %r5216, %r5215, %r5214, 28756; + cvt.u32.u16 %r5217, %rs484; + prmt.b32 %r5218, %r5217, %r5216, 1620; + cvt.u32.u16 %r5219, %rs420; + and.b32 %r5220, %r5219, 255; + add.s32 %r5221, %r5086, -1156040474; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 16; + add.s32 %r5223, %r5222, 1779033703; + xor.b32 %r5224, %r5223, 1359893119; + shf.l.wrap.b32 %r5225, %r5224, %r5224, 20; + add.s32 %r5226, %r5094, %r5221; + add.s32 %r5227, %r5226, %r5225; + xor.b32 %r5228, %r5227, %r5222; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 24; + add.s32 %r5230, %r5229, %r5223; + xor.b32 %r5231, %r5230, %r5225; + shf.l.wrap.b32 %r5232, %r5231, %r5231, 25; + add.s32 %r5233, %r5102, 1449989905; + shf.l.wrap.b32 %r5234, %r5233, %r5233, 16; + add.s32 %r5235, %r5234, -1150833019; + xor.b32 %r5236, %r5235, -1694144372; + shf.l.wrap.b32 %r5237, %r5236, %r5236, 20; + add.s32 %r5238, %r5110, %r5233; + add.s32 %r5239, %r5238, %r5237; + xor.b32 %r5240, %r5239, %r5234; + shf.l.wrap.b32 %r5241, %r5240, %r5240, 24; + add.s32 %r5242, %r5241, %r5235; + xor.b32 %r5243, %r5242, %r5237; + shf.l.wrap.b32 %r5244, %r5243, %r5243, 25; + add.s32 %r5245, %r5122, 1542638877; + shr.u32 %r5246, %r5245, 16; + shl.b32 %r5247, %r5245, 16; + xor.b32 %r5248, %r5247, 4194304; + or.b32 %r5249, %r5248, %r5246; + add.s32 %r5250, %r5249, 1013904242; + xor.b32 %r5251, %r5250, 528734635; + shf.l.wrap.b32 %r5252, %r5251, %r5251, 20; + add.s32 %r5253, %r5130, %r5245; + add.s32 %r5254, %r5253, %r5252; + xor.b32 %r5255, %r5254, %r5249; + shf.l.wrap.b32 %r5256, %r5255, %r5255, 24; + add.s32 %r5257, %r5256, %r5250; + xor.b32 %r5258, %r5257, %r5252; + shf.l.wrap.b32 %r5259, %r5258, %r5258, 25; + add.s32 %r5260, %r5138, 19972691; + xor.b32 %r5261, %r5260, %r5220; + shr.u32 %r5262, %r5260, 16; + shl.b32 %r5263, %r5261, 16; + or.b32 %r5264, %r5263, %r5262; + add.s32 %r5265, %r5264, 
-1521486534; + xor.b32 %r5266, %r5265, 1541459225; + shf.l.wrap.b32 %r5267, %r5266, %r5266, 20; + add.s32 %r5268, %r5146, %r5260; + add.s32 %r5269, %r5268, %r5267; + xor.b32 %r5270, %r5269, %r5264; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 24; + add.s32 %r5272, %r5271, %r5265; + xor.b32 %r5273, %r5272, %r5267; + shf.l.wrap.b32 %r5274, %r5273, %r5273, 25; + add.s32 %r5275, %r5244, %r5227; + add.s32 %r5276, %r5275, %r5158; + xor.b32 %r5277, %r5271, %r5276; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 16; + add.s32 %r5279, %r5278, %r5257; + xor.b32 %r5280, %r5279, %r5244; + shf.l.wrap.b32 %r5281, %r5280, %r5280, 20; + add.s32 %r5282, %r5166, %r5276; + add.s32 %r5283, %r5282, %r5281; + xor.b32 %r5284, %r5283, %r5278; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 24; + add.s32 %r5286, %r5285, %r5279; + xor.b32 %r5287, %r5286, %r5281; + shf.l.wrap.b32 %r5288, %r5287, %r5287, 25; + add.s32 %r5289, %r5259, %r5239; + add.s32 %r5290, %r5289, %r5174; + xor.b32 %r5291, %r5290, %r5229; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 16; + add.s32 %r5293, %r5292, %r5272; + xor.b32 %r5294, %r5293, %r5259; + shf.l.wrap.b32 %r5295, %r5294, %r5294, 20; + add.s32 %r5296, %r5182, %r5290; + add.s32 %r5297, %r5296, %r5295; + xor.b32 %r5298, %r5297, %r5292; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 24; + add.s32 %r5300, %r5299, %r5293; + xor.b32 %r5301, %r5300, %r5295; + shf.l.wrap.b32 %r5302, %r5301, %r5301, 25; + add.s32 %r5303, %r5274, %r5254; + add.s32 %r5304, %r5303, %r5194; + xor.b32 %r5305, %r5304, %r5241; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 16; + add.s32 %r5307, %r5306, %r5230; + xor.b32 %r5308, %r5307, %r5274; + shf.l.wrap.b32 %r5309, %r5308, %r5308, 20; + add.s32 %r5310, %r5202, %r5304; + add.s32 %r5311, %r5310, %r5309; + xor.b32 %r5312, %r5311, %r5306; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 24; + add.s32 %r5314, %r5313, %r5307; + xor.b32 %r5315, %r5314, %r5309; + shf.l.wrap.b32 %r5316, %r5315, %r5315, 25; + add.s32 %r5317, %r5269, %r5232; + add.s32 %r5318, %r5317, %r5210; + xor.b32 %r5319, %r5318, %r5256; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 16; + add.s32 %r5321, %r5320, %r5242; + xor.b32 %r5322, %r5321, %r5232; + shf.l.wrap.b32 %r5323, %r5322, %r5322, 20; + add.s32 %r5324, %r5218, %r5318; + add.s32 %r5325, %r5324, %r5323; + xor.b32 %r5326, %r5325, %r5320; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 24; + add.s32 %r5328, %r5327, %r5321; + xor.b32 %r5329, %r5328, %r5323; + shf.l.wrap.b32 %r5330, %r5329, %r5329, 25; + add.s32 %r5331, %r5283, %r5102; + add.s32 %r5332, %r5331, %r5330; + xor.b32 %r5333, %r5332, %r5299; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 16; + add.s32 %r5335, %r5334, %r5314; + xor.b32 %r5336, %r5335, %r5330; + shf.l.wrap.b32 %r5337, %r5336, %r5336, 20; + add.s32 %r5338, %r5332, %r5138; + add.s32 %r5339, %r5338, %r5337; + xor.b32 %r5340, %r5339, %r5334; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 24; + add.s32 %r5342, %r5341, %r5335; + xor.b32 %r5343, %r5342, %r5337; + shf.l.wrap.b32 %r5344, %r5343, %r5343, 25; + add.s32 %r5345, %r5297, %r5110; + add.s32 %r5346, %r5345, %r5288; + xor.b32 %r5347, %r5313, %r5346; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 16; + add.s32 %r5349, %r5328, %r5348; + xor.b32 %r5350, %r5349, %r5288; + shf.l.wrap.b32 %r5351, %r5350, %r5350, 20; + add.s32 %r5352, %r5346, %r5174; + add.s32 %r5353, %r5352, %r5351; + xor.b32 %r5354, %r5353, %r5348; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 24; + add.s32 %r5356, %r5355, %r5349; + xor.b32 %r5357, %r5356, %r5351; + shf.l.wrap.b32 %r5358, %r5357, %r5357, 25; + add.s32 %r5359, %r5302, %r5146; + add.s32 %r5360, %r5359, %r5311; + xor.b32 
%r5361, %r5327, %r5360; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 16; + add.s32 %r5363, %r5362, %r5286; + xor.b32 %r5364, %r5363, %r5302; + shf.l.wrap.b32 %r5365, %r5364, %r5364, 20; + add.s32 %r5366, %r5360, %r5086; + add.s32 %r5367, %r5366, %r5365; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 24; + add.s32 %r5370, %r5369, %r5363; + xor.b32 %r5371, %r5370, %r5365; + shf.l.wrap.b32 %r5372, %r5371, %r5371, 25; + add.s32 %r5373, %r5316, %r5122; + add.s32 %r5374, %r5373, %r5325; + xor.b32 %r5375, %r5374, %r5285; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 16; + add.s32 %r5377, %r5376, %r5300; + xor.b32 %r5378, %r5377, %r5316; + shf.l.wrap.b32 %r5379, %r5378, %r5378, 20; + add.s32 %r5380, %r5374, %r5202; + add.s32 %r5381, %r5380, %r5379; + xor.b32 %r5382, %r5381, %r5376; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 24; + add.s32 %r5384, %r5383, %r5377; + xor.b32 %r5385, %r5384, %r5379; + shf.l.wrap.b32 %r5386, %r5385, %r5385, 25; + add.s32 %r5387, %r5339, %r5094; + add.s32 %r5388, %r5387, %r5358; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 16; + add.s32 %r5391, %r5390, %r5370; + xor.b32 %r5392, %r5391, %r5358; + shf.l.wrap.b32 %r5393, %r5392, %r5392, 20; + add.s32 %r5394, %r5388, %r5182; + add.s32 %r5395, %r5394, %r5393; + xor.b32 %r5396, %r5395, %r5390; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 24; + add.s32 %r5398, %r5397, %r5391; + xor.b32 %r5399, %r5398, %r5393; + shf.l.wrap.b32 %r5400, %r5399, %r5399, 25; + add.s32 %r5401, %r5353, %r5194; + add.s32 %r5402, %r5401, %r5372; + xor.b32 %r5403, %r5402, %r5341; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 16; + add.s32 %r5405, %r5404, %r5384; + xor.b32 %r5406, %r5405, %r5372; + shf.l.wrap.b32 %r5407, %r5406, %r5406, 20; + add.s32 %r5408, %r5402, %r5130; + add.s32 %r5409, %r5408, %r5407; + xor.b32 %r5410, %r5409, %r5404; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 24; + add.s32 %r5412, %r5411, %r5405; + xor.b32 %r5413, %r5412, %r5407; + shf.l.wrap.b32 %r5414, %r5413, %r5413, 25; + add.s32 %r5415, %r5367, %r5166; + add.s32 %r5416, %r5415, %r5386; + xor.b32 %r5417, %r5416, %r5355; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 16; + add.s32 %r5419, %r5418, %r5342; + xor.b32 %r5420, %r5419, %r5386; + shf.l.wrap.b32 %r5421, %r5420, %r5420, 20; + add.s32 %r5422, %r5416, %r5210; + add.s32 %r5423, %r5422, %r5421; + xor.b32 %r5424, %r5423, %r5418; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 24; + add.s32 %r5426, %r5425, %r5419; + xor.b32 %r5427, %r5426, %r5421; + shf.l.wrap.b32 %r5428, %r5427, %r5427, 25; + add.s32 %r5429, %r5381, %r5218; + add.s32 %r5430, %r5429, %r5344; + xor.b32 %r5431, %r5430, %r5369; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 16; + add.s32 %r5433, %r5432, %r5356; + xor.b32 %r5434, %r5433, %r5344; + shf.l.wrap.b32 %r5435, %r5434, %r5434, 20; + add.s32 %r5436, %r5430, %r5158; + add.s32 %r5437, %r5436, %r5435; + xor.b32 %r5438, %r5437, %r5432; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 24; + add.s32 %r5440, %r5439, %r5433; + xor.b32 %r5441, %r5440, %r5435; + shf.l.wrap.b32 %r5442, %r5441, %r5441, 25; + add.s32 %r5443, %r5395, %r5110; + add.s32 %r5444, %r5443, %r5442; + xor.b32 %r5445, %r5444, %r5411; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 16; + add.s32 %r5447, %r5446, %r5426; + xor.b32 %r5448, %r5447, %r5442; + shf.l.wrap.b32 %r5449, %r5448, %r5448, 20; + add.s32 %r5450, %r5444, %r5122; + add.s32 %r5451, %r5450, %r5449; + xor.b32 %r5452, %r5451, %r5446; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 24; + add.s32 %r5454, %r5453, %r5447; + xor.b32 %r5455, %r5454, %r5449; + shf.l.wrap.b32 %r5456, %r5455, 
%r5455, 25; + add.s32 %r5457, %r5409, %r5174; + add.s32 %r5458, %r5457, %r5400; + xor.b32 %r5459, %r5458, %r5425; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 16; + add.s32 %r5461, %r5460, %r5440; + xor.b32 %r5462, %r5461, %r5400; + shf.l.wrap.b32 %r5463, %r5462, %r5462, 20; + add.s32 %r5464, %r5458, %r5194; + add.s32 %r5465, %r5464, %r5463; + xor.b32 %r5466, %r5465, %r5460; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 24; + add.s32 %r5468, %r5467, %r5461; + xor.b32 %r5469, %r5468, %r5463; + shf.l.wrap.b32 %r5470, %r5469, %r5469, 25; + add.s32 %r5471, %r5423, %r5202; + add.s32 %r5472, %r5471, %r5414; + xor.b32 %r5473, %r5472, %r5439; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 16; + add.s32 %r5475, %r5474, %r5398; + xor.b32 %r5476, %r5475, %r5414; + shf.l.wrap.b32 %r5477, %r5476, %r5476, 20; + add.s32 %r5478, %r5472, %r5102; + add.s32 %r5479, %r5478, %r5477; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 24; + add.s32 %r5482, %r5481, %r5475; + xor.b32 %r5483, %r5482, %r5477; + shf.l.wrap.b32 %r5484, %r5483, %r5483, 25; + add.s32 %r5485, %r5437, %r5146; + add.s32 %r5486, %r5485, %r5428; + xor.b32 %r5487, %r5486, %r5397; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 16; + add.s32 %r5489, %r5488, %r5412; + xor.b32 %r5490, %r5489, %r5428; + shf.l.wrap.b32 %r5491, %r5490, %r5490, 20; + add.s32 %r5492, %r5486, %r5210; + add.s32 %r5493, %r5492, %r5491; + xor.b32 %r5494, %r5493, %r5488; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 24; + add.s32 %r5496, %r5495, %r5489; + xor.b32 %r5497, %r5496, %r5491; + shf.l.wrap.b32 %r5498, %r5497, %r5497, 25; + add.s32 %r5499, %r5451, %r5138; + add.s32 %r5500, %r5499, %r5470; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 16; + add.s32 %r5503, %r5502, %r5482; + xor.b32 %r5504, %r5503, %r5470; + shf.l.wrap.b32 %r5505, %r5504, %r5504, 20; + add.s32 %r5506, %r5500, %r5130; + add.s32 %r5507, %r5506, %r5505; + xor.b32 %r5508, %r5507, %r5502; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 24; + add.s32 %r5510, %r5509, %r5503; + xor.b32 %r5511, %r5510, %r5505; + shf.l.wrap.b32 %r5512, %r5511, %r5511, 25; + add.s32 %r5513, %r5465, %r5166; + add.s32 %r5514, %r5513, %r5484; + xor.b32 %r5515, %r5514, %r5453; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 16; + add.s32 %r5517, %r5516, %r5496; + xor.b32 %r5518, %r5517, %r5484; + shf.l.wrap.b32 %r5519, %r5518, %r5518, 20; + add.s32 %r5520, %r5514, %r5086; + add.s32 %r5521, %r5520, %r5519; + xor.b32 %r5522, %r5521, %r5516; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 24; + add.s32 %r5524, %r5523, %r5517; + xor.b32 %r5525, %r5524, %r5519; + shf.l.wrap.b32 %r5526, %r5525, %r5525, 25; + add.s32 %r5527, %r5479, %r5182; + add.s32 %r5528, %r5527, %r5498; + xor.b32 %r5529, %r5528, %r5467; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 16; + add.s32 %r5531, %r5530, %r5454; + xor.b32 %r5532, %r5531, %r5498; + shf.l.wrap.b32 %r5533, %r5532, %r5532, 20; + add.s32 %r5534, %r5528, %r5218; + add.s32 %r5535, %r5534, %r5533; + xor.b32 %r5536, %r5535, %r5530; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 24; + add.s32 %r5538, %r5537, %r5531; + xor.b32 %r5539, %r5538, %r5533; + shf.l.wrap.b32 %r5540, %r5539, %r5539, 25; + add.s32 %r5541, %r5493, %r5158; + add.s32 %r5542, %r5541, %r5456; + xor.b32 %r5543, %r5542, %r5481; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 16; + add.s32 %r5545, %r5544, %r5468; + xor.b32 %r5546, %r5545, %r5456; + shf.l.wrap.b32 %r5547, %r5546, %r5546, 20; + add.s32 %r5548, %r5542, %r5094; + add.s32 %r5549, %r5548, %r5547; + xor.b32 %r5550, %r5549, %r5544; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 24; + add.s32 
%r5552, %r5551, %r5545; + xor.b32 %r5553, %r5552, %r5547; + shf.l.wrap.b32 %r5554, %r5553, %r5553, 25; + add.s32 %r5555, %r5507, %r5174; + add.s32 %r5556, %r5555, %r5554; + xor.b32 %r5557, %r5556, %r5523; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 16; + add.s32 %r5559, %r5558, %r5538; + xor.b32 %r5560, %r5559, %r5554; + shf.l.wrap.b32 %r5561, %r5560, %r5560, 20; + add.s32 %r5562, %r5556, %r5146; + add.s32 %r5563, %r5562, %r5561; + xor.b32 %r5564, %r5563, %r5558; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 24; + add.s32 %r5566, %r5565, %r5559; + xor.b32 %r5567, %r5566, %r5561; + shf.l.wrap.b32 %r5568, %r5567, %r5567, 25; + add.s32 %r5569, %r5521, %r5194; + add.s32 %r5570, %r5569, %r5512; + xor.b32 %r5571, %r5570, %r5537; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 16; + add.s32 %r5573, %r5572, %r5552; + xor.b32 %r5574, %r5573, %r5512; + shf.l.wrap.b32 %r5575, %r5574, %r5574, 20; + add.s32 %r5576, %r5570, %r5166; + add.s32 %r5577, %r5576, %r5575; + xor.b32 %r5578, %r5577, %r5572; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 24; + add.s32 %r5580, %r5579, %r5573; + xor.b32 %r5581, %r5580, %r5575; + shf.l.wrap.b32 %r5582, %r5581, %r5581, 25; + add.s32 %r5583, %r5535, %r5210; + add.s32 %r5584, %r5583, %r5526; + xor.b32 %r5585, %r5584, %r5551; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 16; + add.s32 %r5587, %r5586, %r5510; + xor.b32 %r5588, %r5587, %r5526; + shf.l.wrap.b32 %r5589, %r5588, %r5588, 20; + add.s32 %r5590, %r5584, %r5110; + add.s32 %r5591, %r5590, %r5589; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 24; + add.s32 %r5594, %r5593, %r5587; + xor.b32 %r5595, %r5594, %r5589; + shf.l.wrap.b32 %r5596, %r5595, %r5595, 25; + add.s32 %r5597, %r5549, %r5202; + add.s32 %r5598, %r5597, %r5540; + xor.b32 %r5599, %r5598, %r5509; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 16; + add.s32 %r5601, %r5600, %r5524; + xor.b32 %r5602, %r5601, %r5540; + shf.l.wrap.b32 %r5603, %r5602, %r5602, 20; + add.s32 %r5604, %r5598, %r5218; + add.s32 %r5605, %r5604, %r5603; + xor.b32 %r5606, %r5605, %r5600; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 24; + add.s32 %r5608, %r5607, %r5601; + xor.b32 %r5609, %r5608, %r5603; + shf.l.wrap.b32 %r5610, %r5609, %r5609, 25; + add.s32 %r5611, %r5563, %r5122; + add.s32 %r5612, %r5611, %r5582; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 16; + add.s32 %r5615, %r5614, %r5594; + xor.b32 %r5616, %r5615, %r5582; + shf.l.wrap.b32 %r5617, %r5616, %r5616, 20; + add.s32 %r5618, %r5612, %r5086; + add.s32 %r5619, %r5618, %r5617; + xor.b32 %r5620, %r5619, %r5614; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 24; + add.s32 %r5622, %r5621, %r5615; + xor.b32 %r5623, %r5622, %r5617; + shf.l.wrap.b32 %r5624, %r5623, %r5623, 25; + add.s32 %r5625, %r5577, %r5182; + add.s32 %r5626, %r5625, %r5596; + xor.b32 %r5627, %r5626, %r5565; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 16; + add.s32 %r5629, %r5628, %r5608; + xor.b32 %r5630, %r5629, %r5596; + shf.l.wrap.b32 %r5631, %r5630, %r5630, 20; + add.s32 %r5632, %r5626, %r5102; + add.s32 %r5633, %r5632, %r5631; + xor.b32 %r5634, %r5633, %r5628; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 24; + add.s32 %r5636, %r5635, %r5629; + xor.b32 %r5637, %r5636, %r5631; + shf.l.wrap.b32 %r5638, %r5637, %r5637, 25; + add.s32 %r5639, %r5591, %r5130; + add.s32 %r5640, %r5639, %r5610; + xor.b32 %r5641, %r5640, %r5579; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 16; + add.s32 %r5643, %r5642, %r5566; + xor.b32 %r5644, %r5643, %r5610; + shf.l.wrap.b32 %r5645, %r5644, %r5644, 20; + add.s32 %r5646, %r5640, %r5158; + add.s32 %r5647, %r5646, %r5645; + 
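+ // Rounds two through seven of the compression reuse the same quarter-round
+ // with the message words reordered each round; the compiler has folded
+ // BLAKE3's fixed schedule into its register choices. Sketch, using the
+ // published BLAKE3 permutation (table not read from this file):
+ //
+ //   __constant__ uint8_t MSG_PERM[16] =
+ //       {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8};
+ //   __device__ void permute(uint32_t m[16]) {
+ //       uint32_t t[16];
+ //       for (int i = 0; i < 16; i++) t[i] = m[MSG_PERM[i]];
+ //       for (int i = 0; i < 16; i++) m[i] = t[i];
+ //   }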
xor.b32 %r5648, %r5647, %r5642; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 24; + add.s32 %r5650, %r5649, %r5643; + xor.b32 %r5651, %r5650, %r5645; + shf.l.wrap.b32 %r5652, %r5651, %r5651, 25; + add.s32 %r5653, %r5605, %r5094; + add.s32 %r5654, %r5653, %r5568; + xor.b32 %r5655, %r5654, %r5593; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 16; + add.s32 %r5657, %r5656, %r5580; + xor.b32 %r5658, %r5657, %r5568; + shf.l.wrap.b32 %r5659, %r5658, %r5658, 20; + add.s32 %r5660, %r5654, %r5138; + add.s32 %r5661, %r5660, %r5659; + xor.b32 %r5662, %r5661, %r5656; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 24; + add.s32 %r5664, %r5663, %r5657; + xor.b32 %r5665, %r5664, %r5659; + shf.l.wrap.b32 %r5666, %r5665, %r5665, 25; + add.s32 %r5667, %r5619, %r5194; + add.s32 %r5668, %r5667, %r5666; + xor.b32 %r5669, %r5668, %r5635; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 16; + add.s32 %r5671, %r5670, %r5650; + xor.b32 %r5672, %r5671, %r5666; + shf.l.wrap.b32 %r5673, %r5672, %r5672, 20; + add.s32 %r5674, %r5668, %r5202; + add.s32 %r5675, %r5674, %r5673; + xor.b32 %r5676, %r5675, %r5670; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 24; + add.s32 %r5678, %r5677, %r5671; + xor.b32 %r5679, %r5678, %r5673; + shf.l.wrap.b32 %r5680, %r5679, %r5679, 25; + add.s32 %r5681, %r5633, %r5166; + add.s32 %r5682, %r5681, %r5624; + xor.b32 %r5683, %r5682, %r5649; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 16; + add.s32 %r5685, %r5684, %r5664; + xor.b32 %r5686, %r5685, %r5624; + shf.l.wrap.b32 %r5687, %r5686, %r5686, 20; + add.s32 %r5688, %r5682, %r5182; + add.s32 %r5689, %r5688, %r5687; + xor.b32 %r5690, %r5689, %r5684; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 24; + add.s32 %r5692, %r5691, %r5685; + xor.b32 %r5693, %r5692, %r5687; + shf.l.wrap.b32 %r5694, %r5693, %r5693, 25; + add.s32 %r5695, %r5647, %r5218; + add.s32 %r5696, %r5695, %r5638; + xor.b32 %r5697, %r5696, %r5663; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 16; + add.s32 %r5699, %r5698, %r5622; + xor.b32 %r5700, %r5699, %r5638; + shf.l.wrap.b32 %r5701, %r5700, %r5700, 20; + add.s32 %r5702, %r5696, %r5174; + add.s32 %r5703, %r5702, %r5701; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 24; + add.s32 %r5706, %r5705, %r5699; + xor.b32 %r5707, %r5706, %r5701; + shf.l.wrap.b32 %r5708, %r5707, %r5707, 25; + add.s32 %r5709, %r5661, %r5210; + add.s32 %r5710, %r5709, %r5652; + xor.b32 %r5711, %r5710, %r5621; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 16; + add.s32 %r5713, %r5712, %r5636; + xor.b32 %r5714, %r5713, %r5652; + shf.l.wrap.b32 %r5715, %r5714, %r5714, 20; + add.s32 %r5716, %r5710, %r5158; + add.s32 %r5717, %r5716, %r5715; + xor.b32 %r5718, %r5717, %r5712; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 24; + add.s32 %r5720, %r5719, %r5713; + xor.b32 %r5721, %r5720, %r5715; + shf.l.wrap.b32 %r5722, %r5721, %r5721, 25; + add.s32 %r5723, %r5675, %r5146; + add.s32 %r5724, %r5723, %r5694; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 16; + add.s32 %r5727, %r5726, %r5706; + xor.b32 %r5728, %r5727, %r5694; + shf.l.wrap.b32 %r5729, %r5728, %r5728, 20; + add.s32 %r5730, %r5724, %r5102; + add.s32 %r5731, %r5730, %r5729; + xor.b32 %r5732, %r5731, %r5726; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 24; + add.s32 %r5734, %r5733, %r5727; + xor.b32 %r5735, %r5734, %r5729; + shf.l.wrap.b32 %r5736, %r5735, %r5735, 25; + add.s32 %r5737, %r5689, %r5130; + add.s32 %r5738, %r5737, %r5708; + xor.b32 %r5739, %r5738, %r5677; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 16; + add.s32 %r5741, %r5740, %r5720; + xor.b32 %r5742, %r5741, %r5708; + shf.l.wrap.b32 %r5743, 
%r5742, %r5742, 20; + add.s32 %r5744, %r5738, %r5110; + add.s32 %r5745, %r5744, %r5743; + xor.b32 %r5746, %r5745, %r5740; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 24; + add.s32 %r5748, %r5747, %r5741; + xor.b32 %r5749, %r5748, %r5743; + shf.l.wrap.b32 %r5750, %r5749, %r5749, 25; + add.s32 %r5751, %r5703, %r5086; + add.s32 %r5752, %r5751, %r5722; + xor.b32 %r5753, %r5752, %r5691; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 16; + add.s32 %r5755, %r5754, %r5678; + xor.b32 %r5756, %r5755, %r5722; + shf.l.wrap.b32 %r5757, %r5756, %r5756, 20; + add.s32 %r5758, %r5752, %r5094; + add.s32 %r5759, %r5758, %r5757; + xor.b32 %r5760, %r5759, %r5754; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 24; + add.s32 %r5762, %r5761, %r5755; + xor.b32 %r5763, %r5762, %r5757; + shf.l.wrap.b32 %r5764, %r5763, %r5763, 25; + add.s32 %r5765, %r5717, %r5138; + add.s32 %r5766, %r5765, %r5680; + xor.b32 %r5767, %r5766, %r5705; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 16; + add.s32 %r5769, %r5768, %r5692; + xor.b32 %r5770, %r5769, %r5680; + shf.l.wrap.b32 %r5771, %r5770, %r5770, 20; + add.s32 %r5772, %r5766, %r5122; + add.s32 %r5773, %r5772, %r5771; + xor.b32 %r5774, %r5773, %r5768; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 24; + add.s32 %r5776, %r5775, %r5769; + xor.b32 %r5777, %r5776, %r5771; + shf.l.wrap.b32 %r5778, %r5777, %r5777, 25; + add.s32 %r5779, %r5731, %r5166; + add.s32 %r5780, %r5779, %r5778; + xor.b32 %r5781, %r5780, %r5747; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 16; + add.s32 %r5783, %r5782, %r5762; + xor.b32 %r5784, %r5783, %r5778; + shf.l.wrap.b32 %r5785, %r5784, %r5784, 20; + add.s32 %r5786, %r5780, %r5210; + add.s32 %r5787, %r5786, %r5785; + xor.b32 %r5788, %r5787, %r5782; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 24; + add.s32 %r5790, %r5789, %r5783; + xor.b32 %r5791, %r5790, %r5785; + shf.l.wrap.b32 %r5792, %r5791, %r5791, 25; + add.s32 %r5793, %r5745, %r5182; + add.s32 %r5794, %r5793, %r5736; + xor.b32 %r5795, %r5794, %r5761; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 16; + add.s32 %r5797, %r5796, %r5776; + xor.b32 %r5798, %r5797, %r5736; + shf.l.wrap.b32 %r5799, %r5798, %r5798, 20; + add.s32 %r5800, %r5794, %r5130; + add.s32 %r5801, %r5800, %r5799; + xor.b32 %r5802, %r5801, %r5796; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 24; + add.s32 %r5804, %r5803, %r5797; + xor.b32 %r5805, %r5804, %r5799; + shf.l.wrap.b32 %r5806, %r5805, %r5805, 25; + add.s32 %r5807, %r5759, %r5158; + add.s32 %r5808, %r5807, %r5750; + xor.b32 %r5809, %r5808, %r5775; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 16; + add.s32 %r5811, %r5810, %r5734; + xor.b32 %r5812, %r5811, %r5750; + shf.l.wrap.b32 %r5813, %r5812, %r5812, 20; + add.s32 %r5814, %r5808, %r5194; + add.s32 %r5815, %r5814, %r5813; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 24; + add.s32 %r5818, %r5817, %r5811; + xor.b32 %r5819, %r5818, %r5813; + shf.l.wrap.b32 %r5820, %r5819, %r5819, 25; + add.s32 %r5821, %r5773, %r5218; + add.s32 %r5822, %r5821, %r5764; + xor.b32 %r5823, %r5822, %r5733; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 16; + add.s32 %r5825, %r5824, %r5748; + xor.b32 %r5826, %r5825, %r5764; + shf.l.wrap.b32 %r5827, %r5826, %r5826, 20; + add.s32 %r5828, %r5822, %r5094; + add.s32 %r5829, %r5828, %r5827; + xor.b32 %r5830, %r5829, %r5824; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 24; + add.s32 %r5832, %r5831, %r5825; + xor.b32 %r5833, %r5832, %r5827; + shf.l.wrap.b32 %r5834, %r5833, %r5833, 25; + add.s32 %r5835, %r5787, %r5202; + add.s32 %r5836, %r5835, %r5806; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 16; + 
add.s32 %r5839, %r5838, %r5818; + xor.b32 %r5840, %r5839, %r5806; + shf.l.wrap.b32 %r5841, %r5840, %r5840, 20; + add.s32 %r5842, %r5836, %r5110; + add.s32 %r5843, %r5842, %r5841; + xor.b32 %r5844, %r5843, %r5838; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 24; + add.s32 %r5846, %r5845, %r5839; + xor.b32 %r5847, %r5846, %r5841; + shf.l.wrap.b32 %r5848, %r5847, %r5847, 25; + add.s32 %r5849, %r5801, %r5086; + add.s32 %r5850, %r5849, %r5820; + xor.b32 %r5851, %r5850, %r5789; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 16; + add.s32 %r5853, %r5852, %r5832; + xor.b32 %r5854, %r5853, %r5820; + shf.l.wrap.b32 %r5855, %r5854, %r5854, 20; + add.s32 %r5856, %r5850, %r5174; + add.s32 %r5857, %r5856, %r5855; + xor.b32 %r5858, %r5857, %r5852; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 24; + add.s32 %r5860, %r5859, %r5853; + xor.b32 %r5861, %r5860, %r5855; + shf.l.wrap.b32 %r5862, %r5861, %r5861, 25; + add.s32 %r5863, %r5815, %r5102; + add.s32 %r5864, %r5863, %r5834; + xor.b32 %r5865, %r5864, %r5803; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 16; + add.s32 %r5867, %r5866, %r5790; + xor.b32 %r5868, %r5867, %r5834; + shf.l.wrap.b32 %r5869, %r5868, %r5868, 20; + add.s32 %r5870, %r5864, %r5138; + add.s32 %r5871, %r5870, %r5869; + xor.b32 %r5872, %r5871, %r5866; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 24; + add.s32 %r5874, %r5873, %r5867; + xor.b32 %r5875, %r5874, %r5869; + shf.l.wrap.b32 %r5876, %r5875, %r5875, 25; + add.s32 %r5877, %r5829, %r5122; + add.s32 %r5878, %r5877, %r5792; + xor.b32 %r5879, %r5878, %r5817; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 16; + add.s32 %r5881, %r5880, %r5804; + xor.b32 %r5882, %r5881, %r5792; + shf.l.wrap.b32 %r5883, %r5882, %r5882, 20; + add.s32 %r5884, %r5878, %r5146; + add.s32 %r5885, %r5884, %r5883; + xor.b32 %r5886, %r5885, %r5880; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 24; + add.s32 %r5888, %r5887, %r5881; + xor.b32 %r5889, %r5888, %r5883; + shf.l.wrap.b32 %r5890, %r5889, %r5889, 25; + add.s32 %r5891, %r5843, %r5182; + add.s32 %r5892, %r5891, %r5890; + xor.b32 %r5893, %r5892, %r5859; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 16; + add.s32 %r5895, %r5894, %r5874; + xor.b32 %r5896, %r5895, %r5890; + shf.l.wrap.b32 %r5897, %r5896, %r5896, 20; + add.s32 %r5898, %r5892, %r5218; + add.s32 %r5899, %r5898, %r5897; + xor.b32 %r5900, %r5899, %r5894; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 24; + add.s32 %r5902, %r5901, %r5895; + xor.b32 %r5903, %r5902, %r5897; + shf.l.wrap.b32 %r5904, %r5903, %r5903, 25; + add.s32 %r5905, %r5857, %r5130; + add.s32 %r5906, %r5905, %r5848; + xor.b32 %r5907, %r5906, %r5873; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 16; + add.s32 %r5909, %r5908, %r5888; + xor.b32 %r5910, %r5909, %r5848; + shf.l.wrap.b32 %r5911, %r5910, %r5910, 20; + add.s32 %r5912, %r5906, %r5086; + add.s32 %r5913, %r5912, %r5911; + xor.b32 %r5914, %r5913, %r5908; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 24; + add.s32 %r5916, %r5915, %r5909; + xor.b32 %r5917, %r5916, %r5911; + shf.l.wrap.b32 %r5918, %r5917, %r5917, 25; + add.s32 %r5919, %r5871, %r5094; + add.s32 %r5920, %r5919, %r5862; + xor.b32 %r5921, %r5920, %r5887; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 16; + add.s32 %r5923, %r5922, %r5846; + xor.b32 %r5924, %r5923, %r5862; + shf.l.wrap.b32 %r5925, %r5924, %r5924, 20; + add.s32 %r5926, %r5920, %r5166; + add.s32 %r5927, %r5926, %r5925; + xor.b32 %r5928, %r5927, %r5922; + shf.l.wrap.b32 %r5929, %r5928, %r5928, 24; + add.s32 %r5930, %r5929, %r5923; + xor.b32 %r5931, %r5930, %r5925; + shf.l.wrap.b32 %r5932, %r5931, %r5931, 25; + add.s32 %r5933, %r5885, %r5158; + add.s32 %r5934, %r5933, 
%r5876; + xor.b32 %r5935, %r5934, %r5845; + shf.l.wrap.b32 %r5936, %r5935, %r5935, 16; + add.s32 %r5937, %r5936, %r5860; + xor.b32 %r5938, %r5937, %r5876; + shf.l.wrap.b32 %r5939, %r5938, %r5938, 20; + add.s32 %r5940, %r5934, %r5138; + add.s32 %r5941, %r5940, %r5939; + xor.b32 %r5942, %r5941, %r5936; + shf.l.wrap.b32 %r5943, %r5942, %r5942, 24; + add.s32 %r5944, %r5943, %r5937; + xor.b32 %r5945, %r5944, %r5939; + shf.l.wrap.b32 %r5946, %r5945, %r5945, 25; + add.s32 %r5947, %r5899, %r5210; + add.s32 %r5948, %r5947, %r5918; + xor.b32 %r5949, %r5948, %r5943; + shf.l.wrap.b32 %r5950, %r5949, %r5949, 16; + add.s32 %r5951, %r5950, %r5930; + xor.b32 %r5952, %r5951, %r5918; + shf.l.wrap.b32 %r5953, %r5952, %r5952, 20; + add.s32 %r5954, %r5948, %r5174; + add.s32 %r5955, %r5954, %r5953; + xor.b32 %r5956, %r5955, %r5950; + shf.l.wrap.b32 %r5957, %r5956, %r5956, 24; + add.s32 %r5958, %r5957, %r5951; + xor.b32 %r5959, %r5958, %r5953; + shf.l.wrap.b32 %r5960, %r5959, %r5959, 25; + add.s32 %r5961, %r5913, %r5102; + add.s32 %r5962, %r5961, %r5932; + xor.b32 %r5963, %r5962, %r5901; + shf.l.wrap.b32 %r5964, %r5963, %r5963, 16; + add.s32 %r5965, %r5964, %r5944; + xor.b32 %r5966, %r5965, %r5932; + shf.l.wrap.b32 %r5967, %r5966, %r5966, 20; + add.s32 %r5968, %r5962, %r5194; + add.s32 %r5969, %r5968, %r5967; + xor.b32 %r5970, %r5969, %r5964; + shf.l.wrap.b32 %r5971, %r5970, %r5970, 24; + add.s32 %r5972, %r5971, %r5965; + xor.b32 %r5973, %r5972, %r5967; + shf.l.wrap.b32 %r5974, %r5973, %r5973, 25; + add.s32 %r5975, %r5927, %r5110; + add.s32 %r5976, %r5975, %r5946; + xor.b32 %r5977, %r5976, %r5915; + shf.l.wrap.b32 %r5978, %r5977, %r5977, 16; + add.s32 %r5979, %r5978, %r5902; + xor.b32 %r5980, %r5979, %r5946; + shf.l.wrap.b32 %r5981, %r5980, %r5980, 20; + add.s32 %r5982, %r5976, %r5122; + add.s32 %r5983, %r5982, %r5981; + xor.b32 %r5984, %r5983, %r5978; + shf.l.wrap.b32 %r5985, %r5984, %r5984, 24; + add.s32 %r5986, %r5985, %r5979; + xor.b32 %r5987, %r5986, %r5981; + shf.l.wrap.b32 %r5988, %r5987, %r5987, 25; + add.s32 %r5989, %r5941, %r5146; + add.s32 %r5990, %r5989, %r5904; + xor.b32 %r5991, %r5990, %r5929; + shf.l.wrap.b32 %r5992, %r5991, %r5991, 16; + add.s32 %r5993, %r5992, %r5916; + xor.b32 %r5994, %r5993, %r5904; + shf.l.wrap.b32 %r5995, %r5994, %r5994, 20; + add.s32 %r5996, %r5990, %r5202; + add.s32 %r5997, %r5996, %r5995; + xor.b32 %r5998, %r5997, %r5992; + shf.l.wrap.b32 %r5999, %r5998, %r5998, 24; + add.s32 %r6000, %r5999, %r5993; + xor.b32 %r6001, %r6000, %r5995; + shf.l.wrap.b32 %r6002, %r6001, %r6001, 25; + xor.b32 %r3, %r5986, %r5955; + xor.b32 %r4, %r6000, %r5969; + st.local.v2.u32 [%rd2+32], {%r3, %r4}; + xor.b32 %r5, %r5958, %r5983; + xor.b32 %r6, %r5997, %r5972; + st.local.v2.u32 [%rd2+40], {%r5, %r6}; + xor.b32 %r7, %r6002, %r5971; + xor.b32 %r8, %r5960, %r5985; + st.local.v2.u32 [%rd2+48], {%r7, %r8}; + xor.b32 %r9, %r5999, %r5974; + xor.b32 %r10, %r5988, %r5957; + st.local.v2.u32 [%rd2+56], {%r9, %r10}; + ld.local.v4.u32 {%r6003, %r6004, %r6005, %r6006}, [%rd3+64]; + st.local.v2.u32 [%rd2+72], {%r6003, %r6004}; + st.local.v2.u32 [%rd2+80], {%r6005, %r6006}; + add.s16 %rs1, %rs412, 16; + and.b16 %rs485, %rs1, 255; + add.s16 %rs486, %rs413, 1; + st.local.v2.u8 [%rd2+136], {%rs1, %rs486}; + cvt.u32.u16 %r6011, %rs486; + cvt.u32.u16 %r6012, %rs485; + prmt.b32 %r6013, %r6011, %r6012, 30212; + cvt.u16.u32 %rs487, %r6013; + shr.u16 %rs2, %rs487, 8; + mov.b32 {%rs5, %rs6}, %r6004; + mov.b32 {%rs3, %rs4}, %r6003; + mov.b32 {%rs9, %rs10}, %r6006; + mov.b32 {%rs7, %rs8}, %r6005; + setp.eq.s16 %p11, 
%rs2, 0; + selp.u16 %rs488, 1, 0, %p11; + shr.u16 %rs489, %rs3, 8; + shr.u16 %rs490, %rs4, 8; + shr.u16 %rs491, %rs5, 8; + shr.u16 %rs492, %rs6, 8; + shr.u16 %rs493, %rs7, 8; + shr.u16 %rs494, %rs8, 8; + shr.u16 %rs495, %rs9, 8; + shr.u16 %rs496, %rs10, 8; + or.b16 %rs497, %rs488, 10; + cvt.u32.u16 %r6014, %rs3; + and.b32 %r6015, %r6014, 255; + cvt.u32.u16 %r6016, %rs489; + prmt.b32 %r6017, %r6016, %r6015, 30212; + cvt.u32.u16 %r6018, %rs4; + prmt.b32 %r6019, %r6018, %r6017, 28756; + cvt.u32.u16 %r6020, %rs490; + prmt.b32 %r6021, %r6020, %r6019, 1620; + cvt.u32.u16 %r6022, %rs5; + and.b32 %r6023, %r6022, 255; + cvt.u32.u16 %r6024, %rs491; + prmt.b32 %r6025, %r6024, %r6023, 30212; + cvt.u32.u16 %r6026, %rs6; + prmt.b32 %r6027, %r6026, %r6025, 28756; + cvt.u32.u16 %r6028, %rs492; + prmt.b32 %r6029, %r6028, %r6027, 1620; + cvt.u32.u16 %r6030, %rs7; + and.b32 %r6031, %r6030, 255; + cvt.u32.u16 %r6032, %rs493; + prmt.b32 %r6033, %r6032, %r6031, 30212; + cvt.u32.u16 %r6034, %rs8; + prmt.b32 %r6035, %r6034, %r6033, 28756; + cvt.u32.u16 %r6036, %rs494; + prmt.b32 %r6037, %r6036, %r6035, 1620; + cvt.u32.u16 %r6038, %rs9; + and.b32 %r6039, %r6038, 255; + cvt.u32.u16 %r6040, %rs495; + prmt.b32 %r6041, %r6040, %r6039, 30212; + cvt.u32.u16 %r6042, %rs10; + prmt.b32 %r6043, %r6042, %r6041, 28756; + cvt.u32.u16 %r6044, %rs496; + prmt.b32 %r6045, %r6044, %r6043, 1620; + cvt.u32.u16 %r6046, %rs497; + add.s32 %r6047, %r7, %r3; + add.s32 %r6048, %r6047, %r6021; + add.s32 %r6049, %r6029, %r6048; + add.s32 %r6050, %r8, %r4; + add.s32 %r6051, %r6050, %r6037; + add.s32 %r6052, %r6045, %r6051; + add.s32 %r6053, %r9, %r5; + cvt.u32.u16 %r6054, %rs1; + and.b32 %r6055, %r6054, 255; + xor.b32 %r6056, %r6053, %r6055; + shr.u32 %r6057, %r6053, 16; + shl.b32 %r6058, %r6056, 16; + or.b32 %r6059, %r6058, %r6057; + add.s32 %r6060, %r6059, 1013904242; + xor.b32 %r6061, %r6060, %r9; + shf.l.wrap.b32 %r6062, %r6061, %r6061, 20; + add.s32 %r6063, %r6053, %r6062; + xor.b32 %r6064, %r6063, %r6059; + shf.l.wrap.b32 %r6065, %r6064, %r6064, 24; + add.s32 %r6066, %r6065, %r6060; + xor.b32 %r6067, %r6066, %r6062; + shf.l.wrap.b32 %r6068, %r6067, %r6067, 25; + add.s32 %r6069, %r10, %r6; + xor.b32 %r6070, %r6069, %r6046; + shr.u32 %r6071, %r6069, 16; + shl.b32 %r6072, %r6070, 16; + or.b32 %r6073, %r6072, %r6071; + add.s32 %r6074, %r6073, -1521486534; + xor.b32 %r6075, %r6074, %r10; + shf.l.wrap.b32 %r6076, %r6075, %r6075, 20; + add.s32 %r6077, %r6069, %r6076; + xor.b32 %r6078, %r6077, %r6073; + shf.l.wrap.b32 %r6079, %r6078, %r6078, 24; + add.s32 %r6080, %r6079, %r6074; + xor.b32 %r6081, %r6080, %r6076; + shf.l.wrap.b32 %r6082, %r6081, %r6081, 25; + add.s32 %r6083, %r6082, %r6063; + shf.l.wrap.b32 %r6084, %r6048, %r6048, 16; + add.s32 %r6085, %r6084, 1779033703; + xor.b32 %r6086, %r6085, %r7; + shf.l.wrap.b32 %r6087, %r6086, %r6086, 20; + add.s32 %r6088, %r6049, %r6087; + xor.b32 %r6089, %r6088, %r6084; + shf.l.wrap.b32 %r6090, %r6089, %r6089, 24; + add.s32 %r6091, %r6090, %r6085; + xor.b32 %r6092, %r6091, %r6087; + shf.l.wrap.b32 %r6093, %r6092, %r6092, 25; + shf.l.wrap.b32 %r6094, %r6051, %r6051, 16; + add.s32 %r6095, %r6094, -1150833019; + xor.b32 %r6096, %r6095, %r8; + shf.l.wrap.b32 %r6097, %r6096, %r6096, 20; + add.s32 %r6098, %r6052, %r6097; + xor.b32 %r6099, %r6098, %r6094; + shf.l.wrap.b32 %r6100, %r6099, %r6099, 24; + add.s32 %r6101, %r6100, %r6095; + xor.b32 %r6102, %r6101, %r6097; + shf.l.wrap.b32 %r6103, %r6102, %r6102, 25; + add.s32 %r6104, %r6088, %r6103; + xor.b32 %r6105, %r6104, %r6079; + shf.l.wrap.b32 %r6106, %r6105, 
%r6105, 16; + add.s32 %r6107, %r6106, %r6066; + xor.b32 %r6108, %r6107, %r6103; + shf.l.wrap.b32 %r6109, %r6108, %r6108, 20; + add.s32 %r6110, %r6104, %r6109; + xor.b32 %r6111, %r6110, %r6106; + shf.l.wrap.b32 %r6112, %r6111, %r6111, 24; + add.s32 %r6113, %r6112, %r6107; + xor.b32 %r6114, %r6113, %r6109; + shf.l.wrap.b32 %r6115, %r6114, %r6114, 25; + add.s32 %r6116, %r6068, %r6098; + xor.b32 %r6117, %r6090, %r6116; + shf.l.wrap.b32 %r6118, %r6117, %r6117, 16; + add.s32 %r6119, %r6118, %r6080; + xor.b32 %r6120, %r6119, %r6068; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 20; + add.s32 %r6122, %r6116, %r6121; + xor.b32 %r6123, %r6122, %r6118; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 24; + add.s32 %r6125, %r6124, %r6119; + xor.b32 %r6126, %r6125, %r6121; + shf.l.wrap.b32 %r6127, %r6126, %r6126, 25; + xor.b32 %r6128, %r6100, %r6083; + shf.l.wrap.b32 %r6129, %r6128, %r6128, 16; + add.s32 %r6130, %r6129, %r6091; + xor.b32 %r6131, %r6130, %r6082; + shf.l.wrap.b32 %r6132, %r6131, %r6131, 20; + add.s32 %r6133, %r6083, %r6132; + xor.b32 %r6134, %r6133, %r6129; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 24; + add.s32 %r6136, %r6135, %r6130; + xor.b32 %r6137, %r6136, %r6132; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 25; + add.s32 %r6139, %r6077, %r6093; + xor.b32 %r6140, %r6139, %r6065; + shf.l.wrap.b32 %r6141, %r6140, %r6140, 16; + add.s32 %r6142, %r6141, %r6101; + xor.b32 %r6143, %r6142, %r6093; + shf.l.wrap.b32 %r6144, %r6143, %r6143, 20; + add.s32 %r6145, %r6139, %r6144; + xor.b32 %r6146, %r6145, %r6141; + shf.l.wrap.b32 %r6147, %r6146, %r6146, 24; + add.s32 %r6148, %r6147, %r6142; + xor.b32 %r6149, %r6148, %r6144; + shf.l.wrap.b32 %r6150, %r6149, %r6149, 25; + add.s32 %r6151, %r6110, %r6037; + add.s32 %r6152, %r6151, %r6150; + xor.b32 %r6153, %r6152, %r6124; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 16; + add.s32 %r6155, %r6154, %r6136; + xor.b32 %r6156, %r6155, %r6150; + shf.l.wrap.b32 %r6157, %r6156, %r6156, 20; + add.s32 %r6158, %r6152, %r6157; + xor.b32 %r6159, %r6158, %r6154; + shf.l.wrap.b32 %r6160, %r6159, %r6159, 24; + add.s32 %r6161, %r6160, %r6155; + xor.b32 %r6162, %r6161, %r6157; + shf.l.wrap.b32 %r6163, %r6162, %r6162, 25; + add.s32 %r6164, %r6122, %r6045; + add.s32 %r6165, %r6164, %r6115; + xor.b32 %r6166, %r6165, %r6135; + shf.l.wrap.b32 %r6167, %r6166, %r6166, 16; + add.s32 %r6168, %r6167, %r6148; + xor.b32 %r6169, %r6168, %r6115; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6165, %r6170; + xor.b32 %r6172, %r6171, %r6167; + shf.l.wrap.b32 %r6173, %r6172, %r6172, 24; + add.s32 %r6174, %r6173, %r6168; + xor.b32 %r6175, %r6174, %r6170; + shf.l.wrap.b32 %r6176, %r6175, %r6175, 25; + add.s32 %r6177, %r6133, %r6127; + xor.b32 %r6178, %r6147, %r6177; + shf.l.wrap.b32 %r6179, %r6178, %r6178, 16; + add.s32 %r6180, %r6179, %r6113; + xor.b32 %r6181, %r6180, %r6127; + shf.l.wrap.b32 %r6182, %r6181, %r6181, 20; + add.s32 %r6183, %r6177, %r6021; + add.s32 %r6184, %r6183, %r6182; + xor.b32 %r6185, %r6184, %r6179; + shf.l.wrap.b32 %r6186, %r6185, %r6185, 24; + add.s32 %r6187, %r6186, %r6180; + xor.b32 %r6188, %r6187, %r6182; + shf.l.wrap.b32 %r6189, %r6188, %r6188, 25; + add.s32 %r6190, %r6145, %r6138; + xor.b32 %r6191, %r6112, %r6190; + shf.l.wrap.b32 %r6192, %r6191, %r6191, 16; + add.s32 %r6193, %r6192, %r6125; + xor.b32 %r6194, %r6193, %r6138; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 20; + add.s32 %r6196, %r6190, %r6195; + xor.b32 %r6197, %r6196, %r6192; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 24; + add.s32 %r6199, %r6198, %r6193; + xor.b32 %r6200, %r6199, %r6195; + shf.l.wrap.b32 
%r6201, %r6200, %r6200, 25; + add.s32 %r6202, %r6158, %r6029; + add.s32 %r6203, %r6202, %r6176; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 16; + add.s32 %r6206, %r6205, %r6187; + xor.b32 %r6207, %r6206, %r6176; + shf.l.wrap.b32 %r6208, %r6207, %r6207, 20; + add.s32 %r6209, %r6203, %r6208; + xor.b32 %r6210, %r6209, %r6205; + shf.l.wrap.b32 %r6211, %r6210, %r6210, 24; + add.s32 %r6212, %r6211, %r6206; + xor.b32 %r6213, %r6212, %r6208; + shf.l.wrap.b32 %r6214, %r6213, %r6213, 25; + add.s32 %r6215, %r6189, %r6171; + xor.b32 %r6216, %r6160, %r6215; + shf.l.wrap.b32 %r6217, %r6216, %r6216, 16; + add.s32 %r6218, %r6217, %r6199; + xor.b32 %r6219, %r6218, %r6189; + shf.l.wrap.b32 %r6220, %r6219, %r6219, 20; + add.s32 %r6221, %r6215, %r6220; + xor.b32 %r6222, %r6221, %r6217; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 24; + add.s32 %r6224, %r6223, %r6218; + xor.b32 %r6225, %r6224, %r6220; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 25; + add.s32 %r6227, %r6184, %r6201; + xor.b32 %r6228, %r6173, %r6227; + shf.l.wrap.b32 %r6229, %r6228, %r6228, 16; + add.s32 %r6230, %r6229, %r6161; + xor.b32 %r6231, %r6230, %r6201; + shf.l.wrap.b32 %r6232, %r6231, %r6231, 20; + add.s32 %r6233, %r6227, %r6232; + xor.b32 %r6234, %r6233, %r6229; + shf.l.wrap.b32 %r6235, %r6234, %r6234, 24; + add.s32 %r6236, %r6235, %r6230; + xor.b32 %r6237, %r6236, %r6232; + shf.l.wrap.b32 %r6238, %r6237, %r6237, 25; + add.s32 %r6239, %r6196, %r6163; + xor.b32 %r6240, %r6239, %r6186; + shf.l.wrap.b32 %r6241, %r6240, %r6240, 16; + add.s32 %r6242, %r6241, %r6174; + xor.b32 %r6243, %r6242, %r6163; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 20; + add.s32 %r6245, %r6239, %r6244; + xor.b32 %r6246, %r6245, %r6241; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 24; + add.s32 %r6248, %r6247, %r6242; + xor.b32 %r6249, %r6248, %r6244; + shf.l.wrap.b32 %r6250, %r6249, %r6249, 25; + add.s32 %r6251, %r6209, %r6045; + add.s32 %r6252, %r6251, %r6250; + xor.b32 %r6253, %r6252, %r6223; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 16; + add.s32 %r6255, %r6254, %r6236; + xor.b32 %r6256, %r6255, %r6250; + shf.l.wrap.b32 %r6257, %r6256, %r6256, 20; + add.s32 %r6258, %r6252, %r6257; + xor.b32 %r6259, %r6258, %r6254; + shf.l.wrap.b32 %r6260, %r6259, %r6259, 24; + add.s32 %r6261, %r6260, %r6255; + xor.b32 %r6262, %r6261, %r6257; + shf.l.wrap.b32 %r6263, %r6262, %r6262, 25; + add.s32 %r6264, %r6221, %r6214; + xor.b32 %r6265, %r6264, %r6235; + shf.l.wrap.b32 %r6266, %r6265, %r6265, 16; + add.s32 %r6267, %r6266, %r6248; + xor.b32 %r6268, %r6267, %r6214; + shf.l.wrap.b32 %r6269, %r6268, %r6268, 20; + add.s32 %r6270, %r6264, %r6269; + xor.b32 %r6271, %r6270, %r6266; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6267; + xor.b32 %r6274, %r6273, %r6269; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6233, %r6226; + xor.b32 %r6277, %r6247, %r6276; + shf.l.wrap.b32 %r6278, %r6277, %r6277, 16; + add.s32 %r6279, %r6278, %r6212; + xor.b32 %r6280, %r6279, %r6226; + shf.l.wrap.b32 %r6281, %r6280, %r6280, 20; + add.s32 %r6282, %r6276, %r6037; + add.s32 %r6283, %r6282, %r6281; + xor.b32 %r6284, %r6283, %r6278; + shf.l.wrap.b32 %r6285, %r6284, %r6284, 24; + add.s32 %r6286, %r6285, %r6279; + xor.b32 %r6287, %r6286, %r6281; + shf.l.wrap.b32 %r6288, %r6287, %r6287, 25; + add.s32 %r6289, %r6245, %r6238; + xor.b32 %r6290, %r6211, %r6289; + shf.l.wrap.b32 %r6291, %r6290, %r6290, 16; + add.s32 %r6292, %r6291, %r6224; + xor.b32 %r6293, %r6292, %r6238; + shf.l.wrap.b32 %r6294, %r6293, %r6293, 20; + add.s32 %r6295, %r6289, %r6294; 
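+ // The add/xor/funnel-shift pattern above is a 32-bit ARX quarter-round:
+ // shf.l.wrap.b32 d, s, s, k rotates s left by k, so the amounts 16/20/24/25
+ // are rotate-rights by 16/12/8/7, the rotation schedule of the BLAKE2s and
+ // BLAKE3 G function (inferred from the constants, not stated in this
+ // listing). Per column or diagonal, with message words m0 and m1:
+ //   a += b + m0;  d = rotr32(d ^ a, 16);
+ //   c += d;       b = rotr32(b ^ c, 12);
+ //   a += b + m1;  d = rotr32(d ^ a, 8);
+ //   c += d;       b = rotr32(b ^ c, 7);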
+ xor.b32 %r6296, %r6295, %r6291; + shf.l.wrap.b32 %r6297, %r6296, %r6296, 24; + add.s32 %r6298, %r6297, %r6292; + xor.b32 %r6299, %r6298, %r6294; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 25; + add.s32 %r6301, %r6258, %r6275; + xor.b32 %r6302, %r6301, %r6297; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 16; + add.s32 %r6304, %r6303, %r6286; + xor.b32 %r6305, %r6304, %r6275; + shf.l.wrap.b32 %r6306, %r6305, %r6305, 20; + add.s32 %r6307, %r6301, %r6306; + xor.b32 %r6308, %r6307, %r6303; + shf.l.wrap.b32 %r6309, %r6308, %r6308, 24; + add.s32 %r6310, %r6309, %r6304; + xor.b32 %r6311, %r6310, %r6306; + shf.l.wrap.b32 %r6312, %r6311, %r6311, 25; + add.s32 %r6313, %r6288, %r6270; + xor.b32 %r6314, %r6260, %r6313; + shf.l.wrap.b32 %r6315, %r6314, %r6314, 16; + add.s32 %r6316, %r6315, %r6298; + xor.b32 %r6317, %r6316, %r6288; + shf.l.wrap.b32 %r6318, %r6317, %r6317, 20; + add.s32 %r6319, %r6313, %r6021; + add.s32 %r6320, %r6319, %r6318; + xor.b32 %r6321, %r6320, %r6315; + shf.l.wrap.b32 %r6322, %r6321, %r6321, 24; + add.s32 %r6323, %r6322, %r6316; + xor.b32 %r6324, %r6323, %r6318; + shf.l.wrap.b32 %r6325, %r6324, %r6324, 25; + add.s32 %r6326, %r6283, %r6300; + xor.b32 %r6327, %r6272, %r6326; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 16; + add.s32 %r6329, %r6328, %r6261; + xor.b32 %r6330, %r6329, %r6300; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 20; + add.s32 %r6332, %r6326, %r6331; + xor.b32 %r6333, %r6332, %r6328; + shf.l.wrap.b32 %r6334, %r6333, %r6333, 24; + add.s32 %r6335, %r6334, %r6329; + xor.b32 %r6336, %r6335, %r6331; + shf.l.wrap.b32 %r6337, %r6336, %r6336, 25; + add.s32 %r6338, %r6295, %r6263; + xor.b32 %r6339, %r6338, %r6285; + shf.l.wrap.b32 %r6340, %r6339, %r6339, 16; + add.s32 %r6341, %r6340, %r6273; + xor.b32 %r6342, %r6341, %r6263; + shf.l.wrap.b32 %r6343, %r6342, %r6342, 20; + add.s32 %r6344, %r6338, %r6029; + add.s32 %r6345, %r6344, %r6343; + xor.b32 %r6346, %r6345, %r6340; + shf.l.wrap.b32 %r6347, %r6346, %r6346, 24; + add.s32 %r6348, %r6347, %r6341; + xor.b32 %r6349, %r6348, %r6343; + shf.l.wrap.b32 %r6350, %r6349, %r6349, 25; + add.s32 %r6351, %r6307, %r6350; + xor.b32 %r6352, %r6351, %r6322; + shf.l.wrap.b32 %r6353, %r6352, %r6352, 16; + add.s32 %r6354, %r6353, %r6335; + xor.b32 %r6355, %r6354, %r6350; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 20; + add.s32 %r6357, %r6351, %r6356; + xor.b32 %r6358, %r6357, %r6353; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 24; + add.s32 %r6360, %r6359, %r6354; + xor.b32 %r6361, %r6360, %r6356; + shf.l.wrap.b32 %r6362, %r6361, %r6361, 25; + add.s32 %r6363, %r6320, %r6312; + xor.b32 %r6364, %r6363, %r6334; + shf.l.wrap.b32 %r6365, %r6364, %r6364, 16; + add.s32 %r6366, %r6365, %r6348; + xor.b32 %r6367, %r6366, %r6312; + shf.l.wrap.b32 %r6368, %r6367, %r6367, 20; + add.s32 %r6369, %r6363, %r6368; + xor.b32 %r6370, %r6369, %r6365; + shf.l.wrap.b32 %r6371, %r6370, %r6370, 24; + add.s32 %r6372, %r6371, %r6366; + xor.b32 %r6373, %r6372, %r6368; + shf.l.wrap.b32 %r6374, %r6373, %r6373, 25; + add.s32 %r6375, %r6332, %r6325; + xor.b32 %r6376, %r6347, %r6375; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6310; + xor.b32 %r6379, %r6378, %r6325; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6045; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6345, %r6337; + xor.b32 %r6389, %r6309, %r6388; + shf.l.wrap.b32 %r6390, 
%r6389, %r6389, 16; + add.s32 %r6391, %r6390, %r6323; + xor.b32 %r6392, %r6391, %r6337; + shf.l.wrap.b32 %r6393, %r6392, %r6392, 20; + add.s32 %r6394, %r6388, %r6393; + xor.b32 %r6395, %r6394, %r6390; + shf.l.wrap.b32 %r6396, %r6395, %r6395, 24; + add.s32 %r6397, %r6396, %r6391; + xor.b32 %r6398, %r6397, %r6393; + shf.l.wrap.b32 %r6399, %r6398, %r6398, 25; + add.s32 %r6400, %r6357, %r6374; + xor.b32 %r6401, %r6400, %r6396; + shf.l.wrap.b32 %r6402, %r6401, %r6401, 16; + add.s32 %r6403, %r6402, %r6385; + xor.b32 %r6404, %r6403, %r6374; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 20; + add.s32 %r6406, %r6400, %r6021; + add.s32 %r6407, %r6406, %r6405; + xor.b32 %r6408, %r6407, %r6402; + shf.l.wrap.b32 %r6409, %r6408, %r6408, 24; + add.s32 %r6410, %r6409, %r6403; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 25; + add.s32 %r6413, %r6387, %r6369; + xor.b32 %r6414, %r6359, %r6413; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 16; + add.s32 %r6416, %r6415, %r6397; + xor.b32 %r6417, %r6416, %r6387; + shf.l.wrap.b32 %r6418, %r6417, %r6417, 20; + add.s32 %r6419, %r6413, %r6037; + add.s32 %r6420, %r6419, %r6418; + xor.b32 %r6421, %r6420, %r6415; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 24; + add.s32 %r6423, %r6422, %r6416; + xor.b32 %r6424, %r6423, %r6418; + shf.l.wrap.b32 %r6425, %r6424, %r6424, 25; + add.s32 %r6426, %r6382, %r6399; + xor.b32 %r6427, %r6371, %r6426; + shf.l.wrap.b32 %r6428, %r6427, %r6427, 16; + add.s32 %r6429, %r6428, %r6360; + xor.b32 %r6430, %r6429, %r6399; + shf.l.wrap.b32 %r6431, %r6430, %r6430, 20; + add.s32 %r6432, %r6426, %r6431; + xor.b32 %r6433, %r6432, %r6428; + shf.l.wrap.b32 %r6434, %r6433, %r6433, 24; + add.s32 %r6435, %r6434, %r6429; + xor.b32 %r6436, %r6435, %r6431; + shf.l.wrap.b32 %r6437, %r6436, %r6436, 25; + add.s32 %r6438, %r6394, %r6029; + add.s32 %r6439, %r6438, %r6362; + xor.b32 %r6440, %r6439, %r6384; + shf.l.wrap.b32 %r6441, %r6440, %r6440, 16; + add.s32 %r6442, %r6441, %r6372; + xor.b32 %r6443, %r6442, %r6362; + shf.l.wrap.b32 %r6444, %r6443, %r6443, 20; + add.s32 %r6445, %r6439, %r6444; + xor.b32 %r6446, %r6445, %r6441; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 24; + add.s32 %r6448, %r6447, %r6442; + xor.b32 %r6449, %r6448, %r6444; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 25; + add.s32 %r6451, %r6407, %r6450; + xor.b32 %r6452, %r6451, %r6422; + shf.l.wrap.b32 %r6453, %r6452, %r6452, 16; + add.s32 %r6454, %r6453, %r6435; + xor.b32 %r6455, %r6454, %r6450; + shf.l.wrap.b32 %r6456, %r6455, %r6455, 20; + add.s32 %r6457, %r6451, %r6456; + xor.b32 %r6458, %r6457, %r6453; + shf.l.wrap.b32 %r6459, %r6458, %r6458, 24; + add.s32 %r6460, %r6459, %r6454; + xor.b32 %r6461, %r6460, %r6456; + shf.l.wrap.b32 %r6462, %r6461, %r6461, 25; + add.s32 %r6463, %r6420, %r6412; + xor.b32 %r6464, %r6463, %r6434; + shf.l.wrap.b32 %r6465, %r6464, %r6464, 16; + add.s32 %r6466, %r6465, %r6448; + xor.b32 %r6467, %r6466, %r6412; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 20; + add.s32 %r6469, %r6463, %r6468; + xor.b32 %r6470, %r6469, %r6465; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 24; + add.s32 %r6472, %r6471, %r6466; + xor.b32 %r6473, %r6472, %r6468; + shf.l.wrap.b32 %r6474, %r6473, %r6473, 25; + add.s32 %r6475, %r6432, %r6425; + xor.b32 %r6476, %r6447, %r6475; + shf.l.wrap.b32 %r6477, %r6476, %r6476, 16; + add.s32 %r6478, %r6477, %r6410; + xor.b32 %r6479, %r6478, %r6425; + shf.l.wrap.b32 %r6480, %r6479, %r6479, 20; + add.s32 %r6481, %r6475, %r6480; + xor.b32 %r6482, %r6481, %r6477; + shf.l.wrap.b32 %r6483, %r6482, %r6482, 24; + add.s32 %r6484, %r6483, %r6478; + 
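+ // The same round shape continues below; the extra operands folded into some
+ // of the adds (%r6021, %r6029, %r6037, %r6045, defined earlier in the
+ // kernel) would be the message-schedule words in the G-function reading
+ // above (an inference; the schedule itself is not visible here).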
xor.b32 %r6485, %r6484, %r6480; + shf.l.wrap.b32 %r6486, %r6485, %r6485, 25; + add.s32 %r6487, %r6445, %r6437; + xor.b32 %r6488, %r6409, %r6487; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6423; + xor.b32 %r6491, %r6490, %r6437; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6492; + xor.b32 %r6494, %r6493, %r6489; + shf.l.wrap.b32 %r6495, %r6494, %r6494, 24; + add.s32 %r6496, %r6495, %r6490; + xor.b32 %r6497, %r6496, %r6492; + shf.l.wrap.b32 %r6498, %r6497, %r6497, 25; + add.s32 %r6499, %r6457, %r6474; + xor.b32 %r6500, %r6499, %r6495; + shf.l.wrap.b32 %r6501, %r6500, %r6500, 16; + add.s32 %r6502, %r6501, %r6484; + xor.b32 %r6503, %r6502, %r6474; + shf.l.wrap.b32 %r6504, %r6503, %r6503, 20; + add.s32 %r6505, %r6499, %r6037; + add.s32 %r6506, %r6505, %r6504; + xor.b32 %r6507, %r6506, %r6501; + shf.l.wrap.b32 %r6508, %r6507, %r6507, 24; + add.s32 %r6509, %r6508, %r6502; + xor.b32 %r6510, %r6509, %r6504; + shf.l.wrap.b32 %r6511, %r6510, %r6510, 25; + add.s32 %r6512, %r6486, %r6469; + xor.b32 %r6513, %r6459, %r6512; + shf.l.wrap.b32 %r6514, %r6513, %r6513, 16; + add.s32 %r6515, %r6514, %r6496; + xor.b32 %r6516, %r6515, %r6486; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 20; + add.s32 %r6518, %r6512, %r6045; + add.s32 %r6519, %r6518, %r6517; + xor.b32 %r6520, %r6519, %r6514; + shf.l.wrap.b32 %r6521, %r6520, %r6520, 24; + add.s32 %r6522, %r6521, %r6515; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 25; + add.s32 %r6525, %r6481, %r6021; + add.s32 %r6526, %r6525, %r6498; + xor.b32 %r6527, %r6471, %r6526; + shf.l.wrap.b32 %r6528, %r6527, %r6527, 16; + add.s32 %r6529, %r6528, %r6460; + xor.b32 %r6530, %r6529, %r6498; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 20; + add.s32 %r6532, %r6526, %r6029; + add.s32 %r6533, %r6532, %r6531; + xor.b32 %r6534, %r6533, %r6528; + shf.l.wrap.b32 %r6535, %r6534, %r6534, 24; + add.s32 %r6536, %r6535, %r6529; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 25; + add.s32 %r6539, %r6493, %r6462; + xor.b32 %r6540, %r6539, %r6483; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 16; + add.s32 %r6542, %r6541, %r6472; + xor.b32 %r6543, %r6542, %r6462; + shf.l.wrap.b32 %r6544, %r6543, %r6543, 20; + add.s32 %r6545, %r6539, %r6544; + xor.b32 %r6546, %r6545, %r6541; + shf.l.wrap.b32 %r6547, %r6546, %r6546, 24; + add.s32 %r6548, %r6547, %r6542; + xor.b32 %r6549, %r6548, %r6544; + shf.l.wrap.b32 %r6550, %r6549, %r6549, 25; + add.s32 %r6551, %r6506, %r6550; + xor.b32 %r6552, %r6551, %r6521; + shf.l.wrap.b32 %r6553, %r6552, %r6552, 16; + add.s32 %r6554, %r6553, %r6536; + xor.b32 %r6555, %r6554, %r6550; + shf.l.wrap.b32 %r6556, %r6555, %r6555, 20; + add.s32 %r6557, %r6551, %r6556; + xor.b32 %r6558, %r6557, %r6553; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 24; + add.s32 %r6560, %r6559, %r6554; + xor.b32 %r6561, %r6560, %r6556; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 25; + add.s32 %r6563, %r6519, %r6511; + xor.b32 %r6564, %r6563, %r6535; + shf.l.wrap.b32 %r6565, %r6564, %r6564, 16; + add.s32 %r6566, %r6565, %r6548; + xor.b32 %r6567, %r6566, %r6511; + shf.l.wrap.b32 %r6568, %r6567, %r6567, 20; + add.s32 %r6569, %r6563, %r6568; + xor.b32 %r6570, %r6569, %r6565; + shf.l.wrap.b32 %r6571, %r6570, %r6570, 24; + add.s32 %r6572, %r6571, %r6566; + xor.b32 %r6573, %r6572, %r6568; + shf.l.wrap.b32 %r6574, %r6573, %r6573, 25; + add.s32 %r6575, %r6533, %r6524; + xor.b32 %r6576, %r6547, %r6575; + shf.l.wrap.b32 %r6577, %r6576, %r6576, 16; + add.s32 %r6578, %r6577, %r6509; + xor.b32 %r6579, %r6578, 
%r6524; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 20; + add.s32 %r6581, %r6575, %r6580; + xor.b32 %r6582, %r6581, %r6577; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 24; + add.s32 %r6584, %r6583, %r6578; + xor.b32 %r6585, %r6584, %r6580; + shf.l.wrap.b32 %r6586, %r6585, %r6585, 25; + add.s32 %r6587, %r6545, %r6538; + xor.b32 %r6588, %r6508, %r6587; + shf.l.wrap.b32 %r6589, %r6588, %r6588, 16; + add.s32 %r6590, %r6589, %r6522; + xor.b32 %r6591, %r6590, %r6538; + shf.l.wrap.b32 %r6592, %r6591, %r6591, 20; + add.s32 %r6593, %r6587, %r6029; + add.s32 %r6594, %r6593, %r6592; + xor.b32 %r6595, %r6594, %r6589; + shf.l.wrap.b32 %r6596, %r6595, %r6595, 24; + add.s32 %r6597, %r6596, %r6590; + xor.b32 %r6598, %r6597, %r6592; + shf.l.wrap.b32 %r6599, %r6598, %r6598, 25; + add.s32 %r6600, %r6557, %r6574; + xor.b32 %r6601, %r6600, %r6596; + shf.l.wrap.b32 %r6602, %r6601, %r6601, 16; + add.s32 %r6603, %r6602, %r6584; + xor.b32 %r6604, %r6603, %r6574; + shf.l.wrap.b32 %r6605, %r6604, %r6604, 20; + add.s32 %r6606, %r6600, %r6045; + add.s32 %r6607, %r6606, %r6605; + xor.b32 %r6608, %r6607, %r6602; + shf.l.wrap.b32 %r6609, %r6608, %r6608, 24; + add.s32 %r6610, %r6609, %r6603; + xor.b32 %r6611, %r6610, %r6605; + shf.l.wrap.b32 %r6612, %r6611, %r6611, 25; + add.s32 %r6613, %r6586, %r6021; + add.s32 %r6614, %r6613, %r6569; + xor.b32 %r6615, %r6559, %r6614; + shf.l.wrap.b32 %r6616, %r6615, %r6615, 16; + add.s32 %r6617, %r6616, %r6597; + xor.b32 %r6618, %r6617, %r6586; + shf.l.wrap.b32 %r6619, %r6618, %r6618, 20; + add.s32 %r6620, %r6614, %r6619; + xor.b32 %r6621, %r6620, %r6616; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6617; + xor.b32 %r6624, %r6623, %r6619; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6581, %r6037; + add.s32 %r6627, %r6626, %r6599; + xor.b32 %r6628, %r6571, %r6627; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6560; + xor.b32 %r6631, %r6630, %r6599; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6632; + xor.b32 %r6634, %r6633, %r6629; + shf.l.wrap.b32 %r6635, %r6634, %r6634, 24; + add.s32 %r6636, %r6635, %r6630; + xor.b32 %r6637, %r6636, %r6632; + shf.l.wrap.b32 %r6638, %r6637, %r6637, 25; + add.s32 %r6639, %r6594, %r6562; + xor.b32 %r6640, %r6639, %r6583; + shf.l.wrap.b32 %r6641, %r6640, %r6640, 16; + add.s32 %r6642, %r6641, %r6572; + xor.b32 %r6643, %r6642, %r6562; + shf.l.wrap.b32 %r6644, %r6643, %r6643, 20; + add.s32 %r6645, %r6639, %r6644; + xor.b32 %r6646, %r6645, %r6641; + shf.l.wrap.b32 %r6647, %r6646, %r6646, 24; + add.s32 %r6648, %r6647, %r6642; + xor.b32 %r6649, %r6648, %r6644; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 25; + add.s32 %r6651, %r6607, %r6650; + xor.b32 %r6652, %r6651, %r6622; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 16; + add.s32 %r6654, %r6653, %r6636; + xor.b32 %r6655, %r6654, %r6650; + shf.l.wrap.b32 %r6656, %r6655, %r6655, 20; + add.s32 %r6657, %r6651, %r6656; + xor.b32 %r6658, %r6657, %r6653; + shf.l.wrap.b32 %r6659, %r6658, %r6658, 24; + add.s32 %r6660, %r6659, %r6654; + xor.b32 %r6661, %r6660, %r6656; + shf.l.wrap.b32 %r6662, %r6661, %r6661, 25; + add.s32 %r6663, %r6620, %r6612; + xor.b32 %r6664, %r6663, %r6635; + shf.l.wrap.b32 %r6665, %r6664, %r6664, 16; + add.s32 %r6666, %r6665, %r6648; + xor.b32 %r6667, %r6666, %r6612; + shf.l.wrap.b32 %r6668, %r6667, %r6667, 20; + add.s32 %r6669, %r6663, %r6021; + add.s32 %r6670, %r6669, %r6668; + xor.b32 %r6671, %r6670, %r6665; + shf.l.wrap.b32 %r6672, %r6671, %r6671, 24; + add.s32 %r6673, %r6672, %r6666; + xor.b32 %r6674, 
%r6673, %r6668; + shf.l.wrap.b32 %r6675, %r6674, %r6674, 25; + add.s32 %r6676, %r6633, %r6029; + add.s32 %r6677, %r6676, %r6625; + xor.b32 %r6678, %r6647, %r6677; + shf.l.wrap.b32 %r6679, %r6678, %r6678, 16; + add.s32 %r6680, %r6679, %r6610; + xor.b32 %r6681, %r6680, %r6625; + shf.l.wrap.b32 %r6682, %r6681, %r6681, 20; + add.s32 %r6683, %r6677, %r6682; + xor.b32 %r6684, %r6683, %r6679; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 24; + add.s32 %r6686, %r6685, %r6680; + xor.b32 %r6687, %r6686, %r6682; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 25; + add.s32 %r6689, %r6645, %r6638; + xor.b32 %r6690, %r6609, %r6689; + shf.l.wrap.b32 %r6691, %r6690, %r6690, 16; + add.s32 %r6692, %r6691, %r6623; + xor.b32 %r6693, %r6692, %r6638; + shf.l.wrap.b32 %r6694, %r6693, %r6693, 20; + add.s32 %r6695, %r6689, %r6694; + xor.b32 %r6696, %r6695, %r6691; + shf.l.wrap.b32 %r6697, %r6696, %r6696, 24; + add.s32 %r6698, %r6697, %r6692; + xor.b32 %r6699, %r6698, %r6694; + shf.l.wrap.b32 %r6700, %r6699, %r6699, 25; + add.s32 %r6701, %r6657, %r6675; + xor.b32 %r6702, %r6701, %r6697; + shf.l.wrap.b32 %r6703, %r6702, %r6702, 16; + add.s32 %r6704, %r6703, %r6686; + xor.b32 %r6705, %r6704, %r6675; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 20; + add.s32 %r6707, %r6701, %r6706; + xor.b32 %r6708, %r6707, %r6703; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 24; + add.s32 %r6710, %r6709, %r6704; + xor.b32 %r6711, %r6710, %r6706; + shf.l.wrap.b32 %r6712, %r6711, %r6711, 25; + add.s32 %r6713, %r6688, %r6037; + add.s32 %r6714, %r6713, %r6670; + xor.b32 %r6715, %r6659, %r6714; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 16; + add.s32 %r6717, %r6716, %r6698; + xor.b32 %r6718, %r6717, %r6688; + shf.l.wrap.b32 %r6719, %r6718, %r6718, 20; + add.s32 %r6720, %r6714, %r6719; + xor.b32 %r6721, %r6720, %r6716; + shf.l.wrap.b32 %r6722, %r6721, %r6721, 24; + add.s32 %r6723, %r6722, %r6717; + xor.b32 %r6724, %r6723, %r6719; + shf.l.wrap.b32 %r6725, %r6724, %r6724, 25; + add.s32 %r6726, %r6683, %r6045; + add.s32 %r6727, %r6726, %r6700; + xor.b32 %r6728, %r6672, %r6727; + shf.l.wrap.b32 %r6729, %r6728, %r6728, 16; + add.s32 %r6730, %r6729, %r6660; + xor.b32 %r6731, %r6730, %r6700; + shf.l.wrap.b32 %r6732, %r6731, %r6731, 20; + add.s32 %r6733, %r6727, %r6732; + xor.b32 %r6734, %r6733, %r6729; + shf.l.wrap.b32 %r6735, %r6734, %r6734, 24; + add.s32 %r6736, %r6735, %r6730; + xor.b32 %r6737, %r6736, %r6732; + shf.l.wrap.b32 %r6738, %r6737, %r6737, 25; + add.s32 %r6739, %r6695, %r6662; + xor.b32 %r6740, %r6739, %r6685; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6673; + xor.b32 %r6743, %r6742, %r6662; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6744; + xor.b32 %r6746, %r6745, %r6741; + shf.l.wrap.b32 %r6747, %r6746, %r6746, 24; + add.s32 %r6748, %r6747, %r6742; + xor.b32 %r6749, %r6748, %r6744; + shf.l.wrap.b32 %r6750, %r6749, %r6749, 25; + xor.b32 %r6751, %r6707, %r6736; + cvt.u64.u32 %rd394, %r6751; + xor.b32 %r6752, %r6748, %r6720; + and.b32 %r6753, %r6752, 255; + cvt.u64.u32 %rd395, %r6753; + bfi.b64 %rd396, %rd395, %rd394, 32, 32; + cvt.u64.u32 %rd397, %r6752; + shl.b64 %rd398, %rd397, 32; + and.b64 %rd399, %rd398, 280375465082880; + or.b64 %rd400, %rd396, %rd399; + and.b64 %rd401, %rd398, 71776119061217280; + shr.u32 %r6754, %r6752, 24; + cvt.u64.u32 %rd402, %r6754; + shl.b64 %rd403, %rd402, 56; + or.b64 %rd404, %rd400, %rd401; + or.b64 %rd1308, %rd404, %rd403; + xor.b32 %r6755, %r6710, %r6733; + cvt.u64.u32 %rd405, %r6755; + xor.b32 %r6756, %r6745, %r6723; + and.b32 %r6757, %r6756, 255; + 
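+ // The cvt/bfi/shl/and/or run around this point packs two 32-bit words into
+ // one little-endian 64-bit word: bfi places the low byte of the high word
+ // at bit 32, the masks 280375465082880 (0xFF0000000000) and
+ // 71776119061217280 (0xFF000000000000) keep bytes 5 and 6 of the shifted
+ // high word, and byte 7 comes from the shr by 24. The net effect is
+ //   rd = (u64)lo | ((u64)hi << 32)
+ // spelled out byte by byte. The xor pairs feeding it (e.g. %r6707 ^ %r6736)
+ // match the usual v[i] ^ v[i + 8] hash-state finalization (an inference).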
cvt.u64.u32 %rd406, %r6757; + bfi.b64 %rd407, %rd406, %rd405, 32, 32; + cvt.u64.u32 %rd408, %r6756; + shl.b64 %rd409, %rd408, 32; + and.b64 %rd410, %rd409, 280375465082880; + or.b64 %rd411, %rd407, %rd410; + and.b64 %rd412, %rd409, 71776119061217280; + shr.u32 %r6758, %r6756, 24; + cvt.u64.u32 %rd413, %r6758; + shl.b64 %rd414, %rd413, 56; + or.b64 %rd415, %rd411, %rd412; + or.b64 %rd1307, %rd415, %rd414; + xor.b32 %r6759, %r6750, %r6722; + cvt.u64.u32 %rd416, %r6759; + xor.b32 %r6760, %r6712, %r6735; + and.b32 %r6761, %r6760, 255; + cvt.u64.u32 %rd417, %r6761; + bfi.b64 %rd418, %rd417, %rd416, 32, 32; + cvt.u64.u32 %rd419, %r6760; + shl.b64 %rd420, %rd419, 32; + and.b64 %rd421, %rd420, 280375465082880; + or.b64 %rd422, %rd418, %rd421; + and.b64 %rd423, %rd420, 71776119061217280; + shr.u32 %r6762, %r6760, 24; + cvt.u64.u32 %rd424, %r6762; + shl.b64 %rd425, %rd424, 56; + or.b64 %rd426, %rd422, %rd423; + or.b64 %rd1306, %rd426, %rd425; + xor.b32 %r6763, %r6747, %r6725; + cvt.u64.u32 %rd427, %r6763; + xor.b32 %r6764, %r6709, %r6738; + and.b32 %r6765, %r6764, 255; + cvt.u64.u32 %rd428, %r6765; + bfi.b64 %rd429, %rd428, %rd427, 32, 32; + cvt.u64.u32 %rd430, %r6764; + shl.b64 %rd431, %rd430, 32; + and.b64 %rd432, %rd431, 280375465082880; + or.b64 %rd433, %rd429, %rd432; + and.b64 %rd434, %rd431, 71776119061217280; + shr.u32 %r6766, %r6764, 24; + cvt.u64.u32 %rd435, %r6766; + shl.b64 %rd436, %rd435, 56; + or.b64 %rd437, %rd433, %rd434; + or.b64 %rd1305, %rd437, %rd436; + add.u64 %rd1297, %SPL, 2000; + mov.u64 %rd1301, 0; + mov.u32 %r29819, 0; + st.local.v4.u32 [%rd1297+32], {%r29819, %r29819, %r29819, %r29819}; + st.local.v4.u32 [%rd1297+48], {%r29819, %r29819, %r29819, %r29819}; + st.local.v4.u32 [%rd1297+64], {%r29819, %r29819, %r29819, %r29819}; + st.local.v2.u64 [%rd1297], {%rd1308, %rd1307}; + st.local.v2.u64 [%rd1297+16], {%rd1306, %rd1305}; + mov.u64 %rd1302, %rd1301; + mov.u64 %rd1303, %rd1301; + mov.u64 %rd1304, %rd1301; + mov.u64 %rd1309, %rd1301; + mov.u64 %rd1310, %rd1301; + mov.u64 %rd1311, %rd1301; + mov.u64 %rd1312, %rd1301; + mov.u64 %rd1313, %rd1305; + mov.u64 %rd1314, %rd1306; + mov.u64 %rd1315, %rd1307; + mov.u64 %rd1316, %rd1308; + +$L__BB2_11: + mov.b64 {%r12, %r13}, %rd1316; + xor.b32 %r6768, %r13, %r12; + mov.b64 {%r14, %r15}, %rd1315; + xor.b32 %r6769, %r6768, %r14; + xor.b32 %r6770, %r6769, %r15; + mov.b64 {%r16, %r17}, %rd1314; + xor.b32 %r6771, %r17, %r16; + mov.b64 {%r18, %r19}, %rd1313; + xor.b32 %r6772, %r6771, %r18; + xor.b32 %r6773, %r6772, %r19; + mov.b64 {%r20, %r21}, %rd1312; + xor.b32 %r6774, %r21, %r20; + mov.b64 {%r22, %r23}, %rd1311; + xor.b32 %r6775, %r6774, %r22; + xor.b32 %r6776, %r6775, %r23; + mov.b64 {%r24, %r25}, %rd1310; + xor.b32 %r6777, %r25, %r24; + mov.b64 {%r26, %r27}, %rd1309; + xor.b32 %r6778, %r6777, %r26; + xor.b32 %r6779, %r6778, %r27; + mov.b64 {%r28, %r29}, %rd1308; + xor.b32 %r6780, %r29, %r28; + mov.b64 {%r30, %r31}, %rd1307; + xor.b32 %r6781, %r6780, %r30; + xor.b32 %r6782, %r6781, %r31; + mov.b64 {%r32, %r33}, %rd1306; + xor.b32 %r6783, %r33, %r32; + mov.b64 {%r34, %r35}, %rd1305; + xor.b32 %r6784, %r6783, %r34; + xor.b32 %r6785, %r6784, %r35; + mov.b64 {%r36, %r37}, %rd1304; + xor.b32 %r6786, %r37, %r36; + mov.b64 {%r38, %r39}, %rd1303; + xor.b32 %r6787, %r6786, %r38; + xor.b32 %r6788, %r6787, %r39; + mov.b64 {%r40, %r41}, %rd1302; + xor.b32 %r6789, %r41, %r40; + mov.b64 {%r42, %r43}, %rd1301; + xor.b32 %r6790, %r6789, %r42; + xor.b32 %r6791, %r6790, %r43; + xor.b32 %r6792, %r6779, %r6770; + xor.b32 %r6793, %r6792, %r6788; + 
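+ // $L__BB2_11 xor-folds each 64-bit state register to 32 bits (hi ^ lo, then
+ // across register pairs). The mul.wide/shr/mul.lo/sub runs that follow are
+ // strength-reduced modular reductions: multiplying by 1908875315 and
+ // shifting right by 56 computes floor(x / 37748717) via a fixed-point
+ // reciprocal, so %r44 = x mod 37748717; the later sequences reduce
+ // 2 * %r44 and 2 * %r44 + 1 mod 1179641 to index two 64-byte table items
+ // (%rd32, %rd33). When the %rd360 pointer is null the code branches to
+ // $L__BB2_25 and recomputes the item from the %rd361 table with Keccak,
+ // xoring the index into the first word, which looks like an ethash-style
+ // full-dataset / light-cache split (an inference, not stated here).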
mul.wide.u32 %rd446, %r6793, 1908875315; + shr.u64 %rd447, %rd446, 56; + cvt.u32.u64 %r6794, %rd447; + mul.lo.s32 %r6795, %r6794, 37748717; + sub.s32 %r44, %r6793, %r6795; + xor.b32 %r6796, %r6782, %r6773; + xor.b32 %r6797, %r6796, %r6791; + mul.wide.u32 %rd448, %r6797, 1908875315; + shr.u64 %rd449, %rd448, 56; + cvt.u32.u64 %r6798, %rd449; + mul.lo.s32 %r6799, %r6798, 37748717; + sub.s32 %r45, %r6797, %r6799; + xor.b32 %r6800, %r6776, %r29819; + xor.b32 %r6801, %r6800, %r6785; + mul.wide.u32 %rd450, %r6801, 1908875315; + shr.u64 %rd451, %rd450, 56; + cvt.u32.u64 %r6802, %rd451; + mul.lo.s32 %r6803, %r6802, 37748717; + sub.s32 %r46, %r6801, %r6803; + shl.b32 %r47, %r44, 1; + mul.wide.u32 %rd452, %r47, -954391867; + shr.u64 %rd453, %rd452, 32; + cvt.u32.u64 %r6804, %rd453; + sub.s32 %r6805, %r47, %r6804; + shr.u32 %r6806, %r6805, 1; + add.s32 %r6807, %r6806, %r6804; + shr.u32 %r6808, %r6807, 20; + mul.lo.s32 %r6809, %r6808, 1179641; + sub.s32 %r6810, %r47, %r6809; + cvta.to.global.u64 %rd454, %rd361; + mul.wide.u32 %rd455, %r6810, 64; + add.s64 %rd32, %rd454, %rd455; + or.b32 %r48, %r47, 1; + mul.wide.u32 %rd456, %r48, -954391867; + shr.u64 %rd457, %rd456, 32; + cvt.u32.u64 %r6811, %rd457; + sub.s32 %r6812, %r48, %r6811; + shr.u32 %r6813, %r6812, 1; + add.s32 %r6814, %r6813, %r6811; + shr.u32 %r6815, %r6814, 20; + mul.lo.s32 %r6816, %r6815, 1179641; + sub.s32 %r6817, %r48, %r6816; + mul.wide.u32 %rd458, %r6817, 64; + add.s64 %rd33, %rd454, %rd458; + setp.eq.s64 %p12, %rd360, 0; + @%p12 bra $L__BB2_25; + + cvta.to.global.u64 %rd459, %rd360; + mul.wide.u32 %rd460, %r44, 128; + add.s64 %rd34, %rd459, %rd460; + ld.global.u64 %rd1317, [%rd34]; + setp.eq.s64 %p13, %rd1317, 0; + @%p13 bra $L__BB2_14; + + ld.global.u64 %rd1332, [%rd34+120]; + ld.global.u64 %rd1331, [%rd34+112]; + ld.global.u64 %rd1330, [%rd34+104]; + ld.global.u64 %rd1329, [%rd34+96]; + ld.global.u64 %rd1328, [%rd34+88]; + ld.global.u64 %rd1327, [%rd34+80]; + ld.global.u64 %rd1326, [%rd34+72]; + ld.global.u64 %rd1325, [%rd34+64]; + ld.global.u64 %rd1324, [%rd34+56]; + ld.global.u64 %rd1323, [%rd34+48]; + ld.global.u64 %rd1322, [%rd34+40]; + ld.global.u64 %rd1321, [%rd34+32]; + ld.global.u64 %rd1320, [%rd34+24]; + ld.global.u64 %rd1319, [%rd34+16]; + ld.global.u64 %rd1318, [%rd34+8]; + bra.uni $L__BB2_36; + +$L__BB2_25: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd562, 1179641; + st.local.u64 [%rd2+8], %rd562; + st.local.u32 [%rd2+16], %r47; + ld.global.u64 %rd563, [%rd32]; + ld.global.u64 %rd564, [%rd32+8]; + ld.global.u64 %rd565, [%rd32+16]; + ld.global.u64 %rd566, [%rd32+24]; + ld.global.u64 %rd567, [%rd32+32]; + ld.global.u64 %rd568, [%rd32+40]; + ld.global.u64 %rd569, [%rd32+48]; + ld.global.u64 %rd570, [%rd32+56]; + st.local.u64 [%rd2+24], %rd563; + st.local.u64 [%rd2+32], %rd564; + st.local.u64 [%rd2+40], %rd565; + st.local.u64 [%rd2+48], %rd566; + st.local.u64 [%rd2+56], %rd567; + st.local.u64 [%rd2+64], %rd568; + st.local.u64 [%rd2+72], %rd569; + st.local.u64 [%rd2+80], %rd570; + cvt.u32.u64 %r10143, %rd563; + xor.b32 %r10144, %r47, %r10143; + st.local.u32 [%rd2+24], %r10144; + mov.u32 %r30057, 0; + st.local.v2.u32 [%rd2+96], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+104], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+112], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+120], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+128], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+136], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+144], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+152], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+160], 
{%r30057, %r30057}; + st.local.v2.u32 [%rd2+168], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+176], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+184], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+192], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+200], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+208], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+216], {%r30057, %r30057}; + mov.u32 %r30072, -2147483648; + mov.u32 %r10116, 1; + st.local.v2.u32 [%rd2+88], {%r10116, %r30072}; + ld.local.v2.u32 {%r30093, %r30094}, [%rd2+24]; + mov.b64 {%r30091, %r30092}, %rd568; + shr.u64 %rd571, %rd564, 32; + cvt.u32.u64 %r30105, %rd564; + cvt.u32.u64 %r30106, %rd571; + shr.u64 %rd572, %rd569, 32; + cvt.u32.u64 %r30103, %rd569; + cvt.u32.u64 %r30104, %rd572; + shr.u64 %rd573, %rd565, 32; + cvt.u32.u64 %r30101, %rd565; + cvt.u32.u64 %r30102, %rd573; + shr.u64 %rd574, %rd570, 32; + cvt.u32.u64 %r30099, %rd570; + cvt.u32.u64 %r30100, %rd574; + shr.u64 %rd575, %rd566, 32; + cvt.u32.u64 %r30097, %rd566; + cvt.u32.u64 %r30098, %rd575; + shr.u64 %rd576, %rd567, 32; + cvt.u32.u64 %r30095, %rd567; + cvt.u32.u64 %r30096, %rd576; + mov.u32 %r30058, %r30057; + mov.u32 %r30059, %r30057; + mov.u32 %r30060, %r30057; + mov.u32 %r30061, %r30057; + mov.u32 %r30062, %r30057; + mov.u32 %r30063, %r30057; + mov.u32 %r30064, %r30057; + mov.u32 %r30065, %r30057; + mov.u32 %r30066, %r30057; + mov.u32 %r30067, %r30057; + mov.u32 %r30068, %r30057; + mov.u32 %r30069, %r30057; + mov.u32 %r30070, %r30057; + mov.u32 %r30071, %r10116; + mov.u32 %r30073, %r30057; + mov.u32 %r30074, %r30057; + mov.u32 %r30075, %r30057; + mov.u32 %r30076, %r30057; + mov.u32 %r30077, %r30057; + mov.u32 %r30078, %r30057; + mov.u32 %r30079, %r30057; + mov.u32 %r30080, %r30057; + mov.u32 %r30081, %r30057; + mov.u32 %r30082, %r30057; + mov.u32 %r30083, %r30057; + mov.u32 %r30084, %r30057; + mov.u32 %r30085, %r30057; + mov.u32 %r30086, %r30057; + mov.u32 %r30087, %r30057; + mov.u32 %r30088, %r30057; + mov.u32 %r30089, %r30057; + mov.u32 %r30090, %r30057; + mov.u32 %r30107, %r30057; + +$L__BB2_26: + // begin inline asm + // xor5 + lop3.b32 %r10147, %r30093, %r30091, %r30089, 0x96; + lop3.b32 %r10147, %r10147, %r30087, %r30085, 0x96; + lop3.b32 %r10148, %r30094, %r30092, %r30090, 0x96; + lop3.b32 %r10148, %r10148, %r30088, %r30086, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10159, %r30105, %r30103, %r30083, 0x96; + lop3.b32 %r10159, %r10159, %r30081, %r30079, 0x96; + lop3.b32 %r10160, %r30106, %r30104, %r30084, 0x96; + lop3.b32 %r10160, %r10160, %r30082, %r30080, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10171, %r30101, %r30099, %r30077, 0x96; + lop3.b32 %r10171, %r10171, %r30075, %r30073, 0x96; + lop3.b32 %r10172, %r30102, %r30100, %r30078, 0x96; + lop3.b32 %r10172, %r10172, %r30076, %r30074, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10183, %r30097, %r30071, %r30069, 0x96; + lop3.b32 %r10183, %r10183, %r30067, %r30065, 0x96; + lop3.b32 %r10184, %r30098, %r30072, %r30070, 0x96; + lop3.b32 %r10184, %r10184, %r30068, %r30066, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10195, %r30095, %r30063, %r30061, 0x96; + lop3.b32 %r10195, %r10195, %r30059, %r30057, 0x96; + lop3.b32 %r10196, %r30096, %r30064, %r30062, 0x96; + lop3.b32 %r10196, %r10196, %r30060, %r30058, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10207, %r10160, %r10159, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10211, %r10159, %r10160, %r10116; + // end 
inline asm + xor.b32 %r10641, %r10207, %r10195; + xor.b32 %r10642, %r10211, %r10196; + xor.b32 %r10474, %r30093, %r10641; + xor.b32 %r10477, %r30094, %r10642; + xor.b32 %r10381, %r30091, %r10641; + xor.b32 %r10380, %r30092, %r10642; + xor.b32 %r10428, %r30089, %r10641; + xor.b32 %r10429, %r30090, %r10642; + xor.b32 %r10333, %r30087, %r10641; + xor.b32 %r10332, %r30088, %r10642; + xor.b32 %r10284, %r30085, %r10641; + xor.b32 %r10285, %r30086, %r10642; + // begin inline asm + shf.l.wrap.b32 %r10215, %r10172, %r10171, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10219, %r10171, %r10172, %r10116; + // end inline asm + xor.b32 %r10643, %r10215, %r10147; + xor.b32 %r10644, %r10219, %r10148; + xor.b32 %r10436, %r30105, %r10643; + xor.b32 %r10437, %r30106, %r10644; + xor.b32 %r10253, %r30103, %r10643; + xor.b32 %r10252, %r30104, %r10644; + xor.b32 %r10412, %r30083, %r10643; + xor.b32 %r10413, %r30084, %r10644; + xor.b32 %r10373, %r30081, %r10643; + xor.b32 %r10372, %r30082, %r10644; + xor.b32 %r10356, %r30079, %r10643; + xor.b32 %r10357, %r30080, %r10644; + // begin inline asm + shf.l.wrap.b32 %r10223, %r10184, %r10183, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10227, %r10183, %r10184, %r10116; + // end inline asm + xor.b32 %r10645, %r10223, %r10159; + xor.b32 %r10646, %r10227, %r10160; + xor.b32 %r10293, %r30101, %r10645; + xor.b32 %r10292, %r30102, %r10646; + xor.b32 %r10420, %r30099, %r10645; + xor.b32 %r10421, %r30100, %r10646; + xor.b32 %r10301, %r30077, %r10645; + xor.b32 %r10300, %r30078, %r10646; + xor.b32 %r10404, %r30075, %r10645; + xor.b32 %r10405, %r30076, %r10646; + xor.b32 %r10269, %r30073, %r10645; + xor.b32 %r10268, %r30074, %r10646; + // begin inline asm + shf.l.wrap.b32 %r10231, %r10196, %r10195, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10235, %r10195, %r10196, %r10116; + // end inline asm + xor.b32 %r10647, %r10231, %r10171; + xor.b32 %r10648, %r10235, %r10172; + xor.b32 %r10388, %r30097, %r10647; + xor.b32 %r10389, %r30098, %r10648; + xor.b32 %r10365, %r30071, %r10647; + xor.b32 %r10364, %r30072, %r10648; + xor.b32 %r10308, %r30069, %r10647; + xor.b32 %r10309, %r30070, %r10648; + xor.b32 %r10396, %r30067, %r10647; + xor.b32 %r10397, %r30068, %r10648; + xor.b32 %r10325, %r30065, %r10647; + xor.b32 %r10324, %r30066, %r10648; + // begin inline asm + shf.l.wrap.b32 %r10239, %r10148, %r10147, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10243, %r10147, %r10148, %r10116; + // end inline asm + xor.b32 %r10649, %r10239, %r10183; + xor.b32 %r10650, %r10243, %r10184; + xor.b32 %r10340, %r30095, %r10649; + xor.b32 %r10341, %r30096, %r10650; + xor.b32 %r10260, %r30063, %r10649; + xor.b32 %r10261, %r30064, %r10650; + xor.b32 %r10277, %r30061, %r10649; + xor.b32 %r10276, %r30062, %r10650; + xor.b32 %r10316, %r30059, %r10649; + xor.b32 %r10317, %r30060, %r10650; + xor.b32 %r10348, %r30057, %r10649; + xor.b32 %r10349, %r30058, %r10650; + mov.u32 %r10254, 44; + // begin inline asm + shf.l.wrap.b32 %r10247, %r10253, %r10252, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10251, %r10252, %r10253, %r10254; + // end inline asm + mov.u32 %r10262, 20; + // begin inline asm + shf.l.wrap.b32 %r10255, %r10261, %r10260, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10259, %r10260, %r10261, %r10262; + // end inline asm + mov.u32 %r10270, 61; + // begin inline asm + shf.l.wrap.b32 %r10263, %r10269, %r10268, %r10270; + // end inline asm + // begin inline 
asm + shf.l.wrap.b32 %r10267, %r10268, %r10269, %r10270; + // end inline asm + mov.u32 %r10278, 39; + // begin inline asm + shf.l.wrap.b32 %r10271, %r10277, %r10276, %r10278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10275, %r10276, %r10277, %r10278; + // end inline asm + mov.u32 %r10286, 18; + // begin inline asm + shf.l.wrap.b32 %r10279, %r10285, %r10284, %r10286; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10283, %r10284, %r10285, %r10286; + // end inline asm + mov.u32 %r10294, 62; + // begin inline asm + shf.l.wrap.b32 %r10287, %r10293, %r10292, %r10294; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10291, %r10292, %r10293, %r10294; + // end inline asm + mov.u32 %r10302, 43; + // begin inline asm + shf.l.wrap.b32 %r10295, %r10301, %r10300, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10299, %r10300, %r10301, %r10302; + // end inline asm + mov.u32 %r10310, 25; + // begin inline asm + shf.l.wrap.b32 %r10303, %r10309, %r10308, %r10310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10307, %r10308, %r10309, %r10310; + // end inline asm + mov.u32 %r10318, 8; + // begin inline asm + shf.l.wrap.b32 %r10311, %r10317, %r10316, %r10318; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10315, %r10316, %r10317, %r10318; + // end inline asm + mov.u32 %r10326, 56; + // begin inline asm + shf.l.wrap.b32 %r10319, %r10325, %r10324, %r10326; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10323, %r10324, %r10325, %r10326; + // end inline asm + mov.u32 %r10334, 41; + // begin inline asm + shf.l.wrap.b32 %r10327, %r10333, %r10332, %r10334; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10331, %r10332, %r10333, %r10334; + // end inline asm + mov.u32 %r10342, 27; + // begin inline asm + shf.l.wrap.b32 %r10335, %r10341, %r10340, %r10342; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10339, %r10340, %r10341, %r10342; + // end inline asm + mov.u32 %r10350, 14; + // begin inline asm + shf.l.wrap.b32 %r10343, %r10349, %r10348, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10347, %r10348, %r10349, %r10350; + // end inline asm + mov.u32 %r10358, 2; + // begin inline asm + shf.l.wrap.b32 %r10351, %r10357, %r10356, %r10358; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10355, %r10356, %r10357, %r10358; + // end inline asm + mov.u32 %r10366, 55; + // begin inline asm + shf.l.wrap.b32 %r10359, %r10365, %r10364, %r10366; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10363, %r10364, %r10365, %r10366; + // end inline asm + mov.u32 %r10374, 45; + // begin inline asm + shf.l.wrap.b32 %r10367, %r10373, %r10372, %r10374; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10371, %r10372, %r10373, %r10374; + // end inline asm + mov.u32 %r10382, 36; + // begin inline asm + shf.l.wrap.b32 %r10375, %r10381, %r10380, %r10382; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10379, %r10380, %r10381, %r10382; + // end inline asm + mov.u32 %r10390, 28; + // begin inline asm + shf.l.wrap.b32 %r10383, %r10389, %r10388, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10387, %r10388, %r10389, %r10390; + // end inline asm + mov.u32 %r10398, 21; + // begin inline asm + shf.l.wrap.b32 %r10391, %r10397, %r10396, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10395, %r10396, %r10397, %r10398; + // end inline asm + mov.u32 %r10406, 15; + // begin inline asm + shf.l.wrap.b32 %r10399, 
%r10405, %r10404, %r10406; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10403, %r10404, %r10405, %r10406; + // end inline asm + mov.u32 %r10414, 10; + // begin inline asm + shf.l.wrap.b32 %r10407, %r10413, %r10412, %r10414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10411, %r10412, %r10413, %r10414; + // end inline asm + mov.u32 %r10422, 6; + // begin inline asm + shf.l.wrap.b32 %r10415, %r10421, %r10420, %r10422; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10419, %r10420, %r10421, %r10422; + // end inline asm + mov.u32 %r10430, 3; + // begin inline asm + shf.l.wrap.b32 %r10423, %r10429, %r10428, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10427, %r10428, %r10429, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10431, %r10437, %r10436, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10435, %r10436, %r10437, %r10116; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10439, %r10474, %r10247, %r10295, 0xD2; + lop3.b32 %r10440, %r10477, %r10251, %r10299, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30105, %r10247, %r10295, %r10391, 0xD2; + lop3.b32 %r30106, %r10251, %r10299, %r10395, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30101, %r10295, %r10391, %r10343, 0xD2; + lop3.b32 %r30102, %r10299, %r10395, %r10347, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30097, %r10391, %r10343, %r10474, 0xD2; + lop3.b32 %r30098, %r10395, %r10347, %r10477, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30095, %r10343, %r10474, %r10247, 0xD2; + lop3.b32 %r30096, %r10347, %r10477, %r10251, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30091, %r10383, %r10255, %r10423, 0xD2; + lop3.b32 %r30092, %r10387, %r10259, %r10427, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30103, %r10255, %r10423, %r10367, 0xD2; + lop3.b32 %r30104, %r10259, %r10427, %r10371, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30099, %r10423, %r10367, %r10263, 0xD2; + lop3.b32 %r30100, %r10427, %r10371, %r10267, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30071, %r10367, %r10263, %r10383, 0xD2; + lop3.b32 %r30072, %r10371, %r10267, %r10387, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30071, %r30072}; + // begin inline asm + // chi + lop3.b32 %r30063, %r10263, %r10383, %r10255, 0xD2; + lop3.b32 %r30064, %r10267, %r10387, %r10259, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30063, %r30064}; + // begin inline asm + // chi + lop3.b32 %r30089, %r10431, %r10415, %r10303, 0xD2; + lop3.b32 %r30090, %r10435, %r10419, %r10307, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30089, %r30090}; + // begin inline asm + // chi + lop3.b32 %r30083, %r10415, %r10303, %r10311, 0xD2; + lop3.b32 %r30084, %r10419, %r10307, %r10315, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30083, %r30084}; + // begin inline asm + // chi + lop3.b32 %r30077, %r10303, %r10311, %r10279, 0xD2; + lop3.b32 %r30078, %r10307, %r10315, %r10283, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30077, %r30078}; + // begin inline asm + // chi + lop3.b32 %r30069, %r10311, %r10279, %r10431, 0xD2; + lop3.b32 %r30070, %r10315, %r10283, %r10435, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30069, %r30070}; + // begin inline asm + // chi + lop3.b32 %r30061, %r10279, %r10431, %r10415, 0xD2; + 
lop3.b32 %r30062, %r10283, %r10435, %r10419, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30061, %r30062}; + // begin inline asm + // chi + lop3.b32 %r30087, %r10335, %r10375, %r10407, 0xD2; + lop3.b32 %r30088, %r10339, %r10379, %r10411, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30087, %r30088}; + // begin inline asm + // chi + lop3.b32 %r30081, %r10375, %r10407, %r10399, 0xD2; + lop3.b32 %r30082, %r10379, %r10411, %r10403, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30081, %r30082}; + // begin inline asm + // chi + lop3.b32 %r30075, %r10407, %r10399, %r10319, 0xD2; + lop3.b32 %r30076, %r10411, %r10403, %r10323, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30075, %r30076}; + // begin inline asm + // chi + lop3.b32 %r30067, %r10399, %r10319, %r10335, 0xD2; + lop3.b32 %r30068, %r10403, %r10323, %r10339, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30067, %r30068}; + // begin inline asm + // chi + lop3.b32 %r30059, %r10319, %r10335, %r10375, 0xD2; + lop3.b32 %r30060, %r10323, %r10339, %r10379, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30059, %r30060}; + // begin inline asm + // chi + lop3.b32 %r30085, %r10287, %r10359, %r10271, 0xD2; + lop3.b32 %r30086, %r10291, %r10363, %r10275, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30085, %r30086}; + // begin inline asm + // chi + lop3.b32 %r30079, %r10359, %r10271, %r10327, 0xD2; + lop3.b32 %r30080, %r10363, %r10275, %r10331, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30079, %r30080}; + // begin inline asm + // chi + lop3.b32 %r30073, %r10271, %r10327, %r10351, 0xD2; + lop3.b32 %r30074, %r10275, %r10331, %r10355, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30073, %r30074}; + // begin inline asm + // chi + lop3.b32 %r30065, %r10327, %r10351, %r10287, 0xD2; + lop3.b32 %r30066, %r10331, %r10355, %r10291, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30065, %r30066}; + // begin inline asm + // chi + lop3.b32 %r30057, %r10351, %r10287, %r10359, 0xD2; + lop3.b32 %r30058, %r10355, %r10291, %r10363, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30057, %r30058}; + mul.wide.s32 %rd578, %r30107, 8; + mov.u64 %rd579, keccak_round_constants; + cvta.const.u64 %rd580, %rd579; + add.s64 %rd577, %rd580, %rd578; + // begin inline asm + ld.global.nc.v2.u32 {%r10639,%r10640}, [%rd577]; + // end inline asm + xor.b32 %r30093, %r10439, %r10639; + xor.b32 %r30094, %r10440, %r10640; + add.s32 %r30107, %r30107, 1; + setp.lt.u32 %p19, %r30107, 23; + @%p19 bra $L__BB2_26; + + add.u64 %rd82, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30105, %r30106}; + st.local.v2.u32 [%rd2+72], {%r30103, %r30104}; + st.local.v2.u32 [%rd2+40], {%r30101, %r30102}; + st.local.v2.u32 [%rd2+80], {%r30099, %r30100}; + st.local.v2.u32 [%rd2+48], {%r30097, %r30098}; + st.local.v2.u32 [%rd2+56], {%r30095, %r30096}; + st.local.v2.u32 [%rd2+24], {%r30093, %r30094}; + // begin inline asm + // xor5 + lop3.b32 %r10651, %r30093, %r30091, %r30089, 0x96; + lop3.b32 %r10651, %r10651, %r30087, %r30085, 0x96; + lop3.b32 %r10652, %r30094, %r30092, %r30090, 0x96; + lop3.b32 %r10652, %r10652, %r30088, %r30086, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10663, %r30105, %r30103, %r30083, 0x96; + lop3.b32 %r10663, %r10663, %r30081, %r30079, 0x96; + lop3.b32 %r10664, %r30106, %r30104, %r30084, 0x96; + lop3.b32 %r10664, %r10664, %r30082, %r30080, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10675, %r30101, 
%r30099, %r30077, 0x96; + lop3.b32 %r10675, %r10675, %r30075, %r30073, 0x96; + lop3.b32 %r10676, %r30102, %r30100, %r30078, 0x96; + lop3.b32 %r10676, %r10676, %r30076, %r30074, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10687, %r30097, %r30071, %r30069, 0x96; + lop3.b32 %r10687, %r10687, %r30067, %r30065, 0x96; + lop3.b32 %r10688, %r30098, %r30072, %r30070, 0x96; + lop3.b32 %r10688, %r10688, %r30068, %r30066, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10699, %r30095, %r30063, %r30061, 0x96; + lop3.b32 %r10699, %r10699, %r30059, %r30057, 0x96; + lop3.b32 %r10700, %r30096, %r30064, %r30062, 0x96; + lop3.b32 %r10700, %r10700, %r30060, %r30058, 0x96; + // end inline asm + mov.u32 %r10903, 1; + // begin inline asm + shf.l.wrap.b32 %r10711, %r10664, %r10663, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10715, %r10663, %r10664, %r10903; + // end inline asm + xor.b32 %r10930, %r10711, %r10699; + xor.b32 %r10931, %r10715, %r10700; + xor.b32 %r10858, %r30093, %r10930; + xor.b32 %r10861, %r30094, %r10931; + xor.b32 %r10821, %r30090, %r10931; + xor.b32 %r10820, %r30089, %r10930; + st.local.v2.u32 [%rd2+104], {%r10820, %r10821}; + // begin inline asm + shf.l.wrap.b32 %r10719, %r10676, %r10675, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10723, %r10675, %r10676, %r10903; + // end inline asm + xor.b32 %r10932, %r10719, %r10651; + xor.b32 %r10933, %r10723, %r10652; + xor.b32 %r10757, %r30103, %r10932; + xor.b32 %r10756, %r30104, %r10933; + xor.b32 %r10796, %r30082, %r10933; + xor.b32 %r10797, %r30081, %r10932; + st.local.v2.u32 [%rd2+152], {%r10797, %r10796}; + // begin inline asm + shf.l.wrap.b32 %r10727, %r10688, %r10687, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10731, %r10687, %r10688, %r10903; + // end inline asm + xor.b32 %r10934, %r10727, %r10663; + xor.b32 %r10935, %r10731, %r10664; + xor.b32 %r10780, %r30078, %r10935; + xor.b32 %r10781, %r30077, %r10934; + st.local.v2.u32 [%rd2+120], {%r10781, %r10780}; + xor.b32 %r10772, %r30074, %r10935; + xor.b32 %r10773, %r30073, %r10934; + st.local.v2.u32 [%rd2+200], {%r10773, %r10772}; + // begin inline asm + shf.l.wrap.b32 %r10735, %r10700, %r10699, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10739, %r10699, %r10700, %r10903; + // end inline asm + xor.b32 %r10936, %r10735, %r10675; + xor.b32 %r10937, %r10739, %r10676; + xor.b32 %r10804, %r30097, %r10936; + xor.b32 %r10805, %r30098, %r10937; + xor.b32 %r10813, %r30068, %r10937; + xor.b32 %r10812, %r30067, %r10936; + st.local.v2.u32 [%rd2+168], {%r10812, %r10813}; + // begin inline asm + shf.l.wrap.b32 %r10743, %r10652, %r10651, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10747, %r10651, %r10652, %r10903; + // end inline asm + xor.b32 %r10938, %r10743, %r10687; + xor.b32 %r10939, %r10747, %r10688; + xor.b32 %r10764, %r30063, %r10938; + xor.b32 %r10765, %r30064, %r10939; + xor.b32 %r10789, %r30058, %r10939; + xor.b32 %r10788, %r30057, %r10938; + st.local.v2.u32 [%rd2+216], {%r10788, %r10789}; + // begin inline asm + shf.l.wrap.b32 %r10751, %r10757, %r10756, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10755, %r10756, %r10757, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10759, %r10765, %r10764, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10763, %r10764, %r10765, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10771, %r10772, 
%r10773, %r10270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10767, %r10773, %r10772, %r10270; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r10767, %r10771}; + // begin inline asm + shf.l.wrap.b32 %r10775, %r10781, %r10780, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10779, %r10780, %r10781, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10783, %r10789, %r10788, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10787, %r10788, %r10789, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10795, %r10796, %r10797, %r10374; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10791, %r10797, %r10796, %r10374; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r10791, %r10795}; + // begin inline asm + shf.l.wrap.b32 %r10799, %r10805, %r10804, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10803, %r10804, %r10805, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10807, %r10813, %r10812, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10811, %r10812, %r10813, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10815, %r10821, %r10820, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10819, %r10820, %r10821, %r10430; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10823, %r10858, %r10751, %r10775, 0xD2; + lop3.b32 %r10824, %r10861, %r10755, %r10779, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30240, %r10751, %r10775, %r10807, 0xD2; + lop3.b32 %r30241, %r10755, %r10779, %r10811, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30240, %r30241}; + // begin inline asm + // chi + lop3.b32 %r30236, %r10775, %r10807, %r10783, 0xD2; + lop3.b32 %r30237, %r10779, %r10811, %r10787, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30236, %r30237}; + // begin inline asm + // chi + lop3.b32 %r30232, %r10807, %r10783, %r10858, 0xD2; + lop3.b32 %r30233, %r10811, %r10787, %r10861, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30232, %r30233}; + // begin inline asm + // chi + lop3.b32 %r30230, %r10783, %r10858, %r10751, 0xD2; + lop3.b32 %r30231, %r10787, %r10861, %r10755, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30230, %r30231}; + // begin inline asm + // chi + lop3.b32 %r30226, %r10799, %r10759, %r10815, 0xD2; + lop3.b32 %r30227, %r10803, %r10763, %r10819, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30226, %r30227}; + // begin inline asm + // chi + lop3.b32 %r30238, %r10759, %r10815, %r10791, 0xD2; + lop3.b32 %r30239, %r10763, %r10819, %r10795, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30238, %r30239}; + // begin inline asm + // chi + lop3.b32 %r30234, %r10815, %r10791, %r10767, 0xD2; + lop3.b32 %r30235, %r10819, %r10795, %r10771, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30234, %r30235}; + add.s64 %rd581, %rd580, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r10887,%r10888}, [%rd581]; + // end inline asm + xor.b32 %r30228, %r10823, %r10887; + xor.b32 %r30229, %r10824, %r10888; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + st.local.u64 [%rd82], %rd361; + mov.u64 %rd585, 1179641; + st.local.u64 [%rd82+8], %rd585; + st.local.u32 [%rd82+16], %r48; + ld.global.u64 %rd586, [%rd33]; + ld.global.u64 %rd587, [%rd33+8]; + ld.global.u64 %rd588, [%rd33+16]; + ld.global.u64 %rd589, [%rd33+24]; + ld.global.u64 %rd590, [%rd33+32]; + ld.global.u64 %rd591, [%rd33+40]; 
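+ // Second absorb: the loads around this point fetch the 64-byte item for the
+ // odd index %r48 and set up a second Keccak state at %rd82, again xoring
+ // the index into the first input word. In the round loops ($L__BB2_26,
+ // $L__BB2_28): theta is the lop3 0x96 chains (0x96 = 3-input XOR), rho is
+ // the shf.l.wrap pairs (two 32-bit funnel shifts form one 64-bit rotate;
+ // the offsets 1, 44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55,
+ // 45, 36, 28, 21, 15, 10, 6, 3 are Keccak's rho table), chi is lop3 0xD2
+ // (a ^ (~b & c)), and iota xors in keccak_round_constants[i]. Rounds 0..22
+ // run in the loop and the final round is peeled after it (round constant at
+ // byte offset 184 = 23 * 8). This is Keccak-f[1600] on 32-bit lane halves.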
+ ld.global.u64 %rd592, [%rd33+48]; + ld.global.u64 %rd593, [%rd33+56]; + st.local.u64 [%rd82+32], %rd587; + st.local.u64 [%rd82+40], %rd588; + st.local.u64 [%rd82+48], %rd589; + st.local.u64 [%rd82+56], %rd590; + st.local.u64 [%rd82+64], %rd591; + st.local.u64 [%rd82+72], %rd592; + st.local.u64 [%rd82+80], %rd593; + cvt.u32.u64 %r10940, %rd586; + xor.b32 %r10941, %r48, %r10940; + st.local.u64 [%rd82+24], %rd586; + st.local.u32 [%rd82+24], %r10941; + mov.u32 %r30108, 0; + st.local.v2.u32 [%rd82+96], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+104], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+112], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+120], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+128], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+136], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+144], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+152], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+160], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+168], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+176], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+184], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+192], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+200], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+208], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+216], {%r30108, %r30108}; + mov.u32 %r30123, -2147483648; + st.local.v2.u32 [%rd82+88], {%r10903, %r30123}; + ld.local.v2.u32 {%r30144, %r30145}, [%rd82+24]; + mov.b64 {%r30142, %r30143}, %rd591; + shr.u64 %rd594, %rd587, 32; + cvt.u32.u64 %r30156, %rd587; + cvt.u32.u64 %r30157, %rd594; + shr.u64 %rd595, %rd592, 32; + cvt.u32.u64 %r30154, %rd592; + cvt.u32.u64 %r30155, %rd595; + shr.u64 %rd596, %rd588, 32; + cvt.u32.u64 %r30152, %rd588; + cvt.u32.u64 %r30153, %rd596; + shr.u64 %rd597, %rd593, 32; + cvt.u32.u64 %r30150, %rd593; + cvt.u32.u64 %r30151, %rd597; + shr.u64 %rd598, %rd589, 32; + cvt.u32.u64 %r30148, %rd589; + cvt.u32.u64 %r30149, %rd598; + shr.u64 %rd599, %rd590, 32; + cvt.u32.u64 %r30146, %rd590; + cvt.u32.u64 %r30147, %rd599; + mov.u32 %r30109, %r30108; + mov.u32 %r30110, %r30108; + mov.u32 %r30111, %r30108; + mov.u32 %r30112, %r30108; + mov.u32 %r30113, %r30108; + mov.u32 %r30114, %r30108; + mov.u32 %r30115, %r30108; + mov.u32 %r30116, %r30108; + mov.u32 %r30117, %r30108; + mov.u32 %r30118, %r30108; + mov.u32 %r30119, %r30108; + mov.u32 %r30120, %r30108; + mov.u32 %r30121, %r30108; + mov.u32 %r30122, %r10903; + mov.u32 %r30124, %r30108; + mov.u32 %r30125, %r30108; + mov.u32 %r30126, %r30108; + mov.u32 %r30127, %r30108; + mov.u32 %r30128, %r30108; + mov.u32 %r30129, %r30108; + mov.u32 %r30130, %r30108; + mov.u32 %r30131, %r30108; + mov.u32 %r30132, %r30108; + mov.u32 %r30133, %r30108; + mov.u32 %r30134, %r30108; + mov.u32 %r30135, %r30108; + mov.u32 %r30136, %r30108; + mov.u32 %r30137, %r30108; + mov.u32 %r30138, %r30108; + mov.u32 %r30139, %r30108; + mov.u32 %r30140, %r30108; + mov.u32 %r30141, %r30108; + mov.u32 %r30158, %r30108; + +$L__BB2_28: + // begin inline asm + // xor5 + lop3.b32 %r10944, %r30144, %r30142, %r30140, 0x96; + lop3.b32 %r10944, %r10944, %r30138, %r30136, 0x96; + lop3.b32 %r10945, %r30145, %r30143, %r30141, 0x96; + lop3.b32 %r10945, %r10945, %r30139, %r30137, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10956, %r30156, %r30154, %r30134, 0x96; + lop3.b32 %r10956, %r10956, %r30132, %r30130, 0x96; + lop3.b32 %r10957, %r30157, %r30155, %r30135, 0x96; + lop3.b32 %r10957, %r10957, %r30133, %r30131, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10968, %r30152, %r30150, 
%r30128, 0x96; + lop3.b32 %r10968, %r10968, %r30126, %r30124, 0x96; + lop3.b32 %r10969, %r30153, %r30151, %r30129, 0x96; + lop3.b32 %r10969, %r10969, %r30127, %r30125, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10980, %r30148, %r30122, %r30120, 0x96; + lop3.b32 %r10980, %r10980, %r30118, %r30116, 0x96; + lop3.b32 %r10981, %r30149, %r30123, %r30121, 0x96; + lop3.b32 %r10981, %r10981, %r30119, %r30117, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10992, %r30146, %r30114, %r30112, 0x96; + lop3.b32 %r10992, %r10992, %r30110, %r30108, 0x96; + lop3.b32 %r10993, %r30147, %r30115, %r30113, 0x96; + lop3.b32 %r10993, %r10993, %r30111, %r30109, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11004, %r10957, %r10956, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11008, %r10956, %r10957, %r10903; + // end inline asm + xor.b32 %r11438, %r11004, %r10992; + xor.b32 %r11439, %r11008, %r10993; + xor.b32 %r11271, %r30144, %r11438; + xor.b32 %r11274, %r30145, %r11439; + xor.b32 %r11178, %r30142, %r11438; + xor.b32 %r11177, %r30143, %r11439; + xor.b32 %r11225, %r30140, %r11438; + xor.b32 %r11226, %r30141, %r11439; + xor.b32 %r11130, %r30138, %r11438; + xor.b32 %r11129, %r30139, %r11439; + xor.b32 %r11081, %r30136, %r11438; + xor.b32 %r11082, %r30137, %r11439; + // begin inline asm + shf.l.wrap.b32 %r11012, %r10969, %r10968, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11016, %r10968, %r10969, %r10903; + // end inline asm + xor.b32 %r11440, %r11012, %r10944; + xor.b32 %r11441, %r11016, %r10945; + xor.b32 %r11233, %r30156, %r11440; + xor.b32 %r11234, %r30157, %r11441; + xor.b32 %r11050, %r30154, %r11440; + xor.b32 %r11049, %r30155, %r11441; + xor.b32 %r11209, %r30134, %r11440; + xor.b32 %r11210, %r30135, %r11441; + xor.b32 %r11170, %r30132, %r11440; + xor.b32 %r11169, %r30133, %r11441; + xor.b32 %r11153, %r30130, %r11440; + xor.b32 %r11154, %r30131, %r11441; + // begin inline asm + shf.l.wrap.b32 %r11020, %r10981, %r10980, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11024, %r10980, %r10981, %r10903; + // end inline asm + xor.b32 %r11442, %r11020, %r10956; + xor.b32 %r11443, %r11024, %r10957; + xor.b32 %r11090, %r30152, %r11442; + xor.b32 %r11089, %r30153, %r11443; + xor.b32 %r11217, %r30150, %r11442; + xor.b32 %r11218, %r30151, %r11443; + xor.b32 %r11098, %r30128, %r11442; + xor.b32 %r11097, %r30129, %r11443; + xor.b32 %r11201, %r30126, %r11442; + xor.b32 %r11202, %r30127, %r11443; + xor.b32 %r11066, %r30124, %r11442; + xor.b32 %r11065, %r30125, %r11443; + // begin inline asm + shf.l.wrap.b32 %r11028, %r10993, %r10992, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11032, %r10992, %r10993, %r10903; + // end inline asm + xor.b32 %r11444, %r11028, %r10968; + xor.b32 %r11445, %r11032, %r10969; + xor.b32 %r11185, %r30148, %r11444; + xor.b32 %r11186, %r30149, %r11445; + xor.b32 %r11162, %r30122, %r11444; + xor.b32 %r11161, %r30123, %r11445; + xor.b32 %r11105, %r30120, %r11444; + xor.b32 %r11106, %r30121, %r11445; + xor.b32 %r11193, %r30118, %r11444; + xor.b32 %r11194, %r30119, %r11445; + xor.b32 %r11122, %r30116, %r11444; + xor.b32 %r11121, %r30117, %r11445; + // begin inline asm + shf.l.wrap.b32 %r11036, %r10945, %r10944, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11040, %r10944, %r10945, %r10903; + // end inline asm + xor.b32 %r11446, %r11036, %r10980; + xor.b32 %r11447, %r11040, %r10981; + xor.b32 %r11137, %r30146, 
%r11446; + xor.b32 %r11138, %r30147, %r11447; + xor.b32 %r11057, %r30114, %r11446; + xor.b32 %r11058, %r30115, %r11447; + xor.b32 %r11074, %r30112, %r11446; + xor.b32 %r11073, %r30113, %r11447; + xor.b32 %r11113, %r30110, %r11446; + xor.b32 %r11114, %r30111, %r11447; + xor.b32 %r11145, %r30108, %r11446; + xor.b32 %r11146, %r30109, %r11447; + mov.u32 %r11051, 44; + // begin inline asm + shf.l.wrap.b32 %r11044, %r11050, %r11049, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11048, %r11049, %r11050, %r11051; + // end inline asm + mov.u32 %r11059, 20; + // begin inline asm + shf.l.wrap.b32 %r11052, %r11058, %r11057, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11056, %r11057, %r11058, %r11059; + // end inline asm + mov.u32 %r11067, 61; + // begin inline asm + shf.l.wrap.b32 %r11060, %r11066, %r11065, %r11067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11064, %r11065, %r11066, %r11067; + // end inline asm + mov.u32 %r11075, 39; + // begin inline asm + shf.l.wrap.b32 %r11068, %r11074, %r11073, %r11075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11072, %r11073, %r11074, %r11075; + // end inline asm + mov.u32 %r11083, 18; + // begin inline asm + shf.l.wrap.b32 %r11076, %r11082, %r11081, %r11083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11080, %r11081, %r11082, %r11083; + // end inline asm + mov.u32 %r11091, 62; + // begin inline asm + shf.l.wrap.b32 %r11084, %r11090, %r11089, %r11091; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11088, %r11089, %r11090, %r11091; + // end inline asm + mov.u32 %r11099, 43; + // begin inline asm + shf.l.wrap.b32 %r11092, %r11098, %r11097, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11096, %r11097, %r11098, %r11099; + // end inline asm + mov.u32 %r11107, 25; + // begin inline asm + shf.l.wrap.b32 %r11100, %r11106, %r11105, %r11107; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11104, %r11105, %r11106, %r11107; + // end inline asm + mov.u32 %r11115, 8; + // begin inline asm + shf.l.wrap.b32 %r11108, %r11114, %r11113, %r11115; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11112, %r11113, %r11114, %r11115; + // end inline asm + mov.u32 %r11123, 56; + // begin inline asm + shf.l.wrap.b32 %r11116, %r11122, %r11121, %r11123; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11120, %r11121, %r11122, %r11123; + // end inline asm + mov.u32 %r11131, 41; + // begin inline asm + shf.l.wrap.b32 %r11124, %r11130, %r11129, %r11131; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11128, %r11129, %r11130, %r11131; + // end inline asm + mov.u32 %r11139, 27; + // begin inline asm + shf.l.wrap.b32 %r11132, %r11138, %r11137, %r11139; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11136, %r11137, %r11138, %r11139; + // end inline asm + mov.u32 %r11147, 14; + // begin inline asm + shf.l.wrap.b32 %r11140, %r11146, %r11145, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11144, %r11145, %r11146, %r11147; + // end inline asm + mov.u32 %r11155, 2; + // begin inline asm + shf.l.wrap.b32 %r11148, %r11154, %r11153, %r11155; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11152, %r11153, %r11154, %r11155; + // end inline asm + mov.u32 %r11163, 55; + // begin inline asm + shf.l.wrap.b32 %r11156, %r11162, %r11161, %r11163; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11160, %r11161, %r11162, %r11163; + // end inline asm + mov.u32 %r11171, 
45; + // begin inline asm + shf.l.wrap.b32 %r11164, %r11170, %r11169, %r11171; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11168, %r11169, %r11170, %r11171; + // end inline asm + mov.u32 %r11179, 36; + // begin inline asm + shf.l.wrap.b32 %r11172, %r11178, %r11177, %r11179; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11176, %r11177, %r11178, %r11179; + // end inline asm + mov.u32 %r11187, 28; + // begin inline asm + shf.l.wrap.b32 %r11180, %r11186, %r11185, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11184, %r11185, %r11186, %r11187; + // end inline asm + mov.u32 %r11195, 21; + // begin inline asm + shf.l.wrap.b32 %r11188, %r11194, %r11193, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11192, %r11193, %r11194, %r11195; + // end inline asm + mov.u32 %r11203, 15; + // begin inline asm + shf.l.wrap.b32 %r11196, %r11202, %r11201, %r11203; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11200, %r11201, %r11202, %r11203; + // end inline asm + mov.u32 %r11211, 10; + // begin inline asm + shf.l.wrap.b32 %r11204, %r11210, %r11209, %r11211; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11208, %r11209, %r11210, %r11211; + // end inline asm + mov.u32 %r11219, 6; + // begin inline asm + shf.l.wrap.b32 %r11212, %r11218, %r11217, %r11219; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11216, %r11217, %r11218, %r11219; + // end inline asm + mov.u32 %r11227, 3; + // begin inline asm + shf.l.wrap.b32 %r11220, %r11226, %r11225, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11224, %r11225, %r11226, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11228, %r11234, %r11233, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11232, %r11233, %r11234, %r10903; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r11236, %r11271, %r11044, %r11092, 0xD2; + lop3.b32 %r11237, %r11274, %r11048, %r11096, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30156, %r11044, %r11092, %r11188, 0xD2; + lop3.b32 %r30157, %r11048, %r11096, %r11192, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30152, %r11092, %r11188, %r11140, 0xD2; + lop3.b32 %r30153, %r11096, %r11192, %r11144, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30148, %r11188, %r11140, %r11271, 0xD2; + lop3.b32 %r30149, %r11192, %r11144, %r11274, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30146, %r11140, %r11271, %r11044, 0xD2; + lop3.b32 %r30147, %r11144, %r11274, %r11048, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30142, %r11180, %r11052, %r11220, 0xD2; + lop3.b32 %r30143, %r11184, %r11056, %r11224, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30154, %r11052, %r11220, %r11164, 0xD2; + lop3.b32 %r30155, %r11056, %r11224, %r11168, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30150, %r11220, %r11164, %r11060, 0xD2; + lop3.b32 %r30151, %r11224, %r11168, %r11064, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30122, %r11164, %r11060, %r11180, 0xD2; + lop3.b32 %r30123, %r11168, %r11064, %r11184, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r30122, %r30123}; + // begin inline asm + // chi + lop3.b32 %r30114, %r11060, %r11180, %r11052, 0xD2; + lop3.b32 %r30115, %r11064, %r11184, %r11056, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r30114, %r30115}; + // begin 
inline asm + // chi + lop3.b32 %r30140, %r11228, %r11212, %r11100, 0xD2; + lop3.b32 %r30141, %r11232, %r11216, %r11104, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+104], {%r30140, %r30141}; + // begin inline asm + // chi + lop3.b32 %r30134, %r11212, %r11100, %r11108, 0xD2; + lop3.b32 %r30135, %r11216, %r11104, %r11112, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+112], {%r30134, %r30135}; + // begin inline asm + // chi + lop3.b32 %r30128, %r11100, %r11108, %r11076, 0xD2; + lop3.b32 %r30129, %r11104, %r11112, %r11080, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+120], {%r30128, %r30129}; + // begin inline asm + // chi + lop3.b32 %r30120, %r11108, %r11076, %r11228, 0xD2; + lop3.b32 %r30121, %r11112, %r11080, %r11232, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+128], {%r30120, %r30121}; + // begin inline asm + // chi + lop3.b32 %r30112, %r11076, %r11228, %r11212, 0xD2; + lop3.b32 %r30113, %r11080, %r11232, %r11216, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+136], {%r30112, %r30113}; + // begin inline asm + // chi + lop3.b32 %r30138, %r11132, %r11172, %r11204, 0xD2; + lop3.b32 %r30139, %r11136, %r11176, %r11208, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+144], {%r30138, %r30139}; + // begin inline asm + // chi + lop3.b32 %r30132, %r11172, %r11204, %r11196, 0xD2; + lop3.b32 %r30133, %r11176, %r11208, %r11200, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+152], {%r30132, %r30133}; + // begin inline asm + // chi + lop3.b32 %r30126, %r11204, %r11196, %r11116, 0xD2; + lop3.b32 %r30127, %r11208, %r11200, %r11120, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+160], {%r30126, %r30127}; + // begin inline asm + // chi + lop3.b32 %r30118, %r11196, %r11116, %r11132, 0xD2; + lop3.b32 %r30119, %r11200, %r11120, %r11136, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+168], {%r30118, %r30119}; + // begin inline asm + // chi + lop3.b32 %r30110, %r11116, %r11132, %r11172, 0xD2; + lop3.b32 %r30111, %r11120, %r11136, %r11176, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+176], {%r30110, %r30111}; + // begin inline asm + // chi + lop3.b32 %r30136, %r11084, %r11156, %r11068, 0xD2; + lop3.b32 %r30137, %r11088, %r11160, %r11072, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+184], {%r30136, %r30137}; + // begin inline asm + // chi + lop3.b32 %r30130, %r11156, %r11068, %r11124, 0xD2; + lop3.b32 %r30131, %r11160, %r11072, %r11128, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+192], {%r30130, %r30131}; + // begin inline asm + // chi + lop3.b32 %r30124, %r11068, %r11124, %r11148, 0xD2; + lop3.b32 %r30125, %r11072, %r11128, %r11152, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+200], {%r30124, %r30125}; + // begin inline asm + // chi + lop3.b32 %r30116, %r11124, %r11148, %r11084, 0xD2; + lop3.b32 %r30117, %r11128, %r11152, %r11088, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+208], {%r30116, %r30117}; + // begin inline asm + // chi + lop3.b32 %r30108, %r11148, %r11084, %r11156, 0xD2; + lop3.b32 %r30109, %r11152, %r11088, %r11160, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+216], {%r30108, %r30109}; + mul.wide.s32 %rd601, %r30158, 8; + add.s64 %rd600, %rd580, %rd601; + // begin inline asm + ld.global.nc.v2.u32 {%r11436,%r11437}, [%rd600]; + // end inline asm + xor.b32 %r30144, %r11236, %r11436; + xor.b32 %r30145, %r11237, %r11437; + add.s32 %r30158, %r30158, 1; + setp.lt.u32 %p20, %r30158, 23; + @%p20 bra $L__BB2_28; + + mov.u32 %r30191, 0; + mov.u32 %r11547, 1; + st.local.v2.u32 [%rd82+32], {%r30156, %r30157}; + st.local.v2.u32 
[%rd82+72], {%r30154, %r30155}; + st.local.v2.u32 [%rd82+40], {%r30152, %r30153}; + st.local.v2.u32 [%rd82+80], {%r30150, %r30151}; + st.local.v2.u32 [%rd82+48], {%r30148, %r30149}; + st.local.v2.u32 [%rd82+56], {%r30146, %r30147}; + st.local.v2.u32 [%rd82+24], {%r30144, %r30145}; + // begin inline asm + // xor5 + lop3.b32 %r11448, %r30144, %r30142, %r30140, 0x96; + lop3.b32 %r11448, %r11448, %r30138, %r30136, 0x96; + lop3.b32 %r11449, %r30145, %r30143, %r30141, 0x96; + lop3.b32 %r11449, %r11449, %r30139, %r30137, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11460, %r30156, %r30154, %r30134, 0x96; + lop3.b32 %r11460, %r11460, %r30132, %r30130, 0x96; + lop3.b32 %r11461, %r30157, %r30155, %r30135, 0x96; + lop3.b32 %r11461, %r11461, %r30133, %r30131, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11472, %r30152, %r30150, %r30128, 0x96; + lop3.b32 %r11472, %r11472, %r30126, %r30124, 0x96; + lop3.b32 %r11473, %r30153, %r30151, %r30129, 0x96; + lop3.b32 %r11473, %r11473, %r30127, %r30125, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11484, %r30148, %r30122, %r30120, 0x96; + lop3.b32 %r11484, %r11484, %r30118, %r30116, 0x96; + lop3.b32 %r11485, %r30149, %r30123, %r30121, 0x96; + lop3.b32 %r11485, %r11485, %r30119, %r30117, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11496, %r30146, %r30114, %r30112, 0x96; + lop3.b32 %r11496, %r11496, %r30110, %r30108, 0x96; + lop3.b32 %r11497, %r30147, %r30115, %r30113, 0x96; + lop3.b32 %r11497, %r11497, %r30111, %r30109, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11508, %r11461, %r11460, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11512, %r11460, %r11461, %r11547; + // end inline asm + xor.b32 %r11687, %r11508, %r11496; + xor.b32 %r11688, %r11512, %r11497; + xor.b32 %r11655, %r30144, %r11687; + xor.b32 %r11658, %r30145, %r11688; + xor.b32 %r11618, %r30141, %r11688; + xor.b32 %r11617, %r30140, %r11687; + st.local.v2.u32 [%rd82+104], {%r11617, %r11618}; + // begin inline asm + shf.l.wrap.b32 %r11516, %r11473, %r11472, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11520, %r11472, %r11473, %r11547; + // end inline asm + xor.b32 %r11689, %r11516, %r11448; + xor.b32 %r11690, %r11520, %r11449; + xor.b32 %r11554, %r30154, %r11689; + xor.b32 %r11553, %r30155, %r11690; + xor.b32 %r11593, %r30133, %r11690; + xor.b32 %r11594, %r30132, %r11689; + st.local.v2.u32 [%rd82+152], {%r11594, %r11593}; + // begin inline asm + shf.l.wrap.b32 %r11524, %r11485, %r11484, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11528, %r11484, %r11485, %r11547; + // end inline asm + xor.b32 %r11691, %r11524, %r11460; + xor.b32 %r11692, %r11528, %r11461; + xor.b32 %r11577, %r30129, %r11692; + xor.b32 %r11578, %r30128, %r11691; + st.local.v2.u32 [%rd82+120], {%r11578, %r11577}; + xor.b32 %r11569, %r30125, %r11692; + xor.b32 %r11570, %r30124, %r11691; + st.local.v2.u32 [%rd82+200], {%r11570, %r11569}; + // begin inline asm + shf.l.wrap.b32 %r11532, %r11497, %r11496, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11536, %r11496, %r11497, %r11547; + // end inline asm + xor.b32 %r11693, %r11532, %r11472; + xor.b32 %r11694, %r11536, %r11473; + xor.b32 %r11601, %r30148, %r11693; + xor.b32 %r11602, %r30149, %r11694; + xor.b32 %r11610, %r30119, %r11694; + xor.b32 %r11609, %r30118, %r11693; + st.local.v2.u32 [%rd82+168], {%r11609, %r11610}; + // begin inline asm + shf.l.wrap.b32 %r11540, 
%r11449, %r11448, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11544, %r11448, %r11449, %r11547; + // end inline asm + xor.b32 %r11695, %r11540, %r11484; + xor.b32 %r11696, %r11544, %r11485; + xor.b32 %r11561, %r30114, %r11695; + xor.b32 %r11562, %r30115, %r11696; + xor.b32 %r11586, %r30109, %r11696; + xor.b32 %r11585, %r30108, %r11695; + st.local.v2.u32 [%rd82+216], {%r11585, %r11586}; + // begin inline asm + shf.l.wrap.b32 %r11548, %r11554, %r11553, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11552, %r11553, %r11554, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11556, %r11562, %r11561, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11560, %r11561, %r11562, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11568, %r11569, %r11570, %r11067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11564, %r11570, %r11569, %r11067; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r11564, %r11568}; + // begin inline asm + shf.l.wrap.b32 %r11572, %r11578, %r11577, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11576, %r11577, %r11578, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11580, %r11586, %r11585, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11584, %r11585, %r11586, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11592, %r11593, %r11594, %r11171; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11588, %r11594, %r11593, %r11171; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r11588, %r11592}; + // begin inline asm + shf.l.wrap.b32 %r11596, %r11602, %r11601, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11600, %r11601, %r11602, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11604, %r11610, %r11609, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11608, %r11609, %r11610, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11612, %r11618, %r11617, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11616, %r11617, %r11618, %r11227; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r11620, %r11655, %r11548, %r11572, 0xD2; + lop3.b32 %r11621, %r11658, %r11552, %r11576, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30291, %r11548, %r11572, %r11604, 0xD2; + lop3.b32 %r30292, %r11552, %r11576, %r11608, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+32], {%r30291, %r30292}; + // begin inline asm + // chi + lop3.b32 %r30287, %r11572, %r11604, %r11580, 0xD2; + lop3.b32 %r30288, %r11576, %r11608, %r11584, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+40], {%r30287, %r30288}; + // begin inline asm + // chi + lop3.b32 %r30283, %r11604, %r11580, %r11655, 0xD2; + lop3.b32 %r30284, %r11608, %r11584, %r11658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+48], {%r30283, %r30284}; + // begin inline asm + // chi + lop3.b32 %r30281, %r11580, %r11655, %r11548, 0xD2; + lop3.b32 %r30282, %r11584, %r11658, %r11552, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+56], {%r30281, %r30282}; + // begin inline asm + // chi + lop3.b32 %r30277, %r11596, %r11556, %r11612, 0xD2; + lop3.b32 %r30278, %r11600, %r11560, %r11616, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+64], {%r30277, %r30278}; + // begin inline asm + // chi + lop3.b32 %r30289, %r11556, %r11612, %r11588, 0xD2; + lop3.b32 %r30290, %r11560, %r11616, %r11592, 0xD2; 
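(The generated PTX above has a recognizable structure: the "xor5" blocks — lop3.b32 with immediate 0x96, a three-input XOR — compute the theta column parities; the paired shf.l.wrap.b32 funnel shifts perform 64-bit rotations on each lane's 32-bit halves, using the Keccak rho offsets visible as mov.u32 immediates (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, plus the rotate-by-1 kept in a register); and the "chi" blocks — lop3.b32 with immediate 0xD2, i.e. a ^ (~b & c) — are the chi step. The 23-iteration loop at $L__BB2_28 plus the unrolled round after it match the 24 rounds of Keccak-f[1600], with each iota round constant fetched via ld.global.nc.v2.u32. For readability, here is a minimal CUDA sketch of one such round; keccak_round, RHO, PI and rotl64 are illustrative names, not identifiers from this repository.)

#include <stdint.h>

// 64-bit rotate; the PTX expresses this as a pair of shf.l.wrap.b32
// funnel shifts on the lane's two 32-bit halves.
__device__ __forceinline__ uint64_t rotl64(uint64_t x, unsigned n) {
    return (x << n) | (x >> (64u - n));
}

// Rho rotation offsets and pi lane order, in the order the pi step
// visits lanes — the same immediates that appear in the asm above.
__constant__ unsigned RHO[24] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
                                  27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 };
__constant__ unsigned PI[24]  = { 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
                                  15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 };

__device__ void keccak_round(uint64_t a[25], uint64_t rc) {
    uint64_t c[5], d;
    // theta: the five-way XORs tagged "// xor5" (lop3.b32 ..., 0x96)
    for (int x = 0; x < 5; ++x)
        c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
    for (int x = 0; x < 5; ++x) {
        d = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
        for (int y = 0; y < 25; y += 5) a[x + y] ^= d;
    }
    // rho + pi: the chain of shf.l.wrap rotations by the offsets above
    uint64_t last = a[1];
    for (int i = 0; i < 24; ++i) {
        uint64_t t = a[PI[i]];
        a[PI[i]] = rotl64(last, RHO[i]);
        last = t;
    }
    // chi: lop3.b32 with immediate 0xD2 computes a ^ (~b & c)
    for (int y = 0; y < 25; y += 5) {
        for (int x = 0; x < 5; ++x) c[x] = a[y + x];
        for (int x = 0; x < 5; ++x)
            a[y + x] = c[x] ^ (~c[(x + 1) % 5] & c[(x + 2) % 5]);
    }
    // iota: matches the per-round ld.global.nc.v2.u32 constant load above
    a[0] ^= rc;
}

(Per round, the compiler maps each XOR and chi term to a single LOP3 and each rotation to two funnel shifts, which matches the shape of the fully unrolled asm above.)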
+ // end inline asm + st.local.v2.u32 [%rd82+72], {%r30289, %r30290}; + // begin inline asm + // chi + lop3.b32 %r30285, %r11612, %r11588, %r11564, 0xD2; + lop3.b32 %r30286, %r11616, %r11592, %r11568, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+80], {%r30285, %r30286}; + // begin inline asm + ld.global.nc.v2.u32 {%r11684,%r11685}, [%rd581]; + // end inline asm + xor.b32 %r30279, %r11620, %r11684; + xor.b32 %r30280, %r11621, %r11685; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + add.s64 %rd84, %rd82, 24; + add.s64 %rd85, %rd2, 24; + +$L__BB2_30: + cvta.to.global.u64 %rd1271, %rd361; + shl.b32 %r11697, %r30191, 2; + cvt.u64.u32 %rd611, %r11697; + and.b64 %rd612, %rd611, 60; + add.s64 %rd613, %rd85, %rd612; + xor.b32 %r11698, %r47, %r30191; + mul.lo.s32 %r11699, %r11698, 16777619; + ld.local.u32 %r11700, [%rd613]; + xor.b32 %r11701, %r11699, %r11700; + mul.wide.u32 %rd614, %r11701, -954391867; + shr.u64 %rd615, %rd614, 32; + cvt.u32.u64 %r11702, %rd615; + sub.s32 %r11703, %r11701, %r11702; + shr.u32 %r11704, %r11703, 1; + add.s32 %r11705, %r11704, %r11702; + shr.u32 %r11706, %r11705, 20; + mul.lo.s32 %r11707, %r11706, 1179641; + sub.s32 %r11708, %r11701, %r11707; + mul.wide.u32 %rd616, %r11708, 64; + add.s64 %rd617, %rd1271, %rd616; + mul.lo.s32 %r11709, %r30228, 16777619; + ld.global.u32 %r11710, [%rd617]; + xor.b32 %r30228, %r11709, %r11710; + mul.lo.s32 %r11711, %r30229, 16777619; + ld.global.u32 %r11712, [%rd617+4]; + xor.b32 %r30229, %r11711, %r11712; + mul.lo.s32 %r11713, %r30240, 16777619; + ld.global.u32 %r11714, [%rd617+8]; + mul.lo.s32 %r11715, %r30241, 16777619; + ld.global.u32 %r11716, [%rd617+12]; + xor.b32 %r11717, %r11715, %r11716; + xor.b32 %r30240, %r11713, %r11714; + mov.b64 %rd618, {%r30240, %r11717}; + mul.lo.s32 %r11718, %r30236, 16777619; + ld.global.u32 %r11719, [%rd617+16]; + mul.lo.s32 %r11720, %r30237, 16777619; + ld.global.u32 %r11721, [%rd617+20]; + xor.b32 %r11722, %r11720, %r11721; + xor.b32 %r30236, %r11718, %r11719; + mov.b64 %rd619, {%r30236, %r11722}; + mul.lo.s32 %r11723, %r30232, 16777619; + ld.global.u32 %r11724, [%rd617+24]; + mul.lo.s32 %r11725, %r30233, 16777619; + ld.global.u32 %r11726, [%rd617+28]; + xor.b32 %r11727, %r11725, %r11726; + xor.b32 %r30232, %r11723, %r11724; + mov.b64 %rd620, {%r30232, %r11727}; + mul.lo.s32 %r11728, %r30230, 16777619; + ld.global.u32 %r11729, [%rd617+32]; + mul.lo.s32 %r11730, %r30231, 16777619; + ld.global.u32 %r11731, [%rd617+36]; + xor.b32 %r11732, %r11730, %r11731; + xor.b32 %r30230, %r11728, %r11729; + mov.b64 %rd621, {%r30230, %r11732}; + mul.lo.s32 %r11733, %r30226, 16777619; + ld.global.u32 %r11734, [%rd617+40]; + xor.b32 %r30226, %r11733, %r11734; + mul.lo.s32 %r11735, %r30227, 16777619; + ld.global.u32 %r11736, [%rd617+44]; + xor.b32 %r30227, %r11735, %r11736; + mul.lo.s32 %r11737, %r30238, 16777619; + ld.global.u32 %r11738, [%rd617+48]; + mul.lo.s32 %r11739, %r30239, 16777619; + ld.global.u32 %r11740, [%rd617+52]; + xor.b32 %r11741, %r11739, %r11740; + xor.b32 %r30238, %r11737, %r11738; + mov.b64 %rd622, {%r30238, %r11741}; + mul.lo.s32 %r11742, %r30234, 16777619; + ld.global.u32 %r11743, [%rd617+56]; + mul.lo.s32 %r11744, %r30235, 16777619; + ld.global.u32 %r11745, [%rd617+60]; + xor.b32 %r11746, %r11744, %r11745; + xor.b32 %r30234, %r11742, %r11743; + mov.b64 %rd623, {%r30234, %r11746}; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + st.local.v2.u32 [%rd2+32], {%r30240, %r11717}; + st.local.v2.u32 [%rd2+40], {%r30236, %r11722}; + st.local.v2.u32 [%rd2+48], {%r30232, %r11727}; + 
st.local.v2.u32 [%rd2+56], {%r30230, %r11732}; + st.local.v2.u32 [%rd2+64], {%r30226, %r30227}; + st.local.v2.u32 [%rd2+72], {%r30238, %r11741}; + st.local.v2.u32 [%rd2+80], {%r30234, %r11746}; + add.s64 %rd624, %rd84, %rd612; + xor.b32 %r11747, %r48, %r30191; + mul.lo.s32 %r11748, %r11747, 16777619; + ld.local.u32 %r11749, [%rd624]; + xor.b32 %r11750, %r11748, %r11749; + mul.wide.u32 %rd625, %r11750, -954391867; + shr.u64 %rd626, %rd625, 32; + cvt.u32.u64 %r11751, %rd626; + sub.s32 %r11752, %r11750, %r11751; + shr.u32 %r11753, %r11752, 1; + add.s32 %r11754, %r11753, %r11751; + shr.u32 %r11755, %r11754, 20; + mul.lo.s32 %r11756, %r11755, 1179641; + sub.s32 %r11757, %r11750, %r11756; + mul.wide.u32 %rd627, %r11757, 64; + add.s64 %rd628, %rd1271, %rd627; + mul.lo.s32 %r11758, %r30279, 16777619; + ld.global.u32 %r11759, [%rd628]; + xor.b32 %r30279, %r11758, %r11759; + mul.lo.s32 %r11760, %r30280, 16777619; + ld.global.u32 %r11761, [%rd628+4]; + xor.b32 %r30280, %r11760, %r11761; + mul.lo.s32 %r11762, %r30291, 16777619; + ld.global.u32 %r11763, [%rd628+8]; + mul.lo.s32 %r11764, %r30292, 16777619; + ld.global.u32 %r11765, [%rd628+12]; + xor.b32 %r11766, %r11764, %r11765; + xor.b32 %r30291, %r11762, %r11763; + mov.b64 %rd629, {%r30291, %r11766}; + mul.lo.s32 %r11767, %r30287, 16777619; + ld.global.u32 %r11768, [%rd628+16]; + mul.lo.s32 %r11769, %r30288, 16777619; + ld.global.u32 %r11770, [%rd628+20]; + xor.b32 %r11771, %r11769, %r11770; + xor.b32 %r30287, %r11767, %r11768; + mov.b64 %rd630, {%r30287, %r11771}; + mul.lo.s32 %r11772, %r30283, 16777619; + ld.global.u32 %r11773, [%rd628+24]; + mul.lo.s32 %r11774, %r30284, 16777619; + ld.global.u32 %r11775, [%rd628+28]; + xor.b32 %r11776, %r11774, %r11775; + xor.b32 %r30283, %r11772, %r11773; + mov.b64 %rd631, {%r30283, %r11776}; + mul.lo.s32 %r11777, %r30281, 16777619; + ld.global.u32 %r11778, [%rd628+32]; + mul.lo.s32 %r11779, %r30282, 16777619; + ld.global.u32 %r11780, [%rd628+36]; + xor.b32 %r11781, %r11779, %r11780; + xor.b32 %r30281, %r11777, %r11778; + mov.b64 %rd632, {%r30281, %r11781}; + mul.lo.s32 %r11782, %r30277, 16777619; + ld.global.u32 %r11783, [%rd628+40]; + xor.b32 %r30277, %r11782, %r11783; + mul.lo.s32 %r11784, %r30278, 16777619; + ld.global.u32 %r11785, [%rd628+44]; + xor.b32 %r30278, %r11784, %r11785; + mul.lo.s32 %r11786, %r30289, 16777619; + ld.global.u32 %r11787, [%rd628+48]; + mul.lo.s32 %r11788, %r30290, 16777619; + ld.global.u32 %r11789, [%rd628+52]; + xor.b32 %r11790, %r11788, %r11789; + xor.b32 %r30289, %r11786, %r11787; + mov.b64 %rd633, {%r30289, %r11790}; + mul.lo.s32 %r11791, %r30285, 16777619; + ld.global.u32 %r11792, [%rd628+56]; + mul.lo.s32 %r11793, %r30286, 16777619; + ld.global.u32 %r11794, [%rd628+60]; + xor.b32 %r11795, %r11793, %r11794; + xor.b32 %r30285, %r11791, %r11792; + mov.b64 %rd634, {%r30285, %r11795}; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + st.local.v2.u32 [%rd82+32], {%r30291, %r11766}; + st.local.v2.u32 [%rd82+40], {%r30287, %r11771}; + st.local.v2.u32 [%rd82+48], {%r30283, %r11776}; + st.local.v2.u32 [%rd82+56], {%r30281, %r11781}; + st.local.v2.u32 [%rd82+64], {%r30277, %r30278}; + st.local.v2.u32 [%rd82+72], {%r30289, %r11790}; + st.local.v2.u32 [%rd82+80], {%r30285, %r11795}; + add.s32 %r30191, %r30191, 1; + setp.lt.u32 %p21, %r30191, 512; + shr.u64 %rd635, %rd618, 32; + cvt.u32.u64 %r30241, %rd635; + shr.u64 %rd636, %rd619, 32; + cvt.u32.u64 %r30237, %rd636; + shr.u64 %rd637, %rd620, 32; + cvt.u32.u64 %r30233, %rd637; + shr.u64 %rd638, %rd621, 32; + cvt.u32.u64 %r30231, %rd638; + 
shr.u64 %rd639, %rd622, 32; + cvt.u32.u64 %r30239, %rd639; + shr.u64 %rd640, %rd623, 32; + cvt.u32.u64 %r30235, %rd640; + shr.u64 %rd641, %rd629, 32; + cvt.u32.u64 %r30292, %rd641; + shr.u64 %rd642, %rd630, 32; + cvt.u32.u64 %r30288, %rd642; + shr.u64 %rd643, %rd631, 32; + cvt.u32.u64 %r30284, %rd643; + shr.u64 %rd644, %rd632, 32; + cvt.u32.u64 %r30282, %rd644; + shr.u64 %rd645, %rd633, 32; + cvt.u32.u64 %r30290, %rd645; + shr.u64 %rd646, %rd634, 32; + cvt.u32.u64 %r30286, %rd646; + @%p21 bra $L__BB2_30; + + mov.u32 %r30192, 0; + st.local.v2.u32 [%rd2+96], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+104], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+112], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+120], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+128], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+136], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+144], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+152], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+160], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+168], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+176], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+184], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+192], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+200], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+208], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+216], {%r30192, %r30192}; + mov.u32 %r30207, -2147483648; + mov.u32 %r11810, 1; + st.local.v2.u32 [%rd2+88], {%r11810, %r30207}; + mov.u32 %r30193, %r30192; + mov.u32 %r30194, %r30192; + mov.u32 %r30195, %r30192; + mov.u32 %r30196, %r30192; + mov.u32 %r30197, %r30192; + mov.u32 %r30198, %r30192; + mov.u32 %r30199, %r30192; + mov.u32 %r30200, %r30192; + mov.u32 %r30201, %r30192; + mov.u32 %r30202, %r30192; + mov.u32 %r30203, %r30192; + mov.u32 %r30204, %r30192; + mov.u32 %r30205, %r30192; + mov.u32 %r30206, %r11810; + mov.u32 %r30208, %r30192; + mov.u32 %r30209, %r30192; + mov.u32 %r30210, %r30192; + mov.u32 %r30211, %r30192; + mov.u32 %r30212, %r30192; + mov.u32 %r30213, %r30192; + mov.u32 %r30214, %r30192; + mov.u32 %r30215, %r30192; + mov.u32 %r30216, %r30192; + mov.u32 %r30217, %r30192; + mov.u32 %r30218, %r30192; + mov.u32 %r30219, %r30192; + mov.u32 %r30220, %r30192; + mov.u32 %r30221, %r30192; + mov.u32 %r30222, %r30192; + mov.u32 %r30223, %r30192; + mov.u32 %r30224, %r30192; + mov.u32 %r30225, %r30192; + mov.u32 %r30242, %r30192; + +$L__BB2_32: + // begin inline asm + // xor5 + lop3.b32 %r11837, %r30228, %r30226, %r30224, 0x96; + lop3.b32 %r11837, %r11837, %r30222, %r30220, 0x96; + lop3.b32 %r11838, %r30229, %r30227, %r30225, 0x96; + lop3.b32 %r11838, %r11838, %r30223, %r30221, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11849, %r30240, %r30238, %r30218, 0x96; + lop3.b32 %r11849, %r11849, %r30216, %r30214, 0x96; + lop3.b32 %r11850, %r30241, %r30239, %r30219, 0x96; + lop3.b32 %r11850, %r11850, %r30217, %r30215, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11861, %r30236, %r30234, %r30212, 0x96; + lop3.b32 %r11861, %r11861, %r30210, %r30208, 0x96; + lop3.b32 %r11862, %r30237, %r30235, %r30213, 0x96; + lop3.b32 %r11862, %r11862, %r30211, %r30209, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11873, %r30232, %r30206, %r30204, 0x96; + lop3.b32 %r11873, %r11873, %r30202, %r30200, 0x96; + lop3.b32 %r11874, %r30233, %r30207, %r30205, 0x96; + lop3.b32 %r11874, %r11874, %r30203, %r30201, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11885, %r30230, %r30198, %r30196, 0x96; + lop3.b32 %r11885, %r11885, %r30194, 
%r30192, 0x96; + lop3.b32 %r11886, %r30231, %r30199, %r30197, 0x96; + lop3.b32 %r11886, %r11886, %r30195, %r30193, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11897, %r11850, %r11849, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11901, %r11849, %r11850, %r11810; + // end inline asm + xor.b32 %r12331, %r11897, %r11885; + xor.b32 %r12332, %r11901, %r11886; + xor.b32 %r12164, %r30228, %r12331; + xor.b32 %r12167, %r30229, %r12332; + xor.b32 %r12071, %r30226, %r12331; + xor.b32 %r12070, %r30227, %r12332; + xor.b32 %r12118, %r30224, %r12331; + xor.b32 %r12119, %r30225, %r12332; + xor.b32 %r12023, %r30222, %r12331; + xor.b32 %r12022, %r30223, %r12332; + xor.b32 %r11974, %r30220, %r12331; + xor.b32 %r11975, %r30221, %r12332; + // begin inline asm + shf.l.wrap.b32 %r11905, %r11862, %r11861, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11909, %r11861, %r11862, %r11810; + // end inline asm + xor.b32 %r12333, %r11905, %r11837; + xor.b32 %r12334, %r11909, %r11838; + xor.b32 %r12126, %r30240, %r12333; + xor.b32 %r12127, %r30241, %r12334; + xor.b32 %r11943, %r30238, %r12333; + xor.b32 %r11942, %r30239, %r12334; + xor.b32 %r12102, %r30218, %r12333; + xor.b32 %r12103, %r30219, %r12334; + xor.b32 %r12063, %r30216, %r12333; + xor.b32 %r12062, %r30217, %r12334; + xor.b32 %r12046, %r30214, %r12333; + xor.b32 %r12047, %r30215, %r12334; + // begin inline asm + shf.l.wrap.b32 %r11913, %r11874, %r11873, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11917, %r11873, %r11874, %r11810; + // end inline asm + xor.b32 %r12335, %r11913, %r11849; + xor.b32 %r12336, %r11917, %r11850; + xor.b32 %r11983, %r30236, %r12335; + xor.b32 %r11982, %r30237, %r12336; + xor.b32 %r12110, %r30234, %r12335; + xor.b32 %r12111, %r30235, %r12336; + xor.b32 %r11991, %r30212, %r12335; + xor.b32 %r11990, %r30213, %r12336; + xor.b32 %r12094, %r30210, %r12335; + xor.b32 %r12095, %r30211, %r12336; + xor.b32 %r11959, %r30208, %r12335; + xor.b32 %r11958, %r30209, %r12336; + // begin inline asm + shf.l.wrap.b32 %r11921, %r11886, %r11885, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11925, %r11885, %r11886, %r11810; + // end inline asm + xor.b32 %r12337, %r11921, %r11861; + xor.b32 %r12338, %r11925, %r11862; + xor.b32 %r12078, %r30232, %r12337; + xor.b32 %r12079, %r30233, %r12338; + xor.b32 %r12055, %r30206, %r12337; + xor.b32 %r12054, %r30207, %r12338; + xor.b32 %r11998, %r30204, %r12337; + xor.b32 %r11999, %r30205, %r12338; + xor.b32 %r12086, %r30202, %r12337; + xor.b32 %r12087, %r30203, %r12338; + xor.b32 %r12015, %r30200, %r12337; + xor.b32 %r12014, %r30201, %r12338; + // begin inline asm + shf.l.wrap.b32 %r11929, %r11838, %r11837, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11933, %r11837, %r11838, %r11810; + // end inline asm + xor.b32 %r12339, %r11929, %r11873; + xor.b32 %r12340, %r11933, %r11874; + xor.b32 %r12030, %r30230, %r12339; + xor.b32 %r12031, %r30231, %r12340; + xor.b32 %r11950, %r30198, %r12339; + xor.b32 %r11951, %r30199, %r12340; + xor.b32 %r11967, %r30196, %r12339; + xor.b32 %r11966, %r30197, %r12340; + xor.b32 %r12006, %r30194, %r12339; + xor.b32 %r12007, %r30195, %r12340; + xor.b32 %r12038, %r30192, %r12339; + xor.b32 %r12039, %r30193, %r12340; + mov.u32 %r11944, 44; + // begin inline asm + shf.l.wrap.b32 %r11937, %r11943, %r11942, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11941, %r11942, %r11943, %r11944; + // end inline asm + mov.u32 %r11952, 20; + // 
begin inline asm + shf.l.wrap.b32 %r11945, %r11951, %r11950, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11949, %r11950, %r11951, %r11952; + // end inline asm + mov.u32 %r11960, 61; + // begin inline asm + shf.l.wrap.b32 %r11953, %r11959, %r11958, %r11960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11957, %r11958, %r11959, %r11960; + // end inline asm + mov.u32 %r11968, 39; + // begin inline asm + shf.l.wrap.b32 %r11961, %r11967, %r11966, %r11968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11965, %r11966, %r11967, %r11968; + // end inline asm + mov.u32 %r11976, 18; + // begin inline asm + shf.l.wrap.b32 %r11969, %r11975, %r11974, %r11976; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11973, %r11974, %r11975, %r11976; + // end inline asm + mov.u32 %r11984, 62; + // begin inline asm + shf.l.wrap.b32 %r11977, %r11983, %r11982, %r11984; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11981, %r11982, %r11983, %r11984; + // end inline asm + mov.u32 %r11992, 43; + // begin inline asm + shf.l.wrap.b32 %r11985, %r11991, %r11990, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11989, %r11990, %r11991, %r11992; + // end inline asm + mov.u32 %r12000, 25; + // begin inline asm + shf.l.wrap.b32 %r11993, %r11999, %r11998, %r12000; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11997, %r11998, %r11999, %r12000; + // end inline asm + mov.u32 %r12008, 8; + // begin inline asm + shf.l.wrap.b32 %r12001, %r12007, %r12006, %r12008; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12005, %r12006, %r12007, %r12008; + // end inline asm + mov.u32 %r12016, 56; + // begin inline asm + shf.l.wrap.b32 %r12009, %r12015, %r12014, %r12016; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12013, %r12014, %r12015, %r12016; + // end inline asm + mov.u32 %r12024, 41; + // begin inline asm + shf.l.wrap.b32 %r12017, %r12023, %r12022, %r12024; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12021, %r12022, %r12023, %r12024; + // end inline asm + mov.u32 %r12032, 27; + // begin inline asm + shf.l.wrap.b32 %r12025, %r12031, %r12030, %r12032; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12029, %r12030, %r12031, %r12032; + // end inline asm + mov.u32 %r12040, 14; + // begin inline asm + shf.l.wrap.b32 %r12033, %r12039, %r12038, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12037, %r12038, %r12039, %r12040; + // end inline asm + mov.u32 %r12048, 2; + // begin inline asm + shf.l.wrap.b32 %r12041, %r12047, %r12046, %r12048; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12045, %r12046, %r12047, %r12048; + // end inline asm + mov.u32 %r12056, 55; + // begin inline asm + shf.l.wrap.b32 %r12049, %r12055, %r12054, %r12056; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12053, %r12054, %r12055, %r12056; + // end inline asm + mov.u32 %r12064, 45; + // begin inline asm + shf.l.wrap.b32 %r12057, %r12063, %r12062, %r12064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12061, %r12062, %r12063, %r12064; + // end inline asm + mov.u32 %r12072, 36; + // begin inline asm + shf.l.wrap.b32 %r12065, %r12071, %r12070, %r12072; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12069, %r12070, %r12071, %r12072; + // end inline asm + mov.u32 %r12080, 28; + // begin inline asm + shf.l.wrap.b32 %r12073, %r12079, %r12078, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12077, %r12078, 
%r12079, %r12080; + // end inline asm + mov.u32 %r12088, 21; + // begin inline asm + shf.l.wrap.b32 %r12081, %r12087, %r12086, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12085, %r12086, %r12087, %r12088; + // end inline asm + mov.u32 %r12096, 15; + // begin inline asm + shf.l.wrap.b32 %r12089, %r12095, %r12094, %r12096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12093, %r12094, %r12095, %r12096; + // end inline asm + mov.u32 %r12104, 10; + // begin inline asm + shf.l.wrap.b32 %r12097, %r12103, %r12102, %r12104; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12101, %r12102, %r12103, %r12104; + // end inline asm + mov.u32 %r12112, 6; + // begin inline asm + shf.l.wrap.b32 %r12105, %r12111, %r12110, %r12112; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12109, %r12110, %r12111, %r12112; + // end inline asm + mov.u32 %r12120, 3; + // begin inline asm + shf.l.wrap.b32 %r12113, %r12119, %r12118, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12117, %r12118, %r12119, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12121, %r12127, %r12126, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12125, %r12126, %r12127, %r11810; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12129, %r12164, %r11937, %r11985, 0xD2; + lop3.b32 %r12130, %r12167, %r11941, %r11989, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30240, %r11937, %r11985, %r12081, 0xD2; + lop3.b32 %r30241, %r11941, %r11989, %r12085, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30236, %r11985, %r12081, %r12033, 0xD2; + lop3.b32 %r30237, %r11989, %r12085, %r12037, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30232, %r12081, %r12033, %r12164, 0xD2; + lop3.b32 %r30233, %r12085, %r12037, %r12167, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30230, %r12033, %r12164, %r11937, 0xD2; + lop3.b32 %r30231, %r12037, %r12167, %r11941, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30226, %r12073, %r11945, %r12113, 0xD2; + lop3.b32 %r30227, %r12077, %r11949, %r12117, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30238, %r11945, %r12113, %r12057, 0xD2; + lop3.b32 %r30239, %r11949, %r12117, %r12061, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30234, %r12113, %r12057, %r11953, 0xD2; + lop3.b32 %r30235, %r12117, %r12061, %r11957, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30206, %r12057, %r11953, %r12073, 0xD2; + lop3.b32 %r30207, %r12061, %r11957, %r12077, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30206, %r30207}; + // begin inline asm + // chi + lop3.b32 %r30198, %r11953, %r12073, %r11945, 0xD2; + lop3.b32 %r30199, %r11957, %r12077, %r11949, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30198, %r30199}; + // begin inline asm + // chi + lop3.b32 %r30224, %r12121, %r12105, %r11993, 0xD2; + lop3.b32 %r30225, %r12125, %r12109, %r11997, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30224, %r30225}; + // begin inline asm + // chi + lop3.b32 %r30218, %r12105, %r11993, %r12001, 0xD2; + lop3.b32 %r30219, %r12109, %r11997, %r12005, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30218, %r30219}; + // begin inline asm + // chi + lop3.b32 %r30212, %r11993, %r12001, %r11969, 0xD2; + lop3.b32 %r30213, %r11997, %r12005, %r11973, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd2+120], {%r30212, %r30213}; + // begin inline asm + // chi + lop3.b32 %r30204, %r12001, %r11969, %r12121, 0xD2; + lop3.b32 %r30205, %r12005, %r11973, %r12125, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30204, %r30205}; + // begin inline asm + // chi + lop3.b32 %r30196, %r11969, %r12121, %r12105, 0xD2; + lop3.b32 %r30197, %r11973, %r12125, %r12109, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30196, %r30197}; + // begin inline asm + // chi + lop3.b32 %r30222, %r12025, %r12065, %r12097, 0xD2; + lop3.b32 %r30223, %r12029, %r12069, %r12101, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30222, %r30223}; + // begin inline asm + // chi + lop3.b32 %r30216, %r12065, %r12097, %r12089, 0xD2; + lop3.b32 %r30217, %r12069, %r12101, %r12093, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30216, %r30217}; + // begin inline asm + // chi + lop3.b32 %r30210, %r12097, %r12089, %r12009, 0xD2; + lop3.b32 %r30211, %r12101, %r12093, %r12013, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30210, %r30211}; + // begin inline asm + // chi + lop3.b32 %r30202, %r12089, %r12009, %r12025, 0xD2; + lop3.b32 %r30203, %r12093, %r12013, %r12029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30202, %r30203}; + // begin inline asm + // chi + lop3.b32 %r30194, %r12009, %r12025, %r12065, 0xD2; + lop3.b32 %r30195, %r12013, %r12029, %r12069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30194, %r30195}; + // begin inline asm + // chi + lop3.b32 %r30220, %r11977, %r12049, %r11961, 0xD2; + lop3.b32 %r30221, %r11981, %r12053, %r11965, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30220, %r30221}; + // begin inline asm + // chi + lop3.b32 %r30214, %r12049, %r11961, %r12017, 0xD2; + lop3.b32 %r30215, %r12053, %r11965, %r12021, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30214, %r30215}; + // begin inline asm + // chi + lop3.b32 %r30208, %r11961, %r12017, %r12041, 0xD2; + lop3.b32 %r30209, %r11965, %r12021, %r12045, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30208, %r30209}; + // begin inline asm + // chi + lop3.b32 %r30200, %r12017, %r12041, %r11977, 0xD2; + lop3.b32 %r30201, %r12021, %r12045, %r11981, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30200, %r30201}; + // begin inline asm + // chi + lop3.b32 %r30192, %r12041, %r11977, %r12049, 0xD2; + lop3.b32 %r30193, %r12045, %r11981, %r12053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30192, %r30193}; + mul.wide.s32 %rd648, %r30242, 8; + add.s64 %rd647, %rd580, %rd648; + // begin inline asm + ld.global.nc.v2.u32 {%r12329,%r12330}, [%rd647]; + // end inline asm + xor.b32 %r30228, %r12129, %r12329; + xor.b32 %r30229, %r12130, %r12330; + add.s32 %r30242, %r30242, 1; + setp.lt.u32 %p22, %r30242, 23; + @%p22 bra $L__BB2_32; + + st.local.v2.u32 [%rd2+32], {%r30240, %r30241}; + st.local.v2.u32 [%rd2+72], {%r30238, %r30239}; + st.local.v2.u32 [%rd2+40], {%r30236, %r30237}; + st.local.v2.u32 [%rd2+80], {%r30234, %r30235}; + st.local.v2.u32 [%rd2+48], {%r30232, %r30233}; + st.local.v2.u32 [%rd2+56], {%r30230, %r30231}; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + // begin inline asm + // xor5 + lop3.b32 %r12341, %r30228, %r30226, %r30224, 0x96; + lop3.b32 %r12341, %r12341, %r30222, %r30220, 0x96; + lop3.b32 %r12342, %r30229, %r30227, %r30225, 0x96; + lop3.b32 %r12342, %r12342, %r30223, %r30221, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12353, %r30240, %r30238, %r30218, 0x96; + lop3.b32 %r12353, 
%r12353, %r30216, %r30214, 0x96; + lop3.b32 %r12354, %r30241, %r30239, %r30219, 0x96; + lop3.b32 %r12354, %r12354, %r30217, %r30215, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12365, %r30236, %r30234, %r30212, 0x96; + lop3.b32 %r12365, %r12365, %r30210, %r30208, 0x96; + lop3.b32 %r12366, %r30237, %r30235, %r30213, 0x96; + lop3.b32 %r12366, %r12366, %r30211, %r30209, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12377, %r30232, %r30206, %r30204, 0x96; + lop3.b32 %r12377, %r12377, %r30202, %r30200, 0x96; + lop3.b32 %r12378, %r30233, %r30207, %r30205, 0x96; + lop3.b32 %r12378, %r12378, %r30203, %r30201, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12389, %r30230, %r30198, %r30196, 0x96; + lop3.b32 %r12389, %r12389, %r30194, %r30192, 0x96; + lop3.b32 %r12390, %r30231, %r30199, %r30197, 0x96; + lop3.b32 %r12390, %r12390, %r30195, %r30193, 0x96; + // end inline asm + mov.u32 %r12593, 1; + // begin inline asm + shf.l.wrap.b32 %r12401, %r12354, %r12353, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12405, %r12353, %r12354, %r12593; + // end inline asm + xor.b32 %r12620, %r12401, %r12389; + xor.b32 %r12621, %r12405, %r12390; + xor.b32 %r12548, %r30228, %r12620; + xor.b32 %r12551, %r30229, %r12621; + xor.b32 %r12511, %r30225, %r12621; + xor.b32 %r12510, %r30224, %r12620; + st.local.v2.u32 [%rd2+104], {%r12510, %r12511}; + // begin inline asm + shf.l.wrap.b32 %r12409, %r12366, %r12365, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12413, %r12365, %r12366, %r12593; + // end inline asm + xor.b32 %r12622, %r12409, %r12341; + xor.b32 %r12623, %r12413, %r12342; + xor.b32 %r12447, %r30238, %r12622; + xor.b32 %r12446, %r30239, %r12623; + xor.b32 %r12486, %r30217, %r12623; + xor.b32 %r12487, %r30216, %r12622; + st.local.v2.u32 [%rd2+152], {%r12487, %r12486}; + // begin inline asm + shf.l.wrap.b32 %r12417, %r12378, %r12377, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12421, %r12377, %r12378, %r12593; + // end inline asm + xor.b32 %r12624, %r12417, %r12353; + xor.b32 %r12625, %r12421, %r12354; + xor.b32 %r12470, %r30213, %r12625; + xor.b32 %r12471, %r30212, %r12624; + st.local.v2.u32 [%rd2+120], {%r12471, %r12470}; + xor.b32 %r12462, %r30209, %r12625; + xor.b32 %r12463, %r30208, %r12624; + st.local.v2.u32 [%rd2+200], {%r12463, %r12462}; + // begin inline asm + shf.l.wrap.b32 %r12425, %r12390, %r12389, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12429, %r12389, %r12390, %r12593; + // end inline asm + xor.b32 %r12626, %r12425, %r12365; + xor.b32 %r12627, %r12429, %r12366; + xor.b32 %r12494, %r30232, %r12626; + xor.b32 %r12495, %r30233, %r12627; + xor.b32 %r12503, %r30203, %r12627; + xor.b32 %r12502, %r30202, %r12626; + st.local.v2.u32 [%rd2+168], {%r12502, %r12503}; + // begin inline asm + shf.l.wrap.b32 %r12433, %r12342, %r12341, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12437, %r12341, %r12342, %r12593; + // end inline asm + xor.b32 %r12628, %r12433, %r12377; + xor.b32 %r12629, %r12437, %r12378; + xor.b32 %r12454, %r30198, %r12628; + xor.b32 %r12455, %r30199, %r12629; + xor.b32 %r12479, %r30193, %r12629; + xor.b32 %r12478, %r30192, %r12628; + st.local.v2.u32 [%rd2+216], {%r12478, %r12479}; + // begin inline asm + shf.l.wrap.b32 %r12441, %r12447, %r12446, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12445, %r12446, %r12447, %r11944; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r12449, %r12455, %r12454, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12453, %r12454, %r12455, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12461, %r12462, %r12463, %r11960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12457, %r12463, %r12462, %r11960; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r12457, %r12461}; + // begin inline asm + shf.l.wrap.b32 %r12465, %r12471, %r12470, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12469, %r12470, %r12471, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12473, %r12479, %r12478, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12477, %r12478, %r12479, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12485, %r12486, %r12487, %r12064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12481, %r12487, %r12486, %r12064; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r12481, %r12485}; + // begin inline asm + shf.l.wrap.b32 %r12489, %r12495, %r12494, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12493, %r12494, %r12495, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12497, %r12503, %r12502, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12501, %r12502, %r12503, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12505, %r12511, %r12510, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12509, %r12510, %r12511, %r12120; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12513, %r12548, %r12441, %r12465, 0xD2; + lop3.b32 %r12514, %r12551, %r12445, %r12469, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12521, %r12441, %r12465, %r12497, 0xD2; + lop3.b32 %r12522, %r12445, %r12469, %r12501, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r12521, %r12522}; + // begin inline asm + // chi + lop3.b32 %r12529, %r12465, %r12497, %r12473, 0xD2; + lop3.b32 %r12530, %r12469, %r12501, %r12477, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r12529, %r12530}; + // begin inline asm + // chi + lop3.b32 %r12537, %r12497, %r12473, %r12548, 0xD2; + lop3.b32 %r12538, %r12501, %r12477, %r12551, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r12537, %r12538}; + // begin inline asm + // chi + lop3.b32 %r12545, %r12473, %r12548, %r12441, 0xD2; + lop3.b32 %r12546, %r12477, %r12551, %r12445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r12545, %r12546}; + // begin inline asm + // chi + lop3.b32 %r12553, %r12489, %r12449, %r12505, 0xD2; + lop3.b32 %r12554, %r12493, %r12453, %r12509, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r12553, %r12554}; + // begin inline asm + // chi + lop3.b32 %r12561, %r12449, %r12505, %r12481, 0xD2; + lop3.b32 %r12562, %r12453, %r12509, %r12485, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r12561, %r12562}; + // begin inline asm + // chi + lop3.b32 %r12569, %r12505, %r12481, %r12457, 0xD2; + lop3.b32 %r12570, %r12509, %r12485, %r12461, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r12569, %r12570}; + // begin inline asm + ld.global.nc.v2.u32 {%r12577,%r12578}, [%rd581]; + // end inline asm + xor.b32 %r12630, %r12514, %r12578; + xor.b32 %r12631, %r12513, %r12577; + mov.b64 %rd1317, {%r12631, %r12630}; + mov.b64 %rd1318, {%r12521, %r12522}; + mov.b64 %rd1319, {%r12529, %r12530}; + mov.b64 %rd1320, {%r12537, %r12538}; + mov.b64 %rd1321, {%r12545, %r12546}; 
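(Between the permutations, the loop at $L__BB2_30 mixes the sponge state against a global table for 512 iterations — setp.lt.u32 %p21, %r30191, 512. Each iteration hashes the loop counter and one state word with an FNV-style multiply by 16777619, reduces the result modulo 1179641 via a multiply-high reciprocal (the -954391867 constant) instead of a hardware divide, scales by 64 to address a table entry, and folds that entry word-by-word back into the state. A minimal sketch of one state's half of that loop, with the table layout read off the addressing above and illustrative names (fnv1, mix_dataset):)

#include <stdint.h>

// FNV-1-style step, (h * prime) ^ w, matching the repeated
// "mul.lo.s32 ..., 16777619; ld.global.u32; xor.b32" pattern above.
__device__ __forceinline__ uint32_t fnv1(uint32_t h, uint32_t w) {
    return (h * 16777619u) ^ w;
}

__device__ void mix_dataset(uint32_t state[16], const uint32_t *table, uint32_t seed) {
    const uint32_t NUM_ITEMS = 1179641u;      // divisor baked into the reciprocal sequence
    for (uint32_t i = 0; i < 512u; ++i) {
        // Derive a table index from (seed ^ i) and one state word; the
        // compiler lowers "% NUM_ITEMS" to the magic-constant multiply-high
        // sequence seen above rather than a divide.
        uint32_t idx = fnv1(seed ^ i, state[i % 16u]) % NUM_ITEMS;
        const uint32_t *item = table + (size_t)idx * 16u;  // 64-byte entry = 16 words
        for (int w = 0; w < 16; ++w)           // fold the whole entry into the state
            state[w] = fnv1(state[w], item[w]);
    }
}

(The generated code interleaves two such states — the %rd2 and %rd82 local buffers, seeded from %r47 and %r48 respectively — which is why each iteration performs two independent table lookups, through %rd617 and %rd628.)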
+ mov.b64 %rd1322, {%r12553, %r12554}; + mov.b64 %rd1323, {%r12561, %r12562}; + mov.b64 %rd1324, {%r12569, %r12570}; + mov.u32 %r30243, 0; + st.local.v2.u32 [%rd2+24], {%r12631, %r12630}; + st.local.v2.u32 [%rd82+96], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+104], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+112], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+120], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+128], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+136], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+144], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+152], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+160], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+168], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+176], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+184], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+192], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+200], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+208], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+216], {%r30243, %r30243}; + mov.u32 %r30258, -2147483648; + st.local.v2.u32 [%rd82+88], {%r12593, %r30258}; + mov.u32 %r30244, %r30243; + mov.u32 %r30245, %r30243; + mov.u32 %r30246, %r30243; + mov.u32 %r30247, %r30243; + mov.u32 %r30248, %r30243; + mov.u32 %r30249, %r30243; + mov.u32 %r30250, %r30243; + mov.u32 %r30251, %r30243; + mov.u32 %r30252, %r30243; + mov.u32 %r30253, %r30243; + mov.u32 %r30254, %r30243; + mov.u32 %r30255, %r30243; + mov.u32 %r30256, %r30243; + mov.u32 %r30257, %r12593; + mov.u32 %r30259, %r30243; + mov.u32 %r30260, %r30243; + mov.u32 %r30261, %r30243; + mov.u32 %r30262, %r30243; + mov.u32 %r30263, %r30243; + mov.u32 %r30264, %r30243; + mov.u32 %r30265, %r30243; + mov.u32 %r30266, %r30243; + mov.u32 %r30267, %r30243; + mov.u32 %r30268, %r30243; + mov.u32 %r30269, %r30243; + mov.u32 %r30270, %r30243; + mov.u32 %r30271, %r30243; + mov.u32 %r30272, %r30243; + mov.u32 %r30273, %r30243; + mov.u32 %r30274, %r30243; + mov.u32 %r30275, %r30243; + mov.u32 %r30276, %r30243; + mov.u32 %r30293, %r30243; + +$L__BB2_34: + // begin inline asm + // xor5 + lop3.b32 %r12632, %r30279, %r30277, %r30275, 0x96; + lop3.b32 %r12632, %r12632, %r30273, %r30271, 0x96; + lop3.b32 %r12633, %r30280, %r30278, %r30276, 0x96; + lop3.b32 %r12633, %r12633, %r30274, %r30272, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12644, %r30291, %r30289, %r30269, 0x96; + lop3.b32 %r12644, %r12644, %r30267, %r30265, 0x96; + lop3.b32 %r12645, %r30292, %r30290, %r30270, 0x96; + lop3.b32 %r12645, %r12645, %r30268, %r30266, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12656, %r30287, %r30285, %r30263, 0x96; + lop3.b32 %r12656, %r12656, %r30261, %r30259, 0x96; + lop3.b32 %r12657, %r30288, %r30286, %r30264, 0x96; + lop3.b32 %r12657, %r12657, %r30262, %r30260, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12668, %r30283, %r30257, %r30255, 0x96; + lop3.b32 %r12668, %r12668, %r30253, %r30251, 0x96; + lop3.b32 %r12669, %r30284, %r30258, %r30256, 0x96; + lop3.b32 %r12669, %r12669, %r30254, %r30252, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12680, %r30281, %r30249, %r30247, 0x96; + lop3.b32 %r12680, %r12680, %r30245, %r30243, 0x96; + lop3.b32 %r12681, %r30282, %r30250, %r30248, 0x96; + lop3.b32 %r12681, %r12681, %r30246, %r30244, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12692, %r12645, %r12644, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12696, %r12644, %r12645, %r12593; + // end inline asm + xor.b32 %r13126, 
%r12692, %r12680; + xor.b32 %r13127, %r12696, %r12681; + xor.b32 %r12959, %r30279, %r13126; + xor.b32 %r12962, %r30280, %r13127; + xor.b32 %r12866, %r30277, %r13126; + xor.b32 %r12865, %r30278, %r13127; + xor.b32 %r12913, %r30275, %r13126; + xor.b32 %r12914, %r30276, %r13127; + xor.b32 %r12818, %r30273, %r13126; + xor.b32 %r12817, %r30274, %r13127; + xor.b32 %r12769, %r30271, %r13126; + xor.b32 %r12770, %r30272, %r13127; + // begin inline asm + shf.l.wrap.b32 %r12700, %r12657, %r12656, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12704, %r12656, %r12657, %r12593; + // end inline asm + xor.b32 %r13128, %r12700, %r12632; + xor.b32 %r13129, %r12704, %r12633; + xor.b32 %r12921, %r30291, %r13128; + xor.b32 %r12922, %r30292, %r13129; + xor.b32 %r12738, %r30289, %r13128; + xor.b32 %r12737, %r30290, %r13129; + xor.b32 %r12897, %r30269, %r13128; + xor.b32 %r12898, %r30270, %r13129; + xor.b32 %r12858, %r30267, %r13128; + xor.b32 %r12857, %r30268, %r13129; + xor.b32 %r12841, %r30265, %r13128; + xor.b32 %r12842, %r30266, %r13129; + // begin inline asm + shf.l.wrap.b32 %r12708, %r12669, %r12668, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12712, %r12668, %r12669, %r12593; + // end inline asm + xor.b32 %r13130, %r12708, %r12644; + xor.b32 %r13131, %r12712, %r12645; + xor.b32 %r12778, %r30287, %r13130; + xor.b32 %r12777, %r30288, %r13131; + xor.b32 %r12905, %r30285, %r13130; + xor.b32 %r12906, %r30286, %r13131; + xor.b32 %r12786, %r30263, %r13130; + xor.b32 %r12785, %r30264, %r13131; + xor.b32 %r12889, %r30261, %r13130; + xor.b32 %r12890, %r30262, %r13131; + xor.b32 %r12754, %r30259, %r13130; + xor.b32 %r12753, %r30260, %r13131; + // begin inline asm + shf.l.wrap.b32 %r12716, %r12681, %r12680, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12720, %r12680, %r12681, %r12593; + // end inline asm + xor.b32 %r13132, %r12716, %r12656; + xor.b32 %r13133, %r12720, %r12657; + xor.b32 %r12873, %r30283, %r13132; + xor.b32 %r12874, %r30284, %r13133; + xor.b32 %r12850, %r30257, %r13132; + xor.b32 %r12849, %r30258, %r13133; + xor.b32 %r12793, %r30255, %r13132; + xor.b32 %r12794, %r30256, %r13133; + xor.b32 %r12881, %r30253, %r13132; + xor.b32 %r12882, %r30254, %r13133; + xor.b32 %r12810, %r30251, %r13132; + xor.b32 %r12809, %r30252, %r13133; + // begin inline asm + shf.l.wrap.b32 %r12724, %r12633, %r12632, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12728, %r12632, %r12633, %r12593; + // end inline asm + xor.b32 %r13134, %r12724, %r12668; + xor.b32 %r13135, %r12728, %r12669; + xor.b32 %r12825, %r30281, %r13134; + xor.b32 %r12826, %r30282, %r13135; + xor.b32 %r12745, %r30249, %r13134; + xor.b32 %r12746, %r30250, %r13135; + xor.b32 %r12762, %r30247, %r13134; + xor.b32 %r12761, %r30248, %r13135; + xor.b32 %r12801, %r30245, %r13134; + xor.b32 %r12802, %r30246, %r13135; + xor.b32 %r12833, %r30243, %r13134; + xor.b32 %r12834, %r30244, %r13135; + mov.u32 %r12739, 44; + // begin inline asm + shf.l.wrap.b32 %r12732, %r12738, %r12737, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12736, %r12737, %r12738, %r12739; + // end inline asm + mov.u32 %r12747, 20; + // begin inline asm + shf.l.wrap.b32 %r12740, %r12746, %r12745, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12744, %r12745, %r12746, %r12747; + // end inline asm + mov.u32 %r12755, 61; + // begin inline asm + shf.l.wrap.b32 %r12748, %r12754, %r12753, %r12755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12752, 
%r12753, %r12754, %r12755; + // end inline asm + mov.u32 %r12763, 39; + // begin inline asm + shf.l.wrap.b32 %r12756, %r12762, %r12761, %r12763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12760, %r12761, %r12762, %r12763; + // end inline asm + mov.u32 %r12771, 18; + // begin inline asm + shf.l.wrap.b32 %r12764, %r12770, %r12769, %r12771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12768, %r12769, %r12770, %r12771; + // end inline asm + mov.u32 %r12779, 62; + // begin inline asm + shf.l.wrap.b32 %r12772, %r12778, %r12777, %r12779; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12776, %r12777, %r12778, %r12779; + // end inline asm + mov.u32 %r12787, 43; + // begin inline asm + shf.l.wrap.b32 %r12780, %r12786, %r12785, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12784, %r12785, %r12786, %r12787; + // end inline asm + mov.u32 %r12795, 25; + // begin inline asm + shf.l.wrap.b32 %r12788, %r12794, %r12793, %r12795; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12792, %r12793, %r12794, %r12795; + // end inline asm + mov.u32 %r12803, 8; + // begin inline asm + shf.l.wrap.b32 %r12796, %r12802, %r12801, %r12803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12800, %r12801, %r12802, %r12803; + // end inline asm + mov.u32 %r12811, 56; + // begin inline asm + shf.l.wrap.b32 %r12804, %r12810, %r12809, %r12811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12808, %r12809, %r12810, %r12811; + // end inline asm + mov.u32 %r12819, 41; + // begin inline asm + shf.l.wrap.b32 %r12812, %r12818, %r12817, %r12819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12816, %r12817, %r12818, %r12819; + // end inline asm + mov.u32 %r12827, 27; + // begin inline asm + shf.l.wrap.b32 %r12820, %r12826, %r12825, %r12827; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12824, %r12825, %r12826, %r12827; + // end inline asm + mov.u32 %r12835, 14; + // begin inline asm + shf.l.wrap.b32 %r12828, %r12834, %r12833, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12832, %r12833, %r12834, %r12835; + // end inline asm + mov.u32 %r12843, 2; + // begin inline asm + shf.l.wrap.b32 %r12836, %r12842, %r12841, %r12843; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12840, %r12841, %r12842, %r12843; + // end inline asm + mov.u32 %r12851, 55; + // begin inline asm + shf.l.wrap.b32 %r12844, %r12850, %r12849, %r12851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12848, %r12849, %r12850, %r12851; + // end inline asm + mov.u32 %r12859, 45; + // begin inline asm + shf.l.wrap.b32 %r12852, %r12858, %r12857, %r12859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12856, %r12857, %r12858, %r12859; + // end inline asm + mov.u32 %r12867, 36; + // begin inline asm + shf.l.wrap.b32 %r12860, %r12866, %r12865, %r12867; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12864, %r12865, %r12866, %r12867; + // end inline asm + mov.u32 %r12875, 28; + // begin inline asm + shf.l.wrap.b32 %r12868, %r12874, %r12873, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12872, %r12873, %r12874, %r12875; + // end inline asm + mov.u32 %r12883, 21; + // begin inline asm + shf.l.wrap.b32 %r12876, %r12882, %r12881, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12880, %r12881, %r12882, %r12883; + // end inline asm + mov.u32 %r12891, 15; + // begin inline asm + shf.l.wrap.b32 %r12884, %r12890, %r12889, %r12891; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r12888, %r12889, %r12890, %r12891; + // end inline asm + mov.u32 %r12899, 10; + // begin inline asm + shf.l.wrap.b32 %r12892, %r12898, %r12897, %r12899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12896, %r12897, %r12898, %r12899; + // end inline asm + mov.u32 %r12907, 6; + // begin inline asm + shf.l.wrap.b32 %r12900, %r12906, %r12905, %r12907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12904, %r12905, %r12906, %r12907; + // end inline asm + mov.u32 %r12915, 3; + // begin inline asm + shf.l.wrap.b32 %r12908, %r12914, %r12913, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12912, %r12913, %r12914, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12916, %r12922, %r12921, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12920, %r12921, %r12922, %r12593; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12924, %r12959, %r12732, %r12780, 0xD2; + lop3.b32 %r12925, %r12962, %r12736, %r12784, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30291, %r12732, %r12780, %r12876, 0xD2; + lop3.b32 %r30292, %r12736, %r12784, %r12880, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30287, %r12780, %r12876, %r12828, 0xD2; + lop3.b32 %r30288, %r12784, %r12880, %r12832, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30283, %r12876, %r12828, %r12959, 0xD2; + lop3.b32 %r30284, %r12880, %r12832, %r12962, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30281, %r12828, %r12959, %r12732, 0xD2; + lop3.b32 %r30282, %r12832, %r12962, %r12736, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30277, %r12868, %r12740, %r12908, 0xD2; + lop3.b32 %r30278, %r12872, %r12744, %r12912, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30289, %r12740, %r12908, %r12852, 0xD2; + lop3.b32 %r30290, %r12744, %r12912, %r12856, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30285, %r12908, %r12852, %r12748, 0xD2; + lop3.b32 %r30286, %r12912, %r12856, %r12752, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30257, %r12852, %r12748, %r12868, 0xD2; + lop3.b32 %r30258, %r12856, %r12752, %r12872, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r30257, %r30258}; + // begin inline asm + // chi + lop3.b32 %r30249, %r12748, %r12868, %r12740, 0xD2; + lop3.b32 %r30250, %r12752, %r12872, %r12744, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r30249, %r30250}; + // begin inline asm + // chi + lop3.b32 %r30275, %r12916, %r12900, %r12788, 0xD2; + lop3.b32 %r30276, %r12920, %r12904, %r12792, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+104], {%r30275, %r30276}; + // begin inline asm + // chi + lop3.b32 %r30269, %r12900, %r12788, %r12796, 0xD2; + lop3.b32 %r30270, %r12904, %r12792, %r12800, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+112], {%r30269, %r30270}; + // begin inline asm + // chi + lop3.b32 %r30263, %r12788, %r12796, %r12764, 0xD2; + lop3.b32 %r30264, %r12792, %r12800, %r12768, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+120], {%r30263, %r30264}; + // begin inline asm + // chi + lop3.b32 %r30255, %r12796, %r12764, %r12916, 0xD2; + lop3.b32 %r30256, %r12800, %r12768, %r12920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+128], {%r30255, %r30256}; + // begin inline asm + // chi + lop3.b32 %r30247, %r12764, %r12916, %r12900, 0xD2; + lop3.b32 %r30248, %r12768, 
%r12920, %r12904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+136], {%r30247, %r30248}; + // begin inline asm + // chi + lop3.b32 %r30273, %r12820, %r12860, %r12892, 0xD2; + lop3.b32 %r30274, %r12824, %r12864, %r12896, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+144], {%r30273, %r30274}; + // begin inline asm + // chi + lop3.b32 %r30267, %r12860, %r12892, %r12884, 0xD2; + lop3.b32 %r30268, %r12864, %r12896, %r12888, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+152], {%r30267, %r30268}; + // begin inline asm + // chi + lop3.b32 %r30261, %r12892, %r12884, %r12804, 0xD2; + lop3.b32 %r30262, %r12896, %r12888, %r12808, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+160], {%r30261, %r30262}; + // begin inline asm + // chi + lop3.b32 %r30253, %r12884, %r12804, %r12820, 0xD2; + lop3.b32 %r30254, %r12888, %r12808, %r12824, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+168], {%r30253, %r30254}; + // begin inline asm + // chi + lop3.b32 %r30245, %r12804, %r12820, %r12860, 0xD2; + lop3.b32 %r30246, %r12808, %r12824, %r12864, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+176], {%r30245, %r30246}; + // begin inline asm + // chi + lop3.b32 %r30271, %r12772, %r12844, %r12756, 0xD2; + lop3.b32 %r30272, %r12776, %r12848, %r12760, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+184], {%r30271, %r30272}; + // begin inline asm + // chi + lop3.b32 %r30265, %r12844, %r12756, %r12812, 0xD2; + lop3.b32 %r30266, %r12848, %r12760, %r12816, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+192], {%r30265, %r30266}; + // begin inline asm + // chi + lop3.b32 %r30259, %r12756, %r12812, %r12836, 0xD2; + lop3.b32 %r30260, %r12760, %r12816, %r12840, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+200], {%r30259, %r30260}; + // begin inline asm + // chi + lop3.b32 %r30251, %r12812, %r12836, %r12772, 0xD2; + lop3.b32 %r30252, %r12816, %r12840, %r12776, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+208], {%r30251, %r30252}; + // begin inline asm + // chi + lop3.b32 %r30243, %r12836, %r12772, %r12844, 0xD2; + lop3.b32 %r30244, %r12840, %r12776, %r12848, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+216], {%r30243, %r30244}; + mul.wide.s32 %rd655, %r30293, 8; + add.s64 %rd654, %rd580, %rd655; + // begin inline asm + ld.global.nc.v2.u32 {%r13124,%r13125}, [%rd654]; + // end inline asm + xor.b32 %r30279, %r12924, %r13124; + xor.b32 %r30280, %r12925, %r13125; + add.s32 %r30293, %r30293, 1; + setp.lt.u32 %p23, %r30293, 23; + @%p23 bra $L__BB2_34; + + mov.u32 %r13235, 1; + st.local.v2.u32 [%rd82+32], {%r30291, %r30292}; + st.local.v2.u32 [%rd82+72], {%r30289, %r30290}; + st.local.v2.u32 [%rd82+40], {%r30287, %r30288}; + st.local.v2.u32 [%rd82+80], {%r30285, %r30286}; + st.local.v2.u32 [%rd82+48], {%r30283, %r30284}; + st.local.v2.u32 [%rd82+56], {%r30281, %r30282}; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + // begin inline asm + // xor5 + lop3.b32 %r13136, %r30279, %r30277, %r30275, 0x96; + lop3.b32 %r13136, %r13136, %r30273, %r30271, 0x96; + lop3.b32 %r13137, %r30280, %r30278, %r30276, 0x96; + lop3.b32 %r13137, %r13137, %r30274, %r30272, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13148, %r30291, %r30289, %r30269, 0x96; + lop3.b32 %r13148, %r13148, %r30267, %r30265, 0x96; + lop3.b32 %r13149, %r30292, %r30290, %r30270, 0x96; + lop3.b32 %r13149, %r13149, %r30268, %r30266, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13160, %r30287, %r30285, %r30263, 0x96; + lop3.b32 %r13160, %r13160, %r30261, %r30259, 0x96; + lop3.b32 
%r13161, %r30288, %r30286, %r30264, 0x96; + lop3.b32 %r13161, %r13161, %r30262, %r30260, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13172, %r30283, %r30257, %r30255, 0x96; + lop3.b32 %r13172, %r13172, %r30253, %r30251, 0x96; + lop3.b32 %r13173, %r30284, %r30258, %r30256, 0x96; + lop3.b32 %r13173, %r13173, %r30254, %r30252, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13184, %r30281, %r30249, %r30247, 0x96; + lop3.b32 %r13184, %r13184, %r30245, %r30243, 0x96; + lop3.b32 %r13185, %r30282, %r30250, %r30248, 0x96; + lop3.b32 %r13185, %r13185, %r30246, %r30244, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13196, %r13149, %r13148, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13200, %r13148, %r13149, %r13235; + // end inline asm + xor.b32 %r13374, %r13196, %r13184; + xor.b32 %r13375, %r13200, %r13185; + xor.b32 %r13343, %r30279, %r13374; + xor.b32 %r13346, %r30280, %r13375; + xor.b32 %r13306, %r30276, %r13375; + xor.b32 %r13305, %r30275, %r13374; + st.local.v2.u32 [%rd82+104], {%r13305, %r13306}; + // begin inline asm + shf.l.wrap.b32 %r13204, %r13161, %r13160, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13208, %r13160, %r13161, %r13235; + // end inline asm + xor.b32 %r13376, %r13204, %r13136; + xor.b32 %r13377, %r13208, %r13137; + xor.b32 %r13242, %r30289, %r13376; + xor.b32 %r13241, %r30290, %r13377; + xor.b32 %r13281, %r30268, %r13377; + xor.b32 %r13282, %r30267, %r13376; + st.local.v2.u32 [%rd82+152], {%r13282, %r13281}; + // begin inline asm + shf.l.wrap.b32 %r13212, %r13173, %r13172, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13216, %r13172, %r13173, %r13235; + // end inline asm + xor.b32 %r13378, %r13212, %r13148; + xor.b32 %r13379, %r13216, %r13149; + xor.b32 %r13265, %r30264, %r13379; + xor.b32 %r13266, %r30263, %r13378; + st.local.v2.u32 [%rd82+120], {%r13266, %r13265}; + xor.b32 %r13257, %r30260, %r13379; + xor.b32 %r13258, %r30259, %r13378; + st.local.v2.u32 [%rd82+200], {%r13258, %r13257}; + // begin inline asm + shf.l.wrap.b32 %r13220, %r13185, %r13184, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13224, %r13184, %r13185, %r13235; + // end inline asm + xor.b32 %r13380, %r13220, %r13160; + xor.b32 %r13381, %r13224, %r13161; + xor.b32 %r13289, %r30283, %r13380; + xor.b32 %r13290, %r30284, %r13381; + xor.b32 %r13298, %r30254, %r13381; + xor.b32 %r13297, %r30253, %r13380; + st.local.v2.u32 [%rd82+168], {%r13297, %r13298}; + // begin inline asm + shf.l.wrap.b32 %r13228, %r13137, %r13136, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13232, %r13136, %r13137, %r13235; + // end inline asm + xor.b32 %r13382, %r13228, %r13172; + xor.b32 %r13383, %r13232, %r13173; + xor.b32 %r13249, %r30249, %r13382; + xor.b32 %r13250, %r30250, %r13383; + xor.b32 %r13274, %r30244, %r13383; + xor.b32 %r13273, %r30243, %r13382; + st.local.v2.u32 [%rd82+216], {%r13273, %r13274}; + // begin inline asm + shf.l.wrap.b32 %r13236, %r13242, %r13241, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13240, %r13241, %r13242, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13244, %r13250, %r13249, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13248, %r13249, %r13250, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13256, %r13257, %r13258, %r12755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13252, %r13258, %r13257, 
%r12755; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r13252, %r13256}; + // begin inline asm + shf.l.wrap.b32 %r13260, %r13266, %r13265, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13264, %r13265, %r13266, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13268, %r13274, %r13273, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13272, %r13273, %r13274, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13280, %r13281, %r13282, %r12859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13276, %r13282, %r13281, %r12859; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r13276, %r13280}; + // begin inline asm + shf.l.wrap.b32 %r13284, %r13290, %r13289, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13288, %r13289, %r13290, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13292, %r13298, %r13297, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13296, %r13297, %r13298, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13300, %r13306, %r13305, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13304, %r13305, %r13306, %r12915; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13308, %r13343, %r13236, %r13260, 0xD2; + lop3.b32 %r13309, %r13346, %r13240, %r13264, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13316, %r13236, %r13260, %r13292, 0xD2; + lop3.b32 %r13317, %r13240, %r13264, %r13296, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+32], {%r13316, %r13317}; + // begin inline asm + // chi + lop3.b32 %r13324, %r13260, %r13292, %r13268, 0xD2; + lop3.b32 %r13325, %r13264, %r13296, %r13272, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+40], {%r13324, %r13325}; + // begin inline asm + // chi + lop3.b32 %r13332, %r13292, %r13268, %r13343, 0xD2; + lop3.b32 %r13333, %r13296, %r13272, %r13346, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+48], {%r13332, %r13333}; + // begin inline asm + // chi + lop3.b32 %r13340, %r13268, %r13343, %r13236, 0xD2; + lop3.b32 %r13341, %r13272, %r13346, %r13240, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+56], {%r13340, %r13341}; + // begin inline asm + // chi + lop3.b32 %r13348, %r13284, %r13244, %r13300, 0xD2; + lop3.b32 %r13349, %r13288, %r13248, %r13304, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+64], {%r13348, %r13349}; + // begin inline asm + // chi + lop3.b32 %r13356, %r13244, %r13300, %r13276, 0xD2; + lop3.b32 %r13357, %r13248, %r13304, %r13280, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+72], {%r13356, %r13357}; + // begin inline asm + // chi + lop3.b32 %r13364, %r13300, %r13276, %r13252, 0xD2; + lop3.b32 %r13365, %r13304, %r13280, %r13256, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+80], {%r13364, %r13365}; + // begin inline asm + ld.global.nc.v2.u32 {%r13372,%r13373}, [%rd581]; + // end inline asm + xor.b32 %r13384, %r13309, %r13373; + xor.b32 %r13385, %r13308, %r13372; + st.local.v2.u32 [%rd82+24], {%r13385, %r13384}; + mov.b64 %rd1326, {%r13316, %r13317}; + mov.b64 %rd1327, {%r13324, %r13325}; + mov.b64 %rd1330, {%r13348, %r13349}; + mov.b64 %rd1331, {%r13356, %r13357}; + mov.b64 %rd1332, {%r13364, %r13365}; + mov.b64 %rd1325, {%r13385, %r13384}; + mov.b64 %rd1328, {%r13332, %r13333}; + mov.b64 %rd1329, {%r13340, %r13341}; + bra.uni $L__BB2_36; + +$L__BB2_14: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd462, 1179641; + st.local.u64 [%rd2+8], %rd462; + st.local.u32 
[%rd2+16], %r47; + ld.global.u64 %rd463, [%rd32]; + ld.global.u64 %rd464, [%rd32+8]; + ld.global.u64 %rd465, [%rd32+16]; + ld.global.u64 %rd466, [%rd32+24]; + ld.global.u64 %rd467, [%rd32+32]; + ld.global.u64 %rd468, [%rd32+40]; + ld.global.u64 %rd469, [%rd32+48]; + ld.global.u64 %rd470, [%rd32+56]; + st.local.u64 [%rd2+24], %rd463; + st.local.u64 [%rd2+32], %rd464; + st.local.u64 [%rd2+40], %rd465; + st.local.u64 [%rd2+48], %rd466; + st.local.u64 [%rd2+56], %rd467; + st.local.u64 [%rd2+64], %rd468; + st.local.u64 [%rd2+72], %rd469; + st.local.u64 [%rd2+80], %rd470; + cvt.u32.u64 %r6859, %rd463; + xor.b32 %r6860, %r47, %r6859; + st.local.u32 [%rd2+24], %r6860; + mov.u32 %r29820, 0; + st.local.v2.u32 [%rd2+96], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+104], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+112], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+120], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+128], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+136], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+144], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+152], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+160], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+168], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+176], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+184], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+192], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+200], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+208], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+216], {%r29820, %r29820}; + mov.u32 %r29835, -2147483648; + mov.u32 %r6832, 1; + st.local.v2.u32 [%rd2+88], {%r6832, %r29835}; + ld.local.v2.u32 {%r29856, %r29857}, [%rd2+24]; + mov.b64 {%r29854, %r29855}, %rd468; + shr.u64 %rd471, %rd464, 32; + cvt.u32.u64 %r29868, %rd464; + cvt.u32.u64 %r29869, %rd471; + shr.u64 %rd472, %rd469, 32; + cvt.u32.u64 %r29866, %rd469; + cvt.u32.u64 %r29867, %rd472; + shr.u64 %rd473, %rd465, 32; + cvt.u32.u64 %r29864, %rd465; + cvt.u32.u64 %r29865, %rd473; + shr.u64 %rd474, %rd470, 32; + cvt.u32.u64 %r29862, %rd470; + cvt.u32.u64 %r29863, %rd474; + shr.u64 %rd475, %rd466, 32; + cvt.u32.u64 %r29860, %rd466; + cvt.u32.u64 %r29861, %rd475; + shr.u64 %rd476, %rd467, 32; + cvt.u32.u64 %r29858, %rd467; + cvt.u32.u64 %r29859, %rd476; + mov.u32 %r29821, %r29820; + mov.u32 %r29822, %r29820; + mov.u32 %r29823, %r29820; + mov.u32 %r29824, %r29820; + mov.u32 %r29825, %r29820; + mov.u32 %r29826, %r29820; + mov.u32 %r29827, %r29820; + mov.u32 %r29828, %r29820; + mov.u32 %r29829, %r29820; + mov.u32 %r29830, %r29820; + mov.u32 %r29831, %r29820; + mov.u32 %r29832, %r29820; + mov.u32 %r29833, %r29820; + mov.u32 %r29834, %r6832; + mov.u32 %r29836, %r29820; + mov.u32 %r29837, %r29820; + mov.u32 %r29838, %r29820; + mov.u32 %r29839, %r29820; + mov.u32 %r29840, %r29820; + mov.u32 %r29841, %r29820; + mov.u32 %r29842, %r29820; + mov.u32 %r29843, %r29820; + mov.u32 %r29844, %r29820; + mov.u32 %r29845, %r29820; + mov.u32 %r29846, %r29820; + mov.u32 %r29847, %r29820; + mov.u32 %r29848, %r29820; + mov.u32 %r29849, %r29820; + mov.u32 %r29850, %r29820; + mov.u32 %r29851, %r29820; + mov.u32 %r29852, %r29820; + mov.u32 %r29853, %r29820; + mov.u32 %r29870, %r29820; + +$L__BB2_15: + // begin inline asm + // xor5 + lop3.b32 %r6863, %r29856, %r29854, %r29852, 0x96; + lop3.b32 %r6863, %r6863, %r29850, %r29848, 0x96; + lop3.b32 %r6864, %r29857, %r29855, %r29853, 0x96; + lop3.b32 %r6864, %r6864, %r29851, %r29849, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6875, %r29868, %r29866, %r29846, 0x96; + lop3.b32 %r6875, %r6875, %r29844, %r29842, 
0x96; + lop3.b32 %r6876, %r29869, %r29867, %r29847, 0x96; + lop3.b32 %r6876, %r6876, %r29845, %r29843, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6887, %r29864, %r29862, %r29840, 0x96; + lop3.b32 %r6887, %r6887, %r29838, %r29836, 0x96; + lop3.b32 %r6888, %r29865, %r29863, %r29841, 0x96; + lop3.b32 %r6888, %r6888, %r29839, %r29837, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6899, %r29860, %r29834, %r29832, 0x96; + lop3.b32 %r6899, %r6899, %r29830, %r29828, 0x96; + lop3.b32 %r6900, %r29861, %r29835, %r29833, 0x96; + lop3.b32 %r6900, %r6900, %r29831, %r29829, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6911, %r29858, %r29826, %r29824, 0x96; + lop3.b32 %r6911, %r6911, %r29822, %r29820, 0x96; + lop3.b32 %r6912, %r29859, %r29827, %r29825, 0x96; + lop3.b32 %r6912, %r6912, %r29823, %r29821, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6923, %r6876, %r6875, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6927, %r6875, %r6876, %r6832; + // end inline asm + xor.b32 %r7357, %r6923, %r6911; + xor.b32 %r7358, %r6927, %r6912; + xor.b32 %r7190, %r29856, %r7357; + xor.b32 %r7193, %r29857, %r7358; + xor.b32 %r7097, %r29854, %r7357; + xor.b32 %r7096, %r29855, %r7358; + xor.b32 %r7144, %r29852, %r7357; + xor.b32 %r7145, %r29853, %r7358; + xor.b32 %r7049, %r29850, %r7357; + xor.b32 %r7048, %r29851, %r7358; + xor.b32 %r7000, %r29848, %r7357; + xor.b32 %r7001, %r29849, %r7358; + // begin inline asm + shf.l.wrap.b32 %r6931, %r6888, %r6887, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6935, %r6887, %r6888, %r6832; + // end inline asm + xor.b32 %r7359, %r6931, %r6863; + xor.b32 %r7360, %r6935, %r6864; + xor.b32 %r7152, %r29868, %r7359; + xor.b32 %r7153, %r29869, %r7360; + xor.b32 %r6969, %r29866, %r7359; + xor.b32 %r6968, %r29867, %r7360; + xor.b32 %r7128, %r29846, %r7359; + xor.b32 %r7129, %r29847, %r7360; + xor.b32 %r7089, %r29844, %r7359; + xor.b32 %r7088, %r29845, %r7360; + xor.b32 %r7072, %r29842, %r7359; + xor.b32 %r7073, %r29843, %r7360; + // begin inline asm + shf.l.wrap.b32 %r6939, %r6900, %r6899, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6943, %r6899, %r6900, %r6832; + // end inline asm + xor.b32 %r7361, %r6939, %r6875; + xor.b32 %r7362, %r6943, %r6876; + xor.b32 %r7009, %r29864, %r7361; + xor.b32 %r7008, %r29865, %r7362; + xor.b32 %r7136, %r29862, %r7361; + xor.b32 %r7137, %r29863, %r7362; + xor.b32 %r7017, %r29840, %r7361; + xor.b32 %r7016, %r29841, %r7362; + xor.b32 %r7120, %r29838, %r7361; + xor.b32 %r7121, %r29839, %r7362; + xor.b32 %r6985, %r29836, %r7361; + xor.b32 %r6984, %r29837, %r7362; + // begin inline asm + shf.l.wrap.b32 %r6947, %r6912, %r6911, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6951, %r6911, %r6912, %r6832; + // end inline asm + xor.b32 %r7363, %r6947, %r6887; + xor.b32 %r7364, %r6951, %r6888; + xor.b32 %r7104, %r29860, %r7363; + xor.b32 %r7105, %r29861, %r7364; + xor.b32 %r7081, %r29834, %r7363; + xor.b32 %r7080, %r29835, %r7364; + xor.b32 %r7024, %r29832, %r7363; + xor.b32 %r7025, %r29833, %r7364; + xor.b32 %r7112, %r29830, %r7363; + xor.b32 %r7113, %r29831, %r7364; + xor.b32 %r7041, %r29828, %r7363; + xor.b32 %r7040, %r29829, %r7364; + // begin inline asm + shf.l.wrap.b32 %r6955, %r6864, %r6863, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6959, %r6863, %r6864, %r6832; + // end inline asm + xor.b32 %r7365, %r6955, %r6899; + xor.b32 %r7366, %r6959, %r6900; + 
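+ // theta: %r7365/%r7366 is the last of the five D values (rotl1(C) xor C'); like the four D pairs above, it is xor-ed into its column of five state lanes below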
xor.b32 %r7056, %r29858, %r7365; + xor.b32 %r7057, %r29859, %r7366; + xor.b32 %r6976, %r29826, %r7365; + xor.b32 %r6977, %r29827, %r7366; + xor.b32 %r6993, %r29824, %r7365; + xor.b32 %r6992, %r29825, %r7366; + xor.b32 %r7032, %r29822, %r7365; + xor.b32 %r7033, %r29823, %r7366; + xor.b32 %r7064, %r29820, %r7365; + xor.b32 %r7065, %r29821, %r7366; + mov.u32 %r6970, 44; + // begin inline asm + shf.l.wrap.b32 %r6963, %r6969, %r6968, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6967, %r6968, %r6969, %r6970; + // end inline asm + mov.u32 %r6978, 20; + // begin inline asm + shf.l.wrap.b32 %r6971, %r6977, %r6976, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6975, %r6976, %r6977, %r6978; + // end inline asm + mov.u32 %r6986, 61; + // begin inline asm + shf.l.wrap.b32 %r6979, %r6985, %r6984, %r6986; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6983, %r6984, %r6985, %r6986; + // end inline asm + mov.u32 %r6994, 39; + // begin inline asm + shf.l.wrap.b32 %r6987, %r6993, %r6992, %r6994; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6991, %r6992, %r6993, %r6994; + // end inline asm + mov.u32 %r7002, 18; + // begin inline asm + shf.l.wrap.b32 %r6995, %r7001, %r7000, %r7002; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6999, %r7000, %r7001, %r7002; + // end inline asm + mov.u32 %r7010, 62; + // begin inline asm + shf.l.wrap.b32 %r7003, %r7009, %r7008, %r7010; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7007, %r7008, %r7009, %r7010; + // end inline asm + mov.u32 %r7018, 43; + // begin inline asm + shf.l.wrap.b32 %r7011, %r7017, %r7016, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7015, %r7016, %r7017, %r7018; + // end inline asm + mov.u32 %r7026, 25; + // begin inline asm + shf.l.wrap.b32 %r7019, %r7025, %r7024, %r7026; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7023, %r7024, %r7025, %r7026; + // end inline asm + mov.u32 %r7034, 8; + // begin inline asm + shf.l.wrap.b32 %r7027, %r7033, %r7032, %r7034; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7031, %r7032, %r7033, %r7034; + // end inline asm + mov.u32 %r7042, 56; + // begin inline asm + shf.l.wrap.b32 %r7035, %r7041, %r7040, %r7042; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7039, %r7040, %r7041, %r7042; + // end inline asm + mov.u32 %r7050, 41; + // begin inline asm + shf.l.wrap.b32 %r7043, %r7049, %r7048, %r7050; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7047, %r7048, %r7049, %r7050; + // end inline asm + mov.u32 %r7058, 27; + // begin inline asm + shf.l.wrap.b32 %r7051, %r7057, %r7056, %r7058; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7055, %r7056, %r7057, %r7058; + // end inline asm + mov.u32 %r7066, 14; + // begin inline asm + shf.l.wrap.b32 %r7059, %r7065, %r7064, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7063, %r7064, %r7065, %r7066; + // end inline asm + mov.u32 %r7074, 2; + // begin inline asm + shf.l.wrap.b32 %r7067, %r7073, %r7072, %r7074; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7071, %r7072, %r7073, %r7074; + // end inline asm + mov.u32 %r7082, 55; + // begin inline asm + shf.l.wrap.b32 %r7075, %r7081, %r7080, %r7082; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7079, %r7080, %r7081, %r7082; + // end inline asm + mov.u32 %r7090, 45; + // begin inline asm + shf.l.wrap.b32 %r7083, %r7089, %r7088, %r7090; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r7087, %r7088, %r7089, %r7090; + // end inline asm + mov.u32 %r7098, 36; + // begin inline asm + shf.l.wrap.b32 %r7091, %r7097, %r7096, %r7098; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7095, %r7096, %r7097, %r7098; + // end inline asm + mov.u32 %r7106, 28; + // begin inline asm + shf.l.wrap.b32 %r7099, %r7105, %r7104, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7103, %r7104, %r7105, %r7106; + // end inline asm + mov.u32 %r7114, 21; + // begin inline asm + shf.l.wrap.b32 %r7107, %r7113, %r7112, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7111, %r7112, %r7113, %r7114; + // end inline asm + mov.u32 %r7122, 15; + // begin inline asm + shf.l.wrap.b32 %r7115, %r7121, %r7120, %r7122; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7119, %r7120, %r7121, %r7122; + // end inline asm + mov.u32 %r7130, 10; + // begin inline asm + shf.l.wrap.b32 %r7123, %r7129, %r7128, %r7130; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7127, %r7128, %r7129, %r7130; + // end inline asm + mov.u32 %r7138, 6; + // begin inline asm + shf.l.wrap.b32 %r7131, %r7137, %r7136, %r7138; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7135, %r7136, %r7137, %r7138; + // end inline asm + mov.u32 %r7146, 3; + // begin inline asm + shf.l.wrap.b32 %r7139, %r7145, %r7144, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7143, %r7144, %r7145, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7147, %r7153, %r7152, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7151, %r7152, %r7153, %r6832; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7155, %r7190, %r6963, %r7011, 0xD2; + lop3.b32 %r7156, %r7193, %r6967, %r7015, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29868, %r6963, %r7011, %r7107, 0xD2; + lop3.b32 %r29869, %r6967, %r7015, %r7111, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29864, %r7011, %r7107, %r7059, 0xD2; + lop3.b32 %r29865, %r7015, %r7111, %r7063, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29860, %r7107, %r7059, %r7190, 0xD2; + lop3.b32 %r29861, %r7111, %r7063, %r7193, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29858, %r7059, %r7190, %r6963, 0xD2; + lop3.b32 %r29859, %r7063, %r7193, %r6967, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29854, %r7099, %r6971, %r7139, 0xD2; + lop3.b32 %r29855, %r7103, %r6975, %r7143, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29866, %r6971, %r7139, %r7083, 0xD2; + lop3.b32 %r29867, %r6975, %r7143, %r7087, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29862, %r7139, %r7083, %r6979, 0xD2; + lop3.b32 %r29863, %r7143, %r7087, %r6983, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29834, %r7083, %r6979, %r7099, 0xD2; + lop3.b32 %r29835, %r7087, %r6983, %r7103, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r29834, %r29835}; + // begin inline asm + // chi + lop3.b32 %r29826, %r6979, %r7099, %r6971, 0xD2; + lop3.b32 %r29827, %r6983, %r7103, %r6975, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r29826, %r29827}; + // begin inline asm + // chi + lop3.b32 %r29852, %r7147, %r7131, %r7019, 0xD2; + lop3.b32 %r29853, %r7151, %r7135, %r7023, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r29852, %r29853}; + // begin inline asm + // chi + lop3.b32 %r29846, %r7131, %r7019, 
%r7027, 0xD2; + lop3.b32 %r29847, %r7135, %r7023, %r7031, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r29846, %r29847}; + // begin inline asm + // chi + lop3.b32 %r29840, %r7019, %r7027, %r6995, 0xD2; + lop3.b32 %r29841, %r7023, %r7031, %r6999, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r29840, %r29841}; + // begin inline asm + // chi + lop3.b32 %r29832, %r7027, %r6995, %r7147, 0xD2; + lop3.b32 %r29833, %r7031, %r6999, %r7151, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r29832, %r29833}; + // begin inline asm + // chi + lop3.b32 %r29824, %r6995, %r7147, %r7131, 0xD2; + lop3.b32 %r29825, %r6999, %r7151, %r7135, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r29824, %r29825}; + // begin inline asm + // chi + lop3.b32 %r29850, %r7051, %r7091, %r7123, 0xD2; + lop3.b32 %r29851, %r7055, %r7095, %r7127, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r29850, %r29851}; + // begin inline asm + // chi + lop3.b32 %r29844, %r7091, %r7123, %r7115, 0xD2; + lop3.b32 %r29845, %r7095, %r7127, %r7119, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r29844, %r29845}; + // begin inline asm + // chi + lop3.b32 %r29838, %r7123, %r7115, %r7035, 0xD2; + lop3.b32 %r29839, %r7127, %r7119, %r7039, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r29838, %r29839}; + // begin inline asm + // chi + lop3.b32 %r29830, %r7115, %r7035, %r7051, 0xD2; + lop3.b32 %r29831, %r7119, %r7039, %r7055, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r29830, %r29831}; + // begin inline asm + // chi + lop3.b32 %r29822, %r7035, %r7051, %r7091, 0xD2; + lop3.b32 %r29823, %r7039, %r7055, %r7095, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r29822, %r29823}; + // begin inline asm + // chi + lop3.b32 %r29848, %r7003, %r7075, %r6987, 0xD2; + lop3.b32 %r29849, %r7007, %r7079, %r6991, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r29848, %r29849}; + // begin inline asm + // chi + lop3.b32 %r29842, %r7075, %r6987, %r7043, 0xD2; + lop3.b32 %r29843, %r7079, %r6991, %r7047, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r29842, %r29843}; + // begin inline asm + // chi + lop3.b32 %r29836, %r6987, %r7043, %r7067, 0xD2; + lop3.b32 %r29837, %r6991, %r7047, %r7071, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r29836, %r29837}; + // begin inline asm + // chi + lop3.b32 %r29828, %r7043, %r7067, %r7003, 0xD2; + lop3.b32 %r29829, %r7047, %r7071, %r7007, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r29828, %r29829}; + // begin inline asm + // chi + lop3.b32 %r29820, %r7067, %r7003, %r7075, 0xD2; + lop3.b32 %r29821, %r7071, %r7007, %r7079, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r29820, %r29821}; + mul.wide.s32 %rd478, %r29870, 8; + mov.u64 %rd479, keccak_round_constants; + cvta.const.u64 %rd480, %rd479; + add.s64 %rd477, %rd480, %rd478; + // begin inline asm + ld.global.nc.v2.u32 {%r7355,%r7356}, [%rd477]; + // end inline asm + xor.b32 %r29856, %r7155, %r7355; + xor.b32 %r29857, %r7156, %r7356; + add.s32 %r29870, %r29870, 1; + setp.lt.u32 %p14, %r29870, 23; + @%p14 bra $L__BB2_15; + + add.u64 %rd53, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r29868, %r29869}; + st.local.v2.u32 [%rd2+72], {%r29866, %r29867}; + st.local.v2.u32 [%rd2+40], {%r29864, %r29865}; + st.local.v2.u32 [%rd2+80], {%r29862, %r29863}; + st.local.v2.u32 [%rd2+48], {%r29860, %r29861}; + st.local.v2.u32 [%rd2+56], {%r29858, %r29859}; + st.local.v2.u32 [%rd2+24], {%r29856, %r29857}; + // begin inline asm + // xor5 + 
lop3.b32 %r7367, %r29856, %r29854, %r29852, 0x96; + lop3.b32 %r7367, %r7367, %r29850, %r29848, 0x96; + lop3.b32 %r7368, %r29857, %r29855, %r29853, 0x96; + lop3.b32 %r7368, %r7368, %r29851, %r29849, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7379, %r29868, %r29866, %r29846, 0x96; + lop3.b32 %r7379, %r7379, %r29844, %r29842, 0x96; + lop3.b32 %r7380, %r29869, %r29867, %r29847, 0x96; + lop3.b32 %r7380, %r7380, %r29845, %r29843, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7391, %r29864, %r29862, %r29840, 0x96; + lop3.b32 %r7391, %r7391, %r29838, %r29836, 0x96; + lop3.b32 %r7392, %r29865, %r29863, %r29841, 0x96; + lop3.b32 %r7392, %r7392, %r29839, %r29837, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7403, %r29860, %r29834, %r29832, 0x96; + lop3.b32 %r7403, %r7403, %r29830, %r29828, 0x96; + lop3.b32 %r7404, %r29861, %r29835, %r29833, 0x96; + lop3.b32 %r7404, %r7404, %r29831, %r29829, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7415, %r29858, %r29826, %r29824, 0x96; + lop3.b32 %r7415, %r7415, %r29822, %r29820, 0x96; + lop3.b32 %r7416, %r29859, %r29827, %r29825, 0x96; + lop3.b32 %r7416, %r7416, %r29823, %r29821, 0x96; + // end inline asm + mov.u32 %r7619, 1; + // begin inline asm + shf.l.wrap.b32 %r7427, %r7380, %r7379, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7431, %r7379, %r7380, %r7619; + // end inline asm + xor.b32 %r7646, %r7427, %r7415; + xor.b32 %r7647, %r7431, %r7416; + xor.b32 %r7574, %r29856, %r7646; + xor.b32 %r7577, %r29857, %r7647; + xor.b32 %r7537, %r29853, %r7647; + xor.b32 %r7536, %r29852, %r7646; + st.local.v2.u32 [%rd2+104], {%r7536, %r7537}; + // begin inline asm + shf.l.wrap.b32 %r7435, %r7392, %r7391, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7439, %r7391, %r7392, %r7619; + // end inline asm + xor.b32 %r7648, %r7435, %r7367; + xor.b32 %r7649, %r7439, %r7368; + xor.b32 %r7473, %r29866, %r7648; + xor.b32 %r7472, %r29867, %r7649; + xor.b32 %r7512, %r29845, %r7649; + xor.b32 %r7513, %r29844, %r7648; + st.local.v2.u32 [%rd2+152], {%r7513, %r7512}; + // begin inline asm + shf.l.wrap.b32 %r7443, %r7404, %r7403, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7447, %r7403, %r7404, %r7619; + // end inline asm + xor.b32 %r7650, %r7443, %r7379; + xor.b32 %r7651, %r7447, %r7380; + xor.b32 %r7496, %r29841, %r7651; + xor.b32 %r7497, %r29840, %r7650; + st.local.v2.u32 [%rd2+120], {%r7497, %r7496}; + xor.b32 %r7488, %r29837, %r7651; + xor.b32 %r7489, %r29836, %r7650; + st.local.v2.u32 [%rd2+200], {%r7489, %r7488}; + // begin inline asm + shf.l.wrap.b32 %r7451, %r7416, %r7415, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7455, %r7415, %r7416, %r7619; + // end inline asm + xor.b32 %r7652, %r7451, %r7391; + xor.b32 %r7653, %r7455, %r7392; + xor.b32 %r7520, %r29860, %r7652; + xor.b32 %r7521, %r29861, %r7653; + xor.b32 %r7529, %r29831, %r7653; + xor.b32 %r7528, %r29830, %r7652; + st.local.v2.u32 [%rd2+168], {%r7528, %r7529}; + // begin inline asm + shf.l.wrap.b32 %r7459, %r7368, %r7367, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7463, %r7367, %r7368, %r7619; + // end inline asm + xor.b32 %r7654, %r7459, %r7403; + xor.b32 %r7655, %r7463, %r7404; + xor.b32 %r7480, %r29826, %r7654; + xor.b32 %r7481, %r29827, %r7655; + xor.b32 %r7505, %r29821, %r7655; + xor.b32 %r7504, %r29820, %r7654; + st.local.v2.u32 [%rd2+216], {%r7504, %r7505}; + // begin inline asm + shf.l.wrap.b32 
%r7467, %r7473, %r7472, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7471, %r7472, %r7473, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7475, %r7481, %r7480, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7479, %r7480, %r7481, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7487, %r7488, %r7489, %r6986; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7483, %r7489, %r7488, %r6986; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r7483, %r7487}; + // begin inline asm + shf.l.wrap.b32 %r7491, %r7497, %r7496, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7495, %r7496, %r7497, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7499, %r7505, %r7504, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7503, %r7504, %r7505, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7511, %r7512, %r7513, %r7090; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7507, %r7513, %r7512, %r7090; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r7507, %r7511}; + // begin inline asm + shf.l.wrap.b32 %r7515, %r7521, %r7520, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7519, %r7520, %r7521, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7523, %r7529, %r7528, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7527, %r7528, %r7529, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7531, %r7537, %r7536, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7535, %r7536, %r7537, %r7146; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7539, %r7574, %r7467, %r7491, 0xD2; + lop3.b32 %r7540, %r7577, %r7471, %r7495, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30003, %r7467, %r7491, %r7523, 0xD2; + lop3.b32 %r30004, %r7471, %r7495, %r7527, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30003, %r30004}; + // begin inline asm + // chi + lop3.b32 %r29999, %r7491, %r7523, %r7499, 0xD2; + lop3.b32 %r30000, %r7495, %r7527, %r7503, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r29999, %r30000}; + // begin inline asm + // chi + lop3.b32 %r29995, %r7523, %r7499, %r7574, 0xD2; + lop3.b32 %r29996, %r7527, %r7503, %r7577, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r29995, %r29996}; + // begin inline asm + // chi + lop3.b32 %r29993, %r7499, %r7574, %r7467, 0xD2; + lop3.b32 %r29994, %r7503, %r7577, %r7471, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r29993, %r29994}; + // begin inline asm + // chi + lop3.b32 %r29989, %r7515, %r7475, %r7531, 0xD2; + lop3.b32 %r29990, %r7519, %r7479, %r7535, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r29989, %r29990}; + // begin inline asm + // chi + lop3.b32 %r30001, %r7475, %r7531, %r7507, 0xD2; + lop3.b32 %r30002, %r7479, %r7535, %r7511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30001, %r30002}; + // begin inline asm + // chi + lop3.b32 %r29997, %r7531, %r7507, %r7483, 0xD2; + lop3.b32 %r29998, %r7535, %r7511, %r7487, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r29997, %r29998}; + add.s64 %rd481, %rd480, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r7603,%r7604}, [%rd481]; + // end inline asm + xor.b32 %r29991, %r7539, %r7603; + xor.b32 %r29992, %r7540, %r7604; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + st.local.u64 [%rd53], %rd361; + mov.u64 %rd485, 1179641; + 
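+ // 1179641 appears to be a table length stored in this state block's header; the same constant reappears in $L__BB2_19 as the modulus for the row-index reduction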
st.local.u64 [%rd53+8], %rd485; + add.s32 %r243, %r47, 1; + st.local.u32 [%rd53+16], %r243; + ld.global.u64 %rd486, [%rd33]; + ld.global.u64 %rd487, [%rd33+8]; + ld.global.u64 %rd488, [%rd33+16]; + ld.global.u64 %rd489, [%rd33+24]; + ld.global.u64 %rd490, [%rd33+32]; + ld.global.u64 %rd491, [%rd33+40]; + ld.global.u64 %rd492, [%rd33+48]; + ld.global.u64 %rd493, [%rd33+56]; + st.local.u64 [%rd53+32], %rd487; + st.local.u64 [%rd53+40], %rd488; + st.local.u64 [%rd53+48], %rd489; + st.local.u64 [%rd53+56], %rd490; + st.local.u64 [%rd53+64], %rd491; + st.local.u64 [%rd53+72], %rd492; + st.local.u64 [%rd53+80], %rd493; + cvt.u32.u64 %r7656, %rd486; + xor.b32 %r7657, %r243, %r7656; + st.local.u64 [%rd53+24], %rd486; + st.local.u32 [%rd53+24], %r7657; + mov.u32 %r29871, 0; + st.local.v2.u32 [%rd53+96], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+104], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+112], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+120], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+128], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+136], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+144], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+152], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+160], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+168], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+176], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+184], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+192], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+200], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+208], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+216], {%r29871, %r29871}; + mov.u32 %r29886, -2147483648; + st.local.v2.u32 [%rd53+88], {%r7619, %r29886}; + ld.local.v2.u32 {%r29907, %r29908}, [%rd53+24]; + mov.b64 {%r29905, %r29906}, %rd491; + shr.u64 %rd494, %rd487, 32; + cvt.u32.u64 %r29919, %rd487; + cvt.u32.u64 %r29920, %rd494; + shr.u64 %rd495, %rd492, 32; + cvt.u32.u64 %r29917, %rd492; + cvt.u32.u64 %r29918, %rd495; + shr.u64 %rd496, %rd488, 32; + cvt.u32.u64 %r29915, %rd488; + cvt.u32.u64 %r29916, %rd496; + shr.u64 %rd497, %rd493, 32; + cvt.u32.u64 %r29913, %rd493; + cvt.u32.u64 %r29914, %rd497; + shr.u64 %rd498, %rd489, 32; + cvt.u32.u64 %r29911, %rd489; + cvt.u32.u64 %r29912, %rd498; + shr.u64 %rd499, %rd490, 32; + cvt.u32.u64 %r29909, %rd490; + cvt.u32.u64 %r29910, %rd499; + mov.u32 %r29872, %r29871; + mov.u32 %r29873, %r29871; + mov.u32 %r29874, %r29871; + mov.u32 %r29875, %r29871; + mov.u32 %r29876, %r29871; + mov.u32 %r29877, %r29871; + mov.u32 %r29878, %r29871; + mov.u32 %r29879, %r29871; + mov.u32 %r29880, %r29871; + mov.u32 %r29881, %r29871; + mov.u32 %r29882, %r29871; + mov.u32 %r29883, %r29871; + mov.u32 %r29884, %r29871; + mov.u32 %r29885, %r7619; + mov.u32 %r29887, %r29871; + mov.u32 %r29888, %r29871; + mov.u32 %r29889, %r29871; + mov.u32 %r29890, %r29871; + mov.u32 %r29891, %r29871; + mov.u32 %r29892, %r29871; + mov.u32 %r29893, %r29871; + mov.u32 %r29894, %r29871; + mov.u32 %r29895, %r29871; + mov.u32 %r29896, %r29871; + mov.u32 %r29897, %r29871; + mov.u32 %r29898, %r29871; + mov.u32 %r29899, %r29871; + mov.u32 %r29900, %r29871; + mov.u32 %r29901, %r29871; + mov.u32 %r29902, %r29871; + mov.u32 %r29903, %r29871; + mov.u32 %r29904, %r29871; + mov.u32 %r29921, %r29871; + +$L__BB2_17: + // begin inline asm + // xor5 + lop3.b32 %r7660, %r29907, %r29905, %r29903, 0x96; + lop3.b32 %r7660, %r7660, %r29901, %r29899, 0x96; + lop3.b32 %r7661, %r29908, %r29906, %r29904, 0x96; + lop3.b32 %r7661, %r7661, %r29902, %r29900, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 
%r7672, %r29919, %r29917, %r29897, 0x96; + lop3.b32 %r7672, %r7672, %r29895, %r29893, 0x96; + lop3.b32 %r7673, %r29920, %r29918, %r29898, 0x96; + lop3.b32 %r7673, %r7673, %r29896, %r29894, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7684, %r29915, %r29913, %r29891, 0x96; + lop3.b32 %r7684, %r7684, %r29889, %r29887, 0x96; + lop3.b32 %r7685, %r29916, %r29914, %r29892, 0x96; + lop3.b32 %r7685, %r7685, %r29890, %r29888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7696, %r29911, %r29885, %r29883, 0x96; + lop3.b32 %r7696, %r7696, %r29881, %r29879, 0x96; + lop3.b32 %r7697, %r29912, %r29886, %r29884, 0x96; + lop3.b32 %r7697, %r7697, %r29882, %r29880, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7708, %r29909, %r29877, %r29875, 0x96; + lop3.b32 %r7708, %r7708, %r29873, %r29871, 0x96; + lop3.b32 %r7709, %r29910, %r29878, %r29876, 0x96; + lop3.b32 %r7709, %r7709, %r29874, %r29872, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7720, %r7673, %r7672, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7724, %r7672, %r7673, %r7619; + // end inline asm + xor.b32 %r8154, %r7720, %r7708; + xor.b32 %r8155, %r7724, %r7709; + xor.b32 %r7987, %r29907, %r8154; + xor.b32 %r7990, %r29908, %r8155; + xor.b32 %r7894, %r29905, %r8154; + xor.b32 %r7893, %r29906, %r8155; + xor.b32 %r7941, %r29903, %r8154; + xor.b32 %r7942, %r29904, %r8155; + xor.b32 %r7846, %r29901, %r8154; + xor.b32 %r7845, %r29902, %r8155; + xor.b32 %r7797, %r29899, %r8154; + xor.b32 %r7798, %r29900, %r8155; + // begin inline asm + shf.l.wrap.b32 %r7728, %r7685, %r7684, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7732, %r7684, %r7685, %r7619; + // end inline asm + xor.b32 %r8156, %r7728, %r7660; + xor.b32 %r8157, %r7732, %r7661; + xor.b32 %r7949, %r29919, %r8156; + xor.b32 %r7950, %r29920, %r8157; + xor.b32 %r7766, %r29917, %r8156; + xor.b32 %r7765, %r29918, %r8157; + xor.b32 %r7925, %r29897, %r8156; + xor.b32 %r7926, %r29898, %r8157; + xor.b32 %r7886, %r29895, %r8156; + xor.b32 %r7885, %r29896, %r8157; + xor.b32 %r7869, %r29893, %r8156; + xor.b32 %r7870, %r29894, %r8157; + // begin inline asm + shf.l.wrap.b32 %r7736, %r7697, %r7696, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7740, %r7696, %r7697, %r7619; + // end inline asm + xor.b32 %r8158, %r7736, %r7672; + xor.b32 %r8159, %r7740, %r7673; + xor.b32 %r7806, %r29915, %r8158; + xor.b32 %r7805, %r29916, %r8159; + xor.b32 %r7933, %r29913, %r8158; + xor.b32 %r7934, %r29914, %r8159; + xor.b32 %r7814, %r29891, %r8158; + xor.b32 %r7813, %r29892, %r8159; + xor.b32 %r7917, %r29889, %r8158; + xor.b32 %r7918, %r29890, %r8159; + xor.b32 %r7782, %r29887, %r8158; + xor.b32 %r7781, %r29888, %r8159; + // begin inline asm + shf.l.wrap.b32 %r7744, %r7709, %r7708, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7748, %r7708, %r7709, %r7619; + // end inline asm + xor.b32 %r8160, %r7744, %r7684; + xor.b32 %r8161, %r7748, %r7685; + xor.b32 %r7901, %r29911, %r8160; + xor.b32 %r7902, %r29912, %r8161; + xor.b32 %r7878, %r29885, %r8160; + xor.b32 %r7877, %r29886, %r8161; + xor.b32 %r7821, %r29883, %r8160; + xor.b32 %r7822, %r29884, %r8161; + xor.b32 %r7909, %r29881, %r8160; + xor.b32 %r7910, %r29882, %r8161; + xor.b32 %r7838, %r29879, %r8160; + xor.b32 %r7837, %r29880, %r8161; + // begin inline asm + shf.l.wrap.b32 %r7752, %r7661, %r7660, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7756, %r7660, %r7661, %r7619; + 
// end inline asm + xor.b32 %r8162, %r7752, %r7696; + xor.b32 %r8163, %r7756, %r7697; + xor.b32 %r7853, %r29909, %r8162; + xor.b32 %r7854, %r29910, %r8163; + xor.b32 %r7773, %r29877, %r8162; + xor.b32 %r7774, %r29878, %r8163; + xor.b32 %r7790, %r29875, %r8162; + xor.b32 %r7789, %r29876, %r8163; + xor.b32 %r7829, %r29873, %r8162; + xor.b32 %r7830, %r29874, %r8163; + xor.b32 %r7861, %r29871, %r8162; + xor.b32 %r7862, %r29872, %r8163; + mov.u32 %r7767, 44; + // begin inline asm + shf.l.wrap.b32 %r7760, %r7766, %r7765, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7764, %r7765, %r7766, %r7767; + // end inline asm + mov.u32 %r7775, 20; + // begin inline asm + shf.l.wrap.b32 %r7768, %r7774, %r7773, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7772, %r7773, %r7774, %r7775; + // end inline asm + mov.u32 %r7783, 61; + // begin inline asm + shf.l.wrap.b32 %r7776, %r7782, %r7781, %r7783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7780, %r7781, %r7782, %r7783; + // end inline asm + mov.u32 %r7791, 39; + // begin inline asm + shf.l.wrap.b32 %r7784, %r7790, %r7789, %r7791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7788, %r7789, %r7790, %r7791; + // end inline asm + mov.u32 %r7799, 18; + // begin inline asm + shf.l.wrap.b32 %r7792, %r7798, %r7797, %r7799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7796, %r7797, %r7798, %r7799; + // end inline asm + mov.u32 %r7807, 62; + // begin inline asm + shf.l.wrap.b32 %r7800, %r7806, %r7805, %r7807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7804, %r7805, %r7806, %r7807; + // end inline asm + mov.u32 %r7815, 43; + // begin inline asm + shf.l.wrap.b32 %r7808, %r7814, %r7813, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7812, %r7813, %r7814, %r7815; + // end inline asm + mov.u32 %r7823, 25; + // begin inline asm + shf.l.wrap.b32 %r7816, %r7822, %r7821, %r7823; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7820, %r7821, %r7822, %r7823; + // end inline asm + mov.u32 %r7831, 8; + // begin inline asm + shf.l.wrap.b32 %r7824, %r7830, %r7829, %r7831; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7828, %r7829, %r7830, %r7831; + // end inline asm + mov.u32 %r7839, 56; + // begin inline asm + shf.l.wrap.b32 %r7832, %r7838, %r7837, %r7839; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7836, %r7837, %r7838, %r7839; + // end inline asm + mov.u32 %r7847, 41; + // begin inline asm + shf.l.wrap.b32 %r7840, %r7846, %r7845, %r7847; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7844, %r7845, %r7846, %r7847; + // end inline asm + mov.u32 %r7855, 27; + // begin inline asm + shf.l.wrap.b32 %r7848, %r7854, %r7853, %r7855; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7852, %r7853, %r7854, %r7855; + // end inline asm + mov.u32 %r7863, 14; + // begin inline asm + shf.l.wrap.b32 %r7856, %r7862, %r7861, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7860, %r7861, %r7862, %r7863; + // end inline asm + mov.u32 %r7871, 2; + // begin inline asm + shf.l.wrap.b32 %r7864, %r7870, %r7869, %r7871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7868, %r7869, %r7870, %r7871; + // end inline asm + mov.u32 %r7879, 55; + // begin inline asm + shf.l.wrap.b32 %r7872, %r7878, %r7877, %r7879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7876, %r7877, %r7878, %r7879; + // end inline asm + mov.u32 %r7887, 45; + // begin inline asm + shf.l.wrap.b32 
%r7880, %r7886, %r7885, %r7887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7884, %r7885, %r7886, %r7887; + // end inline asm + mov.u32 %r7895, 36; + // begin inline asm + shf.l.wrap.b32 %r7888, %r7894, %r7893, %r7895; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7892, %r7893, %r7894, %r7895; + // end inline asm + mov.u32 %r7903, 28; + // begin inline asm + shf.l.wrap.b32 %r7896, %r7902, %r7901, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7900, %r7901, %r7902, %r7903; + // end inline asm + mov.u32 %r7911, 21; + // begin inline asm + shf.l.wrap.b32 %r7904, %r7910, %r7909, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7908, %r7909, %r7910, %r7911; + // end inline asm + mov.u32 %r7919, 15; + // begin inline asm + shf.l.wrap.b32 %r7912, %r7918, %r7917, %r7919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7916, %r7917, %r7918, %r7919; + // end inline asm + mov.u32 %r7927, 10; + // begin inline asm + shf.l.wrap.b32 %r7920, %r7926, %r7925, %r7927; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7924, %r7925, %r7926, %r7927; + // end inline asm + mov.u32 %r7935, 6; + // begin inline asm + shf.l.wrap.b32 %r7928, %r7934, %r7933, %r7935; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7932, %r7933, %r7934, %r7935; + // end inline asm + mov.u32 %r7943, 3; + // begin inline asm + shf.l.wrap.b32 %r7936, %r7942, %r7941, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7940, %r7941, %r7942, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7944, %r7950, %r7949, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7948, %r7949, %r7950, %r7619; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7952, %r7987, %r7760, %r7808, 0xD2; + lop3.b32 %r7953, %r7990, %r7764, %r7812, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29919, %r7760, %r7808, %r7904, 0xD2; + lop3.b32 %r29920, %r7764, %r7812, %r7908, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29915, %r7808, %r7904, %r7856, 0xD2; + lop3.b32 %r29916, %r7812, %r7908, %r7860, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29911, %r7904, %r7856, %r7987, 0xD2; + lop3.b32 %r29912, %r7908, %r7860, %r7990, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29909, %r7856, %r7987, %r7760, 0xD2; + lop3.b32 %r29910, %r7860, %r7990, %r7764, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29905, %r7896, %r7768, %r7936, 0xD2; + lop3.b32 %r29906, %r7900, %r7772, %r7940, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29917, %r7768, %r7936, %r7880, 0xD2; + lop3.b32 %r29918, %r7772, %r7940, %r7884, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29913, %r7936, %r7880, %r7776, 0xD2; + lop3.b32 %r29914, %r7940, %r7884, %r7780, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29885, %r7880, %r7776, %r7896, 0xD2; + lop3.b32 %r29886, %r7884, %r7780, %r7900, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r29885, %r29886}; + // begin inline asm + // chi + lop3.b32 %r29877, %r7776, %r7896, %r7768, 0xD2; + lop3.b32 %r29878, %r7780, %r7900, %r7772, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r29877, %r29878}; + // begin inline asm + // chi + lop3.b32 %r29903, %r7944, %r7928, %r7816, 0xD2; + lop3.b32 %r29904, %r7948, %r7932, %r7820, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+104], {%r29903, 
%r29904}; + // begin inline asm + // chi + lop3.b32 %r29897, %r7928, %r7816, %r7824, 0xD2; + lop3.b32 %r29898, %r7932, %r7820, %r7828, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+112], {%r29897, %r29898}; + // begin inline asm + // chi + lop3.b32 %r29891, %r7816, %r7824, %r7792, 0xD2; + lop3.b32 %r29892, %r7820, %r7828, %r7796, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+120], {%r29891, %r29892}; + // begin inline asm + // chi + lop3.b32 %r29883, %r7824, %r7792, %r7944, 0xD2; + lop3.b32 %r29884, %r7828, %r7796, %r7948, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+128], {%r29883, %r29884}; + // begin inline asm + // chi + lop3.b32 %r29875, %r7792, %r7944, %r7928, 0xD2; + lop3.b32 %r29876, %r7796, %r7948, %r7932, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+136], {%r29875, %r29876}; + // begin inline asm + // chi + lop3.b32 %r29901, %r7848, %r7888, %r7920, 0xD2; + lop3.b32 %r29902, %r7852, %r7892, %r7924, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+144], {%r29901, %r29902}; + // begin inline asm + // chi + lop3.b32 %r29895, %r7888, %r7920, %r7912, 0xD2; + lop3.b32 %r29896, %r7892, %r7924, %r7916, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+152], {%r29895, %r29896}; + // begin inline asm + // chi + lop3.b32 %r29889, %r7920, %r7912, %r7832, 0xD2; + lop3.b32 %r29890, %r7924, %r7916, %r7836, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+160], {%r29889, %r29890}; + // begin inline asm + // chi + lop3.b32 %r29881, %r7912, %r7832, %r7848, 0xD2; + lop3.b32 %r29882, %r7916, %r7836, %r7852, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+168], {%r29881, %r29882}; + // begin inline asm + // chi + lop3.b32 %r29873, %r7832, %r7848, %r7888, 0xD2; + lop3.b32 %r29874, %r7836, %r7852, %r7892, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+176], {%r29873, %r29874}; + // begin inline asm + // chi + lop3.b32 %r29899, %r7800, %r7872, %r7784, 0xD2; + lop3.b32 %r29900, %r7804, %r7876, %r7788, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+184], {%r29899, %r29900}; + // begin inline asm + // chi + lop3.b32 %r29893, %r7872, %r7784, %r7840, 0xD2; + lop3.b32 %r29894, %r7876, %r7788, %r7844, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+192], {%r29893, %r29894}; + // begin inline asm + // chi + lop3.b32 %r29887, %r7784, %r7840, %r7864, 0xD2; + lop3.b32 %r29888, %r7788, %r7844, %r7868, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+200], {%r29887, %r29888}; + // begin inline asm + // chi + lop3.b32 %r29879, %r7840, %r7864, %r7800, 0xD2; + lop3.b32 %r29880, %r7844, %r7868, %r7804, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+208], {%r29879, %r29880}; + // begin inline asm + // chi + lop3.b32 %r29871, %r7864, %r7800, %r7872, 0xD2; + lop3.b32 %r29872, %r7868, %r7804, %r7876, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+216], {%r29871, %r29872}; + mul.wide.s32 %rd501, %r29921, 8; + add.s64 %rd500, %rd480, %rd501; + // begin inline asm + ld.global.nc.v2.u32 {%r8152,%r8153}, [%rd500]; + // end inline asm + xor.b32 %r29907, %r7952, %r8152; + xor.b32 %r29908, %r7953, %r8153; + add.s32 %r29921, %r29921, 1; + setp.lt.u32 %p15, %r29921, 23; + @%p15 bra $L__BB2_17; + + mov.u32 %r29954, 0; + mov.u32 %r8263, 1; + st.local.v2.u32 [%rd53+32], {%r29919, %r29920}; + st.local.v2.u32 [%rd53+72], {%r29917, %r29918}; + st.local.v2.u32 [%rd53+40], {%r29915, %r29916}; + st.local.v2.u32 [%rd53+80], {%r29913, %r29914}; + st.local.v2.u32 [%rd53+48], {%r29911, %r29912}; + st.local.v2.u32 [%rd53+56], {%r29909, %r29910}; + st.local.v2.u32 [%rd53+24], {%r29907, %r29908}; + 
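+ // rounds 0-22 ran in the $L__BB2_17 loop (setp.lt.u32 ... 23 above); the 24th and final Keccak-f[1600] round is unrolled below, with its iota constant loaded from keccak_round_constants+184 (round index 23)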
// begin inline asm + // xor5 + lop3.b32 %r8164, %r29907, %r29905, %r29903, 0x96; + lop3.b32 %r8164, %r8164, %r29901, %r29899, 0x96; + lop3.b32 %r8165, %r29908, %r29906, %r29904, 0x96; + lop3.b32 %r8165, %r8165, %r29902, %r29900, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8176, %r29919, %r29917, %r29897, 0x96; + lop3.b32 %r8176, %r8176, %r29895, %r29893, 0x96; + lop3.b32 %r8177, %r29920, %r29918, %r29898, 0x96; + lop3.b32 %r8177, %r8177, %r29896, %r29894, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8188, %r29915, %r29913, %r29891, 0x96; + lop3.b32 %r8188, %r8188, %r29889, %r29887, 0x96; + lop3.b32 %r8189, %r29916, %r29914, %r29892, 0x96; + lop3.b32 %r8189, %r8189, %r29890, %r29888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8200, %r29911, %r29885, %r29883, 0x96; + lop3.b32 %r8200, %r8200, %r29881, %r29879, 0x96; + lop3.b32 %r8201, %r29912, %r29886, %r29884, 0x96; + lop3.b32 %r8201, %r8201, %r29882, %r29880, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8212, %r29909, %r29877, %r29875, 0x96; + lop3.b32 %r8212, %r8212, %r29873, %r29871, 0x96; + lop3.b32 %r8213, %r29910, %r29878, %r29876, 0x96; + lop3.b32 %r8213, %r8213, %r29874, %r29872, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8224, %r8177, %r8176, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8228, %r8176, %r8177, %r8263; + // end inline asm + xor.b32 %r8403, %r8224, %r8212; + xor.b32 %r8404, %r8228, %r8213; + xor.b32 %r8371, %r29907, %r8403; + xor.b32 %r8374, %r29908, %r8404; + xor.b32 %r8334, %r29904, %r8404; + xor.b32 %r8333, %r29903, %r8403; + st.local.v2.u32 [%rd53+104], {%r8333, %r8334}; + // begin inline asm + shf.l.wrap.b32 %r8232, %r8189, %r8188, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8236, %r8188, %r8189, %r8263; + // end inline asm + xor.b32 %r8405, %r8232, %r8164; + xor.b32 %r8406, %r8236, %r8165; + xor.b32 %r8270, %r29917, %r8405; + xor.b32 %r8269, %r29918, %r8406; + xor.b32 %r8309, %r29896, %r8406; + xor.b32 %r8310, %r29895, %r8405; + st.local.v2.u32 [%rd53+152], {%r8310, %r8309}; + // begin inline asm + shf.l.wrap.b32 %r8240, %r8201, %r8200, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8244, %r8200, %r8201, %r8263; + // end inline asm + xor.b32 %r8407, %r8240, %r8176; + xor.b32 %r8408, %r8244, %r8177; + xor.b32 %r8293, %r29892, %r8408; + xor.b32 %r8294, %r29891, %r8407; + st.local.v2.u32 [%rd53+120], {%r8294, %r8293}; + xor.b32 %r8285, %r29888, %r8408; + xor.b32 %r8286, %r29887, %r8407; + st.local.v2.u32 [%rd53+200], {%r8286, %r8285}; + // begin inline asm + shf.l.wrap.b32 %r8248, %r8213, %r8212, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8252, %r8212, %r8213, %r8263; + // end inline asm + xor.b32 %r8409, %r8248, %r8188; + xor.b32 %r8410, %r8252, %r8189; + xor.b32 %r8317, %r29911, %r8409; + xor.b32 %r8318, %r29912, %r8410; + xor.b32 %r8326, %r29882, %r8410; + xor.b32 %r8325, %r29881, %r8409; + st.local.v2.u32 [%rd53+168], {%r8325, %r8326}; + // begin inline asm + shf.l.wrap.b32 %r8256, %r8165, %r8164, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8260, %r8164, %r8165, %r8263; + // end inline asm + xor.b32 %r8411, %r8256, %r8200; + xor.b32 %r8412, %r8260, %r8201; + xor.b32 %r8277, %r29877, %r8411; + xor.b32 %r8278, %r29878, %r8412; + xor.b32 %r8302, %r29872, %r8412; + xor.b32 %r8301, %r29871, %r8411; + st.local.v2.u32 [%rd53+216], {%r8301, %r8302}; + // begin inline asm 
+ shf.l.wrap.b32 %r8264, %r8270, %r8269, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8268, %r8269, %r8270, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8272, %r8278, %r8277, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8276, %r8277, %r8278, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8284, %r8285, %r8286, %r7783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8280, %r8286, %r8285, %r7783; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r8280, %r8284}; + // begin inline asm + shf.l.wrap.b32 %r8288, %r8294, %r8293, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8292, %r8293, %r8294, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8296, %r8302, %r8301, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8300, %r8301, %r8302, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8308, %r8309, %r8310, %r7887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8304, %r8310, %r8309, %r7887; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r8304, %r8308}; + // begin inline asm + shf.l.wrap.b32 %r8312, %r8318, %r8317, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8316, %r8317, %r8318, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8320, %r8326, %r8325, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8324, %r8325, %r8326, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8328, %r8334, %r8333, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8332, %r8333, %r8334, %r7943; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r8336, %r8371, %r8264, %r8288, 0xD2; + lop3.b32 %r8337, %r8374, %r8268, %r8292, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30054, %r8264, %r8288, %r8320, 0xD2; + lop3.b32 %r30055, %r8268, %r8292, %r8324, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+32], {%r30054, %r30055}; + // begin inline asm + // chi + lop3.b32 %r30050, %r8288, %r8320, %r8296, 0xD2; + lop3.b32 %r30051, %r8292, %r8324, %r8300, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+40], {%r30050, %r30051}; + // begin inline asm + // chi + lop3.b32 %r30046, %r8320, %r8296, %r8371, 0xD2; + lop3.b32 %r30047, %r8324, %r8300, %r8374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+48], {%r30046, %r30047}; + // begin inline asm + // chi + lop3.b32 %r30044, %r8296, %r8371, %r8264, 0xD2; + lop3.b32 %r30045, %r8300, %r8374, %r8268, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+56], {%r30044, %r30045}; + // begin inline asm + // chi + lop3.b32 %r30040, %r8312, %r8272, %r8328, 0xD2; + lop3.b32 %r30041, %r8316, %r8276, %r8332, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+64], {%r30040, %r30041}; + // begin inline asm + // chi + lop3.b32 %r30052, %r8272, %r8328, %r8304, 0xD2; + lop3.b32 %r30053, %r8276, %r8332, %r8308, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+72], {%r30052, %r30053}; + // begin inline asm + // chi + lop3.b32 %r30048, %r8328, %r8304, %r8280, 0xD2; + lop3.b32 %r30049, %r8332, %r8308, %r8284, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+80], {%r30048, %r30049}; + // begin inline asm + ld.global.nc.v2.u32 {%r8400,%r8401}, [%rd481]; + // end inline asm + xor.b32 %r30042, %r8336, %r8400; + xor.b32 %r30043, %r8337, %r8401; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + add.s64 %rd55, %rd53, 24; + add.s64 %rd56, %rd2, 24; + +$L__BB2_19: + 
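// NOTE: the loop at $L__BB2_19 below appears to be an Ethash-style FNV-1 data
+ // mix: each of its 512 iterations picks one state word (the and.b64 with 60
+ // cycles through sixteen 32-bit words), folds it into an item index via the
+ // FNV prime 16777619, reduces that index mod 1179641 (the mul.wide.u32 by
+ // -954391867 plus shifts is a magic-number modulus), and then FNV-mixes the
+ // selected 64-byte table item into both hash states word by word. + 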
cvta.to.global.u64 %rd1270, %rd361; + shl.b32 %r8413, %r29954, 2; + cvt.u64.u32 %rd511, %r8413; + and.b64 %rd512, %rd511, 60; + add.s64 %rd513, %rd56, %rd512; + xor.b32 %r8414, %r47, %r29954; + mul.lo.s32 %r8415, %r8414, 16777619; + ld.local.u32 %r8416, [%rd513]; + xor.b32 %r8417, %r8415, %r8416; + mul.wide.u32 %rd514, %r8417, -954391867; + shr.u64 %rd515, %rd514, 32; + cvt.u32.u64 %r8418, %rd515; + sub.s32 %r8419, %r8417, %r8418; + shr.u32 %r8420, %r8419, 1; + add.s32 %r8421, %r8420, %r8418; + shr.u32 %r8422, %r8421, 20; + mul.lo.s32 %r8423, %r8422, 1179641; + sub.s32 %r8424, %r8417, %r8423; + mul.wide.u32 %rd516, %r8424, 64; + add.s64 %rd517, %rd1270, %rd516; + mul.lo.s32 %r8425, %r29991, 16777619; + ld.global.u32 %r8426, [%rd517]; + xor.b32 %r29991, %r8425, %r8426; + mul.lo.s32 %r8427, %r29992, 16777619; + ld.global.u32 %r8428, [%rd517+4]; + xor.b32 %r29992, %r8427, %r8428; + mul.lo.s32 %r8429, %r30003, 16777619; + ld.global.u32 %r8430, [%rd517+8]; + mul.lo.s32 %r8431, %r30004, 16777619; + ld.global.u32 %r8432, [%rd517+12]; + xor.b32 %r8433, %r8431, %r8432; + xor.b32 %r30003, %r8429, %r8430; + mov.b64 %rd518, {%r30003, %r8433}; + mul.lo.s32 %r8434, %r29999, 16777619; + ld.global.u32 %r8435, [%rd517+16]; + mul.lo.s32 %r8436, %r30000, 16777619; + ld.global.u32 %r8437, [%rd517+20]; + xor.b32 %r8438, %r8436, %r8437; + xor.b32 %r29999, %r8434, %r8435; + mov.b64 %rd519, {%r29999, %r8438}; + mul.lo.s32 %r8439, %r29995, 16777619; + ld.global.u32 %r8440, [%rd517+24]; + mul.lo.s32 %r8441, %r29996, 16777619; + ld.global.u32 %r8442, [%rd517+28]; + xor.b32 %r8443, %r8441, %r8442; + xor.b32 %r29995, %r8439, %r8440; + mov.b64 %rd520, {%r29995, %r8443}; + mul.lo.s32 %r8444, %r29993, 16777619; + ld.global.u32 %r8445, [%rd517+32]; + mul.lo.s32 %r8446, %r29994, 16777619; + ld.global.u32 %r8447, [%rd517+36]; + xor.b32 %r8448, %r8446, %r8447; + xor.b32 %r29993, %r8444, %r8445; + mov.b64 %rd521, {%r29993, %r8448}; + mul.lo.s32 %r8449, %r29989, 16777619; + ld.global.u32 %r8450, [%rd517+40]; + xor.b32 %r29989, %r8449, %r8450; + mul.lo.s32 %r8451, %r29990, 16777619; + ld.global.u32 %r8452, [%rd517+44]; + xor.b32 %r29990, %r8451, %r8452; + mul.lo.s32 %r8453, %r30001, 16777619; + ld.global.u32 %r8454, [%rd517+48]; + mul.lo.s32 %r8455, %r30002, 16777619; + ld.global.u32 %r8456, [%rd517+52]; + xor.b32 %r8457, %r8455, %r8456; + xor.b32 %r30001, %r8453, %r8454; + mov.b64 %rd522, {%r30001, %r8457}; + mul.lo.s32 %r8458, %r29997, 16777619; + ld.global.u32 %r8459, [%rd517+56]; + mul.lo.s32 %r8460, %r29998, 16777619; + ld.global.u32 %r8461, [%rd517+60]; + xor.b32 %r8462, %r8460, %r8461; + xor.b32 %r29997, %r8458, %r8459; + mov.b64 %rd523, {%r29997, %r8462}; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + st.local.v2.u32 [%rd2+32], {%r30003, %r8433}; + st.local.v2.u32 [%rd2+40], {%r29999, %r8438}; + st.local.v2.u32 [%rd2+48], {%r29995, %r8443}; + st.local.v2.u32 [%rd2+56], {%r29993, %r8448}; + st.local.v2.u32 [%rd2+64], {%r29989, %r29990}; + st.local.v2.u32 [%rd2+72], {%r30001, %r8457}; + st.local.v2.u32 [%rd2+80], {%r29997, %r8462}; + add.s64 %rd524, %rd55, %rd512; + xor.b32 %r8463, %r243, %r29954; + mul.lo.s32 %r8464, %r8463, 16777619; + ld.local.u32 %r8465, [%rd524]; + xor.b32 %r8466, %r8464, %r8465; + mul.wide.u32 %rd525, %r8466, -954391867; + shr.u64 %rd526, %rd525, 32; + cvt.u32.u64 %r8467, %rd526; + sub.s32 %r8468, %r8466, %r8467; + shr.u32 %r8469, %r8468, 1; + add.s32 %r8470, %r8469, %r8467; + shr.u32 %r8471, %r8470, 20; + mul.lo.s32 %r8472, %r8471, 1179641; + sub.s32 %r8473, %r8466, %r8472; + mul.wide.u32 
%rd527, %r8473, 64; + add.s64 %rd528, %rd1270, %rd527; + mul.lo.s32 %r8474, %r30042, 16777619; + ld.global.u32 %r8475, [%rd528]; + xor.b32 %r30042, %r8474, %r8475; + mul.lo.s32 %r8476, %r30043, 16777619; + ld.global.u32 %r8477, [%rd528+4]; + xor.b32 %r30043, %r8476, %r8477; + mul.lo.s32 %r8478, %r30054, 16777619; + ld.global.u32 %r8479, [%rd528+8]; + mul.lo.s32 %r8480, %r30055, 16777619; + ld.global.u32 %r8481, [%rd528+12]; + xor.b32 %r8482, %r8480, %r8481; + xor.b32 %r30054, %r8478, %r8479; + mov.b64 %rd529, {%r30054, %r8482}; + mul.lo.s32 %r8483, %r30050, 16777619; + ld.global.u32 %r8484, [%rd528+16]; + mul.lo.s32 %r8485, %r30051, 16777619; + ld.global.u32 %r8486, [%rd528+20]; + xor.b32 %r8487, %r8485, %r8486; + xor.b32 %r30050, %r8483, %r8484; + mov.b64 %rd530, {%r30050, %r8487}; + mul.lo.s32 %r8488, %r30046, 16777619; + ld.global.u32 %r8489, [%rd528+24]; + mul.lo.s32 %r8490, %r30047, 16777619; + ld.global.u32 %r8491, [%rd528+28]; + xor.b32 %r8492, %r8490, %r8491; + xor.b32 %r30046, %r8488, %r8489; + mov.b64 %rd531, {%r30046, %r8492}; + mul.lo.s32 %r8493, %r30044, 16777619; + ld.global.u32 %r8494, [%rd528+32]; + mul.lo.s32 %r8495, %r30045, 16777619; + ld.global.u32 %r8496, [%rd528+36]; + xor.b32 %r8497, %r8495, %r8496; + xor.b32 %r30044, %r8493, %r8494; + mov.b64 %rd532, {%r30044, %r8497}; + mul.lo.s32 %r8498, %r30040, 16777619; + ld.global.u32 %r8499, [%rd528+40]; + xor.b32 %r30040, %r8498, %r8499; + mul.lo.s32 %r8500, %r30041, 16777619; + ld.global.u32 %r8501, [%rd528+44]; + xor.b32 %r30041, %r8500, %r8501; + mul.lo.s32 %r8502, %r30052, 16777619; + ld.global.u32 %r8503, [%rd528+48]; + mul.lo.s32 %r8504, %r30053, 16777619; + ld.global.u32 %r8505, [%rd528+52]; + xor.b32 %r8506, %r8504, %r8505; + xor.b32 %r30052, %r8502, %r8503; + mov.b64 %rd533, {%r30052, %r8506}; + mul.lo.s32 %r8507, %r30048, 16777619; + ld.global.u32 %r8508, [%rd528+56]; + mul.lo.s32 %r8509, %r30049, 16777619; + ld.global.u32 %r8510, [%rd528+60]; + xor.b32 %r8511, %r8509, %r8510; + xor.b32 %r30048, %r8507, %r8508; + mov.b64 %rd534, {%r30048, %r8511}; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + st.local.v2.u32 [%rd53+32], {%r30054, %r8482}; + st.local.v2.u32 [%rd53+40], {%r30050, %r8487}; + st.local.v2.u32 [%rd53+48], {%r30046, %r8492}; + st.local.v2.u32 [%rd53+56], {%r30044, %r8497}; + st.local.v2.u32 [%rd53+64], {%r30040, %r30041}; + st.local.v2.u32 [%rd53+72], {%r30052, %r8506}; + st.local.v2.u32 [%rd53+80], {%r30048, %r8511}; + add.s32 %r29954, %r29954, 1; + setp.lt.u32 %p16, %r29954, 512; + shr.u64 %rd535, %rd518, 32; + cvt.u32.u64 %r30004, %rd535; + shr.u64 %rd536, %rd519, 32; + cvt.u32.u64 %r30000, %rd536; + shr.u64 %rd537, %rd520, 32; + cvt.u32.u64 %r29996, %rd537; + shr.u64 %rd538, %rd521, 32; + cvt.u32.u64 %r29994, %rd538; + shr.u64 %rd539, %rd522, 32; + cvt.u32.u64 %r30002, %rd539; + shr.u64 %rd540, %rd523, 32; + cvt.u32.u64 %r29998, %rd540; + shr.u64 %rd541, %rd529, 32; + cvt.u32.u64 %r30055, %rd541; + shr.u64 %rd542, %rd530, 32; + cvt.u32.u64 %r30051, %rd542; + shr.u64 %rd543, %rd531, 32; + cvt.u32.u64 %r30047, %rd543; + shr.u64 %rd544, %rd532, 32; + cvt.u32.u64 %r30045, %rd544; + shr.u64 %rd545, %rd533, 32; + cvt.u32.u64 %r30053, %rd545; + shr.u64 %rd546, %rd534, 32; + cvt.u32.u64 %r30049, %rd546; + @%p16 bra $L__BB2_19; + + mov.u32 %r29955, 0; + st.local.v2.u32 [%rd2+96], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+104], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+112], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+120], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+128], {%r29955, %r29955}; + 
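// NOTE: the st.local stores around this point appear to re-arm the sponge for
+ // the next permutation: lanes +96..+216 are zeroed, and the {1, 0x80000000}
+ // pair written at +88 just below matches Keccak pad10*1 for a 64-byte block. + 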
st.local.v2.u32 [%rd2+136], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+144], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+152], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+160], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+168], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+176], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+184], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+192], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+200], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+208], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+216], {%r29955, %r29955}; + mov.u32 %r29970, -2147483648; + mov.u32 %r8526, 1; + st.local.v2.u32 [%rd2+88], {%r8526, %r29970}; + mov.u32 %r29956, %r29955; + mov.u32 %r29957, %r29955; + mov.u32 %r29958, %r29955; + mov.u32 %r29959, %r29955; + mov.u32 %r29960, %r29955; + mov.u32 %r29961, %r29955; + mov.u32 %r29962, %r29955; + mov.u32 %r29963, %r29955; + mov.u32 %r29964, %r29955; + mov.u32 %r29965, %r29955; + mov.u32 %r29966, %r29955; + mov.u32 %r29967, %r29955; + mov.u32 %r29968, %r29955; + mov.u32 %r29969, %r8526; + mov.u32 %r29971, %r29955; + mov.u32 %r29972, %r29955; + mov.u32 %r29973, %r29955; + mov.u32 %r29974, %r29955; + mov.u32 %r29975, %r29955; + mov.u32 %r29976, %r29955; + mov.u32 %r29977, %r29955; + mov.u32 %r29978, %r29955; + mov.u32 %r29979, %r29955; + mov.u32 %r29980, %r29955; + mov.u32 %r29981, %r29955; + mov.u32 %r29982, %r29955; + mov.u32 %r29983, %r29955; + mov.u32 %r29984, %r29955; + mov.u32 %r29985, %r29955; + mov.u32 %r29986, %r29955; + mov.u32 %r29987, %r29955; + mov.u32 %r29988, %r29955; + mov.u32 %r30005, %r29955; + +$L__BB2_21: + // begin inline asm + // xor5 + lop3.b32 %r8553, %r29991, %r29989, %r29987, 0x96; + lop3.b32 %r8553, %r8553, %r29985, %r29983, 0x96; + lop3.b32 %r8554, %r29992, %r29990, %r29988, 0x96; + lop3.b32 %r8554, %r8554, %r29986, %r29984, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8565, %r30003, %r30001, %r29981, 0x96; + lop3.b32 %r8565, %r8565, %r29979, %r29977, 0x96; + lop3.b32 %r8566, %r30004, %r30002, %r29982, 0x96; + lop3.b32 %r8566, %r8566, %r29980, %r29978, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8577, %r29999, %r29997, %r29975, 0x96; + lop3.b32 %r8577, %r8577, %r29973, %r29971, 0x96; + lop3.b32 %r8578, %r30000, %r29998, %r29976, 0x96; + lop3.b32 %r8578, %r8578, %r29974, %r29972, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8589, %r29995, %r29969, %r29967, 0x96; + lop3.b32 %r8589, %r8589, %r29965, %r29963, 0x96; + lop3.b32 %r8590, %r29996, %r29970, %r29968, 0x96; + lop3.b32 %r8590, %r8590, %r29966, %r29964, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8601, %r29993, %r29961, %r29959, 0x96; + lop3.b32 %r8601, %r8601, %r29957, %r29955, 0x96; + lop3.b32 %r8602, %r29994, %r29962, %r29960, 0x96; + lop3.b32 %r8602, %r8602, %r29958, %r29956, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8613, %r8566, %r8565, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8617, %r8565, %r8566, %r8526; + // end inline asm + xor.b32 %r9047, %r8613, %r8601; + xor.b32 %r9048, %r8617, %r8602; + xor.b32 %r8880, %r29991, %r9047; + xor.b32 %r8883, %r29992, %r9048; + xor.b32 %r8787, %r29989, %r9047; + xor.b32 %r8786, %r29990, %r9048; + xor.b32 %r8834, %r29987, %r9047; + xor.b32 %r8835, %r29988, %r9048; + xor.b32 %r8739, %r29985, %r9047; + xor.b32 %r8738, %r29986, %r9048; + xor.b32 %r8690, %r29983, %r9047; + xor.b32 %r8691, %r29984, %r9048; + // begin inline asm + shf.l.wrap.b32 %r8621, %r8578, %r8577, %r8526; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r8625, %r8577, %r8578, %r8526; + // end inline asm + xor.b32 %r9049, %r8621, %r8553; + xor.b32 %r9050, %r8625, %r8554; + xor.b32 %r8842, %r30003, %r9049; + xor.b32 %r8843, %r30004, %r9050; + xor.b32 %r8659, %r30001, %r9049; + xor.b32 %r8658, %r30002, %r9050; + xor.b32 %r8818, %r29981, %r9049; + xor.b32 %r8819, %r29982, %r9050; + xor.b32 %r8779, %r29979, %r9049; + xor.b32 %r8778, %r29980, %r9050; + xor.b32 %r8762, %r29977, %r9049; + xor.b32 %r8763, %r29978, %r9050; + // begin inline asm + shf.l.wrap.b32 %r8629, %r8590, %r8589, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8633, %r8589, %r8590, %r8526; + // end inline asm + xor.b32 %r9051, %r8629, %r8565; + xor.b32 %r9052, %r8633, %r8566; + xor.b32 %r8699, %r29999, %r9051; + xor.b32 %r8698, %r30000, %r9052; + xor.b32 %r8826, %r29997, %r9051; + xor.b32 %r8827, %r29998, %r9052; + xor.b32 %r8707, %r29975, %r9051; + xor.b32 %r8706, %r29976, %r9052; + xor.b32 %r8810, %r29973, %r9051; + xor.b32 %r8811, %r29974, %r9052; + xor.b32 %r8675, %r29971, %r9051; + xor.b32 %r8674, %r29972, %r9052; + // begin inline asm + shf.l.wrap.b32 %r8637, %r8602, %r8601, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8641, %r8601, %r8602, %r8526; + // end inline asm + xor.b32 %r9053, %r8637, %r8577; + xor.b32 %r9054, %r8641, %r8578; + xor.b32 %r8794, %r29995, %r9053; + xor.b32 %r8795, %r29996, %r9054; + xor.b32 %r8771, %r29969, %r9053; + xor.b32 %r8770, %r29970, %r9054; + xor.b32 %r8714, %r29967, %r9053; + xor.b32 %r8715, %r29968, %r9054; + xor.b32 %r8802, %r29965, %r9053; + xor.b32 %r8803, %r29966, %r9054; + xor.b32 %r8731, %r29963, %r9053; + xor.b32 %r8730, %r29964, %r9054; + // begin inline asm + shf.l.wrap.b32 %r8645, %r8554, %r8553, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8649, %r8553, %r8554, %r8526; + // end inline asm + xor.b32 %r9055, %r8645, %r8589; + xor.b32 %r9056, %r8649, %r8590; + xor.b32 %r8746, %r29993, %r9055; + xor.b32 %r8747, %r29994, %r9056; + xor.b32 %r8666, %r29961, %r9055; + xor.b32 %r8667, %r29962, %r9056; + xor.b32 %r8683, %r29959, %r9055; + xor.b32 %r8682, %r29960, %r9056; + xor.b32 %r8722, %r29957, %r9055; + xor.b32 %r8723, %r29958, %r9056; + xor.b32 %r8754, %r29955, %r9055; + xor.b32 %r8755, %r29956, %r9056; + mov.u32 %r8660, 44; + // begin inline asm + shf.l.wrap.b32 %r8653, %r8659, %r8658, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8657, %r8658, %r8659, %r8660; + // end inline asm + mov.u32 %r8668, 20; + // begin inline asm + shf.l.wrap.b32 %r8661, %r8667, %r8666, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8665, %r8666, %r8667, %r8668; + // end inline asm + mov.u32 %r8676, 61; + // begin inline asm + shf.l.wrap.b32 %r8669, %r8675, %r8674, %r8676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8673, %r8674, %r8675, %r8676; + // end inline asm + mov.u32 %r8684, 39; + // begin inline asm + shf.l.wrap.b32 %r8677, %r8683, %r8682, %r8684; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8681, %r8682, %r8683, %r8684; + // end inline asm + mov.u32 %r8692, 18; + // begin inline asm + shf.l.wrap.b32 %r8685, %r8691, %r8690, %r8692; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8689, %r8690, %r8691, %r8692; + // end inline asm + mov.u32 %r8700, 62; + // begin inline asm + shf.l.wrap.b32 %r8693, %r8699, %r8698, %r8700; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8697, %r8698, %r8699, %r8700; + // end inline asm + 
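// NOTE: in this round body, lop3.b32 with immediate 0x96 is a three-way XOR
+ // (the theta column parities) and 0xD2 computes a ^ (~b & c) (the chi step);
+ // the mov.u32 constants 44, 20, 61, ..., 43, 25, 8 are the Keccak rho
+ // offsets, each 64-bit rotate realized as a pair of shf.l.wrap.b32 funnel
+ // shifts over the lane's 32-bit word pair. + 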
mov.u32 %r8708, 43; + // begin inline asm + shf.l.wrap.b32 %r8701, %r8707, %r8706, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8705, %r8706, %r8707, %r8708; + // end inline asm + mov.u32 %r8716, 25; + // begin inline asm + shf.l.wrap.b32 %r8709, %r8715, %r8714, %r8716; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8713, %r8714, %r8715, %r8716; + // end inline asm + mov.u32 %r8724, 8; + // begin inline asm + shf.l.wrap.b32 %r8717, %r8723, %r8722, %r8724; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8721, %r8722, %r8723, %r8724; + // end inline asm + mov.u32 %r8732, 56; + // begin inline asm + shf.l.wrap.b32 %r8725, %r8731, %r8730, %r8732; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8729, %r8730, %r8731, %r8732; + // end inline asm + mov.u32 %r8740, 41; + // begin inline asm + shf.l.wrap.b32 %r8733, %r8739, %r8738, %r8740; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8737, %r8738, %r8739, %r8740; + // end inline asm + mov.u32 %r8748, 27; + // begin inline asm + shf.l.wrap.b32 %r8741, %r8747, %r8746, %r8748; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8745, %r8746, %r8747, %r8748; + // end inline asm + mov.u32 %r8756, 14; + // begin inline asm + shf.l.wrap.b32 %r8749, %r8755, %r8754, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8753, %r8754, %r8755, %r8756; + // end inline asm + mov.u32 %r8764, 2; + // begin inline asm + shf.l.wrap.b32 %r8757, %r8763, %r8762, %r8764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8761, %r8762, %r8763, %r8764; + // end inline asm + mov.u32 %r8772, 55; + // begin inline asm + shf.l.wrap.b32 %r8765, %r8771, %r8770, %r8772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8769, %r8770, %r8771, %r8772; + // end inline asm + mov.u32 %r8780, 45; + // begin inline asm + shf.l.wrap.b32 %r8773, %r8779, %r8778, %r8780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8777, %r8778, %r8779, %r8780; + // end inline asm + mov.u32 %r8788, 36; + // begin inline asm + shf.l.wrap.b32 %r8781, %r8787, %r8786, %r8788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8785, %r8786, %r8787, %r8788; + // end inline asm + mov.u32 %r8796, 28; + // begin inline asm + shf.l.wrap.b32 %r8789, %r8795, %r8794, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8793, %r8794, %r8795, %r8796; + // end inline asm + mov.u32 %r8804, 21; + // begin inline asm + shf.l.wrap.b32 %r8797, %r8803, %r8802, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8801, %r8802, %r8803, %r8804; + // end inline asm + mov.u32 %r8812, 15; + // begin inline asm + shf.l.wrap.b32 %r8805, %r8811, %r8810, %r8812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8809, %r8810, %r8811, %r8812; + // end inline asm + mov.u32 %r8820, 10; + // begin inline asm + shf.l.wrap.b32 %r8813, %r8819, %r8818, %r8820; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8817, %r8818, %r8819, %r8820; + // end inline asm + mov.u32 %r8828, 6; + // begin inline asm + shf.l.wrap.b32 %r8821, %r8827, %r8826, %r8828; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8825, %r8826, %r8827, %r8828; + // end inline asm + mov.u32 %r8836, 3; + // begin inline asm + shf.l.wrap.b32 %r8829, %r8835, %r8834, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8833, %r8834, %r8835, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8837, %r8843, %r8842, %r8526; + // end inline asm + 
// begin inline asm + shf.l.wrap.b32 %r8841, %r8842, %r8843, %r8526; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r8845, %r8880, %r8653, %r8701, 0xD2; + lop3.b32 %r8846, %r8883, %r8657, %r8705, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30003, %r8653, %r8701, %r8797, 0xD2; + lop3.b32 %r30004, %r8657, %r8705, %r8801, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29999, %r8701, %r8797, %r8749, 0xD2; + lop3.b32 %r30000, %r8705, %r8801, %r8753, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29995, %r8797, %r8749, %r8880, 0xD2; + lop3.b32 %r29996, %r8801, %r8753, %r8883, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29993, %r8749, %r8880, %r8653, 0xD2; + lop3.b32 %r29994, %r8753, %r8883, %r8657, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29989, %r8789, %r8661, %r8829, 0xD2; + lop3.b32 %r29990, %r8793, %r8665, %r8833, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30001, %r8661, %r8829, %r8773, 0xD2; + lop3.b32 %r30002, %r8665, %r8833, %r8777, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29997, %r8829, %r8773, %r8669, 0xD2; + lop3.b32 %r29998, %r8833, %r8777, %r8673, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29969, %r8773, %r8669, %r8789, 0xD2; + lop3.b32 %r29970, %r8777, %r8673, %r8793, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r29969, %r29970}; + // begin inline asm + // chi + lop3.b32 %r29961, %r8669, %r8789, %r8661, 0xD2; + lop3.b32 %r29962, %r8673, %r8793, %r8665, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r29961, %r29962}; + // begin inline asm + // chi + lop3.b32 %r29987, %r8837, %r8821, %r8709, 0xD2; + lop3.b32 %r29988, %r8841, %r8825, %r8713, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r29987, %r29988}; + // begin inline asm + // chi + lop3.b32 %r29981, %r8821, %r8709, %r8717, 0xD2; + lop3.b32 %r29982, %r8825, %r8713, %r8721, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r29981, %r29982}; + // begin inline asm + // chi + lop3.b32 %r29975, %r8709, %r8717, %r8685, 0xD2; + lop3.b32 %r29976, %r8713, %r8721, %r8689, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r29975, %r29976}; + // begin inline asm + // chi + lop3.b32 %r29967, %r8717, %r8685, %r8837, 0xD2; + lop3.b32 %r29968, %r8721, %r8689, %r8841, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r29967, %r29968}; + // begin inline asm + // chi + lop3.b32 %r29959, %r8685, %r8837, %r8821, 0xD2; + lop3.b32 %r29960, %r8689, %r8841, %r8825, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r29959, %r29960}; + // begin inline asm + // chi + lop3.b32 %r29985, %r8741, %r8781, %r8813, 0xD2; + lop3.b32 %r29986, %r8745, %r8785, %r8817, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r29985, %r29986}; + // begin inline asm + // chi + lop3.b32 %r29979, %r8781, %r8813, %r8805, 0xD2; + lop3.b32 %r29980, %r8785, %r8817, %r8809, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r29979, %r29980}; + // begin inline asm + // chi + lop3.b32 %r29973, %r8813, %r8805, %r8725, 0xD2; + lop3.b32 %r29974, %r8817, %r8809, %r8729, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r29973, %r29974}; + // begin inline asm + // chi + lop3.b32 %r29965, %r8805, %r8725, %r8741, 0xD2; + lop3.b32 %r29966, %r8809, %r8729, %r8745, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r29965, %r29966}; + // begin inline asm + // chi + lop3.b32 %r29957, 
%r8725, %r8741, %r8781, 0xD2; + lop3.b32 %r29958, %r8729, %r8745, %r8785, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r29957, %r29958}; + // begin inline asm + // chi + lop3.b32 %r29983, %r8693, %r8765, %r8677, 0xD2; + lop3.b32 %r29984, %r8697, %r8769, %r8681, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r29983, %r29984}; + // begin inline asm + // chi + lop3.b32 %r29977, %r8765, %r8677, %r8733, 0xD2; + lop3.b32 %r29978, %r8769, %r8681, %r8737, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r29977, %r29978}; + // begin inline asm + // chi + lop3.b32 %r29971, %r8677, %r8733, %r8757, 0xD2; + lop3.b32 %r29972, %r8681, %r8737, %r8761, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r29971, %r29972}; + // begin inline asm + // chi + lop3.b32 %r29963, %r8733, %r8757, %r8693, 0xD2; + lop3.b32 %r29964, %r8737, %r8761, %r8697, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r29963, %r29964}; + // begin inline asm + // chi + lop3.b32 %r29955, %r8757, %r8693, %r8765, 0xD2; + lop3.b32 %r29956, %r8761, %r8697, %r8769, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r29955, %r29956}; + mul.wide.s32 %rd548, %r30005, 8; + add.s64 %rd547, %rd480, %rd548; + // begin inline asm + ld.global.nc.v2.u32 {%r9045,%r9046}, [%rd547]; + // end inline asm + xor.b32 %r29991, %r8845, %r9045; + xor.b32 %r29992, %r8846, %r9046; + add.s32 %r30005, %r30005, 1; + setp.lt.u32 %p17, %r30005, 23; + @%p17 bra $L__BB2_21; + + st.local.v2.u32 [%rd2+32], {%r30003, %r30004}; + st.local.v2.u32 [%rd2+72], {%r30001, %r30002}; + st.local.v2.u32 [%rd2+40], {%r29999, %r30000}; + st.local.v2.u32 [%rd2+80], {%r29997, %r29998}; + st.local.v2.u32 [%rd2+48], {%r29995, %r29996}; + st.local.v2.u32 [%rd2+56], {%r29993, %r29994}; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + // begin inline asm + // xor5 + lop3.b32 %r9057, %r29991, %r29989, %r29987, 0x96; + lop3.b32 %r9057, %r9057, %r29985, %r29983, 0x96; + lop3.b32 %r9058, %r29992, %r29990, %r29988, 0x96; + lop3.b32 %r9058, %r9058, %r29986, %r29984, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9069, %r30003, %r30001, %r29981, 0x96; + lop3.b32 %r9069, %r9069, %r29979, %r29977, 0x96; + lop3.b32 %r9070, %r30004, %r30002, %r29982, 0x96; + lop3.b32 %r9070, %r9070, %r29980, %r29978, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9081, %r29999, %r29997, %r29975, 0x96; + lop3.b32 %r9081, %r9081, %r29973, %r29971, 0x96; + lop3.b32 %r9082, %r30000, %r29998, %r29976, 0x96; + lop3.b32 %r9082, %r9082, %r29974, %r29972, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9093, %r29995, %r29969, %r29967, 0x96; + lop3.b32 %r9093, %r9093, %r29965, %r29963, 0x96; + lop3.b32 %r9094, %r29996, %r29970, %r29968, 0x96; + lop3.b32 %r9094, %r9094, %r29966, %r29964, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9105, %r29993, %r29961, %r29959, 0x96; + lop3.b32 %r9105, %r9105, %r29957, %r29955, 0x96; + lop3.b32 %r9106, %r29994, %r29962, %r29960, 0x96; + lop3.b32 %r9106, %r9106, %r29958, %r29956, 0x96; + // end inline asm + mov.u32 %r9309, 1; + // begin inline asm + shf.l.wrap.b32 %r9117, %r9070, %r9069, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9121, %r9069, %r9070, %r9309; + // end inline asm + xor.b32 %r9336, %r9117, %r9105; + xor.b32 %r9337, %r9121, %r9106; + xor.b32 %r9264, %r29991, %r9336; + xor.b32 %r9267, %r29992, %r9337; + xor.b32 %r9227, %r29988, %r9337; + xor.b32 %r9226, %r29987, %r9336; + st.local.v2.u32 [%rd2+104], 
{%r9226, %r9227}; + // begin inline asm + shf.l.wrap.b32 %r9125, %r9082, %r9081, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9129, %r9081, %r9082, %r9309; + // end inline asm + xor.b32 %r9338, %r9125, %r9057; + xor.b32 %r9339, %r9129, %r9058; + xor.b32 %r9163, %r30001, %r9338; + xor.b32 %r9162, %r30002, %r9339; + xor.b32 %r9202, %r29980, %r9339; + xor.b32 %r9203, %r29979, %r9338; + st.local.v2.u32 [%rd2+152], {%r9203, %r9202}; + // begin inline asm + shf.l.wrap.b32 %r9133, %r9094, %r9093, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9137, %r9093, %r9094, %r9309; + // end inline asm + xor.b32 %r9340, %r9133, %r9069; + xor.b32 %r9341, %r9137, %r9070; + xor.b32 %r9186, %r29976, %r9341; + xor.b32 %r9187, %r29975, %r9340; + st.local.v2.u32 [%rd2+120], {%r9187, %r9186}; + xor.b32 %r9178, %r29972, %r9341; + xor.b32 %r9179, %r29971, %r9340; + st.local.v2.u32 [%rd2+200], {%r9179, %r9178}; + // begin inline asm + shf.l.wrap.b32 %r9141, %r9106, %r9105, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9145, %r9105, %r9106, %r9309; + // end inline asm + xor.b32 %r9342, %r9141, %r9081; + xor.b32 %r9343, %r9145, %r9082; + xor.b32 %r9210, %r29995, %r9342; + xor.b32 %r9211, %r29996, %r9343; + xor.b32 %r9219, %r29966, %r9343; + xor.b32 %r9218, %r29965, %r9342; + st.local.v2.u32 [%rd2+168], {%r9218, %r9219}; + // begin inline asm + shf.l.wrap.b32 %r9149, %r9058, %r9057, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9153, %r9057, %r9058, %r9309; + // end inline asm + xor.b32 %r9344, %r9149, %r9093; + xor.b32 %r9345, %r9153, %r9094; + xor.b32 %r9170, %r29961, %r9344; + xor.b32 %r9171, %r29962, %r9345; + xor.b32 %r9195, %r29956, %r9345; + xor.b32 %r9194, %r29955, %r9344; + st.local.v2.u32 [%rd2+216], {%r9194, %r9195}; + // begin inline asm + shf.l.wrap.b32 %r9157, %r9163, %r9162, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9161, %r9162, %r9163, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9165, %r9171, %r9170, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9169, %r9170, %r9171, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9177, %r9178, %r9179, %r8676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9173, %r9179, %r9178, %r8676; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r9173, %r9177}; + // begin inline asm + shf.l.wrap.b32 %r9181, %r9187, %r9186, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9185, %r9186, %r9187, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9189, %r9195, %r9194, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9193, %r9194, %r9195, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9201, %r9202, %r9203, %r8780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9197, %r9203, %r9202, %r8780; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r9197, %r9201}; + // begin inline asm + shf.l.wrap.b32 %r9205, %r9211, %r9210, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9209, %r9210, %r9211, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9213, %r9219, %r9218, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9217, %r9218, %r9219, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9221, %r9227, %r9226, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9225, %r9226, %r9227, %r8836; + // end inline asm + // 
begin inline asm + // chi + lop3.b32 %r9229, %r9264, %r9157, %r9181, 0xD2; + lop3.b32 %r9230, %r9267, %r9161, %r9185, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9237, %r9157, %r9181, %r9213, 0xD2; + lop3.b32 %r9238, %r9161, %r9185, %r9217, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r9237, %r9238}; + // begin inline asm + // chi + lop3.b32 %r9245, %r9181, %r9213, %r9189, 0xD2; + lop3.b32 %r9246, %r9185, %r9217, %r9193, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r9245, %r9246}; + // begin inline asm + // chi + lop3.b32 %r9253, %r9213, %r9189, %r9264, 0xD2; + lop3.b32 %r9254, %r9217, %r9193, %r9267, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r9253, %r9254}; + // begin inline asm + // chi + lop3.b32 %r9261, %r9189, %r9264, %r9157, 0xD2; + lop3.b32 %r9262, %r9193, %r9267, %r9161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r9261, %r9262}; + // begin inline asm + // chi + lop3.b32 %r9269, %r9205, %r9165, %r9221, 0xD2; + lop3.b32 %r9270, %r9209, %r9169, %r9225, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r9269, %r9270}; + // begin inline asm + // chi + lop3.b32 %r9277, %r9165, %r9221, %r9197, 0xD2; + lop3.b32 %r9278, %r9169, %r9225, %r9201, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r9277, %r9278}; + // begin inline asm + // chi + lop3.b32 %r9285, %r9221, %r9197, %r9173, 0xD2; + lop3.b32 %r9286, %r9225, %r9201, %r9177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r9285, %r9286}; + // begin inline asm + ld.global.nc.v2.u32 {%r9293,%r9294}, [%rd481]; + // end inline asm + xor.b32 %r9346, %r9230, %r9294; + xor.b32 %r9347, %r9229, %r9293; + mov.b64 %rd1317, {%r9347, %r9346}; + mov.b64 %rd1318, {%r9237, %r9238}; + mov.b64 %rd1319, {%r9245, %r9246}; + mov.b64 %rd1320, {%r9253, %r9254}; + mov.b64 %rd1321, {%r9261, %r9262}; + mov.b64 %rd1322, {%r9269, %r9270}; + mov.b64 %rd1323, {%r9277, %r9278}; + mov.b64 %rd1324, {%r9285, %r9286}; + mov.u32 %r30006, 0; + st.local.v2.u32 [%rd2+24], {%r9347, %r9346}; + st.local.v2.u32 [%rd53+96], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+104], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+112], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+120], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+128], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+136], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+144], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+152], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+160], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+168], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+176], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+184], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+192], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+200], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+208], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+216], {%r30006, %r30006}; + mov.u32 %r30021, -2147483648; + st.local.v2.u32 [%rd53+88], {%r9309, %r30021}; + mov.u32 %r30007, %r30006; + mov.u32 %r30008, %r30006; + mov.u32 %r30009, %r30006; + mov.u32 %r30010, %r30006; + mov.u32 %r30011, %r30006; + mov.u32 %r30012, %r30006; + mov.u32 %r30013, %r30006; + mov.u32 %r30014, %r30006; + mov.u32 %r30015, %r30006; + mov.u32 %r30016, %r30006; + mov.u32 %r30017, %r30006; + mov.u32 %r30018, %r30006; + mov.u32 %r30019, %r30006; + mov.u32 %r30020, %r9309; + mov.u32 %r30022, %r30006; + mov.u32 %r30023, %r30006; + mov.u32 %r30024, %r30006; + mov.u32 %r30025, %r30006; + mov.u32 %r30026, %r30006; + mov.u32 %r30027, %r30006; + mov.u32 %r30028, %r30006; + mov.u32 %r30029, %r30006; + 
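// NOTE: this run of mov.u32 instructions appears to materialize the zeroed
+ // lanes of the second hash state in registers, so the round loop entered at
+ // $L__BB2_23 can keep the whole 1600-bit state out of local memory. + 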
mov.u32 %r30030, %r30006; + mov.u32 %r30031, %r30006; + mov.u32 %r30032, %r30006; + mov.u32 %r30033, %r30006; + mov.u32 %r30034, %r30006; + mov.u32 %r30035, %r30006; + mov.u32 %r30036, %r30006; + mov.u32 %r30037, %r30006; + mov.u32 %r30038, %r30006; + mov.u32 %r30039, %r30006; + mov.u32 %r30056, %r30006; + +$L__BB2_23: + // begin inline asm + // xor5 + lop3.b32 %r9348, %r30042, %r30040, %r30038, 0x96; + lop3.b32 %r9348, %r9348, %r30036, %r30034, 0x96; + lop3.b32 %r9349, %r30043, %r30041, %r30039, 0x96; + lop3.b32 %r9349, %r9349, %r30037, %r30035, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9360, %r30054, %r30052, %r30032, 0x96; + lop3.b32 %r9360, %r9360, %r30030, %r30028, 0x96; + lop3.b32 %r9361, %r30055, %r30053, %r30033, 0x96; + lop3.b32 %r9361, %r9361, %r30031, %r30029, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9372, %r30050, %r30048, %r30026, 0x96; + lop3.b32 %r9372, %r9372, %r30024, %r30022, 0x96; + lop3.b32 %r9373, %r30051, %r30049, %r30027, 0x96; + lop3.b32 %r9373, %r9373, %r30025, %r30023, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9384, %r30046, %r30020, %r30018, 0x96; + lop3.b32 %r9384, %r9384, %r30016, %r30014, 0x96; + lop3.b32 %r9385, %r30047, %r30021, %r30019, 0x96; + lop3.b32 %r9385, %r9385, %r30017, %r30015, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9396, %r30044, %r30012, %r30010, 0x96; + lop3.b32 %r9396, %r9396, %r30008, %r30006, 0x96; + lop3.b32 %r9397, %r30045, %r30013, %r30011, 0x96; + lop3.b32 %r9397, %r9397, %r30009, %r30007, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9408, %r9361, %r9360, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9412, %r9360, %r9361, %r9309; + // end inline asm + xor.b32 %r9842, %r9408, %r9396; + xor.b32 %r9843, %r9412, %r9397; + xor.b32 %r9675, %r30042, %r9842; + xor.b32 %r9678, %r30043, %r9843; + xor.b32 %r9582, %r30040, %r9842; + xor.b32 %r9581, %r30041, %r9843; + xor.b32 %r9629, %r30038, %r9842; + xor.b32 %r9630, %r30039, %r9843; + xor.b32 %r9534, %r30036, %r9842; + xor.b32 %r9533, %r30037, %r9843; + xor.b32 %r9485, %r30034, %r9842; + xor.b32 %r9486, %r30035, %r9843; + // begin inline asm + shf.l.wrap.b32 %r9416, %r9373, %r9372, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9420, %r9372, %r9373, %r9309; + // end inline asm + xor.b32 %r9844, %r9416, %r9348; + xor.b32 %r9845, %r9420, %r9349; + xor.b32 %r9637, %r30054, %r9844; + xor.b32 %r9638, %r30055, %r9845; + xor.b32 %r9454, %r30052, %r9844; + xor.b32 %r9453, %r30053, %r9845; + xor.b32 %r9613, %r30032, %r9844; + xor.b32 %r9614, %r30033, %r9845; + xor.b32 %r9574, %r30030, %r9844; + xor.b32 %r9573, %r30031, %r9845; + xor.b32 %r9557, %r30028, %r9844; + xor.b32 %r9558, %r30029, %r9845; + // begin inline asm + shf.l.wrap.b32 %r9424, %r9385, %r9384, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9428, %r9384, %r9385, %r9309; + // end inline asm + xor.b32 %r9846, %r9424, %r9360; + xor.b32 %r9847, %r9428, %r9361; + xor.b32 %r9494, %r30050, %r9846; + xor.b32 %r9493, %r30051, %r9847; + xor.b32 %r9621, %r30048, %r9846; + xor.b32 %r9622, %r30049, %r9847; + xor.b32 %r9502, %r30026, %r9846; + xor.b32 %r9501, %r30027, %r9847; + xor.b32 %r9605, %r30024, %r9846; + xor.b32 %r9606, %r30025, %r9847; + xor.b32 %r9470, %r30022, %r9846; + xor.b32 %r9469, %r30023, %r9847; + // begin inline asm + shf.l.wrap.b32 %r9432, %r9397, %r9396, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9436, 
%r9396, %r9397, %r9309; + // end inline asm + xor.b32 %r9848, %r9432, %r9372; + xor.b32 %r9849, %r9436, %r9373; + xor.b32 %r9589, %r30046, %r9848; + xor.b32 %r9590, %r30047, %r9849; + xor.b32 %r9566, %r30020, %r9848; + xor.b32 %r9565, %r30021, %r9849; + xor.b32 %r9509, %r30018, %r9848; + xor.b32 %r9510, %r30019, %r9849; + xor.b32 %r9597, %r30016, %r9848; + xor.b32 %r9598, %r30017, %r9849; + xor.b32 %r9526, %r30014, %r9848; + xor.b32 %r9525, %r30015, %r9849; + // begin inline asm + shf.l.wrap.b32 %r9440, %r9349, %r9348, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9444, %r9348, %r9349, %r9309; + // end inline asm + xor.b32 %r9850, %r9440, %r9384; + xor.b32 %r9851, %r9444, %r9385; + xor.b32 %r9541, %r30044, %r9850; + xor.b32 %r9542, %r30045, %r9851; + xor.b32 %r9461, %r30012, %r9850; + xor.b32 %r9462, %r30013, %r9851; + xor.b32 %r9478, %r30010, %r9850; + xor.b32 %r9477, %r30011, %r9851; + xor.b32 %r9517, %r30008, %r9850; + xor.b32 %r9518, %r30009, %r9851; + xor.b32 %r9549, %r30006, %r9850; + xor.b32 %r9550, %r30007, %r9851; + mov.u32 %r9455, 44; + // begin inline asm + shf.l.wrap.b32 %r9448, %r9454, %r9453, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9452, %r9453, %r9454, %r9455; + // end inline asm + mov.u32 %r9463, 20; + // begin inline asm + shf.l.wrap.b32 %r9456, %r9462, %r9461, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9460, %r9461, %r9462, %r9463; + // end inline asm + mov.u32 %r9471, 61; + // begin inline asm + shf.l.wrap.b32 %r9464, %r9470, %r9469, %r9471; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9468, %r9469, %r9470, %r9471; + // end inline asm + mov.u32 %r9479, 39; + // begin inline asm + shf.l.wrap.b32 %r9472, %r9478, %r9477, %r9479; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9476, %r9477, %r9478, %r9479; + // end inline asm + mov.u32 %r9487, 18; + // begin inline asm + shf.l.wrap.b32 %r9480, %r9486, %r9485, %r9487; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9484, %r9485, %r9486, %r9487; + // end inline asm + mov.u32 %r9495, 62; + // begin inline asm + shf.l.wrap.b32 %r9488, %r9494, %r9493, %r9495; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9492, %r9493, %r9494, %r9495; + // end inline asm + mov.u32 %r9503, 43; + // begin inline asm + shf.l.wrap.b32 %r9496, %r9502, %r9501, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9500, %r9501, %r9502, %r9503; + // end inline asm + mov.u32 %r9511, 25; + // begin inline asm + shf.l.wrap.b32 %r9504, %r9510, %r9509, %r9511; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9508, %r9509, %r9510, %r9511; + // end inline asm + mov.u32 %r9519, 8; + // begin inline asm + shf.l.wrap.b32 %r9512, %r9518, %r9517, %r9519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9516, %r9517, %r9518, %r9519; + // end inline asm + mov.u32 %r9527, 56; + // begin inline asm + shf.l.wrap.b32 %r9520, %r9526, %r9525, %r9527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9524, %r9525, %r9526, %r9527; + // end inline asm + mov.u32 %r9535, 41; + // begin inline asm + shf.l.wrap.b32 %r9528, %r9534, %r9533, %r9535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9532, %r9533, %r9534, %r9535; + // end inline asm + mov.u32 %r9543, 27; + // begin inline asm + shf.l.wrap.b32 %r9536, %r9542, %r9541, %r9543; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9540, %r9541, %r9542, %r9543; + // end inline asm + mov.u32 %r9551, 14; + // begin inline asm + 
shf.l.wrap.b32 %r9544, %r9550, %r9549, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9548, %r9549, %r9550, %r9551; + // end inline asm + mov.u32 %r9559, 2; + // begin inline asm + shf.l.wrap.b32 %r9552, %r9558, %r9557, %r9559; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9556, %r9557, %r9558, %r9559; + // end inline asm + mov.u32 %r9567, 55; + // begin inline asm + shf.l.wrap.b32 %r9560, %r9566, %r9565, %r9567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9564, %r9565, %r9566, %r9567; + // end inline asm + mov.u32 %r9575, 45; + // begin inline asm + shf.l.wrap.b32 %r9568, %r9574, %r9573, %r9575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9572, %r9573, %r9574, %r9575; + // end inline asm + mov.u32 %r9583, 36; + // begin inline asm + shf.l.wrap.b32 %r9576, %r9582, %r9581, %r9583; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9580, %r9581, %r9582, %r9583; + // end inline asm + mov.u32 %r9591, 28; + // begin inline asm + shf.l.wrap.b32 %r9584, %r9590, %r9589, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9588, %r9589, %r9590, %r9591; + // end inline asm + mov.u32 %r9599, 21; + // begin inline asm + shf.l.wrap.b32 %r9592, %r9598, %r9597, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9596, %r9597, %r9598, %r9599; + // end inline asm + mov.u32 %r9607, 15; + // begin inline asm + shf.l.wrap.b32 %r9600, %r9606, %r9605, %r9607; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9604, %r9605, %r9606, %r9607; + // end inline asm + mov.u32 %r9615, 10; + // begin inline asm + shf.l.wrap.b32 %r9608, %r9614, %r9613, %r9615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9612, %r9613, %r9614, %r9615; + // end inline asm + mov.u32 %r9623, 6; + // begin inline asm + shf.l.wrap.b32 %r9616, %r9622, %r9621, %r9623; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9620, %r9621, %r9622, %r9623; + // end inline asm + mov.u32 %r9631, 3; + // begin inline asm + shf.l.wrap.b32 %r9624, %r9630, %r9629, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9628, %r9629, %r9630, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9632, %r9638, %r9637, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9636, %r9637, %r9638, %r9309; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9640, %r9675, %r9448, %r9496, 0xD2; + lop3.b32 %r9641, %r9678, %r9452, %r9500, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30054, %r9448, %r9496, %r9592, 0xD2; + lop3.b32 %r30055, %r9452, %r9500, %r9596, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30050, %r9496, %r9592, %r9544, 0xD2; + lop3.b32 %r30051, %r9500, %r9596, %r9548, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30046, %r9592, %r9544, %r9675, 0xD2; + lop3.b32 %r30047, %r9596, %r9548, %r9678, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30044, %r9544, %r9675, %r9448, 0xD2; + lop3.b32 %r30045, %r9548, %r9678, %r9452, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30040, %r9584, %r9456, %r9624, 0xD2; + lop3.b32 %r30041, %r9588, %r9460, %r9628, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30052, %r9456, %r9624, %r9568, 0xD2; + lop3.b32 %r30053, %r9460, %r9628, %r9572, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30048, %r9624, %r9568, %r9464, 0xD2; + lop3.b32 %r30049, %r9628, %r9572, 
%r9468, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30020, %r9568, %r9464, %r9584, 0xD2; + lop3.b32 %r30021, %r9572, %r9468, %r9588, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r30020, %r30021}; + // begin inline asm + // chi + lop3.b32 %r30012, %r9464, %r9584, %r9456, 0xD2; + lop3.b32 %r30013, %r9468, %r9588, %r9460, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r30012, %r30013}; + // begin inline asm + // chi + lop3.b32 %r30038, %r9632, %r9616, %r9504, 0xD2; + lop3.b32 %r30039, %r9636, %r9620, %r9508, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+104], {%r30038, %r30039}; + // begin inline asm + // chi + lop3.b32 %r30032, %r9616, %r9504, %r9512, 0xD2; + lop3.b32 %r30033, %r9620, %r9508, %r9516, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+112], {%r30032, %r30033}; + // begin inline asm + // chi + lop3.b32 %r30026, %r9504, %r9512, %r9480, 0xD2; + lop3.b32 %r30027, %r9508, %r9516, %r9484, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+120], {%r30026, %r30027}; + // begin inline asm + // chi + lop3.b32 %r30018, %r9512, %r9480, %r9632, 0xD2; + lop3.b32 %r30019, %r9516, %r9484, %r9636, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+128], {%r30018, %r30019}; + // begin inline asm + // chi + lop3.b32 %r30010, %r9480, %r9632, %r9616, 0xD2; + lop3.b32 %r30011, %r9484, %r9636, %r9620, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+136], {%r30010, %r30011}; + // begin inline asm + // chi + lop3.b32 %r30036, %r9536, %r9576, %r9608, 0xD2; + lop3.b32 %r30037, %r9540, %r9580, %r9612, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+144], {%r30036, %r30037}; + // begin inline asm + // chi + lop3.b32 %r30030, %r9576, %r9608, %r9600, 0xD2; + lop3.b32 %r30031, %r9580, %r9612, %r9604, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+152], {%r30030, %r30031}; + // begin inline asm + // chi + lop3.b32 %r30024, %r9608, %r9600, %r9520, 0xD2; + lop3.b32 %r30025, %r9612, %r9604, %r9524, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+160], {%r30024, %r30025}; + // begin inline asm + // chi + lop3.b32 %r30016, %r9600, %r9520, %r9536, 0xD2; + lop3.b32 %r30017, %r9604, %r9524, %r9540, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+168], {%r30016, %r30017}; + // begin inline asm + // chi + lop3.b32 %r30008, %r9520, %r9536, %r9576, 0xD2; + lop3.b32 %r30009, %r9524, %r9540, %r9580, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+176], {%r30008, %r30009}; + // begin inline asm + // chi + lop3.b32 %r30034, %r9488, %r9560, %r9472, 0xD2; + lop3.b32 %r30035, %r9492, %r9564, %r9476, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+184], {%r30034, %r30035}; + // begin inline asm + // chi + lop3.b32 %r30028, %r9560, %r9472, %r9528, 0xD2; + lop3.b32 %r30029, %r9564, %r9476, %r9532, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+192], {%r30028, %r30029}; + // begin inline asm + // chi + lop3.b32 %r30022, %r9472, %r9528, %r9552, 0xD2; + lop3.b32 %r30023, %r9476, %r9532, %r9556, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+200], {%r30022, %r30023}; + // begin inline asm + // chi + lop3.b32 %r30014, %r9528, %r9552, %r9488, 0xD2; + lop3.b32 %r30015, %r9532, %r9556, %r9492, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+208], {%r30014, %r30015}; + // begin inline asm + // chi + lop3.b32 %r30006, %r9552, %r9488, %r9560, 0xD2; + lop3.b32 %r30007, %r9556, %r9492, %r9564, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+216], {%r30006, %r30007}; + mul.wide.s32 %rd555, %r30056, 8; + add.s64 %rd554, %rd480, %rd555; + // begin inline 
asm + ld.global.nc.v2.u32 {%r9840,%r9841}, [%rd554]; + // end inline asm + xor.b32 %r30042, %r9640, %r9840; + xor.b32 %r30043, %r9641, %r9841; + add.s32 %r30056, %r30056, 1; + setp.lt.u32 %p18, %r30056, 23; + @%p18 bra $L__BB2_23; + + mov.u32 %r9951, 1; + st.local.v2.u32 [%rd53+32], {%r30054, %r30055}; + st.local.v2.u32 [%rd53+72], {%r30052, %r30053}; + st.local.v2.u32 [%rd53+40], {%r30050, %r30051}; + st.local.v2.u32 [%rd53+80], {%r30048, %r30049}; + st.local.v2.u32 [%rd53+48], {%r30046, %r30047}; + st.local.v2.u32 [%rd53+56], {%r30044, %r30045}; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + // begin inline asm + // xor5 + lop3.b32 %r9852, %r30042, %r30040, %r30038, 0x96; + lop3.b32 %r9852, %r9852, %r30036, %r30034, 0x96; + lop3.b32 %r9853, %r30043, %r30041, %r30039, 0x96; + lop3.b32 %r9853, %r9853, %r30037, %r30035, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9864, %r30054, %r30052, %r30032, 0x96; + lop3.b32 %r9864, %r9864, %r30030, %r30028, 0x96; + lop3.b32 %r9865, %r30055, %r30053, %r30033, 0x96; + lop3.b32 %r9865, %r9865, %r30031, %r30029, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9876, %r30050, %r30048, %r30026, 0x96; + lop3.b32 %r9876, %r9876, %r30024, %r30022, 0x96; + lop3.b32 %r9877, %r30051, %r30049, %r30027, 0x96; + lop3.b32 %r9877, %r9877, %r30025, %r30023, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9888, %r30046, %r30020, %r30018, 0x96; + lop3.b32 %r9888, %r9888, %r30016, %r30014, 0x96; + lop3.b32 %r9889, %r30047, %r30021, %r30019, 0x96; + lop3.b32 %r9889, %r9889, %r30017, %r30015, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9900, %r30044, %r30012, %r30010, 0x96; + lop3.b32 %r9900, %r9900, %r30008, %r30006, 0x96; + lop3.b32 %r9901, %r30045, %r30013, %r30011, 0x96; + lop3.b32 %r9901, %r9901, %r30009, %r30007, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9912, %r9865, %r9864, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9916, %r9864, %r9865, %r9951; + // end inline asm + xor.b32 %r10090, %r9912, %r9900; + xor.b32 %r10091, %r9916, %r9901; + xor.b32 %r10059, %r30042, %r10090; + xor.b32 %r10062, %r30043, %r10091; + xor.b32 %r10022, %r30039, %r10091; + xor.b32 %r10021, %r30038, %r10090; + st.local.v2.u32 [%rd53+104], {%r10021, %r10022}; + // begin inline asm + shf.l.wrap.b32 %r9920, %r9877, %r9876, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9924, %r9876, %r9877, %r9951; + // end inline asm + xor.b32 %r10092, %r9920, %r9852; + xor.b32 %r10093, %r9924, %r9853; + xor.b32 %r9958, %r30052, %r10092; + xor.b32 %r9957, %r30053, %r10093; + xor.b32 %r9997, %r30031, %r10093; + xor.b32 %r9998, %r30030, %r10092; + st.local.v2.u32 [%rd53+152], {%r9998, %r9997}; + // begin inline asm + shf.l.wrap.b32 %r9928, %r9889, %r9888, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9932, %r9888, %r9889, %r9951; + // end inline asm + xor.b32 %r10094, %r9928, %r9864; + xor.b32 %r10095, %r9932, %r9865; + xor.b32 %r9981, %r30027, %r10095; + xor.b32 %r9982, %r30026, %r10094; + st.local.v2.u32 [%rd53+120], {%r9982, %r9981}; + xor.b32 %r9973, %r30023, %r10095; + xor.b32 %r9974, %r30022, %r10094; + st.local.v2.u32 [%rd53+200], {%r9974, %r9973}; + // begin inline asm + shf.l.wrap.b32 %r9936, %r9901, %r9900, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9940, %r9900, %r9901, %r9951; + // end inline asm + xor.b32 %r10096, %r9936, %r9876; + xor.b32 %r10097, %r9940, %r9877; + xor.b32 %r10005, 
%r30046, %r10096; + xor.b32 %r10006, %r30047, %r10097; + xor.b32 %r10014, %r30017, %r10097; + xor.b32 %r10013, %r30016, %r10096; + st.local.v2.u32 [%rd53+168], {%r10013, %r10014}; + // begin inline asm + shf.l.wrap.b32 %r9944, %r9853, %r9852, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9948, %r9852, %r9853, %r9951; + // end inline asm + xor.b32 %r10098, %r9944, %r9888; + xor.b32 %r10099, %r9948, %r9889; + xor.b32 %r9965, %r30012, %r10098; + xor.b32 %r9966, %r30013, %r10099; + xor.b32 %r9990, %r30007, %r10099; + xor.b32 %r9989, %r30006, %r10098; + st.local.v2.u32 [%rd53+216], {%r9989, %r9990}; + // begin inline asm + shf.l.wrap.b32 %r9952, %r9958, %r9957, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9956, %r9957, %r9958, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9960, %r9966, %r9965, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9964, %r9965, %r9966, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9972, %r9973, %r9974, %r9471; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9968, %r9974, %r9973, %r9471; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r9968, %r9972}; + // begin inline asm + shf.l.wrap.b32 %r9976, %r9982, %r9981, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9980, %r9981, %r9982, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9984, %r9990, %r9989, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9988, %r9989, %r9990, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9996, %r9997, %r9998, %r9575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9992, %r9998, %r9997, %r9575; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r9992, %r9996}; + // begin inline asm + shf.l.wrap.b32 %r10000, %r10006, %r10005, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10004, %r10005, %r10006, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10008, %r10014, %r10013, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10012, %r10013, %r10014, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10016, %r10022, %r10021, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10020, %r10021, %r10022, %r9631; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10024, %r10059, %r9952, %r9976, 0xD2; + lop3.b32 %r10025, %r10062, %r9956, %r9980, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10032, %r9952, %r9976, %r10008, 0xD2; + lop3.b32 %r10033, %r9956, %r9980, %r10012, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+32], {%r10032, %r10033}; + // begin inline asm + // chi + lop3.b32 %r10040, %r9976, %r10008, %r9984, 0xD2; + lop3.b32 %r10041, %r9980, %r10012, %r9988, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+40], {%r10040, %r10041}; + // begin inline asm + // chi + lop3.b32 %r10048, %r10008, %r9984, %r10059, 0xD2; + lop3.b32 %r10049, %r10012, %r9988, %r10062, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+48], {%r10048, %r10049}; + // begin inline asm + // chi + lop3.b32 %r10056, %r9984, %r10059, %r9952, 0xD2; + lop3.b32 %r10057, %r9988, %r10062, %r9956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+56], {%r10056, %r10057}; + // begin inline asm + // chi + lop3.b32 %r10064, %r10000, %r9960, %r10016, 0xD2; + lop3.b32 %r10065, %r10004, %r9964, %r10020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+64], {%r10064, %r10065}; + // 
begin inline asm + // chi + lop3.b32 %r10072, %r9960, %r10016, %r9992, 0xD2; + lop3.b32 %r10073, %r9964, %r10020, %r9996, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+72], {%r10072, %r10073}; + // begin inline asm + // chi + lop3.b32 %r10080, %r10016, %r9992, %r9968, 0xD2; + lop3.b32 %r10081, %r10020, %r9996, %r9972, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+80], {%r10080, %r10081}; + // begin inline asm + ld.global.nc.v2.u32 {%r10088,%r10089}, [%rd481]; + // end inline asm + xor.b32 %r10100, %r10025, %r10089; + xor.b32 %r10101, %r10024, %r10088; + st.local.v2.u32 [%rd53+24], {%r10101, %r10100}; + mov.b64 %rd1326, {%r10032, %r10033}; + mov.b64 %rd1327, {%r10040, %r10041}; + mov.b64 %rd1330, {%r10064, %r10065}; + mov.b64 %rd1331, {%r10072, %r10073}; + mov.b64 %rd1332, {%r10080, %r10081}; + mov.b64 %rd1325, {%r10101, %r10100}; + mov.b64 %rd1328, {%r10048, %r10049}; + mov.b64 %rd1329, {%r10056, %r10057}; + st.global.u64 [%rd34], %rd1317; + st.global.u64 [%rd34+8], %rd1318; + st.global.u64 [%rd34+16], %rd1319; + st.global.u64 [%rd34+24], %rd1320; + st.global.u64 [%rd34+32], %rd1321; + st.global.u64 [%rd34+40], %rd1322; + st.global.u64 [%rd34+48], %rd1323; + st.global.u64 [%rd34+56], %rd1324; + st.global.v2.u32 [%rd34+64], {%r10101, %r10100}; + st.global.v2.u32 [%rd34+72], {%r10032, %r10033}; + st.global.v2.u32 [%rd34+80], {%r10040, %r10041}; + st.global.v2.u32 [%rd34+88], {%r10048, %r10049}; + st.global.v2.u32 [%rd34+96], {%r10056, %r10057}; + st.global.v2.u32 [%rd34+104], {%r10064, %r10065}; + st.global.v2.u32 [%rd34+112], {%r10072, %r10073}; + st.global.v2.u32 [%rd34+120], {%r10080, %r10081}; + +$L__BB2_36: + cvta.to.global.u64 %rd1265, %rd361; + shl.b32 %r1695, %r45, 1; + mul.wide.u32 %rd661, %r1695, -954391867; + shr.u64 %rd662, %rd661, 32; + cvt.u32.u64 %r13386, %rd662; + sub.s32 %r13387, %r1695, %r13386; + shr.u32 %r13388, %r13387, 1; + add.s32 %r13389, %r13388, %r13386; + shr.u32 %r13390, %r13389, 20; + mul.lo.s32 %r13391, %r13390, 1179641; + sub.s32 %r13392, %r1695, %r13391; + mul.wide.u32 %rd664, %r13392, 64; + add.s64 %rd126, %rd1265, %rd664; + or.b32 %r1696, %r1695, 1; + mul.wide.u32 %rd665, %r1696, -954391867; + shr.u64 %rd666, %rd665, 32; + cvt.u32.u64 %r13393, %rd666; + sub.s32 %r13394, %r1696, %r13393; + shr.u32 %r13395, %r13394, 1; + add.s32 %r13396, %r13395, %r13393; + shr.u32 %r13397, %r13396, 20; + mul.lo.s32 %r13398, %r13397, 1179641; + sub.s32 %r13399, %r1696, %r13398; + mul.wide.u32 %rd667, %r13399, 64; + add.s64 %rd127, %rd1265, %rd667; + @%p12 bra $L__BB2_50; + + cvta.to.global.u64 %rd668, %rd360; + mul.wide.u32 %rd669, %r45, 128; + add.s64 %rd128, %rd668, %rd669; + ld.global.u64 %rd1333, [%rd128]; + setp.eq.s64 %p25, %rd1333, 0; + @%p25 bra $L__BB2_39; + + ld.global.u64 %rd1348, [%rd128+120]; + ld.global.u64 %rd1347, [%rd128+112]; + ld.global.u64 %rd1346, [%rd128+104]; + ld.global.u64 %rd1345, [%rd128+96]; + ld.global.u64 %rd1344, [%rd128+88]; + ld.global.u64 %rd1343, [%rd128+80]; + ld.global.u64 %rd1342, [%rd128+72]; + ld.global.u64 %rd1341, [%rd128+64]; + ld.global.u64 %rd1340, [%rd128+56]; + ld.global.u64 %rd1339, [%rd128+48]; + ld.global.u64 %rd1338, [%rd128+40]; + ld.global.u64 %rd1337, [%rd128+32]; + ld.global.u64 %rd1336, [%rd128+24]; + ld.global.u64 %rd1335, [%rd128+16]; + ld.global.u64 %rd1334, [%rd128+8]; + bra.uni $L__BB2_61; + +$L__BB2_50: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd771, 1179641; + st.local.u64 [%rd2+8], %rd771; + st.local.u32 [%rd2+16], %r1695; + ld.global.u64 %rd772, [%rd126]; + ld.global.u64 %rd773, [%rd126+8]; + 
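// NOTE: the eight ld.global.u64 loads around this point fetch one 64-byte
+ // table item (selected via the same mod-1179641 magic-number indexing set up
+ // at $L__BB2_36); it is copied into state lanes +24..+80, and its first
+ // 32-bit word is XORed with the selector in %r1695 before hashing. + 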
ld.global.u64 %rd774, [%rd126+16]; + ld.global.u64 %rd775, [%rd126+24]; + ld.global.u64 %rd776, [%rd126+32]; + ld.global.u64 %rd777, [%rd126+40]; + ld.global.u64 %rd778, [%rd126+48]; + ld.global.u64 %rd779, [%rd126+56]; + st.local.u64 [%rd2+24], %rd772; + st.local.u64 [%rd2+32], %rd773; + st.local.u64 [%rd2+40], %rd774; + st.local.u64 [%rd2+48], %rd775; + st.local.u64 [%rd2+56], %rd776; + st.local.u64 [%rd2+64], %rd777; + st.local.u64 [%rd2+72], %rd778; + st.local.u64 [%rd2+80], %rd779; + cvt.u32.u64 %r16725, %rd772; + xor.b32 %r16726, %r1695, %r16725; + st.local.u32 [%rd2+24], %r16726; + mov.u32 %r30531, 0; + st.local.v2.u32 [%rd2+96], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+104], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+112], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+120], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+128], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+136], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+144], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+152], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+160], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+168], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+176], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+184], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+192], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+200], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+208], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+216], {%r30531, %r30531}; + mov.u32 %r30546, -2147483648; + mov.u32 %r16698, 1; + st.local.v2.u32 [%rd2+88], {%r16698, %r30546}; + ld.local.v2.u32 {%r30567, %r30568}, [%rd2+24]; + mov.b64 {%r30565, %r30566}, %rd777; + shr.u64 %rd780, %rd773, 32; + cvt.u32.u64 %r30579, %rd773; + cvt.u32.u64 %r30580, %rd780; + shr.u64 %rd781, %rd778, 32; + cvt.u32.u64 %r30577, %rd778; + cvt.u32.u64 %r30578, %rd781; + shr.u64 %rd782, %rd774, 32; + cvt.u32.u64 %r30575, %rd774; + cvt.u32.u64 %r30576, %rd782; + shr.u64 %rd783, %rd779, 32; + cvt.u32.u64 %r30573, %rd779; + cvt.u32.u64 %r30574, %rd783; + shr.u64 %rd784, %rd775, 32; + cvt.u32.u64 %r30571, %rd775; + cvt.u32.u64 %r30572, %rd784; + shr.u64 %rd785, %rd776, 32; + cvt.u32.u64 %r30569, %rd776; + cvt.u32.u64 %r30570, %rd785; + mov.u32 %r30532, %r30531; + mov.u32 %r30533, %r30531; + mov.u32 %r30534, %r30531; + mov.u32 %r30535, %r30531; + mov.u32 %r30536, %r30531; + mov.u32 %r30537, %r30531; + mov.u32 %r30538, %r30531; + mov.u32 %r30539, %r30531; + mov.u32 %r30540, %r30531; + mov.u32 %r30541, %r30531; + mov.u32 %r30542, %r30531; + mov.u32 %r30543, %r30531; + mov.u32 %r30544, %r30531; + mov.u32 %r30545, %r16698; + mov.u32 %r30547, %r30531; + mov.u32 %r30548, %r30531; + mov.u32 %r30549, %r30531; + mov.u32 %r30550, %r30531; + mov.u32 %r30551, %r30531; + mov.u32 %r30552, %r30531; + mov.u32 %r30553, %r30531; + mov.u32 %r30554, %r30531; + mov.u32 %r30555, %r30531; + mov.u32 %r30556, %r30531; + mov.u32 %r30557, %r30531; + mov.u32 %r30558, %r30531; + mov.u32 %r30559, %r30531; + mov.u32 %r30560, %r30531; + mov.u32 %r30561, %r30531; + mov.u32 %r30562, %r30531; + mov.u32 %r30563, %r30531; + mov.u32 %r30564, %r30531; + mov.u32 %r30581, %r30531; + +$L__BB2_51: + // begin inline asm + // xor5 + lop3.b32 %r16729, %r30567, %r30565, %r30563, 0x96; + lop3.b32 %r16729, %r16729, %r30561, %r30559, 0x96; + lop3.b32 %r16730, %r30568, %r30566, %r30564, 0x96; + lop3.b32 %r16730, %r16730, %r30562, %r30560, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16741, %r30579, %r30577, %r30557, 0x96; + lop3.b32 %r16741, %r16741, %r30555, %r30553, 0x96; + lop3.b32 %r16742, %r30580, %r30578, %r30558, 0x96; + 
lop3.b32 %r16742, %r16742, %r30556, %r30554, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16753, %r30575, %r30573, %r30551, 0x96; + lop3.b32 %r16753, %r16753, %r30549, %r30547, 0x96; + lop3.b32 %r16754, %r30576, %r30574, %r30552, 0x96; + lop3.b32 %r16754, %r16754, %r30550, %r30548, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16765, %r30571, %r30545, %r30543, 0x96; + lop3.b32 %r16765, %r16765, %r30541, %r30539, 0x96; + lop3.b32 %r16766, %r30572, %r30546, %r30544, 0x96; + lop3.b32 %r16766, %r16766, %r30542, %r30540, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16777, %r30569, %r30537, %r30535, 0x96; + lop3.b32 %r16777, %r16777, %r30533, %r30531, 0x96; + lop3.b32 %r16778, %r30570, %r30538, %r30536, 0x96; + lop3.b32 %r16778, %r16778, %r30534, %r30532, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16789, %r16742, %r16741, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16793, %r16741, %r16742, %r16698; + // end inline asm + xor.b32 %r17223, %r16789, %r16777; + xor.b32 %r17224, %r16793, %r16778; + xor.b32 %r17056, %r30567, %r17223; + xor.b32 %r17059, %r30568, %r17224; + xor.b32 %r16963, %r30565, %r17223; + xor.b32 %r16962, %r30566, %r17224; + xor.b32 %r17010, %r30563, %r17223; + xor.b32 %r17011, %r30564, %r17224; + xor.b32 %r16915, %r30561, %r17223; + xor.b32 %r16914, %r30562, %r17224; + xor.b32 %r16866, %r30559, %r17223; + xor.b32 %r16867, %r30560, %r17224; + // begin inline asm + shf.l.wrap.b32 %r16797, %r16754, %r16753, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16801, %r16753, %r16754, %r16698; + // end inline asm + xor.b32 %r17225, %r16797, %r16729; + xor.b32 %r17226, %r16801, %r16730; + xor.b32 %r17018, %r30579, %r17225; + xor.b32 %r17019, %r30580, %r17226; + xor.b32 %r16835, %r30577, %r17225; + xor.b32 %r16834, %r30578, %r17226; + xor.b32 %r16994, %r30557, %r17225; + xor.b32 %r16995, %r30558, %r17226; + xor.b32 %r16955, %r30555, %r17225; + xor.b32 %r16954, %r30556, %r17226; + xor.b32 %r16938, %r30553, %r17225; + xor.b32 %r16939, %r30554, %r17226; + // begin inline asm + shf.l.wrap.b32 %r16805, %r16766, %r16765, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16809, %r16765, %r16766, %r16698; + // end inline asm + xor.b32 %r17227, %r16805, %r16741; + xor.b32 %r17228, %r16809, %r16742; + xor.b32 %r16875, %r30575, %r17227; + xor.b32 %r16874, %r30576, %r17228; + xor.b32 %r17002, %r30573, %r17227; + xor.b32 %r17003, %r30574, %r17228; + xor.b32 %r16883, %r30551, %r17227; + xor.b32 %r16882, %r30552, %r17228; + xor.b32 %r16986, %r30549, %r17227; + xor.b32 %r16987, %r30550, %r17228; + xor.b32 %r16851, %r30547, %r17227; + xor.b32 %r16850, %r30548, %r17228; + // begin inline asm + shf.l.wrap.b32 %r16813, %r16778, %r16777, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16817, %r16777, %r16778, %r16698; + // end inline asm + xor.b32 %r17229, %r16813, %r16753; + xor.b32 %r17230, %r16817, %r16754; + xor.b32 %r16970, %r30571, %r17229; + xor.b32 %r16971, %r30572, %r17230; + xor.b32 %r16947, %r30545, %r17229; + xor.b32 %r16946, %r30546, %r17230; + xor.b32 %r16890, %r30543, %r17229; + xor.b32 %r16891, %r30544, %r17230; + xor.b32 %r16978, %r30541, %r17229; + xor.b32 %r16979, %r30542, %r17230; + xor.b32 %r16907, %r30539, %r17229; + xor.b32 %r16906, %r30540, %r17230; + // begin inline asm + shf.l.wrap.b32 %r16821, %r16730, %r16729, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16825, %r16729, 
%r16730, %r16698; + // end inline asm + xor.b32 %r17231, %r16821, %r16765; + xor.b32 %r17232, %r16825, %r16766; + xor.b32 %r16922, %r30569, %r17231; + xor.b32 %r16923, %r30570, %r17232; + xor.b32 %r16842, %r30537, %r17231; + xor.b32 %r16843, %r30538, %r17232; + xor.b32 %r16859, %r30535, %r17231; + xor.b32 %r16858, %r30536, %r17232; + xor.b32 %r16898, %r30533, %r17231; + xor.b32 %r16899, %r30534, %r17232; + xor.b32 %r16930, %r30531, %r17231; + xor.b32 %r16931, %r30532, %r17232; + mov.u32 %r16836, 44; + // begin inline asm + shf.l.wrap.b32 %r16829, %r16835, %r16834, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16833, %r16834, %r16835, %r16836; + // end inline asm + mov.u32 %r16844, 20; + // begin inline asm + shf.l.wrap.b32 %r16837, %r16843, %r16842, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16841, %r16842, %r16843, %r16844; + // end inline asm + mov.u32 %r16852, 61; + // begin inline asm + shf.l.wrap.b32 %r16845, %r16851, %r16850, %r16852; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16849, %r16850, %r16851, %r16852; + // end inline asm + mov.u32 %r16860, 39; + // begin inline asm + shf.l.wrap.b32 %r16853, %r16859, %r16858, %r16860; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16857, %r16858, %r16859, %r16860; + // end inline asm + mov.u32 %r16868, 18; + // begin inline asm + shf.l.wrap.b32 %r16861, %r16867, %r16866, %r16868; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16865, %r16866, %r16867, %r16868; + // end inline asm + mov.u32 %r16876, 62; + // begin inline asm + shf.l.wrap.b32 %r16869, %r16875, %r16874, %r16876; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16873, %r16874, %r16875, %r16876; + // end inline asm + mov.u32 %r16884, 43; + // begin inline asm + shf.l.wrap.b32 %r16877, %r16883, %r16882, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16881, %r16882, %r16883, %r16884; + // end inline asm + mov.u32 %r16892, 25; + // begin inline asm + shf.l.wrap.b32 %r16885, %r16891, %r16890, %r16892; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16889, %r16890, %r16891, %r16892; + // end inline asm + mov.u32 %r16900, 8; + // begin inline asm + shf.l.wrap.b32 %r16893, %r16899, %r16898, %r16900; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16897, %r16898, %r16899, %r16900; + // end inline asm + mov.u32 %r16908, 56; + // begin inline asm + shf.l.wrap.b32 %r16901, %r16907, %r16906, %r16908; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16905, %r16906, %r16907, %r16908; + // end inline asm + mov.u32 %r16916, 41; + // begin inline asm + shf.l.wrap.b32 %r16909, %r16915, %r16914, %r16916; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16913, %r16914, %r16915, %r16916; + // end inline asm + mov.u32 %r16924, 27; + // begin inline asm + shf.l.wrap.b32 %r16917, %r16923, %r16922, %r16924; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16921, %r16922, %r16923, %r16924; + // end inline asm + mov.u32 %r16932, 14; + // begin inline asm + shf.l.wrap.b32 %r16925, %r16931, %r16930, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16929, %r16930, %r16931, %r16932; + // end inline asm + mov.u32 %r16940, 2; + // begin inline asm + shf.l.wrap.b32 %r16933, %r16939, %r16938, %r16940; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16937, %r16938, %r16939, %r16940; + // end inline asm + mov.u32 %r16948, 55; + // begin inline asm + shf.l.wrap.b32 %r16941, %r16947, %r16946, 
%r16948; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16945, %r16946, %r16947, %r16948; + // end inline asm + mov.u32 %r16956, 45; + // begin inline asm + shf.l.wrap.b32 %r16949, %r16955, %r16954, %r16956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16953, %r16954, %r16955, %r16956; + // end inline asm + mov.u32 %r16964, 36; + // begin inline asm + shf.l.wrap.b32 %r16957, %r16963, %r16962, %r16964; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16961, %r16962, %r16963, %r16964; + // end inline asm + mov.u32 %r16972, 28; + // begin inline asm + shf.l.wrap.b32 %r16965, %r16971, %r16970, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16969, %r16970, %r16971, %r16972; + // end inline asm + mov.u32 %r16980, 21; + // begin inline asm + shf.l.wrap.b32 %r16973, %r16979, %r16978, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16977, %r16978, %r16979, %r16980; + // end inline asm + mov.u32 %r16988, 15; + // begin inline asm + shf.l.wrap.b32 %r16981, %r16987, %r16986, %r16988; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16985, %r16986, %r16987, %r16988; + // end inline asm + mov.u32 %r16996, 10; + // begin inline asm + shf.l.wrap.b32 %r16989, %r16995, %r16994, %r16996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16993, %r16994, %r16995, %r16996; + // end inline asm + mov.u32 %r17004, 6; + // begin inline asm + shf.l.wrap.b32 %r16997, %r17003, %r17002, %r17004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17001, %r17002, %r17003, %r17004; + // end inline asm + mov.u32 %r17012, 3; + // begin inline asm + shf.l.wrap.b32 %r17005, %r17011, %r17010, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17009, %r17010, %r17011, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17013, %r17019, %r17018, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17017, %r17018, %r17019, %r16698; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17021, %r17056, %r16829, %r16877, 0xD2; + lop3.b32 %r17022, %r17059, %r16833, %r16881, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30579, %r16829, %r16877, %r16973, 0xD2; + lop3.b32 %r30580, %r16833, %r16881, %r16977, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30575, %r16877, %r16973, %r16925, 0xD2; + lop3.b32 %r30576, %r16881, %r16977, %r16929, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30571, %r16973, %r16925, %r17056, 0xD2; + lop3.b32 %r30572, %r16977, %r16929, %r17059, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30569, %r16925, %r17056, %r16829, 0xD2; + lop3.b32 %r30570, %r16929, %r17059, %r16833, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30565, %r16965, %r16837, %r17005, 0xD2; + lop3.b32 %r30566, %r16969, %r16841, %r17009, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30577, %r16837, %r17005, %r16949, 0xD2; + lop3.b32 %r30578, %r16841, %r17009, %r16953, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30573, %r17005, %r16949, %r16845, 0xD2; + lop3.b32 %r30574, %r17009, %r16953, %r16849, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30545, %r16949, %r16845, %r16965, 0xD2; + lop3.b32 %r30546, %r16953, %r16849, %r16969, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30545, %r30546}; + // begin inline asm + // chi + lop3.b32 %r30537, %r16845, %r16965, 
%r16837, 0xD2; + lop3.b32 %r30538, %r16849, %r16969, %r16841, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30537, %r30538}; + // begin inline asm + // chi + lop3.b32 %r30563, %r17013, %r16997, %r16885, 0xD2; + lop3.b32 %r30564, %r17017, %r17001, %r16889, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30563, %r30564}; + // begin inline asm + // chi + lop3.b32 %r30557, %r16997, %r16885, %r16893, 0xD2; + lop3.b32 %r30558, %r17001, %r16889, %r16897, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30557, %r30558}; + // begin inline asm + // chi + lop3.b32 %r30551, %r16885, %r16893, %r16861, 0xD2; + lop3.b32 %r30552, %r16889, %r16897, %r16865, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30551, %r30552}; + // begin inline asm + // chi + lop3.b32 %r30543, %r16893, %r16861, %r17013, 0xD2; + lop3.b32 %r30544, %r16897, %r16865, %r17017, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30543, %r30544}; + // begin inline asm + // chi + lop3.b32 %r30535, %r16861, %r17013, %r16997, 0xD2; + lop3.b32 %r30536, %r16865, %r17017, %r17001, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30535, %r30536}; + // begin inline asm + // chi + lop3.b32 %r30561, %r16917, %r16957, %r16989, 0xD2; + lop3.b32 %r30562, %r16921, %r16961, %r16993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30561, %r30562}; + // begin inline asm + // chi + lop3.b32 %r30555, %r16957, %r16989, %r16981, 0xD2; + lop3.b32 %r30556, %r16961, %r16993, %r16985, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30555, %r30556}; + // begin inline asm + // chi + lop3.b32 %r30549, %r16989, %r16981, %r16901, 0xD2; + lop3.b32 %r30550, %r16993, %r16985, %r16905, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30549, %r30550}; + // begin inline asm + // chi + lop3.b32 %r30541, %r16981, %r16901, %r16917, 0xD2; + lop3.b32 %r30542, %r16985, %r16905, %r16921, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30541, %r30542}; + // begin inline asm + // chi + lop3.b32 %r30533, %r16901, %r16917, %r16957, 0xD2; + lop3.b32 %r30534, %r16905, %r16921, %r16961, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30533, %r30534}; + // begin inline asm + // chi + lop3.b32 %r30559, %r16869, %r16941, %r16853, 0xD2; + lop3.b32 %r30560, %r16873, %r16945, %r16857, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30559, %r30560}; + // begin inline asm + // chi + lop3.b32 %r30553, %r16941, %r16853, %r16909, 0xD2; + lop3.b32 %r30554, %r16945, %r16857, %r16913, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30553, %r30554}; + // begin inline asm + // chi + lop3.b32 %r30547, %r16853, %r16909, %r16933, 0xD2; + lop3.b32 %r30548, %r16857, %r16913, %r16937, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30547, %r30548}; + // begin inline asm + // chi + lop3.b32 %r30539, %r16909, %r16933, %r16869, 0xD2; + lop3.b32 %r30540, %r16913, %r16937, %r16873, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30539, %r30540}; + // begin inline asm + // chi + lop3.b32 %r30531, %r16933, %r16869, %r16941, 0xD2; + lop3.b32 %r30532, %r16937, %r16873, %r16945, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30531, %r30532}; + mul.wide.s32 %rd787, %r30581, 8; + mov.u64 %rd788, keccak_round_constants; + cvta.const.u64 %rd789, %rd788; + add.s64 %rd786, %rd789, %rd787; + // begin inline asm + ld.global.nc.v2.u32 {%r17221,%r17222}, [%rd786]; + // end inline asm + xor.b32 %r30567, %r17021, %r17221; + xor.b32 %r30568, %r17022, %r17222; + 
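+ // iota: lane (0,0) is xored with keccak_round_constants[%r30581]; the $L__BB2_51 loop runs rounds 0-22 of keccak-f[1600] (theta = lop3 0x96 parity xors, rho/pi = shf.l.wrap funnel-shift 64-bit rotates, chi = lop3 0xD2, i.e. a ^ (~b & c)), with round 23 unrolled after the loop exit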
add.s32 %r30581, %r30581, 1; + setp.lt.u32 %p31, %r30581, 23; + @%p31 bra $L__BB2_51; + + add.u64 %rd176, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30579, %r30580}; + st.local.v2.u32 [%rd2+72], {%r30577, %r30578}; + st.local.v2.u32 [%rd2+40], {%r30575, %r30576}; + st.local.v2.u32 [%rd2+80], {%r30573, %r30574}; + st.local.v2.u32 [%rd2+48], {%r30571, %r30572}; + st.local.v2.u32 [%rd2+56], {%r30569, %r30570}; + st.local.v2.u32 [%rd2+24], {%r30567, %r30568}; + // begin inline asm + // xor5 + lop3.b32 %r17233, %r30567, %r30565, %r30563, 0x96; + lop3.b32 %r17233, %r17233, %r30561, %r30559, 0x96; + lop3.b32 %r17234, %r30568, %r30566, %r30564, 0x96; + lop3.b32 %r17234, %r17234, %r30562, %r30560, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17245, %r30579, %r30577, %r30557, 0x96; + lop3.b32 %r17245, %r17245, %r30555, %r30553, 0x96; + lop3.b32 %r17246, %r30580, %r30578, %r30558, 0x96; + lop3.b32 %r17246, %r17246, %r30556, %r30554, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17257, %r30575, %r30573, %r30551, 0x96; + lop3.b32 %r17257, %r17257, %r30549, %r30547, 0x96; + lop3.b32 %r17258, %r30576, %r30574, %r30552, 0x96; + lop3.b32 %r17258, %r17258, %r30550, %r30548, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17269, %r30571, %r30545, %r30543, 0x96; + lop3.b32 %r17269, %r17269, %r30541, %r30539, 0x96; + lop3.b32 %r17270, %r30572, %r30546, %r30544, 0x96; + lop3.b32 %r17270, %r17270, %r30542, %r30540, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17281, %r30569, %r30537, %r30535, 0x96; + lop3.b32 %r17281, %r17281, %r30533, %r30531, 0x96; + lop3.b32 %r17282, %r30570, %r30538, %r30536, 0x96; + lop3.b32 %r17282, %r17282, %r30534, %r30532, 0x96; + // end inline asm + mov.u32 %r17485, 1; + // begin inline asm + shf.l.wrap.b32 %r17293, %r17246, %r17245, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17297, %r17245, %r17246, %r17485; + // end inline asm + xor.b32 %r17512, %r17293, %r17281; + xor.b32 %r17513, %r17297, %r17282; + xor.b32 %r17440, %r30567, %r17512; + xor.b32 %r17443, %r30568, %r17513; + xor.b32 %r17403, %r30564, %r17513; + xor.b32 %r17402, %r30563, %r17512; + st.local.v2.u32 [%rd2+104], {%r17402, %r17403}; + // begin inline asm + shf.l.wrap.b32 %r17301, %r17258, %r17257, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17305, %r17257, %r17258, %r17485; + // end inline asm + xor.b32 %r17514, %r17301, %r17233; + xor.b32 %r17515, %r17305, %r17234; + xor.b32 %r17339, %r30577, %r17514; + xor.b32 %r17338, %r30578, %r17515; + xor.b32 %r17378, %r30556, %r17515; + xor.b32 %r17379, %r30555, %r17514; + st.local.v2.u32 [%rd2+152], {%r17379, %r17378}; + // begin inline asm + shf.l.wrap.b32 %r17309, %r17270, %r17269, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17313, %r17269, %r17270, %r17485; + // end inline asm + xor.b32 %r17516, %r17309, %r17245; + xor.b32 %r17517, %r17313, %r17246; + xor.b32 %r17362, %r30552, %r17517; + xor.b32 %r17363, %r30551, %r17516; + st.local.v2.u32 [%rd2+120], {%r17363, %r17362}; + xor.b32 %r17354, %r30548, %r17517; + xor.b32 %r17355, %r30547, %r17516; + st.local.v2.u32 [%rd2+200], {%r17355, %r17354}; + // begin inline asm + shf.l.wrap.b32 %r17317, %r17282, %r17281, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17321, %r17281, %r17282, %r17485; + // end inline asm + xor.b32 %r17518, %r17317, %r17257; + xor.b32 %r17519, %r17321, %r17258; + xor.b32 %r17386, %r30571, %r17518; + xor.b32 
%r17387, %r30572, %r17519; + xor.b32 %r17395, %r30542, %r17519; + xor.b32 %r17394, %r30541, %r17518; + st.local.v2.u32 [%rd2+168], {%r17394, %r17395}; + // begin inline asm + shf.l.wrap.b32 %r17325, %r17234, %r17233, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17329, %r17233, %r17234, %r17485; + // end inline asm + xor.b32 %r17520, %r17325, %r17269; + xor.b32 %r17521, %r17329, %r17270; + xor.b32 %r17346, %r30537, %r17520; + xor.b32 %r17347, %r30538, %r17521; + xor.b32 %r17371, %r30532, %r17521; + xor.b32 %r17370, %r30531, %r17520; + st.local.v2.u32 [%rd2+216], {%r17370, %r17371}; + // begin inline asm + shf.l.wrap.b32 %r17333, %r17339, %r17338, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17337, %r17338, %r17339, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17341, %r17347, %r17346, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17345, %r17346, %r17347, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17353, %r17354, %r17355, %r16852; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17349, %r17355, %r17354, %r16852; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r17349, %r17353}; + // begin inline asm + shf.l.wrap.b32 %r17357, %r17363, %r17362, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17361, %r17362, %r17363, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17365, %r17371, %r17370, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17369, %r17370, %r17371, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17377, %r17378, %r17379, %r16956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17373, %r17379, %r17378, %r16956; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r17373, %r17377}; + // begin inline asm + shf.l.wrap.b32 %r17381, %r17387, %r17386, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17385, %r17386, %r17387, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17389, %r17395, %r17394, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17393, %r17394, %r17395, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17397, %r17403, %r17402, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17401, %r17402, %r17403, %r17012; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17405, %r17440, %r17333, %r17357, 0xD2; + lop3.b32 %r17406, %r17443, %r17337, %r17361, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30714, %r17333, %r17357, %r17389, 0xD2; + lop3.b32 %r30715, %r17337, %r17361, %r17393, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30714, %r30715}; + // begin inline asm + // chi + lop3.b32 %r30710, %r17357, %r17389, %r17365, 0xD2; + lop3.b32 %r30711, %r17361, %r17393, %r17369, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30710, %r30711}; + // begin inline asm + // chi + lop3.b32 %r30706, %r17389, %r17365, %r17440, 0xD2; + lop3.b32 %r30707, %r17393, %r17369, %r17443, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30706, %r30707}; + // begin inline asm + // chi + lop3.b32 %r30704, %r17365, %r17440, %r17333, 0xD2; + lop3.b32 %r30705, %r17369, %r17443, %r17337, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30704, %r30705}; + // begin inline asm + // chi + lop3.b32 %r30700, %r17381, %r17341, %r17397, 0xD2; + lop3.b32 %r30701, %r17385, %r17345, %r17401, 0xD2; + // end inline 
asm + st.local.v2.u32 [%rd2+64], {%r30700, %r30701}; + // begin inline asm + // chi + lop3.b32 %r30712, %r17341, %r17397, %r17373, 0xD2; + lop3.b32 %r30713, %r17345, %r17401, %r17377, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30712, %r30713}; + // begin inline asm + // chi + lop3.b32 %r30708, %r17397, %r17373, %r17349, 0xD2; + lop3.b32 %r30709, %r17401, %r17377, %r17353, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30708, %r30709}; + add.s64 %rd790, %rd789, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r17469,%r17470}, [%rd790]; + // end inline asm + xor.b32 %r30702, %r17405, %r17469; + xor.b32 %r30703, %r17406, %r17470; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + st.local.u64 [%rd176], %rd361; + mov.u64 %rd794, 1179641; + st.local.u64 [%rd176+8], %rd794; + st.local.u32 [%rd176+16], %r1696; + ld.global.u64 %rd795, [%rd127]; + ld.global.u64 %rd796, [%rd127+8]; + ld.global.u64 %rd797, [%rd127+16]; + ld.global.u64 %rd798, [%rd127+24]; + ld.global.u64 %rd799, [%rd127+32]; + ld.global.u64 %rd800, [%rd127+40]; + ld.global.u64 %rd801, [%rd127+48]; + ld.global.u64 %rd802, [%rd127+56]; + st.local.u64 [%rd176+32], %rd796; + st.local.u64 [%rd176+40], %rd797; + st.local.u64 [%rd176+48], %rd798; + st.local.u64 [%rd176+56], %rd799; + st.local.u64 [%rd176+64], %rd800; + st.local.u64 [%rd176+72], %rd801; + st.local.u64 [%rd176+80], %rd802; + cvt.u32.u64 %r17522, %rd795; + xor.b32 %r17523, %r1696, %r17522; + st.local.u64 [%rd176+24], %rd795; + st.local.u32 [%rd176+24], %r17523; + mov.u32 %r30582, 0; + st.local.v2.u32 [%rd176+96], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+104], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+112], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+120], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+128], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+136], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+144], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+152], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+160], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+168], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+176], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+184], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+192], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+200], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+208], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+216], {%r30582, %r30582}; + mov.u32 %r30597, -2147483648; + st.local.v2.u32 [%rd176+88], {%r17485, %r30597}; + ld.local.v2.u32 {%r30618, %r30619}, [%rd176+24]; + mov.b64 {%r30616, %r30617}, %rd800; + shr.u64 %rd803, %rd796, 32; + cvt.u32.u64 %r30630, %rd796; + cvt.u32.u64 %r30631, %rd803; + shr.u64 %rd804, %rd801, 32; + cvt.u32.u64 %r30628, %rd801; + cvt.u32.u64 %r30629, %rd804; + shr.u64 %rd805, %rd797, 32; + cvt.u32.u64 %r30626, %rd797; + cvt.u32.u64 %r30627, %rd805; + shr.u64 %rd806, %rd802, 32; + cvt.u32.u64 %r30624, %rd802; + cvt.u32.u64 %r30625, %rd806; + shr.u64 %rd807, %rd798, 32; + cvt.u32.u64 %r30622, %rd798; + cvt.u32.u64 %r30623, %rd807; + shr.u64 %rd808, %rd799, 32; + cvt.u32.u64 %r30620, %rd799; + cvt.u32.u64 %r30621, %rd808; + mov.u32 %r30583, %r30582; + mov.u32 %r30584, %r30582; + mov.u32 %r30585, %r30582; + mov.u32 %r30586, %r30582; + mov.u32 %r30587, %r30582; + mov.u32 %r30588, %r30582; + mov.u32 %r30589, %r30582; + mov.u32 %r30590, %r30582; + mov.u32 %r30591, %r30582; + mov.u32 %r30592, %r30582; + mov.u32 %r30593, %r30582; + mov.u32 %r30594, %r30582; + mov.u32 %r30595, %r30582; + mov.u32 %r30596, %r17485; + mov.u32 %r30598, %r30582; + mov.u32 %r30599, %r30582; + 
mov.u32 %r30600, %r30582; + mov.u32 %r30601, %r30582; + mov.u32 %r30602, %r30582; + mov.u32 %r30603, %r30582; + mov.u32 %r30604, %r30582; + mov.u32 %r30605, %r30582; + mov.u32 %r30606, %r30582; + mov.u32 %r30607, %r30582; + mov.u32 %r30608, %r30582; + mov.u32 %r30609, %r30582; + mov.u32 %r30610, %r30582; + mov.u32 %r30611, %r30582; + mov.u32 %r30612, %r30582; + mov.u32 %r30613, %r30582; + mov.u32 %r30614, %r30582; + mov.u32 %r30615, %r30582; + mov.u32 %r30632, %r30582; + +$L__BB2_53: + // begin inline asm + // xor5 + lop3.b32 %r17526, %r30618, %r30616, %r30614, 0x96; + lop3.b32 %r17526, %r17526, %r30612, %r30610, 0x96; + lop3.b32 %r17527, %r30619, %r30617, %r30615, 0x96; + lop3.b32 %r17527, %r17527, %r30613, %r30611, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17538, %r30630, %r30628, %r30608, 0x96; + lop3.b32 %r17538, %r17538, %r30606, %r30604, 0x96; + lop3.b32 %r17539, %r30631, %r30629, %r30609, 0x96; + lop3.b32 %r17539, %r17539, %r30607, %r30605, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17550, %r30626, %r30624, %r30602, 0x96; + lop3.b32 %r17550, %r17550, %r30600, %r30598, 0x96; + lop3.b32 %r17551, %r30627, %r30625, %r30603, 0x96; + lop3.b32 %r17551, %r17551, %r30601, %r30599, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17562, %r30622, %r30596, %r30594, 0x96; + lop3.b32 %r17562, %r17562, %r30592, %r30590, 0x96; + lop3.b32 %r17563, %r30623, %r30597, %r30595, 0x96; + lop3.b32 %r17563, %r17563, %r30593, %r30591, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17574, %r30620, %r30588, %r30586, 0x96; + lop3.b32 %r17574, %r17574, %r30584, %r30582, 0x96; + lop3.b32 %r17575, %r30621, %r30589, %r30587, 0x96; + lop3.b32 %r17575, %r17575, %r30585, %r30583, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17586, %r17539, %r17538, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17590, %r17538, %r17539, %r17485; + // end inline asm + xor.b32 %r18020, %r17586, %r17574; + xor.b32 %r18021, %r17590, %r17575; + xor.b32 %r17853, %r30618, %r18020; + xor.b32 %r17856, %r30619, %r18021; + xor.b32 %r17760, %r30616, %r18020; + xor.b32 %r17759, %r30617, %r18021; + xor.b32 %r17807, %r30614, %r18020; + xor.b32 %r17808, %r30615, %r18021; + xor.b32 %r17712, %r30612, %r18020; + xor.b32 %r17711, %r30613, %r18021; + xor.b32 %r17663, %r30610, %r18020; + xor.b32 %r17664, %r30611, %r18021; + // begin inline asm + shf.l.wrap.b32 %r17594, %r17551, %r17550, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17598, %r17550, %r17551, %r17485; + // end inline asm + xor.b32 %r18022, %r17594, %r17526; + xor.b32 %r18023, %r17598, %r17527; + xor.b32 %r17815, %r30630, %r18022; + xor.b32 %r17816, %r30631, %r18023; + xor.b32 %r17632, %r30628, %r18022; + xor.b32 %r17631, %r30629, %r18023; + xor.b32 %r17791, %r30608, %r18022; + xor.b32 %r17792, %r30609, %r18023; + xor.b32 %r17752, %r30606, %r18022; + xor.b32 %r17751, %r30607, %r18023; + xor.b32 %r17735, %r30604, %r18022; + xor.b32 %r17736, %r30605, %r18023; + // begin inline asm + shf.l.wrap.b32 %r17602, %r17563, %r17562, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17606, %r17562, %r17563, %r17485; + // end inline asm + xor.b32 %r18024, %r17602, %r17538; + xor.b32 %r18025, %r17606, %r17539; + xor.b32 %r17672, %r30626, %r18024; + xor.b32 %r17671, %r30627, %r18025; + xor.b32 %r17799, %r30624, %r18024; + xor.b32 %r17800, %r30625, %r18025; + xor.b32 %r17680, %r30602, %r18024; + xor.b32 %r17679, %r30603, 
%r18025; + xor.b32 %r17783, %r30600, %r18024; + xor.b32 %r17784, %r30601, %r18025; + xor.b32 %r17648, %r30598, %r18024; + xor.b32 %r17647, %r30599, %r18025; + // begin inline asm + shf.l.wrap.b32 %r17610, %r17575, %r17574, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17614, %r17574, %r17575, %r17485; + // end inline asm + xor.b32 %r18026, %r17610, %r17550; + xor.b32 %r18027, %r17614, %r17551; + xor.b32 %r17767, %r30622, %r18026; + xor.b32 %r17768, %r30623, %r18027; + xor.b32 %r17744, %r30596, %r18026; + xor.b32 %r17743, %r30597, %r18027; + xor.b32 %r17687, %r30594, %r18026; + xor.b32 %r17688, %r30595, %r18027; + xor.b32 %r17775, %r30592, %r18026; + xor.b32 %r17776, %r30593, %r18027; + xor.b32 %r17704, %r30590, %r18026; + xor.b32 %r17703, %r30591, %r18027; + // begin inline asm + shf.l.wrap.b32 %r17618, %r17527, %r17526, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17622, %r17526, %r17527, %r17485; + // end inline asm + xor.b32 %r18028, %r17618, %r17562; + xor.b32 %r18029, %r17622, %r17563; + xor.b32 %r17719, %r30620, %r18028; + xor.b32 %r17720, %r30621, %r18029; + xor.b32 %r17639, %r30588, %r18028; + xor.b32 %r17640, %r30589, %r18029; + xor.b32 %r17656, %r30586, %r18028; + xor.b32 %r17655, %r30587, %r18029; + xor.b32 %r17695, %r30584, %r18028; + xor.b32 %r17696, %r30585, %r18029; + xor.b32 %r17727, %r30582, %r18028; + xor.b32 %r17728, %r30583, %r18029; + mov.u32 %r17633, 44; + // begin inline asm + shf.l.wrap.b32 %r17626, %r17632, %r17631, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17630, %r17631, %r17632, %r17633; + // end inline asm + mov.u32 %r17641, 20; + // begin inline asm + shf.l.wrap.b32 %r17634, %r17640, %r17639, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17638, %r17639, %r17640, %r17641; + // end inline asm + mov.u32 %r17649, 61; + // begin inline asm + shf.l.wrap.b32 %r17642, %r17648, %r17647, %r17649; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17646, %r17647, %r17648, %r17649; + // end inline asm + mov.u32 %r17657, 39; + // begin inline asm + shf.l.wrap.b32 %r17650, %r17656, %r17655, %r17657; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17654, %r17655, %r17656, %r17657; + // end inline asm + mov.u32 %r17665, 18; + // begin inline asm + shf.l.wrap.b32 %r17658, %r17664, %r17663, %r17665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17662, %r17663, %r17664, %r17665; + // end inline asm + mov.u32 %r17673, 62; + // begin inline asm + shf.l.wrap.b32 %r17666, %r17672, %r17671, %r17673; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17670, %r17671, %r17672, %r17673; + // end inline asm + mov.u32 %r17681, 43; + // begin inline asm + shf.l.wrap.b32 %r17674, %r17680, %r17679, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17678, %r17679, %r17680, %r17681; + // end inline asm + mov.u32 %r17689, 25; + // begin inline asm + shf.l.wrap.b32 %r17682, %r17688, %r17687, %r17689; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17686, %r17687, %r17688, %r17689; + // end inline asm + mov.u32 %r17697, 8; + // begin inline asm + shf.l.wrap.b32 %r17690, %r17696, %r17695, %r17697; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17694, %r17695, %r17696, %r17697; + // end inline asm + mov.u32 %r17705, 56; + // begin inline asm + shf.l.wrap.b32 %r17698, %r17704, %r17703, %r17705; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17702, %r17703, %r17704, %r17705; + // end inline asm 
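+ // rho rotation offsets for the remaining lanes (41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3) follow; together with the rotates above these cover all 24 non-zero keccak rho constants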
+ mov.u32 %r17713, 41; + // begin inline asm + shf.l.wrap.b32 %r17706, %r17712, %r17711, %r17713; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17710, %r17711, %r17712, %r17713; + // end inline asm + mov.u32 %r17721, 27; + // begin inline asm + shf.l.wrap.b32 %r17714, %r17720, %r17719, %r17721; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17718, %r17719, %r17720, %r17721; + // end inline asm + mov.u32 %r17729, 14; + // begin inline asm + shf.l.wrap.b32 %r17722, %r17728, %r17727, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17726, %r17727, %r17728, %r17729; + // end inline asm + mov.u32 %r17737, 2; + // begin inline asm + shf.l.wrap.b32 %r17730, %r17736, %r17735, %r17737; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17734, %r17735, %r17736, %r17737; + // end inline asm + mov.u32 %r17745, 55; + // begin inline asm + shf.l.wrap.b32 %r17738, %r17744, %r17743, %r17745; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17742, %r17743, %r17744, %r17745; + // end inline asm + mov.u32 %r17753, 45; + // begin inline asm + shf.l.wrap.b32 %r17746, %r17752, %r17751, %r17753; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17750, %r17751, %r17752, %r17753; + // end inline asm + mov.u32 %r17761, 36; + // begin inline asm + shf.l.wrap.b32 %r17754, %r17760, %r17759, %r17761; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17758, %r17759, %r17760, %r17761; + // end inline asm + mov.u32 %r17769, 28; + // begin inline asm + shf.l.wrap.b32 %r17762, %r17768, %r17767, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17766, %r17767, %r17768, %r17769; + // end inline asm + mov.u32 %r17777, 21; + // begin inline asm + shf.l.wrap.b32 %r17770, %r17776, %r17775, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17774, %r17775, %r17776, %r17777; + // end inline asm + mov.u32 %r17785, 15; + // begin inline asm + shf.l.wrap.b32 %r17778, %r17784, %r17783, %r17785; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17782, %r17783, %r17784, %r17785; + // end inline asm + mov.u32 %r17793, 10; + // begin inline asm + shf.l.wrap.b32 %r17786, %r17792, %r17791, %r17793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17790, %r17791, %r17792, %r17793; + // end inline asm + mov.u32 %r17801, 6; + // begin inline asm + shf.l.wrap.b32 %r17794, %r17800, %r17799, %r17801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17798, %r17799, %r17800, %r17801; + // end inline asm + mov.u32 %r17809, 3; + // begin inline asm + shf.l.wrap.b32 %r17802, %r17808, %r17807, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17806, %r17807, %r17808, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17810, %r17816, %r17815, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17814, %r17815, %r17816, %r17485; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17818, %r17853, %r17626, %r17674, 0xD2; + lop3.b32 %r17819, %r17856, %r17630, %r17678, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30630, %r17626, %r17674, %r17770, 0xD2; + lop3.b32 %r30631, %r17630, %r17678, %r17774, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30626, %r17674, %r17770, %r17722, 0xD2; + lop3.b32 %r30627, %r17678, %r17774, %r17726, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30622, %r17770, %r17722, %r17853, 0xD2; + lop3.b32 %r30623, %r17774, %r17726, 
%r17856, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30620, %r17722, %r17853, %r17626, 0xD2; + lop3.b32 %r30621, %r17726, %r17856, %r17630, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30616, %r17762, %r17634, %r17802, 0xD2; + lop3.b32 %r30617, %r17766, %r17638, %r17806, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30628, %r17634, %r17802, %r17746, 0xD2; + lop3.b32 %r30629, %r17638, %r17806, %r17750, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30624, %r17802, %r17746, %r17642, 0xD2; + lop3.b32 %r30625, %r17806, %r17750, %r17646, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30596, %r17746, %r17642, %r17762, 0xD2; + lop3.b32 %r30597, %r17750, %r17646, %r17766, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r30596, %r30597}; + // begin inline asm + // chi + lop3.b32 %r30588, %r17642, %r17762, %r17634, 0xD2; + lop3.b32 %r30589, %r17646, %r17766, %r17638, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r30588, %r30589}; + // begin inline asm + // chi + lop3.b32 %r30614, %r17810, %r17794, %r17682, 0xD2; + lop3.b32 %r30615, %r17814, %r17798, %r17686, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+104], {%r30614, %r30615}; + // begin inline asm + // chi + lop3.b32 %r30608, %r17794, %r17682, %r17690, 0xD2; + lop3.b32 %r30609, %r17798, %r17686, %r17694, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+112], {%r30608, %r30609}; + // begin inline asm + // chi + lop3.b32 %r30602, %r17682, %r17690, %r17658, 0xD2; + lop3.b32 %r30603, %r17686, %r17694, %r17662, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+120], {%r30602, %r30603}; + // begin inline asm + // chi + lop3.b32 %r30594, %r17690, %r17658, %r17810, 0xD2; + lop3.b32 %r30595, %r17694, %r17662, %r17814, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+128], {%r30594, %r30595}; + // begin inline asm + // chi + lop3.b32 %r30586, %r17658, %r17810, %r17794, 0xD2; + lop3.b32 %r30587, %r17662, %r17814, %r17798, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+136], {%r30586, %r30587}; + // begin inline asm + // chi + lop3.b32 %r30612, %r17714, %r17754, %r17786, 0xD2; + lop3.b32 %r30613, %r17718, %r17758, %r17790, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+144], {%r30612, %r30613}; + // begin inline asm + // chi + lop3.b32 %r30606, %r17754, %r17786, %r17778, 0xD2; + lop3.b32 %r30607, %r17758, %r17790, %r17782, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+152], {%r30606, %r30607}; + // begin inline asm + // chi + lop3.b32 %r30600, %r17786, %r17778, %r17698, 0xD2; + lop3.b32 %r30601, %r17790, %r17782, %r17702, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+160], {%r30600, %r30601}; + // begin inline asm + // chi + lop3.b32 %r30592, %r17778, %r17698, %r17714, 0xD2; + lop3.b32 %r30593, %r17782, %r17702, %r17718, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+168], {%r30592, %r30593}; + // begin inline asm + // chi + lop3.b32 %r30584, %r17698, %r17714, %r17754, 0xD2; + lop3.b32 %r30585, %r17702, %r17718, %r17758, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+176], {%r30584, %r30585}; + // begin inline asm + // chi + lop3.b32 %r30610, %r17666, %r17738, %r17650, 0xD2; + lop3.b32 %r30611, %r17670, %r17742, %r17654, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+184], {%r30610, %r30611}; + // begin inline asm + // chi + lop3.b32 %r30604, %r17738, %r17650, %r17706, 0xD2; + lop3.b32 %r30605, %r17742, %r17654, %r17710, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd176+192], {%r30604, %r30605}; + // begin inline asm + // chi + lop3.b32 %r30598, %r17650, %r17706, %r17730, 0xD2; + lop3.b32 %r30599, %r17654, %r17710, %r17734, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+200], {%r30598, %r30599}; + // begin inline asm + // chi + lop3.b32 %r30590, %r17706, %r17730, %r17666, 0xD2; + lop3.b32 %r30591, %r17710, %r17734, %r17670, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+208], {%r30590, %r30591}; + // begin inline asm + // chi + lop3.b32 %r30582, %r17730, %r17666, %r17738, 0xD2; + lop3.b32 %r30583, %r17734, %r17670, %r17742, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+216], {%r30582, %r30583}; + mul.wide.s32 %rd810, %r30632, 8; + add.s64 %rd809, %rd789, %rd810; + // begin inline asm + ld.global.nc.v2.u32 {%r18018,%r18019}, [%rd809]; + // end inline asm + xor.b32 %r30618, %r17818, %r18018; + xor.b32 %r30619, %r17819, %r18019; + add.s32 %r30632, %r30632, 1; + setp.lt.u32 %p32, %r30632, 23; + @%p32 bra $L__BB2_53; + + mov.u32 %r30665, 0; + mov.u32 %r18129, 1; + st.local.v2.u32 [%rd176+32], {%r30630, %r30631}; + st.local.v2.u32 [%rd176+72], {%r30628, %r30629}; + st.local.v2.u32 [%rd176+40], {%r30626, %r30627}; + st.local.v2.u32 [%rd176+80], {%r30624, %r30625}; + st.local.v2.u32 [%rd176+48], {%r30622, %r30623}; + st.local.v2.u32 [%rd176+56], {%r30620, %r30621}; + st.local.v2.u32 [%rd176+24], {%r30618, %r30619}; + // begin inline asm + // xor5 + lop3.b32 %r18030, %r30618, %r30616, %r30614, 0x96; + lop3.b32 %r18030, %r18030, %r30612, %r30610, 0x96; + lop3.b32 %r18031, %r30619, %r30617, %r30615, 0x96; + lop3.b32 %r18031, %r18031, %r30613, %r30611, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18042, %r30630, %r30628, %r30608, 0x96; + lop3.b32 %r18042, %r18042, %r30606, %r30604, 0x96; + lop3.b32 %r18043, %r30631, %r30629, %r30609, 0x96; + lop3.b32 %r18043, %r18043, %r30607, %r30605, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18054, %r30626, %r30624, %r30602, 0x96; + lop3.b32 %r18054, %r18054, %r30600, %r30598, 0x96; + lop3.b32 %r18055, %r30627, %r30625, %r30603, 0x96; + lop3.b32 %r18055, %r18055, %r30601, %r30599, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18066, %r30622, %r30596, %r30594, 0x96; + lop3.b32 %r18066, %r18066, %r30592, %r30590, 0x96; + lop3.b32 %r18067, %r30623, %r30597, %r30595, 0x96; + lop3.b32 %r18067, %r18067, %r30593, %r30591, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18078, %r30620, %r30588, %r30586, 0x96; + lop3.b32 %r18078, %r18078, %r30584, %r30582, 0x96; + lop3.b32 %r18079, %r30621, %r30589, %r30587, 0x96; + lop3.b32 %r18079, %r18079, %r30585, %r30583, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18090, %r18043, %r18042, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18094, %r18042, %r18043, %r18129; + // end inline asm + xor.b32 %r18269, %r18090, %r18078; + xor.b32 %r18270, %r18094, %r18079; + xor.b32 %r18237, %r30618, %r18269; + xor.b32 %r18240, %r30619, %r18270; + xor.b32 %r18200, %r30615, %r18270; + xor.b32 %r18199, %r30614, %r18269; + st.local.v2.u32 [%rd176+104], {%r18199, %r18200}; + // begin inline asm + shf.l.wrap.b32 %r18098, %r18055, %r18054, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18102, %r18054, %r18055, %r18129; + // end inline asm + xor.b32 %r18271, %r18098, %r18030; + xor.b32 %r18272, %r18102, %r18031; + xor.b32 %r18136, %r30628, %r18271; + xor.b32 %r18135, %r30629, %r18272; + xor.b32 %r18175, %r30607, %r18272; + 
xor.b32 %r18176, %r30606, %r18271; + st.local.v2.u32 [%rd176+152], {%r18176, %r18175}; + // begin inline asm + shf.l.wrap.b32 %r18106, %r18067, %r18066, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18110, %r18066, %r18067, %r18129; + // end inline asm + xor.b32 %r18273, %r18106, %r18042; + xor.b32 %r18274, %r18110, %r18043; + xor.b32 %r18159, %r30603, %r18274; + xor.b32 %r18160, %r30602, %r18273; + st.local.v2.u32 [%rd176+120], {%r18160, %r18159}; + xor.b32 %r18151, %r30599, %r18274; + xor.b32 %r18152, %r30598, %r18273; + st.local.v2.u32 [%rd176+200], {%r18152, %r18151}; + // begin inline asm + shf.l.wrap.b32 %r18114, %r18079, %r18078, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18118, %r18078, %r18079, %r18129; + // end inline asm + xor.b32 %r18275, %r18114, %r18054; + xor.b32 %r18276, %r18118, %r18055; + xor.b32 %r18183, %r30622, %r18275; + xor.b32 %r18184, %r30623, %r18276; + xor.b32 %r18192, %r30593, %r18276; + xor.b32 %r18191, %r30592, %r18275; + st.local.v2.u32 [%rd176+168], {%r18191, %r18192}; + // begin inline asm + shf.l.wrap.b32 %r18122, %r18031, %r18030, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18126, %r18030, %r18031, %r18129; + // end inline asm + xor.b32 %r18277, %r18122, %r18066; + xor.b32 %r18278, %r18126, %r18067; + xor.b32 %r18143, %r30588, %r18277; + xor.b32 %r18144, %r30589, %r18278; + xor.b32 %r18168, %r30583, %r18278; + xor.b32 %r18167, %r30582, %r18277; + st.local.v2.u32 [%rd176+216], {%r18167, %r18168}; + // begin inline asm + shf.l.wrap.b32 %r18130, %r18136, %r18135, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18134, %r18135, %r18136, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18138, %r18144, %r18143, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18142, %r18143, %r18144, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18150, %r18151, %r18152, %r17649; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18146, %r18152, %r18151, %r17649; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r18146, %r18150}; + // begin inline asm + shf.l.wrap.b32 %r18154, %r18160, %r18159, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18158, %r18159, %r18160, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18162, %r18168, %r18167, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18166, %r18167, %r18168, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18174, %r18175, %r18176, %r17753; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18170, %r18176, %r18175, %r17753; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r18170, %r18174}; + // begin inline asm + shf.l.wrap.b32 %r18178, %r18184, %r18183, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18182, %r18183, %r18184, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18186, %r18192, %r18191, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18190, %r18191, %r18192, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18194, %r18200, %r18199, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18198, %r18199, %r18200, %r17809; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18202, %r18237, %r18130, %r18154, 0xD2; + lop3.b32 %r18203, %r18240, %r18134, %r18158, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30765, 
%r18130, %r18154, %r18186, 0xD2; + lop3.b32 %r30766, %r18134, %r18158, %r18190, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+32], {%r30765, %r30766}; + // begin inline asm + // chi + lop3.b32 %r30761, %r18154, %r18186, %r18162, 0xD2; + lop3.b32 %r30762, %r18158, %r18190, %r18166, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+40], {%r30761, %r30762}; + // begin inline asm + // chi + lop3.b32 %r30757, %r18186, %r18162, %r18237, 0xD2; + lop3.b32 %r30758, %r18190, %r18166, %r18240, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+48], {%r30757, %r30758}; + // begin inline asm + // chi + lop3.b32 %r30755, %r18162, %r18237, %r18130, 0xD2; + lop3.b32 %r30756, %r18166, %r18240, %r18134, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+56], {%r30755, %r30756}; + // begin inline asm + // chi + lop3.b32 %r30751, %r18178, %r18138, %r18194, 0xD2; + lop3.b32 %r30752, %r18182, %r18142, %r18198, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+64], {%r30751, %r30752}; + // begin inline asm + // chi + lop3.b32 %r30763, %r18138, %r18194, %r18170, 0xD2; + lop3.b32 %r30764, %r18142, %r18198, %r18174, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+72], {%r30763, %r30764}; + // begin inline asm + // chi + lop3.b32 %r30759, %r18194, %r18170, %r18146, 0xD2; + lop3.b32 %r30760, %r18198, %r18174, %r18150, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+80], {%r30759, %r30760}; + // begin inline asm + ld.global.nc.v2.u32 {%r18266,%r18267}, [%rd790]; + // end inline asm + xor.b32 %r30753, %r18202, %r18266; + xor.b32 %r30754, %r18203, %r18267; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + add.s64 %rd178, %rd2, 24; + add.s64 %rd179, %rd176, 24; + +$L__BB2_55: + cvta.to.global.u64 %rd1269, %rd361; + shl.b32 %r18279, %r30665, 2; + cvt.u64.u32 %rd820, %r18279; + and.b64 %rd821, %rd820, 60; + add.s64 %rd822, %rd178, %rd821; + xor.b32 %r18280, %r1695, %r30665; + mul.lo.s32 %r18281, %r18280, 16777619; + ld.local.u32 %r18282, [%rd822]; + xor.b32 %r18283, %r18281, %r18282; + mul.wide.u32 %rd823, %r18283, -954391867; + shr.u64 %rd824, %rd823, 32; + cvt.u32.u64 %r18284, %rd824; + sub.s32 %r18285, %r18283, %r18284; + shr.u32 %r18286, %r18285, 1; + add.s32 %r18287, %r18286, %r18284; + shr.u32 %r18288, %r18287, 20; + mul.lo.s32 %r18289, %r18288, 1179641; + sub.s32 %r18290, %r18283, %r18289; + mul.wide.u32 %rd825, %r18290, 64; + add.s64 %rd826, %rd1269, %rd825; + mul.lo.s32 %r18291, %r30702, 16777619; + ld.global.u32 %r18292, [%rd826]; + xor.b32 %r30702, %r18291, %r18292; + mul.lo.s32 %r18293, %r30703, 16777619; + ld.global.u32 %r18294, [%rd826+4]; + xor.b32 %r30703, %r18293, %r18294; + mul.lo.s32 %r18295, %r30714, 16777619; + ld.global.u32 %r18296, [%rd826+8]; + mul.lo.s32 %r18297, %r30715, 16777619; + ld.global.u32 %r18298, [%rd826+12]; + xor.b32 %r18299, %r18297, %r18298; + xor.b32 %r30714, %r18295, %r18296; + mov.b64 %rd827, {%r30714, %r18299}; + mul.lo.s32 %r18300, %r30710, 16777619; + ld.global.u32 %r18301, [%rd826+16]; + mul.lo.s32 %r18302, %r30711, 16777619; + ld.global.u32 %r18303, [%rd826+20]; + xor.b32 %r18304, %r18302, %r18303; + xor.b32 %r30710, %r18300, %r18301; + mov.b64 %rd828, {%r30710, %r18304}; + mul.lo.s32 %r18305, %r30706, 16777619; + ld.global.u32 %r18306, [%rd826+24]; + mul.lo.s32 %r18307, %r30707, 16777619; + ld.global.u32 %r18308, [%rd826+28]; + xor.b32 %r18309, %r18307, %r18308; + xor.b32 %r30706, %r18305, %r18306; + mov.b64 %rd829, {%r30706, %r18309}; + mul.lo.s32 %r18310, %r30704, 16777619; + ld.global.u32 %r18311, [%rd826+32]; + mul.lo.s32 %r18312, %r30705, 16777619; 
+ ld.global.u32 %r18313, [%rd826+36]; + xor.b32 %r18314, %r18312, %r18313; + xor.b32 %r30704, %r18310, %r18311; + mov.b64 %rd830, {%r30704, %r18314}; + mul.lo.s32 %r18315, %r30700, 16777619; + ld.global.u32 %r18316, [%rd826+40]; + xor.b32 %r30700, %r18315, %r18316; + mul.lo.s32 %r18317, %r30701, 16777619; + ld.global.u32 %r18318, [%rd826+44]; + xor.b32 %r30701, %r18317, %r18318; + mul.lo.s32 %r18319, %r30712, 16777619; + ld.global.u32 %r18320, [%rd826+48]; + mul.lo.s32 %r18321, %r30713, 16777619; + ld.global.u32 %r18322, [%rd826+52]; + xor.b32 %r18323, %r18321, %r18322; + xor.b32 %r30712, %r18319, %r18320; + mov.b64 %rd831, {%r30712, %r18323}; + mul.lo.s32 %r18324, %r30708, 16777619; + ld.global.u32 %r18325, [%rd826+56]; + mul.lo.s32 %r18326, %r30709, 16777619; + ld.global.u32 %r18327, [%rd826+60]; + xor.b32 %r18328, %r18326, %r18327; + xor.b32 %r30708, %r18324, %r18325; + mov.b64 %rd832, {%r30708, %r18328}; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + st.local.v2.u32 [%rd2+32], {%r30714, %r18299}; + st.local.v2.u32 [%rd2+40], {%r30710, %r18304}; + st.local.v2.u32 [%rd2+48], {%r30706, %r18309}; + st.local.v2.u32 [%rd2+56], {%r30704, %r18314}; + st.local.v2.u32 [%rd2+64], {%r30700, %r30701}; + st.local.v2.u32 [%rd2+72], {%r30712, %r18323}; + st.local.v2.u32 [%rd2+80], {%r30708, %r18328}; + add.s64 %rd833, %rd179, %rd821; + xor.b32 %r18329, %r1696, %r30665; + mul.lo.s32 %r18330, %r18329, 16777619; + ld.local.u32 %r18331, [%rd833]; + xor.b32 %r18332, %r18330, %r18331; + mul.wide.u32 %rd834, %r18332, -954391867; + shr.u64 %rd835, %rd834, 32; + cvt.u32.u64 %r18333, %rd835; + sub.s32 %r18334, %r18332, %r18333; + shr.u32 %r18335, %r18334, 1; + add.s32 %r18336, %r18335, %r18333; + shr.u32 %r18337, %r18336, 20; + mul.lo.s32 %r18338, %r18337, 1179641; + sub.s32 %r18339, %r18332, %r18338; + mul.wide.u32 %rd836, %r18339, 64; + add.s64 %rd837, %rd1269, %rd836; + mul.lo.s32 %r18340, %r30753, 16777619; + ld.global.u32 %r18341, [%rd837]; + xor.b32 %r30753, %r18340, %r18341; + mul.lo.s32 %r18342, %r30754, 16777619; + ld.global.u32 %r18343, [%rd837+4]; + xor.b32 %r30754, %r18342, %r18343; + mul.lo.s32 %r18344, %r30765, 16777619; + ld.global.u32 %r18345, [%rd837+8]; + mul.lo.s32 %r18346, %r30766, 16777619; + ld.global.u32 %r18347, [%rd837+12]; + xor.b32 %r18348, %r18346, %r18347; + xor.b32 %r30765, %r18344, %r18345; + mov.b64 %rd838, {%r30765, %r18348}; + mul.lo.s32 %r18349, %r30761, 16777619; + ld.global.u32 %r18350, [%rd837+16]; + mul.lo.s32 %r18351, %r30762, 16777619; + ld.global.u32 %r18352, [%rd837+20]; + xor.b32 %r18353, %r18351, %r18352; + xor.b32 %r30761, %r18349, %r18350; + mov.b64 %rd839, {%r30761, %r18353}; + mul.lo.s32 %r18354, %r30757, 16777619; + ld.global.u32 %r18355, [%rd837+24]; + mul.lo.s32 %r18356, %r30758, 16777619; + ld.global.u32 %r18357, [%rd837+28]; + xor.b32 %r18358, %r18356, %r18357; + xor.b32 %r30757, %r18354, %r18355; + mov.b64 %rd840, {%r30757, %r18358}; + mul.lo.s32 %r18359, %r30755, 16777619; + ld.global.u32 %r18360, [%rd837+32]; + mul.lo.s32 %r18361, %r30756, 16777619; + ld.global.u32 %r18362, [%rd837+36]; + xor.b32 %r18363, %r18361, %r18362; + xor.b32 %r30755, %r18359, %r18360; + mov.b64 %rd841, {%r30755, %r18363}; + mul.lo.s32 %r18364, %r30751, 16777619; + ld.global.u32 %r18365, [%rd837+40]; + xor.b32 %r30751, %r18364, %r18365; + mul.lo.s32 %r18366, %r30752, 16777619; + ld.global.u32 %r18367, [%rd837+44]; + xor.b32 %r30752, %r18366, %r18367; + mul.lo.s32 %r18368, %r30763, 16777619; + ld.global.u32 %r18369, [%rd837+48]; + mul.lo.s32 %r18370, %r30764, 16777619; + 
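+ // $L__BB2_55: 512 fnv-1-style mixing passes; each 32-bit lane half is multiplied by the FNV prime 16777619 and xored with a word of a table entry, whose index is reduced mod 1179641 via the multiply/shift sequence with magic constant -954391867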
ld.global.u32 %r18371, [%rd837+52]; + xor.b32 %r18372, %r18370, %r18371; + xor.b32 %r30763, %r18368, %r18369; + mov.b64 %rd842, {%r30763, %r18372}; + mul.lo.s32 %r18373, %r30759, 16777619; + ld.global.u32 %r18374, [%rd837+56]; + mul.lo.s32 %r18375, %r30760, 16777619; + ld.global.u32 %r18376, [%rd837+60]; + xor.b32 %r18377, %r18375, %r18376; + xor.b32 %r30759, %r18373, %r18374; + mov.b64 %rd843, {%r30759, %r18377}; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + st.local.v2.u32 [%rd176+32], {%r30765, %r18348}; + st.local.v2.u32 [%rd176+40], {%r30761, %r18353}; + st.local.v2.u32 [%rd176+48], {%r30757, %r18358}; + st.local.v2.u32 [%rd176+56], {%r30755, %r18363}; + st.local.v2.u32 [%rd176+64], {%r30751, %r30752}; + st.local.v2.u32 [%rd176+72], {%r30763, %r18372}; + st.local.v2.u32 [%rd176+80], {%r30759, %r18377}; + add.s32 %r30665, %r30665, 1; + setp.lt.u32 %p33, %r30665, 512; + shr.u64 %rd844, %rd827, 32; + cvt.u32.u64 %r30715, %rd844; + shr.u64 %rd845, %rd828, 32; + cvt.u32.u64 %r30711, %rd845; + shr.u64 %rd846, %rd829, 32; + cvt.u32.u64 %r30707, %rd846; + shr.u64 %rd847, %rd830, 32; + cvt.u32.u64 %r30705, %rd847; + shr.u64 %rd848, %rd831, 32; + cvt.u32.u64 %r30713, %rd848; + shr.u64 %rd849, %rd832, 32; + cvt.u32.u64 %r30709, %rd849; + shr.u64 %rd850, %rd838, 32; + cvt.u32.u64 %r30766, %rd850; + shr.u64 %rd851, %rd839, 32; + cvt.u32.u64 %r30762, %rd851; + shr.u64 %rd852, %rd840, 32; + cvt.u32.u64 %r30758, %rd852; + shr.u64 %rd853, %rd841, 32; + cvt.u32.u64 %r30756, %rd853; + shr.u64 %rd854, %rd842, 32; + cvt.u32.u64 %r30764, %rd854; + shr.u64 %rd855, %rd843, 32; + cvt.u32.u64 %r30760, %rd855; + @%p33 bra $L__BB2_55; + + mov.u32 %r30666, 0; + st.local.v2.u32 [%rd2+96], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+104], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+112], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+120], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+128], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+136], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+144], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+152], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+160], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+168], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+176], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+184], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+192], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+200], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+208], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+216], {%r30666, %r30666}; + mov.u32 %r30681, -2147483648; + mov.u32 %r18392, 1; + st.local.v2.u32 [%rd2+88], {%r18392, %r30681}; + mov.u32 %r30667, %r30666; + mov.u32 %r30668, %r30666; + mov.u32 %r30669, %r30666; + mov.u32 %r30670, %r30666; + mov.u32 %r30671, %r30666; + mov.u32 %r30672, %r30666; + mov.u32 %r30673, %r30666; + mov.u32 %r30674, %r30666; + mov.u32 %r30675, %r30666; + mov.u32 %r30676, %r30666; + mov.u32 %r30677, %r30666; + mov.u32 %r30678, %r30666; + mov.u32 %r30679, %r30666; + mov.u32 %r30680, %r18392; + mov.u32 %r30682, %r30666; + mov.u32 %r30683, %r30666; + mov.u32 %r30684, %r30666; + mov.u32 %r30685, %r30666; + mov.u32 %r30686, %r30666; + mov.u32 %r30687, %r30666; + mov.u32 %r30688, %r30666; + mov.u32 %r30689, %r30666; + mov.u32 %r30690, %r30666; + mov.u32 %r30691, %r30666; + mov.u32 %r30692, %r30666; + mov.u32 %r30693, %r30666; + mov.u32 %r30694, %r30666; + mov.u32 %r30695, %r30666; + mov.u32 %r30696, %r30666; + mov.u32 %r30697, %r30666; + mov.u32 %r30698, %r30666; + mov.u32 %r30699, %r30666; + mov.u32 %r30716, %r30666; + +$L__BB2_57: + // begin inline asm + // xor5 + lop3.b32 
%r18419, %r30702, %r30700, %r30698, 0x96; + lop3.b32 %r18419, %r18419, %r30696, %r30694, 0x96; + lop3.b32 %r18420, %r30703, %r30701, %r30699, 0x96; + lop3.b32 %r18420, %r18420, %r30697, %r30695, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18431, %r30714, %r30712, %r30692, 0x96; + lop3.b32 %r18431, %r18431, %r30690, %r30688, 0x96; + lop3.b32 %r18432, %r30715, %r30713, %r30693, 0x96; + lop3.b32 %r18432, %r18432, %r30691, %r30689, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18443, %r30710, %r30708, %r30686, 0x96; + lop3.b32 %r18443, %r18443, %r30684, %r30682, 0x96; + lop3.b32 %r18444, %r30711, %r30709, %r30687, 0x96; + lop3.b32 %r18444, %r18444, %r30685, %r30683, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18455, %r30706, %r30680, %r30678, 0x96; + lop3.b32 %r18455, %r18455, %r30676, %r30674, 0x96; + lop3.b32 %r18456, %r30707, %r30681, %r30679, 0x96; + lop3.b32 %r18456, %r18456, %r30677, %r30675, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18467, %r30704, %r30672, %r30670, 0x96; + lop3.b32 %r18467, %r18467, %r30668, %r30666, 0x96; + lop3.b32 %r18468, %r30705, %r30673, %r30671, 0x96; + lop3.b32 %r18468, %r18468, %r30669, %r30667, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18479, %r18432, %r18431, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18483, %r18431, %r18432, %r18392; + // end inline asm + xor.b32 %r18913, %r18479, %r18467; + xor.b32 %r18914, %r18483, %r18468; + xor.b32 %r18746, %r30702, %r18913; + xor.b32 %r18749, %r30703, %r18914; + xor.b32 %r18653, %r30700, %r18913; + xor.b32 %r18652, %r30701, %r18914; + xor.b32 %r18700, %r30698, %r18913; + xor.b32 %r18701, %r30699, %r18914; + xor.b32 %r18605, %r30696, %r18913; + xor.b32 %r18604, %r30697, %r18914; + xor.b32 %r18556, %r30694, %r18913; + xor.b32 %r18557, %r30695, %r18914; + // begin inline asm + shf.l.wrap.b32 %r18487, %r18444, %r18443, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18491, %r18443, %r18444, %r18392; + // end inline asm + xor.b32 %r18915, %r18487, %r18419; + xor.b32 %r18916, %r18491, %r18420; + xor.b32 %r18708, %r30714, %r18915; + xor.b32 %r18709, %r30715, %r18916; + xor.b32 %r18525, %r30712, %r18915; + xor.b32 %r18524, %r30713, %r18916; + xor.b32 %r18684, %r30692, %r18915; + xor.b32 %r18685, %r30693, %r18916; + xor.b32 %r18645, %r30690, %r18915; + xor.b32 %r18644, %r30691, %r18916; + xor.b32 %r18628, %r30688, %r18915; + xor.b32 %r18629, %r30689, %r18916; + // begin inline asm + shf.l.wrap.b32 %r18495, %r18456, %r18455, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18499, %r18455, %r18456, %r18392; + // end inline asm + xor.b32 %r18917, %r18495, %r18431; + xor.b32 %r18918, %r18499, %r18432; + xor.b32 %r18565, %r30710, %r18917; + xor.b32 %r18564, %r30711, %r18918; + xor.b32 %r18692, %r30708, %r18917; + xor.b32 %r18693, %r30709, %r18918; + xor.b32 %r18573, %r30686, %r18917; + xor.b32 %r18572, %r30687, %r18918; + xor.b32 %r18676, %r30684, %r18917; + xor.b32 %r18677, %r30685, %r18918; + xor.b32 %r18541, %r30682, %r18917; + xor.b32 %r18540, %r30683, %r18918; + // begin inline asm + shf.l.wrap.b32 %r18503, %r18468, %r18467, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18507, %r18467, %r18468, %r18392; + // end inline asm + xor.b32 %r18919, %r18503, %r18443; + xor.b32 %r18920, %r18507, %r18444; + xor.b32 %r18660, %r30706, %r18919; + xor.b32 %r18661, %r30707, %r18920; + xor.b32 %r18637, %r30680, %r18919; 
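+ //
+ // ---- review annotation (not compiler output; inferred reading) ------
+ // The unrolled region around this point is consistent with one round of
+ // a Keccak-f[1600]-style permutation, with each 64-bit lane held as a
+ // pair of 32-bit registers:
+ //   * lop3.b32 ... 0x96 is a 3-input XOR (0xF0 ^ 0xCC ^ 0xAA = 0x96);
+ //     two chained lop3s give the 5-way column parity of the theta step.
+ //   * each shf.l.wrap.b32 pair is a funnel shift implementing a 64-bit
+ //     rotate; the immediates (44, 20, 61, 39, 18, 62, 43, 25, 8, 56,
+ //     41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, 1) match the
+ //     Keccak rho offsets.
+ //   * lop3.b32 ... 0xD2 computes a ^ (~b & c), i.e. the chi step.
+ // The mul.lo.s32 ... 16777619 / xor.b32 pairs further up are FNV-style
+ // mixes, (x * 0x01000193) ^ y, applied over a 512-iteration loop.
+ // ----------------------------------------------------------------------
+ //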
+ xor.b32 %r18636, %r30681, %r18920; + xor.b32 %r18580, %r30678, %r18919; + xor.b32 %r18581, %r30679, %r18920; + xor.b32 %r18668, %r30676, %r18919; + xor.b32 %r18669, %r30677, %r18920; + xor.b32 %r18597, %r30674, %r18919; + xor.b32 %r18596, %r30675, %r18920; + // begin inline asm + shf.l.wrap.b32 %r18511, %r18420, %r18419, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18515, %r18419, %r18420, %r18392; + // end inline asm + xor.b32 %r18921, %r18511, %r18455; + xor.b32 %r18922, %r18515, %r18456; + xor.b32 %r18612, %r30704, %r18921; + xor.b32 %r18613, %r30705, %r18922; + xor.b32 %r18532, %r30672, %r18921; + xor.b32 %r18533, %r30673, %r18922; + xor.b32 %r18549, %r30670, %r18921; + xor.b32 %r18548, %r30671, %r18922; + xor.b32 %r18588, %r30668, %r18921; + xor.b32 %r18589, %r30669, %r18922; + xor.b32 %r18620, %r30666, %r18921; + xor.b32 %r18621, %r30667, %r18922; + mov.u32 %r18526, 44; + // begin inline asm + shf.l.wrap.b32 %r18519, %r18525, %r18524, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18523, %r18524, %r18525, %r18526; + // end inline asm + mov.u32 %r18534, 20; + // begin inline asm + shf.l.wrap.b32 %r18527, %r18533, %r18532, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18531, %r18532, %r18533, %r18534; + // end inline asm + mov.u32 %r18542, 61; + // begin inline asm + shf.l.wrap.b32 %r18535, %r18541, %r18540, %r18542; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18539, %r18540, %r18541, %r18542; + // end inline asm + mov.u32 %r18550, 39; + // begin inline asm + shf.l.wrap.b32 %r18543, %r18549, %r18548, %r18550; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18547, %r18548, %r18549, %r18550; + // end inline asm + mov.u32 %r18558, 18; + // begin inline asm + shf.l.wrap.b32 %r18551, %r18557, %r18556, %r18558; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18555, %r18556, %r18557, %r18558; + // end inline asm + mov.u32 %r18566, 62; + // begin inline asm + shf.l.wrap.b32 %r18559, %r18565, %r18564, %r18566; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18563, %r18564, %r18565, %r18566; + // end inline asm + mov.u32 %r18574, 43; + // begin inline asm + shf.l.wrap.b32 %r18567, %r18573, %r18572, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18571, %r18572, %r18573, %r18574; + // end inline asm + mov.u32 %r18582, 25; + // begin inline asm + shf.l.wrap.b32 %r18575, %r18581, %r18580, %r18582; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18579, %r18580, %r18581, %r18582; + // end inline asm + mov.u32 %r18590, 8; + // begin inline asm + shf.l.wrap.b32 %r18583, %r18589, %r18588, %r18590; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18587, %r18588, %r18589, %r18590; + // end inline asm + mov.u32 %r18598, 56; + // begin inline asm + shf.l.wrap.b32 %r18591, %r18597, %r18596, %r18598; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18595, %r18596, %r18597, %r18598; + // end inline asm + mov.u32 %r18606, 41; + // begin inline asm + shf.l.wrap.b32 %r18599, %r18605, %r18604, %r18606; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18603, %r18604, %r18605, %r18606; + // end inline asm + mov.u32 %r18614, 27; + // begin inline asm + shf.l.wrap.b32 %r18607, %r18613, %r18612, %r18614; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18611, %r18612, %r18613, %r18614; + // end inline asm + mov.u32 %r18622, 14; + // begin inline asm + shf.l.wrap.b32 %r18615, %r18621, %r18620, %r18622; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r18619, %r18620, %r18621, %r18622; + // end inline asm + mov.u32 %r18630, 2; + // begin inline asm + shf.l.wrap.b32 %r18623, %r18629, %r18628, %r18630; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18627, %r18628, %r18629, %r18630; + // end inline asm + mov.u32 %r18638, 55; + // begin inline asm + shf.l.wrap.b32 %r18631, %r18637, %r18636, %r18638; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18635, %r18636, %r18637, %r18638; + // end inline asm + mov.u32 %r18646, 45; + // begin inline asm + shf.l.wrap.b32 %r18639, %r18645, %r18644, %r18646; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18643, %r18644, %r18645, %r18646; + // end inline asm + mov.u32 %r18654, 36; + // begin inline asm + shf.l.wrap.b32 %r18647, %r18653, %r18652, %r18654; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18651, %r18652, %r18653, %r18654; + // end inline asm + mov.u32 %r18662, 28; + // begin inline asm + shf.l.wrap.b32 %r18655, %r18661, %r18660, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18659, %r18660, %r18661, %r18662; + // end inline asm + mov.u32 %r18670, 21; + // begin inline asm + shf.l.wrap.b32 %r18663, %r18669, %r18668, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18667, %r18668, %r18669, %r18670; + // end inline asm + mov.u32 %r18678, 15; + // begin inline asm + shf.l.wrap.b32 %r18671, %r18677, %r18676, %r18678; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18675, %r18676, %r18677, %r18678; + // end inline asm + mov.u32 %r18686, 10; + // begin inline asm + shf.l.wrap.b32 %r18679, %r18685, %r18684, %r18686; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18683, %r18684, %r18685, %r18686; + // end inline asm + mov.u32 %r18694, 6; + // begin inline asm + shf.l.wrap.b32 %r18687, %r18693, %r18692, %r18694; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18691, %r18692, %r18693, %r18694; + // end inline asm + mov.u32 %r18702, 3; + // begin inline asm + shf.l.wrap.b32 %r18695, %r18701, %r18700, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18699, %r18700, %r18701, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18703, %r18709, %r18708, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18707, %r18708, %r18709, %r18392; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18711, %r18746, %r18519, %r18567, 0xD2; + lop3.b32 %r18712, %r18749, %r18523, %r18571, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30714, %r18519, %r18567, %r18663, 0xD2; + lop3.b32 %r30715, %r18523, %r18571, %r18667, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30710, %r18567, %r18663, %r18615, 0xD2; + lop3.b32 %r30711, %r18571, %r18667, %r18619, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30706, %r18663, %r18615, %r18746, 0xD2; + lop3.b32 %r30707, %r18667, %r18619, %r18749, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30704, %r18615, %r18746, %r18519, 0xD2; + lop3.b32 %r30705, %r18619, %r18749, %r18523, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30700, %r18655, %r18527, %r18695, 0xD2; + lop3.b32 %r30701, %r18659, %r18531, %r18699, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30712, %r18527, %r18695, %r18639, 0xD2; + lop3.b32 %r30713, %r18531, %r18699, %r18643, 0xD2; + // end inline asm + // begin inline asm + // chi + 
lop3.b32 %r30708, %r18695, %r18639, %r18535, 0xD2; + lop3.b32 %r30709, %r18699, %r18643, %r18539, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30680, %r18639, %r18535, %r18655, 0xD2; + lop3.b32 %r30681, %r18643, %r18539, %r18659, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30680, %r30681}; + // begin inline asm + // chi + lop3.b32 %r30672, %r18535, %r18655, %r18527, 0xD2; + lop3.b32 %r30673, %r18539, %r18659, %r18531, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30672, %r30673}; + // begin inline asm + // chi + lop3.b32 %r30698, %r18703, %r18687, %r18575, 0xD2; + lop3.b32 %r30699, %r18707, %r18691, %r18579, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30698, %r30699}; + // begin inline asm + // chi + lop3.b32 %r30692, %r18687, %r18575, %r18583, 0xD2; + lop3.b32 %r30693, %r18691, %r18579, %r18587, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30692, %r30693}; + // begin inline asm + // chi + lop3.b32 %r30686, %r18575, %r18583, %r18551, 0xD2; + lop3.b32 %r30687, %r18579, %r18587, %r18555, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30686, %r30687}; + // begin inline asm + // chi + lop3.b32 %r30678, %r18583, %r18551, %r18703, 0xD2; + lop3.b32 %r30679, %r18587, %r18555, %r18707, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30678, %r30679}; + // begin inline asm + // chi + lop3.b32 %r30670, %r18551, %r18703, %r18687, 0xD2; + lop3.b32 %r30671, %r18555, %r18707, %r18691, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30670, %r30671}; + // begin inline asm + // chi + lop3.b32 %r30696, %r18607, %r18647, %r18679, 0xD2; + lop3.b32 %r30697, %r18611, %r18651, %r18683, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30696, %r30697}; + // begin inline asm + // chi + lop3.b32 %r30690, %r18647, %r18679, %r18671, 0xD2; + lop3.b32 %r30691, %r18651, %r18683, %r18675, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30690, %r30691}; + // begin inline asm + // chi + lop3.b32 %r30684, %r18679, %r18671, %r18591, 0xD2; + lop3.b32 %r30685, %r18683, %r18675, %r18595, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30684, %r30685}; + // begin inline asm + // chi + lop3.b32 %r30676, %r18671, %r18591, %r18607, 0xD2; + lop3.b32 %r30677, %r18675, %r18595, %r18611, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30676, %r30677}; + // begin inline asm + // chi + lop3.b32 %r30668, %r18591, %r18607, %r18647, 0xD2; + lop3.b32 %r30669, %r18595, %r18611, %r18651, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30668, %r30669}; + // begin inline asm + // chi + lop3.b32 %r30694, %r18559, %r18631, %r18543, 0xD2; + lop3.b32 %r30695, %r18563, %r18635, %r18547, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30694, %r30695}; + // begin inline asm + // chi + lop3.b32 %r30688, %r18631, %r18543, %r18599, 0xD2; + lop3.b32 %r30689, %r18635, %r18547, %r18603, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30688, %r30689}; + // begin inline asm + // chi + lop3.b32 %r30682, %r18543, %r18599, %r18623, 0xD2; + lop3.b32 %r30683, %r18547, %r18603, %r18627, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30682, %r30683}; + // begin inline asm + // chi + lop3.b32 %r30674, %r18599, %r18623, %r18559, 0xD2; + lop3.b32 %r30675, %r18603, %r18627, %r18563, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30674, %r30675}; + // begin inline asm + // chi + lop3.b32 %r30666, %r18623, %r18559, %r18631, 0xD2; + lop3.b32 %r30667, %r18627, 
%r18563, %r18635, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30666, %r30667}; + mul.wide.s32 %rd857, %r30716, 8; + add.s64 %rd856, %rd789, %rd857; + // begin inline asm + ld.global.nc.v2.u32 {%r18911,%r18912}, [%rd856]; + // end inline asm + xor.b32 %r30702, %r18711, %r18911; + xor.b32 %r30703, %r18712, %r18912; + add.s32 %r30716, %r30716, 1; + setp.lt.u32 %p34, %r30716, 23; + @%p34 bra $L__BB2_57; + + st.local.v2.u32 [%rd2+32], {%r30714, %r30715}; + st.local.v2.u32 [%rd2+72], {%r30712, %r30713}; + st.local.v2.u32 [%rd2+40], {%r30710, %r30711}; + st.local.v2.u32 [%rd2+80], {%r30708, %r30709}; + st.local.v2.u32 [%rd2+48], {%r30706, %r30707}; + st.local.v2.u32 [%rd2+56], {%r30704, %r30705}; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + // begin inline asm + // xor5 + lop3.b32 %r18923, %r30702, %r30700, %r30698, 0x96; + lop3.b32 %r18923, %r18923, %r30696, %r30694, 0x96; + lop3.b32 %r18924, %r30703, %r30701, %r30699, 0x96; + lop3.b32 %r18924, %r18924, %r30697, %r30695, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18935, %r30714, %r30712, %r30692, 0x96; + lop3.b32 %r18935, %r18935, %r30690, %r30688, 0x96; + lop3.b32 %r18936, %r30715, %r30713, %r30693, 0x96; + lop3.b32 %r18936, %r18936, %r30691, %r30689, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18947, %r30710, %r30708, %r30686, 0x96; + lop3.b32 %r18947, %r18947, %r30684, %r30682, 0x96; + lop3.b32 %r18948, %r30711, %r30709, %r30687, 0x96; + lop3.b32 %r18948, %r18948, %r30685, %r30683, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18959, %r30706, %r30680, %r30678, 0x96; + lop3.b32 %r18959, %r18959, %r30676, %r30674, 0x96; + lop3.b32 %r18960, %r30707, %r30681, %r30679, 0x96; + lop3.b32 %r18960, %r18960, %r30677, %r30675, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18971, %r30704, %r30672, %r30670, 0x96; + lop3.b32 %r18971, %r18971, %r30668, %r30666, 0x96; + lop3.b32 %r18972, %r30705, %r30673, %r30671, 0x96; + lop3.b32 %r18972, %r18972, %r30669, %r30667, 0x96; + // end inline asm + mov.u32 %r19175, 1; + // begin inline asm + shf.l.wrap.b32 %r18983, %r18936, %r18935, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18987, %r18935, %r18936, %r19175; + // end inline asm + xor.b32 %r19202, %r18983, %r18971; + xor.b32 %r19203, %r18987, %r18972; + xor.b32 %r19130, %r30702, %r19202; + xor.b32 %r19133, %r30703, %r19203; + xor.b32 %r19093, %r30699, %r19203; + xor.b32 %r19092, %r30698, %r19202; + st.local.v2.u32 [%rd2+104], {%r19092, %r19093}; + // begin inline asm + shf.l.wrap.b32 %r18991, %r18948, %r18947, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18995, %r18947, %r18948, %r19175; + // end inline asm + xor.b32 %r19204, %r18991, %r18923; + xor.b32 %r19205, %r18995, %r18924; + xor.b32 %r19029, %r30712, %r19204; + xor.b32 %r19028, %r30713, %r19205; + xor.b32 %r19068, %r30691, %r19205; + xor.b32 %r19069, %r30690, %r19204; + st.local.v2.u32 [%rd2+152], {%r19069, %r19068}; + // begin inline asm + shf.l.wrap.b32 %r18999, %r18960, %r18959, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19003, %r18959, %r18960, %r19175; + // end inline asm + xor.b32 %r19206, %r18999, %r18935; + xor.b32 %r19207, %r19003, %r18936; + xor.b32 %r19052, %r30687, %r19207; + xor.b32 %r19053, %r30686, %r19206; + st.local.v2.u32 [%rd2+120], {%r19053, %r19052}; + xor.b32 %r19044, %r30683, %r19207; + xor.b32 %r19045, %r30682, %r19206; + st.local.v2.u32 [%rd2+200], {%r19045, %r19044}; + // begin 
inline asm + shf.l.wrap.b32 %r19007, %r18972, %r18971, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19011, %r18971, %r18972, %r19175; + // end inline asm + xor.b32 %r19208, %r19007, %r18947; + xor.b32 %r19209, %r19011, %r18948; + xor.b32 %r19076, %r30706, %r19208; + xor.b32 %r19077, %r30707, %r19209; + xor.b32 %r19085, %r30677, %r19209; + xor.b32 %r19084, %r30676, %r19208; + st.local.v2.u32 [%rd2+168], {%r19084, %r19085}; + // begin inline asm + shf.l.wrap.b32 %r19015, %r18924, %r18923, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19019, %r18923, %r18924, %r19175; + // end inline asm + xor.b32 %r19210, %r19015, %r18959; + xor.b32 %r19211, %r19019, %r18960; + xor.b32 %r19036, %r30672, %r19210; + xor.b32 %r19037, %r30673, %r19211; + xor.b32 %r19061, %r30667, %r19211; + xor.b32 %r19060, %r30666, %r19210; + st.local.v2.u32 [%rd2+216], {%r19060, %r19061}; + // begin inline asm + shf.l.wrap.b32 %r19023, %r19029, %r19028, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19027, %r19028, %r19029, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19031, %r19037, %r19036, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19035, %r19036, %r19037, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19043, %r19044, %r19045, %r18542; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19039, %r19045, %r19044, %r18542; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r19039, %r19043}; + // begin inline asm + shf.l.wrap.b32 %r19047, %r19053, %r19052, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19051, %r19052, %r19053, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19055, %r19061, %r19060, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19059, %r19060, %r19061, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19067, %r19068, %r19069, %r18646; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19063, %r19069, %r19068, %r18646; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r19063, %r19067}; + // begin inline asm + shf.l.wrap.b32 %r19071, %r19077, %r19076, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19075, %r19076, %r19077, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19079, %r19085, %r19084, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19083, %r19084, %r19085, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19087, %r19093, %r19092, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19091, %r19092, %r19093, %r18702; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19095, %r19130, %r19023, %r19047, 0xD2; + lop3.b32 %r19096, %r19133, %r19027, %r19051, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19103, %r19023, %r19047, %r19079, 0xD2; + lop3.b32 %r19104, %r19027, %r19051, %r19083, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r19103, %r19104}; + // begin inline asm + // chi + lop3.b32 %r19111, %r19047, %r19079, %r19055, 0xD2; + lop3.b32 %r19112, %r19051, %r19083, %r19059, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r19111, %r19112}; + // begin inline asm + // chi + lop3.b32 %r19119, %r19079, %r19055, %r19130, 0xD2; + lop3.b32 %r19120, %r19083, %r19059, %r19133, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r19119, %r19120}; + // begin inline asm + // chi + lop3.b32 %r19127, 
%r19055, %r19130, %r19023, 0xD2; + lop3.b32 %r19128, %r19059, %r19133, %r19027, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r19127, %r19128}; + // begin inline asm + // chi + lop3.b32 %r19135, %r19071, %r19031, %r19087, 0xD2; + lop3.b32 %r19136, %r19075, %r19035, %r19091, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r19135, %r19136}; + // begin inline asm + // chi + lop3.b32 %r19143, %r19031, %r19087, %r19063, 0xD2; + lop3.b32 %r19144, %r19035, %r19091, %r19067, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r19143, %r19144}; + // begin inline asm + // chi + lop3.b32 %r19151, %r19087, %r19063, %r19039, 0xD2; + lop3.b32 %r19152, %r19091, %r19067, %r19043, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r19151, %r19152}; + // begin inline asm + ld.global.nc.v2.u32 {%r19159,%r19160}, [%rd790]; + // end inline asm + xor.b32 %r19212, %r19096, %r19160; + xor.b32 %r19213, %r19095, %r19159; + mov.b64 %rd1333, {%r19213, %r19212}; + mov.b64 %rd1334, {%r19103, %r19104}; + mov.b64 %rd1335, {%r19111, %r19112}; + mov.b64 %rd1336, {%r19119, %r19120}; + mov.b64 %rd1337, {%r19127, %r19128}; + mov.b64 %rd1338, {%r19135, %r19136}; + mov.b64 %rd1339, {%r19143, %r19144}; + mov.b64 %rd1340, {%r19151, %r19152}; + mov.u32 %r30717, 0; + st.local.v2.u32 [%rd2+24], {%r19213, %r19212}; + st.local.v2.u32 [%rd176+96], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+104], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+112], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+120], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+128], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+136], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+144], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+152], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+160], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+168], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+176], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+184], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+192], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+200], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+208], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+216], {%r30717, %r30717}; + mov.u32 %r30732, -2147483648; + st.local.v2.u32 [%rd176+88], {%r19175, %r30732}; + mov.u32 %r30718, %r30717; + mov.u32 %r30719, %r30717; + mov.u32 %r30720, %r30717; + mov.u32 %r30721, %r30717; + mov.u32 %r30722, %r30717; + mov.u32 %r30723, %r30717; + mov.u32 %r30724, %r30717; + mov.u32 %r30725, %r30717; + mov.u32 %r30726, %r30717; + mov.u32 %r30727, %r30717; + mov.u32 %r30728, %r30717; + mov.u32 %r30729, %r30717; + mov.u32 %r30730, %r30717; + mov.u32 %r30731, %r19175; + mov.u32 %r30733, %r30717; + mov.u32 %r30734, %r30717; + mov.u32 %r30735, %r30717; + mov.u32 %r30736, %r30717; + mov.u32 %r30737, %r30717; + mov.u32 %r30738, %r30717; + mov.u32 %r30739, %r30717; + mov.u32 %r30740, %r30717; + mov.u32 %r30741, %r30717; + mov.u32 %r30742, %r30717; + mov.u32 %r30743, %r30717; + mov.u32 %r30744, %r30717; + mov.u32 %r30745, %r30717; + mov.u32 %r30746, %r30717; + mov.u32 %r30747, %r30717; + mov.u32 %r30748, %r30717; + mov.u32 %r30749, %r30717; + mov.u32 %r30750, %r30717; + mov.u32 %r30767, %r30717; + +$L__BB2_59: + // begin inline asm + // xor5 + lop3.b32 %r19214, %r30753, %r30751, %r30749, 0x96; + lop3.b32 %r19214, %r19214, %r30747, %r30745, 0x96; + lop3.b32 %r19215, %r30754, %r30752, %r30750, 0x96; + lop3.b32 %r19215, %r19215, %r30748, %r30746, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19226, %r30765, %r30763, %r30743, 0x96; + lop3.b32 %r19226, %r19226, 
%r30741, %r30739, 0x96; + lop3.b32 %r19227, %r30766, %r30764, %r30744, 0x96; + lop3.b32 %r19227, %r19227, %r30742, %r30740, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19238, %r30761, %r30759, %r30737, 0x96; + lop3.b32 %r19238, %r19238, %r30735, %r30733, 0x96; + lop3.b32 %r19239, %r30762, %r30760, %r30738, 0x96; + lop3.b32 %r19239, %r19239, %r30736, %r30734, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19250, %r30757, %r30731, %r30729, 0x96; + lop3.b32 %r19250, %r19250, %r30727, %r30725, 0x96; + lop3.b32 %r19251, %r30758, %r30732, %r30730, 0x96; + lop3.b32 %r19251, %r19251, %r30728, %r30726, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19262, %r30755, %r30723, %r30721, 0x96; + lop3.b32 %r19262, %r19262, %r30719, %r30717, 0x96; + lop3.b32 %r19263, %r30756, %r30724, %r30722, 0x96; + lop3.b32 %r19263, %r19263, %r30720, %r30718, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19274, %r19227, %r19226, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19278, %r19226, %r19227, %r19175; + // end inline asm + xor.b32 %r19708, %r19274, %r19262; + xor.b32 %r19709, %r19278, %r19263; + xor.b32 %r19541, %r30753, %r19708; + xor.b32 %r19544, %r30754, %r19709; + xor.b32 %r19448, %r30751, %r19708; + xor.b32 %r19447, %r30752, %r19709; + xor.b32 %r19495, %r30749, %r19708; + xor.b32 %r19496, %r30750, %r19709; + xor.b32 %r19400, %r30747, %r19708; + xor.b32 %r19399, %r30748, %r19709; + xor.b32 %r19351, %r30745, %r19708; + xor.b32 %r19352, %r30746, %r19709; + // begin inline asm + shf.l.wrap.b32 %r19282, %r19239, %r19238, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19286, %r19238, %r19239, %r19175; + // end inline asm + xor.b32 %r19710, %r19282, %r19214; + xor.b32 %r19711, %r19286, %r19215; + xor.b32 %r19503, %r30765, %r19710; + xor.b32 %r19504, %r30766, %r19711; + xor.b32 %r19320, %r30763, %r19710; + xor.b32 %r19319, %r30764, %r19711; + xor.b32 %r19479, %r30743, %r19710; + xor.b32 %r19480, %r30744, %r19711; + xor.b32 %r19440, %r30741, %r19710; + xor.b32 %r19439, %r30742, %r19711; + xor.b32 %r19423, %r30739, %r19710; + xor.b32 %r19424, %r30740, %r19711; + // begin inline asm + shf.l.wrap.b32 %r19290, %r19251, %r19250, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19294, %r19250, %r19251, %r19175; + // end inline asm + xor.b32 %r19712, %r19290, %r19226; + xor.b32 %r19713, %r19294, %r19227; + xor.b32 %r19360, %r30761, %r19712; + xor.b32 %r19359, %r30762, %r19713; + xor.b32 %r19487, %r30759, %r19712; + xor.b32 %r19488, %r30760, %r19713; + xor.b32 %r19368, %r30737, %r19712; + xor.b32 %r19367, %r30738, %r19713; + xor.b32 %r19471, %r30735, %r19712; + xor.b32 %r19472, %r30736, %r19713; + xor.b32 %r19336, %r30733, %r19712; + xor.b32 %r19335, %r30734, %r19713; + // begin inline asm + shf.l.wrap.b32 %r19298, %r19263, %r19262, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19302, %r19262, %r19263, %r19175; + // end inline asm + xor.b32 %r19714, %r19298, %r19238; + xor.b32 %r19715, %r19302, %r19239; + xor.b32 %r19455, %r30757, %r19714; + xor.b32 %r19456, %r30758, %r19715; + xor.b32 %r19432, %r30731, %r19714; + xor.b32 %r19431, %r30732, %r19715; + xor.b32 %r19375, %r30729, %r19714; + xor.b32 %r19376, %r30730, %r19715; + xor.b32 %r19463, %r30727, %r19714; + xor.b32 %r19464, %r30728, %r19715; + xor.b32 %r19392, %r30725, %r19714; + xor.b32 %r19391, %r30726, %r19715; + // begin inline asm + shf.l.wrap.b32 %r19306, %r19215, %r19214, %r19175; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r19310, %r19214, %r19215, %r19175; + // end inline asm + xor.b32 %r19716, %r19306, %r19250; + xor.b32 %r19717, %r19310, %r19251; + xor.b32 %r19407, %r30755, %r19716; + xor.b32 %r19408, %r30756, %r19717; + xor.b32 %r19327, %r30723, %r19716; + xor.b32 %r19328, %r30724, %r19717; + xor.b32 %r19344, %r30721, %r19716; + xor.b32 %r19343, %r30722, %r19717; + xor.b32 %r19383, %r30719, %r19716; + xor.b32 %r19384, %r30720, %r19717; + xor.b32 %r19415, %r30717, %r19716; + xor.b32 %r19416, %r30718, %r19717; + mov.u32 %r19321, 44; + // begin inline asm + shf.l.wrap.b32 %r19314, %r19320, %r19319, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19318, %r19319, %r19320, %r19321; + // end inline asm + mov.u32 %r19329, 20; + // begin inline asm + shf.l.wrap.b32 %r19322, %r19328, %r19327, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19326, %r19327, %r19328, %r19329; + // end inline asm + mov.u32 %r19337, 61; + // begin inline asm + shf.l.wrap.b32 %r19330, %r19336, %r19335, %r19337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19334, %r19335, %r19336, %r19337; + // end inline asm + mov.u32 %r19345, 39; + // begin inline asm + shf.l.wrap.b32 %r19338, %r19344, %r19343, %r19345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19342, %r19343, %r19344, %r19345; + // end inline asm + mov.u32 %r19353, 18; + // begin inline asm + shf.l.wrap.b32 %r19346, %r19352, %r19351, %r19353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19350, %r19351, %r19352, %r19353; + // end inline asm + mov.u32 %r19361, 62; + // begin inline asm + shf.l.wrap.b32 %r19354, %r19360, %r19359, %r19361; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19358, %r19359, %r19360, %r19361; + // end inline asm + mov.u32 %r19369, 43; + // begin inline asm + shf.l.wrap.b32 %r19362, %r19368, %r19367, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19366, %r19367, %r19368, %r19369; + // end inline asm + mov.u32 %r19377, 25; + // begin inline asm + shf.l.wrap.b32 %r19370, %r19376, %r19375, %r19377; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19374, %r19375, %r19376, %r19377; + // end inline asm + mov.u32 %r19385, 8; + // begin inline asm + shf.l.wrap.b32 %r19378, %r19384, %r19383, %r19385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19382, %r19383, %r19384, %r19385; + // end inline asm + mov.u32 %r19393, 56; + // begin inline asm + shf.l.wrap.b32 %r19386, %r19392, %r19391, %r19393; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19390, %r19391, %r19392, %r19393; + // end inline asm + mov.u32 %r19401, 41; + // begin inline asm + shf.l.wrap.b32 %r19394, %r19400, %r19399, %r19401; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19398, %r19399, %r19400, %r19401; + // end inline asm + mov.u32 %r19409, 27; + // begin inline asm + shf.l.wrap.b32 %r19402, %r19408, %r19407, %r19409; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19406, %r19407, %r19408, %r19409; + // end inline asm + mov.u32 %r19417, 14; + // begin inline asm + shf.l.wrap.b32 %r19410, %r19416, %r19415, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19414, %r19415, %r19416, %r19417; + // end inline asm + mov.u32 %r19425, 2; + // begin inline asm + shf.l.wrap.b32 %r19418, %r19424, %r19423, %r19425; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19422, %r19423, %r19424, %r19425; + // end inline asm + mov.u32 
%r19433, 55; + // begin inline asm + shf.l.wrap.b32 %r19426, %r19432, %r19431, %r19433; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19430, %r19431, %r19432, %r19433; + // end inline asm + mov.u32 %r19441, 45; + // begin inline asm + shf.l.wrap.b32 %r19434, %r19440, %r19439, %r19441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19438, %r19439, %r19440, %r19441; + // end inline asm + mov.u32 %r19449, 36; + // begin inline asm + shf.l.wrap.b32 %r19442, %r19448, %r19447, %r19449; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19446, %r19447, %r19448, %r19449; + // end inline asm + mov.u32 %r19457, 28; + // begin inline asm + shf.l.wrap.b32 %r19450, %r19456, %r19455, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19454, %r19455, %r19456, %r19457; + // end inline asm + mov.u32 %r19465, 21; + // begin inline asm + shf.l.wrap.b32 %r19458, %r19464, %r19463, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19462, %r19463, %r19464, %r19465; + // end inline asm + mov.u32 %r19473, 15; + // begin inline asm + shf.l.wrap.b32 %r19466, %r19472, %r19471, %r19473; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19470, %r19471, %r19472, %r19473; + // end inline asm + mov.u32 %r19481, 10; + // begin inline asm + shf.l.wrap.b32 %r19474, %r19480, %r19479, %r19481; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19478, %r19479, %r19480, %r19481; + // end inline asm + mov.u32 %r19489, 6; + // begin inline asm + shf.l.wrap.b32 %r19482, %r19488, %r19487, %r19489; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19486, %r19487, %r19488, %r19489; + // end inline asm + mov.u32 %r19497, 3; + // begin inline asm + shf.l.wrap.b32 %r19490, %r19496, %r19495, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19494, %r19495, %r19496, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19498, %r19504, %r19503, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19502, %r19503, %r19504, %r19175; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19506, %r19541, %r19314, %r19362, 0xD2; + lop3.b32 %r19507, %r19544, %r19318, %r19366, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30765, %r19314, %r19362, %r19458, 0xD2; + lop3.b32 %r30766, %r19318, %r19366, %r19462, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30761, %r19362, %r19458, %r19410, 0xD2; + lop3.b32 %r30762, %r19366, %r19462, %r19414, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30757, %r19458, %r19410, %r19541, 0xD2; + lop3.b32 %r30758, %r19462, %r19414, %r19544, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30755, %r19410, %r19541, %r19314, 0xD2; + lop3.b32 %r30756, %r19414, %r19544, %r19318, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30751, %r19450, %r19322, %r19490, 0xD2; + lop3.b32 %r30752, %r19454, %r19326, %r19494, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30763, %r19322, %r19490, %r19434, 0xD2; + lop3.b32 %r30764, %r19326, %r19494, %r19438, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30759, %r19490, %r19434, %r19330, 0xD2; + lop3.b32 %r30760, %r19494, %r19438, %r19334, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30731, %r19434, %r19330, %r19450, 0xD2; + lop3.b32 %r30732, %r19438, %r19334, %r19454, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r30731, 
%r30732}; + // begin inline asm + // chi + lop3.b32 %r30723, %r19330, %r19450, %r19322, 0xD2; + lop3.b32 %r30724, %r19334, %r19454, %r19326, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r30723, %r30724}; + // begin inline asm + // chi + lop3.b32 %r30749, %r19498, %r19482, %r19370, 0xD2; + lop3.b32 %r30750, %r19502, %r19486, %r19374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+104], {%r30749, %r30750}; + // begin inline asm + // chi + lop3.b32 %r30743, %r19482, %r19370, %r19378, 0xD2; + lop3.b32 %r30744, %r19486, %r19374, %r19382, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+112], {%r30743, %r30744}; + // begin inline asm + // chi + lop3.b32 %r30737, %r19370, %r19378, %r19346, 0xD2; + lop3.b32 %r30738, %r19374, %r19382, %r19350, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+120], {%r30737, %r30738}; + // begin inline asm + // chi + lop3.b32 %r30729, %r19378, %r19346, %r19498, 0xD2; + lop3.b32 %r30730, %r19382, %r19350, %r19502, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+128], {%r30729, %r30730}; + // begin inline asm + // chi + lop3.b32 %r30721, %r19346, %r19498, %r19482, 0xD2; + lop3.b32 %r30722, %r19350, %r19502, %r19486, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+136], {%r30721, %r30722}; + // begin inline asm + // chi + lop3.b32 %r30747, %r19402, %r19442, %r19474, 0xD2; + lop3.b32 %r30748, %r19406, %r19446, %r19478, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+144], {%r30747, %r30748}; + // begin inline asm + // chi + lop3.b32 %r30741, %r19442, %r19474, %r19466, 0xD2; + lop3.b32 %r30742, %r19446, %r19478, %r19470, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+152], {%r30741, %r30742}; + // begin inline asm + // chi + lop3.b32 %r30735, %r19474, %r19466, %r19386, 0xD2; + lop3.b32 %r30736, %r19478, %r19470, %r19390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+160], {%r30735, %r30736}; + // begin inline asm + // chi + lop3.b32 %r30727, %r19466, %r19386, %r19402, 0xD2; + lop3.b32 %r30728, %r19470, %r19390, %r19406, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+168], {%r30727, %r30728}; + // begin inline asm + // chi + lop3.b32 %r30719, %r19386, %r19402, %r19442, 0xD2; + lop3.b32 %r30720, %r19390, %r19406, %r19446, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+176], {%r30719, %r30720}; + // begin inline asm + // chi + lop3.b32 %r30745, %r19354, %r19426, %r19338, 0xD2; + lop3.b32 %r30746, %r19358, %r19430, %r19342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+184], {%r30745, %r30746}; + // begin inline asm + // chi + lop3.b32 %r30739, %r19426, %r19338, %r19394, 0xD2; + lop3.b32 %r30740, %r19430, %r19342, %r19398, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+192], {%r30739, %r30740}; + // begin inline asm + // chi + lop3.b32 %r30733, %r19338, %r19394, %r19418, 0xD2; + lop3.b32 %r30734, %r19342, %r19398, %r19422, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+200], {%r30733, %r30734}; + // begin inline asm + // chi + lop3.b32 %r30725, %r19394, %r19418, %r19354, 0xD2; + lop3.b32 %r30726, %r19398, %r19422, %r19358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+208], {%r30725, %r30726}; + // begin inline asm + // chi + lop3.b32 %r30717, %r19418, %r19354, %r19426, 0xD2; + lop3.b32 %r30718, %r19422, %r19358, %r19430, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+216], {%r30717, %r30718}; + mul.wide.s32 %rd864, %r30767, 8; + add.s64 %rd863, %rd789, %rd864; + // begin inline asm + ld.global.nc.v2.u32 {%r19706,%r19707}, [%rd863]; + // end inline asm + xor.b32 %r30753, %r19506, %r19706; + 
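+ //
+ // ---- review annotation (not compiler output; inferred reading) ------
+ // The ld.global.nc.v2.u32 just above fetches an 8-byte entry from the
+ // table at %rd789, indexed by the round counter scaled by 8, and the
+ // xor.b32 pair (low half above, high half below) folds it into lane 0:
+ // this matches the Keccak iota step, with the round constants kept in
+ // global memory. The branch below loops while the counter is < 23, and
+ // one further round is emitted after the loop, for 24 rounds in total,
+ // as in Keccak-f[1600].
+ // ----------------------------------------------------------------------
+ //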
xor.b32 %r30754, %r19507, %r19707; + add.s32 %r30767, %r30767, 1; + setp.lt.u32 %p35, %r30767, 23; + @%p35 bra $L__BB2_59; + + mov.u32 %r19817, 1; + st.local.v2.u32 [%rd176+32], {%r30765, %r30766}; + st.local.v2.u32 [%rd176+72], {%r30763, %r30764}; + st.local.v2.u32 [%rd176+40], {%r30761, %r30762}; + st.local.v2.u32 [%rd176+80], {%r30759, %r30760}; + st.local.v2.u32 [%rd176+48], {%r30757, %r30758}; + st.local.v2.u32 [%rd176+56], {%r30755, %r30756}; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + // begin inline asm + // xor5 + lop3.b32 %r19718, %r30753, %r30751, %r30749, 0x96; + lop3.b32 %r19718, %r19718, %r30747, %r30745, 0x96; + lop3.b32 %r19719, %r30754, %r30752, %r30750, 0x96; + lop3.b32 %r19719, %r19719, %r30748, %r30746, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19730, %r30765, %r30763, %r30743, 0x96; + lop3.b32 %r19730, %r19730, %r30741, %r30739, 0x96; + lop3.b32 %r19731, %r30766, %r30764, %r30744, 0x96; + lop3.b32 %r19731, %r19731, %r30742, %r30740, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19742, %r30761, %r30759, %r30737, 0x96; + lop3.b32 %r19742, %r19742, %r30735, %r30733, 0x96; + lop3.b32 %r19743, %r30762, %r30760, %r30738, 0x96; + lop3.b32 %r19743, %r19743, %r30736, %r30734, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19754, %r30757, %r30731, %r30729, 0x96; + lop3.b32 %r19754, %r19754, %r30727, %r30725, 0x96; + lop3.b32 %r19755, %r30758, %r30732, %r30730, 0x96; + lop3.b32 %r19755, %r19755, %r30728, %r30726, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19766, %r30755, %r30723, %r30721, 0x96; + lop3.b32 %r19766, %r19766, %r30719, %r30717, 0x96; + lop3.b32 %r19767, %r30756, %r30724, %r30722, 0x96; + lop3.b32 %r19767, %r19767, %r30720, %r30718, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19778, %r19731, %r19730, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19782, %r19730, %r19731, %r19817; + // end inline asm + xor.b32 %r19956, %r19778, %r19766; + xor.b32 %r19957, %r19782, %r19767; + xor.b32 %r19925, %r30753, %r19956; + xor.b32 %r19928, %r30754, %r19957; + xor.b32 %r19888, %r30750, %r19957; + xor.b32 %r19887, %r30749, %r19956; + st.local.v2.u32 [%rd176+104], {%r19887, %r19888}; + // begin inline asm + shf.l.wrap.b32 %r19786, %r19743, %r19742, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19790, %r19742, %r19743, %r19817; + // end inline asm + xor.b32 %r19958, %r19786, %r19718; + xor.b32 %r19959, %r19790, %r19719; + xor.b32 %r19824, %r30763, %r19958; + xor.b32 %r19823, %r30764, %r19959; + xor.b32 %r19863, %r30742, %r19959; + xor.b32 %r19864, %r30741, %r19958; + st.local.v2.u32 [%rd176+152], {%r19864, %r19863}; + // begin inline asm + shf.l.wrap.b32 %r19794, %r19755, %r19754, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19798, %r19754, %r19755, %r19817; + // end inline asm + xor.b32 %r19960, %r19794, %r19730; + xor.b32 %r19961, %r19798, %r19731; + xor.b32 %r19847, %r30738, %r19961; + xor.b32 %r19848, %r30737, %r19960; + st.local.v2.u32 [%rd176+120], {%r19848, %r19847}; + xor.b32 %r19839, %r30734, %r19961; + xor.b32 %r19840, %r30733, %r19960; + st.local.v2.u32 [%rd176+200], {%r19840, %r19839}; + // begin inline asm + shf.l.wrap.b32 %r19802, %r19767, %r19766, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19806, %r19766, %r19767, %r19817; + // end inline asm + xor.b32 %r19962, %r19802, %r19742; + xor.b32 %r19963, %r19806, %r19743; + xor.b32 %r19871, 
%r30757, %r19962; + xor.b32 %r19872, %r30758, %r19963; + xor.b32 %r19880, %r30728, %r19963; + xor.b32 %r19879, %r30727, %r19962; + st.local.v2.u32 [%rd176+168], {%r19879, %r19880}; + // begin inline asm + shf.l.wrap.b32 %r19810, %r19719, %r19718, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19814, %r19718, %r19719, %r19817; + // end inline asm + xor.b32 %r19964, %r19810, %r19754; + xor.b32 %r19965, %r19814, %r19755; + xor.b32 %r19831, %r30723, %r19964; + xor.b32 %r19832, %r30724, %r19965; + xor.b32 %r19856, %r30718, %r19965; + xor.b32 %r19855, %r30717, %r19964; + st.local.v2.u32 [%rd176+216], {%r19855, %r19856}; + // begin inline asm + shf.l.wrap.b32 %r19818, %r19824, %r19823, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19822, %r19823, %r19824, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19826, %r19832, %r19831, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19830, %r19831, %r19832, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19838, %r19839, %r19840, %r19337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19834, %r19840, %r19839, %r19337; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r19834, %r19838}; + // begin inline asm + shf.l.wrap.b32 %r19842, %r19848, %r19847, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19846, %r19847, %r19848, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19850, %r19856, %r19855, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19854, %r19855, %r19856, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19862, %r19863, %r19864, %r19441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19858, %r19864, %r19863, %r19441; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r19858, %r19862}; + // begin inline asm + shf.l.wrap.b32 %r19866, %r19872, %r19871, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19870, %r19871, %r19872, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19874, %r19880, %r19879, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19878, %r19879, %r19880, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19882, %r19888, %r19887, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19886, %r19887, %r19888, %r19497; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19890, %r19925, %r19818, %r19842, 0xD2; + lop3.b32 %r19891, %r19928, %r19822, %r19846, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19898, %r19818, %r19842, %r19874, 0xD2; + lop3.b32 %r19899, %r19822, %r19846, %r19878, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+32], {%r19898, %r19899}; + // begin inline asm + // chi + lop3.b32 %r19906, %r19842, %r19874, %r19850, 0xD2; + lop3.b32 %r19907, %r19846, %r19878, %r19854, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+40], {%r19906, %r19907}; + // begin inline asm + // chi + lop3.b32 %r19914, %r19874, %r19850, %r19925, 0xD2; + lop3.b32 %r19915, %r19878, %r19854, %r19928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+48], {%r19914, %r19915}; + // begin inline asm + // chi + lop3.b32 %r19922, %r19850, %r19925, %r19818, 0xD2; + lop3.b32 %r19923, %r19854, %r19928, %r19822, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+56], {%r19922, %r19923}; + // begin inline asm + // chi + lop3.b32 %r19930, %r19866, %r19826, %r19882, 0xD2; + lop3.b32 %r19931, 
%r19870, %r19830, %r19886, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+64], {%r19930, %r19931}; + // begin inline asm + // chi + lop3.b32 %r19938, %r19826, %r19882, %r19858, 0xD2; + lop3.b32 %r19939, %r19830, %r19886, %r19862, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+72], {%r19938, %r19939}; + // begin inline asm + // chi + lop3.b32 %r19946, %r19882, %r19858, %r19834, 0xD2; + lop3.b32 %r19947, %r19886, %r19862, %r19838, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+80], {%r19946, %r19947}; + // begin inline asm + ld.global.nc.v2.u32 {%r19954,%r19955}, [%rd790]; + // end inline asm + xor.b32 %r19966, %r19891, %r19955; + xor.b32 %r19967, %r19890, %r19954; + st.local.v2.u32 [%rd176+24], {%r19967, %r19966}; + mov.b64 %rd1342, {%r19898, %r19899}; + mov.b64 %rd1343, {%r19906, %r19907}; + mov.b64 %rd1346, {%r19930, %r19931}; + mov.b64 %rd1347, {%r19938, %r19939}; + mov.b64 %rd1348, {%r19946, %r19947}; + mov.b64 %rd1341, {%r19967, %r19966}; + mov.b64 %rd1344, {%r19914, %r19915}; + mov.b64 %rd1345, {%r19922, %r19923}; + bra.uni $L__BB2_61; + +$L__BB2_39: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd671, 1179641; + st.local.u64 [%rd2+8], %rd671; + st.local.u32 [%rd2+16], %r1695; + ld.global.u64 %rd672, [%rd126]; + ld.global.u64 %rd673, [%rd126+8]; + ld.global.u64 %rd674, [%rd126+16]; + ld.global.u64 %rd675, [%rd126+24]; + ld.global.u64 %rd676, [%rd126+32]; + ld.global.u64 %rd677, [%rd126+40]; + ld.global.u64 %rd678, [%rd126+48]; + ld.global.u64 %rd679, [%rd126+56]; + st.local.u64 [%rd2+24], %rd672; + st.local.u64 [%rd2+32], %rd673; + st.local.u64 [%rd2+40], %rd674; + st.local.u64 [%rd2+48], %rd675; + st.local.u64 [%rd2+56], %rd676; + st.local.u64 [%rd2+64], %rd677; + st.local.u64 [%rd2+72], %rd678; + st.local.u64 [%rd2+80], %rd679; + cvt.u32.u64 %r13441, %rd672; + xor.b32 %r13442, %r1695, %r13441; + st.local.u32 [%rd2+24], %r13442; + mov.u32 %r30294, 0; + st.local.v2.u32 [%rd2+96], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+104], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+112], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+120], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+128], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+136], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+144], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+152], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+160], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+168], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+176], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+184], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+192], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+200], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+208], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+216], {%r30294, %r30294}; + mov.u32 %r30309, -2147483648; + mov.u32 %r13414, 1; + st.local.v2.u32 [%rd2+88], {%r13414, %r30309}; + ld.local.v2.u32 {%r30330, %r30331}, [%rd2+24]; + mov.b64 {%r30328, %r30329}, %rd677; + shr.u64 %rd680, %rd673, 32; + cvt.u32.u64 %r30342, %rd673; + cvt.u32.u64 %r30343, %rd680; + shr.u64 %rd681, %rd678, 32; + cvt.u32.u64 %r30340, %rd678; + cvt.u32.u64 %r30341, %rd681; + shr.u64 %rd682, %rd674, 32; + cvt.u32.u64 %r30338, %rd674; + cvt.u32.u64 %r30339, %rd682; + shr.u64 %rd683, %rd679, 32; + cvt.u32.u64 %r30336, %rd679; + cvt.u32.u64 %r30337, %rd683; + shr.u64 %rd684, %rd675, 32; + cvt.u32.u64 %r30334, %rd675; + cvt.u32.u64 %r30335, %rd684; + shr.u64 %rd685, %rd676, 32; + cvt.u32.u64 %r30332, %rd676; + cvt.u32.u64 %r30333, %rd685; + mov.u32 %r30295, %r30294; + mov.u32 %r30296, %r30294; + mov.u32 %r30297, %r30294; + mov.u32 %r30298, %r30294; + 
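+ //
+ // ---- review annotation (not compiler output; inferred reading) ------
+ // The mov.u32 run that continues below materializes the zeroed lanes of
+ // the 25-lane sponge state in registers before entering the round loop
+ // at $L__BB2_40. Earlier in this block, eight 64-bit words were loaded
+ // from [%rd126] into the state at [%rd2+24..80], a 32-bit per-thread
+ // value (%r1695) was XORed into the low half of lane 0, and the pair
+ // {0x00000001, 0x80000000} was stored at [%rd2+88], consistent with a
+ // Keccak-style pad10*1 terminator: 0x01 right after the 64-byte message
+ // and 0x80 in the last byte of the rate block.
+ // ----------------------------------------------------------------------
+ //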
mov.u32 %r30299, %r30294; + mov.u32 %r30300, %r30294; + mov.u32 %r30301, %r30294; + mov.u32 %r30302, %r30294; + mov.u32 %r30303, %r30294; + mov.u32 %r30304, %r30294; + mov.u32 %r30305, %r30294; + mov.u32 %r30306, %r30294; + mov.u32 %r30307, %r30294; + mov.u32 %r30308, %r13414; + mov.u32 %r30310, %r30294; + mov.u32 %r30311, %r30294; + mov.u32 %r30312, %r30294; + mov.u32 %r30313, %r30294; + mov.u32 %r30314, %r30294; + mov.u32 %r30315, %r30294; + mov.u32 %r30316, %r30294; + mov.u32 %r30317, %r30294; + mov.u32 %r30318, %r30294; + mov.u32 %r30319, %r30294; + mov.u32 %r30320, %r30294; + mov.u32 %r30321, %r30294; + mov.u32 %r30322, %r30294; + mov.u32 %r30323, %r30294; + mov.u32 %r30324, %r30294; + mov.u32 %r30325, %r30294; + mov.u32 %r30326, %r30294; + mov.u32 %r30327, %r30294; + mov.u32 %r30344, %r30294; + +$L__BB2_40: + // begin inline asm + // xor5 + lop3.b32 %r13445, %r30330, %r30328, %r30326, 0x96; + lop3.b32 %r13445, %r13445, %r30324, %r30322, 0x96; + lop3.b32 %r13446, %r30331, %r30329, %r30327, 0x96; + lop3.b32 %r13446, %r13446, %r30325, %r30323, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13457, %r30342, %r30340, %r30320, 0x96; + lop3.b32 %r13457, %r13457, %r30318, %r30316, 0x96; + lop3.b32 %r13458, %r30343, %r30341, %r30321, 0x96; + lop3.b32 %r13458, %r13458, %r30319, %r30317, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13469, %r30338, %r30336, %r30314, 0x96; + lop3.b32 %r13469, %r13469, %r30312, %r30310, 0x96; + lop3.b32 %r13470, %r30339, %r30337, %r30315, 0x96; + lop3.b32 %r13470, %r13470, %r30313, %r30311, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13481, %r30334, %r30308, %r30306, 0x96; + lop3.b32 %r13481, %r13481, %r30304, %r30302, 0x96; + lop3.b32 %r13482, %r30335, %r30309, %r30307, 0x96; + lop3.b32 %r13482, %r13482, %r30305, %r30303, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13493, %r30332, %r30300, %r30298, 0x96; + lop3.b32 %r13493, %r13493, %r30296, %r30294, 0x96; + lop3.b32 %r13494, %r30333, %r30301, %r30299, 0x96; + lop3.b32 %r13494, %r13494, %r30297, %r30295, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13505, %r13458, %r13457, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13509, %r13457, %r13458, %r13414; + // end inline asm + xor.b32 %r13939, %r13505, %r13493; + xor.b32 %r13940, %r13509, %r13494; + xor.b32 %r13772, %r30330, %r13939; + xor.b32 %r13775, %r30331, %r13940; + xor.b32 %r13679, %r30328, %r13939; + xor.b32 %r13678, %r30329, %r13940; + xor.b32 %r13726, %r30326, %r13939; + xor.b32 %r13727, %r30327, %r13940; + xor.b32 %r13631, %r30324, %r13939; + xor.b32 %r13630, %r30325, %r13940; + xor.b32 %r13582, %r30322, %r13939; + xor.b32 %r13583, %r30323, %r13940; + // begin inline asm + shf.l.wrap.b32 %r13513, %r13470, %r13469, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13517, %r13469, %r13470, %r13414; + // end inline asm + xor.b32 %r13941, %r13513, %r13445; + xor.b32 %r13942, %r13517, %r13446; + xor.b32 %r13734, %r30342, %r13941; + xor.b32 %r13735, %r30343, %r13942; + xor.b32 %r13551, %r30340, %r13941; + xor.b32 %r13550, %r30341, %r13942; + xor.b32 %r13710, %r30320, %r13941; + xor.b32 %r13711, %r30321, %r13942; + xor.b32 %r13671, %r30318, %r13941; + xor.b32 %r13670, %r30319, %r13942; + xor.b32 %r13654, %r30316, %r13941; + xor.b32 %r13655, %r30317, %r13942; + // begin inline asm + shf.l.wrap.b32 %r13521, %r13482, %r13481, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r13525, %r13481, %r13482, %r13414; + // end inline asm + xor.b32 %r13943, %r13521, %r13457; + xor.b32 %r13944, %r13525, %r13458; + xor.b32 %r13591, %r30338, %r13943; + xor.b32 %r13590, %r30339, %r13944; + xor.b32 %r13718, %r30336, %r13943; + xor.b32 %r13719, %r30337, %r13944; + xor.b32 %r13599, %r30314, %r13943; + xor.b32 %r13598, %r30315, %r13944; + xor.b32 %r13702, %r30312, %r13943; + xor.b32 %r13703, %r30313, %r13944; + xor.b32 %r13567, %r30310, %r13943; + xor.b32 %r13566, %r30311, %r13944; + // begin inline asm + shf.l.wrap.b32 %r13529, %r13494, %r13493, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13533, %r13493, %r13494, %r13414; + // end inline asm + xor.b32 %r13945, %r13529, %r13469; + xor.b32 %r13946, %r13533, %r13470; + xor.b32 %r13686, %r30334, %r13945; + xor.b32 %r13687, %r30335, %r13946; + xor.b32 %r13663, %r30308, %r13945; + xor.b32 %r13662, %r30309, %r13946; + xor.b32 %r13606, %r30306, %r13945; + xor.b32 %r13607, %r30307, %r13946; + xor.b32 %r13694, %r30304, %r13945; + xor.b32 %r13695, %r30305, %r13946; + xor.b32 %r13623, %r30302, %r13945; + xor.b32 %r13622, %r30303, %r13946; + // begin inline asm + shf.l.wrap.b32 %r13537, %r13446, %r13445, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13541, %r13445, %r13446, %r13414; + // end inline asm + xor.b32 %r13947, %r13537, %r13481; + xor.b32 %r13948, %r13541, %r13482; + xor.b32 %r13638, %r30332, %r13947; + xor.b32 %r13639, %r30333, %r13948; + xor.b32 %r13558, %r30300, %r13947; + xor.b32 %r13559, %r30301, %r13948; + xor.b32 %r13575, %r30298, %r13947; + xor.b32 %r13574, %r30299, %r13948; + xor.b32 %r13614, %r30296, %r13947; + xor.b32 %r13615, %r30297, %r13948; + xor.b32 %r13646, %r30294, %r13947; + xor.b32 %r13647, %r30295, %r13948; + mov.u32 %r13552, 44; + // begin inline asm + shf.l.wrap.b32 %r13545, %r13551, %r13550, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13549, %r13550, %r13551, %r13552; + // end inline asm + mov.u32 %r13560, 20; + // begin inline asm + shf.l.wrap.b32 %r13553, %r13559, %r13558, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13557, %r13558, %r13559, %r13560; + // end inline asm + mov.u32 %r13568, 61; + // begin inline asm + shf.l.wrap.b32 %r13561, %r13567, %r13566, %r13568; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13565, %r13566, %r13567, %r13568; + // end inline asm + mov.u32 %r13576, 39; + // begin inline asm + shf.l.wrap.b32 %r13569, %r13575, %r13574, %r13576; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13573, %r13574, %r13575, %r13576; + // end inline asm + mov.u32 %r13584, 18; + // begin inline asm + shf.l.wrap.b32 %r13577, %r13583, %r13582, %r13584; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13581, %r13582, %r13583, %r13584; + // end inline asm + mov.u32 %r13592, 62; + // begin inline asm + shf.l.wrap.b32 %r13585, %r13591, %r13590, %r13592; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13589, %r13590, %r13591, %r13592; + // end inline asm + mov.u32 %r13600, 43; + // begin inline asm + shf.l.wrap.b32 %r13593, %r13599, %r13598, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13597, %r13598, %r13599, %r13600; + // end inline asm + mov.u32 %r13608, 25; + // begin inline asm + shf.l.wrap.b32 %r13601, %r13607, %r13606, %r13608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13605, %r13606, %r13607, %r13608; + // end inline asm + mov.u32 %r13616, 8; + // begin inline asm + shf.l.wrap.b32 %r13609, %r13615, 
%r13614, %r13616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13613, %r13614, %r13615, %r13616; + // end inline asm + mov.u32 %r13624, 56; + // begin inline asm + shf.l.wrap.b32 %r13617, %r13623, %r13622, %r13624; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13621, %r13622, %r13623, %r13624; + // end inline asm + mov.u32 %r13632, 41; + // begin inline asm + shf.l.wrap.b32 %r13625, %r13631, %r13630, %r13632; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13629, %r13630, %r13631, %r13632; + // end inline asm + mov.u32 %r13640, 27; + // begin inline asm + shf.l.wrap.b32 %r13633, %r13639, %r13638, %r13640; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13637, %r13638, %r13639, %r13640; + // end inline asm + mov.u32 %r13648, 14; + // begin inline asm + shf.l.wrap.b32 %r13641, %r13647, %r13646, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13645, %r13646, %r13647, %r13648; + // end inline asm + mov.u32 %r13656, 2; + // begin inline asm + shf.l.wrap.b32 %r13649, %r13655, %r13654, %r13656; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13653, %r13654, %r13655, %r13656; + // end inline asm + mov.u32 %r13664, 55; + // begin inline asm + shf.l.wrap.b32 %r13657, %r13663, %r13662, %r13664; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13661, %r13662, %r13663, %r13664; + // end inline asm + mov.u32 %r13672, 45; + // begin inline asm + shf.l.wrap.b32 %r13665, %r13671, %r13670, %r13672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13669, %r13670, %r13671, %r13672; + // end inline asm + mov.u32 %r13680, 36; + // begin inline asm + shf.l.wrap.b32 %r13673, %r13679, %r13678, %r13680; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13677, %r13678, %r13679, %r13680; + // end inline asm + mov.u32 %r13688, 28; + // begin inline asm + shf.l.wrap.b32 %r13681, %r13687, %r13686, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13685, %r13686, %r13687, %r13688; + // end inline asm + mov.u32 %r13696, 21; + // begin inline asm + shf.l.wrap.b32 %r13689, %r13695, %r13694, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13693, %r13694, %r13695, %r13696; + // end inline asm + mov.u32 %r13704, 15; + // begin inline asm + shf.l.wrap.b32 %r13697, %r13703, %r13702, %r13704; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13701, %r13702, %r13703, %r13704; + // end inline asm + mov.u32 %r13712, 10; + // begin inline asm + shf.l.wrap.b32 %r13705, %r13711, %r13710, %r13712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13709, %r13710, %r13711, %r13712; + // end inline asm + mov.u32 %r13720, 6; + // begin inline asm + shf.l.wrap.b32 %r13713, %r13719, %r13718, %r13720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13717, %r13718, %r13719, %r13720; + // end inline asm + mov.u32 %r13728, 3; + // begin inline asm + shf.l.wrap.b32 %r13721, %r13727, %r13726, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13725, %r13726, %r13727, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13729, %r13735, %r13734, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13733, %r13734, %r13735, %r13414; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13737, %r13772, %r13545, %r13593, 0xD2; + lop3.b32 %r13738, %r13775, %r13549, %r13597, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30342, %r13545, %r13593, %r13689, 0xD2; + lop3.b32 
%r30343, %r13549, %r13597, %r13693, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30338, %r13593, %r13689, %r13641, 0xD2; + lop3.b32 %r30339, %r13597, %r13693, %r13645, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30334, %r13689, %r13641, %r13772, 0xD2; + lop3.b32 %r30335, %r13693, %r13645, %r13775, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30332, %r13641, %r13772, %r13545, 0xD2; + lop3.b32 %r30333, %r13645, %r13775, %r13549, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30328, %r13681, %r13553, %r13721, 0xD2; + lop3.b32 %r30329, %r13685, %r13557, %r13725, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30340, %r13553, %r13721, %r13665, 0xD2; + lop3.b32 %r30341, %r13557, %r13725, %r13669, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30336, %r13721, %r13665, %r13561, 0xD2; + lop3.b32 %r30337, %r13725, %r13669, %r13565, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30308, %r13665, %r13561, %r13681, 0xD2; + lop3.b32 %r30309, %r13669, %r13565, %r13685, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30308, %r30309}; + // begin inline asm + // chi + lop3.b32 %r30300, %r13561, %r13681, %r13553, 0xD2; + lop3.b32 %r30301, %r13565, %r13685, %r13557, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30300, %r30301}; + // begin inline asm + // chi + lop3.b32 %r30326, %r13729, %r13713, %r13601, 0xD2; + lop3.b32 %r30327, %r13733, %r13717, %r13605, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30326, %r30327}; + // begin inline asm + // chi + lop3.b32 %r30320, %r13713, %r13601, %r13609, 0xD2; + lop3.b32 %r30321, %r13717, %r13605, %r13613, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30320, %r30321}; + // begin inline asm + // chi + lop3.b32 %r30314, %r13601, %r13609, %r13577, 0xD2; + lop3.b32 %r30315, %r13605, %r13613, %r13581, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30314, %r30315}; + // begin inline asm + // chi + lop3.b32 %r30306, %r13609, %r13577, %r13729, 0xD2; + lop3.b32 %r30307, %r13613, %r13581, %r13733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30306, %r30307}; + // begin inline asm + // chi + lop3.b32 %r30298, %r13577, %r13729, %r13713, 0xD2; + lop3.b32 %r30299, %r13581, %r13733, %r13717, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30298, %r30299}; + // begin inline asm + // chi + lop3.b32 %r30324, %r13633, %r13673, %r13705, 0xD2; + lop3.b32 %r30325, %r13637, %r13677, %r13709, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30324, %r30325}; + // begin inline asm + // chi + lop3.b32 %r30318, %r13673, %r13705, %r13697, 0xD2; + lop3.b32 %r30319, %r13677, %r13709, %r13701, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30318, %r30319}; + // begin inline asm + // chi + lop3.b32 %r30312, %r13705, %r13697, %r13617, 0xD2; + lop3.b32 %r30313, %r13709, %r13701, %r13621, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30312, %r30313}; + // begin inline asm + // chi + lop3.b32 %r30304, %r13697, %r13617, %r13633, 0xD2; + lop3.b32 %r30305, %r13701, %r13621, %r13637, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30304, %r30305}; + // begin inline asm + // chi + lop3.b32 %r30296, %r13617, %r13633, %r13673, 0xD2; + lop3.b32 %r30297, %r13621, %r13637, %r13677, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30296, %r30297}; + // begin inline asm + // chi + lop3.b32 %r30322, %r13585, %r13657, 
%r13569, 0xD2; + lop3.b32 %r30323, %r13589, %r13661, %r13573, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30322, %r30323}; + // begin inline asm + // chi + lop3.b32 %r30316, %r13657, %r13569, %r13625, 0xD2; + lop3.b32 %r30317, %r13661, %r13573, %r13629, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30316, %r30317}; + // begin inline asm + // chi + lop3.b32 %r30310, %r13569, %r13625, %r13649, 0xD2; + lop3.b32 %r30311, %r13573, %r13629, %r13653, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30310, %r30311}; + // begin inline asm + // chi + lop3.b32 %r30302, %r13625, %r13649, %r13585, 0xD2; + lop3.b32 %r30303, %r13629, %r13653, %r13589, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30302, %r30303}; + // begin inline asm + // chi + lop3.b32 %r30294, %r13649, %r13585, %r13657, 0xD2; + lop3.b32 %r30295, %r13653, %r13589, %r13661, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30294, %r30295}; + mul.wide.s32 %rd687, %r30344, 8; + mov.u64 %rd688, keccak_round_constants; + cvta.const.u64 %rd689, %rd688; + add.s64 %rd686, %rd689, %rd687; + // begin inline asm + ld.global.nc.v2.u32 {%r13937,%r13938}, [%rd686]; + // end inline asm + xor.b32 %r30330, %r13737, %r13937; + xor.b32 %r30331, %r13738, %r13938; + add.s32 %r30344, %r30344, 1; + setp.lt.u32 %p26, %r30344, 23; + @%p26 bra $L__BB2_40; + + add.u64 %rd147, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30342, %r30343}; + st.local.v2.u32 [%rd2+72], {%r30340, %r30341}; + st.local.v2.u32 [%rd2+40], {%r30338, %r30339}; + st.local.v2.u32 [%rd2+80], {%r30336, %r30337}; + st.local.v2.u32 [%rd2+48], {%r30334, %r30335}; + st.local.v2.u32 [%rd2+56], {%r30332, %r30333}; + st.local.v2.u32 [%rd2+24], {%r30330, %r30331}; + // begin inline asm + // xor5 + lop3.b32 %r13949, %r30330, %r30328, %r30326, 0x96; + lop3.b32 %r13949, %r13949, %r30324, %r30322, 0x96; + lop3.b32 %r13950, %r30331, %r30329, %r30327, 0x96; + lop3.b32 %r13950, %r13950, %r30325, %r30323, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13961, %r30342, %r30340, %r30320, 0x96; + lop3.b32 %r13961, %r13961, %r30318, %r30316, 0x96; + lop3.b32 %r13962, %r30343, %r30341, %r30321, 0x96; + lop3.b32 %r13962, %r13962, %r30319, %r30317, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13973, %r30338, %r30336, %r30314, 0x96; + lop3.b32 %r13973, %r13973, %r30312, %r30310, 0x96; + lop3.b32 %r13974, %r30339, %r30337, %r30315, 0x96; + lop3.b32 %r13974, %r13974, %r30313, %r30311, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13985, %r30334, %r30308, %r30306, 0x96; + lop3.b32 %r13985, %r13985, %r30304, %r30302, 0x96; + lop3.b32 %r13986, %r30335, %r30309, %r30307, 0x96; + lop3.b32 %r13986, %r13986, %r30305, %r30303, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13997, %r30332, %r30300, %r30298, 0x96; + lop3.b32 %r13997, %r13997, %r30296, %r30294, 0x96; + lop3.b32 %r13998, %r30333, %r30301, %r30299, 0x96; + lop3.b32 %r13998, %r13998, %r30297, %r30295, 0x96; + // end inline asm + mov.u32 %r14201, 1; + // begin inline asm + shf.l.wrap.b32 %r14009, %r13962, %r13961, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14013, %r13961, %r13962, %r14201; + // end inline asm + xor.b32 %r14228, %r14009, %r13997; + xor.b32 %r14229, %r14013, %r13998; + xor.b32 %r14156, %r30330, %r14228; + xor.b32 %r14159, %r30331, %r14229; + xor.b32 %r14119, %r30327, %r14229; + xor.b32 %r14118, %r30326, %r14228; + st.local.v2.u32 [%rd2+104], {%r14118, %r14119}; + // begin 
inline asm + shf.l.wrap.b32 %r14017, %r13974, %r13973, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14021, %r13973, %r13974, %r14201; + // end inline asm + xor.b32 %r14230, %r14017, %r13949; + xor.b32 %r14231, %r14021, %r13950; + xor.b32 %r14055, %r30340, %r14230; + xor.b32 %r14054, %r30341, %r14231; + xor.b32 %r14094, %r30319, %r14231; + xor.b32 %r14095, %r30318, %r14230; + st.local.v2.u32 [%rd2+152], {%r14095, %r14094}; + // begin inline asm + shf.l.wrap.b32 %r14025, %r13986, %r13985, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14029, %r13985, %r13986, %r14201; + // end inline asm + xor.b32 %r14232, %r14025, %r13961; + xor.b32 %r14233, %r14029, %r13962; + xor.b32 %r14078, %r30315, %r14233; + xor.b32 %r14079, %r30314, %r14232; + st.local.v2.u32 [%rd2+120], {%r14079, %r14078}; + xor.b32 %r14070, %r30311, %r14233; + xor.b32 %r14071, %r30310, %r14232; + st.local.v2.u32 [%rd2+200], {%r14071, %r14070}; + // begin inline asm + shf.l.wrap.b32 %r14033, %r13998, %r13997, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14037, %r13997, %r13998, %r14201; + // end inline asm + xor.b32 %r14234, %r14033, %r13973; + xor.b32 %r14235, %r14037, %r13974; + xor.b32 %r14102, %r30334, %r14234; + xor.b32 %r14103, %r30335, %r14235; + xor.b32 %r14111, %r30305, %r14235; + xor.b32 %r14110, %r30304, %r14234; + st.local.v2.u32 [%rd2+168], {%r14110, %r14111}; + // begin inline asm + shf.l.wrap.b32 %r14041, %r13950, %r13949, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14045, %r13949, %r13950, %r14201; + // end inline asm + xor.b32 %r14236, %r14041, %r13985; + xor.b32 %r14237, %r14045, %r13986; + xor.b32 %r14062, %r30300, %r14236; + xor.b32 %r14063, %r30301, %r14237; + xor.b32 %r14087, %r30295, %r14237; + xor.b32 %r14086, %r30294, %r14236; + st.local.v2.u32 [%rd2+216], {%r14086, %r14087}; + // begin inline asm + shf.l.wrap.b32 %r14049, %r14055, %r14054, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14053, %r14054, %r14055, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14057, %r14063, %r14062, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14061, %r14062, %r14063, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14069, %r14070, %r14071, %r13568; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14065, %r14071, %r14070, %r13568; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r14065, %r14069}; + // begin inline asm + shf.l.wrap.b32 %r14073, %r14079, %r14078, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14077, %r14078, %r14079, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14081, %r14087, %r14086, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14085, %r14086, %r14087, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14093, %r14094, %r14095, %r13672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14089, %r14095, %r14094, %r13672; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r14089, %r14093}; + // begin inline asm + shf.l.wrap.b32 %r14097, %r14103, %r14102, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14101, %r14102, %r14103, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14105, %r14111, %r14110, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14109, %r14110, %r14111, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14113, 
%r14119, %r14118, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14117, %r14118, %r14119, %r13728; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14121, %r14156, %r14049, %r14073, 0xD2; + lop3.b32 %r14122, %r14159, %r14053, %r14077, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30477, %r14049, %r14073, %r14105, 0xD2; + lop3.b32 %r30478, %r14053, %r14077, %r14109, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30477, %r30478}; + // begin inline asm + // chi + lop3.b32 %r30473, %r14073, %r14105, %r14081, 0xD2; + lop3.b32 %r30474, %r14077, %r14109, %r14085, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30473, %r30474}; + // begin inline asm + // chi + lop3.b32 %r30469, %r14105, %r14081, %r14156, 0xD2; + lop3.b32 %r30470, %r14109, %r14085, %r14159, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30469, %r30470}; + // begin inline asm + // chi + lop3.b32 %r30467, %r14081, %r14156, %r14049, 0xD2; + lop3.b32 %r30468, %r14085, %r14159, %r14053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30467, %r30468}; + // begin inline asm + // chi + lop3.b32 %r30463, %r14097, %r14057, %r14113, 0xD2; + lop3.b32 %r30464, %r14101, %r14061, %r14117, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30463, %r30464}; + // begin inline asm + // chi + lop3.b32 %r30475, %r14057, %r14113, %r14089, 0xD2; + lop3.b32 %r30476, %r14061, %r14117, %r14093, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30475, %r30476}; + // begin inline asm + // chi + lop3.b32 %r30471, %r14113, %r14089, %r14065, 0xD2; + lop3.b32 %r30472, %r14117, %r14093, %r14069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30471, %r30472}; + add.s64 %rd690, %rd689, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r14185,%r14186}, [%rd690]; + // end inline asm + xor.b32 %r30465, %r14121, %r14185; + xor.b32 %r30466, %r14122, %r14186; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + st.local.u64 [%rd147], %rd361; + mov.u64 %rd694, 1179641; + st.local.u64 [%rd147+8], %rd694; + add.s32 %r1891, %r1695, 1; + st.local.u32 [%rd147+16], %r1891; + ld.global.u64 %rd695, [%rd127]; + ld.global.u64 %rd696, [%rd127+8]; + ld.global.u64 %rd697, [%rd127+16]; + ld.global.u64 %rd698, [%rd127+24]; + ld.global.u64 %rd699, [%rd127+32]; + ld.global.u64 %rd700, [%rd127+40]; + ld.global.u64 %rd701, [%rd127+48]; + ld.global.u64 %rd702, [%rd127+56]; + st.local.u64 [%rd147+32], %rd696; + st.local.u64 [%rd147+40], %rd697; + st.local.u64 [%rd147+48], %rd698; + st.local.u64 [%rd147+56], %rd699; + st.local.u64 [%rd147+64], %rd700; + st.local.u64 [%rd147+72], %rd701; + st.local.u64 [%rd147+80], %rd702; + cvt.u32.u64 %r14238, %rd695; + xor.b32 %r14239, %r1891, %r14238; + st.local.u64 [%rd147+24], %rd695; + st.local.u32 [%rd147+24], %r14239; + mov.u32 %r30345, 0; + st.local.v2.u32 [%rd147+96], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+104], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+112], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+120], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+128], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+136], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+144], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+152], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+160], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+168], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+176], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+184], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+192], {%r30345, %r30345}; + st.local.v2.u32 
[%rd147+200], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+208], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+216], {%r30345, %r30345}; + mov.u32 %r30360, -2147483648; + st.local.v2.u32 [%rd147+88], {%r14201, %r30360}; + ld.local.v2.u32 {%r30381, %r30382}, [%rd147+24]; + mov.b64 {%r30379, %r30380}, %rd700; + shr.u64 %rd703, %rd696, 32; + cvt.u32.u64 %r30393, %rd696; + cvt.u32.u64 %r30394, %rd703; + shr.u64 %rd704, %rd701, 32; + cvt.u32.u64 %r30391, %rd701; + cvt.u32.u64 %r30392, %rd704; + shr.u64 %rd705, %rd697, 32; + cvt.u32.u64 %r30389, %rd697; + cvt.u32.u64 %r30390, %rd705; + shr.u64 %rd706, %rd702, 32; + cvt.u32.u64 %r30387, %rd702; + cvt.u32.u64 %r30388, %rd706; + shr.u64 %rd707, %rd698, 32; + cvt.u32.u64 %r30385, %rd698; + cvt.u32.u64 %r30386, %rd707; + shr.u64 %rd708, %rd699, 32; + cvt.u32.u64 %r30383, %rd699; + cvt.u32.u64 %r30384, %rd708; + mov.u32 %r30346, %r30345; + mov.u32 %r30347, %r30345; + mov.u32 %r30348, %r30345; + mov.u32 %r30349, %r30345; + mov.u32 %r30350, %r30345; + mov.u32 %r30351, %r30345; + mov.u32 %r30352, %r30345; + mov.u32 %r30353, %r30345; + mov.u32 %r30354, %r30345; + mov.u32 %r30355, %r30345; + mov.u32 %r30356, %r30345; + mov.u32 %r30357, %r30345; + mov.u32 %r30358, %r30345; + mov.u32 %r30359, %r14201; + mov.u32 %r30361, %r30345; + mov.u32 %r30362, %r30345; + mov.u32 %r30363, %r30345; + mov.u32 %r30364, %r30345; + mov.u32 %r30365, %r30345; + mov.u32 %r30366, %r30345; + mov.u32 %r30367, %r30345; + mov.u32 %r30368, %r30345; + mov.u32 %r30369, %r30345; + mov.u32 %r30370, %r30345; + mov.u32 %r30371, %r30345; + mov.u32 %r30372, %r30345; + mov.u32 %r30373, %r30345; + mov.u32 %r30374, %r30345; + mov.u32 %r30375, %r30345; + mov.u32 %r30376, %r30345; + mov.u32 %r30377, %r30345; + mov.u32 %r30378, %r30345; + mov.u32 %r30395, %r30345; + +$L__BB2_42: + // begin inline asm + // xor5 + lop3.b32 %r14242, %r30381, %r30379, %r30377, 0x96; + lop3.b32 %r14242, %r14242, %r30375, %r30373, 0x96; + lop3.b32 %r14243, %r30382, %r30380, %r30378, 0x96; + lop3.b32 %r14243, %r14243, %r30376, %r30374, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14254, %r30393, %r30391, %r30371, 0x96; + lop3.b32 %r14254, %r14254, %r30369, %r30367, 0x96; + lop3.b32 %r14255, %r30394, %r30392, %r30372, 0x96; + lop3.b32 %r14255, %r14255, %r30370, %r30368, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14266, %r30389, %r30387, %r30365, 0x96; + lop3.b32 %r14266, %r14266, %r30363, %r30361, 0x96; + lop3.b32 %r14267, %r30390, %r30388, %r30366, 0x96; + lop3.b32 %r14267, %r14267, %r30364, %r30362, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14278, %r30385, %r30359, %r30357, 0x96; + lop3.b32 %r14278, %r14278, %r30355, %r30353, 0x96; + lop3.b32 %r14279, %r30386, %r30360, %r30358, 0x96; + lop3.b32 %r14279, %r14279, %r30356, %r30354, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14290, %r30383, %r30351, %r30349, 0x96; + lop3.b32 %r14290, %r14290, %r30347, %r30345, 0x96; + lop3.b32 %r14291, %r30384, %r30352, %r30350, 0x96; + lop3.b32 %r14291, %r14291, %r30348, %r30346, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14302, %r14255, %r14254, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14306, %r14254, %r14255, %r14201; + // end inline asm + xor.b32 %r14736, %r14302, %r14290; + xor.b32 %r14737, %r14306, %r14291; + xor.b32 %r14569, %r30381, %r14736; + xor.b32 %r14572, %r30382, %r14737; + xor.b32 %r14476, %r30379, %r14736; + xor.b32 %r14475, %r30380, %r14737; + xor.b32 %r14523, 
%r30377, %r14736; + xor.b32 %r14524, %r30378, %r14737; + xor.b32 %r14428, %r30375, %r14736; + xor.b32 %r14427, %r30376, %r14737; + xor.b32 %r14379, %r30373, %r14736; + xor.b32 %r14380, %r30374, %r14737; + // begin inline asm + shf.l.wrap.b32 %r14310, %r14267, %r14266, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14314, %r14266, %r14267, %r14201; + // end inline asm + xor.b32 %r14738, %r14310, %r14242; + xor.b32 %r14739, %r14314, %r14243; + xor.b32 %r14531, %r30393, %r14738; + xor.b32 %r14532, %r30394, %r14739; + xor.b32 %r14348, %r30391, %r14738; + xor.b32 %r14347, %r30392, %r14739; + xor.b32 %r14507, %r30371, %r14738; + xor.b32 %r14508, %r30372, %r14739; + xor.b32 %r14468, %r30369, %r14738; + xor.b32 %r14467, %r30370, %r14739; + xor.b32 %r14451, %r30367, %r14738; + xor.b32 %r14452, %r30368, %r14739; + // begin inline asm + shf.l.wrap.b32 %r14318, %r14279, %r14278, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14322, %r14278, %r14279, %r14201; + // end inline asm + xor.b32 %r14740, %r14318, %r14254; + xor.b32 %r14741, %r14322, %r14255; + xor.b32 %r14388, %r30389, %r14740; + xor.b32 %r14387, %r30390, %r14741; + xor.b32 %r14515, %r30387, %r14740; + xor.b32 %r14516, %r30388, %r14741; + xor.b32 %r14396, %r30365, %r14740; + xor.b32 %r14395, %r30366, %r14741; + xor.b32 %r14499, %r30363, %r14740; + xor.b32 %r14500, %r30364, %r14741; + xor.b32 %r14364, %r30361, %r14740; + xor.b32 %r14363, %r30362, %r14741; + // begin inline asm + shf.l.wrap.b32 %r14326, %r14291, %r14290, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14330, %r14290, %r14291, %r14201; + // end inline asm + xor.b32 %r14742, %r14326, %r14266; + xor.b32 %r14743, %r14330, %r14267; + xor.b32 %r14483, %r30385, %r14742; + xor.b32 %r14484, %r30386, %r14743; + xor.b32 %r14460, %r30359, %r14742; + xor.b32 %r14459, %r30360, %r14743; + xor.b32 %r14403, %r30357, %r14742; + xor.b32 %r14404, %r30358, %r14743; + xor.b32 %r14491, %r30355, %r14742; + xor.b32 %r14492, %r30356, %r14743; + xor.b32 %r14420, %r30353, %r14742; + xor.b32 %r14419, %r30354, %r14743; + // begin inline asm + shf.l.wrap.b32 %r14334, %r14243, %r14242, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14338, %r14242, %r14243, %r14201; + // end inline asm + xor.b32 %r14744, %r14334, %r14278; + xor.b32 %r14745, %r14338, %r14279; + xor.b32 %r14435, %r30383, %r14744; + xor.b32 %r14436, %r30384, %r14745; + xor.b32 %r14355, %r30351, %r14744; + xor.b32 %r14356, %r30352, %r14745; + xor.b32 %r14372, %r30349, %r14744; + xor.b32 %r14371, %r30350, %r14745; + xor.b32 %r14411, %r30347, %r14744; + xor.b32 %r14412, %r30348, %r14745; + xor.b32 %r14443, %r30345, %r14744; + xor.b32 %r14444, %r30346, %r14745; + mov.u32 %r14349, 44; + // begin inline asm + shf.l.wrap.b32 %r14342, %r14348, %r14347, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14346, %r14347, %r14348, %r14349; + // end inline asm + mov.u32 %r14357, 20; + // begin inline asm + shf.l.wrap.b32 %r14350, %r14356, %r14355, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14354, %r14355, %r14356, %r14357; + // end inline asm + mov.u32 %r14365, 61; + // begin inline asm + shf.l.wrap.b32 %r14358, %r14364, %r14363, %r14365; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14362, %r14363, %r14364, %r14365; + // end inline asm + mov.u32 %r14373, 39; + // begin inline asm + shf.l.wrap.b32 %r14366, %r14372, %r14371, %r14373; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14370, %r14371, 
%r14372, %r14373; + // end inline asm + mov.u32 %r14381, 18; + // begin inline asm + shf.l.wrap.b32 %r14374, %r14380, %r14379, %r14381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14378, %r14379, %r14380, %r14381; + // end inline asm + mov.u32 %r14389, 62; + // begin inline asm + shf.l.wrap.b32 %r14382, %r14388, %r14387, %r14389; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14386, %r14387, %r14388, %r14389; + // end inline asm + mov.u32 %r14397, 43; + // begin inline asm + shf.l.wrap.b32 %r14390, %r14396, %r14395, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14394, %r14395, %r14396, %r14397; + // end inline asm + mov.u32 %r14405, 25; + // begin inline asm + shf.l.wrap.b32 %r14398, %r14404, %r14403, %r14405; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14402, %r14403, %r14404, %r14405; + // end inline asm + mov.u32 %r14413, 8; + // begin inline asm + shf.l.wrap.b32 %r14406, %r14412, %r14411, %r14413; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14410, %r14411, %r14412, %r14413; + // end inline asm + mov.u32 %r14421, 56; + // begin inline asm + shf.l.wrap.b32 %r14414, %r14420, %r14419, %r14421; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14418, %r14419, %r14420, %r14421; + // end inline asm + mov.u32 %r14429, 41; + // begin inline asm + shf.l.wrap.b32 %r14422, %r14428, %r14427, %r14429; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14426, %r14427, %r14428, %r14429; + // end inline asm + mov.u32 %r14437, 27; + // begin inline asm + shf.l.wrap.b32 %r14430, %r14436, %r14435, %r14437; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14434, %r14435, %r14436, %r14437; + // end inline asm + mov.u32 %r14445, 14; + // begin inline asm + shf.l.wrap.b32 %r14438, %r14444, %r14443, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14442, %r14443, %r14444, %r14445; + // end inline asm + mov.u32 %r14453, 2; + // begin inline asm + shf.l.wrap.b32 %r14446, %r14452, %r14451, %r14453; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14450, %r14451, %r14452, %r14453; + // end inline asm + mov.u32 %r14461, 55; + // begin inline asm + shf.l.wrap.b32 %r14454, %r14460, %r14459, %r14461; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14458, %r14459, %r14460, %r14461; + // end inline asm + mov.u32 %r14469, 45; + // begin inline asm + shf.l.wrap.b32 %r14462, %r14468, %r14467, %r14469; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14466, %r14467, %r14468, %r14469; + // end inline asm + mov.u32 %r14477, 36; + // begin inline asm + shf.l.wrap.b32 %r14470, %r14476, %r14475, %r14477; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14474, %r14475, %r14476, %r14477; + // end inline asm + mov.u32 %r14485, 28; + // begin inline asm + shf.l.wrap.b32 %r14478, %r14484, %r14483, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14482, %r14483, %r14484, %r14485; + // end inline asm + mov.u32 %r14493, 21; + // begin inline asm + shf.l.wrap.b32 %r14486, %r14492, %r14491, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14490, %r14491, %r14492, %r14493; + // end inline asm + mov.u32 %r14501, 15; + // begin inline asm + shf.l.wrap.b32 %r14494, %r14500, %r14499, %r14501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14498, %r14499, %r14500, %r14501; + // end inline asm + mov.u32 %r14509, 10; + // begin inline asm + shf.l.wrap.b32 %r14502, %r14508, %r14507, %r14509; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r14506, %r14507, %r14508, %r14509; + // end inline asm + mov.u32 %r14517, 6; + // begin inline asm + shf.l.wrap.b32 %r14510, %r14516, %r14515, %r14517; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14514, %r14515, %r14516, %r14517; + // end inline asm + mov.u32 %r14525, 3; + // begin inline asm + shf.l.wrap.b32 %r14518, %r14524, %r14523, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14522, %r14523, %r14524, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14526, %r14532, %r14531, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14530, %r14531, %r14532, %r14201; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14534, %r14569, %r14342, %r14390, 0xD2; + lop3.b32 %r14535, %r14572, %r14346, %r14394, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30393, %r14342, %r14390, %r14486, 0xD2; + lop3.b32 %r30394, %r14346, %r14394, %r14490, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30389, %r14390, %r14486, %r14438, 0xD2; + lop3.b32 %r30390, %r14394, %r14490, %r14442, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30385, %r14486, %r14438, %r14569, 0xD2; + lop3.b32 %r30386, %r14490, %r14442, %r14572, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30383, %r14438, %r14569, %r14342, 0xD2; + lop3.b32 %r30384, %r14442, %r14572, %r14346, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30379, %r14478, %r14350, %r14518, 0xD2; + lop3.b32 %r30380, %r14482, %r14354, %r14522, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30391, %r14350, %r14518, %r14462, 0xD2; + lop3.b32 %r30392, %r14354, %r14522, %r14466, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30387, %r14518, %r14462, %r14358, 0xD2; + lop3.b32 %r30388, %r14522, %r14466, %r14362, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30359, %r14462, %r14358, %r14478, 0xD2; + lop3.b32 %r30360, %r14466, %r14362, %r14482, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r30359, %r30360}; + // begin inline asm + // chi + lop3.b32 %r30351, %r14358, %r14478, %r14350, 0xD2; + lop3.b32 %r30352, %r14362, %r14482, %r14354, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r30351, %r30352}; + // begin inline asm + // chi + lop3.b32 %r30377, %r14526, %r14510, %r14398, 0xD2; + lop3.b32 %r30378, %r14530, %r14514, %r14402, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+104], {%r30377, %r30378}; + // begin inline asm + // chi + lop3.b32 %r30371, %r14510, %r14398, %r14406, 0xD2; + lop3.b32 %r30372, %r14514, %r14402, %r14410, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+112], {%r30371, %r30372}; + // begin inline asm + // chi + lop3.b32 %r30365, %r14398, %r14406, %r14374, 0xD2; + lop3.b32 %r30366, %r14402, %r14410, %r14378, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+120], {%r30365, %r30366}; + // begin inline asm + // chi + lop3.b32 %r30357, %r14406, %r14374, %r14526, 0xD2; + lop3.b32 %r30358, %r14410, %r14378, %r14530, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+128], {%r30357, %r30358}; + // begin inline asm + // chi + lop3.b32 %r30349, %r14374, %r14526, %r14510, 0xD2; + lop3.b32 %r30350, %r14378, %r14530, %r14514, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+136], {%r30349, %r30350}; + // begin inline asm + // chi + lop3.b32 %r30375, %r14430, %r14470, %r14502, 0xD2; + lop3.b32 %r30376, %r14434, 
%r14474, %r14506, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+144], {%r30375, %r30376}; + // begin inline asm + // chi + lop3.b32 %r30369, %r14470, %r14502, %r14494, 0xD2; + lop3.b32 %r30370, %r14474, %r14506, %r14498, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+152], {%r30369, %r30370}; + // begin inline asm + // chi + lop3.b32 %r30363, %r14502, %r14494, %r14414, 0xD2; + lop3.b32 %r30364, %r14506, %r14498, %r14418, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+160], {%r30363, %r30364}; + // begin inline asm + // chi + lop3.b32 %r30355, %r14494, %r14414, %r14430, 0xD2; + lop3.b32 %r30356, %r14498, %r14418, %r14434, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+168], {%r30355, %r30356}; + // begin inline asm + // chi + lop3.b32 %r30347, %r14414, %r14430, %r14470, 0xD2; + lop3.b32 %r30348, %r14418, %r14434, %r14474, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+176], {%r30347, %r30348}; + // begin inline asm + // chi + lop3.b32 %r30373, %r14382, %r14454, %r14366, 0xD2; + lop3.b32 %r30374, %r14386, %r14458, %r14370, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+184], {%r30373, %r30374}; + // begin inline asm + // chi + lop3.b32 %r30367, %r14454, %r14366, %r14422, 0xD2; + lop3.b32 %r30368, %r14458, %r14370, %r14426, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+192], {%r30367, %r30368}; + // begin inline asm + // chi + lop3.b32 %r30361, %r14366, %r14422, %r14446, 0xD2; + lop3.b32 %r30362, %r14370, %r14426, %r14450, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+200], {%r30361, %r30362}; + // begin inline asm + // chi + lop3.b32 %r30353, %r14422, %r14446, %r14382, 0xD2; + lop3.b32 %r30354, %r14426, %r14450, %r14386, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+208], {%r30353, %r30354}; + // begin inline asm + // chi + lop3.b32 %r30345, %r14446, %r14382, %r14454, 0xD2; + lop3.b32 %r30346, %r14450, %r14386, %r14458, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+216], {%r30345, %r30346}; + mul.wide.s32 %rd710, %r30395, 8; + add.s64 %rd709, %rd689, %rd710; + // begin inline asm + ld.global.nc.v2.u32 {%r14734,%r14735}, [%rd709]; + // end inline asm + xor.b32 %r30381, %r14534, %r14734; + xor.b32 %r30382, %r14535, %r14735; + add.s32 %r30395, %r30395, 1; + setp.lt.u32 %p27, %r30395, 23; + @%p27 bra $L__BB2_42; + + mov.u32 %r30428, 0; + mov.u32 %r14845, 1; + st.local.v2.u32 [%rd147+32], {%r30393, %r30394}; + st.local.v2.u32 [%rd147+72], {%r30391, %r30392}; + st.local.v2.u32 [%rd147+40], {%r30389, %r30390}; + st.local.v2.u32 [%rd147+80], {%r30387, %r30388}; + st.local.v2.u32 [%rd147+48], {%r30385, %r30386}; + st.local.v2.u32 [%rd147+56], {%r30383, %r30384}; + st.local.v2.u32 [%rd147+24], {%r30381, %r30382}; + // begin inline asm + // xor5 + lop3.b32 %r14746, %r30381, %r30379, %r30377, 0x96; + lop3.b32 %r14746, %r14746, %r30375, %r30373, 0x96; + lop3.b32 %r14747, %r30382, %r30380, %r30378, 0x96; + lop3.b32 %r14747, %r14747, %r30376, %r30374, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14758, %r30393, %r30391, %r30371, 0x96; + lop3.b32 %r14758, %r14758, %r30369, %r30367, 0x96; + lop3.b32 %r14759, %r30394, %r30392, %r30372, 0x96; + lop3.b32 %r14759, %r14759, %r30370, %r30368, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14770, %r30389, %r30387, %r30365, 0x96; + lop3.b32 %r14770, %r14770, %r30363, %r30361, 0x96; + lop3.b32 %r14771, %r30390, %r30388, %r30366, 0x96; + lop3.b32 %r14771, %r14771, %r30364, %r30362, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14782, 
%r30385, %r30359, %r30357, 0x96; + lop3.b32 %r14782, %r14782, %r30355, %r30353, 0x96; + lop3.b32 %r14783, %r30386, %r30360, %r30358, 0x96; + lop3.b32 %r14783, %r14783, %r30356, %r30354, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14794, %r30383, %r30351, %r30349, 0x96; + lop3.b32 %r14794, %r14794, %r30347, %r30345, 0x96; + lop3.b32 %r14795, %r30384, %r30352, %r30350, 0x96; + lop3.b32 %r14795, %r14795, %r30348, %r30346, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14806, %r14759, %r14758, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14810, %r14758, %r14759, %r14845; + // end inline asm + xor.b32 %r14985, %r14806, %r14794; + xor.b32 %r14986, %r14810, %r14795; + xor.b32 %r14953, %r30381, %r14985; + xor.b32 %r14956, %r30382, %r14986; + xor.b32 %r14916, %r30378, %r14986; + xor.b32 %r14915, %r30377, %r14985; + st.local.v2.u32 [%rd147+104], {%r14915, %r14916}; + // begin inline asm + shf.l.wrap.b32 %r14814, %r14771, %r14770, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14818, %r14770, %r14771, %r14845; + // end inline asm + xor.b32 %r14987, %r14814, %r14746; + xor.b32 %r14988, %r14818, %r14747; + xor.b32 %r14852, %r30391, %r14987; + xor.b32 %r14851, %r30392, %r14988; + xor.b32 %r14891, %r30370, %r14988; + xor.b32 %r14892, %r30369, %r14987; + st.local.v2.u32 [%rd147+152], {%r14892, %r14891}; + // begin inline asm + shf.l.wrap.b32 %r14822, %r14783, %r14782, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14826, %r14782, %r14783, %r14845; + // end inline asm + xor.b32 %r14989, %r14822, %r14758; + xor.b32 %r14990, %r14826, %r14759; + xor.b32 %r14875, %r30366, %r14990; + xor.b32 %r14876, %r30365, %r14989; + st.local.v2.u32 [%rd147+120], {%r14876, %r14875}; + xor.b32 %r14867, %r30362, %r14990; + xor.b32 %r14868, %r30361, %r14989; + st.local.v2.u32 [%rd147+200], {%r14868, %r14867}; + // begin inline asm + shf.l.wrap.b32 %r14830, %r14795, %r14794, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14834, %r14794, %r14795, %r14845; + // end inline asm + xor.b32 %r14991, %r14830, %r14770; + xor.b32 %r14992, %r14834, %r14771; + xor.b32 %r14899, %r30385, %r14991; + xor.b32 %r14900, %r30386, %r14992; + xor.b32 %r14908, %r30356, %r14992; + xor.b32 %r14907, %r30355, %r14991; + st.local.v2.u32 [%rd147+168], {%r14907, %r14908}; + // begin inline asm + shf.l.wrap.b32 %r14838, %r14747, %r14746, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14842, %r14746, %r14747, %r14845; + // end inline asm + xor.b32 %r14993, %r14838, %r14782; + xor.b32 %r14994, %r14842, %r14783; + xor.b32 %r14859, %r30351, %r14993; + xor.b32 %r14860, %r30352, %r14994; + xor.b32 %r14884, %r30346, %r14994; + xor.b32 %r14883, %r30345, %r14993; + st.local.v2.u32 [%rd147+216], {%r14883, %r14884}; + // begin inline asm + shf.l.wrap.b32 %r14846, %r14852, %r14851, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14850, %r14851, %r14852, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14854, %r14860, %r14859, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14858, %r14859, %r14860, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14866, %r14867, %r14868, %r14365; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14862, %r14868, %r14867, %r14365; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r14862, %r14866}; + // begin inline asm + shf.l.wrap.b32 %r14870, %r14876, %r14875, %r14397; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r14874, %r14875, %r14876, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14878, %r14884, %r14883, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14882, %r14883, %r14884, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14890, %r14891, %r14892, %r14469; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14886, %r14892, %r14891, %r14469; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r14886, %r14890}; + // begin inline asm + shf.l.wrap.b32 %r14894, %r14900, %r14899, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14898, %r14899, %r14900, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14902, %r14908, %r14907, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14906, %r14907, %r14908, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14910, %r14916, %r14915, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14914, %r14915, %r14916, %r14525; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14918, %r14953, %r14846, %r14870, 0xD2; + lop3.b32 %r14919, %r14956, %r14850, %r14874, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30528, %r14846, %r14870, %r14902, 0xD2; + lop3.b32 %r30529, %r14850, %r14874, %r14906, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+32], {%r30528, %r30529}; + // begin inline asm + // chi + lop3.b32 %r30524, %r14870, %r14902, %r14878, 0xD2; + lop3.b32 %r30525, %r14874, %r14906, %r14882, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+40], {%r30524, %r30525}; + // begin inline asm + // chi + lop3.b32 %r30520, %r14902, %r14878, %r14953, 0xD2; + lop3.b32 %r30521, %r14906, %r14882, %r14956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+48], {%r30520, %r30521}; + // begin inline asm + // chi + lop3.b32 %r30518, %r14878, %r14953, %r14846, 0xD2; + lop3.b32 %r30519, %r14882, %r14956, %r14850, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+56], {%r30518, %r30519}; + // begin inline asm + // chi + lop3.b32 %r30514, %r14894, %r14854, %r14910, 0xD2; + lop3.b32 %r30515, %r14898, %r14858, %r14914, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+64], {%r30514, %r30515}; + // begin inline asm + // chi + lop3.b32 %r30526, %r14854, %r14910, %r14886, 0xD2; + lop3.b32 %r30527, %r14858, %r14914, %r14890, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+72], {%r30526, %r30527}; + // begin inline asm + // chi + lop3.b32 %r30522, %r14910, %r14886, %r14862, 0xD2; + lop3.b32 %r30523, %r14914, %r14890, %r14866, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+80], {%r30522, %r30523}; + // begin inline asm + ld.global.nc.v2.u32 {%r14982,%r14983}, [%rd690]; + // end inline asm + xor.b32 %r30516, %r14918, %r14982; + xor.b32 %r30517, %r14919, %r14983; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + add.s64 %rd149, %rd2, 24; + add.s64 %rd150, %rd147, 24; + +$L__BB2_44: + cvta.to.global.u64 %rd1268, %rd361; + shl.b32 %r14995, %r30428, 2; + cvt.u64.u32 %rd720, %r14995; + and.b64 %rd721, %rd720, 60; + add.s64 %rd722, %rd149, %rd721; + xor.b32 %r14996, %r1695, %r30428; + mul.lo.s32 %r14997, %r14996, 16777619; + ld.local.u32 %r14998, [%rd722]; + xor.b32 %r14999, %r14997, %r14998; + mul.wide.u32 %rd723, %r14999, -954391867; + shr.u64 %rd724, %rd723, 32; + cvt.u32.u64 %r15000, %rd724; + sub.s32 %r15001, %r14999, %r15000; + shr.u32 %r15002, %r15001, 1; + add.s32 %r15003, %r15002, %r15000; + shr.u32 %r15004, 
%r15003, 20; + mul.lo.s32 %r15005, %r15004, 1179641; + sub.s32 %r15006, %r14999, %r15005; + mul.wide.u32 %rd725, %r15006, 64; + add.s64 %rd726, %rd1268, %rd725; + mul.lo.s32 %r15007, %r30465, 16777619; + ld.global.u32 %r15008, [%rd726]; + xor.b32 %r30465, %r15007, %r15008; + mul.lo.s32 %r15009, %r30466, 16777619; + ld.global.u32 %r15010, [%rd726+4]; + xor.b32 %r30466, %r15009, %r15010; + mul.lo.s32 %r15011, %r30477, 16777619; + ld.global.u32 %r15012, [%rd726+8]; + mul.lo.s32 %r15013, %r30478, 16777619; + ld.global.u32 %r15014, [%rd726+12]; + xor.b32 %r15015, %r15013, %r15014; + xor.b32 %r30477, %r15011, %r15012; + mov.b64 %rd727, {%r30477, %r15015}; + mul.lo.s32 %r15016, %r30473, 16777619; + ld.global.u32 %r15017, [%rd726+16]; + mul.lo.s32 %r15018, %r30474, 16777619; + ld.global.u32 %r15019, [%rd726+20]; + xor.b32 %r15020, %r15018, %r15019; + xor.b32 %r30473, %r15016, %r15017; + mov.b64 %rd728, {%r30473, %r15020}; + mul.lo.s32 %r15021, %r30469, 16777619; + ld.global.u32 %r15022, [%rd726+24]; + mul.lo.s32 %r15023, %r30470, 16777619; + ld.global.u32 %r15024, [%rd726+28]; + xor.b32 %r15025, %r15023, %r15024; + xor.b32 %r30469, %r15021, %r15022; + mov.b64 %rd729, {%r30469, %r15025}; + mul.lo.s32 %r15026, %r30467, 16777619; + ld.global.u32 %r15027, [%rd726+32]; + mul.lo.s32 %r15028, %r30468, 16777619; + ld.global.u32 %r15029, [%rd726+36]; + xor.b32 %r15030, %r15028, %r15029; + xor.b32 %r30467, %r15026, %r15027; + mov.b64 %rd730, {%r30467, %r15030}; + mul.lo.s32 %r15031, %r30463, 16777619; + ld.global.u32 %r15032, [%rd726+40]; + xor.b32 %r30463, %r15031, %r15032; + mul.lo.s32 %r15033, %r30464, 16777619; + ld.global.u32 %r15034, [%rd726+44]; + xor.b32 %r30464, %r15033, %r15034; + mul.lo.s32 %r15035, %r30475, 16777619; + ld.global.u32 %r15036, [%rd726+48]; + mul.lo.s32 %r15037, %r30476, 16777619; + ld.global.u32 %r15038, [%rd726+52]; + xor.b32 %r15039, %r15037, %r15038; + xor.b32 %r30475, %r15035, %r15036; + mov.b64 %rd731, {%r30475, %r15039}; + mul.lo.s32 %r15040, %r30471, 16777619; + ld.global.u32 %r15041, [%rd726+56]; + mul.lo.s32 %r15042, %r30472, 16777619; + ld.global.u32 %r15043, [%rd726+60]; + xor.b32 %r15044, %r15042, %r15043; + xor.b32 %r30471, %r15040, %r15041; + mov.b64 %rd732, {%r30471, %r15044}; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + st.local.v2.u32 [%rd2+32], {%r30477, %r15015}; + st.local.v2.u32 [%rd2+40], {%r30473, %r15020}; + st.local.v2.u32 [%rd2+48], {%r30469, %r15025}; + st.local.v2.u32 [%rd2+56], {%r30467, %r15030}; + st.local.v2.u32 [%rd2+64], {%r30463, %r30464}; + st.local.v2.u32 [%rd2+72], {%r30475, %r15039}; + st.local.v2.u32 [%rd2+80], {%r30471, %r15044}; + add.s64 %rd733, %rd150, %rd721; + xor.b32 %r15045, %r1891, %r30428; + mul.lo.s32 %r15046, %r15045, 16777619; + ld.local.u32 %r15047, [%rd733]; + xor.b32 %r15048, %r15046, %r15047; + mul.wide.u32 %rd734, %r15048, -954391867; + shr.u64 %rd735, %rd734, 32; + cvt.u32.u64 %r15049, %rd735; + sub.s32 %r15050, %r15048, %r15049; + shr.u32 %r15051, %r15050, 1; + add.s32 %r15052, %r15051, %r15049; + shr.u32 %r15053, %r15052, 20; + mul.lo.s32 %r15054, %r15053, 1179641; + sub.s32 %r15055, %r15048, %r15054; + mul.wide.u32 %rd736, %r15055, 64; + add.s64 %rd737, %rd1268, %rd736; + mul.lo.s32 %r15056, %r30516, 16777619; + ld.global.u32 %r15057, [%rd737]; + xor.b32 %r30516, %r15056, %r15057; + mul.lo.s32 %r15058, %r30517, 16777619; + ld.global.u32 %r15059, [%rd737+4]; + xor.b32 %r30517, %r15058, %r15059; + mul.lo.s32 %r15060, %r30528, 16777619; + ld.global.u32 %r15061, [%rd737+8]; + mul.lo.s32 %r15062, %r30529, 16777619; + 
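// note: 16777619 is the 32-bit FNV prime; this loop multiplies each state word by it and XORs in a word loaded from the dataset (FNV-1-style mixing) +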
ld.global.u32 %r15063, [%rd737+12]; + xor.b32 %r15064, %r15062, %r15063; + xor.b32 %r30528, %r15060, %r15061; + mov.b64 %rd738, {%r30528, %r15064}; + mul.lo.s32 %r15065, %r30524, 16777619; + ld.global.u32 %r15066, [%rd737+16]; + mul.lo.s32 %r15067, %r30525, 16777619; + ld.global.u32 %r15068, [%rd737+20]; + xor.b32 %r15069, %r15067, %r15068; + xor.b32 %r30524, %r15065, %r15066; + mov.b64 %rd739, {%r30524, %r15069}; + mul.lo.s32 %r15070, %r30520, 16777619; + ld.global.u32 %r15071, [%rd737+24]; + mul.lo.s32 %r15072, %r30521, 16777619; + ld.global.u32 %r15073, [%rd737+28]; + xor.b32 %r15074, %r15072, %r15073; + xor.b32 %r30520, %r15070, %r15071; + mov.b64 %rd740, {%r30520, %r15074}; + mul.lo.s32 %r15075, %r30518, 16777619; + ld.global.u32 %r15076, [%rd737+32]; + mul.lo.s32 %r15077, %r30519, 16777619; + ld.global.u32 %r15078, [%rd737+36]; + xor.b32 %r15079, %r15077, %r15078; + xor.b32 %r30518, %r15075, %r15076; + mov.b64 %rd741, {%r30518, %r15079}; + mul.lo.s32 %r15080, %r30514, 16777619; + ld.global.u32 %r15081, [%rd737+40]; + xor.b32 %r30514, %r15080, %r15081; + mul.lo.s32 %r15082, %r30515, 16777619; + ld.global.u32 %r15083, [%rd737+44]; + xor.b32 %r30515, %r15082, %r15083; + mul.lo.s32 %r15084, %r30526, 16777619; + ld.global.u32 %r15085, [%rd737+48]; + mul.lo.s32 %r15086, %r30527, 16777619; + ld.global.u32 %r15087, [%rd737+52]; + xor.b32 %r15088, %r15086, %r15087; + xor.b32 %r30526, %r15084, %r15085; + mov.b64 %rd742, {%r30526, %r15088}; + mul.lo.s32 %r15089, %r30522, 16777619; + ld.global.u32 %r15090, [%rd737+56]; + mul.lo.s32 %r15091, %r30523, 16777619; + ld.global.u32 %r15092, [%rd737+60]; + xor.b32 %r15093, %r15091, %r15092; + xor.b32 %r30522, %r15089, %r15090; + mov.b64 %rd743, {%r30522, %r15093}; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + st.local.v2.u32 [%rd147+32], {%r30528, %r15064}; + st.local.v2.u32 [%rd147+40], {%r30524, %r15069}; + st.local.v2.u32 [%rd147+48], {%r30520, %r15074}; + st.local.v2.u32 [%rd147+56], {%r30518, %r15079}; + st.local.v2.u32 [%rd147+64], {%r30514, %r30515}; + st.local.v2.u32 [%rd147+72], {%r30526, %r15088}; + st.local.v2.u32 [%rd147+80], {%r30522, %r15093}; + add.s32 %r30428, %r30428, 1; + setp.lt.u32 %p28, %r30428, 512; + shr.u64 %rd744, %rd727, 32; + cvt.u32.u64 %r30478, %rd744; + shr.u64 %rd745, %rd728, 32; + cvt.u32.u64 %r30474, %rd745; + shr.u64 %rd746, %rd729, 32; + cvt.u32.u64 %r30470, %rd746; + shr.u64 %rd747, %rd730, 32; + cvt.u32.u64 %r30468, %rd747; + shr.u64 %rd748, %rd731, 32; + cvt.u32.u64 %r30476, %rd748; + shr.u64 %rd749, %rd732, 32; + cvt.u32.u64 %r30472, %rd749; + shr.u64 %rd750, %rd738, 32; + cvt.u32.u64 %r30529, %rd750; + shr.u64 %rd751, %rd739, 32; + cvt.u32.u64 %r30525, %rd751; + shr.u64 %rd752, %rd740, 32; + cvt.u32.u64 %r30521, %rd752; + shr.u64 %rd753, %rd741, 32; + cvt.u32.u64 %r30519, %rd753; + shr.u64 %rd754, %rd742, 32; + cvt.u32.u64 %r30527, %rd754; + shr.u64 %rd755, %rd743, 32; + cvt.u32.u64 %r30523, %rd755; + @%p28 bra $L__BB2_44; + + mov.u32 %r30429, 0; + st.local.v2.u32 [%rd2+96], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+104], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+112], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+120], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+128], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+136], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+144], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+152], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+160], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+168], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+176], {%r30429, %r30429}; + st.local.v2.u32 
[%rd2+184], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+192], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+200], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+208], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+216], {%r30429, %r30429}; + mov.u32 %r30444, -2147483648; + mov.u32 %r15108, 1; + st.local.v2.u32 [%rd2+88], {%r15108, %r30444}; + mov.u32 %r30430, %r30429; + mov.u32 %r30431, %r30429; + mov.u32 %r30432, %r30429; + mov.u32 %r30433, %r30429; + mov.u32 %r30434, %r30429; + mov.u32 %r30435, %r30429; + mov.u32 %r30436, %r30429; + mov.u32 %r30437, %r30429; + mov.u32 %r30438, %r30429; + mov.u32 %r30439, %r30429; + mov.u32 %r30440, %r30429; + mov.u32 %r30441, %r30429; + mov.u32 %r30442, %r30429; + mov.u32 %r30443, %r15108; + mov.u32 %r30445, %r30429; + mov.u32 %r30446, %r30429; + mov.u32 %r30447, %r30429; + mov.u32 %r30448, %r30429; + mov.u32 %r30449, %r30429; + mov.u32 %r30450, %r30429; + mov.u32 %r30451, %r30429; + mov.u32 %r30452, %r30429; + mov.u32 %r30453, %r30429; + mov.u32 %r30454, %r30429; + mov.u32 %r30455, %r30429; + mov.u32 %r30456, %r30429; + mov.u32 %r30457, %r30429; + mov.u32 %r30458, %r30429; + mov.u32 %r30459, %r30429; + mov.u32 %r30460, %r30429; + mov.u32 %r30461, %r30429; + mov.u32 %r30462, %r30429; + mov.u32 %r30479, %r30429; + +$L__BB2_46: + // begin inline asm + // xor5 + lop3.b32 %r15135, %r30465, %r30463, %r30461, 0x96; + lop3.b32 %r15135, %r15135, %r30459, %r30457, 0x96; + lop3.b32 %r15136, %r30466, %r30464, %r30462, 0x96; + lop3.b32 %r15136, %r15136, %r30460, %r30458, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15147, %r30477, %r30475, %r30455, 0x96; + lop3.b32 %r15147, %r15147, %r30453, %r30451, 0x96; + lop3.b32 %r15148, %r30478, %r30476, %r30456, 0x96; + lop3.b32 %r15148, %r15148, %r30454, %r30452, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15159, %r30473, %r30471, %r30449, 0x96; + lop3.b32 %r15159, %r15159, %r30447, %r30445, 0x96; + lop3.b32 %r15160, %r30474, %r30472, %r30450, 0x96; + lop3.b32 %r15160, %r15160, %r30448, %r30446, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15171, %r30469, %r30443, %r30441, 0x96; + lop3.b32 %r15171, %r15171, %r30439, %r30437, 0x96; + lop3.b32 %r15172, %r30470, %r30444, %r30442, 0x96; + lop3.b32 %r15172, %r15172, %r30440, %r30438, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15183, %r30467, %r30435, %r30433, 0x96; + lop3.b32 %r15183, %r15183, %r30431, %r30429, 0x96; + lop3.b32 %r15184, %r30468, %r30436, %r30434, 0x96; + lop3.b32 %r15184, %r15184, %r30432, %r30430, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15195, %r15148, %r15147, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15199, %r15147, %r15148, %r15108; + // end inline asm + xor.b32 %r15629, %r15195, %r15183; + xor.b32 %r15630, %r15199, %r15184; + xor.b32 %r15462, %r30465, %r15629; + xor.b32 %r15465, %r30466, %r15630; + xor.b32 %r15369, %r30463, %r15629; + xor.b32 %r15368, %r30464, %r15630; + xor.b32 %r15416, %r30461, %r15629; + xor.b32 %r15417, %r30462, %r15630; + xor.b32 %r15321, %r30459, %r15629; + xor.b32 %r15320, %r30460, %r15630; + xor.b32 %r15272, %r30457, %r15629; + xor.b32 %r15273, %r30458, %r15630; + // begin inline asm + shf.l.wrap.b32 %r15203, %r15160, %r15159, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15207, %r15159, %r15160, %r15108; + // end inline asm + xor.b32 %r15631, %r15203, %r15135; + xor.b32 %r15632, %r15207, %r15136; + xor.b32 %r15424, %r30477, %r15631; + xor.b32 %r15425, 
%r30478, %r15632; + xor.b32 %r15241, %r30475, %r15631; + xor.b32 %r15240, %r30476, %r15632; + xor.b32 %r15400, %r30455, %r15631; + xor.b32 %r15401, %r30456, %r15632; + xor.b32 %r15361, %r30453, %r15631; + xor.b32 %r15360, %r30454, %r15632; + xor.b32 %r15344, %r30451, %r15631; + xor.b32 %r15345, %r30452, %r15632; + // begin inline asm + shf.l.wrap.b32 %r15211, %r15172, %r15171, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15215, %r15171, %r15172, %r15108; + // end inline asm + xor.b32 %r15633, %r15211, %r15147; + xor.b32 %r15634, %r15215, %r15148; + xor.b32 %r15281, %r30473, %r15633; + xor.b32 %r15280, %r30474, %r15634; + xor.b32 %r15408, %r30471, %r15633; + xor.b32 %r15409, %r30472, %r15634; + xor.b32 %r15289, %r30449, %r15633; + xor.b32 %r15288, %r30450, %r15634; + xor.b32 %r15392, %r30447, %r15633; + xor.b32 %r15393, %r30448, %r15634; + xor.b32 %r15257, %r30445, %r15633; + xor.b32 %r15256, %r30446, %r15634; + // begin inline asm + shf.l.wrap.b32 %r15219, %r15184, %r15183, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15223, %r15183, %r15184, %r15108; + // end inline asm + xor.b32 %r15635, %r15219, %r15159; + xor.b32 %r15636, %r15223, %r15160; + xor.b32 %r15376, %r30469, %r15635; + xor.b32 %r15377, %r30470, %r15636; + xor.b32 %r15353, %r30443, %r15635; + xor.b32 %r15352, %r30444, %r15636; + xor.b32 %r15296, %r30441, %r15635; + xor.b32 %r15297, %r30442, %r15636; + xor.b32 %r15384, %r30439, %r15635; + xor.b32 %r15385, %r30440, %r15636; + xor.b32 %r15313, %r30437, %r15635; + xor.b32 %r15312, %r30438, %r15636; + // begin inline asm + shf.l.wrap.b32 %r15227, %r15136, %r15135, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15231, %r15135, %r15136, %r15108; + // end inline asm + xor.b32 %r15637, %r15227, %r15171; + xor.b32 %r15638, %r15231, %r15172; + xor.b32 %r15328, %r30467, %r15637; + xor.b32 %r15329, %r30468, %r15638; + xor.b32 %r15248, %r30435, %r15637; + xor.b32 %r15249, %r30436, %r15638; + xor.b32 %r15265, %r30433, %r15637; + xor.b32 %r15264, %r30434, %r15638; + xor.b32 %r15304, %r30431, %r15637; + xor.b32 %r15305, %r30432, %r15638; + xor.b32 %r15336, %r30429, %r15637; + xor.b32 %r15337, %r30430, %r15638; + mov.u32 %r15242, 44; + // begin inline asm + shf.l.wrap.b32 %r15235, %r15241, %r15240, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15239, %r15240, %r15241, %r15242; + // end inline asm + mov.u32 %r15250, 20; + // begin inline asm + shf.l.wrap.b32 %r15243, %r15249, %r15248, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15247, %r15248, %r15249, %r15250; + // end inline asm + mov.u32 %r15258, 61; + // begin inline asm + shf.l.wrap.b32 %r15251, %r15257, %r15256, %r15258; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15255, %r15256, %r15257, %r15258; + // end inline asm + mov.u32 %r15266, 39; + // begin inline asm + shf.l.wrap.b32 %r15259, %r15265, %r15264, %r15266; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15263, %r15264, %r15265, %r15266; + // end inline asm + mov.u32 %r15274, 18; + // begin inline asm + shf.l.wrap.b32 %r15267, %r15273, %r15272, %r15274; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15271, %r15272, %r15273, %r15274; + // end inline asm + mov.u32 %r15282, 62; + // begin inline asm + shf.l.wrap.b32 %r15275, %r15281, %r15280, %r15282; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15279, %r15280, %r15281, %r15282; + // end inline asm + mov.u32 %r15290, 43; + // begin inline asm + 
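// note: each 64-bit Keccak lane rotation (rho step) is realized as a pair of 32-bit funnel shifts (shf.l.wrap) over the lane halves +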
shf.l.wrap.b32 %r15283, %r15289, %r15288, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15287, %r15288, %r15289, %r15290; + // end inline asm + mov.u32 %r15298, 25; + // begin inline asm + shf.l.wrap.b32 %r15291, %r15297, %r15296, %r15298; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15295, %r15296, %r15297, %r15298; + // end inline asm + mov.u32 %r15306, 8; + // begin inline asm + shf.l.wrap.b32 %r15299, %r15305, %r15304, %r15306; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15303, %r15304, %r15305, %r15306; + // end inline asm + mov.u32 %r15314, 56; + // begin inline asm + shf.l.wrap.b32 %r15307, %r15313, %r15312, %r15314; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15311, %r15312, %r15313, %r15314; + // end inline asm + mov.u32 %r15322, 41; + // begin inline asm + shf.l.wrap.b32 %r15315, %r15321, %r15320, %r15322; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15319, %r15320, %r15321, %r15322; + // end inline asm + mov.u32 %r15330, 27; + // begin inline asm + shf.l.wrap.b32 %r15323, %r15329, %r15328, %r15330; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15327, %r15328, %r15329, %r15330; + // end inline asm + mov.u32 %r15338, 14; + // begin inline asm + shf.l.wrap.b32 %r15331, %r15337, %r15336, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15335, %r15336, %r15337, %r15338; + // end inline asm + mov.u32 %r15346, 2; + // begin inline asm + shf.l.wrap.b32 %r15339, %r15345, %r15344, %r15346; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15343, %r15344, %r15345, %r15346; + // end inline asm + mov.u32 %r15354, 55; + // begin inline asm + shf.l.wrap.b32 %r15347, %r15353, %r15352, %r15354; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15351, %r15352, %r15353, %r15354; + // end inline asm + mov.u32 %r15362, 45; + // begin inline asm + shf.l.wrap.b32 %r15355, %r15361, %r15360, %r15362; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15359, %r15360, %r15361, %r15362; + // end inline asm + mov.u32 %r15370, 36; + // begin inline asm + shf.l.wrap.b32 %r15363, %r15369, %r15368, %r15370; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15367, %r15368, %r15369, %r15370; + // end inline asm + mov.u32 %r15378, 28; + // begin inline asm + shf.l.wrap.b32 %r15371, %r15377, %r15376, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15375, %r15376, %r15377, %r15378; + // end inline asm + mov.u32 %r15386, 21; + // begin inline asm + shf.l.wrap.b32 %r15379, %r15385, %r15384, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15383, %r15384, %r15385, %r15386; + // end inline asm + mov.u32 %r15394, 15; + // begin inline asm + shf.l.wrap.b32 %r15387, %r15393, %r15392, %r15394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15391, %r15392, %r15393, %r15394; + // end inline asm + mov.u32 %r15402, 10; + // begin inline asm + shf.l.wrap.b32 %r15395, %r15401, %r15400, %r15402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15399, %r15400, %r15401, %r15402; + // end inline asm + mov.u32 %r15410, 6; + // begin inline asm + shf.l.wrap.b32 %r15403, %r15409, %r15408, %r15410; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15407, %r15408, %r15409, %r15410; + // end inline asm + mov.u32 %r15418, 3; + // begin inline asm + shf.l.wrap.b32 %r15411, %r15417, %r15416, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15415, %r15416, %r15417, %r15418; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r15419, %r15425, %r15424, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15423, %r15424, %r15425, %r15108; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15427, %r15462, %r15235, %r15283, 0xD2; + lop3.b32 %r15428, %r15465, %r15239, %r15287, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30477, %r15235, %r15283, %r15379, 0xD2; + lop3.b32 %r30478, %r15239, %r15287, %r15383, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30473, %r15283, %r15379, %r15331, 0xD2; + lop3.b32 %r30474, %r15287, %r15383, %r15335, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30469, %r15379, %r15331, %r15462, 0xD2; + lop3.b32 %r30470, %r15383, %r15335, %r15465, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30467, %r15331, %r15462, %r15235, 0xD2; + lop3.b32 %r30468, %r15335, %r15465, %r15239, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30463, %r15371, %r15243, %r15411, 0xD2; + lop3.b32 %r30464, %r15375, %r15247, %r15415, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30475, %r15243, %r15411, %r15355, 0xD2; + lop3.b32 %r30476, %r15247, %r15415, %r15359, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30471, %r15411, %r15355, %r15251, 0xD2; + lop3.b32 %r30472, %r15415, %r15359, %r15255, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30443, %r15355, %r15251, %r15371, 0xD2; + lop3.b32 %r30444, %r15359, %r15255, %r15375, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30443, %r30444}; + // begin inline asm + // chi + lop3.b32 %r30435, %r15251, %r15371, %r15243, 0xD2; + lop3.b32 %r30436, %r15255, %r15375, %r15247, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30435, %r30436}; + // begin inline asm + // chi + lop3.b32 %r30461, %r15419, %r15403, %r15291, 0xD2; + lop3.b32 %r30462, %r15423, %r15407, %r15295, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30461, %r30462}; + // begin inline asm + // chi + lop3.b32 %r30455, %r15403, %r15291, %r15299, 0xD2; + lop3.b32 %r30456, %r15407, %r15295, %r15303, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30455, %r30456}; + // begin inline asm + // chi + lop3.b32 %r30449, %r15291, %r15299, %r15267, 0xD2; + lop3.b32 %r30450, %r15295, %r15303, %r15271, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30449, %r30450}; + // begin inline asm + // chi + lop3.b32 %r30441, %r15299, %r15267, %r15419, 0xD2; + lop3.b32 %r30442, %r15303, %r15271, %r15423, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30441, %r30442}; + // begin inline asm + // chi + lop3.b32 %r30433, %r15267, %r15419, %r15403, 0xD2; + lop3.b32 %r30434, %r15271, %r15423, %r15407, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30433, %r30434}; + // begin inline asm + // chi + lop3.b32 %r30459, %r15323, %r15363, %r15395, 0xD2; + lop3.b32 %r30460, %r15327, %r15367, %r15399, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30459, %r30460}; + // begin inline asm + // chi + lop3.b32 %r30453, %r15363, %r15395, %r15387, 0xD2; + lop3.b32 %r30454, %r15367, %r15399, %r15391, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30453, %r30454}; + // begin inline asm + // chi + lop3.b32 %r30447, %r15395, %r15387, %r15307, 0xD2; + lop3.b32 %r30448, %r15399, %r15391, %r15311, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30447, %r30448}; + // begin inline asm + 
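// note: chi computes a ^ (~b & c) per lane; lop3 with immLut 0xD2 evaluates that ternary function in a single instruction +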
// chi + lop3.b32 %r30439, %r15387, %r15307, %r15323, 0xD2; + lop3.b32 %r30440, %r15391, %r15311, %r15327, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30439, %r30440}; + // begin inline asm + // chi + lop3.b32 %r30431, %r15307, %r15323, %r15363, 0xD2; + lop3.b32 %r30432, %r15311, %r15327, %r15367, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30431, %r30432}; + // begin inline asm + // chi + lop3.b32 %r30457, %r15275, %r15347, %r15259, 0xD2; + lop3.b32 %r30458, %r15279, %r15351, %r15263, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30457, %r30458}; + // begin inline asm + // chi + lop3.b32 %r30451, %r15347, %r15259, %r15315, 0xD2; + lop3.b32 %r30452, %r15351, %r15263, %r15319, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30451, %r30452}; + // begin inline asm + // chi + lop3.b32 %r30445, %r15259, %r15315, %r15339, 0xD2; + lop3.b32 %r30446, %r15263, %r15319, %r15343, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30445, %r30446}; + // begin inline asm + // chi + lop3.b32 %r30437, %r15315, %r15339, %r15275, 0xD2; + lop3.b32 %r30438, %r15319, %r15343, %r15279, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30437, %r30438}; + // begin inline asm + // chi + lop3.b32 %r30429, %r15339, %r15275, %r15347, 0xD2; + lop3.b32 %r30430, %r15343, %r15279, %r15351, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30429, %r30430}; + mul.wide.s32 %rd757, %r30479, 8; + add.s64 %rd756, %rd689, %rd757; + // begin inline asm + ld.global.nc.v2.u32 {%r15627,%r15628}, [%rd756]; + // end inline asm + xor.b32 %r30465, %r15427, %r15627; + xor.b32 %r30466, %r15428, %r15628; + add.s32 %r30479, %r30479, 1; + setp.lt.u32 %p29, %r30479, 23; + @%p29 bra $L__BB2_46; + + st.local.v2.u32 [%rd2+32], {%r30477, %r30478}; + st.local.v2.u32 [%rd2+72], {%r30475, %r30476}; + st.local.v2.u32 [%rd2+40], {%r30473, %r30474}; + st.local.v2.u32 [%rd2+80], {%r30471, %r30472}; + st.local.v2.u32 [%rd2+48], {%r30469, %r30470}; + st.local.v2.u32 [%rd2+56], {%r30467, %r30468}; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + // begin inline asm + // xor5 + lop3.b32 %r15639, %r30465, %r30463, %r30461, 0x96; + lop3.b32 %r15639, %r15639, %r30459, %r30457, 0x96; + lop3.b32 %r15640, %r30466, %r30464, %r30462, 0x96; + lop3.b32 %r15640, %r15640, %r30460, %r30458, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15651, %r30477, %r30475, %r30455, 0x96; + lop3.b32 %r15651, %r15651, %r30453, %r30451, 0x96; + lop3.b32 %r15652, %r30478, %r30476, %r30456, 0x96; + lop3.b32 %r15652, %r15652, %r30454, %r30452, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15663, %r30473, %r30471, %r30449, 0x96; + lop3.b32 %r15663, %r15663, %r30447, %r30445, 0x96; + lop3.b32 %r15664, %r30474, %r30472, %r30450, 0x96; + lop3.b32 %r15664, %r15664, %r30448, %r30446, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15675, %r30469, %r30443, %r30441, 0x96; + lop3.b32 %r15675, %r15675, %r30439, %r30437, 0x96; + lop3.b32 %r15676, %r30470, %r30444, %r30442, 0x96; + lop3.b32 %r15676, %r15676, %r30440, %r30438, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15687, %r30467, %r30435, %r30433, 0x96; + lop3.b32 %r15687, %r15687, %r30431, %r30429, 0x96; + lop3.b32 %r15688, %r30468, %r30436, %r30434, 0x96; + lop3.b32 %r15688, %r15688, %r30432, %r30430, 0x96; + // end inline asm + mov.u32 %r15891, 1; + // begin inline asm + shf.l.wrap.b32 %r15699, %r15652, %r15651, %r15891; + // end inline asm + // begin inline asm + 
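+ // theta: D[x] = C[x-1] ^ rotl64(C[x+1], 1); this funnel-shift pair rotates the
+ // 64-bit column parity by 1 bit across its two 32-bit halves (the xor5 blocks
+ // above build C[x] with lop3 immediate 0x96 = 0xF0 ^ 0xCC ^ 0xAA, a 3-input XOR used twice)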
shf.l.wrap.b32 %r15703, %r15651, %r15652, %r15891; + // end inline asm + xor.b32 %r15918, %r15699, %r15687; + xor.b32 %r15919, %r15703, %r15688; + xor.b32 %r15846, %r30465, %r15918; + xor.b32 %r15849, %r30466, %r15919; + xor.b32 %r15809, %r30462, %r15919; + xor.b32 %r15808, %r30461, %r15918; + st.local.v2.u32 [%rd2+104], {%r15808, %r15809}; + // begin inline asm + shf.l.wrap.b32 %r15707, %r15664, %r15663, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15711, %r15663, %r15664, %r15891; + // end inline asm + xor.b32 %r15920, %r15707, %r15639; + xor.b32 %r15921, %r15711, %r15640; + xor.b32 %r15745, %r30475, %r15920; + xor.b32 %r15744, %r30476, %r15921; + xor.b32 %r15784, %r30454, %r15921; + xor.b32 %r15785, %r30453, %r15920; + st.local.v2.u32 [%rd2+152], {%r15785, %r15784}; + // begin inline asm + shf.l.wrap.b32 %r15715, %r15676, %r15675, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15719, %r15675, %r15676, %r15891; + // end inline asm + xor.b32 %r15922, %r15715, %r15651; + xor.b32 %r15923, %r15719, %r15652; + xor.b32 %r15768, %r30450, %r15923; + xor.b32 %r15769, %r30449, %r15922; + st.local.v2.u32 [%rd2+120], {%r15769, %r15768}; + xor.b32 %r15760, %r30446, %r15923; + xor.b32 %r15761, %r30445, %r15922; + st.local.v2.u32 [%rd2+200], {%r15761, %r15760}; + // begin inline asm + shf.l.wrap.b32 %r15723, %r15688, %r15687, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15727, %r15687, %r15688, %r15891; + // end inline asm + xor.b32 %r15924, %r15723, %r15663; + xor.b32 %r15925, %r15727, %r15664; + xor.b32 %r15792, %r30469, %r15924; + xor.b32 %r15793, %r30470, %r15925; + xor.b32 %r15801, %r30440, %r15925; + xor.b32 %r15800, %r30439, %r15924; + st.local.v2.u32 [%rd2+168], {%r15800, %r15801}; + // begin inline asm + shf.l.wrap.b32 %r15731, %r15640, %r15639, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15735, %r15639, %r15640, %r15891; + // end inline asm + xor.b32 %r15926, %r15731, %r15675; + xor.b32 %r15927, %r15735, %r15676; + xor.b32 %r15752, %r30435, %r15926; + xor.b32 %r15753, %r30436, %r15927; + xor.b32 %r15777, %r30430, %r15927; + xor.b32 %r15776, %r30429, %r15926; + st.local.v2.u32 [%rd2+216], {%r15776, %r15777}; + // begin inline asm + shf.l.wrap.b32 %r15739, %r15745, %r15744, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15743, %r15744, %r15745, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15747, %r15753, %r15752, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15751, %r15752, %r15753, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15759, %r15760, %r15761, %r15258; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15755, %r15761, %r15760, %r15258; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r15755, %r15759}; + // begin inline asm + shf.l.wrap.b32 %r15763, %r15769, %r15768, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15767, %r15768, %r15769, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15771, %r15777, %r15776, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15775, %r15776, %r15777, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15783, %r15784, %r15785, %r15362; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15779, %r15785, %r15784, %r15362; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r15779, %r15783}; + // begin inline asm + shf.l.wrap.b32 %r15787, %r15793, %r15792, %r15378; 
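+ // rho/pi for the peeled 24th round: the 23-iteration loop above ($L__BB2_46) stops
+ // one round early so the final round's chi/iota results can be stored straight to the output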
+ // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15791, %r15792, %r15793, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15795, %r15801, %r15800, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15799, %r15800, %r15801, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15803, %r15809, %r15808, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15807, %r15808, %r15809, %r15418; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15811, %r15846, %r15739, %r15763, 0xD2; + lop3.b32 %r15812, %r15849, %r15743, %r15767, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15819, %r15739, %r15763, %r15795, 0xD2; + lop3.b32 %r15820, %r15743, %r15767, %r15799, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r15819, %r15820}; + // begin inline asm + // chi + lop3.b32 %r15827, %r15763, %r15795, %r15771, 0xD2; + lop3.b32 %r15828, %r15767, %r15799, %r15775, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r15827, %r15828}; + // begin inline asm + // chi + lop3.b32 %r15835, %r15795, %r15771, %r15846, 0xD2; + lop3.b32 %r15836, %r15799, %r15775, %r15849, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r15835, %r15836}; + // begin inline asm + // chi + lop3.b32 %r15843, %r15771, %r15846, %r15739, 0xD2; + lop3.b32 %r15844, %r15775, %r15849, %r15743, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r15843, %r15844}; + // begin inline asm + // chi + lop3.b32 %r15851, %r15787, %r15747, %r15803, 0xD2; + lop3.b32 %r15852, %r15791, %r15751, %r15807, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r15851, %r15852}; + // begin inline asm + // chi + lop3.b32 %r15859, %r15747, %r15803, %r15779, 0xD2; + lop3.b32 %r15860, %r15751, %r15807, %r15783, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r15859, %r15860}; + // begin inline asm + // chi + lop3.b32 %r15867, %r15803, %r15779, %r15755, 0xD2; + lop3.b32 %r15868, %r15807, %r15783, %r15759, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r15867, %r15868}; + // begin inline asm + ld.global.nc.v2.u32 {%r15875,%r15876}, [%rd690]; + // end inline asm + xor.b32 %r15928, %r15812, %r15876; + xor.b32 %r15929, %r15811, %r15875; + mov.b64 %rd1333, {%r15929, %r15928}; + mov.b64 %rd1334, {%r15819, %r15820}; + mov.b64 %rd1335, {%r15827, %r15828}; + mov.b64 %rd1336, {%r15835, %r15836}; + mov.b64 %rd1337, {%r15843, %r15844}; + mov.b64 %rd1338, {%r15851, %r15852}; + mov.b64 %rd1339, {%r15859, %r15860}; + mov.b64 %rd1340, {%r15867, %r15868}; + mov.u32 %r30480, 0; + st.local.v2.u32 [%rd2+24], {%r15929, %r15928}; + st.local.v2.u32 [%rd147+96], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+104], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+112], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+120], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+128], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+136], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+144], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+152], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+160], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+168], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+176], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+184], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+192], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+200], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+208], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+216], {%r30480, %r30480}; + mov.u32 %r30495, -2147483648; + st.local.v2.u32 [%rd147+88], {%r15891, 
%r30495}; + mov.u32 %r30481, %r30480; + mov.u32 %r30482, %r30480; + mov.u32 %r30483, %r30480; + mov.u32 %r30484, %r30480; + mov.u32 %r30485, %r30480; + mov.u32 %r30486, %r30480; + mov.u32 %r30487, %r30480; + mov.u32 %r30488, %r30480; + mov.u32 %r30489, %r30480; + mov.u32 %r30490, %r30480; + mov.u32 %r30491, %r30480; + mov.u32 %r30492, %r30480; + mov.u32 %r30493, %r30480; + mov.u32 %r30494, %r15891; + mov.u32 %r30496, %r30480; + mov.u32 %r30497, %r30480; + mov.u32 %r30498, %r30480; + mov.u32 %r30499, %r30480; + mov.u32 %r30500, %r30480; + mov.u32 %r30501, %r30480; + mov.u32 %r30502, %r30480; + mov.u32 %r30503, %r30480; + mov.u32 %r30504, %r30480; + mov.u32 %r30505, %r30480; + mov.u32 %r30506, %r30480; + mov.u32 %r30507, %r30480; + mov.u32 %r30508, %r30480; + mov.u32 %r30509, %r30480; + mov.u32 %r30510, %r30480; + mov.u32 %r30511, %r30480; + mov.u32 %r30512, %r30480; + mov.u32 %r30513, %r30480; + mov.u32 %r30530, %r30480; + +$L__BB2_48: + // begin inline asm + // xor5 + lop3.b32 %r15930, %r30516, %r30514, %r30512, 0x96; + lop3.b32 %r15930, %r15930, %r30510, %r30508, 0x96; + lop3.b32 %r15931, %r30517, %r30515, %r30513, 0x96; + lop3.b32 %r15931, %r15931, %r30511, %r30509, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15942, %r30528, %r30526, %r30506, 0x96; + lop3.b32 %r15942, %r15942, %r30504, %r30502, 0x96; + lop3.b32 %r15943, %r30529, %r30527, %r30507, 0x96; + lop3.b32 %r15943, %r15943, %r30505, %r30503, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15954, %r30524, %r30522, %r30500, 0x96; + lop3.b32 %r15954, %r15954, %r30498, %r30496, 0x96; + lop3.b32 %r15955, %r30525, %r30523, %r30501, 0x96; + lop3.b32 %r15955, %r15955, %r30499, %r30497, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15966, %r30520, %r30494, %r30492, 0x96; + lop3.b32 %r15966, %r15966, %r30490, %r30488, 0x96; + lop3.b32 %r15967, %r30521, %r30495, %r30493, 0x96; + lop3.b32 %r15967, %r15967, %r30491, %r30489, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15978, %r30518, %r30486, %r30484, 0x96; + lop3.b32 %r15978, %r15978, %r30482, %r30480, 0x96; + lop3.b32 %r15979, %r30519, %r30487, %r30485, 0x96; + lop3.b32 %r15979, %r15979, %r30483, %r30481, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15990, %r15943, %r15942, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15994, %r15942, %r15943, %r15891; + // end inline asm + xor.b32 %r16424, %r15990, %r15978; + xor.b32 %r16425, %r15994, %r15979; + xor.b32 %r16257, %r30516, %r16424; + xor.b32 %r16260, %r30517, %r16425; + xor.b32 %r16164, %r30514, %r16424; + xor.b32 %r16163, %r30515, %r16425; + xor.b32 %r16211, %r30512, %r16424; + xor.b32 %r16212, %r30513, %r16425; + xor.b32 %r16116, %r30510, %r16424; + xor.b32 %r16115, %r30511, %r16425; + xor.b32 %r16067, %r30508, %r16424; + xor.b32 %r16068, %r30509, %r16425; + // begin inline asm + shf.l.wrap.b32 %r15998, %r15955, %r15954, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16002, %r15954, %r15955, %r15891; + // end inline asm + xor.b32 %r16426, %r15998, %r15930; + xor.b32 %r16427, %r16002, %r15931; + xor.b32 %r16219, %r30528, %r16426; + xor.b32 %r16220, %r30529, %r16427; + xor.b32 %r16036, %r30526, %r16426; + xor.b32 %r16035, %r30527, %r16427; + xor.b32 %r16195, %r30506, %r16426; + xor.b32 %r16196, %r30507, %r16427; + xor.b32 %r16156, %r30504, %r16426; + xor.b32 %r16155, %r30505, %r16427; + xor.b32 %r16139, %r30502, %r16426; + xor.b32 %r16140, %r30503, %r16427; + // begin inline 
asm + shf.l.wrap.b32 %r16006, %r15967, %r15966, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16010, %r15966, %r15967, %r15891; + // end inline asm + xor.b32 %r16428, %r16006, %r15942; + xor.b32 %r16429, %r16010, %r15943; + xor.b32 %r16076, %r30524, %r16428; + xor.b32 %r16075, %r30525, %r16429; + xor.b32 %r16203, %r30522, %r16428; + xor.b32 %r16204, %r30523, %r16429; + xor.b32 %r16084, %r30500, %r16428; + xor.b32 %r16083, %r30501, %r16429; + xor.b32 %r16187, %r30498, %r16428; + xor.b32 %r16188, %r30499, %r16429; + xor.b32 %r16052, %r30496, %r16428; + xor.b32 %r16051, %r30497, %r16429; + // begin inline asm + shf.l.wrap.b32 %r16014, %r15979, %r15978, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16018, %r15978, %r15979, %r15891; + // end inline asm + xor.b32 %r16430, %r16014, %r15954; + xor.b32 %r16431, %r16018, %r15955; + xor.b32 %r16171, %r30520, %r16430; + xor.b32 %r16172, %r30521, %r16431; + xor.b32 %r16148, %r30494, %r16430; + xor.b32 %r16147, %r30495, %r16431; + xor.b32 %r16091, %r30492, %r16430; + xor.b32 %r16092, %r30493, %r16431; + xor.b32 %r16179, %r30490, %r16430; + xor.b32 %r16180, %r30491, %r16431; + xor.b32 %r16108, %r30488, %r16430; + xor.b32 %r16107, %r30489, %r16431; + // begin inline asm + shf.l.wrap.b32 %r16022, %r15931, %r15930, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16026, %r15930, %r15931, %r15891; + // end inline asm + xor.b32 %r16432, %r16022, %r15966; + xor.b32 %r16433, %r16026, %r15967; + xor.b32 %r16123, %r30518, %r16432; + xor.b32 %r16124, %r30519, %r16433; + xor.b32 %r16043, %r30486, %r16432; + xor.b32 %r16044, %r30487, %r16433; + xor.b32 %r16060, %r30484, %r16432; + xor.b32 %r16059, %r30485, %r16433; + xor.b32 %r16099, %r30482, %r16432; + xor.b32 %r16100, %r30483, %r16433; + xor.b32 %r16131, %r30480, %r16432; + xor.b32 %r16132, %r30481, %r16433; + mov.u32 %r16037, 44; + // begin inline asm + shf.l.wrap.b32 %r16030, %r16036, %r16035, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16034, %r16035, %r16036, %r16037; + // end inline asm + mov.u32 %r16045, 20; + // begin inline asm + shf.l.wrap.b32 %r16038, %r16044, %r16043, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16042, %r16043, %r16044, %r16045; + // end inline asm + mov.u32 %r16053, 61; + // begin inline asm + shf.l.wrap.b32 %r16046, %r16052, %r16051, %r16053; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16050, %r16051, %r16052, %r16053; + // end inline asm + mov.u32 %r16061, 39; + // begin inline asm + shf.l.wrap.b32 %r16054, %r16060, %r16059, %r16061; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16058, %r16059, %r16060, %r16061; + // end inline asm + mov.u32 %r16069, 18; + // begin inline asm + shf.l.wrap.b32 %r16062, %r16068, %r16067, %r16069; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16066, %r16067, %r16068, %r16069; + // end inline asm + mov.u32 %r16077, 62; + // begin inline asm + shf.l.wrap.b32 %r16070, %r16076, %r16075, %r16077; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16074, %r16075, %r16076, %r16077; + // end inline asm + mov.u32 %r16085, 43; + // begin inline asm + shf.l.wrap.b32 %r16078, %r16084, %r16083, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16082, %r16083, %r16084, %r16085; + // end inline asm + mov.u32 %r16093, 25; + // begin inline asm + shf.l.wrap.b32 %r16086, %r16092, %r16091, %r16093; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16090, %r16091, 
%r16092, %r16093; + // end inline asm + mov.u32 %r16101, 8; + // begin inline asm + shf.l.wrap.b32 %r16094, %r16100, %r16099, %r16101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16098, %r16099, %r16100, %r16101; + // end inline asm + mov.u32 %r16109, 56; + // begin inline asm + shf.l.wrap.b32 %r16102, %r16108, %r16107, %r16109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16106, %r16107, %r16108, %r16109; + // end inline asm + mov.u32 %r16117, 41; + // begin inline asm + shf.l.wrap.b32 %r16110, %r16116, %r16115, %r16117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16114, %r16115, %r16116, %r16117; + // end inline asm + mov.u32 %r16125, 27; + // begin inline asm + shf.l.wrap.b32 %r16118, %r16124, %r16123, %r16125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16122, %r16123, %r16124, %r16125; + // end inline asm + mov.u32 %r16133, 14; + // begin inline asm + shf.l.wrap.b32 %r16126, %r16132, %r16131, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16130, %r16131, %r16132, %r16133; + // end inline asm + mov.u32 %r16141, 2; + // begin inline asm + shf.l.wrap.b32 %r16134, %r16140, %r16139, %r16141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16138, %r16139, %r16140, %r16141; + // end inline asm + mov.u32 %r16149, 55; + // begin inline asm + shf.l.wrap.b32 %r16142, %r16148, %r16147, %r16149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16146, %r16147, %r16148, %r16149; + // end inline asm + mov.u32 %r16157, 45; + // begin inline asm + shf.l.wrap.b32 %r16150, %r16156, %r16155, %r16157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16154, %r16155, %r16156, %r16157; + // end inline asm + mov.u32 %r16165, 36; + // begin inline asm + shf.l.wrap.b32 %r16158, %r16164, %r16163, %r16165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16162, %r16163, %r16164, %r16165; + // end inline asm + mov.u32 %r16173, 28; + // begin inline asm + shf.l.wrap.b32 %r16166, %r16172, %r16171, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16170, %r16171, %r16172, %r16173; + // end inline asm + mov.u32 %r16181, 21; + // begin inline asm + shf.l.wrap.b32 %r16174, %r16180, %r16179, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16178, %r16179, %r16180, %r16181; + // end inline asm + mov.u32 %r16189, 15; + // begin inline asm + shf.l.wrap.b32 %r16182, %r16188, %r16187, %r16189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16186, %r16187, %r16188, %r16189; + // end inline asm + mov.u32 %r16197, 10; + // begin inline asm + shf.l.wrap.b32 %r16190, %r16196, %r16195, %r16197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16194, %r16195, %r16196, %r16197; + // end inline asm + mov.u32 %r16205, 6; + // begin inline asm + shf.l.wrap.b32 %r16198, %r16204, %r16203, %r16205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16202, %r16203, %r16204, %r16205; + // end inline asm + mov.u32 %r16213, 3; + // begin inline asm + shf.l.wrap.b32 %r16206, %r16212, %r16211, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16210, %r16211, %r16212, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16214, %r16220, %r16219, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16218, %r16219, %r16220, %r15891; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16222, %r16257, %r16030, %r16078, 0xD2; + lop3.b32 %r16223, %r16260, %r16034, %r16082, 
0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30528, %r16030, %r16078, %r16174, 0xD2; + lop3.b32 %r30529, %r16034, %r16082, %r16178, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30524, %r16078, %r16174, %r16126, 0xD2; + lop3.b32 %r30525, %r16082, %r16178, %r16130, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30520, %r16174, %r16126, %r16257, 0xD2; + lop3.b32 %r30521, %r16178, %r16130, %r16260, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30518, %r16126, %r16257, %r16030, 0xD2; + lop3.b32 %r30519, %r16130, %r16260, %r16034, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30514, %r16166, %r16038, %r16206, 0xD2; + lop3.b32 %r30515, %r16170, %r16042, %r16210, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30526, %r16038, %r16206, %r16150, 0xD2; + lop3.b32 %r30527, %r16042, %r16210, %r16154, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30522, %r16206, %r16150, %r16046, 0xD2; + lop3.b32 %r30523, %r16210, %r16154, %r16050, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30494, %r16150, %r16046, %r16166, 0xD2; + lop3.b32 %r30495, %r16154, %r16050, %r16170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r30494, %r30495}; + // begin inline asm + // chi + lop3.b32 %r30486, %r16046, %r16166, %r16038, 0xD2; + lop3.b32 %r30487, %r16050, %r16170, %r16042, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r30486, %r30487}; + // begin inline asm + // chi + lop3.b32 %r30512, %r16214, %r16198, %r16086, 0xD2; + lop3.b32 %r30513, %r16218, %r16202, %r16090, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+104], {%r30512, %r30513}; + // begin inline asm + // chi + lop3.b32 %r30506, %r16198, %r16086, %r16094, 0xD2; + lop3.b32 %r30507, %r16202, %r16090, %r16098, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+112], {%r30506, %r30507}; + // begin inline asm + // chi + lop3.b32 %r30500, %r16086, %r16094, %r16062, 0xD2; + lop3.b32 %r30501, %r16090, %r16098, %r16066, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+120], {%r30500, %r30501}; + // begin inline asm + // chi + lop3.b32 %r30492, %r16094, %r16062, %r16214, 0xD2; + lop3.b32 %r30493, %r16098, %r16066, %r16218, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+128], {%r30492, %r30493}; + // begin inline asm + // chi + lop3.b32 %r30484, %r16062, %r16214, %r16198, 0xD2; + lop3.b32 %r30485, %r16066, %r16218, %r16202, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+136], {%r30484, %r30485}; + // begin inline asm + // chi + lop3.b32 %r30510, %r16118, %r16158, %r16190, 0xD2; + lop3.b32 %r30511, %r16122, %r16162, %r16194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+144], {%r30510, %r30511}; + // begin inline asm + // chi + lop3.b32 %r30504, %r16158, %r16190, %r16182, 0xD2; + lop3.b32 %r30505, %r16162, %r16194, %r16186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+152], {%r30504, %r30505}; + // begin inline asm + // chi + lop3.b32 %r30498, %r16190, %r16182, %r16102, 0xD2; + lop3.b32 %r30499, %r16194, %r16186, %r16106, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+160], {%r30498, %r30499}; + // begin inline asm + // chi + lop3.b32 %r30490, %r16182, %r16102, %r16118, 0xD2; + lop3.b32 %r30491, %r16186, %r16106, %r16122, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+168], {%r30490, %r30491}; + // begin inline asm + // chi + lop3.b32 %r30482, %r16102, %r16118, %r16158, 0xD2; + lop3.b32 %r30483, %r16106, %r16122, %r16162, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd147+176], {%r30482, %r30483}; + // begin inline asm + // chi + lop3.b32 %r30508, %r16070, %r16142, %r16054, 0xD2; + lop3.b32 %r30509, %r16074, %r16146, %r16058, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+184], {%r30508, %r30509}; + // begin inline asm + // chi + lop3.b32 %r30502, %r16142, %r16054, %r16110, 0xD2; + lop3.b32 %r30503, %r16146, %r16058, %r16114, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+192], {%r30502, %r30503}; + // begin inline asm + // chi + lop3.b32 %r30496, %r16054, %r16110, %r16134, 0xD2; + lop3.b32 %r30497, %r16058, %r16114, %r16138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+200], {%r30496, %r30497}; + // begin inline asm + // chi + lop3.b32 %r30488, %r16110, %r16134, %r16070, 0xD2; + lop3.b32 %r30489, %r16114, %r16138, %r16074, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+208], {%r30488, %r30489}; + // begin inline asm + // chi + lop3.b32 %r30480, %r16134, %r16070, %r16142, 0xD2; + lop3.b32 %r30481, %r16138, %r16074, %r16146, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+216], {%r30480, %r30481}; + mul.wide.s32 %rd764, %r30530, 8; + add.s64 %rd763, %rd689, %rd764; + // begin inline asm + ld.global.nc.v2.u32 {%r16422,%r16423}, [%rd763]; + // end inline asm + xor.b32 %r30516, %r16222, %r16422; + xor.b32 %r30517, %r16223, %r16423; + add.s32 %r30530, %r30530, 1; + setp.lt.u32 %p30, %r30530, 23; + @%p30 bra $L__BB2_48; + + mov.u32 %r16533, 1; + st.local.v2.u32 [%rd147+32], {%r30528, %r30529}; + st.local.v2.u32 [%rd147+72], {%r30526, %r30527}; + st.local.v2.u32 [%rd147+40], {%r30524, %r30525}; + st.local.v2.u32 [%rd147+80], {%r30522, %r30523}; + st.local.v2.u32 [%rd147+48], {%r30520, %r30521}; + st.local.v2.u32 [%rd147+56], {%r30518, %r30519}; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + // begin inline asm + // xor5 + lop3.b32 %r16434, %r30516, %r30514, %r30512, 0x96; + lop3.b32 %r16434, %r16434, %r30510, %r30508, 0x96; + lop3.b32 %r16435, %r30517, %r30515, %r30513, 0x96; + lop3.b32 %r16435, %r16435, %r30511, %r30509, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16446, %r30528, %r30526, %r30506, 0x96; + lop3.b32 %r16446, %r16446, %r30504, %r30502, 0x96; + lop3.b32 %r16447, %r30529, %r30527, %r30507, 0x96; + lop3.b32 %r16447, %r16447, %r30505, %r30503, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16458, %r30524, %r30522, %r30500, 0x96; + lop3.b32 %r16458, %r16458, %r30498, %r30496, 0x96; + lop3.b32 %r16459, %r30525, %r30523, %r30501, 0x96; + lop3.b32 %r16459, %r16459, %r30499, %r30497, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16470, %r30520, %r30494, %r30492, 0x96; + lop3.b32 %r16470, %r16470, %r30490, %r30488, 0x96; + lop3.b32 %r16471, %r30521, %r30495, %r30493, 0x96; + lop3.b32 %r16471, %r16471, %r30491, %r30489, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16482, %r30518, %r30486, %r30484, 0x96; + lop3.b32 %r16482, %r16482, %r30482, %r30480, 0x96; + lop3.b32 %r16483, %r30519, %r30487, %r30485, 0x96; + lop3.b32 %r16483, %r16483, %r30483, %r30481, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16494, %r16447, %r16446, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16498, %r16446, %r16447, %r16533; + // end inline asm + xor.b32 %r16672, %r16494, %r16482; + xor.b32 %r16673, %r16498, %r16483; + xor.b32 %r16641, %r30516, %r16672; + xor.b32 %r16644, %r30517, %r16673; + xor.b32 %r16604, %r30513, %r16673; + xor.b32 %r16603, %r30512, 
%r16672; + st.local.v2.u32 [%rd147+104], {%r16603, %r16604}; + // begin inline asm + shf.l.wrap.b32 %r16502, %r16459, %r16458, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16506, %r16458, %r16459, %r16533; + // end inline asm + xor.b32 %r16674, %r16502, %r16434; + xor.b32 %r16675, %r16506, %r16435; + xor.b32 %r16540, %r30526, %r16674; + xor.b32 %r16539, %r30527, %r16675; + xor.b32 %r16579, %r30505, %r16675; + xor.b32 %r16580, %r30504, %r16674; + st.local.v2.u32 [%rd147+152], {%r16580, %r16579}; + // begin inline asm + shf.l.wrap.b32 %r16510, %r16471, %r16470, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16514, %r16470, %r16471, %r16533; + // end inline asm + xor.b32 %r16676, %r16510, %r16446; + xor.b32 %r16677, %r16514, %r16447; + xor.b32 %r16563, %r30501, %r16677; + xor.b32 %r16564, %r30500, %r16676; + st.local.v2.u32 [%rd147+120], {%r16564, %r16563}; + xor.b32 %r16555, %r30497, %r16677; + xor.b32 %r16556, %r30496, %r16676; + st.local.v2.u32 [%rd147+200], {%r16556, %r16555}; + // begin inline asm + shf.l.wrap.b32 %r16518, %r16483, %r16482, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16522, %r16482, %r16483, %r16533; + // end inline asm + xor.b32 %r16678, %r16518, %r16458; + xor.b32 %r16679, %r16522, %r16459; + xor.b32 %r16587, %r30520, %r16678; + xor.b32 %r16588, %r30521, %r16679; + xor.b32 %r16596, %r30491, %r16679; + xor.b32 %r16595, %r30490, %r16678; + st.local.v2.u32 [%rd147+168], {%r16595, %r16596}; + // begin inline asm + shf.l.wrap.b32 %r16526, %r16435, %r16434, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16530, %r16434, %r16435, %r16533; + // end inline asm + xor.b32 %r16680, %r16526, %r16470; + xor.b32 %r16681, %r16530, %r16471; + xor.b32 %r16547, %r30486, %r16680; + xor.b32 %r16548, %r30487, %r16681; + xor.b32 %r16572, %r30481, %r16681; + xor.b32 %r16571, %r30480, %r16680; + st.local.v2.u32 [%rd147+216], {%r16571, %r16572}; + // begin inline asm + shf.l.wrap.b32 %r16534, %r16540, %r16539, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16538, %r16539, %r16540, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16542, %r16548, %r16547, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16546, %r16547, %r16548, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16554, %r16555, %r16556, %r16053; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16550, %r16556, %r16555, %r16053; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r16550, %r16554}; + // begin inline asm + shf.l.wrap.b32 %r16558, %r16564, %r16563, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16562, %r16563, %r16564, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16566, %r16572, %r16571, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16570, %r16571, %r16572, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16578, %r16579, %r16580, %r16157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16574, %r16580, %r16579, %r16157; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r16574, %r16578}; + // begin inline asm + shf.l.wrap.b32 %r16582, %r16588, %r16587, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16586, %r16587, %r16588, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16590, %r16596, %r16595, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16594, %r16595, 
%r16596, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16598, %r16604, %r16603, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16602, %r16603, %r16604, %r16213; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16606, %r16641, %r16534, %r16558, 0xD2; + lop3.b32 %r16607, %r16644, %r16538, %r16562, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16614, %r16534, %r16558, %r16590, 0xD2; + lop3.b32 %r16615, %r16538, %r16562, %r16594, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+32], {%r16614, %r16615}; + // begin inline asm + // chi + lop3.b32 %r16622, %r16558, %r16590, %r16566, 0xD2; + lop3.b32 %r16623, %r16562, %r16594, %r16570, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+40], {%r16622, %r16623}; + // begin inline asm + // chi + lop3.b32 %r16630, %r16590, %r16566, %r16641, 0xD2; + lop3.b32 %r16631, %r16594, %r16570, %r16644, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+48], {%r16630, %r16631}; + // begin inline asm + // chi + lop3.b32 %r16638, %r16566, %r16641, %r16534, 0xD2; + lop3.b32 %r16639, %r16570, %r16644, %r16538, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+56], {%r16638, %r16639}; + // begin inline asm + // chi + lop3.b32 %r16646, %r16582, %r16542, %r16598, 0xD2; + lop3.b32 %r16647, %r16586, %r16546, %r16602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+64], {%r16646, %r16647}; + // begin inline asm + // chi + lop3.b32 %r16654, %r16542, %r16598, %r16574, 0xD2; + lop3.b32 %r16655, %r16546, %r16602, %r16578, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+72], {%r16654, %r16655}; + // begin inline asm + // chi + lop3.b32 %r16662, %r16598, %r16574, %r16550, 0xD2; + lop3.b32 %r16663, %r16602, %r16578, %r16554, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+80], {%r16662, %r16663}; + // begin inline asm + ld.global.nc.v2.u32 {%r16670,%r16671}, [%rd690]; + // end inline asm + xor.b32 %r16682, %r16607, %r16671; + xor.b32 %r16683, %r16606, %r16670; + st.local.v2.u32 [%rd147+24], {%r16683, %r16682}; + mov.b64 %rd1342, {%r16614, %r16615}; + mov.b64 %rd1343, {%r16622, %r16623}; + mov.b64 %rd1346, {%r16646, %r16647}; + mov.b64 %rd1347, {%r16654, %r16655}; + mov.b64 %rd1348, {%r16662, %r16663}; + mov.b64 %rd1341, {%r16683, %r16682}; + mov.b64 %rd1344, {%r16630, %r16631}; + mov.b64 %rd1345, {%r16638, %r16639}; + st.global.u64 [%rd128], %rd1333; + st.global.u64 [%rd128+8], %rd1334; + st.global.u64 [%rd128+16], %rd1335; + st.global.u64 [%rd128+24], %rd1336; + st.global.u64 [%rd128+32], %rd1337; + st.global.u64 [%rd128+40], %rd1338; + st.global.u64 [%rd128+48], %rd1339; + st.global.u64 [%rd128+56], %rd1340; + st.global.v2.u32 [%rd128+64], {%r16683, %r16682}; + st.global.v2.u32 [%rd128+72], {%r16614, %r16615}; + st.global.v2.u32 [%rd128+80], {%r16622, %r16623}; + st.global.v2.u32 [%rd128+88], {%r16630, %r16631}; + st.global.v2.u32 [%rd128+96], {%r16638, %r16639}; + st.global.v2.u32 [%rd128+104], {%r16646, %r16647}; + st.global.v2.u32 [%rd128+112], {%r16654, %r16655}; + st.global.v2.u32 [%rd128+120], {%r16662, %r16663}; + +$L__BB2_61: + cvta.to.global.u64 %rd1266, %rd361; + shl.b32 %r3343, %r46, 1; + mul.wide.u32 %rd870, %r3343, -954391867; + shr.u64 %rd871, %rd870, 32; + cvt.u32.u64 %r19968, %rd871; + sub.s32 %r19969, %r3343, %r19968; + shr.u32 %r19970, %r19969, 1; + add.s32 %r19971, %r19970, %r19968; + shr.u32 %r19972, %r19971, 20; + mul.lo.s32 %r19973, %r19972, 1179641; + sub.s32 %r19974, %r3343, %r19973; + mul.wide.u32 %rd873, %r19974, 64; + add.s64 %rd220, %rd1266, 
%rd873; + or.b32 %r3344, %r3343, 1; + mul.wide.u32 %rd874, %r3344, -954391867; + shr.u64 %rd875, %rd874, 32; + cvt.u32.u64 %r19975, %rd875; + sub.s32 %r19976, %r3344, %r19975; + shr.u32 %r19977, %r19976, 1; + add.s32 %r19978, %r19977, %r19975; + shr.u32 %r19979, %r19978, 20; + mul.lo.s32 %r19980, %r19979, 1179641; + sub.s32 %r19981, %r3344, %r19980; + mul.wide.u32 %rd876, %r19981, 64; + add.s64 %rd221, %rd1266, %rd876; + @%p12 bra $L__BB2_75; + + cvta.to.global.u64 %rd877, %rd360; + mul.wide.u32 %rd878, %r46, 128; + add.s64 %rd222, %rd877, %rd878; + ld.global.u64 %rd1349, [%rd222]; + setp.eq.s64 %p37, %rd1349, 0; + @%p37 bra $L__BB2_64; + + ld.global.u64 %rd1364, [%rd222+120]; + ld.global.u64 %rd1363, [%rd222+112]; + ld.global.u64 %rd1362, [%rd222+104]; + ld.global.u64 %rd1361, [%rd222+96]; + ld.global.u64 %rd1360, [%rd222+88]; + ld.global.u64 %rd1359, [%rd222+80]; + ld.global.u64 %rd1358, [%rd222+72]; + ld.global.u64 %rd1357, [%rd222+64]; + ld.global.u64 %rd1356, [%rd222+56]; + ld.global.u64 %rd1355, [%rd222+48]; + ld.global.u64 %rd1354, [%rd222+40]; + ld.global.u64 %rd1353, [%rd222+32]; + ld.global.u64 %rd1352, [%rd222+24]; + ld.global.u64 %rd1351, [%rd222+16]; + ld.global.u64 %rd1350, [%rd222+8]; + bra.uni $L__BB2_86; + +$L__BB2_75: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd994, 1179641; + st.local.u64 [%rd2+8], %rd994; + st.local.u32 [%rd2+16], %r3343; + ld.global.u64 %rd995, [%rd220]; + ld.global.u64 %rd996, [%rd220+8]; + ld.global.u64 %rd997, [%rd220+16]; + ld.global.u64 %rd998, [%rd220+24]; + ld.global.u64 %rd999, [%rd220+32]; + ld.global.u64 %rd1000, [%rd220+40]; + ld.global.u64 %rd1001, [%rd220+48]; + ld.global.u64 %rd1002, [%rd220+56]; + st.local.u64 [%rd2+24], %rd995; + st.local.u64 [%rd2+32], %rd996; + st.local.u64 [%rd2+40], %rd997; + st.local.u64 [%rd2+48], %rd998; + st.local.u64 [%rd2+56], %rd999; + st.local.u64 [%rd2+64], %rd1000; + st.local.u64 [%rd2+72], %rd1001; + st.local.u64 [%rd2+80], %rd1002; + cvt.u32.u64 %r23308, %rd995; + xor.b32 %r23309, %r3343, %r23308; + st.local.u32 [%rd2+24], %r23309; + mov.u32 %r31005, 0; + st.local.v2.u32 [%rd2+96], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+104], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+112], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+120], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+128], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+136], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+144], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+152], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+160], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+168], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+176], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+184], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+192], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+200], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+208], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+216], {%r31005, %r31005}; + mov.u32 %r31020, -2147483648; + mov.u32 %r23281, 1; + st.local.v2.u32 [%rd2+88], {%r23281, %r31020}; + ld.local.v2.u32 {%r31041, %r31042}, [%rd2+24]; + mov.b64 {%r31039, %r31040}, %rd1000; + shr.u64 %rd1003, %rd996, 32; + cvt.u32.u64 %r31053, %rd996; + cvt.u32.u64 %r31054, %rd1003; + shr.u64 %rd1004, %rd1001, 32; + cvt.u32.u64 %r31051, %rd1001; + cvt.u32.u64 %r31052, %rd1004; + shr.u64 %rd1005, %rd997, 32; + cvt.u32.u64 %r31049, %rd997; + cvt.u32.u64 %r31050, %rd1005; + shr.u64 %rd1006, %rd1002, 32; + cvt.u32.u64 %r31047, %rd1002; + cvt.u32.u64 %r31048, %rd1006; + shr.u64 %rd1007, %rd998, 32; + cvt.u32.u64 %r31045, %rd998; + cvt.u32.u64 %r31046, %rd1007; + shr.u64 
%rd1008, %rd999, 32; + cvt.u32.u64 %r31043, %rd999; + cvt.u32.u64 %r31044, %rd1008; + mov.u32 %r31006, %r31005; + mov.u32 %r31007, %r31005; + mov.u32 %r31008, %r31005; + mov.u32 %r31009, %r31005; + mov.u32 %r31010, %r31005; + mov.u32 %r31011, %r31005; + mov.u32 %r31012, %r31005; + mov.u32 %r31013, %r31005; + mov.u32 %r31014, %r31005; + mov.u32 %r31015, %r31005; + mov.u32 %r31016, %r31005; + mov.u32 %r31017, %r31005; + mov.u32 %r31018, %r31005; + mov.u32 %r31019, %r23281; + mov.u32 %r31021, %r31005; + mov.u32 %r31022, %r31005; + mov.u32 %r31023, %r31005; + mov.u32 %r31024, %r31005; + mov.u32 %r31025, %r31005; + mov.u32 %r31026, %r31005; + mov.u32 %r31027, %r31005; + mov.u32 %r31028, %r31005; + mov.u32 %r31029, %r31005; + mov.u32 %r31030, %r31005; + mov.u32 %r31031, %r31005; + mov.u32 %r31032, %r31005; + mov.u32 %r31033, %r31005; + mov.u32 %r31034, %r31005; + mov.u32 %r31035, %r31005; + mov.u32 %r31036, %r31005; + mov.u32 %r31037, %r31005; + mov.u32 %r31038, %r31005; + mov.u32 %r31055, %r31005; + +$L__BB2_76: + // begin inline asm + // xor5 + lop3.b32 %r23312, %r31041, %r31039, %r31037, 0x96; + lop3.b32 %r23312, %r23312, %r31035, %r31033, 0x96; + lop3.b32 %r23313, %r31042, %r31040, %r31038, 0x96; + lop3.b32 %r23313, %r23313, %r31036, %r31034, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23324, %r31053, %r31051, %r31031, 0x96; + lop3.b32 %r23324, %r23324, %r31029, %r31027, 0x96; + lop3.b32 %r23325, %r31054, %r31052, %r31032, 0x96; + lop3.b32 %r23325, %r23325, %r31030, %r31028, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23336, %r31049, %r31047, %r31025, 0x96; + lop3.b32 %r23336, %r23336, %r31023, %r31021, 0x96; + lop3.b32 %r23337, %r31050, %r31048, %r31026, 0x96; + lop3.b32 %r23337, %r23337, %r31024, %r31022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23348, %r31045, %r31019, %r31017, 0x96; + lop3.b32 %r23348, %r23348, %r31015, %r31013, 0x96; + lop3.b32 %r23349, %r31046, %r31020, %r31018, 0x96; + lop3.b32 %r23349, %r23349, %r31016, %r31014, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23360, %r31043, %r31011, %r31009, 0x96; + lop3.b32 %r23360, %r23360, %r31007, %r31005, 0x96; + lop3.b32 %r23361, %r31044, %r31012, %r31010, 0x96; + lop3.b32 %r23361, %r23361, %r31008, %r31006, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23372, %r23325, %r23324, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23376, %r23324, %r23325, %r23281; + // end inline asm + xor.b32 %r23806, %r23372, %r23360; + xor.b32 %r23807, %r23376, %r23361; + xor.b32 %r23639, %r31041, %r23806; + xor.b32 %r23642, %r31042, %r23807; + xor.b32 %r23546, %r31039, %r23806; + xor.b32 %r23545, %r31040, %r23807; + xor.b32 %r23593, %r31037, %r23806; + xor.b32 %r23594, %r31038, %r23807; + xor.b32 %r23498, %r31035, %r23806; + xor.b32 %r23497, %r31036, %r23807; + xor.b32 %r23449, %r31033, %r23806; + xor.b32 %r23450, %r31034, %r23807; + // begin inline asm + shf.l.wrap.b32 %r23380, %r23337, %r23336, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23384, %r23336, %r23337, %r23281; + // end inline asm + xor.b32 %r23808, %r23380, %r23312; + xor.b32 %r23809, %r23384, %r23313; + xor.b32 %r23601, %r31053, %r23808; + xor.b32 %r23602, %r31054, %r23809; + xor.b32 %r23418, %r31051, %r23808; + xor.b32 %r23417, %r31052, %r23809; + xor.b32 %r23577, %r31031, %r23808; + xor.b32 %r23578, %r31032, %r23809; + xor.b32 %r23538, %r31029, %r23808; + xor.b32 %r23537, %r31030, %r23809; + xor.b32 
%r23521, %r31027, %r23808; + xor.b32 %r23522, %r31028, %r23809; + // begin inline asm + shf.l.wrap.b32 %r23388, %r23349, %r23348, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23392, %r23348, %r23349, %r23281; + // end inline asm + xor.b32 %r23810, %r23388, %r23324; + xor.b32 %r23811, %r23392, %r23325; + xor.b32 %r23458, %r31049, %r23810; + xor.b32 %r23457, %r31050, %r23811; + xor.b32 %r23585, %r31047, %r23810; + xor.b32 %r23586, %r31048, %r23811; + xor.b32 %r23466, %r31025, %r23810; + xor.b32 %r23465, %r31026, %r23811; + xor.b32 %r23569, %r31023, %r23810; + xor.b32 %r23570, %r31024, %r23811; + xor.b32 %r23434, %r31021, %r23810; + xor.b32 %r23433, %r31022, %r23811; + // begin inline asm + shf.l.wrap.b32 %r23396, %r23361, %r23360, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23400, %r23360, %r23361, %r23281; + // end inline asm + xor.b32 %r23812, %r23396, %r23336; + xor.b32 %r23813, %r23400, %r23337; + xor.b32 %r23553, %r31045, %r23812; + xor.b32 %r23554, %r31046, %r23813; + xor.b32 %r23530, %r31019, %r23812; + xor.b32 %r23529, %r31020, %r23813; + xor.b32 %r23473, %r31017, %r23812; + xor.b32 %r23474, %r31018, %r23813; + xor.b32 %r23561, %r31015, %r23812; + xor.b32 %r23562, %r31016, %r23813; + xor.b32 %r23490, %r31013, %r23812; + xor.b32 %r23489, %r31014, %r23813; + // begin inline asm + shf.l.wrap.b32 %r23404, %r23313, %r23312, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23408, %r23312, %r23313, %r23281; + // end inline asm + xor.b32 %r23814, %r23404, %r23348; + xor.b32 %r23815, %r23408, %r23349; + xor.b32 %r23505, %r31043, %r23814; + xor.b32 %r23506, %r31044, %r23815; + xor.b32 %r23425, %r31011, %r23814; + xor.b32 %r23426, %r31012, %r23815; + xor.b32 %r23442, %r31009, %r23814; + xor.b32 %r23441, %r31010, %r23815; + xor.b32 %r23481, %r31007, %r23814; + xor.b32 %r23482, %r31008, %r23815; + xor.b32 %r23513, %r31005, %r23814; + xor.b32 %r23514, %r31006, %r23815; + mov.u32 %r23419, 44; + // begin inline asm + shf.l.wrap.b32 %r23412, %r23418, %r23417, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23416, %r23417, %r23418, %r23419; + // end inline asm + mov.u32 %r23427, 20; + // begin inline asm + shf.l.wrap.b32 %r23420, %r23426, %r23425, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23424, %r23425, %r23426, %r23427; + // end inline asm + mov.u32 %r23435, 61; + // begin inline asm + shf.l.wrap.b32 %r23428, %r23434, %r23433, %r23435; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23432, %r23433, %r23434, %r23435; + // end inline asm + mov.u32 %r23443, 39; + // begin inline asm + shf.l.wrap.b32 %r23436, %r23442, %r23441, %r23443; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23440, %r23441, %r23442, %r23443; + // end inline asm + mov.u32 %r23451, 18; + // begin inline asm + shf.l.wrap.b32 %r23444, %r23450, %r23449, %r23451; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23448, %r23449, %r23450, %r23451; + // end inline asm + mov.u32 %r23459, 62; + // begin inline asm + shf.l.wrap.b32 %r23452, %r23458, %r23457, %r23459; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23456, %r23457, %r23458, %r23459; + // end inline asm + mov.u32 %r23467, 43; + // begin inline asm + shf.l.wrap.b32 %r23460, %r23466, %r23465, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23464, %r23465, %r23466, %r23467; + // end inline asm + mov.u32 %r23475, 25; + // begin inline asm + shf.l.wrap.b32 %r23468, %r23474, %r23473, %r23475; 
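+ // rho: the 24 rotated lanes use the fixed Keccak offsets seen in the mov.u32 constants
+ // (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, 1);
+ // wrap-mode funnel shifts take the count mod 32, and the operand order covers offsets >= 32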
+ // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23472, %r23473, %r23474, %r23475; + // end inline asm + mov.u32 %r23483, 8; + // begin inline asm + shf.l.wrap.b32 %r23476, %r23482, %r23481, %r23483; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23480, %r23481, %r23482, %r23483; + // end inline asm + mov.u32 %r23491, 56; + // begin inline asm + shf.l.wrap.b32 %r23484, %r23490, %r23489, %r23491; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23488, %r23489, %r23490, %r23491; + // end inline asm + mov.u32 %r23499, 41; + // begin inline asm + shf.l.wrap.b32 %r23492, %r23498, %r23497, %r23499; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23496, %r23497, %r23498, %r23499; + // end inline asm + mov.u32 %r23507, 27; + // begin inline asm + shf.l.wrap.b32 %r23500, %r23506, %r23505, %r23507; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23504, %r23505, %r23506, %r23507; + // end inline asm + mov.u32 %r23515, 14; + // begin inline asm + shf.l.wrap.b32 %r23508, %r23514, %r23513, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23512, %r23513, %r23514, %r23515; + // end inline asm + mov.u32 %r23523, 2; + // begin inline asm + shf.l.wrap.b32 %r23516, %r23522, %r23521, %r23523; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23520, %r23521, %r23522, %r23523; + // end inline asm + mov.u32 %r23531, 55; + // begin inline asm + shf.l.wrap.b32 %r23524, %r23530, %r23529, %r23531; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23528, %r23529, %r23530, %r23531; + // end inline asm + mov.u32 %r23539, 45; + // begin inline asm + shf.l.wrap.b32 %r23532, %r23538, %r23537, %r23539; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23536, %r23537, %r23538, %r23539; + // end inline asm + mov.u32 %r23547, 36; + // begin inline asm + shf.l.wrap.b32 %r23540, %r23546, %r23545, %r23547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23544, %r23545, %r23546, %r23547; + // end inline asm + mov.u32 %r23555, 28; + // begin inline asm + shf.l.wrap.b32 %r23548, %r23554, %r23553, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23552, %r23553, %r23554, %r23555; + // end inline asm + mov.u32 %r23563, 21; + // begin inline asm + shf.l.wrap.b32 %r23556, %r23562, %r23561, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23560, %r23561, %r23562, %r23563; + // end inline asm + mov.u32 %r23571, 15; + // begin inline asm + shf.l.wrap.b32 %r23564, %r23570, %r23569, %r23571; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23568, %r23569, %r23570, %r23571; + // end inline asm + mov.u32 %r23579, 10; + // begin inline asm + shf.l.wrap.b32 %r23572, %r23578, %r23577, %r23579; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23576, %r23577, %r23578, %r23579; + // end inline asm + mov.u32 %r23587, 6; + // begin inline asm + shf.l.wrap.b32 %r23580, %r23586, %r23585, %r23587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23584, %r23585, %r23586, %r23587; + // end inline asm + mov.u32 %r23595, 3; + // begin inline asm + shf.l.wrap.b32 %r23588, %r23594, %r23593, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23592, %r23593, %r23594, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23596, %r23602, %r23601, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23600, %r23601, %r23602, %r23281; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23604, 
%r23639, %r23412, %r23460, 0xD2; + lop3.b32 %r23605, %r23642, %r23416, %r23464, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31053, %r23412, %r23460, %r23556, 0xD2; + lop3.b32 %r31054, %r23416, %r23464, %r23560, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31049, %r23460, %r23556, %r23508, 0xD2; + lop3.b32 %r31050, %r23464, %r23560, %r23512, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31045, %r23556, %r23508, %r23639, 0xD2; + lop3.b32 %r31046, %r23560, %r23512, %r23642, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31043, %r23508, %r23639, %r23412, 0xD2; + lop3.b32 %r31044, %r23512, %r23642, %r23416, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31039, %r23548, %r23420, %r23588, 0xD2; + lop3.b32 %r31040, %r23552, %r23424, %r23592, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31051, %r23420, %r23588, %r23532, 0xD2; + lop3.b32 %r31052, %r23424, %r23592, %r23536, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31047, %r23588, %r23532, %r23428, 0xD2; + lop3.b32 %r31048, %r23592, %r23536, %r23432, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31019, %r23532, %r23428, %r23548, 0xD2; + lop3.b32 %r31020, %r23536, %r23432, %r23552, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r31019, %r31020}; + // begin inline asm + // chi + lop3.b32 %r31011, %r23428, %r23548, %r23420, 0xD2; + lop3.b32 %r31012, %r23432, %r23552, %r23424, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r31011, %r31012}; + // begin inline asm + // chi + lop3.b32 %r31037, %r23596, %r23580, %r23468, 0xD2; + lop3.b32 %r31038, %r23600, %r23584, %r23472, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r31037, %r31038}; + // begin inline asm + // chi + lop3.b32 %r31031, %r23580, %r23468, %r23476, 0xD2; + lop3.b32 %r31032, %r23584, %r23472, %r23480, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r31031, %r31032}; + // begin inline asm + // chi + lop3.b32 %r31025, %r23468, %r23476, %r23444, 0xD2; + lop3.b32 %r31026, %r23472, %r23480, %r23448, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r31025, %r31026}; + // begin inline asm + // chi + lop3.b32 %r31017, %r23476, %r23444, %r23596, 0xD2; + lop3.b32 %r31018, %r23480, %r23448, %r23600, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r31017, %r31018}; + // begin inline asm + // chi + lop3.b32 %r31009, %r23444, %r23596, %r23580, 0xD2; + lop3.b32 %r31010, %r23448, %r23600, %r23584, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r31009, %r31010}; + // begin inline asm + // chi + lop3.b32 %r31035, %r23500, %r23540, %r23572, 0xD2; + lop3.b32 %r31036, %r23504, %r23544, %r23576, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r31035, %r31036}; + // begin inline asm + // chi + lop3.b32 %r31029, %r23540, %r23572, %r23564, 0xD2; + lop3.b32 %r31030, %r23544, %r23576, %r23568, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r31029, %r31030}; + // begin inline asm + // chi + lop3.b32 %r31023, %r23572, %r23564, %r23484, 0xD2; + lop3.b32 %r31024, %r23576, %r23568, %r23488, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r31023, %r31024}; + // begin inline asm + // chi + lop3.b32 %r31015, %r23564, %r23484, %r23500, 0xD2; + lop3.b32 %r31016, %r23568, %r23488, %r23504, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r31015, %r31016}; + // begin inline asm + // chi + lop3.b32 %r31007, %r23484, %r23500, 
%r23540, 0xD2; + lop3.b32 %r31008, %r23488, %r23504, %r23544, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r31007, %r31008}; + // begin inline asm + // chi + lop3.b32 %r31033, %r23452, %r23524, %r23436, 0xD2; + lop3.b32 %r31034, %r23456, %r23528, %r23440, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r31033, %r31034}; + // begin inline asm + // chi + lop3.b32 %r31027, %r23524, %r23436, %r23492, 0xD2; + lop3.b32 %r31028, %r23528, %r23440, %r23496, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r31027, %r31028}; + // begin inline asm + // chi + lop3.b32 %r31021, %r23436, %r23492, %r23516, 0xD2; + lop3.b32 %r31022, %r23440, %r23496, %r23520, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r31021, %r31022}; + // begin inline asm + // chi + lop3.b32 %r31013, %r23492, %r23516, %r23452, 0xD2; + lop3.b32 %r31014, %r23496, %r23520, %r23456, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r31013, %r31014}; + // begin inline asm + // chi + lop3.b32 %r31005, %r23516, %r23452, %r23524, 0xD2; + lop3.b32 %r31006, %r23520, %r23456, %r23528, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r31005, %r31006}; + mul.wide.s32 %rd1010, %r31055, 8; + mov.u64 %rd1011, keccak_round_constants; + cvta.const.u64 %rd1012, %rd1011; + add.s64 %rd1009, %rd1012, %rd1010; + // begin inline asm + ld.global.nc.v2.u32 {%r23804,%r23805}, [%rd1009]; + // end inline asm + xor.b32 %r31041, %r23604, %r23804; + xor.b32 %r31042, %r23605, %r23805; + add.s32 %r31055, %r31055, 1; + setp.lt.u32 %p43, %r31055, 23; + @%p43 bra $L__BB2_76; + + add.u64 %rd270, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r31053, %r31054}; + st.local.v2.u32 [%rd2+72], {%r31051, %r31052}; + st.local.v2.u32 [%rd2+40], {%r31049, %r31050}; + st.local.v2.u32 [%rd2+80], {%r31047, %r31048}; + st.local.v2.u32 [%rd2+48], {%r31045, %r31046}; + st.local.v2.u32 [%rd2+56], {%r31043, %r31044}; + st.local.v2.u32 [%rd2+24], {%r31041, %r31042}; + // begin inline asm + // xor5 + lop3.b32 %r23816, %r31041, %r31039, %r31037, 0x96; + lop3.b32 %r23816, %r23816, %r31035, %r31033, 0x96; + lop3.b32 %r23817, %r31042, %r31040, %r31038, 0x96; + lop3.b32 %r23817, %r23817, %r31036, %r31034, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23828, %r31053, %r31051, %r31031, 0x96; + lop3.b32 %r23828, %r23828, %r31029, %r31027, 0x96; + lop3.b32 %r23829, %r31054, %r31052, %r31032, 0x96; + lop3.b32 %r23829, %r23829, %r31030, %r31028, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23840, %r31049, %r31047, %r31025, 0x96; + lop3.b32 %r23840, %r23840, %r31023, %r31021, 0x96; + lop3.b32 %r23841, %r31050, %r31048, %r31026, 0x96; + lop3.b32 %r23841, %r23841, %r31024, %r31022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23852, %r31045, %r31019, %r31017, 0x96; + lop3.b32 %r23852, %r23852, %r31015, %r31013, 0x96; + lop3.b32 %r23853, %r31046, %r31020, %r31018, 0x96; + lop3.b32 %r23853, %r23853, %r31016, %r31014, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23864, %r31043, %r31011, %r31009, 0x96; + lop3.b32 %r23864, %r23864, %r31007, %r31005, 0x96; + lop3.b32 %r23865, %r31044, %r31012, %r31010, 0x96; + lop3.b32 %r23865, %r23865, %r31008, %r31006, 0x96; + // end inline asm + mov.u32 %r31070, 1; + // begin inline asm + shf.l.wrap.b32 %r23876, %r23829, %r23828, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23880, %r23828, %r23829, %r31070; + // end inline asm + xor.b32 %r24095, %r23876, %r23864; + xor.b32 %r24096, %r23880, 
%r23865; + xor.b32 %r24023, %r31041, %r24095; + xor.b32 %r24026, %r31042, %r24096; + xor.b32 %r23986, %r31038, %r24096; + xor.b32 %r23985, %r31037, %r24095; + st.local.v2.u32 [%rd2+104], {%r23985, %r23986}; + // begin inline asm + shf.l.wrap.b32 %r23884, %r23841, %r23840, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23888, %r23840, %r23841, %r31070; + // end inline asm + xor.b32 %r24097, %r23884, %r23816; + xor.b32 %r24098, %r23888, %r23817; + xor.b32 %r23922, %r31051, %r24097; + xor.b32 %r23921, %r31052, %r24098; + xor.b32 %r23961, %r31030, %r24098; + xor.b32 %r23962, %r31029, %r24097; + st.local.v2.u32 [%rd2+152], {%r23962, %r23961}; + // begin inline asm + shf.l.wrap.b32 %r23892, %r23853, %r23852, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23896, %r23852, %r23853, %r31070; + // end inline asm + xor.b32 %r24099, %r23892, %r23828; + xor.b32 %r24100, %r23896, %r23829; + xor.b32 %r23945, %r31026, %r24100; + xor.b32 %r23946, %r31025, %r24099; + st.local.v2.u32 [%rd2+120], {%r23946, %r23945}; + xor.b32 %r23937, %r31022, %r24100; + xor.b32 %r23938, %r31021, %r24099; + st.local.v2.u32 [%rd2+200], {%r23938, %r23937}; + // begin inline asm + shf.l.wrap.b32 %r23900, %r23865, %r23864, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23904, %r23864, %r23865, %r31070; + // end inline asm + xor.b32 %r24101, %r23900, %r23840; + xor.b32 %r24102, %r23904, %r23841; + xor.b32 %r23969, %r31045, %r24101; + xor.b32 %r23970, %r31046, %r24102; + xor.b32 %r23978, %r31016, %r24102; + xor.b32 %r23977, %r31015, %r24101; + st.local.v2.u32 [%rd2+168], {%r23977, %r23978}; + // begin inline asm + shf.l.wrap.b32 %r23908, %r23817, %r23816, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23912, %r23816, %r23817, %r31070; + // end inline asm + xor.b32 %r24103, %r23908, %r23852; + xor.b32 %r24104, %r23912, %r23853; + xor.b32 %r23929, %r31011, %r24103; + xor.b32 %r23930, %r31012, %r24104; + xor.b32 %r23954, %r31006, %r24104; + xor.b32 %r23953, %r31005, %r24103; + st.local.v2.u32 [%rd2+216], {%r23953, %r23954}; + // begin inline asm + shf.l.wrap.b32 %r23916, %r23922, %r23921, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23920, %r23921, %r23922, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23924, %r23930, %r23929, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23928, %r23929, %r23930, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23936, %r23937, %r23938, %r23435; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23932, %r23938, %r23937, %r23435; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r23932, %r23936}; + // begin inline asm + shf.l.wrap.b32 %r23940, %r23946, %r23945, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23944, %r23945, %r23946, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23948, %r23954, %r23953, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23952, %r23953, %r23954, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23960, %r23961, %r23962, %r23539; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23956, %r23962, %r23961, %r23539; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r23956, %r23960}; + // begin inline asm + shf.l.wrap.b32 %r23964, %r23970, %r23969, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23968, %r23969, %r23970, %r23555; + // end inline asm + // begin inline asm 
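+ // pi is implicit here: each rotate below reads the source lane dictated by the
+ // Keccak pi permutation, so no separate lane-shuffle pass is emitted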
+ shf.l.wrap.b32 %r23972, %r23978, %r23977, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23976, %r23977, %r23978, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23980, %r23986, %r23985, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23984, %r23985, %r23986, %r23595; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23988, %r24023, %r23916, %r23940, 0xD2; + lop3.b32 %r23989, %r24026, %r23920, %r23944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31188, %r23916, %r23940, %r23972, 0xD2; + lop3.b32 %r31189, %r23920, %r23944, %r23976, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r31188, %r31189}; + // begin inline asm + // chi + lop3.b32 %r31184, %r23940, %r23972, %r23948, 0xD2; + lop3.b32 %r31185, %r23944, %r23976, %r23952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r31184, %r31185}; + // begin inline asm + // chi + lop3.b32 %r31180, %r23972, %r23948, %r24023, 0xD2; + lop3.b32 %r31181, %r23976, %r23952, %r24026, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r31180, %r31181}; + // begin inline asm + // chi + lop3.b32 %r31178, %r23948, %r24023, %r23916, 0xD2; + lop3.b32 %r31179, %r23952, %r24026, %r23920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r31178, %r31179}; + // begin inline asm + // chi + lop3.b32 %r31174, %r23964, %r23924, %r23980, 0xD2; + lop3.b32 %r31175, %r23968, %r23928, %r23984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r31174, %r31175}; + // begin inline asm + // chi + lop3.b32 %r31186, %r23924, %r23980, %r23956, 0xD2; + lop3.b32 %r31187, %r23928, %r23984, %r23960, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r31186, %r31187}; + // begin inline asm + // chi + lop3.b32 %r31182, %r23980, %r23956, %r23932, 0xD2; + lop3.b32 %r31183, %r23984, %r23960, %r23936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r31182, %r31183}; + add.s64 %rd1013, %rd1012, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r24052,%r24053}, [%rd1013]; + // end inline asm + xor.b32 %r31176, %r23988, %r24052; + xor.b32 %r31177, %r23989, %r24053; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + st.local.u64 [%rd270], %rd361; + mov.u64 %rd1017, 1179641; + st.local.u64 [%rd270+8], %rd1017; + st.local.u32 [%rd270+16], %r3344; + ld.global.u64 %rd1018, [%rd221]; + ld.global.u64 %rd1019, [%rd221+8]; + ld.global.u64 %rd1020, [%rd221+16]; + ld.global.u64 %rd1021, [%rd221+24]; + ld.global.u64 %rd1022, [%rd221+32]; + ld.global.u64 %rd1023, [%rd221+40]; + ld.global.u64 %rd1024, [%rd221+48]; + ld.global.u64 %rd1025, [%rd221+56]; + st.local.u64 [%rd270+32], %rd1019; + st.local.u64 [%rd270+40], %rd1020; + st.local.u64 [%rd270+48], %rd1021; + st.local.u64 [%rd270+56], %rd1022; + st.local.u64 [%rd270+64], %rd1023; + st.local.u64 [%rd270+72], %rd1024; + st.local.u64 [%rd270+80], %rd1025; + cvt.u32.u64 %r24105, %rd1018; + xor.b32 %r24106, %r3344, %r24105; + st.local.u64 [%rd270+24], %rd1018; + st.local.u32 [%rd270+24], %r24106; + mov.u32 %r31056, 0; + st.local.v2.u32 [%rd270+96], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+104], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+112], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+120], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+128], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+136], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+144], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+152], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+160], {%r31056, %r31056}; + st.local.v2.u32 
[%rd270+168], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+176], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+184], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+192], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+200], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+208], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+216], {%r31056, %r31056}; + mov.u32 %r31071, -2147483648; + st.local.v2.u32 [%rd270+88], {%r31070, %r31071}; + ld.local.v2.u32 {%r31092, %r31093}, [%rd270+24]; + mov.b64 {%r31090, %r31091}, %rd1023; + shr.u64 %rd1026, %rd1019, 32; + cvt.u32.u64 %r31104, %rd1019; + cvt.u32.u64 %r31105, %rd1026; + shr.u64 %rd1027, %rd1024, 32; + cvt.u32.u64 %r31102, %rd1024; + cvt.u32.u64 %r31103, %rd1027; + shr.u64 %rd1028, %rd1020, 32; + cvt.u32.u64 %r31100, %rd1020; + cvt.u32.u64 %r31101, %rd1028; + shr.u64 %rd1029, %rd1025, 32; + cvt.u32.u64 %r31098, %rd1025; + cvt.u32.u64 %r31099, %rd1029; + shr.u64 %rd1030, %rd1021, 32; + cvt.u32.u64 %r31096, %rd1021; + cvt.u32.u64 %r31097, %rd1030; + shr.u64 %rd1031, %rd1022, 32; + cvt.u32.u64 %r31094, %rd1022; + cvt.u32.u64 %r31095, %rd1031; + mov.u32 %r31057, %r31056; + mov.u32 %r31058, %r31056; + mov.u32 %r31059, %r31056; + mov.u32 %r31060, %r31056; + mov.u32 %r31061, %r31056; + mov.u32 %r31062, %r31056; + mov.u32 %r31063, %r31056; + mov.u32 %r31064, %r31056; + mov.u32 %r31065, %r31056; + mov.u32 %r31066, %r31056; + mov.u32 %r31067, %r31056; + mov.u32 %r31068, %r31056; + mov.u32 %r31069, %r31056; + mov.u32 %r31072, %r31056; + mov.u32 %r31073, %r31056; + mov.u32 %r31074, %r31056; + mov.u32 %r31075, %r31056; + mov.u32 %r31076, %r31056; + mov.u32 %r31077, %r31056; + mov.u32 %r31078, %r31056; + mov.u32 %r31079, %r31056; + mov.u32 %r31080, %r31056; + mov.u32 %r31081, %r31056; + mov.u32 %r31082, %r31056; + mov.u32 %r31083, %r31056; + mov.u32 %r31084, %r31056; + mov.u32 %r31085, %r31056; + mov.u32 %r31086, %r31056; + mov.u32 %r31087, %r31056; + mov.u32 %r31088, %r31056; + mov.u32 %r31089, %r31056; + mov.u32 %r31106, %r31056; + +$L__BB2_78: + mov.u32 %r29797, 1; + mov.u64 %rd1296, keccak_round_constants; + cvta.const.u64 %rd1295, %rd1296; + // begin inline asm + // xor5 + lop3.b32 %r24109, %r31092, %r31090, %r31088, 0x96; + lop3.b32 %r24109, %r24109, %r31086, %r31084, 0x96; + lop3.b32 %r24110, %r31093, %r31091, %r31089, 0x96; + lop3.b32 %r24110, %r24110, %r31087, %r31085, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24121, %r31104, %r31102, %r31082, 0x96; + lop3.b32 %r24121, %r24121, %r31080, %r31078, 0x96; + lop3.b32 %r24122, %r31105, %r31103, %r31083, 0x96; + lop3.b32 %r24122, %r24122, %r31081, %r31079, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24133, %r31100, %r31098, %r31076, 0x96; + lop3.b32 %r24133, %r24133, %r31074, %r31072, 0x96; + lop3.b32 %r24134, %r31101, %r31099, %r31077, 0x96; + lop3.b32 %r24134, %r24134, %r31075, %r31073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24145, %r31096, %r31070, %r31068, 0x96; + lop3.b32 %r24145, %r24145, %r31066, %r31064, 0x96; + lop3.b32 %r24146, %r31097, %r31071, %r31069, 0x96; + lop3.b32 %r24146, %r24146, %r31067, %r31065, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24157, %r31094, %r31062, %r31060, 0x96; + lop3.b32 %r24157, %r24157, %r31058, %r31056, 0x96; + lop3.b32 %r24158, %r31095, %r31063, %r31061, 0x96; + lop3.b32 %r24158, %r24158, %r31059, %r31057, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24169, %r24122, %r24121, %r29797; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r24173, %r24121, %r24122, %r29797; + // end inline asm + xor.b32 %r24603, %r24169, %r24157; + xor.b32 %r24604, %r24173, %r24158; + xor.b32 %r24436, %r31092, %r24603; + xor.b32 %r24439, %r31093, %r24604; + xor.b32 %r24343, %r31090, %r24603; + xor.b32 %r24342, %r31091, %r24604; + xor.b32 %r24390, %r31088, %r24603; + xor.b32 %r24391, %r31089, %r24604; + xor.b32 %r24295, %r31086, %r24603; + xor.b32 %r24294, %r31087, %r24604; + xor.b32 %r24246, %r31084, %r24603; + xor.b32 %r24247, %r31085, %r24604; + // begin inline asm + shf.l.wrap.b32 %r24177, %r24134, %r24133, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24181, %r24133, %r24134, %r29797; + // end inline asm + xor.b32 %r24605, %r24177, %r24109; + xor.b32 %r24606, %r24181, %r24110; + xor.b32 %r24398, %r31104, %r24605; + xor.b32 %r24399, %r31105, %r24606; + xor.b32 %r24215, %r31102, %r24605; + xor.b32 %r24214, %r31103, %r24606; + xor.b32 %r24374, %r31082, %r24605; + xor.b32 %r24375, %r31083, %r24606; + xor.b32 %r24335, %r31080, %r24605; + xor.b32 %r24334, %r31081, %r24606; + xor.b32 %r24318, %r31078, %r24605; + xor.b32 %r24319, %r31079, %r24606; + // begin inline asm + shf.l.wrap.b32 %r24185, %r24146, %r24145, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24189, %r24145, %r24146, %r29797; + // end inline asm + xor.b32 %r24607, %r24185, %r24121; + xor.b32 %r24608, %r24189, %r24122; + xor.b32 %r24255, %r31100, %r24607; + xor.b32 %r24254, %r31101, %r24608; + xor.b32 %r24382, %r31098, %r24607; + xor.b32 %r24383, %r31099, %r24608; + xor.b32 %r24263, %r31076, %r24607; + xor.b32 %r24262, %r31077, %r24608; + xor.b32 %r24366, %r31074, %r24607; + xor.b32 %r24367, %r31075, %r24608; + xor.b32 %r24231, %r31072, %r24607; + xor.b32 %r24230, %r31073, %r24608; + // begin inline asm + shf.l.wrap.b32 %r24193, %r24158, %r24157, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24197, %r24157, %r24158, %r29797; + // end inline asm + xor.b32 %r24609, %r24193, %r24133; + xor.b32 %r24610, %r24197, %r24134; + xor.b32 %r24350, %r31096, %r24609; + xor.b32 %r24351, %r31097, %r24610; + xor.b32 %r24327, %r31070, %r24609; + xor.b32 %r24326, %r31071, %r24610; + xor.b32 %r24270, %r31068, %r24609; + xor.b32 %r24271, %r31069, %r24610; + xor.b32 %r24358, %r31066, %r24609; + xor.b32 %r24359, %r31067, %r24610; + xor.b32 %r24287, %r31064, %r24609; + xor.b32 %r24286, %r31065, %r24610; + // begin inline asm + shf.l.wrap.b32 %r24201, %r24110, %r24109, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24205, %r24109, %r24110, %r29797; + // end inline asm + xor.b32 %r24611, %r24201, %r24145; + xor.b32 %r24612, %r24205, %r24146; + xor.b32 %r24302, %r31094, %r24611; + xor.b32 %r24303, %r31095, %r24612; + xor.b32 %r24222, %r31062, %r24611; + xor.b32 %r24223, %r31063, %r24612; + xor.b32 %r24239, %r31060, %r24611; + xor.b32 %r24238, %r31061, %r24612; + xor.b32 %r24278, %r31058, %r24611; + xor.b32 %r24279, %r31059, %r24612; + xor.b32 %r24310, %r31056, %r24611; + xor.b32 %r24311, %r31057, %r24612; + mov.u32 %r24216, 44; + // begin inline asm + shf.l.wrap.b32 %r24209, %r24215, %r24214, %r24216; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24213, %r24214, %r24215, %r24216; + // end inline asm + mov.u32 %r24224, 20; + // begin inline asm + shf.l.wrap.b32 %r24217, %r24223, %r24222, %r24224; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24221, %r24222, %r24223, %r24224; + // end inline asm + mov.u32 %r24232, 61; + // begin inline asm + shf.l.wrap.b32 %r24225, 
%r24231, %r24230, %r24232; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24229, %r24230, %r24231, %r24232; + // end inline asm + mov.u32 %r24240, 39; + // begin inline asm + shf.l.wrap.b32 %r24233, %r24239, %r24238, %r24240; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24237, %r24238, %r24239, %r24240; + // end inline asm + mov.u32 %r24248, 18; + // begin inline asm + shf.l.wrap.b32 %r24241, %r24247, %r24246, %r24248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24245, %r24246, %r24247, %r24248; + // end inline asm + mov.u32 %r24256, 62; + // begin inline asm + shf.l.wrap.b32 %r24249, %r24255, %r24254, %r24256; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24253, %r24254, %r24255, %r24256; + // end inline asm + mov.u32 %r24264, 43; + // begin inline asm + shf.l.wrap.b32 %r24257, %r24263, %r24262, %r24264; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24261, %r24262, %r24263, %r24264; + // end inline asm + mov.u32 %r24272, 25; + // begin inline asm + shf.l.wrap.b32 %r24265, %r24271, %r24270, %r24272; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24269, %r24270, %r24271, %r24272; + // end inline asm + mov.u32 %r24280, 8; + // begin inline asm + shf.l.wrap.b32 %r24273, %r24279, %r24278, %r24280; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24277, %r24278, %r24279, %r24280; + // end inline asm + mov.u32 %r24288, 56; + // begin inline asm + shf.l.wrap.b32 %r24281, %r24287, %r24286, %r24288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24285, %r24286, %r24287, %r24288; + // end inline asm + mov.u32 %r24296, 41; + // begin inline asm + shf.l.wrap.b32 %r24289, %r24295, %r24294, %r24296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24293, %r24294, %r24295, %r24296; + // end inline asm + mov.u32 %r24304, 27; + // begin inline asm + shf.l.wrap.b32 %r24297, %r24303, %r24302, %r24304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24301, %r24302, %r24303, %r24304; + // end inline asm + mov.u32 %r24312, 14; + // begin inline asm + shf.l.wrap.b32 %r24305, %r24311, %r24310, %r24312; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24309, %r24310, %r24311, %r24312; + // end inline asm + mov.u32 %r24320, 2; + // begin inline asm + shf.l.wrap.b32 %r24313, %r24319, %r24318, %r24320; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24317, %r24318, %r24319, %r24320; + // end inline asm + mov.u32 %r24328, 55; + // begin inline asm + shf.l.wrap.b32 %r24321, %r24327, %r24326, %r24328; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24325, %r24326, %r24327, %r24328; + // end inline asm + mov.u32 %r24336, 45; + // begin inline asm + shf.l.wrap.b32 %r24329, %r24335, %r24334, %r24336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24333, %r24334, %r24335, %r24336; + // end inline asm + mov.u32 %r24344, 36; + // begin inline asm + shf.l.wrap.b32 %r24337, %r24343, %r24342, %r24344; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24341, %r24342, %r24343, %r24344; + // end inline asm + mov.u32 %r24352, 28; + // begin inline asm + shf.l.wrap.b32 %r24345, %r24351, %r24350, %r24352; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24349, %r24350, %r24351, %r24352; + // end inline asm + mov.u32 %r24360, 21; + // begin inline asm + shf.l.wrap.b32 %r24353, %r24359, %r24358, %r24360; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24357, %r24358, %r24359, %r24360; + // end inline asm + 
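+ // --- annotation (added by hand, not compiler output): the immediates
+ // materialized above and below (44, 20, 61, 39, 18, 62, 43, 25, 8, 56,
+ // 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, plus the rotate-by-1
+ // kept in a register) are the Keccak-f[1600] rho rotation offsets; each
+ // 64-bit lane rotation is emulated with a pair of 32-bit funnel shifts
+ // (shf.l.wrap.b32) over the lane's two halves. ---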
mov.u32 %r24368, 15; + // begin inline asm + shf.l.wrap.b32 %r24361, %r24367, %r24366, %r24368; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24365, %r24366, %r24367, %r24368; + // end inline asm + mov.u32 %r24376, 10; + // begin inline asm + shf.l.wrap.b32 %r24369, %r24375, %r24374, %r24376; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24373, %r24374, %r24375, %r24376; + // end inline asm + mov.u32 %r24384, 6; + // begin inline asm + shf.l.wrap.b32 %r24377, %r24383, %r24382, %r24384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24381, %r24382, %r24383, %r24384; + // end inline asm + mov.u32 %r24392, 3; + // begin inline asm + shf.l.wrap.b32 %r24385, %r24391, %r24390, %r24392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24389, %r24390, %r24391, %r24392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24393, %r24399, %r24398, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24397, %r24398, %r24399, %r29797; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24401, %r24436, %r24209, %r24257, 0xD2; + lop3.b32 %r24402, %r24439, %r24213, %r24261, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31104, %r24209, %r24257, %r24353, 0xD2; + lop3.b32 %r31105, %r24213, %r24261, %r24357, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31100, %r24257, %r24353, %r24305, 0xD2; + lop3.b32 %r31101, %r24261, %r24357, %r24309, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31096, %r24353, %r24305, %r24436, 0xD2; + lop3.b32 %r31097, %r24357, %r24309, %r24439, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31094, %r24305, %r24436, %r24209, 0xD2; + lop3.b32 %r31095, %r24309, %r24439, %r24213, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31090, %r24345, %r24217, %r24385, 0xD2; + lop3.b32 %r31091, %r24349, %r24221, %r24389, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31102, %r24217, %r24385, %r24329, 0xD2; + lop3.b32 %r31103, %r24221, %r24389, %r24333, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31098, %r24385, %r24329, %r24225, 0xD2; + lop3.b32 %r31099, %r24389, %r24333, %r24229, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31070, %r24329, %r24225, %r24345, 0xD2; + lop3.b32 %r31071, %r24333, %r24229, %r24349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r31070, %r31071}; + // begin inline asm + // chi + lop3.b32 %r31062, %r24225, %r24345, %r24217, 0xD2; + lop3.b32 %r31063, %r24229, %r24349, %r24221, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r31062, %r31063}; + // begin inline asm + // chi + lop3.b32 %r31088, %r24393, %r24377, %r24265, 0xD2; + lop3.b32 %r31089, %r24397, %r24381, %r24269, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+104], {%r31088, %r31089}; + // begin inline asm + // chi + lop3.b32 %r31082, %r24377, %r24265, %r24273, 0xD2; + lop3.b32 %r31083, %r24381, %r24269, %r24277, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+112], {%r31082, %r31083}; + // begin inline asm + // chi + lop3.b32 %r31076, %r24265, %r24273, %r24241, 0xD2; + lop3.b32 %r31077, %r24269, %r24277, %r24245, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+120], {%r31076, %r31077}; + // begin inline asm + // chi + lop3.b32 %r31068, %r24273, %r24241, %r24393, 0xD2; + lop3.b32 %r31069, %r24277, %r24245, %r24397, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+128], {%r31068, %r31069}; + // 
begin inline asm + // chi + lop3.b32 %r31060, %r24241, %r24393, %r24377, 0xD2; + lop3.b32 %r31061, %r24245, %r24397, %r24381, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+136], {%r31060, %r31061}; + // begin inline asm + // chi + lop3.b32 %r31086, %r24297, %r24337, %r24369, 0xD2; + lop3.b32 %r31087, %r24301, %r24341, %r24373, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+144], {%r31086, %r31087}; + // begin inline asm + // chi + lop3.b32 %r31080, %r24337, %r24369, %r24361, 0xD2; + lop3.b32 %r31081, %r24341, %r24373, %r24365, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+152], {%r31080, %r31081}; + // begin inline asm + // chi + lop3.b32 %r31074, %r24369, %r24361, %r24281, 0xD2; + lop3.b32 %r31075, %r24373, %r24365, %r24285, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+160], {%r31074, %r31075}; + // begin inline asm + // chi + lop3.b32 %r31066, %r24361, %r24281, %r24297, 0xD2; + lop3.b32 %r31067, %r24365, %r24285, %r24301, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+168], {%r31066, %r31067}; + // begin inline asm + // chi + lop3.b32 %r31058, %r24281, %r24297, %r24337, 0xD2; + lop3.b32 %r31059, %r24285, %r24301, %r24341, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+176], {%r31058, %r31059}; + // begin inline asm + // chi + lop3.b32 %r31084, %r24249, %r24321, %r24233, 0xD2; + lop3.b32 %r31085, %r24253, %r24325, %r24237, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+184], {%r31084, %r31085}; + // begin inline asm + // chi + lop3.b32 %r31078, %r24321, %r24233, %r24289, 0xD2; + lop3.b32 %r31079, %r24325, %r24237, %r24293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+192], {%r31078, %r31079}; + // begin inline asm + // chi + lop3.b32 %r31072, %r24233, %r24289, %r24313, 0xD2; + lop3.b32 %r31073, %r24237, %r24293, %r24317, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+200], {%r31072, %r31073}; + // begin inline asm + // chi + lop3.b32 %r31064, %r24289, %r24313, %r24249, 0xD2; + lop3.b32 %r31065, %r24293, %r24317, %r24253, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+208], {%r31064, %r31065}; + // begin inline asm + // chi + lop3.b32 %r31056, %r24313, %r24249, %r24321, 0xD2; + lop3.b32 %r31057, %r24317, %r24253, %r24325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+216], {%r31056, %r31057}; + mul.wide.s32 %rd1033, %r31106, 8; + add.s64 %rd1032, %rd1295, %rd1033; + // begin inline asm + ld.global.nc.v2.u32 {%r24601,%r24602}, [%rd1032]; + // end inline asm + xor.b32 %r31092, %r24401, %r24601; + xor.b32 %r31093, %r24402, %r24602; + add.s32 %r31106, %r31106, 1; + setp.lt.u32 %p44, %r31106, 23; + @%p44 bra $L__BB2_78; + + mov.u64 %rd1284, keccak_round_constants; + cvta.const.u64 %rd1283, %rd1284; + add.s64 %rd1282, %rd1283, 184; + mov.u32 %r29795, 3; + mov.u32 %r29794, 21; + mov.u32 %r29793, 28; + mov.u32 %r29792, 45; + mov.u32 %r29791, 14; + mov.u32 %r29790, 43; + mov.u32 %r29789, 61; + mov.u32 %r29788, 20; + mov.u32 %r29787, 44; + mov.u32 %r31139, 0; + mov.u32 %r24712, 1; + st.local.v2.u32 [%rd270+32], {%r31104, %r31105}; + st.local.v2.u32 [%rd270+72], {%r31102, %r31103}; + st.local.v2.u32 [%rd270+40], {%r31100, %r31101}; + st.local.v2.u32 [%rd270+80], {%r31098, %r31099}; + st.local.v2.u32 [%rd270+48], {%r31096, %r31097}; + st.local.v2.u32 [%rd270+56], {%r31094, %r31095}; + st.local.v2.u32 [%rd270+24], {%r31092, %r31093}; + // begin inline asm + // xor5 + lop3.b32 %r24613, %r31092, %r31090, %r31088, 0x96; + lop3.b32 %r24613, %r24613, %r31086, %r31084, 0x96; + lop3.b32 %r24614, %r31093, %r31091, %r31089, 0x96; + lop3.b32 %r24614, 
%r24614, %r31087, %r31085, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24625, %r31104, %r31102, %r31082, 0x96; + lop3.b32 %r24625, %r24625, %r31080, %r31078, 0x96; + lop3.b32 %r24626, %r31105, %r31103, %r31083, 0x96; + lop3.b32 %r24626, %r24626, %r31081, %r31079, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24637, %r31100, %r31098, %r31076, 0x96; + lop3.b32 %r24637, %r24637, %r31074, %r31072, 0x96; + lop3.b32 %r24638, %r31101, %r31099, %r31077, 0x96; + lop3.b32 %r24638, %r24638, %r31075, %r31073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24649, %r31096, %r31070, %r31068, 0x96; + lop3.b32 %r24649, %r24649, %r31066, %r31064, 0x96; + lop3.b32 %r24650, %r31097, %r31071, %r31069, 0x96; + lop3.b32 %r24650, %r24650, %r31067, %r31065, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24661, %r31094, %r31062, %r31060, 0x96; + lop3.b32 %r24661, %r24661, %r31058, %r31056, 0x96; + lop3.b32 %r24662, %r31095, %r31063, %r31061, 0x96; + lop3.b32 %r24662, %r24662, %r31059, %r31057, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24673, %r24626, %r24625, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24677, %r24625, %r24626, %r24712; + // end inline asm + xor.b32 %r24852, %r24673, %r24661; + xor.b32 %r24853, %r24677, %r24662; + xor.b32 %r24820, %r31092, %r24852; + xor.b32 %r24823, %r31093, %r24853; + xor.b32 %r24783, %r31089, %r24853; + xor.b32 %r24782, %r31088, %r24852; + st.local.v2.u32 [%rd270+104], {%r24782, %r24783}; + // begin inline asm + shf.l.wrap.b32 %r24681, %r24638, %r24637, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24685, %r24637, %r24638, %r24712; + // end inline asm + xor.b32 %r24854, %r24681, %r24613; + xor.b32 %r24855, %r24685, %r24614; + xor.b32 %r24719, %r31102, %r24854; + xor.b32 %r24718, %r31103, %r24855; + xor.b32 %r24758, %r31081, %r24855; + xor.b32 %r24759, %r31080, %r24854; + st.local.v2.u32 [%rd270+152], {%r24759, %r24758}; + // begin inline asm + shf.l.wrap.b32 %r24689, %r24650, %r24649, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24693, %r24649, %r24650, %r24712; + // end inline asm + xor.b32 %r24856, %r24689, %r24625; + xor.b32 %r24857, %r24693, %r24626; + xor.b32 %r24742, %r31077, %r24857; + xor.b32 %r24743, %r31076, %r24856; + st.local.v2.u32 [%rd270+120], {%r24743, %r24742}; + xor.b32 %r24734, %r31073, %r24857; + xor.b32 %r24735, %r31072, %r24856; + st.local.v2.u32 [%rd270+200], {%r24735, %r24734}; + // begin inline asm + shf.l.wrap.b32 %r24697, %r24662, %r24661, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24701, %r24661, %r24662, %r24712; + // end inline asm + xor.b32 %r24858, %r24697, %r24637; + xor.b32 %r24859, %r24701, %r24638; + xor.b32 %r24766, %r31096, %r24858; + xor.b32 %r24767, %r31097, %r24859; + xor.b32 %r24775, %r31067, %r24859; + xor.b32 %r24774, %r31066, %r24858; + st.local.v2.u32 [%rd270+168], {%r24774, %r24775}; + // begin inline asm + shf.l.wrap.b32 %r24705, %r24614, %r24613, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24709, %r24613, %r24614, %r24712; + // end inline asm + xor.b32 %r24860, %r24705, %r24649; + xor.b32 %r24861, %r24709, %r24650; + xor.b32 %r24726, %r31062, %r24860; + xor.b32 %r24727, %r31063, %r24861; + xor.b32 %r24751, %r31057, %r24861; + xor.b32 %r24750, %r31056, %r24860; + st.local.v2.u32 [%rd270+216], {%r24750, %r24751}; + // begin inline asm + shf.l.wrap.b32 %r24713, %r24719, %r24718, 
%r29787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24717, %r24718, %r24719, %r29787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24721, %r24727, %r24726, %r29788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24725, %r24726, %r24727, %r29788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24733, %r24734, %r24735, %r29789; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24729, %r24735, %r24734, %r29789; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r24729, %r24733}; + // begin inline asm + shf.l.wrap.b32 %r24737, %r24743, %r24742, %r29790; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24741, %r24742, %r24743, %r29790; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24745, %r24751, %r24750, %r29791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24749, %r24750, %r24751, %r29791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24757, %r24758, %r24759, %r29792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24753, %r24759, %r24758, %r29792; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r24753, %r24757}; + // begin inline asm + shf.l.wrap.b32 %r24761, %r24767, %r24766, %r29793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24765, %r24766, %r24767, %r29793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24769, %r24775, %r24774, %r29794; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24773, %r24774, %r24775, %r29794; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24777, %r24783, %r24782, %r29795; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24781, %r24782, %r24783, %r29795; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24785, %r24820, %r24713, %r24737, 0xD2; + lop3.b32 %r24786, %r24823, %r24717, %r24741, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31239, %r24713, %r24737, %r24769, 0xD2; + lop3.b32 %r31240, %r24717, %r24741, %r24773, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+32], {%r31239, %r31240}; + // begin inline asm + // chi + lop3.b32 %r31235, %r24737, %r24769, %r24745, 0xD2; + lop3.b32 %r31236, %r24741, %r24773, %r24749, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+40], {%r31235, %r31236}; + // begin inline asm + // chi + lop3.b32 %r31231, %r24769, %r24745, %r24820, 0xD2; + lop3.b32 %r31232, %r24773, %r24749, %r24823, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+48], {%r31231, %r31232}; + // begin inline asm + // chi + lop3.b32 %r31229, %r24745, %r24820, %r24713, 0xD2; + lop3.b32 %r31230, %r24749, %r24823, %r24717, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+56], {%r31229, %r31230}; + // begin inline asm + // chi + lop3.b32 %r31225, %r24761, %r24721, %r24777, 0xD2; + lop3.b32 %r31226, %r24765, %r24725, %r24781, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+64], {%r31225, %r31226}; + // begin inline asm + // chi + lop3.b32 %r31237, %r24721, %r24777, %r24753, 0xD2; + lop3.b32 %r31238, %r24725, %r24781, %r24757, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+72], {%r31237, %r31238}; + // begin inline asm + // chi + lop3.b32 %r31233, %r24777, %r24753, %r24729, 0xD2; + lop3.b32 %r31234, %r24781, %r24757, %r24733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+80], {%r31233, %r31234}; + // begin inline asm + ld.global.nc.v2.u32 {%r24849,%r24850}, [%rd1282]; + // end inline asm + xor.b32 %r31227, %r24785, %r24849; + xor.b32 %r31228, %r24786, %r24850; + st.local.v2.u32 
[%rd270+24], {%r31227, %r31228}; + add.s64 %rd273, %rd270, 24; + add.s64 %rd274, %rd2, 24; + +$L__BB2_80: + or.b32 %r29796, %r3343, 1; + cvta.to.global.u64 %rd1267, %rd361; + shl.b32 %r24862, %r31139, 2; + cvt.u64.u32 %rd1041, %r24862; + and.b64 %rd1042, %rd1041, 60; + add.s64 %rd1043, %rd274, %rd1042; + xor.b32 %r24863, %r3343, %r31139; + mul.lo.s32 %r24864, %r24863, 16777619; + ld.local.u32 %r24865, [%rd1043]; + xor.b32 %r24866, %r24864, %r24865; + mul.wide.u32 %rd1044, %r24866, -954391867; + shr.u64 %rd1045, %rd1044, 32; + cvt.u32.u64 %r24867, %rd1045; + sub.s32 %r24868, %r24866, %r24867; + shr.u32 %r24869, %r24868, 1; + add.s32 %r24870, %r24869, %r24867; + shr.u32 %r24871, %r24870, 20; + mul.lo.s32 %r24872, %r24871, 1179641; + sub.s32 %r24873, %r24866, %r24872; + mul.wide.u32 %rd1046, %r24873, 64; + add.s64 %rd1047, %rd1267, %rd1046; + mul.lo.s32 %r24874, %r31176, 16777619; + ld.global.u32 %r24875, [%rd1047]; + xor.b32 %r31176, %r24874, %r24875; + mul.lo.s32 %r24876, %r31177, 16777619; + ld.global.u32 %r24877, [%rd1047+4]; + xor.b32 %r31177, %r24876, %r24877; + mul.lo.s32 %r24878, %r31188, 16777619; + ld.global.u32 %r24879, [%rd1047+8]; + mul.lo.s32 %r24880, %r31189, 16777619; + ld.global.u32 %r24881, [%rd1047+12]; + xor.b32 %r24882, %r24880, %r24881; + xor.b32 %r31188, %r24878, %r24879; + mov.b64 %rd1048, {%r31188, %r24882}; + mul.lo.s32 %r24883, %r31184, 16777619; + ld.global.u32 %r24884, [%rd1047+16]; + mul.lo.s32 %r24885, %r31185, 16777619; + ld.global.u32 %r24886, [%rd1047+20]; + xor.b32 %r24887, %r24885, %r24886; + xor.b32 %r31184, %r24883, %r24884; + mov.b64 %rd1049, {%r31184, %r24887}; + mul.lo.s32 %r24888, %r31180, 16777619; + ld.global.u32 %r24889, [%rd1047+24]; + mul.lo.s32 %r24890, %r31181, 16777619; + ld.global.u32 %r24891, [%rd1047+28]; + xor.b32 %r24892, %r24890, %r24891; + xor.b32 %r31180, %r24888, %r24889; + mov.b64 %rd1050, {%r31180, %r24892}; + mul.lo.s32 %r24893, %r31178, 16777619; + ld.global.u32 %r24894, [%rd1047+32]; + mul.lo.s32 %r24895, %r31179, 16777619; + ld.global.u32 %r24896, [%rd1047+36]; + xor.b32 %r24897, %r24895, %r24896; + xor.b32 %r31178, %r24893, %r24894; + mov.b64 %rd1051, {%r31178, %r24897}; + mul.lo.s32 %r24898, %r31174, 16777619; + ld.global.u32 %r24899, [%rd1047+40]; + xor.b32 %r31174, %r24898, %r24899; + mul.lo.s32 %r24900, %r31175, 16777619; + ld.global.u32 %r24901, [%rd1047+44]; + xor.b32 %r31175, %r24900, %r24901; + mul.lo.s32 %r24902, %r31186, 16777619; + ld.global.u32 %r24903, [%rd1047+48]; + mul.lo.s32 %r24904, %r31187, 16777619; + ld.global.u32 %r24905, [%rd1047+52]; + xor.b32 %r24906, %r24904, %r24905; + xor.b32 %r31186, %r24902, %r24903; + mov.b64 %rd1052, {%r31186, %r24906}; + mul.lo.s32 %r24907, %r31182, 16777619; + ld.global.u32 %r24908, [%rd1047+56]; + mul.lo.s32 %r24909, %r31183, 16777619; + ld.global.u32 %r24910, [%rd1047+60]; + xor.b32 %r24911, %r24909, %r24910; + xor.b32 %r31182, %r24907, %r24908; + mov.b64 %rd1053, {%r31182, %r24911}; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + st.local.v2.u32 [%rd2+32], {%r31188, %r24882}; + st.local.v2.u32 [%rd2+40], {%r31184, %r24887}; + st.local.v2.u32 [%rd2+48], {%r31180, %r24892}; + st.local.v2.u32 [%rd2+56], {%r31178, %r24897}; + st.local.v2.u32 [%rd2+64], {%r31174, %r31175}; + st.local.v2.u32 [%rd2+72], {%r31186, %r24906}; + st.local.v2.u32 [%rd2+80], {%r31182, %r24911}; + add.s64 %rd1054, %rd273, %rd1042; + xor.b32 %r24912, %r29796, %r31139; + mul.lo.s32 %r24913, %r24912, 16777619; + ld.local.u32 %r24914, [%rd1054]; + xor.b32 %r24915, %r24913, %r24914; + mul.wide.u32 %rd1055, 
%r24915, -954391867; + shr.u64 %rd1056, %rd1055, 32; + cvt.u32.u64 %r24916, %rd1056; + sub.s32 %r24917, %r24915, %r24916; + shr.u32 %r24918, %r24917, 1; + add.s32 %r24919, %r24918, %r24916; + shr.u32 %r24920, %r24919, 20; + mul.lo.s32 %r24921, %r24920, 1179641; + sub.s32 %r24922, %r24915, %r24921; + mul.wide.u32 %rd1057, %r24922, 64; + add.s64 %rd1058, %rd1267, %rd1057; + mul.lo.s32 %r24923, %r31227, 16777619; + ld.global.u32 %r24924, [%rd1058]; + xor.b32 %r31227, %r24923, %r24924; + mul.lo.s32 %r24925, %r31228, 16777619; + ld.global.u32 %r24926, [%rd1058+4]; + xor.b32 %r31228, %r24925, %r24926; + mul.lo.s32 %r24927, %r31239, 16777619; + ld.global.u32 %r24928, [%rd1058+8]; + mul.lo.s32 %r24929, %r31240, 16777619; + ld.global.u32 %r24930, [%rd1058+12]; + xor.b32 %r24931, %r24929, %r24930; + xor.b32 %r31239, %r24927, %r24928; + mov.b64 %rd1059, {%r31239, %r24931}; + mul.lo.s32 %r24932, %r31235, 16777619; + ld.global.u32 %r24933, [%rd1058+16]; + mul.lo.s32 %r24934, %r31236, 16777619; + ld.global.u32 %r24935, [%rd1058+20]; + xor.b32 %r24936, %r24934, %r24935; + xor.b32 %r31235, %r24932, %r24933; + mov.b64 %rd1060, {%r31235, %r24936}; + mul.lo.s32 %r24937, %r31231, 16777619; + ld.global.u32 %r24938, [%rd1058+24]; + mul.lo.s32 %r24939, %r31232, 16777619; + ld.global.u32 %r24940, [%rd1058+28]; + xor.b32 %r24941, %r24939, %r24940; + xor.b32 %r31231, %r24937, %r24938; + mov.b64 %rd1061, {%r31231, %r24941}; + mul.lo.s32 %r24942, %r31229, 16777619; + ld.global.u32 %r24943, [%rd1058+32]; + mul.lo.s32 %r24944, %r31230, 16777619; + ld.global.u32 %r24945, [%rd1058+36]; + xor.b32 %r24946, %r24944, %r24945; + xor.b32 %r31229, %r24942, %r24943; + mov.b64 %rd1062, {%r31229, %r24946}; + mul.lo.s32 %r24947, %r31225, 16777619; + ld.global.u32 %r24948, [%rd1058+40]; + xor.b32 %r31225, %r24947, %r24948; + mul.lo.s32 %r24949, %r31226, 16777619; + ld.global.u32 %r24950, [%rd1058+44]; + xor.b32 %r31226, %r24949, %r24950; + mul.lo.s32 %r24951, %r31237, 16777619; + ld.global.u32 %r24952, [%rd1058+48]; + mul.lo.s32 %r24953, %r31238, 16777619; + ld.global.u32 %r24954, [%rd1058+52]; + xor.b32 %r24955, %r24953, %r24954; + xor.b32 %r31237, %r24951, %r24952; + mov.b64 %rd1063, {%r31237, %r24955}; + mul.lo.s32 %r24956, %r31233, 16777619; + ld.global.u32 %r24957, [%rd1058+56]; + mul.lo.s32 %r24958, %r31234, 16777619; + ld.global.u32 %r24959, [%rd1058+60]; + xor.b32 %r24960, %r24958, %r24959; + xor.b32 %r31233, %r24956, %r24957; + mov.b64 %rd1064, {%r31233, %r24960}; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + st.local.v2.u32 [%rd270+32], {%r31239, %r24931}; + st.local.v2.u32 [%rd270+40], {%r31235, %r24936}; + st.local.v2.u32 [%rd270+48], {%r31231, %r24941}; + st.local.v2.u32 [%rd270+56], {%r31229, %r24946}; + st.local.v2.u32 [%rd270+64], {%r31225, %r31226}; + st.local.v2.u32 [%rd270+72], {%r31237, %r24955}; + st.local.v2.u32 [%rd270+80], {%r31233, %r24960}; + add.s32 %r31139, %r31139, 1; + setp.lt.u32 %p45, %r31139, 512; + shr.u64 %rd1065, %rd1048, 32; + cvt.u32.u64 %r31189, %rd1065; + shr.u64 %rd1066, %rd1049, 32; + cvt.u32.u64 %r31185, %rd1066; + shr.u64 %rd1067, %rd1050, 32; + cvt.u32.u64 %r31181, %rd1067; + shr.u64 %rd1068, %rd1051, 32; + cvt.u32.u64 %r31179, %rd1068; + shr.u64 %rd1069, %rd1052, 32; + cvt.u32.u64 %r31187, %rd1069; + shr.u64 %rd1070, %rd1053, 32; + cvt.u32.u64 %r31183, %rd1070; + shr.u64 %rd1071, %rd1059, 32; + cvt.u32.u64 %r31240, %rd1071; + shr.u64 %rd1072, %rd1060, 32; + cvt.u32.u64 %r31236, %rd1072; + shr.u64 %rd1073, %rd1061, 32; + cvt.u32.u64 %r31232, %rd1073; + shr.u64 %rd1074, %rd1062, 32; 
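+ // --- annotation (added by hand): this is the dataset-mix loop
+ // ($L__BB2_80, 512 iterations). Each pass mixes one 64-byte row into the
+ // state with an FNV-style step, (h * 16777619) ^ word, and derives the
+ // row index as (mix mod 1179641): the mul.wide.u32 by the magic constant
+ // -954391867 plus the shr/add/shr sequence is a reciprocal-multiplication
+ // division by 1179641, and the mul.lo/sub folds the quotient back into a
+ // remainder. The shr.u64/cvt.u32.u64 pairs here unpack the packed 64-bit
+ // results into the 32-bit register halves carried into the next
+ // iteration. ---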
+ cvt.u32.u64 %r31230, %rd1074; + shr.u64 %rd1075, %rd1063, 32; + cvt.u32.u64 %r31238, %rd1075; + shr.u64 %rd1076, %rd1064, 32; + cvt.u32.u64 %r31234, %rd1076; + @%p45 bra $L__BB2_80; + + mov.u32 %r31140, 0; + st.local.v2.u32 [%rd2+96], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+104], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+112], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+120], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+128], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+136], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+144], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+152], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+160], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+168], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+176], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+184], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+192], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+200], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+208], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+216], {%r31140, %r31140}; + mov.u32 %r31155, -2147483648; + mov.u32 %r31154, 1; + st.local.v2.u32 [%rd2+88], {%r31154, %r31155}; + mov.u32 %r31141, %r31140; + mov.u32 %r31142, %r31140; + mov.u32 %r31143, %r31140; + mov.u32 %r31144, %r31140; + mov.u32 %r31145, %r31140; + mov.u32 %r31146, %r31140; + mov.u32 %r31147, %r31140; + mov.u32 %r31148, %r31140; + mov.u32 %r31149, %r31140; + mov.u32 %r31150, %r31140; + mov.u32 %r31151, %r31140; + mov.u32 %r31152, %r31140; + mov.u32 %r31153, %r31140; + mov.u32 %r31156, %r31140; + mov.u32 %r31157, %r31140; + mov.u32 %r31158, %r31140; + mov.u32 %r31159, %r31140; + mov.u32 %r31160, %r31140; + mov.u32 %r31161, %r31140; + mov.u32 %r31162, %r31140; + mov.u32 %r31163, %r31140; + mov.u32 %r31164, %r31140; + mov.u32 %r31165, %r31140; + mov.u32 %r31166, %r31140; + mov.u32 %r31167, %r31140; + mov.u32 %r31168, %r31140; + mov.u32 %r31169, %r31140; + mov.u32 %r31170, %r31140; + mov.u32 %r31171, %r31140; + mov.u32 %r31172, %r31140; + mov.u32 %r31173, %r31140; + mov.u32 %r31190, %r31140; + +$L__BB2_82: + mov.u32 %r29807, 1; + mov.u64 %rd1286, keccak_round_constants; + cvta.const.u64 %rd1285, %rd1286; + // begin inline asm + // xor5 + lop3.b32 %r25002, %r31176, %r31174, %r31172, 0x96; + lop3.b32 %r25002, %r25002, %r31170, %r31168, 0x96; + lop3.b32 %r25003, %r31177, %r31175, %r31173, 0x96; + lop3.b32 %r25003, %r25003, %r31171, %r31169, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25014, %r31188, %r31186, %r31166, 0x96; + lop3.b32 %r25014, %r25014, %r31164, %r31162, 0x96; + lop3.b32 %r25015, %r31189, %r31187, %r31167, 0x96; + lop3.b32 %r25015, %r25015, %r31165, %r31163, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25026, %r31184, %r31182, %r31160, 0x96; + lop3.b32 %r25026, %r25026, %r31158, %r31156, 0x96; + lop3.b32 %r25027, %r31185, %r31183, %r31161, 0x96; + lop3.b32 %r25027, %r25027, %r31159, %r31157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25038, %r31180, %r31154, %r31152, 0x96; + lop3.b32 %r25038, %r25038, %r31150, %r31148, 0x96; + lop3.b32 %r25039, %r31181, %r31155, %r31153, 0x96; + lop3.b32 %r25039, %r25039, %r31151, %r31149, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25050, %r31178, %r31146, %r31144, 0x96; + lop3.b32 %r25050, %r25050, %r31142, %r31140, 0x96; + lop3.b32 %r25051, %r31179, %r31147, %r31145, 0x96; + lop3.b32 %r25051, %r25051, %r31143, %r31141, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25062, %r25015, %r25014, %r29807; + // end inline asm + // begin inline 
asm + shf.l.wrap.b32 %r25066, %r25014, %r25015, %r29807; + // end inline asm + xor.b32 %r25496, %r25062, %r25050; + xor.b32 %r25497, %r25066, %r25051; + xor.b32 %r25329, %r31176, %r25496; + xor.b32 %r25332, %r31177, %r25497; + xor.b32 %r25236, %r31174, %r25496; + xor.b32 %r25235, %r31175, %r25497; + xor.b32 %r25283, %r31172, %r25496; + xor.b32 %r25284, %r31173, %r25497; + xor.b32 %r25188, %r31170, %r25496; + xor.b32 %r25187, %r31171, %r25497; + xor.b32 %r25139, %r31168, %r25496; + xor.b32 %r25140, %r31169, %r25497; + // begin inline asm + shf.l.wrap.b32 %r25070, %r25027, %r25026, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25074, %r25026, %r25027, %r29807; + // end inline asm + xor.b32 %r25498, %r25070, %r25002; + xor.b32 %r25499, %r25074, %r25003; + xor.b32 %r25291, %r31188, %r25498; + xor.b32 %r25292, %r31189, %r25499; + xor.b32 %r25108, %r31186, %r25498; + xor.b32 %r25107, %r31187, %r25499; + xor.b32 %r25267, %r31166, %r25498; + xor.b32 %r25268, %r31167, %r25499; + xor.b32 %r25228, %r31164, %r25498; + xor.b32 %r25227, %r31165, %r25499; + xor.b32 %r25211, %r31162, %r25498; + xor.b32 %r25212, %r31163, %r25499; + // begin inline asm + shf.l.wrap.b32 %r25078, %r25039, %r25038, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25082, %r25038, %r25039, %r29807; + // end inline asm + xor.b32 %r25500, %r25078, %r25014; + xor.b32 %r25501, %r25082, %r25015; + xor.b32 %r25148, %r31184, %r25500; + xor.b32 %r25147, %r31185, %r25501; + xor.b32 %r25275, %r31182, %r25500; + xor.b32 %r25276, %r31183, %r25501; + xor.b32 %r25156, %r31160, %r25500; + xor.b32 %r25155, %r31161, %r25501; + xor.b32 %r25259, %r31158, %r25500; + xor.b32 %r25260, %r31159, %r25501; + xor.b32 %r25124, %r31156, %r25500; + xor.b32 %r25123, %r31157, %r25501; + // begin inline asm + shf.l.wrap.b32 %r25086, %r25051, %r25050, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25090, %r25050, %r25051, %r29807; + // end inline asm + xor.b32 %r25502, %r25086, %r25026; + xor.b32 %r25503, %r25090, %r25027; + xor.b32 %r25243, %r31180, %r25502; + xor.b32 %r25244, %r31181, %r25503; + xor.b32 %r25220, %r31154, %r25502; + xor.b32 %r25219, %r31155, %r25503; + xor.b32 %r25163, %r31152, %r25502; + xor.b32 %r25164, %r31153, %r25503; + xor.b32 %r25251, %r31150, %r25502; + xor.b32 %r25252, %r31151, %r25503; + xor.b32 %r25180, %r31148, %r25502; + xor.b32 %r25179, %r31149, %r25503; + // begin inline asm + shf.l.wrap.b32 %r25094, %r25003, %r25002, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25098, %r25002, %r25003, %r29807; + // end inline asm + xor.b32 %r25504, %r25094, %r25038; + xor.b32 %r25505, %r25098, %r25039; + xor.b32 %r25195, %r31178, %r25504; + xor.b32 %r25196, %r31179, %r25505; + xor.b32 %r25115, %r31146, %r25504; + xor.b32 %r25116, %r31147, %r25505; + xor.b32 %r25132, %r31144, %r25504; + xor.b32 %r25131, %r31145, %r25505; + xor.b32 %r25171, %r31142, %r25504; + xor.b32 %r25172, %r31143, %r25505; + xor.b32 %r25203, %r31140, %r25504; + xor.b32 %r25204, %r31141, %r25505; + mov.u32 %r25109, 44; + // begin inline asm + shf.l.wrap.b32 %r25102, %r25108, %r25107, %r25109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25106, %r25107, %r25108, %r25109; + // end inline asm + mov.u32 %r25117, 20; + // begin inline asm + shf.l.wrap.b32 %r25110, %r25116, %r25115, %r25117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25114, %r25115, %r25116, %r25117; + // end inline asm + mov.u32 %r25125, 61; + // begin inline asm + shf.l.wrap.b32 %r25118, 
%r25124, %r25123, %r25125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25122, %r25123, %r25124, %r25125; + // end inline asm + mov.u32 %r25133, 39; + // begin inline asm + shf.l.wrap.b32 %r25126, %r25132, %r25131, %r25133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25130, %r25131, %r25132, %r25133; + // end inline asm + mov.u32 %r25141, 18; + // begin inline asm + shf.l.wrap.b32 %r25134, %r25140, %r25139, %r25141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25138, %r25139, %r25140, %r25141; + // end inline asm + mov.u32 %r25149, 62; + // begin inline asm + shf.l.wrap.b32 %r25142, %r25148, %r25147, %r25149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25146, %r25147, %r25148, %r25149; + // end inline asm + mov.u32 %r25157, 43; + // begin inline asm + shf.l.wrap.b32 %r25150, %r25156, %r25155, %r25157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25154, %r25155, %r25156, %r25157; + // end inline asm + mov.u32 %r25165, 25; + // begin inline asm + shf.l.wrap.b32 %r25158, %r25164, %r25163, %r25165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25162, %r25163, %r25164, %r25165; + // end inline asm + mov.u32 %r25173, 8; + // begin inline asm + shf.l.wrap.b32 %r25166, %r25172, %r25171, %r25173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25170, %r25171, %r25172, %r25173; + // end inline asm + mov.u32 %r25181, 56; + // begin inline asm + shf.l.wrap.b32 %r25174, %r25180, %r25179, %r25181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25178, %r25179, %r25180, %r25181; + // end inline asm + mov.u32 %r25189, 41; + // begin inline asm + shf.l.wrap.b32 %r25182, %r25188, %r25187, %r25189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25186, %r25187, %r25188, %r25189; + // end inline asm + mov.u32 %r25197, 27; + // begin inline asm + shf.l.wrap.b32 %r25190, %r25196, %r25195, %r25197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25194, %r25195, %r25196, %r25197; + // end inline asm + mov.u32 %r25205, 14; + // begin inline asm + shf.l.wrap.b32 %r25198, %r25204, %r25203, %r25205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25202, %r25203, %r25204, %r25205; + // end inline asm + mov.u32 %r25213, 2; + // begin inline asm + shf.l.wrap.b32 %r25206, %r25212, %r25211, %r25213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25210, %r25211, %r25212, %r25213; + // end inline asm + mov.u32 %r25221, 55; + // begin inline asm + shf.l.wrap.b32 %r25214, %r25220, %r25219, %r25221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25218, %r25219, %r25220, %r25221; + // end inline asm + mov.u32 %r25229, 45; + // begin inline asm + shf.l.wrap.b32 %r25222, %r25228, %r25227, %r25229; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25226, %r25227, %r25228, %r25229; + // end inline asm + mov.u32 %r25237, 36; + // begin inline asm + shf.l.wrap.b32 %r25230, %r25236, %r25235, %r25237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25234, %r25235, %r25236, %r25237; + // end inline asm + mov.u32 %r25245, 28; + // begin inline asm + shf.l.wrap.b32 %r25238, %r25244, %r25243, %r25245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25242, %r25243, %r25244, %r25245; + // end inline asm + mov.u32 %r25253, 21; + // begin inline asm + shf.l.wrap.b32 %r25246, %r25252, %r25251, %r25253; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25250, %r25251, %r25252, %r25253; + // end inline asm + 
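+ // --- annotation (added by hand): decoding the lop3.b32 immediates used
+ // throughout: 0x96 is a 3-input XOR (two chained applications give the
+ // 5-way column parity of Keccak theta, cf. the "// xor5" markers), and
+ // 0xD2 computes a ^ (~b & c), the Keccak chi step (cf. the "// chi"
+ // markers). ---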
mov.u32 %r25261, 15; + // begin inline asm + shf.l.wrap.b32 %r25254, %r25260, %r25259, %r25261; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25258, %r25259, %r25260, %r25261; + // end inline asm + mov.u32 %r25269, 10; + // begin inline asm + shf.l.wrap.b32 %r25262, %r25268, %r25267, %r25269; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25266, %r25267, %r25268, %r25269; + // end inline asm + mov.u32 %r25277, 6; + // begin inline asm + shf.l.wrap.b32 %r25270, %r25276, %r25275, %r25277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25274, %r25275, %r25276, %r25277; + // end inline asm + mov.u32 %r25285, 3; + // begin inline asm + shf.l.wrap.b32 %r25278, %r25284, %r25283, %r25285; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25282, %r25283, %r25284, %r25285; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25286, %r25292, %r25291, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25290, %r25291, %r25292, %r29807; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25294, %r25329, %r25102, %r25150, 0xD2; + lop3.b32 %r25295, %r25332, %r25106, %r25154, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31188, %r25102, %r25150, %r25246, 0xD2; + lop3.b32 %r31189, %r25106, %r25154, %r25250, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31184, %r25150, %r25246, %r25198, 0xD2; + lop3.b32 %r31185, %r25154, %r25250, %r25202, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31180, %r25246, %r25198, %r25329, 0xD2; + lop3.b32 %r31181, %r25250, %r25202, %r25332, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31178, %r25198, %r25329, %r25102, 0xD2; + lop3.b32 %r31179, %r25202, %r25332, %r25106, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31174, %r25238, %r25110, %r25278, 0xD2; + lop3.b32 %r31175, %r25242, %r25114, %r25282, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31186, %r25110, %r25278, %r25222, 0xD2; + lop3.b32 %r31187, %r25114, %r25282, %r25226, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31182, %r25278, %r25222, %r25118, 0xD2; + lop3.b32 %r31183, %r25282, %r25226, %r25122, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31154, %r25222, %r25118, %r25238, 0xD2; + lop3.b32 %r31155, %r25226, %r25122, %r25242, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r31154, %r31155}; + // begin inline asm + // chi + lop3.b32 %r31146, %r25118, %r25238, %r25110, 0xD2; + lop3.b32 %r31147, %r25122, %r25242, %r25114, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r31146, %r31147}; + // begin inline asm + // chi + lop3.b32 %r31172, %r25286, %r25270, %r25158, 0xD2; + lop3.b32 %r31173, %r25290, %r25274, %r25162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r31172, %r31173}; + // begin inline asm + // chi + lop3.b32 %r31166, %r25270, %r25158, %r25166, 0xD2; + lop3.b32 %r31167, %r25274, %r25162, %r25170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r31166, %r31167}; + // begin inline asm + // chi + lop3.b32 %r31160, %r25158, %r25166, %r25134, 0xD2; + lop3.b32 %r31161, %r25162, %r25170, %r25138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r31160, %r31161}; + // begin inline asm + // chi + lop3.b32 %r31152, %r25166, %r25134, %r25286, 0xD2; + lop3.b32 %r31153, %r25170, %r25138, %r25290, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r31152, %r31153}; + // begin inline 
asm + // chi + lop3.b32 %r31144, %r25134, %r25286, %r25270, 0xD2; + lop3.b32 %r31145, %r25138, %r25290, %r25274, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r31144, %r31145}; + // begin inline asm + // chi + lop3.b32 %r31170, %r25190, %r25230, %r25262, 0xD2; + lop3.b32 %r31171, %r25194, %r25234, %r25266, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r31170, %r31171}; + // begin inline asm + // chi + lop3.b32 %r31164, %r25230, %r25262, %r25254, 0xD2; + lop3.b32 %r31165, %r25234, %r25266, %r25258, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r31164, %r31165}; + // begin inline asm + // chi + lop3.b32 %r31158, %r25262, %r25254, %r25174, 0xD2; + lop3.b32 %r31159, %r25266, %r25258, %r25178, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r31158, %r31159}; + // begin inline asm + // chi + lop3.b32 %r31150, %r25254, %r25174, %r25190, 0xD2; + lop3.b32 %r31151, %r25258, %r25178, %r25194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r31150, %r31151}; + // begin inline asm + // chi + lop3.b32 %r31142, %r25174, %r25190, %r25230, 0xD2; + lop3.b32 %r31143, %r25178, %r25194, %r25234, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r31142, %r31143}; + // begin inline asm + // chi + lop3.b32 %r31168, %r25142, %r25214, %r25126, 0xD2; + lop3.b32 %r31169, %r25146, %r25218, %r25130, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r31168, %r31169}; + // begin inline asm + // chi + lop3.b32 %r31162, %r25214, %r25126, %r25182, 0xD2; + lop3.b32 %r31163, %r25218, %r25130, %r25186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r31162, %r31163}; + // begin inline asm + // chi + lop3.b32 %r31156, %r25126, %r25182, %r25206, 0xD2; + lop3.b32 %r31157, %r25130, %r25186, %r25210, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r31156, %r31157}; + // begin inline asm + // chi + lop3.b32 %r31148, %r25182, %r25206, %r25142, 0xD2; + lop3.b32 %r31149, %r25186, %r25210, %r25146, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r31148, %r31149}; + // begin inline asm + // chi + lop3.b32 %r31140, %r25206, %r25142, %r25214, 0xD2; + lop3.b32 %r31141, %r25210, %r25146, %r25218, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r31140, %r31141}; + mul.wide.s32 %rd1080, %r31190, 8; + add.s64 %rd1079, %rd1285, %rd1080; + // begin inline asm + ld.global.nc.v2.u32 {%r25494,%r25495}, [%rd1079]; + // end inline asm + xor.b32 %r31176, %r25294, %r25494; + xor.b32 %r31177, %r25295, %r25495; + add.s32 %r31190, %r31190, 1; + setp.lt.u32 %p46, %r31190, 23; + @%p46 bra $L__BB2_82; + + mov.u32 %r29806, 3; + mov.u32 %r29805, 21; + mov.u32 %r29804, 28; + mov.u32 %r29803, 45; + mov.u32 %r29802, 14; + mov.u32 %r29801, 43; + mov.u32 %r29800, 61; + mov.u32 %r29799, 20; + mov.u32 %r29798, 44; + mov.u64 %rd1289, keccak_round_constants; + cvta.const.u64 %rd1288, %rd1289; + add.s64 %rd1287, %rd1288, 184; + st.local.v2.u32 [%rd2+32], {%r31188, %r31189}; + st.local.v2.u32 [%rd2+72], {%r31186, %r31187}; + st.local.v2.u32 [%rd2+40], {%r31184, %r31185}; + st.local.v2.u32 [%rd2+80], {%r31182, %r31183}; + st.local.v2.u32 [%rd2+48], {%r31180, %r31181}; + st.local.v2.u32 [%rd2+56], {%r31178, %r31179}; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + // begin inline asm + // xor5 + lop3.b32 %r25506, %r31176, %r31174, %r31172, 0x96; + lop3.b32 %r25506, %r25506, %r31170, %r31168, 0x96; + lop3.b32 %r25507, %r31177, %r31175, %r31173, 0x96; + lop3.b32 %r25507, %r25507, %r31171, %r31169, 0x96; + // end inline asm + // begin inline asm + // xor5 + 
lop3.b32 %r25518, %r31188, %r31186, %r31166, 0x96; + lop3.b32 %r25518, %r25518, %r31164, %r31162, 0x96; + lop3.b32 %r25519, %r31189, %r31187, %r31167, 0x96; + lop3.b32 %r25519, %r25519, %r31165, %r31163, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25530, %r31184, %r31182, %r31160, 0x96; + lop3.b32 %r25530, %r25530, %r31158, %r31156, 0x96; + lop3.b32 %r25531, %r31185, %r31183, %r31161, 0x96; + lop3.b32 %r25531, %r25531, %r31159, %r31157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25542, %r31180, %r31154, %r31152, 0x96; + lop3.b32 %r25542, %r25542, %r31150, %r31148, 0x96; + lop3.b32 %r25543, %r31181, %r31155, %r31153, 0x96; + lop3.b32 %r25543, %r25543, %r31151, %r31149, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25554, %r31178, %r31146, %r31144, 0x96; + lop3.b32 %r25554, %r25554, %r31142, %r31140, 0x96; + lop3.b32 %r25555, %r31179, %r31147, %r31145, 0x96; + lop3.b32 %r25555, %r25555, %r31143, %r31141, 0x96; + // end inline asm + mov.u32 %r31205, 1; + // begin inline asm + shf.l.wrap.b32 %r25566, %r25519, %r25518, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25570, %r25518, %r25519, %r31205; + // end inline asm + xor.b32 %r25785, %r25566, %r25554; + xor.b32 %r25786, %r25570, %r25555; + xor.b32 %r25713, %r31176, %r25785; + xor.b32 %r25716, %r31177, %r25786; + xor.b32 %r25676, %r31173, %r25786; + xor.b32 %r25675, %r31172, %r25785; + st.local.v2.u32 [%rd2+104], {%r25675, %r25676}; + // begin inline asm + shf.l.wrap.b32 %r25574, %r25531, %r25530, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25578, %r25530, %r25531, %r31205; + // end inline asm + xor.b32 %r25787, %r25574, %r25506; + xor.b32 %r25788, %r25578, %r25507; + xor.b32 %r25612, %r31186, %r25787; + xor.b32 %r25611, %r31187, %r25788; + xor.b32 %r25651, %r31165, %r25788; + xor.b32 %r25652, %r31164, %r25787; + st.local.v2.u32 [%rd2+152], {%r25652, %r25651}; + // begin inline asm + shf.l.wrap.b32 %r25582, %r25543, %r25542, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25586, %r25542, %r25543, %r31205; + // end inline asm + xor.b32 %r25789, %r25582, %r25518; + xor.b32 %r25790, %r25586, %r25519; + xor.b32 %r25635, %r31161, %r25790; + xor.b32 %r25636, %r31160, %r25789; + st.local.v2.u32 [%rd2+120], {%r25636, %r25635}; + xor.b32 %r25627, %r31157, %r25790; + xor.b32 %r25628, %r31156, %r25789; + st.local.v2.u32 [%rd2+200], {%r25628, %r25627}; + // begin inline asm + shf.l.wrap.b32 %r25590, %r25555, %r25554, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25594, %r25554, %r25555, %r31205; + // end inline asm + xor.b32 %r25791, %r25590, %r25530; + xor.b32 %r25792, %r25594, %r25531; + xor.b32 %r25659, %r31180, %r25791; + xor.b32 %r25660, %r31181, %r25792; + xor.b32 %r25668, %r31151, %r25792; + xor.b32 %r25667, %r31150, %r25791; + st.local.v2.u32 [%rd2+168], {%r25667, %r25668}; + // begin inline asm + shf.l.wrap.b32 %r25598, %r25507, %r25506, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25602, %r25506, %r25507, %r31205; + // end inline asm + xor.b32 %r25793, %r25598, %r25542; + xor.b32 %r25794, %r25602, %r25543; + xor.b32 %r25619, %r31146, %r25793; + xor.b32 %r25620, %r31147, %r25794; + xor.b32 %r25644, %r31141, %r25794; + xor.b32 %r25643, %r31140, %r25793; + st.local.v2.u32 [%rd2+216], {%r25643, %r25644}; + // begin inline asm + shf.l.wrap.b32 %r25606, %r25612, %r25611, %r29798; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25610, 
%r25611, %r25612, %r29798; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25614, %r25620, %r25619, %r29799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25618, %r25619, %r25620, %r29799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25626, %r25627, %r25628, %r29800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25622, %r25628, %r25627, %r29800; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r25622, %r25626}; + // begin inline asm + shf.l.wrap.b32 %r25630, %r25636, %r25635, %r29801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25634, %r25635, %r25636, %r29801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25638, %r25644, %r25643, %r29802; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25642, %r25643, %r25644, %r29802; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25650, %r25651, %r25652, %r29803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25646, %r25652, %r25651, %r29803; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r25646, %r25650}; + // begin inline asm + shf.l.wrap.b32 %r25654, %r25660, %r25659, %r29804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25658, %r25659, %r25660, %r29804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25662, %r25668, %r25667, %r29805; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25666, %r25667, %r25668, %r29805; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25670, %r25676, %r25675, %r29806; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25674, %r25675, %r25676, %r29806; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25678, %r25713, %r25606, %r25630, 0xD2; + lop3.b32 %r25679, %r25716, %r25610, %r25634, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25686, %r25606, %r25630, %r25662, 0xD2; + lop3.b32 %r25687, %r25610, %r25634, %r25666, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r25686, %r25687}; + // begin inline asm + // chi + lop3.b32 %r25694, %r25630, %r25662, %r25638, 0xD2; + lop3.b32 %r25695, %r25634, %r25666, %r25642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r25694, %r25695}; + // begin inline asm + // chi + lop3.b32 %r25702, %r25662, %r25638, %r25713, 0xD2; + lop3.b32 %r25703, %r25666, %r25642, %r25716, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r25702, %r25703}; + // begin inline asm + // chi + lop3.b32 %r25710, %r25638, %r25713, %r25606, 0xD2; + lop3.b32 %r25711, %r25642, %r25716, %r25610, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r25710, %r25711}; + // begin inline asm + // chi + lop3.b32 %r25718, %r25654, %r25614, %r25670, 0xD2; + lop3.b32 %r25719, %r25658, %r25618, %r25674, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r25718, %r25719}; + // begin inline asm + // chi + lop3.b32 %r25726, %r25614, %r25670, %r25646, 0xD2; + lop3.b32 %r25727, %r25618, %r25674, %r25650, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r25726, %r25727}; + // begin inline asm + // chi + lop3.b32 %r25734, %r25670, %r25646, %r25622, 0xD2; + lop3.b32 %r25735, %r25674, %r25650, %r25626, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r25734, %r25735}; + // begin inline asm + ld.global.nc.v2.u32 {%r25742,%r25743}, [%rd1287]; + // end inline asm + xor.b32 %r25795, %r25679, %r25743; + xor.b32 %r25796, %r25678, %r25742; + mov.b64 %rd1349, {%r25796, %r25795}; + mov.b64 %rd1350, {%r25686, %r25687}; + mov.b64 %rd1351, {%r25694, %r25695}; + 
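+ // --- annotation (added by hand): the round loop above runs iterations
+ // 0..22 (setp.lt.u32 ..., 23); the final round is unrolled here, with the
+ // 24th round constant read from keccak_round_constants+184 (23 * 8
+ // bytes). The mov.b64 {lo, hi} instructions repack the 32-bit lane
+ // halves into 64-bit state words. ---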
mov.b64 %rd1352, {%r25702, %r25703}; + mov.b64 %rd1353, {%r25710, %r25711}; + mov.b64 %rd1354, {%r25718, %r25719}; + mov.b64 %rd1355, {%r25726, %r25727}; + mov.b64 %rd1356, {%r25734, %r25735}; + mov.u32 %r31191, 0; + st.local.v2.u32 [%rd2+24], {%r25796, %r25795}; + st.local.v2.u32 [%rd270+96], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+104], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+112], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+120], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+128], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+136], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+144], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+152], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+160], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+168], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+176], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+184], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+192], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+200], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+208], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+216], {%r31191, %r31191}; + mov.u32 %r31206, -2147483648; + st.local.v2.u32 [%rd270+88], {%r31205, %r31206}; + mov.u32 %r31192, %r31191; + mov.u32 %r31193, %r31191; + mov.u32 %r31194, %r31191; + mov.u32 %r31195, %r31191; + mov.u32 %r31196, %r31191; + mov.u32 %r31197, %r31191; + mov.u32 %r31198, %r31191; + mov.u32 %r31199, %r31191; + mov.u32 %r31200, %r31191; + mov.u32 %r31201, %r31191; + mov.u32 %r31202, %r31191; + mov.u32 %r31203, %r31191; + mov.u32 %r31204, %r31191; + mov.u32 %r31207, %r31191; + mov.u32 %r31208, %r31191; + mov.u32 %r31209, %r31191; + mov.u32 %r31210, %r31191; + mov.u32 %r31211, %r31191; + mov.u32 %r31212, %r31191; + mov.u32 %r31213, %r31191; + mov.u32 %r31214, %r31191; + mov.u32 %r31215, %r31191; + mov.u32 %r31216, %r31191; + mov.u32 %r31217, %r31191; + mov.u32 %r31218, %r31191; + mov.u32 %r31219, %r31191; + mov.u32 %r31220, %r31191; + mov.u32 %r31221, %r31191; + mov.u32 %r31222, %r31191; + mov.u32 %r31223, %r31191; + mov.u32 %r31224, %r31191; + mov.u32 %r31241, %r31191; + +$L__BB2_84: + mov.u32 %r29817, 1; + mov.u64 %rd1291, keccak_round_constants; + cvta.const.u64 %rd1290, %rd1291; + // begin inline asm + // xor5 + lop3.b32 %r25797, %r31227, %r31225, %r31223, 0x96; + lop3.b32 %r25797, %r25797, %r31221, %r31219, 0x96; + lop3.b32 %r25798, %r31228, %r31226, %r31224, 0x96; + lop3.b32 %r25798, %r25798, %r31222, %r31220, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25809, %r31239, %r31237, %r31217, 0x96; + lop3.b32 %r25809, %r25809, %r31215, %r31213, 0x96; + lop3.b32 %r25810, %r31240, %r31238, %r31218, 0x96; + lop3.b32 %r25810, %r25810, %r31216, %r31214, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25821, %r31235, %r31233, %r31211, 0x96; + lop3.b32 %r25821, %r25821, %r31209, %r31207, 0x96; + lop3.b32 %r25822, %r31236, %r31234, %r31212, 0x96; + lop3.b32 %r25822, %r25822, %r31210, %r31208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25833, %r31231, %r31205, %r31203, 0x96; + lop3.b32 %r25833, %r25833, %r31201, %r31199, 0x96; + lop3.b32 %r25834, %r31232, %r31206, %r31204, 0x96; + lop3.b32 %r25834, %r25834, %r31202, %r31200, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25845, %r31229, %r31197, %r31195, 0x96; + lop3.b32 %r25845, %r25845, %r31193, %r31191, 0x96; + lop3.b32 %r25846, %r31230, %r31198, %r31196, 0x96; + lop3.b32 %r25846, %r25846, %r31194, %r31192, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r25857, %r25810, %r25809, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25861, %r25809, %r25810, %r29817; + // end inline asm + xor.b32 %r26291, %r25857, %r25845; + xor.b32 %r26292, %r25861, %r25846; + xor.b32 %r26124, %r31227, %r26291; + xor.b32 %r26127, %r31228, %r26292; + xor.b32 %r26031, %r31225, %r26291; + xor.b32 %r26030, %r31226, %r26292; + xor.b32 %r26078, %r31223, %r26291; + xor.b32 %r26079, %r31224, %r26292; + xor.b32 %r25983, %r31221, %r26291; + xor.b32 %r25982, %r31222, %r26292; + xor.b32 %r25934, %r31219, %r26291; + xor.b32 %r25935, %r31220, %r26292; + // begin inline asm + shf.l.wrap.b32 %r25865, %r25822, %r25821, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25869, %r25821, %r25822, %r29817; + // end inline asm + xor.b32 %r26293, %r25865, %r25797; + xor.b32 %r26294, %r25869, %r25798; + xor.b32 %r26086, %r31239, %r26293; + xor.b32 %r26087, %r31240, %r26294; + xor.b32 %r25903, %r31237, %r26293; + xor.b32 %r25902, %r31238, %r26294; + xor.b32 %r26062, %r31217, %r26293; + xor.b32 %r26063, %r31218, %r26294; + xor.b32 %r26023, %r31215, %r26293; + xor.b32 %r26022, %r31216, %r26294; + xor.b32 %r26006, %r31213, %r26293; + xor.b32 %r26007, %r31214, %r26294; + // begin inline asm + shf.l.wrap.b32 %r25873, %r25834, %r25833, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25877, %r25833, %r25834, %r29817; + // end inline asm + xor.b32 %r26295, %r25873, %r25809; + xor.b32 %r26296, %r25877, %r25810; + xor.b32 %r25943, %r31235, %r26295; + xor.b32 %r25942, %r31236, %r26296; + xor.b32 %r26070, %r31233, %r26295; + xor.b32 %r26071, %r31234, %r26296; + xor.b32 %r25951, %r31211, %r26295; + xor.b32 %r25950, %r31212, %r26296; + xor.b32 %r26054, %r31209, %r26295; + xor.b32 %r26055, %r31210, %r26296; + xor.b32 %r25919, %r31207, %r26295; + xor.b32 %r25918, %r31208, %r26296; + // begin inline asm + shf.l.wrap.b32 %r25881, %r25846, %r25845, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25885, %r25845, %r25846, %r29817; + // end inline asm + xor.b32 %r26297, %r25881, %r25821; + xor.b32 %r26298, %r25885, %r25822; + xor.b32 %r26038, %r31231, %r26297; + xor.b32 %r26039, %r31232, %r26298; + xor.b32 %r26015, %r31205, %r26297; + xor.b32 %r26014, %r31206, %r26298; + xor.b32 %r25958, %r31203, %r26297; + xor.b32 %r25959, %r31204, %r26298; + xor.b32 %r26046, %r31201, %r26297; + xor.b32 %r26047, %r31202, %r26298; + xor.b32 %r25975, %r31199, %r26297; + xor.b32 %r25974, %r31200, %r26298; + // begin inline asm + shf.l.wrap.b32 %r25889, %r25798, %r25797, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25893, %r25797, %r25798, %r29817; + // end inline asm + xor.b32 %r26299, %r25889, %r25833; + xor.b32 %r26300, %r25893, %r25834; + xor.b32 %r25990, %r31229, %r26299; + xor.b32 %r25991, %r31230, %r26300; + xor.b32 %r25910, %r31197, %r26299; + xor.b32 %r25911, %r31198, %r26300; + xor.b32 %r25927, %r31195, %r26299; + xor.b32 %r25926, %r31196, %r26300; + xor.b32 %r25966, %r31193, %r26299; + xor.b32 %r25967, %r31194, %r26300; + xor.b32 %r25998, %r31191, %r26299; + xor.b32 %r25999, %r31192, %r26300; + mov.u32 %r25904, 44; + // begin inline asm + shf.l.wrap.b32 %r25897, %r25903, %r25902, %r25904; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25901, %r25902, %r25903, %r25904; + // end inline asm + mov.u32 %r25912, 20; + // begin inline asm + shf.l.wrap.b32 %r25905, %r25911, %r25910, %r25912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25909, %r25910, %r25911, %r25912; + // end inline 
asm + mov.u32 %r25920, 61; + // begin inline asm + shf.l.wrap.b32 %r25913, %r25919, %r25918, %r25920; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25917, %r25918, %r25919, %r25920; + // end inline asm + mov.u32 %r25928, 39; + // begin inline asm + shf.l.wrap.b32 %r25921, %r25927, %r25926, %r25928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25925, %r25926, %r25927, %r25928; + // end inline asm + mov.u32 %r25936, 18; + // begin inline asm + shf.l.wrap.b32 %r25929, %r25935, %r25934, %r25936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25933, %r25934, %r25935, %r25936; + // end inline asm + mov.u32 %r25944, 62; + // begin inline asm + shf.l.wrap.b32 %r25937, %r25943, %r25942, %r25944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25941, %r25942, %r25943, %r25944; + // end inline asm + mov.u32 %r25952, 43; + // begin inline asm + shf.l.wrap.b32 %r25945, %r25951, %r25950, %r25952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25949, %r25950, %r25951, %r25952; + // end inline asm + mov.u32 %r25960, 25; + // begin inline asm + shf.l.wrap.b32 %r25953, %r25959, %r25958, %r25960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25957, %r25958, %r25959, %r25960; + // end inline asm + mov.u32 %r25968, 8; + // begin inline asm + shf.l.wrap.b32 %r25961, %r25967, %r25966, %r25968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25965, %r25966, %r25967, %r25968; + // end inline asm + mov.u32 %r25976, 56; + // begin inline asm + shf.l.wrap.b32 %r25969, %r25975, %r25974, %r25976; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25973, %r25974, %r25975, %r25976; + // end inline asm + mov.u32 %r25984, 41; + // begin inline asm + shf.l.wrap.b32 %r25977, %r25983, %r25982, %r25984; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25981, %r25982, %r25983, %r25984; + // end inline asm + mov.u32 %r25992, 27; + // begin inline asm + shf.l.wrap.b32 %r25985, %r25991, %r25990, %r25992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25989, %r25990, %r25991, %r25992; + // end inline asm + mov.u32 %r26000, 14; + // begin inline asm + shf.l.wrap.b32 %r25993, %r25999, %r25998, %r26000; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25997, %r25998, %r25999, %r26000; + // end inline asm + mov.u32 %r26008, 2; + // begin inline asm + shf.l.wrap.b32 %r26001, %r26007, %r26006, %r26008; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26005, %r26006, %r26007, %r26008; + // end inline asm + mov.u32 %r26016, 55; + // begin inline asm + shf.l.wrap.b32 %r26009, %r26015, %r26014, %r26016; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26013, %r26014, %r26015, %r26016; + // end inline asm + mov.u32 %r26024, 45; + // begin inline asm + shf.l.wrap.b32 %r26017, %r26023, %r26022, %r26024; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26021, %r26022, %r26023, %r26024; + // end inline asm + mov.u32 %r26032, 36; + // begin inline asm + shf.l.wrap.b32 %r26025, %r26031, %r26030, %r26032; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26029, %r26030, %r26031, %r26032; + // end inline asm + mov.u32 %r26040, 28; + // begin inline asm + shf.l.wrap.b32 %r26033, %r26039, %r26038, %r26040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26037, %r26038, %r26039, %r26040; + // end inline asm + mov.u32 %r26048, 21; + // begin inline asm + shf.l.wrap.b32 %r26041, %r26047, %r26046, %r26048; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r26045, %r26046, %r26047, %r26048; + // end inline asm + mov.u32 %r26056, 15; + // begin inline asm + shf.l.wrap.b32 %r26049, %r26055, %r26054, %r26056; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26053, %r26054, %r26055, %r26056; + // end inline asm + mov.u32 %r26064, 10; + // begin inline asm + shf.l.wrap.b32 %r26057, %r26063, %r26062, %r26064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26061, %r26062, %r26063, %r26064; + // end inline asm + mov.u32 %r26072, 6; + // begin inline asm + shf.l.wrap.b32 %r26065, %r26071, %r26070, %r26072; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26069, %r26070, %r26071, %r26072; + // end inline asm + mov.u32 %r26080, 3; + // begin inline asm + shf.l.wrap.b32 %r26073, %r26079, %r26078, %r26080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26077, %r26078, %r26079, %r26080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26081, %r26087, %r26086, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26085, %r26086, %r26087, %r29817; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26089, %r26124, %r25897, %r25945, 0xD2; + lop3.b32 %r26090, %r26127, %r25901, %r25949, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31239, %r25897, %r25945, %r26041, 0xD2; + lop3.b32 %r31240, %r25901, %r25949, %r26045, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31235, %r25945, %r26041, %r25993, 0xD2; + lop3.b32 %r31236, %r25949, %r26045, %r25997, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31231, %r26041, %r25993, %r26124, 0xD2; + lop3.b32 %r31232, %r26045, %r25997, %r26127, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31229, %r25993, %r26124, %r25897, 0xD2; + lop3.b32 %r31230, %r25997, %r26127, %r25901, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31225, %r26033, %r25905, %r26073, 0xD2; + lop3.b32 %r31226, %r26037, %r25909, %r26077, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31237, %r25905, %r26073, %r26017, 0xD2; + lop3.b32 %r31238, %r25909, %r26077, %r26021, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31233, %r26073, %r26017, %r25913, 0xD2; + lop3.b32 %r31234, %r26077, %r26021, %r25917, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31205, %r26017, %r25913, %r26033, 0xD2; + lop3.b32 %r31206, %r26021, %r25917, %r26037, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r31205, %r31206}; + // begin inline asm + // chi + lop3.b32 %r31197, %r25913, %r26033, %r25905, 0xD2; + lop3.b32 %r31198, %r25917, %r26037, %r25909, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r31197, %r31198}; + // begin inline asm + // chi + lop3.b32 %r31223, %r26081, %r26065, %r25953, 0xD2; + lop3.b32 %r31224, %r26085, %r26069, %r25957, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+104], {%r31223, %r31224}; + // begin inline asm + // chi + lop3.b32 %r31217, %r26065, %r25953, %r25961, 0xD2; + lop3.b32 %r31218, %r26069, %r25957, %r25965, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+112], {%r31217, %r31218}; + // begin inline asm + // chi + lop3.b32 %r31211, %r25953, %r25961, %r25929, 0xD2; + lop3.b32 %r31212, %r25957, %r25965, %r25933, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+120], {%r31211, %r31212}; + // begin inline asm + // chi + lop3.b32 %r31203, %r25961, %r25929, %r26081, 0xD2; + lop3.b32 %r31204, %r25965, %r25933, %r26085, 0xD2; + // 
end inline asm + st.local.v2.u32 [%rd270+128], {%r31203, %r31204}; + // begin inline asm + // chi + lop3.b32 %r31195, %r25929, %r26081, %r26065, 0xD2; + lop3.b32 %r31196, %r25933, %r26085, %r26069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+136], {%r31195, %r31196}; + // begin inline asm + // chi + lop3.b32 %r31221, %r25985, %r26025, %r26057, 0xD2; + lop3.b32 %r31222, %r25989, %r26029, %r26061, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+144], {%r31221, %r31222}; + // begin inline asm + // chi + lop3.b32 %r31215, %r26025, %r26057, %r26049, 0xD2; + lop3.b32 %r31216, %r26029, %r26061, %r26053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+152], {%r31215, %r31216}; + // begin inline asm + // chi + lop3.b32 %r31209, %r26057, %r26049, %r25969, 0xD2; + lop3.b32 %r31210, %r26061, %r26053, %r25973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+160], {%r31209, %r31210}; + // begin inline asm + // chi + lop3.b32 %r31201, %r26049, %r25969, %r25985, 0xD2; + lop3.b32 %r31202, %r26053, %r25973, %r25989, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+168], {%r31201, %r31202}; + // begin inline asm + // chi + lop3.b32 %r31193, %r25969, %r25985, %r26025, 0xD2; + lop3.b32 %r31194, %r25973, %r25989, %r26029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+176], {%r31193, %r31194}; + // begin inline asm + // chi + lop3.b32 %r31219, %r25937, %r26009, %r25921, 0xD2; + lop3.b32 %r31220, %r25941, %r26013, %r25925, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+184], {%r31219, %r31220}; + // begin inline asm + // chi + lop3.b32 %r31213, %r26009, %r25921, %r25977, 0xD2; + lop3.b32 %r31214, %r26013, %r25925, %r25981, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+192], {%r31213, %r31214}; + // begin inline asm + // chi + lop3.b32 %r31207, %r25921, %r25977, %r26001, 0xD2; + lop3.b32 %r31208, %r25925, %r25981, %r26005, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+200], {%r31207, %r31208}; + // begin inline asm + // chi + lop3.b32 %r31199, %r25977, %r26001, %r25937, 0xD2; + lop3.b32 %r31200, %r25981, %r26005, %r25941, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+208], {%r31199, %r31200}; + // begin inline asm + // chi + lop3.b32 %r31191, %r26001, %r25937, %r26009, 0xD2; + lop3.b32 %r31192, %r26005, %r25941, %r26013, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+216], {%r31191, %r31192}; + mul.wide.s32 %rd1091, %r31241, 8; + add.s64 %rd1090, %rd1290, %rd1091; + // begin inline asm + ld.global.nc.v2.u32 {%r26289,%r26290}, [%rd1090]; + // end inline asm + xor.b32 %r31227, %r26089, %r26289; + xor.b32 %r31228, %r26090, %r26290; + add.s32 %r31241, %r31241, 1; + setp.lt.u32 %p47, %r31241, 23; + @%p47 bra $L__BB2_84; + + mov.u32 %r29816, 3; + mov.u32 %r29815, 21; + mov.u32 %r29814, 28; + mov.u32 %r29813, 45; + mov.u32 %r29812, 14; + mov.u32 %r29811, 43; + mov.u32 %r29810, 61; + mov.u32 %r29809, 20; + mov.u32 %r29808, 44; + mov.u64 %rd1294, keccak_round_constants; + cvta.const.u64 %rd1293, %rd1294; + add.s64 %rd1292, %rd1293, 184; + mov.u32 %r26400, 1; + st.local.v2.u32 [%rd270+32], {%r31239, %r31240}; + st.local.v2.u32 [%rd270+72], {%r31237, %r31238}; + st.local.v2.u32 [%rd270+40], {%r31235, %r31236}; + st.local.v2.u32 [%rd270+80], {%r31233, %r31234}; + st.local.v2.u32 [%rd270+48], {%r31231, %r31232}; + st.local.v2.u32 [%rd270+56], {%r31229, %r31230}; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + // begin inline asm + // xor5 + lop3.b32 %r26301, %r31227, %r31225, %r31223, 0x96; + lop3.b32 %r26301, %r26301, %r31221, %r31219, 0x96; + lop3.b32 %r26302, 
%r31228, %r31226, %r31224, 0x96; + lop3.b32 %r26302, %r26302, %r31222, %r31220, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26313, %r31239, %r31237, %r31217, 0x96; + lop3.b32 %r26313, %r26313, %r31215, %r31213, 0x96; + lop3.b32 %r26314, %r31240, %r31238, %r31218, 0x96; + lop3.b32 %r26314, %r26314, %r31216, %r31214, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26325, %r31235, %r31233, %r31211, 0x96; + lop3.b32 %r26325, %r26325, %r31209, %r31207, 0x96; + lop3.b32 %r26326, %r31236, %r31234, %r31212, 0x96; + lop3.b32 %r26326, %r26326, %r31210, %r31208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26337, %r31231, %r31205, %r31203, 0x96; + lop3.b32 %r26337, %r26337, %r31201, %r31199, 0x96; + lop3.b32 %r26338, %r31232, %r31206, %r31204, 0x96; + lop3.b32 %r26338, %r26338, %r31202, %r31200, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26349, %r31229, %r31197, %r31195, 0x96; + lop3.b32 %r26349, %r26349, %r31193, %r31191, 0x96; + lop3.b32 %r26350, %r31230, %r31198, %r31196, 0x96; + lop3.b32 %r26350, %r26350, %r31194, %r31192, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26361, %r26314, %r26313, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26365, %r26313, %r26314, %r26400; + // end inline asm + xor.b32 %r26539, %r26361, %r26349; + xor.b32 %r26540, %r26365, %r26350; + xor.b32 %r26508, %r31227, %r26539; + xor.b32 %r26511, %r31228, %r26540; + xor.b32 %r26471, %r31224, %r26540; + xor.b32 %r26470, %r31223, %r26539; + st.local.v2.u32 [%rd270+104], {%r26470, %r26471}; + // begin inline asm + shf.l.wrap.b32 %r26369, %r26326, %r26325, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26373, %r26325, %r26326, %r26400; + // end inline asm + xor.b32 %r26541, %r26369, %r26301; + xor.b32 %r26542, %r26373, %r26302; + xor.b32 %r26407, %r31237, %r26541; + xor.b32 %r26406, %r31238, %r26542; + xor.b32 %r26446, %r31216, %r26542; + xor.b32 %r26447, %r31215, %r26541; + st.local.v2.u32 [%rd270+152], {%r26447, %r26446}; + // begin inline asm + shf.l.wrap.b32 %r26377, %r26338, %r26337, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26381, %r26337, %r26338, %r26400; + // end inline asm + xor.b32 %r26543, %r26377, %r26313; + xor.b32 %r26544, %r26381, %r26314; + xor.b32 %r26430, %r31212, %r26544; + xor.b32 %r26431, %r31211, %r26543; + st.local.v2.u32 [%rd270+120], {%r26431, %r26430}; + xor.b32 %r26422, %r31208, %r26544; + xor.b32 %r26423, %r31207, %r26543; + st.local.v2.u32 [%rd270+200], {%r26423, %r26422}; + // begin inline asm + shf.l.wrap.b32 %r26385, %r26350, %r26349, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26389, %r26349, %r26350, %r26400; + // end inline asm + xor.b32 %r26545, %r26385, %r26325; + xor.b32 %r26546, %r26389, %r26326; + xor.b32 %r26454, %r31231, %r26545; + xor.b32 %r26455, %r31232, %r26546; + xor.b32 %r26463, %r31202, %r26546; + xor.b32 %r26462, %r31201, %r26545; + st.local.v2.u32 [%rd270+168], {%r26462, %r26463}; + // begin inline asm + shf.l.wrap.b32 %r26393, %r26302, %r26301, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26397, %r26301, %r26302, %r26400; + // end inline asm + xor.b32 %r26547, %r26393, %r26337; + xor.b32 %r26548, %r26397, %r26338; + xor.b32 %r26414, %r31197, %r26547; + xor.b32 %r26415, %r31198, %r26548; + xor.b32 %r26439, %r31192, %r26548; + xor.b32 %r26438, %r31191, %r26547; + st.local.v2.u32 [%rd270+216], {%r26438, %r26439}; + // begin inline 
asm + shf.l.wrap.b32 %r26401, %r26407, %r26406, %r29808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26405, %r26406, %r26407, %r29808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26409, %r26415, %r26414, %r29809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26413, %r26414, %r26415, %r29809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26421, %r26422, %r26423, %r29810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26417, %r26423, %r26422, %r29810; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r26417, %r26421}; + // begin inline asm + shf.l.wrap.b32 %r26425, %r26431, %r26430, %r29811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26429, %r26430, %r26431, %r29811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26433, %r26439, %r26438, %r29812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26437, %r26438, %r26439, %r29812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26445, %r26446, %r26447, %r29813; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26441, %r26447, %r26446, %r29813; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r26441, %r26445}; + // begin inline asm + shf.l.wrap.b32 %r26449, %r26455, %r26454, %r29814; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26453, %r26454, %r26455, %r29814; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26457, %r26463, %r26462, %r29815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26461, %r26462, %r26463, %r29815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26465, %r26471, %r26470, %r29816; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26469, %r26470, %r26471, %r29816; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26473, %r26508, %r26401, %r26425, 0xD2; + lop3.b32 %r26474, %r26511, %r26405, %r26429, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26481, %r26401, %r26425, %r26457, 0xD2; + lop3.b32 %r26482, %r26405, %r26429, %r26461, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+32], {%r26481, %r26482}; + // begin inline asm + // chi + lop3.b32 %r26489, %r26425, %r26457, %r26433, 0xD2; + lop3.b32 %r26490, %r26429, %r26461, %r26437, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+40], {%r26489, %r26490}; + // begin inline asm + // chi + lop3.b32 %r26497, %r26457, %r26433, %r26508, 0xD2; + lop3.b32 %r26498, %r26461, %r26437, %r26511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+48], {%r26497, %r26498}; + // begin inline asm + // chi + lop3.b32 %r26505, %r26433, %r26508, %r26401, 0xD2; + lop3.b32 %r26506, %r26437, %r26511, %r26405, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+56], {%r26505, %r26506}; + // begin inline asm + // chi + lop3.b32 %r26513, %r26449, %r26409, %r26465, 0xD2; + lop3.b32 %r26514, %r26453, %r26413, %r26469, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+64], {%r26513, %r26514}; + // begin inline asm + // chi + lop3.b32 %r26521, %r26409, %r26465, %r26441, 0xD2; + lop3.b32 %r26522, %r26413, %r26469, %r26445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+72], {%r26521, %r26522}; + // begin inline asm + // chi + lop3.b32 %r26529, %r26465, %r26441, %r26417, 0xD2; + lop3.b32 %r26530, %r26469, %r26445, %r26421, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+80], {%r26529, %r26530}; + // begin inline asm + ld.global.nc.v2.u32 {%r26537,%r26538}, [%rd1292]; + // end inline asm + xor.b32 %r26549, %r26474, %r26538; + xor.b32 
%r26550, %r26473, %r26537; + st.local.v2.u32 [%rd270+24], {%r26550, %r26549}; + mov.b64 %rd1358, {%r26481, %r26482}; + mov.b64 %rd1359, {%r26489, %r26490}; + mov.b64 %rd1362, {%r26513, %r26514}; + mov.b64 %rd1363, {%r26521, %r26522}; + mov.b64 %rd1364, {%r26529, %r26530}; + mov.b64 %rd1357, {%r26550, %r26549}; + mov.b64 %rd1360, {%r26497, %r26498}; + mov.b64 %rd1361, {%r26505, %r26506}; + bra.uni $L__BB2_86; + +$L__BB2_64: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd881, 1179641; + st.local.u64 [%rd2+8], %rd881; + st.local.u32 [%rd2+16], %r3343; + ld.global.u64 %rd882, [%rd220]; + ld.global.u64 %rd883, [%rd220+8]; + ld.global.u64 %rd884, [%rd220+16]; + ld.global.u64 %rd885, [%rd220+24]; + ld.global.u64 %rd886, [%rd220+32]; + ld.global.u64 %rd887, [%rd220+40]; + ld.global.u64 %rd888, [%rd220+48]; + ld.global.u64 %rd889, [%rd220+56]; + st.local.u64 [%rd2+24], %rd882; + st.local.u64 [%rd2+32], %rd883; + st.local.u64 [%rd2+40], %rd884; + st.local.u64 [%rd2+48], %rd885; + st.local.u64 [%rd2+56], %rd886; + st.local.u64 [%rd2+64], %rd887; + st.local.u64 [%rd2+72], %rd888; + st.local.u64 [%rd2+80], %rd889; + cvt.u32.u64 %r20023, %rd882; + xor.b32 %r20024, %r3343, %r20023; + st.local.u32 [%rd2+24], %r20024; + mov.u32 %r30768, 0; + st.local.v2.u32 [%rd2+96], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+104], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+112], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+120], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+128], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+136], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+144], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+152], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+160], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+168], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+176], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+184], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+192], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+200], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+208], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+216], {%r30768, %r30768}; + mov.u32 %r30783, -2147483648; + mov.u32 %r19996, 1; + st.local.v2.u32 [%rd2+88], {%r19996, %r30783}; + ld.local.v2.u32 {%r30804, %r30805}, [%rd2+24]; + mov.b64 {%r30802, %r30803}, %rd887; + shr.u64 %rd890, %rd883, 32; + cvt.u32.u64 %r30816, %rd883; + cvt.u32.u64 %r30817, %rd890; + shr.u64 %rd891, %rd888, 32; + cvt.u32.u64 %r30814, %rd888; + cvt.u32.u64 %r30815, %rd891; + shr.u64 %rd892, %rd884, 32; + cvt.u32.u64 %r30812, %rd884; + cvt.u32.u64 %r30813, %rd892; + shr.u64 %rd893, %rd889, 32; + cvt.u32.u64 %r30810, %rd889; + cvt.u32.u64 %r30811, %rd893; + shr.u64 %rd894, %rd885, 32; + cvt.u32.u64 %r30808, %rd885; + cvt.u32.u64 %r30809, %rd894; + shr.u64 %rd895, %rd886, 32; + cvt.u32.u64 %r30806, %rd886; + cvt.u32.u64 %r30807, %rd895; + mov.u32 %r30769, %r30768; + mov.u32 %r30770, %r30768; + mov.u32 %r30771, %r30768; + mov.u32 %r30772, %r30768; + mov.u32 %r30773, %r30768; + mov.u32 %r30774, %r30768; + mov.u32 %r30775, %r30768; + mov.u32 %r30776, %r30768; + mov.u32 %r30777, %r30768; + mov.u32 %r30778, %r30768; + mov.u32 %r30779, %r30768; + mov.u32 %r30780, %r30768; + mov.u32 %r30781, %r30768; + mov.u32 %r30782, %r19996; + mov.u32 %r30784, %r30768; + mov.u32 %r30785, %r30768; + mov.u32 %r30786, %r30768; + mov.u32 %r30787, %r30768; + mov.u32 %r30788, %r30768; + mov.u32 %r30789, %r30768; + mov.u32 %r30790, %r30768; + mov.u32 %r30791, %r30768; + mov.u32 %r30792, %r30768; + mov.u32 %r30793, %r30768; + mov.u32 %r30794, %r30768; + mov.u32 %r30795, %r30768; + mov.u32 %r30796, %r30768; + mov.u32 
%r30797, %r30768; + mov.u32 %r30798, %r30768; + mov.u32 %r30799, %r30768; + mov.u32 %r30800, %r30768; + mov.u32 %r30801, %r30768; + mov.u32 %r30818, %r30768; + +$L__BB2_65: + // begin inline asm + // xor5 + lop3.b32 %r20027, %r30804, %r30802, %r30800, 0x96; + lop3.b32 %r20027, %r20027, %r30798, %r30796, 0x96; + lop3.b32 %r20028, %r30805, %r30803, %r30801, 0x96; + lop3.b32 %r20028, %r20028, %r30799, %r30797, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20039, %r30816, %r30814, %r30794, 0x96; + lop3.b32 %r20039, %r20039, %r30792, %r30790, 0x96; + lop3.b32 %r20040, %r30817, %r30815, %r30795, 0x96; + lop3.b32 %r20040, %r20040, %r30793, %r30791, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20051, %r30812, %r30810, %r30788, 0x96; + lop3.b32 %r20051, %r20051, %r30786, %r30784, 0x96; + lop3.b32 %r20052, %r30813, %r30811, %r30789, 0x96; + lop3.b32 %r20052, %r20052, %r30787, %r30785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20063, %r30808, %r30782, %r30780, 0x96; + lop3.b32 %r20063, %r20063, %r30778, %r30776, 0x96; + lop3.b32 %r20064, %r30809, %r30783, %r30781, 0x96; + lop3.b32 %r20064, %r20064, %r30779, %r30777, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20075, %r30806, %r30774, %r30772, 0x96; + lop3.b32 %r20075, %r20075, %r30770, %r30768, 0x96; + lop3.b32 %r20076, %r30807, %r30775, %r30773, 0x96; + lop3.b32 %r20076, %r20076, %r30771, %r30769, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20087, %r20040, %r20039, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20091, %r20039, %r20040, %r19996; + // end inline asm + xor.b32 %r20521, %r20087, %r20075; + xor.b32 %r20522, %r20091, %r20076; + xor.b32 %r20354, %r30804, %r20521; + xor.b32 %r20357, %r30805, %r20522; + xor.b32 %r20261, %r30802, %r20521; + xor.b32 %r20260, %r30803, %r20522; + xor.b32 %r20308, %r30800, %r20521; + xor.b32 %r20309, %r30801, %r20522; + xor.b32 %r20213, %r30798, %r20521; + xor.b32 %r20212, %r30799, %r20522; + xor.b32 %r20164, %r30796, %r20521; + xor.b32 %r20165, %r30797, %r20522; + // begin inline asm + shf.l.wrap.b32 %r20095, %r20052, %r20051, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20099, %r20051, %r20052, %r19996; + // end inline asm + xor.b32 %r20523, %r20095, %r20027; + xor.b32 %r20524, %r20099, %r20028; + xor.b32 %r20316, %r30816, %r20523; + xor.b32 %r20317, %r30817, %r20524; + xor.b32 %r20133, %r30814, %r20523; + xor.b32 %r20132, %r30815, %r20524; + xor.b32 %r20292, %r30794, %r20523; + xor.b32 %r20293, %r30795, %r20524; + xor.b32 %r20253, %r30792, %r20523; + xor.b32 %r20252, %r30793, %r20524; + xor.b32 %r20236, %r30790, %r20523; + xor.b32 %r20237, %r30791, %r20524; + // begin inline asm + shf.l.wrap.b32 %r20103, %r20064, %r20063, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20107, %r20063, %r20064, %r19996; + // end inline asm + xor.b32 %r20525, %r20103, %r20039; + xor.b32 %r20526, %r20107, %r20040; + xor.b32 %r20173, %r30812, %r20525; + xor.b32 %r20172, %r30813, %r20526; + xor.b32 %r20300, %r30810, %r20525; + xor.b32 %r20301, %r30811, %r20526; + xor.b32 %r20181, %r30788, %r20525; + xor.b32 %r20180, %r30789, %r20526; + xor.b32 %r20284, %r30786, %r20525; + xor.b32 %r20285, %r30787, %r20526; + xor.b32 %r20149, %r30784, %r20525; + xor.b32 %r20148, %r30785, %r20526; + // begin inline asm + shf.l.wrap.b32 %r20111, %r20076, %r20075, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20115, %r20075, 
%r20076, %r19996; + // end inline asm + xor.b32 %r20527, %r20111, %r20051; + xor.b32 %r20528, %r20115, %r20052; + xor.b32 %r20268, %r30808, %r20527; + xor.b32 %r20269, %r30809, %r20528; + xor.b32 %r20245, %r30782, %r20527; + xor.b32 %r20244, %r30783, %r20528; + xor.b32 %r20188, %r30780, %r20527; + xor.b32 %r20189, %r30781, %r20528; + xor.b32 %r20276, %r30778, %r20527; + xor.b32 %r20277, %r30779, %r20528; + xor.b32 %r20205, %r30776, %r20527; + xor.b32 %r20204, %r30777, %r20528; + // begin inline asm + shf.l.wrap.b32 %r20119, %r20028, %r20027, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20123, %r20027, %r20028, %r19996; + // end inline asm + xor.b32 %r20529, %r20119, %r20063; + xor.b32 %r20530, %r20123, %r20064; + xor.b32 %r20220, %r30806, %r20529; + xor.b32 %r20221, %r30807, %r20530; + xor.b32 %r20140, %r30774, %r20529; + xor.b32 %r20141, %r30775, %r20530; + xor.b32 %r20157, %r30772, %r20529; + xor.b32 %r20156, %r30773, %r20530; + xor.b32 %r20196, %r30770, %r20529; + xor.b32 %r20197, %r30771, %r20530; + xor.b32 %r20228, %r30768, %r20529; + xor.b32 %r20229, %r30769, %r20530; + mov.u32 %r20134, 44; + // begin inline asm + shf.l.wrap.b32 %r20127, %r20133, %r20132, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20131, %r20132, %r20133, %r20134; + // end inline asm + mov.u32 %r20142, 20; + // begin inline asm + shf.l.wrap.b32 %r20135, %r20141, %r20140, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20139, %r20140, %r20141, %r20142; + // end inline asm + mov.u32 %r20150, 61; + // begin inline asm + shf.l.wrap.b32 %r20143, %r20149, %r20148, %r20150; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20147, %r20148, %r20149, %r20150; + // end inline asm + mov.u32 %r20158, 39; + // begin inline asm + shf.l.wrap.b32 %r20151, %r20157, %r20156, %r20158; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20155, %r20156, %r20157, %r20158; + // end inline asm + mov.u32 %r20166, 18; + // begin inline asm + shf.l.wrap.b32 %r20159, %r20165, %r20164, %r20166; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20163, %r20164, %r20165, %r20166; + // end inline asm + mov.u32 %r20174, 62; + // begin inline asm + shf.l.wrap.b32 %r20167, %r20173, %r20172, %r20174; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20171, %r20172, %r20173, %r20174; + // end inline asm + mov.u32 %r20182, 43; + // begin inline asm + shf.l.wrap.b32 %r20175, %r20181, %r20180, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20179, %r20180, %r20181, %r20182; + // end inline asm + mov.u32 %r20190, 25; + // begin inline asm + shf.l.wrap.b32 %r20183, %r20189, %r20188, %r20190; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20187, %r20188, %r20189, %r20190; + // end inline asm + mov.u32 %r20198, 8; + // begin inline asm + shf.l.wrap.b32 %r20191, %r20197, %r20196, %r20198; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20195, %r20196, %r20197, %r20198; + // end inline asm + mov.u32 %r20206, 56; + // begin inline asm + shf.l.wrap.b32 %r20199, %r20205, %r20204, %r20206; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20203, %r20204, %r20205, %r20206; + // end inline asm + mov.u32 %r20214, 41; + // begin inline asm + shf.l.wrap.b32 %r20207, %r20213, %r20212, %r20214; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20211, %r20212, %r20213, %r20214; + // end inline asm + mov.u32 %r20222, 27; + // begin inline asm + shf.l.wrap.b32 %r20215, %r20221, %r20220, 
%r20222; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20219, %r20220, %r20221, %r20222; + // end inline asm + mov.u32 %r20230, 14; + // begin inline asm + shf.l.wrap.b32 %r20223, %r20229, %r20228, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20227, %r20228, %r20229, %r20230; + // end inline asm + mov.u32 %r20238, 2; + // begin inline asm + shf.l.wrap.b32 %r20231, %r20237, %r20236, %r20238; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20235, %r20236, %r20237, %r20238; + // end inline asm + mov.u32 %r20246, 55; + // begin inline asm + shf.l.wrap.b32 %r20239, %r20245, %r20244, %r20246; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20243, %r20244, %r20245, %r20246; + // end inline asm + mov.u32 %r20254, 45; + // begin inline asm + shf.l.wrap.b32 %r20247, %r20253, %r20252, %r20254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20251, %r20252, %r20253, %r20254; + // end inline asm + mov.u32 %r20262, 36; + // begin inline asm + shf.l.wrap.b32 %r20255, %r20261, %r20260, %r20262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20259, %r20260, %r20261, %r20262; + // end inline asm + mov.u32 %r20270, 28; + // begin inline asm + shf.l.wrap.b32 %r20263, %r20269, %r20268, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20267, %r20268, %r20269, %r20270; + // end inline asm + mov.u32 %r20278, 21; + // begin inline asm + shf.l.wrap.b32 %r20271, %r20277, %r20276, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20275, %r20276, %r20277, %r20278; + // end inline asm + mov.u32 %r20286, 15; + // begin inline asm + shf.l.wrap.b32 %r20279, %r20285, %r20284, %r20286; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20283, %r20284, %r20285, %r20286; + // end inline asm + mov.u32 %r20294, 10; + // begin inline asm + shf.l.wrap.b32 %r20287, %r20293, %r20292, %r20294; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20291, %r20292, %r20293, %r20294; + // end inline asm + mov.u32 %r20302, 6; + // begin inline asm + shf.l.wrap.b32 %r20295, %r20301, %r20300, %r20302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20299, %r20300, %r20301, %r20302; + // end inline asm + mov.u32 %r20310, 3; + // begin inline asm + shf.l.wrap.b32 %r20303, %r20309, %r20308, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20307, %r20308, %r20309, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20311, %r20317, %r20316, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20315, %r20316, %r20317, %r19996; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20319, %r20354, %r20127, %r20175, 0xD2; + lop3.b32 %r20320, %r20357, %r20131, %r20179, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30816, %r20127, %r20175, %r20271, 0xD2; + lop3.b32 %r30817, %r20131, %r20179, %r20275, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30812, %r20175, %r20271, %r20223, 0xD2; + lop3.b32 %r30813, %r20179, %r20275, %r20227, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30808, %r20271, %r20223, %r20354, 0xD2; + lop3.b32 %r30809, %r20275, %r20227, %r20357, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30806, %r20223, %r20354, %r20127, 0xD2; + lop3.b32 %r30807, %r20227, %r20357, %r20131, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30802, %r20263, %r20135, %r20303, 0xD2; + lop3.b32 %r30803, %r20267, 
%r20139, %r20307, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30814, %r20135, %r20303, %r20247, 0xD2; + lop3.b32 %r30815, %r20139, %r20307, %r20251, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30810, %r20303, %r20247, %r20143, 0xD2; + lop3.b32 %r30811, %r20307, %r20251, %r20147, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30782, %r20247, %r20143, %r20263, 0xD2; + lop3.b32 %r30783, %r20251, %r20147, %r20267, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30782, %r30783}; + // begin inline asm + // chi + lop3.b32 %r30774, %r20143, %r20263, %r20135, 0xD2; + lop3.b32 %r30775, %r20147, %r20267, %r20139, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30774, %r30775}; + // begin inline asm + // chi + lop3.b32 %r30800, %r20311, %r20295, %r20183, 0xD2; + lop3.b32 %r30801, %r20315, %r20299, %r20187, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30800, %r30801}; + // begin inline asm + // chi + lop3.b32 %r30794, %r20295, %r20183, %r20191, 0xD2; + lop3.b32 %r30795, %r20299, %r20187, %r20195, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30794, %r30795}; + // begin inline asm + // chi + lop3.b32 %r30788, %r20183, %r20191, %r20159, 0xD2; + lop3.b32 %r30789, %r20187, %r20195, %r20163, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30788, %r30789}; + // begin inline asm + // chi + lop3.b32 %r30780, %r20191, %r20159, %r20311, 0xD2; + lop3.b32 %r30781, %r20195, %r20163, %r20315, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30780, %r30781}; + // begin inline asm + // chi + lop3.b32 %r30772, %r20159, %r20311, %r20295, 0xD2; + lop3.b32 %r30773, %r20163, %r20315, %r20299, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30772, %r30773}; + // begin inline asm + // chi + lop3.b32 %r30798, %r20215, %r20255, %r20287, 0xD2; + lop3.b32 %r30799, %r20219, %r20259, %r20291, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30798, %r30799}; + // begin inline asm + // chi + lop3.b32 %r30792, %r20255, %r20287, %r20279, 0xD2; + lop3.b32 %r30793, %r20259, %r20291, %r20283, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30792, %r30793}; + // begin inline asm + // chi + lop3.b32 %r30786, %r20287, %r20279, %r20199, 0xD2; + lop3.b32 %r30787, %r20291, %r20283, %r20203, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30786, %r30787}; + // begin inline asm + // chi + lop3.b32 %r30778, %r20279, %r20199, %r20215, 0xD2; + lop3.b32 %r30779, %r20283, %r20203, %r20219, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30778, %r30779}; + // begin inline asm + // chi + lop3.b32 %r30770, %r20199, %r20215, %r20255, 0xD2; + lop3.b32 %r30771, %r20203, %r20219, %r20259, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30770, %r30771}; + // begin inline asm + // chi + lop3.b32 %r30796, %r20167, %r20239, %r20151, 0xD2; + lop3.b32 %r30797, %r20171, %r20243, %r20155, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30796, %r30797}; + // begin inline asm + // chi + lop3.b32 %r30790, %r20239, %r20151, %r20207, 0xD2; + lop3.b32 %r30791, %r20243, %r20155, %r20211, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30790, %r30791}; + // begin inline asm + // chi + lop3.b32 %r30784, %r20151, %r20207, %r20231, 0xD2; + lop3.b32 %r30785, %r20155, %r20211, %r20235, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30784, %r30785}; + // begin inline asm + // chi + lop3.b32 %r30776, %r20207, %r20231, %r20167, 0xD2; + 
lop3.b32 %r30777, %r20211, %r20235, %r20171, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30776, %r30777}; + // begin inline asm + // chi + lop3.b32 %r30768, %r20231, %r20167, %r20239, 0xD2; + lop3.b32 %r30769, %r20235, %r20171, %r20243, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30768, %r30769}; + mul.wide.s32 %rd899, %r30818, 8; + mov.u64 %rd900, keccak_round_constants; + cvta.const.u64 %rd901, %rd900; + add.s64 %rd896, %rd901, %rd899; + // begin inline asm + ld.global.nc.v2.u32 {%r20519,%r20520}, [%rd896]; + // end inline asm + xor.b32 %r30804, %r20319, %r20519; + xor.b32 %r30805, %r20320, %r20520; + add.s32 %r30818, %r30818, 1; + setp.lt.u32 %p38, %r30818, 23; + @%p38 bra $L__BB2_65; + + st.local.v2.u32 [%rd2+32], {%r30816, %r30817}; + st.local.v2.u32 [%rd2+72], {%r30814, %r30815}; + st.local.v2.u32 [%rd2+40], {%r30812, %r30813}; + st.local.v2.u32 [%rd2+80], {%r30810, %r30811}; + st.local.v2.u32 [%rd2+48], {%r30808, %r30809}; + st.local.v2.u32 [%rd2+56], {%r30806, %r30807}; + st.local.v2.u32 [%rd2+24], {%r30804, %r30805}; + // begin inline asm + // xor5 + lop3.b32 %r20531, %r30804, %r30802, %r30800, 0x96; + lop3.b32 %r20531, %r20531, %r30798, %r30796, 0x96; + lop3.b32 %r20532, %r30805, %r30803, %r30801, 0x96; + lop3.b32 %r20532, %r20532, %r30799, %r30797, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20543, %r30816, %r30814, %r30794, 0x96; + lop3.b32 %r20543, %r20543, %r30792, %r30790, 0x96; + lop3.b32 %r20544, %r30817, %r30815, %r30795, 0x96; + lop3.b32 %r20544, %r20544, %r30793, %r30791, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20555, %r30812, %r30810, %r30788, 0x96; + lop3.b32 %r20555, %r20555, %r30786, %r30784, 0x96; + lop3.b32 %r20556, %r30813, %r30811, %r30789, 0x96; + lop3.b32 %r20556, %r20556, %r30787, %r30785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20567, %r30808, %r30782, %r30780, 0x96; + lop3.b32 %r20567, %r20567, %r30778, %r30776, 0x96; + lop3.b32 %r20568, %r30809, %r30783, %r30781, 0x96; + lop3.b32 %r20568, %r20568, %r30779, %r30777, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20579, %r30806, %r30774, %r30772, 0x96; + lop3.b32 %r20579, %r20579, %r30770, %r30768, 0x96; + lop3.b32 %r20580, %r30807, %r30775, %r30773, 0x96; + lop3.b32 %r20580, %r20580, %r30771, %r30769, 0x96; + // end inline asm + mov.u32 %r30833, 1; + // begin inline asm + shf.l.wrap.b32 %r20591, %r20544, %r20543, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20595, %r20543, %r20544, %r30833; + // end inline asm + xor.b32 %r20810, %r20591, %r20579; + xor.b32 %r20811, %r20595, %r20580; + xor.b32 %r20738, %r30804, %r20810; + xor.b32 %r20741, %r30805, %r20811; + xor.b32 %r20701, %r30801, %r20811; + xor.b32 %r20700, %r30800, %r20810; + st.local.v2.u32 [%rd2+104], {%r20700, %r20701}; + // begin inline asm + shf.l.wrap.b32 %r20599, %r20556, %r20555, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20603, %r20555, %r20556, %r30833; + // end inline asm + xor.b32 %r20812, %r20599, %r20531; + xor.b32 %r20813, %r20603, %r20532; + xor.b32 %r20637, %r30814, %r20812; + xor.b32 %r20636, %r30815, %r20813; + xor.b32 %r20676, %r30793, %r20813; + xor.b32 %r20677, %r30792, %r20812; + st.local.v2.u32 [%rd2+152], {%r20677, %r20676}; + // begin inline asm + shf.l.wrap.b32 %r20607, %r20568, %r20567, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20611, %r20567, %r20568, %r30833; + // end inline asm + xor.b32 %r20814, 
%r20607, %r20543; + xor.b32 %r20815, %r20611, %r20544; + xor.b32 %r20660, %r30789, %r20815; + xor.b32 %r20661, %r30788, %r20814; + st.local.v2.u32 [%rd2+120], {%r20661, %r20660}; + xor.b32 %r20652, %r30785, %r20815; + xor.b32 %r20653, %r30784, %r20814; + st.local.v2.u32 [%rd2+200], {%r20653, %r20652}; + // begin inline asm + shf.l.wrap.b32 %r20615, %r20580, %r20579, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20619, %r20579, %r20580, %r30833; + // end inline asm + xor.b32 %r20816, %r20615, %r20555; + xor.b32 %r20817, %r20619, %r20556; + xor.b32 %r20684, %r30808, %r20816; + xor.b32 %r20685, %r30809, %r20817; + xor.b32 %r20693, %r30779, %r20817; + xor.b32 %r20692, %r30778, %r20816; + st.local.v2.u32 [%rd2+168], {%r20692, %r20693}; + // begin inline asm + shf.l.wrap.b32 %r20623, %r20532, %r20531, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20627, %r20531, %r20532, %r30833; + // end inline asm + xor.b32 %r20818, %r20623, %r20567; + xor.b32 %r20819, %r20627, %r20568; + xor.b32 %r20644, %r30774, %r20818; + xor.b32 %r20645, %r30775, %r20819; + xor.b32 %r20669, %r30769, %r20819; + xor.b32 %r20668, %r30768, %r20818; + st.local.v2.u32 [%rd2+216], {%r20668, %r20669}; + // begin inline asm + shf.l.wrap.b32 %r20631, %r20637, %r20636, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20635, %r20636, %r20637, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20639, %r20645, %r20644, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20643, %r20644, %r20645, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20651, %r20652, %r20653, %r20150; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20647, %r20653, %r20652, %r20150; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r20647, %r20651}; + // begin inline asm + shf.l.wrap.b32 %r20655, %r20661, %r20660, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20659, %r20660, %r20661, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20663, %r20669, %r20668, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20667, %r20668, %r20669, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20675, %r20676, %r20677, %r20254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20671, %r20677, %r20676, %r20254; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r20671, %r20675}; + // begin inline asm + shf.l.wrap.b32 %r20679, %r20685, %r20684, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20683, %r20684, %r20685, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20687, %r20693, %r20692, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20691, %r20692, %r20693, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20695, %r20701, %r20700, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20699, %r20700, %r20701, %r20310; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20703, %r20738, %r20631, %r20655, 0xD2; + lop3.b32 %r20704, %r20741, %r20635, %r20659, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30951, %r20631, %r20655, %r20687, 0xD2; + lop3.b32 %r30952, %r20635, %r20659, %r20691, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30951, %r30952}; + // begin inline asm + // chi + lop3.b32 %r30947, %r20655, %r20687, %r20663, 0xD2; + lop3.b32 %r30948, %r20659, %r20691, %r20667, 0xD2; + // end inline asm 
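+ // This straight-line block is the 24th and final round of keccak-f[1600]:
+ // the $L__BB2_65 loop above only runs while the round index is below 23, so
+ // the last round is peeled here, and its iota constant is loaded directly
+ // from keccak_round_constants + 184 (entry 23 of 24, 8 bytes each). Each chi
+ // output pair goes straight back into the local state buffer, as in the
+ // st.local.v2.u32 below.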
+ st.local.v2.u32 [%rd2+40], {%r30947, %r30948}; + // begin inline asm + // chi + lop3.b32 %r30943, %r20687, %r20663, %r20738, 0xD2; + lop3.b32 %r30944, %r20691, %r20667, %r20741, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30943, %r30944}; + // begin inline asm + // chi + lop3.b32 %r30941, %r20663, %r20738, %r20631, 0xD2; + lop3.b32 %r30942, %r20667, %r20741, %r20635, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30941, %r30942}; + // begin inline asm + // chi + lop3.b32 %r30937, %r20679, %r20639, %r20695, 0xD2; + lop3.b32 %r30938, %r20683, %r20643, %r20699, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30937, %r30938}; + // begin inline asm + // chi + lop3.b32 %r30949, %r20639, %r20695, %r20671, 0xD2; + lop3.b32 %r30950, %r20643, %r20699, %r20675, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30949, %r30950}; + // begin inline asm + // chi + lop3.b32 %r30945, %r20695, %r20671, %r20647, 0xD2; + lop3.b32 %r30946, %r20699, %r20675, %r20651, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30945, %r30946}; + add.s64 %rd902, %rd901, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r20767,%r20768}, [%rd902]; + // end inline asm + xor.b32 %r30939, %r20703, %r20767; + xor.b32 %r30940, %r20704, %r20768; + add.u64 %rd908, %SPL, 1912; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + st.local.u64 [%rd908], %rd361; + mov.u64 %rd909, 1179641; + st.local.u64 [%rd908+8], %rd909; + add.s32 %r20820, %r3343, 1; + st.local.u32 [%rd908+16], %r20820; + ld.global.u64 %rd910, [%rd221]; + ld.global.u64 %rd911, [%rd221+8]; + ld.global.u64 %rd912, [%rd221+16]; + ld.global.u64 %rd913, [%rd221+24]; + ld.global.u64 %rd914, [%rd221+32]; + ld.global.u64 %rd915, [%rd221+40]; + ld.global.u64 %rd916, [%rd221+48]; + ld.global.u64 %rd917, [%rd221+56]; + st.local.u64 [%rd908+32], %rd911; + st.local.u64 [%rd908+40], %rd912; + st.local.u64 [%rd908+48], %rd913; + st.local.u64 [%rd908+56], %rd914; + st.local.u64 [%rd908+64], %rd915; + st.local.u64 [%rd908+72], %rd916; + st.local.u64 [%rd908+80], %rd917; + cvt.u32.u64 %r20821, %rd910; + xor.b32 %r20822, %r20820, %r20821; + st.local.u64 [%rd908+24], %rd910; + st.local.u32 [%rd908+24], %r20822; + mov.u32 %r30819, 0; + st.local.v2.u32 [%rd908+96], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+104], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+112], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+120], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+128], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+136], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+144], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+152], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+160], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+168], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+176], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+184], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+192], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+200], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+208], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+216], {%r30819, %r30819}; + mov.u32 %r30834, -2147483648; + st.local.v2.u32 [%rd908+88], {%r30833, %r30834}; + ld.local.v2.u32 {%r30855, %r30856}, [%rd908+24]; + mov.b64 {%r30853, %r30854}, %rd915; + shr.u64 %rd918, %rd911, 32; + cvt.u32.u64 %r30867, %rd911; + cvt.u32.u64 %r30868, %rd918; + shr.u64 %rd919, %rd916, 32; + cvt.u32.u64 %r30865, %rd916; + cvt.u32.u64 %r30866, %rd919; + shr.u64 %rd920, %rd912, 32; + cvt.u32.u64 %r30863, %rd912; + cvt.u32.u64 %r30864, %rd920; + shr.u64 %rd921, %rd917, 32; + cvt.u32.u64 
%r30861, %rd917; + cvt.u32.u64 %r30862, %rd921; + shr.u64 %rd922, %rd913, 32; + cvt.u32.u64 %r30859, %rd913; + cvt.u32.u64 %r30860, %rd922; + shr.u64 %rd923, %rd914, 32; + cvt.u32.u64 %r30857, %rd914; + cvt.u32.u64 %r30858, %rd923; + mov.u32 %r30820, %r30819; + mov.u32 %r30821, %r30819; + mov.u32 %r30822, %r30819; + mov.u32 %r30823, %r30819; + mov.u32 %r30824, %r30819; + mov.u32 %r30825, %r30819; + mov.u32 %r30826, %r30819; + mov.u32 %r30827, %r30819; + mov.u32 %r30828, %r30819; + mov.u32 %r30829, %r30819; + mov.u32 %r30830, %r30819; + mov.u32 %r30831, %r30819; + mov.u32 %r30832, %r30819; + mov.u32 %r30835, %r30819; + mov.u32 %r30836, %r30819; + mov.u32 %r30837, %r30819; + mov.u32 %r30838, %r30819; + mov.u32 %r30839, %r30819; + mov.u32 %r30840, %r30819; + mov.u32 %r30841, %r30819; + mov.u32 %r30842, %r30819; + mov.u32 %r30843, %r30819; + mov.u32 %r30844, %r30819; + mov.u32 %r30845, %r30819; + mov.u32 %r30846, %r30819; + mov.u32 %r30847, %r30819; + mov.u32 %r30848, %r30819; + mov.u32 %r30849, %r30819; + mov.u32 %r30850, %r30819; + mov.u32 %r30851, %r30819; + mov.u32 %r30852, %r30819; + mov.u32 %r30869, %r30819; + +$L__BB2_67: + mov.u32 %r29766, 1; + // begin inline asm + // xor5 + lop3.b32 %r20825, %r30855, %r30853, %r30851, 0x96; + lop3.b32 %r20825, %r20825, %r30849, %r30847, 0x96; + lop3.b32 %r20826, %r30856, %r30854, %r30852, 0x96; + lop3.b32 %r20826, %r20826, %r30850, %r30848, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20837, %r30867, %r30865, %r30845, 0x96; + lop3.b32 %r20837, %r20837, %r30843, %r30841, 0x96; + lop3.b32 %r20838, %r30868, %r30866, %r30846, 0x96; + lop3.b32 %r20838, %r20838, %r30844, %r30842, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20849, %r30863, %r30861, %r30839, 0x96; + lop3.b32 %r20849, %r20849, %r30837, %r30835, 0x96; + lop3.b32 %r20850, %r30864, %r30862, %r30840, 0x96; + lop3.b32 %r20850, %r20850, %r30838, %r30836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20861, %r30859, %r30833, %r30831, 0x96; + lop3.b32 %r20861, %r20861, %r30829, %r30827, 0x96; + lop3.b32 %r20862, %r30860, %r30834, %r30832, 0x96; + lop3.b32 %r20862, %r20862, %r30830, %r30828, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20873, %r30857, %r30825, %r30823, 0x96; + lop3.b32 %r20873, %r20873, %r30821, %r30819, 0x96; + lop3.b32 %r20874, %r30858, %r30826, %r30824, 0x96; + lop3.b32 %r20874, %r20874, %r30822, %r30820, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20885, %r20838, %r20837, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20889, %r20837, %r20838, %r29766; + // end inline asm + xor.b32 %r21319, %r20885, %r20873; + xor.b32 %r21320, %r20889, %r20874; + xor.b32 %r21152, %r30855, %r21319; + xor.b32 %r21155, %r30856, %r21320; + xor.b32 %r21059, %r30853, %r21319; + xor.b32 %r21058, %r30854, %r21320; + xor.b32 %r21106, %r30851, %r21319; + xor.b32 %r21107, %r30852, %r21320; + xor.b32 %r21011, %r30849, %r21319; + xor.b32 %r21010, %r30850, %r21320; + xor.b32 %r20962, %r30847, %r21319; + xor.b32 %r20963, %r30848, %r21320; + // begin inline asm + shf.l.wrap.b32 %r20893, %r20850, %r20849, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20897, %r20849, %r20850, %r29766; + // end inline asm + xor.b32 %r21321, %r20893, %r20825; + xor.b32 %r21322, %r20897, %r20826; + xor.b32 %r21114, %r30867, %r21321; + xor.b32 %r21115, %r30868, %r21322; + xor.b32 %r20931, %r30865, %r21321; + xor.b32 %r20930, %r30866, %r21322; + xor.b32 %r21090, 
%r30845, %r21321; + xor.b32 %r21091, %r30846, %r21322; + xor.b32 %r21051, %r30843, %r21321; + xor.b32 %r21050, %r30844, %r21322; + xor.b32 %r21034, %r30841, %r21321; + xor.b32 %r21035, %r30842, %r21322; + // begin inline asm + shf.l.wrap.b32 %r20901, %r20862, %r20861, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20905, %r20861, %r20862, %r29766; + // end inline asm + xor.b32 %r21323, %r20901, %r20837; + xor.b32 %r21324, %r20905, %r20838; + xor.b32 %r20971, %r30863, %r21323; + xor.b32 %r20970, %r30864, %r21324; + xor.b32 %r21098, %r30861, %r21323; + xor.b32 %r21099, %r30862, %r21324; + xor.b32 %r20979, %r30839, %r21323; + xor.b32 %r20978, %r30840, %r21324; + xor.b32 %r21082, %r30837, %r21323; + xor.b32 %r21083, %r30838, %r21324; + xor.b32 %r20947, %r30835, %r21323; + xor.b32 %r20946, %r30836, %r21324; + // begin inline asm + shf.l.wrap.b32 %r20909, %r20874, %r20873, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20913, %r20873, %r20874, %r29766; + // end inline asm + xor.b32 %r21325, %r20909, %r20849; + xor.b32 %r21326, %r20913, %r20850; + xor.b32 %r21066, %r30859, %r21325; + xor.b32 %r21067, %r30860, %r21326; + xor.b32 %r21043, %r30833, %r21325; + xor.b32 %r21042, %r30834, %r21326; + xor.b32 %r20986, %r30831, %r21325; + xor.b32 %r20987, %r30832, %r21326; + xor.b32 %r21074, %r30829, %r21325; + xor.b32 %r21075, %r30830, %r21326; + xor.b32 %r21003, %r30827, %r21325; + xor.b32 %r21002, %r30828, %r21326; + // begin inline asm + shf.l.wrap.b32 %r20917, %r20826, %r20825, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20921, %r20825, %r20826, %r29766; + // end inline asm + xor.b32 %r21327, %r20917, %r20861; + xor.b32 %r21328, %r20921, %r20862; + xor.b32 %r21018, %r30857, %r21327; + xor.b32 %r21019, %r30858, %r21328; + xor.b32 %r20938, %r30825, %r21327; + xor.b32 %r20939, %r30826, %r21328; + xor.b32 %r20955, %r30823, %r21327; + xor.b32 %r20954, %r30824, %r21328; + xor.b32 %r20994, %r30821, %r21327; + xor.b32 %r20995, %r30822, %r21328; + xor.b32 %r21026, %r30819, %r21327; + xor.b32 %r21027, %r30820, %r21328; + mov.u32 %r20932, 44; + // begin inline asm + shf.l.wrap.b32 %r20925, %r20931, %r20930, %r20932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20929, %r20930, %r20931, %r20932; + // end inline asm + mov.u32 %r20940, 20; + // begin inline asm + shf.l.wrap.b32 %r20933, %r20939, %r20938, %r20940; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20937, %r20938, %r20939, %r20940; + // end inline asm + mov.u32 %r20948, 61; + // begin inline asm + shf.l.wrap.b32 %r20941, %r20947, %r20946, %r20948; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20945, %r20946, %r20947, %r20948; + // end inline asm + mov.u32 %r20956, 39; + // begin inline asm + shf.l.wrap.b32 %r20949, %r20955, %r20954, %r20956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20953, %r20954, %r20955, %r20956; + // end inline asm + mov.u32 %r20964, 18; + // begin inline asm + shf.l.wrap.b32 %r20957, %r20963, %r20962, %r20964; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20961, %r20962, %r20963, %r20964; + // end inline asm + mov.u32 %r20972, 62; + // begin inline asm + shf.l.wrap.b32 %r20965, %r20971, %r20970, %r20972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20969, %r20970, %r20971, %r20972; + // end inline asm + mov.u32 %r20980, 43; + // begin inline asm + shf.l.wrap.b32 %r20973, %r20979, %r20978, %r20980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20977, 
%r20978, %r20979, %r20980; + // end inline asm + mov.u32 %r20988, 25; + // begin inline asm + shf.l.wrap.b32 %r20981, %r20987, %r20986, %r20988; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20985, %r20986, %r20987, %r20988; + // end inline asm + mov.u32 %r20996, 8; + // begin inline asm + shf.l.wrap.b32 %r20989, %r20995, %r20994, %r20996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20993, %r20994, %r20995, %r20996; + // end inline asm + mov.u32 %r21004, 56; + // begin inline asm + shf.l.wrap.b32 %r20997, %r21003, %r21002, %r21004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21001, %r21002, %r21003, %r21004; + // end inline asm + mov.u32 %r21012, 41; + // begin inline asm + shf.l.wrap.b32 %r21005, %r21011, %r21010, %r21012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21009, %r21010, %r21011, %r21012; + // end inline asm + mov.u32 %r21020, 27; + // begin inline asm + shf.l.wrap.b32 %r21013, %r21019, %r21018, %r21020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21017, %r21018, %r21019, %r21020; + // end inline asm + mov.u32 %r21028, 14; + // begin inline asm + shf.l.wrap.b32 %r21021, %r21027, %r21026, %r21028; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21025, %r21026, %r21027, %r21028; + // end inline asm + mov.u32 %r21036, 2; + // begin inline asm + shf.l.wrap.b32 %r21029, %r21035, %r21034, %r21036; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21033, %r21034, %r21035, %r21036; + // end inline asm + mov.u32 %r21044, 55; + // begin inline asm + shf.l.wrap.b32 %r21037, %r21043, %r21042, %r21044; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21041, %r21042, %r21043, %r21044; + // end inline asm + mov.u32 %r21052, 45; + // begin inline asm + shf.l.wrap.b32 %r21045, %r21051, %r21050, %r21052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21049, %r21050, %r21051, %r21052; + // end inline asm + mov.u32 %r21060, 36; + // begin inline asm + shf.l.wrap.b32 %r21053, %r21059, %r21058, %r21060; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21057, %r21058, %r21059, %r21060; + // end inline asm + mov.u32 %r21068, 28; + // begin inline asm + shf.l.wrap.b32 %r21061, %r21067, %r21066, %r21068; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21065, %r21066, %r21067, %r21068; + // end inline asm + mov.u32 %r21076, 21; + // begin inline asm + shf.l.wrap.b32 %r21069, %r21075, %r21074, %r21076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21073, %r21074, %r21075, %r21076; + // end inline asm + mov.u32 %r21084, 15; + // begin inline asm + shf.l.wrap.b32 %r21077, %r21083, %r21082, %r21084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21081, %r21082, %r21083, %r21084; + // end inline asm + mov.u32 %r21092, 10; + // begin inline asm + shf.l.wrap.b32 %r21085, %r21091, %r21090, %r21092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21089, %r21090, %r21091, %r21092; + // end inline asm + mov.u32 %r21100, 6; + // begin inline asm + shf.l.wrap.b32 %r21093, %r21099, %r21098, %r21100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21097, %r21098, %r21099, %r21100; + // end inline asm + mov.u32 %r21108, 3; + // begin inline asm + shf.l.wrap.b32 %r21101, %r21107, %r21106, %r21108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21105, %r21106, %r21107, %r21108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21109, %r21115, %r21114, %r29766; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r21113, %r21114, %r21115, %r29766; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21117, %r21152, %r20925, %r20973, 0xD2; + lop3.b32 %r21118, %r21155, %r20929, %r20977, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30867, %r20925, %r20973, %r21069, 0xD2; + lop3.b32 %r30868, %r20929, %r20977, %r21073, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30863, %r20973, %r21069, %r21021, 0xD2; + lop3.b32 %r30864, %r20977, %r21073, %r21025, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30859, %r21069, %r21021, %r21152, 0xD2; + lop3.b32 %r30860, %r21073, %r21025, %r21155, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30857, %r21021, %r21152, %r20925, 0xD2; + lop3.b32 %r30858, %r21025, %r21155, %r20929, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30853, %r21061, %r20933, %r21101, 0xD2; + lop3.b32 %r30854, %r21065, %r20937, %r21105, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30865, %r20933, %r21101, %r21045, 0xD2; + lop3.b32 %r30866, %r20937, %r21105, %r21049, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30861, %r21101, %r21045, %r20941, 0xD2; + lop3.b32 %r30862, %r21105, %r21049, %r20945, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30833, %r21045, %r20941, %r21061, 0xD2; + lop3.b32 %r30834, %r21049, %r20945, %r21065, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r30833, %r30834}; + // begin inline asm + // chi + lop3.b32 %r30825, %r20941, %r21061, %r20933, 0xD2; + lop3.b32 %r30826, %r20945, %r21065, %r20937, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r30825, %r30826}; + // begin inline asm + // chi + lop3.b32 %r30851, %r21109, %r21093, %r20981, 0xD2; + lop3.b32 %r30852, %r21113, %r21097, %r20985, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+104], {%r30851, %r30852}; + // begin inline asm + // chi + lop3.b32 %r30845, %r21093, %r20981, %r20989, 0xD2; + lop3.b32 %r30846, %r21097, %r20985, %r20993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+112], {%r30845, %r30846}; + // begin inline asm + // chi + lop3.b32 %r30839, %r20981, %r20989, %r20957, 0xD2; + lop3.b32 %r30840, %r20985, %r20993, %r20961, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+120], {%r30839, %r30840}; + // begin inline asm + // chi + lop3.b32 %r30831, %r20989, %r20957, %r21109, 0xD2; + lop3.b32 %r30832, %r20993, %r20961, %r21113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+128], {%r30831, %r30832}; + // begin inline asm + // chi + lop3.b32 %r30823, %r20957, %r21109, %r21093, 0xD2; + lop3.b32 %r30824, %r20961, %r21113, %r21097, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+136], {%r30823, %r30824}; + // begin inline asm + // chi + lop3.b32 %r30849, %r21013, %r21053, %r21085, 0xD2; + lop3.b32 %r30850, %r21017, %r21057, %r21089, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+144], {%r30849, %r30850}; + // begin inline asm + // chi + lop3.b32 %r30843, %r21053, %r21085, %r21077, 0xD2; + lop3.b32 %r30844, %r21057, %r21089, %r21081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+152], {%r30843, %r30844}; + // begin inline asm + // chi + lop3.b32 %r30837, %r21085, %r21077, %r20997, 0xD2; + lop3.b32 %r30838, %r21089, %r21081, %r21001, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+160], {%r30837, %r30838}; + // begin inline asm + // chi + lop3.b32 %r30829, %r21077, %r20997, %r21013, 0xD2; + lop3.b32 %r30830, %r21081, 
%r21001, %r21017, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+168], {%r30829, %r30830}; + // begin inline asm + // chi + lop3.b32 %r30821, %r20997, %r21013, %r21053, 0xD2; + lop3.b32 %r30822, %r21001, %r21017, %r21057, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+176], {%r30821, %r30822}; + // begin inline asm + // chi + lop3.b32 %r30847, %r20965, %r21037, %r20949, 0xD2; + lop3.b32 %r30848, %r20969, %r21041, %r20953, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+184], {%r30847, %r30848}; + // begin inline asm + // chi + lop3.b32 %r30841, %r21037, %r20949, %r21005, 0xD2; + lop3.b32 %r30842, %r21041, %r20953, %r21009, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+192], {%r30841, %r30842}; + // begin inline asm + // chi + lop3.b32 %r30835, %r20949, %r21005, %r21029, 0xD2; + lop3.b32 %r30836, %r20953, %r21009, %r21033, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+200], {%r30835, %r30836}; + // begin inline asm + // chi + lop3.b32 %r30827, %r21005, %r21029, %r20965, 0xD2; + lop3.b32 %r30828, %r21009, %r21033, %r20969, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+208], {%r30827, %r30828}; + // begin inline asm + // chi + lop3.b32 %r30819, %r21029, %r20965, %r21037, 0xD2; + lop3.b32 %r30820, %r21033, %r20969, %r21041, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+216], {%r30819, %r30820}; + mul.wide.s32 %rd927, %r30869, 8; + add.s64 %rd924, %rd901, %rd927; + // begin inline asm + ld.global.nc.v2.u32 {%r21317,%r21318}, [%rd924]; + // end inline asm + xor.b32 %r30855, %r21117, %r21317; + xor.b32 %r30856, %r21118, %r21318; + add.s32 %r30869, %r30869, 1; + setp.lt.u32 %p39, %r30869, 23; + @%p39 bra $L__BB2_67; + + mov.u32 %r29764, 3; + mov.u32 %r29763, 21; + mov.u32 %r29762, 28; + mov.u32 %r29761, 45; + mov.u32 %r29760, 14; + mov.u32 %r29759, 43; + mov.u32 %r29758, 61; + mov.u32 %r29757, 20; + mov.u32 %r29756, 44; + mov.u32 %r30902, 0; + mov.u32 %r21428, 1; + st.local.v2.u32 [%rd908+32], {%r30867, %r30868}; + st.local.v2.u32 [%rd908+72], {%r30865, %r30866}; + st.local.v2.u32 [%rd908+40], {%r30863, %r30864}; + st.local.v2.u32 [%rd908+80], {%r30861, %r30862}; + st.local.v2.u32 [%rd908+48], {%r30859, %r30860}; + st.local.v2.u32 [%rd908+56], {%r30857, %r30858}; + st.local.v2.u32 [%rd908+24], {%r30855, %r30856}; + // begin inline asm + // xor5 + lop3.b32 %r21329, %r30855, %r30853, %r30851, 0x96; + lop3.b32 %r21329, %r21329, %r30849, %r30847, 0x96; + lop3.b32 %r21330, %r30856, %r30854, %r30852, 0x96; + lop3.b32 %r21330, %r21330, %r30850, %r30848, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21341, %r30867, %r30865, %r30845, 0x96; + lop3.b32 %r21341, %r21341, %r30843, %r30841, 0x96; + lop3.b32 %r21342, %r30868, %r30866, %r30846, 0x96; + lop3.b32 %r21342, %r21342, %r30844, %r30842, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21353, %r30863, %r30861, %r30839, 0x96; + lop3.b32 %r21353, %r21353, %r30837, %r30835, 0x96; + lop3.b32 %r21354, %r30864, %r30862, %r30840, 0x96; + lop3.b32 %r21354, %r21354, %r30838, %r30836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21365, %r30859, %r30833, %r30831, 0x96; + lop3.b32 %r21365, %r21365, %r30829, %r30827, 0x96; + lop3.b32 %r21366, %r30860, %r30834, %r30832, 0x96; + lop3.b32 %r21366, %r21366, %r30830, %r30828, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21377, %r30857, %r30825, %r30823, 0x96; + lop3.b32 %r21377, %r21377, %r30821, %r30819, 0x96; + lop3.b32 %r21378, %r30858, %r30826, %r30824, 0x96; + lop3.b32 %r21378, 
%r21378, %r30822, %r30820, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21389, %r21342, %r21341, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21393, %r21341, %r21342, %r21428; + // end inline asm + xor.b32 %r21568, %r21389, %r21377; + xor.b32 %r21569, %r21393, %r21378; + xor.b32 %r21536, %r30855, %r21568; + xor.b32 %r21539, %r30856, %r21569; + xor.b32 %r21499, %r30852, %r21569; + xor.b32 %r21498, %r30851, %r21568; + st.local.v2.u32 [%rd908+104], {%r21498, %r21499}; + // begin inline asm + shf.l.wrap.b32 %r21397, %r21354, %r21353, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21401, %r21353, %r21354, %r21428; + // end inline asm + xor.b32 %r21570, %r21397, %r21329; + xor.b32 %r21571, %r21401, %r21330; + xor.b32 %r21435, %r30865, %r21570; + xor.b32 %r21434, %r30866, %r21571; + xor.b32 %r21474, %r30844, %r21571; + xor.b32 %r21475, %r30843, %r21570; + st.local.v2.u32 [%rd908+152], {%r21475, %r21474}; + // begin inline asm + shf.l.wrap.b32 %r21405, %r21366, %r21365, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21409, %r21365, %r21366, %r21428; + // end inline asm + xor.b32 %r21572, %r21405, %r21341; + xor.b32 %r21573, %r21409, %r21342; + xor.b32 %r21458, %r30840, %r21573; + xor.b32 %r21459, %r30839, %r21572; + st.local.v2.u32 [%rd908+120], {%r21459, %r21458}; + xor.b32 %r21450, %r30836, %r21573; + xor.b32 %r21451, %r30835, %r21572; + st.local.v2.u32 [%rd908+200], {%r21451, %r21450}; + // begin inline asm + shf.l.wrap.b32 %r21413, %r21378, %r21377, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21417, %r21377, %r21378, %r21428; + // end inline asm + xor.b32 %r21574, %r21413, %r21353; + xor.b32 %r21575, %r21417, %r21354; + xor.b32 %r21482, %r30859, %r21574; + xor.b32 %r21483, %r30860, %r21575; + xor.b32 %r21491, %r30830, %r21575; + xor.b32 %r21490, %r30829, %r21574; + st.local.v2.u32 [%rd908+168], {%r21490, %r21491}; + // begin inline asm + shf.l.wrap.b32 %r21421, %r21330, %r21329, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21425, %r21329, %r21330, %r21428; + // end inline asm + xor.b32 %r21576, %r21421, %r21365; + xor.b32 %r21577, %r21425, %r21366; + xor.b32 %r21442, %r30825, %r21576; + xor.b32 %r21443, %r30826, %r21577; + xor.b32 %r21467, %r30820, %r21577; + xor.b32 %r21466, %r30819, %r21576; + st.local.v2.u32 [%rd908+216], {%r21466, %r21467}; + // begin inline asm + shf.l.wrap.b32 %r21429, %r21435, %r21434, %r29756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21433, %r21434, %r21435, %r29756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21437, %r21443, %r21442, %r29757; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21441, %r21442, %r21443, %r29757; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21449, %r21450, %r21451, %r29758; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21445, %r21451, %r21450, %r29758; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r21445, %r21449}; + // begin inline asm + shf.l.wrap.b32 %r21453, %r21459, %r21458, %r29759; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21457, %r21458, %r21459, %r29759; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21461, %r21467, %r21466, %r29760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21465, %r21466, %r21467, %r29760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21473, %r21474, %r21475, %r29761; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r21469, %r21475, %r21474, %r29761; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r21469, %r21473}; + // begin inline asm + shf.l.wrap.b32 %r21477, %r21483, %r21482, %r29762; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21481, %r21482, %r21483, %r29762; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21485, %r21491, %r21490, %r29763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21489, %r21490, %r21491, %r29763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21493, %r21499, %r21498, %r29764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21497, %r21498, %r21499, %r29764; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21501, %r21536, %r21429, %r21453, 0xD2; + lop3.b32 %r21502, %r21539, %r21433, %r21457, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31002, %r21429, %r21453, %r21485, 0xD2; + lop3.b32 %r31003, %r21433, %r21457, %r21489, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+32], {%r31002, %r31003}; + // begin inline asm + // chi + lop3.b32 %r30998, %r21453, %r21485, %r21461, 0xD2; + lop3.b32 %r30999, %r21457, %r21489, %r21465, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+40], {%r30998, %r30999}; + // begin inline asm + // chi + lop3.b32 %r30994, %r21485, %r21461, %r21536, 0xD2; + lop3.b32 %r30995, %r21489, %r21465, %r21539, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+48], {%r30994, %r30995}; + // begin inline asm + // chi + lop3.b32 %r30992, %r21461, %r21536, %r21429, 0xD2; + lop3.b32 %r30993, %r21465, %r21539, %r21433, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+56], {%r30992, %r30993}; + // begin inline asm + // chi + lop3.b32 %r30988, %r21477, %r21437, %r21493, 0xD2; + lop3.b32 %r30989, %r21481, %r21441, %r21497, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+64], {%r30988, %r30989}; + // begin inline asm + // chi + lop3.b32 %r31000, %r21437, %r21493, %r21469, 0xD2; + lop3.b32 %r31001, %r21441, %r21497, %r21473, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+72], {%r31000, %r31001}; + // begin inline asm + // chi + lop3.b32 %r30996, %r21493, %r21469, %r21445, 0xD2; + lop3.b32 %r30997, %r21497, %r21473, %r21449, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+80], {%r30996, %r30997}; + // begin inline asm + ld.global.nc.v2.u32 {%r21565,%r21566}, [%rd902]; + // end inline asm + xor.b32 %r30990, %r21501, %r21565; + xor.b32 %r30991, %r21502, %r21566; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + add.s64 %rd242, %rd908, 24; + add.s64 %rd243, %rd2, 24; + +$L__BB2_69: + add.s32 %r29765, %r3343, 1; + cvta.to.global.u64 %rd1258, %rd361; + shl.b32 %r21578, %r30902, 2; + cvt.u64.u32 %rd935, %r21578; + and.b64 %rd936, %rd935, 60; + add.s64 %rd937, %rd243, %rd936; + xor.b32 %r21579, %r3343, %r30902; + mul.lo.s32 %r21580, %r21579, 16777619; + ld.local.u32 %r21581, [%rd937]; + xor.b32 %r21582, %r21580, %r21581; + mul.wide.u32 %rd938, %r21582, -954391867; + shr.u64 %rd939, %rd938, 32; + cvt.u32.u64 %r21583, %rd939; + sub.s32 %r21584, %r21582, %r21583; + shr.u32 %r21585, %r21584, 1; + add.s32 %r21586, %r21585, %r21583; + shr.u32 %r21587, %r21586, 20; + mul.lo.s32 %r21588, %r21587, 1179641; + sub.s32 %r21589, %r21582, %r21588; + mul.wide.u32 %rd940, %r21589, 64; + add.s64 %rd941, %rd1258, %rd940; + mul.lo.s32 %r21590, %r30939, 16777619; + ld.global.u32 %r21591, [%rd941]; + xor.b32 %r30939, %r21590, %r21591; + mul.lo.s32 %r21592, %r30940, 16777619; + ld.global.u32 %r21593, [%rd941+4]; + xor.b32 %r30940, %r21592, 
%r21593; + mul.lo.s32 %r21594, %r30951, 16777619; + ld.global.u32 %r21595, [%rd941+8]; + mul.lo.s32 %r21596, %r30952, 16777619; + ld.global.u32 %r21597, [%rd941+12]; + xor.b32 %r21598, %r21596, %r21597; + xor.b32 %r30951, %r21594, %r21595; + mov.b64 %rd942, {%r30951, %r21598}; + mul.lo.s32 %r21599, %r30947, 16777619; + ld.global.u32 %r21600, [%rd941+16]; + mul.lo.s32 %r21601, %r30948, 16777619; + ld.global.u32 %r21602, [%rd941+20]; + xor.b32 %r21603, %r21601, %r21602; + xor.b32 %r30947, %r21599, %r21600; + mov.b64 %rd943, {%r30947, %r21603}; + mul.lo.s32 %r21604, %r30943, 16777619; + ld.global.u32 %r21605, [%rd941+24]; + mul.lo.s32 %r21606, %r30944, 16777619; + ld.global.u32 %r21607, [%rd941+28]; + xor.b32 %r21608, %r21606, %r21607; + xor.b32 %r30943, %r21604, %r21605; + mov.b64 %rd944, {%r30943, %r21608}; + mul.lo.s32 %r21609, %r30941, 16777619; + ld.global.u32 %r21610, [%rd941+32]; + mul.lo.s32 %r21611, %r30942, 16777619; + ld.global.u32 %r21612, [%rd941+36]; + xor.b32 %r21613, %r21611, %r21612; + xor.b32 %r30941, %r21609, %r21610; + mov.b64 %rd945, {%r30941, %r21613}; + mul.lo.s32 %r21614, %r30937, 16777619; + ld.global.u32 %r21615, [%rd941+40]; + xor.b32 %r30937, %r21614, %r21615; + mul.lo.s32 %r21616, %r30938, 16777619; + ld.global.u32 %r21617, [%rd941+44]; + xor.b32 %r30938, %r21616, %r21617; + mul.lo.s32 %r21618, %r30949, 16777619; + ld.global.u32 %r21619, [%rd941+48]; + mul.lo.s32 %r21620, %r30950, 16777619; + ld.global.u32 %r21621, [%rd941+52]; + xor.b32 %r21622, %r21620, %r21621; + xor.b32 %r30949, %r21618, %r21619; + mov.b64 %rd946, {%r30949, %r21622}; + mul.lo.s32 %r21623, %r30945, 16777619; + ld.global.u32 %r21624, [%rd941+56]; + mul.lo.s32 %r21625, %r30946, 16777619; + ld.global.u32 %r21626, [%rd941+60]; + xor.b32 %r21627, %r21625, %r21626; + xor.b32 %r30945, %r21623, %r21624; + mov.b64 %rd947, {%r30945, %r21627}; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + st.local.v2.u32 [%rd2+32], {%r30951, %r21598}; + st.local.v2.u32 [%rd2+40], {%r30947, %r21603}; + st.local.v2.u32 [%rd2+48], {%r30943, %r21608}; + st.local.v2.u32 [%rd2+56], {%r30941, %r21613}; + st.local.v2.u32 [%rd2+64], {%r30937, %r30938}; + st.local.v2.u32 [%rd2+72], {%r30949, %r21622}; + st.local.v2.u32 [%rd2+80], {%r30945, %r21627}; + add.s64 %rd948, %rd242, %rd936; + xor.b32 %r21628, %r29765, %r30902; + mul.lo.s32 %r21629, %r21628, 16777619; + ld.local.u32 %r21630, [%rd948]; + xor.b32 %r21631, %r21629, %r21630; + mul.wide.u32 %rd949, %r21631, -954391867; + shr.u64 %rd950, %rd949, 32; + cvt.u32.u64 %r21632, %rd950; + sub.s32 %r21633, %r21631, %r21632; + shr.u32 %r21634, %r21633, 1; + add.s32 %r21635, %r21634, %r21632; + shr.u32 %r21636, %r21635, 20; + mul.lo.s32 %r21637, %r21636, 1179641; + sub.s32 %r21638, %r21631, %r21637; + mul.wide.u32 %rd951, %r21638, 64; + add.s64 %rd952, %rd1258, %rd951; + mul.lo.s32 %r21639, %r30990, 16777619; + ld.global.u32 %r21640, [%rd952]; + xor.b32 %r30990, %r21639, %r21640; + mul.lo.s32 %r21641, %r30991, 16777619; + ld.global.u32 %r21642, [%rd952+4]; + xor.b32 %r30991, %r21641, %r21642; + mul.lo.s32 %r21643, %r31002, 16777619; + ld.global.u32 %r21644, [%rd952+8]; + mul.lo.s32 %r21645, %r31003, 16777619; + ld.global.u32 %r21646, [%rd952+12]; + xor.b32 %r21647, %r21645, %r21646; + xor.b32 %r31002, %r21643, %r21644; + mov.b64 %rd953, {%r31002, %r21647}; + mul.lo.s32 %r21648, %r30998, 16777619; + ld.global.u32 %r21649, [%rd952+16]; + mul.lo.s32 %r21650, %r30999, 16777619; + ld.global.u32 %r21651, [%rd952+20]; + xor.b32 %r21652, %r21650, %r21651; + xor.b32 %r30998, %r21648, %r21649; 
+ mov.b64 %rd954, {%r30998, %r21652}; + mul.lo.s32 %r21653, %r30994, 16777619; + ld.global.u32 %r21654, [%rd952+24]; + mul.lo.s32 %r21655, %r30995, 16777619; + ld.global.u32 %r21656, [%rd952+28]; + xor.b32 %r21657, %r21655, %r21656; + xor.b32 %r30994, %r21653, %r21654; + mov.b64 %rd955, {%r30994, %r21657}; + mul.lo.s32 %r21658, %r30992, 16777619; + ld.global.u32 %r21659, [%rd952+32]; + mul.lo.s32 %r21660, %r30993, 16777619; + ld.global.u32 %r21661, [%rd952+36]; + xor.b32 %r21662, %r21660, %r21661; + xor.b32 %r30992, %r21658, %r21659; + mov.b64 %rd956, {%r30992, %r21662}; + mul.lo.s32 %r21663, %r30988, 16777619; + ld.global.u32 %r21664, [%rd952+40]; + xor.b32 %r30988, %r21663, %r21664; + mul.lo.s32 %r21665, %r30989, 16777619; + ld.global.u32 %r21666, [%rd952+44]; + xor.b32 %r30989, %r21665, %r21666; + mul.lo.s32 %r21667, %r31000, 16777619; + ld.global.u32 %r21668, [%rd952+48]; + mul.lo.s32 %r21669, %r31001, 16777619; + ld.global.u32 %r21670, [%rd952+52]; + xor.b32 %r21671, %r21669, %r21670; + xor.b32 %r31000, %r21667, %r21668; + mov.b64 %rd957, {%r31000, %r21671}; + mul.lo.s32 %r21672, %r30996, 16777619; + ld.global.u32 %r21673, [%rd952+56]; + mul.lo.s32 %r21674, %r30997, 16777619; + ld.global.u32 %r21675, [%rd952+60]; + xor.b32 %r21676, %r21674, %r21675; + xor.b32 %r30996, %r21672, %r21673; + mov.b64 %rd958, {%r30996, %r21676}; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + st.local.v2.u32 [%rd908+32], {%r31002, %r21647}; + st.local.v2.u32 [%rd908+40], {%r30998, %r21652}; + st.local.v2.u32 [%rd908+48], {%r30994, %r21657}; + st.local.v2.u32 [%rd908+56], {%r30992, %r21662}; + st.local.v2.u32 [%rd908+64], {%r30988, %r30989}; + st.local.v2.u32 [%rd908+72], {%r31000, %r21671}; + st.local.v2.u32 [%rd908+80], {%r30996, %r21676}; + add.s32 %r30902, %r30902, 1; + setp.lt.u32 %p40, %r30902, 512; + shr.u64 %rd959, %rd942, 32; + cvt.u32.u64 %r30952, %rd959; + shr.u64 %rd960, %rd943, 32; + cvt.u32.u64 %r30948, %rd960; + shr.u64 %rd961, %rd944, 32; + cvt.u32.u64 %r30944, %rd961; + shr.u64 %rd962, %rd945, 32; + cvt.u32.u64 %r30942, %rd962; + shr.u64 %rd963, %rd946, 32; + cvt.u32.u64 %r30950, %rd963; + shr.u64 %rd964, %rd947, 32; + cvt.u32.u64 %r30946, %rd964; + shr.u64 %rd965, %rd953, 32; + cvt.u32.u64 %r31003, %rd965; + shr.u64 %rd966, %rd954, 32; + cvt.u32.u64 %r30999, %rd966; + shr.u64 %rd967, %rd955, 32; + cvt.u32.u64 %r30995, %rd967; + shr.u64 %rd968, %rd956, 32; + cvt.u32.u64 %r30993, %rd968; + shr.u64 %rd969, %rd957, 32; + cvt.u32.u64 %r31001, %rd969; + shr.u64 %rd970, %rd958, 32; + cvt.u32.u64 %r30997, %rd970; + @%p40 bra $L__BB2_69; + + mov.u32 %r30903, 0; + st.local.v2.u32 [%rd2+96], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+104], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+112], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+120], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+128], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+136], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+144], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+152], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+160], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+168], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+176], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+184], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+192], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+200], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+208], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+216], {%r30903, %r30903}; + mov.u32 %r30918, -2147483648; + mov.u32 %r30917, 1; + st.local.v2.u32 [%rd2+88], {%r30917, %r30918}; + mov.u32 %r30904, %r30903; + mov.u32 %r30905, 
%r30903; + mov.u32 %r30906, %r30903; + mov.u32 %r30907, %r30903; + mov.u32 %r30908, %r30903; + mov.u32 %r30909, %r30903; + mov.u32 %r30910, %r30903; + mov.u32 %r30911, %r30903; + mov.u32 %r30912, %r30903; + mov.u32 %r30913, %r30903; + mov.u32 %r30914, %r30903; + mov.u32 %r30915, %r30903; + mov.u32 %r30916, %r30903; + mov.u32 %r30919, %r30903; + mov.u32 %r30920, %r30903; + mov.u32 %r30921, %r30903; + mov.u32 %r30922, %r30903; + mov.u32 %r30923, %r30903; + mov.u32 %r30924, %r30903; + mov.u32 %r30925, %r30903; + mov.u32 %r30926, %r30903; + mov.u32 %r30927, %r30903; + mov.u32 %r30928, %r30903; + mov.u32 %r30929, %r30903; + mov.u32 %r30930, %r30903; + mov.u32 %r30931, %r30903; + mov.u32 %r30932, %r30903; + mov.u32 %r30933, %r30903; + mov.u32 %r30934, %r30903; + mov.u32 %r30935, %r30903; + mov.u32 %r30936, %r30903; + mov.u32 %r30953, %r30903; + +$L__BB2_71: + mov.u32 %r29776, 1; + mov.u64 %rd1281, keccak_round_constants; + cvta.const.u64 %rd1280, %rd1281; + // begin inline asm + // xor5 + lop3.b32 %r21718, %r30939, %r30937, %r30935, 0x96; + lop3.b32 %r21718, %r21718, %r30933, %r30931, 0x96; + lop3.b32 %r21719, %r30940, %r30938, %r30936, 0x96; + lop3.b32 %r21719, %r21719, %r30934, %r30932, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21730, %r30951, %r30949, %r30929, 0x96; + lop3.b32 %r21730, %r21730, %r30927, %r30925, 0x96; + lop3.b32 %r21731, %r30952, %r30950, %r30930, 0x96; + lop3.b32 %r21731, %r21731, %r30928, %r30926, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21742, %r30947, %r30945, %r30923, 0x96; + lop3.b32 %r21742, %r21742, %r30921, %r30919, 0x96; + lop3.b32 %r21743, %r30948, %r30946, %r30924, 0x96; + lop3.b32 %r21743, %r21743, %r30922, %r30920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21754, %r30943, %r30917, %r30915, 0x96; + lop3.b32 %r21754, %r21754, %r30913, %r30911, 0x96; + lop3.b32 %r21755, %r30944, %r30918, %r30916, 0x96; + lop3.b32 %r21755, %r21755, %r30914, %r30912, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21766, %r30941, %r30909, %r30907, 0x96; + lop3.b32 %r21766, %r21766, %r30905, %r30903, 0x96; + lop3.b32 %r21767, %r30942, %r30910, %r30908, 0x96; + lop3.b32 %r21767, %r21767, %r30906, %r30904, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21778, %r21731, %r21730, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21782, %r21730, %r21731, %r29776; + // end inline asm + xor.b32 %r22212, %r21778, %r21766; + xor.b32 %r22213, %r21782, %r21767; + xor.b32 %r22045, %r30939, %r22212; + xor.b32 %r22048, %r30940, %r22213; + xor.b32 %r21952, %r30937, %r22212; + xor.b32 %r21951, %r30938, %r22213; + xor.b32 %r21999, %r30935, %r22212; + xor.b32 %r22000, %r30936, %r22213; + xor.b32 %r21904, %r30933, %r22212; + xor.b32 %r21903, %r30934, %r22213; + xor.b32 %r21855, %r30931, %r22212; + xor.b32 %r21856, %r30932, %r22213; + // begin inline asm + shf.l.wrap.b32 %r21786, %r21743, %r21742, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21790, %r21742, %r21743, %r29776; + // end inline asm + xor.b32 %r22214, %r21786, %r21718; + xor.b32 %r22215, %r21790, %r21719; + xor.b32 %r22007, %r30951, %r22214; + xor.b32 %r22008, %r30952, %r22215; + xor.b32 %r21824, %r30949, %r22214; + xor.b32 %r21823, %r30950, %r22215; + xor.b32 %r21983, %r30929, %r22214; + xor.b32 %r21984, %r30930, %r22215; + xor.b32 %r21944, %r30927, %r22214; + xor.b32 %r21943, %r30928, %r22215; + xor.b32 %r21927, %r30925, %r22214; + xor.b32 %r21928, %r30926, %r22215; + 
// begin inline asm + shf.l.wrap.b32 %r21794, %r21755, %r21754, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21798, %r21754, %r21755, %r29776; + // end inline asm + xor.b32 %r22216, %r21794, %r21730; + xor.b32 %r22217, %r21798, %r21731; + xor.b32 %r21864, %r30947, %r22216; + xor.b32 %r21863, %r30948, %r22217; + xor.b32 %r21991, %r30945, %r22216; + xor.b32 %r21992, %r30946, %r22217; + xor.b32 %r21872, %r30923, %r22216; + xor.b32 %r21871, %r30924, %r22217; + xor.b32 %r21975, %r30921, %r22216; + xor.b32 %r21976, %r30922, %r22217; + xor.b32 %r21840, %r30919, %r22216; + xor.b32 %r21839, %r30920, %r22217; + // begin inline asm + shf.l.wrap.b32 %r21802, %r21767, %r21766, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21806, %r21766, %r21767, %r29776; + // end inline asm + xor.b32 %r22218, %r21802, %r21742; + xor.b32 %r22219, %r21806, %r21743; + xor.b32 %r21959, %r30943, %r22218; + xor.b32 %r21960, %r30944, %r22219; + xor.b32 %r21936, %r30917, %r22218; + xor.b32 %r21935, %r30918, %r22219; + xor.b32 %r21879, %r30915, %r22218; + xor.b32 %r21880, %r30916, %r22219; + xor.b32 %r21967, %r30913, %r22218; + xor.b32 %r21968, %r30914, %r22219; + xor.b32 %r21896, %r30911, %r22218; + xor.b32 %r21895, %r30912, %r22219; + // begin inline asm + shf.l.wrap.b32 %r21810, %r21719, %r21718, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21814, %r21718, %r21719, %r29776; + // end inline asm + xor.b32 %r22220, %r21810, %r21754; + xor.b32 %r22221, %r21814, %r21755; + xor.b32 %r21911, %r30941, %r22220; + xor.b32 %r21912, %r30942, %r22221; + xor.b32 %r21831, %r30909, %r22220; + xor.b32 %r21832, %r30910, %r22221; + xor.b32 %r21848, %r30907, %r22220; + xor.b32 %r21847, %r30908, %r22221; + xor.b32 %r21887, %r30905, %r22220; + xor.b32 %r21888, %r30906, %r22221; + xor.b32 %r21919, %r30903, %r22220; + xor.b32 %r21920, %r30904, %r22221; + mov.u32 %r21825, 44; + // begin inline asm + shf.l.wrap.b32 %r21818, %r21824, %r21823, %r21825; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21822, %r21823, %r21824, %r21825; + // end inline asm + mov.u32 %r21833, 20; + // begin inline asm + shf.l.wrap.b32 %r21826, %r21832, %r21831, %r21833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21830, %r21831, %r21832, %r21833; + // end inline asm + mov.u32 %r21841, 61; + // begin inline asm + shf.l.wrap.b32 %r21834, %r21840, %r21839, %r21841; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21838, %r21839, %r21840, %r21841; + // end inline asm + mov.u32 %r21849, 39; + // begin inline asm + shf.l.wrap.b32 %r21842, %r21848, %r21847, %r21849; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21846, %r21847, %r21848, %r21849; + // end inline asm + mov.u32 %r21857, 18; + // begin inline asm + shf.l.wrap.b32 %r21850, %r21856, %r21855, %r21857; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21854, %r21855, %r21856, %r21857; + // end inline asm + mov.u32 %r21865, 62; + // begin inline asm + shf.l.wrap.b32 %r21858, %r21864, %r21863, %r21865; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21862, %r21863, %r21864, %r21865; + // end inline asm + mov.u32 %r21873, 43; + // begin inline asm + shf.l.wrap.b32 %r21866, %r21872, %r21871, %r21873; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21870, %r21871, %r21872, %r21873; + // end inline asm + mov.u32 %r21881, 25; + // begin inline asm + shf.l.wrap.b32 %r21874, %r21880, %r21879, %r21881; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r21878, %r21879, %r21880, %r21881; + // end inline asm + mov.u32 %r21889, 8; + // begin inline asm + shf.l.wrap.b32 %r21882, %r21888, %r21887, %r21889; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21886, %r21887, %r21888, %r21889; + // end inline asm + mov.u32 %r21897, 56; + // begin inline asm + shf.l.wrap.b32 %r21890, %r21896, %r21895, %r21897; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21894, %r21895, %r21896, %r21897; + // end inline asm + mov.u32 %r21905, 41; + // begin inline asm + shf.l.wrap.b32 %r21898, %r21904, %r21903, %r21905; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21902, %r21903, %r21904, %r21905; + // end inline asm + mov.u32 %r21913, 27; + // begin inline asm + shf.l.wrap.b32 %r21906, %r21912, %r21911, %r21913; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21910, %r21911, %r21912, %r21913; + // end inline asm + mov.u32 %r21921, 14; + // begin inline asm + shf.l.wrap.b32 %r21914, %r21920, %r21919, %r21921; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21918, %r21919, %r21920, %r21921; + // end inline asm + mov.u32 %r21929, 2; + // begin inline asm + shf.l.wrap.b32 %r21922, %r21928, %r21927, %r21929; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21926, %r21927, %r21928, %r21929; + // end inline asm + mov.u32 %r21937, 55; + // begin inline asm + shf.l.wrap.b32 %r21930, %r21936, %r21935, %r21937; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21934, %r21935, %r21936, %r21937; + // end inline asm + mov.u32 %r21945, 45; + // begin inline asm + shf.l.wrap.b32 %r21938, %r21944, %r21943, %r21945; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21942, %r21943, %r21944, %r21945; + // end inline asm + mov.u32 %r21953, 36; + // begin inline asm + shf.l.wrap.b32 %r21946, %r21952, %r21951, %r21953; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21950, %r21951, %r21952, %r21953; + // end inline asm + mov.u32 %r21961, 28; + // begin inline asm + shf.l.wrap.b32 %r21954, %r21960, %r21959, %r21961; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21958, %r21959, %r21960, %r21961; + // end inline asm + mov.u32 %r21969, 21; + // begin inline asm + shf.l.wrap.b32 %r21962, %r21968, %r21967, %r21969; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21966, %r21967, %r21968, %r21969; + // end inline asm + mov.u32 %r21977, 15; + // begin inline asm + shf.l.wrap.b32 %r21970, %r21976, %r21975, %r21977; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21974, %r21975, %r21976, %r21977; + // end inline asm + mov.u32 %r21985, 10; + // begin inline asm + shf.l.wrap.b32 %r21978, %r21984, %r21983, %r21985; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21982, %r21983, %r21984, %r21985; + // end inline asm + mov.u32 %r21993, 6; + // begin inline asm + shf.l.wrap.b32 %r21986, %r21992, %r21991, %r21993; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21990, %r21991, %r21992, %r21993; + // end inline asm + mov.u32 %r22001, 3; + // begin inline asm + shf.l.wrap.b32 %r21994, %r22000, %r21999, %r22001; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21998, %r21999, %r22000, %r22001; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22002, %r22008, %r22007, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22006, %r22007, %r22008, %r29776; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22010, %r22045, %r21818, %r21866, 0xD2; + lop3.b32 %r22011, %r22048, 
%r21822, %r21870, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30951, %r21818, %r21866, %r21962, 0xD2; + lop3.b32 %r30952, %r21822, %r21870, %r21966, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30947, %r21866, %r21962, %r21914, 0xD2; + lop3.b32 %r30948, %r21870, %r21966, %r21918, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30943, %r21962, %r21914, %r22045, 0xD2; + lop3.b32 %r30944, %r21966, %r21918, %r22048, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30941, %r21914, %r22045, %r21818, 0xD2; + lop3.b32 %r30942, %r21918, %r22048, %r21822, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30937, %r21954, %r21826, %r21994, 0xD2; + lop3.b32 %r30938, %r21958, %r21830, %r21998, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30949, %r21826, %r21994, %r21938, 0xD2; + lop3.b32 %r30950, %r21830, %r21998, %r21942, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30945, %r21994, %r21938, %r21834, 0xD2; + lop3.b32 %r30946, %r21998, %r21942, %r21838, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30917, %r21938, %r21834, %r21954, 0xD2; + lop3.b32 %r30918, %r21942, %r21838, %r21958, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30917, %r30918}; + // begin inline asm + // chi + lop3.b32 %r30909, %r21834, %r21954, %r21826, 0xD2; + lop3.b32 %r30910, %r21838, %r21958, %r21830, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30909, %r30910}; + // begin inline asm + // chi + lop3.b32 %r30935, %r22002, %r21986, %r21874, 0xD2; + lop3.b32 %r30936, %r22006, %r21990, %r21878, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30935, %r30936}; + // begin inline asm + // chi + lop3.b32 %r30929, %r21986, %r21874, %r21882, 0xD2; + lop3.b32 %r30930, %r21990, %r21878, %r21886, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30929, %r30930}; + // begin inline asm + // chi + lop3.b32 %r30923, %r21874, %r21882, %r21850, 0xD2; + lop3.b32 %r30924, %r21878, %r21886, %r21854, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30923, %r30924}; + // begin inline asm + // chi + lop3.b32 %r30915, %r21882, %r21850, %r22002, 0xD2; + lop3.b32 %r30916, %r21886, %r21854, %r22006, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30915, %r30916}; + // begin inline asm + // chi + lop3.b32 %r30907, %r21850, %r22002, %r21986, 0xD2; + lop3.b32 %r30908, %r21854, %r22006, %r21990, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30907, %r30908}; + // begin inline asm + // chi + lop3.b32 %r30933, %r21906, %r21946, %r21978, 0xD2; + lop3.b32 %r30934, %r21910, %r21950, %r21982, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30933, %r30934}; + // begin inline asm + // chi + lop3.b32 %r30927, %r21946, %r21978, %r21970, 0xD2; + lop3.b32 %r30928, %r21950, %r21982, %r21974, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30927, %r30928}; + // begin inline asm + // chi + lop3.b32 %r30921, %r21978, %r21970, %r21890, 0xD2; + lop3.b32 %r30922, %r21982, %r21974, %r21894, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30921, %r30922}; + // begin inline asm + // chi + lop3.b32 %r30913, %r21970, %r21890, %r21906, 0xD2; + lop3.b32 %r30914, %r21974, %r21894, %r21910, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30913, %r30914}; + // begin inline asm + // chi + lop3.b32 %r30905, %r21890, %r21906, %r21946, 0xD2; + lop3.b32 %r30906, %r21894, %r21910, %r21950, 0xD2; 
+ // end inline asm + st.local.v2.u32 [%rd2+176], {%r30905, %r30906}; + // begin inline asm + // chi + lop3.b32 %r30931, %r21858, %r21930, %r21842, 0xD2; + lop3.b32 %r30932, %r21862, %r21934, %r21846, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30931, %r30932}; + // begin inline asm + // chi + lop3.b32 %r30925, %r21930, %r21842, %r21898, 0xD2; + lop3.b32 %r30926, %r21934, %r21846, %r21902, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30925, %r30926}; + // begin inline asm + // chi + lop3.b32 %r30919, %r21842, %r21898, %r21922, 0xD2; + lop3.b32 %r30920, %r21846, %r21902, %r21926, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30919, %r30920}; + // begin inline asm + // chi + lop3.b32 %r30911, %r21898, %r21922, %r21858, 0xD2; + lop3.b32 %r30912, %r21902, %r21926, %r21862, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30911, %r30912}; + // begin inline asm + // chi + lop3.b32 %r30903, %r21922, %r21858, %r21930, 0xD2; + lop3.b32 %r30904, %r21926, %r21862, %r21934, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30903, %r30904}; + mul.wide.s32 %rd974, %r30953, 8; + add.s64 %rd973, %rd1280, %rd974; + // begin inline asm + ld.global.nc.v2.u32 {%r22210,%r22211}, [%rd973]; + // end inline asm + xor.b32 %r30939, %r22010, %r22210; + xor.b32 %r30940, %r22011, %r22211; + add.s32 %r30953, %r30953, 1; + setp.lt.u32 %p41, %r30953, 23; + @%p41 bra $L__BB2_71; + + mov.u32 %r29775, 3; + mov.u32 %r29774, 21; + mov.u32 %r29773, 28; + mov.u32 %r29772, 45; + mov.u32 %r29771, 14; + mov.u32 %r29770, 43; + mov.u32 %r29769, 61; + mov.u32 %r29768, 20; + mov.u32 %r29767, 44; + mov.u64 %rd1274, keccak_round_constants; + cvta.const.u64 %rd1273, %rd1274; + add.s64 %rd1272, %rd1273, 184; + st.local.v2.u32 [%rd2+32], {%r30951, %r30952}; + st.local.v2.u32 [%rd2+72], {%r30949, %r30950}; + st.local.v2.u32 [%rd2+40], {%r30947, %r30948}; + st.local.v2.u32 [%rd2+80], {%r30945, %r30946}; + st.local.v2.u32 [%rd2+48], {%r30943, %r30944}; + st.local.v2.u32 [%rd2+56], {%r30941, %r30942}; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + // begin inline asm + // xor5 + lop3.b32 %r22222, %r30939, %r30937, %r30935, 0x96; + lop3.b32 %r22222, %r22222, %r30933, %r30931, 0x96; + lop3.b32 %r22223, %r30940, %r30938, %r30936, 0x96; + lop3.b32 %r22223, %r22223, %r30934, %r30932, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22234, %r30951, %r30949, %r30929, 0x96; + lop3.b32 %r22234, %r22234, %r30927, %r30925, 0x96; + lop3.b32 %r22235, %r30952, %r30950, %r30930, 0x96; + lop3.b32 %r22235, %r22235, %r30928, %r30926, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22246, %r30947, %r30945, %r30923, 0x96; + lop3.b32 %r22246, %r22246, %r30921, %r30919, 0x96; + lop3.b32 %r22247, %r30948, %r30946, %r30924, 0x96; + lop3.b32 %r22247, %r22247, %r30922, %r30920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22258, %r30943, %r30917, %r30915, 0x96; + lop3.b32 %r22258, %r22258, %r30913, %r30911, 0x96; + lop3.b32 %r22259, %r30944, %r30918, %r30916, 0x96; + lop3.b32 %r22259, %r22259, %r30914, %r30912, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22270, %r30941, %r30909, %r30907, 0x96; + lop3.b32 %r22270, %r22270, %r30905, %r30903, 0x96; + lop3.b32 %r22271, %r30942, %r30910, %r30908, 0x96; + lop3.b32 %r22271, %r22271, %r30906, %r30904, 0x96; + // end inline asm + mov.u32 %r30968, 1; + // begin inline asm + shf.l.wrap.b32 %r22282, %r22235, %r22234, %r30968; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r22286, %r22234, %r22235, %r30968; + // end inline asm + xor.b32 %r22501, %r22282, %r22270; + xor.b32 %r22502, %r22286, %r22271; + xor.b32 %r22429, %r30939, %r22501; + xor.b32 %r22432, %r30940, %r22502; + xor.b32 %r22392, %r30936, %r22502; + xor.b32 %r22391, %r30935, %r22501; + st.local.v2.u32 [%rd2+104], {%r22391, %r22392}; + // begin inline asm + shf.l.wrap.b32 %r22290, %r22247, %r22246, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22294, %r22246, %r22247, %r30968; + // end inline asm + xor.b32 %r22503, %r22290, %r22222; + xor.b32 %r22504, %r22294, %r22223; + xor.b32 %r22328, %r30949, %r22503; + xor.b32 %r22327, %r30950, %r22504; + xor.b32 %r22367, %r30928, %r22504; + xor.b32 %r22368, %r30927, %r22503; + st.local.v2.u32 [%rd2+152], {%r22368, %r22367}; + // begin inline asm + shf.l.wrap.b32 %r22298, %r22259, %r22258, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22302, %r22258, %r22259, %r30968; + // end inline asm + xor.b32 %r22505, %r22298, %r22234; + xor.b32 %r22506, %r22302, %r22235; + xor.b32 %r22351, %r30924, %r22506; + xor.b32 %r22352, %r30923, %r22505; + st.local.v2.u32 [%rd2+120], {%r22352, %r22351}; + xor.b32 %r22343, %r30920, %r22506; + xor.b32 %r22344, %r30919, %r22505; + st.local.v2.u32 [%rd2+200], {%r22344, %r22343}; + // begin inline asm + shf.l.wrap.b32 %r22306, %r22271, %r22270, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22310, %r22270, %r22271, %r30968; + // end inline asm + xor.b32 %r22507, %r22306, %r22246; + xor.b32 %r22508, %r22310, %r22247; + xor.b32 %r22375, %r30943, %r22507; + xor.b32 %r22376, %r30944, %r22508; + xor.b32 %r22384, %r30914, %r22508; + xor.b32 %r22383, %r30913, %r22507; + st.local.v2.u32 [%rd2+168], {%r22383, %r22384}; + // begin inline asm + shf.l.wrap.b32 %r22314, %r22223, %r22222, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22318, %r22222, %r22223, %r30968; + // end inline asm + xor.b32 %r22509, %r22314, %r22258; + xor.b32 %r22510, %r22318, %r22259; + xor.b32 %r22335, %r30909, %r22509; + xor.b32 %r22336, %r30910, %r22510; + xor.b32 %r22360, %r30904, %r22510; + xor.b32 %r22359, %r30903, %r22509; + st.local.v2.u32 [%rd2+216], {%r22359, %r22360}; + // begin inline asm + shf.l.wrap.b32 %r22322, %r22328, %r22327, %r29767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22326, %r22327, %r22328, %r29767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22330, %r22336, %r22335, %r29768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22334, %r22335, %r22336, %r29768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22342, %r22343, %r22344, %r29769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22338, %r22344, %r22343, %r29769; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r22338, %r22342}; + // begin inline asm + shf.l.wrap.b32 %r22346, %r22352, %r22351, %r29770; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22350, %r22351, %r22352, %r29770; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22354, %r22360, %r22359, %r29771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22358, %r22359, %r22360, %r29771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22366, %r22367, %r22368, %r29772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22362, %r22368, %r22367, %r29772; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r22362, %r22366}; + // begin inline asm + shf.l.wrap.b32 %r22370, %r22376, %r22375, %r29773; 
+ // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22374, %r22375, %r22376, %r29773; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22378, %r22384, %r22383, %r29774; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22382, %r22383, %r22384, %r29774; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22386, %r22392, %r22391, %r29775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22390, %r22391, %r22392, %r29775; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22394, %r22429, %r22322, %r22346, 0xD2; + lop3.b32 %r22395, %r22432, %r22326, %r22350, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22402, %r22322, %r22346, %r22378, 0xD2; + lop3.b32 %r22403, %r22326, %r22350, %r22382, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r22402, %r22403}; + // begin inline asm + // chi + lop3.b32 %r22410, %r22346, %r22378, %r22354, 0xD2; + lop3.b32 %r22411, %r22350, %r22382, %r22358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r22410, %r22411}; + // begin inline asm + // chi + lop3.b32 %r22418, %r22378, %r22354, %r22429, 0xD2; + lop3.b32 %r22419, %r22382, %r22358, %r22432, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r22418, %r22419}; + // begin inline asm + // chi + lop3.b32 %r22426, %r22354, %r22429, %r22322, 0xD2; + lop3.b32 %r22427, %r22358, %r22432, %r22326, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r22426, %r22427}; + // begin inline asm + // chi + lop3.b32 %r22434, %r22370, %r22330, %r22386, 0xD2; + lop3.b32 %r22435, %r22374, %r22334, %r22390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r22434, %r22435}; + // begin inline asm + // chi + lop3.b32 %r22442, %r22330, %r22386, %r22362, 0xD2; + lop3.b32 %r22443, %r22334, %r22390, %r22366, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r22442, %r22443}; + // begin inline asm + // chi + lop3.b32 %r22450, %r22386, %r22362, %r22338, 0xD2; + lop3.b32 %r22451, %r22390, %r22366, %r22342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r22450, %r22451}; + // begin inline asm + ld.global.nc.v2.u32 {%r22458,%r22459}, [%rd1272]; + // end inline asm + xor.b32 %r22511, %r22395, %r22459; + xor.b32 %r22512, %r22394, %r22458; + mov.b64 %rd1349, {%r22512, %r22511}; + mov.b64 %rd1350, {%r22402, %r22403}; + mov.b64 %rd1351, {%r22410, %r22411}; + mov.b64 %rd1352, {%r22418, %r22419}; + mov.b64 %rd1353, {%r22426, %r22427}; + mov.b64 %rd1354, {%r22434, %r22435}; + mov.b64 %rd1355, {%r22442, %r22443}; + mov.b64 %rd1356, {%r22450, %r22451}; + mov.u32 %r30954, 0; + st.local.v2.u32 [%rd2+24], {%r22512, %r22511}; + st.local.v2.u32 [%rd908+96], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+104], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+112], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+120], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+128], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+136], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+144], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+152], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+160], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+168], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+176], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+184], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+192], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+200], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+208], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+216], {%r30954, %r30954}; + mov.u32 %r30969, -2147483648; + st.local.v2.u32 [%rd908+88], {%r30968, 
%r30969}; + mov.u32 %r30955, %r30954; + mov.u32 %r30956, %r30954; + mov.u32 %r30957, %r30954; + mov.u32 %r30958, %r30954; + mov.u32 %r30959, %r30954; + mov.u32 %r30960, %r30954; + mov.u32 %r30961, %r30954; + mov.u32 %r30962, %r30954; + mov.u32 %r30963, %r30954; + mov.u32 %r30964, %r30954; + mov.u32 %r30965, %r30954; + mov.u32 %r30966, %r30954; + mov.u32 %r30967, %r30954; + mov.u32 %r30970, %r30954; + mov.u32 %r30971, %r30954; + mov.u32 %r30972, %r30954; + mov.u32 %r30973, %r30954; + mov.u32 %r30974, %r30954; + mov.u32 %r30975, %r30954; + mov.u32 %r30976, %r30954; + mov.u32 %r30977, %r30954; + mov.u32 %r30978, %r30954; + mov.u32 %r30979, %r30954; + mov.u32 %r30980, %r30954; + mov.u32 %r30981, %r30954; + mov.u32 %r30982, %r30954; + mov.u32 %r30983, %r30954; + mov.u32 %r30984, %r30954; + mov.u32 %r30985, %r30954; + mov.u32 %r30986, %r30954; + mov.u32 %r30987, %r30954; + mov.u32 %r31004, %r30954; + +$L__BB2_73: + mov.u32 %r29786, 1; + mov.u64 %rd1276, keccak_round_constants; + cvta.const.u64 %rd1275, %rd1276; + // begin inline asm + // xor5 + lop3.b32 %r22513, %r30990, %r30988, %r30986, 0x96; + lop3.b32 %r22513, %r22513, %r30984, %r30982, 0x96; + lop3.b32 %r22514, %r30991, %r30989, %r30987, 0x96; + lop3.b32 %r22514, %r22514, %r30985, %r30983, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22525, %r31002, %r31000, %r30980, 0x96; + lop3.b32 %r22525, %r22525, %r30978, %r30976, 0x96; + lop3.b32 %r22526, %r31003, %r31001, %r30981, 0x96; + lop3.b32 %r22526, %r22526, %r30979, %r30977, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22537, %r30998, %r30996, %r30974, 0x96; + lop3.b32 %r22537, %r22537, %r30972, %r30970, 0x96; + lop3.b32 %r22538, %r30999, %r30997, %r30975, 0x96; + lop3.b32 %r22538, %r22538, %r30973, %r30971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22549, %r30994, %r30968, %r30966, 0x96; + lop3.b32 %r22549, %r22549, %r30964, %r30962, 0x96; + lop3.b32 %r22550, %r30995, %r30969, %r30967, 0x96; + lop3.b32 %r22550, %r22550, %r30965, %r30963, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22561, %r30992, %r30960, %r30958, 0x96; + lop3.b32 %r22561, %r22561, %r30956, %r30954, 0x96; + lop3.b32 %r22562, %r30993, %r30961, %r30959, 0x96; + lop3.b32 %r22562, %r22562, %r30957, %r30955, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22573, %r22526, %r22525, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22577, %r22525, %r22526, %r29786; + // end inline asm + xor.b32 %r23007, %r22573, %r22561; + xor.b32 %r23008, %r22577, %r22562; + xor.b32 %r22840, %r30990, %r23007; + xor.b32 %r22843, %r30991, %r23008; + xor.b32 %r22747, %r30988, %r23007; + xor.b32 %r22746, %r30989, %r23008; + xor.b32 %r22794, %r30986, %r23007; + xor.b32 %r22795, %r30987, %r23008; + xor.b32 %r22699, %r30984, %r23007; + xor.b32 %r22698, %r30985, %r23008; + xor.b32 %r22650, %r30982, %r23007; + xor.b32 %r22651, %r30983, %r23008; + // begin inline asm + shf.l.wrap.b32 %r22581, %r22538, %r22537, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22585, %r22537, %r22538, %r29786; + // end inline asm + xor.b32 %r23009, %r22581, %r22513; + xor.b32 %r23010, %r22585, %r22514; + xor.b32 %r22802, %r31002, %r23009; + xor.b32 %r22803, %r31003, %r23010; + xor.b32 %r22619, %r31000, %r23009; + xor.b32 %r22618, %r31001, %r23010; + xor.b32 %r22778, %r30980, %r23009; + xor.b32 %r22779, %r30981, %r23010; + xor.b32 %r22739, %r30978, %r23009; + xor.b32 %r22738, %r30979, %r23010; + xor.b32 %r22722, 
%r30976, %r23009; + xor.b32 %r22723, %r30977, %r23010; + // begin inline asm + shf.l.wrap.b32 %r22589, %r22550, %r22549, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22593, %r22549, %r22550, %r29786; + // end inline asm + xor.b32 %r23011, %r22589, %r22525; + xor.b32 %r23012, %r22593, %r22526; + xor.b32 %r22659, %r30998, %r23011; + xor.b32 %r22658, %r30999, %r23012; + xor.b32 %r22786, %r30996, %r23011; + xor.b32 %r22787, %r30997, %r23012; + xor.b32 %r22667, %r30974, %r23011; + xor.b32 %r22666, %r30975, %r23012; + xor.b32 %r22770, %r30972, %r23011; + xor.b32 %r22771, %r30973, %r23012; + xor.b32 %r22635, %r30970, %r23011; + xor.b32 %r22634, %r30971, %r23012; + // begin inline asm + shf.l.wrap.b32 %r22597, %r22562, %r22561, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22601, %r22561, %r22562, %r29786; + // end inline asm + xor.b32 %r23013, %r22597, %r22537; + xor.b32 %r23014, %r22601, %r22538; + xor.b32 %r22754, %r30994, %r23013; + xor.b32 %r22755, %r30995, %r23014; + xor.b32 %r22731, %r30968, %r23013; + xor.b32 %r22730, %r30969, %r23014; + xor.b32 %r22674, %r30966, %r23013; + xor.b32 %r22675, %r30967, %r23014; + xor.b32 %r22762, %r30964, %r23013; + xor.b32 %r22763, %r30965, %r23014; + xor.b32 %r22691, %r30962, %r23013; + xor.b32 %r22690, %r30963, %r23014; + // begin inline asm + shf.l.wrap.b32 %r22605, %r22514, %r22513, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22609, %r22513, %r22514, %r29786; + // end inline asm + xor.b32 %r23015, %r22605, %r22549; + xor.b32 %r23016, %r22609, %r22550; + xor.b32 %r22706, %r30992, %r23015; + xor.b32 %r22707, %r30993, %r23016; + xor.b32 %r22626, %r30960, %r23015; + xor.b32 %r22627, %r30961, %r23016; + xor.b32 %r22643, %r30958, %r23015; + xor.b32 %r22642, %r30959, %r23016; + xor.b32 %r22682, %r30956, %r23015; + xor.b32 %r22683, %r30957, %r23016; + xor.b32 %r22714, %r30954, %r23015; + xor.b32 %r22715, %r30955, %r23016; + mov.u32 %r22620, 44; + // begin inline asm + shf.l.wrap.b32 %r22613, %r22619, %r22618, %r22620; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22617, %r22618, %r22619, %r22620; + // end inline asm + mov.u32 %r22628, 20; + // begin inline asm + shf.l.wrap.b32 %r22621, %r22627, %r22626, %r22628; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22625, %r22626, %r22627, %r22628; + // end inline asm + mov.u32 %r22636, 61; + // begin inline asm + shf.l.wrap.b32 %r22629, %r22635, %r22634, %r22636; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22633, %r22634, %r22635, %r22636; + // end inline asm + mov.u32 %r22644, 39; + // begin inline asm + shf.l.wrap.b32 %r22637, %r22643, %r22642, %r22644; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22641, %r22642, %r22643, %r22644; + // end inline asm + mov.u32 %r22652, 18; + // begin inline asm + shf.l.wrap.b32 %r22645, %r22651, %r22650, %r22652; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22649, %r22650, %r22651, %r22652; + // end inline asm + mov.u32 %r22660, 62; + // begin inline asm + shf.l.wrap.b32 %r22653, %r22659, %r22658, %r22660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22657, %r22658, %r22659, %r22660; + // end inline asm + mov.u32 %r22668, 43; + // begin inline asm + shf.l.wrap.b32 %r22661, %r22667, %r22666, %r22668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22665, %r22666, %r22667, %r22668; + // end inline asm + mov.u32 %r22676, 25; + // begin inline asm + shf.l.wrap.b32 %r22669, %r22675, %r22674, %r22676; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r22673, %r22674, %r22675, %r22676; + // end inline asm + mov.u32 %r22684, 8; + // begin inline asm + shf.l.wrap.b32 %r22677, %r22683, %r22682, %r22684; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22681, %r22682, %r22683, %r22684; + // end inline asm + mov.u32 %r22692, 56; + // begin inline asm + shf.l.wrap.b32 %r22685, %r22691, %r22690, %r22692; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22689, %r22690, %r22691, %r22692; + // end inline asm + mov.u32 %r22700, 41; + // begin inline asm + shf.l.wrap.b32 %r22693, %r22699, %r22698, %r22700; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22697, %r22698, %r22699, %r22700; + // end inline asm + mov.u32 %r22708, 27; + // begin inline asm + shf.l.wrap.b32 %r22701, %r22707, %r22706, %r22708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22705, %r22706, %r22707, %r22708; + // end inline asm + mov.u32 %r22716, 14; + // begin inline asm + shf.l.wrap.b32 %r22709, %r22715, %r22714, %r22716; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22713, %r22714, %r22715, %r22716; + // end inline asm + mov.u32 %r22724, 2; + // begin inline asm + shf.l.wrap.b32 %r22717, %r22723, %r22722, %r22724; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22721, %r22722, %r22723, %r22724; + // end inline asm + mov.u32 %r22732, 55; + // begin inline asm + shf.l.wrap.b32 %r22725, %r22731, %r22730, %r22732; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22729, %r22730, %r22731, %r22732; + // end inline asm + mov.u32 %r22740, 45; + // begin inline asm + shf.l.wrap.b32 %r22733, %r22739, %r22738, %r22740; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22737, %r22738, %r22739, %r22740; + // end inline asm + mov.u32 %r22748, 36; + // begin inline asm + shf.l.wrap.b32 %r22741, %r22747, %r22746, %r22748; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22745, %r22746, %r22747, %r22748; + // end inline asm + mov.u32 %r22756, 28; + // begin inline asm + shf.l.wrap.b32 %r22749, %r22755, %r22754, %r22756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22753, %r22754, %r22755, %r22756; + // end inline asm + mov.u32 %r22764, 21; + // begin inline asm + shf.l.wrap.b32 %r22757, %r22763, %r22762, %r22764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22761, %r22762, %r22763, %r22764; + // end inline asm + mov.u32 %r22772, 15; + // begin inline asm + shf.l.wrap.b32 %r22765, %r22771, %r22770, %r22772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22769, %r22770, %r22771, %r22772; + // end inline asm + mov.u32 %r22780, 10; + // begin inline asm + shf.l.wrap.b32 %r22773, %r22779, %r22778, %r22780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22777, %r22778, %r22779, %r22780; + // end inline asm + mov.u32 %r22788, 6; + // begin inline asm + shf.l.wrap.b32 %r22781, %r22787, %r22786, %r22788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22785, %r22786, %r22787, %r22788; + // end inline asm + mov.u32 %r22796, 3; + // begin inline asm + shf.l.wrap.b32 %r22789, %r22795, %r22794, %r22796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22793, %r22794, %r22795, %r22796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22797, %r22803, %r22802, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22801, %r22802, %r22803, %r29786; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22805, %r22840, 
%r22613, %r22661, 0xD2; + lop3.b32 %r22806, %r22843, %r22617, %r22665, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31002, %r22613, %r22661, %r22757, 0xD2; + lop3.b32 %r31003, %r22617, %r22665, %r22761, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30998, %r22661, %r22757, %r22709, 0xD2; + lop3.b32 %r30999, %r22665, %r22761, %r22713, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30994, %r22757, %r22709, %r22840, 0xD2; + lop3.b32 %r30995, %r22761, %r22713, %r22843, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30992, %r22709, %r22840, %r22613, 0xD2; + lop3.b32 %r30993, %r22713, %r22843, %r22617, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30988, %r22749, %r22621, %r22789, 0xD2; + lop3.b32 %r30989, %r22753, %r22625, %r22793, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31000, %r22621, %r22789, %r22733, 0xD2; + lop3.b32 %r31001, %r22625, %r22793, %r22737, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30996, %r22789, %r22733, %r22629, 0xD2; + lop3.b32 %r30997, %r22793, %r22737, %r22633, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30968, %r22733, %r22629, %r22749, 0xD2; + lop3.b32 %r30969, %r22737, %r22633, %r22753, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r30968, %r30969}; + // begin inline asm + // chi + lop3.b32 %r30960, %r22629, %r22749, %r22621, 0xD2; + lop3.b32 %r30961, %r22633, %r22753, %r22625, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r30960, %r30961}; + // begin inline asm + // chi + lop3.b32 %r30986, %r22797, %r22781, %r22669, 0xD2; + lop3.b32 %r30987, %r22801, %r22785, %r22673, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+104], {%r30986, %r30987}; + // begin inline asm + // chi + lop3.b32 %r30980, %r22781, %r22669, %r22677, 0xD2; + lop3.b32 %r30981, %r22785, %r22673, %r22681, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+112], {%r30980, %r30981}; + // begin inline asm + // chi + lop3.b32 %r30974, %r22669, %r22677, %r22645, 0xD2; + lop3.b32 %r30975, %r22673, %r22681, %r22649, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+120], {%r30974, %r30975}; + // begin inline asm + // chi + lop3.b32 %r30966, %r22677, %r22645, %r22797, 0xD2; + lop3.b32 %r30967, %r22681, %r22649, %r22801, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+128], {%r30966, %r30967}; + // begin inline asm + // chi + lop3.b32 %r30958, %r22645, %r22797, %r22781, 0xD2; + lop3.b32 %r30959, %r22649, %r22801, %r22785, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+136], {%r30958, %r30959}; + // begin inline asm + // chi + lop3.b32 %r30984, %r22701, %r22741, %r22773, 0xD2; + lop3.b32 %r30985, %r22705, %r22745, %r22777, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+144], {%r30984, %r30985}; + // begin inline asm + // chi + lop3.b32 %r30978, %r22741, %r22773, %r22765, 0xD2; + lop3.b32 %r30979, %r22745, %r22777, %r22769, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+152], {%r30978, %r30979}; + // begin inline asm + // chi + lop3.b32 %r30972, %r22773, %r22765, %r22685, 0xD2; + lop3.b32 %r30973, %r22777, %r22769, %r22689, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+160], {%r30972, %r30973}; + // begin inline asm + // chi + lop3.b32 %r30964, %r22765, %r22685, %r22701, 0xD2; + lop3.b32 %r30965, %r22769, %r22689, %r22705, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+168], {%r30964, %r30965}; + // begin inline asm + // chi + lop3.b32 %r30956, %r22685, 
%r22701, %r22741, 0xD2; + lop3.b32 %r30957, %r22689, %r22705, %r22745, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+176], {%r30956, %r30957}; + // begin inline asm + // chi + lop3.b32 %r30982, %r22653, %r22725, %r22637, 0xD2; + lop3.b32 %r30983, %r22657, %r22729, %r22641, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+184], {%r30982, %r30983}; + // begin inline asm + // chi + lop3.b32 %r30976, %r22725, %r22637, %r22693, 0xD2; + lop3.b32 %r30977, %r22729, %r22641, %r22697, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+192], {%r30976, %r30977}; + // begin inline asm + // chi + lop3.b32 %r30970, %r22637, %r22693, %r22717, 0xD2; + lop3.b32 %r30971, %r22641, %r22697, %r22721, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+200], {%r30970, %r30971}; + // begin inline asm + // chi + lop3.b32 %r30962, %r22693, %r22717, %r22653, 0xD2; + lop3.b32 %r30963, %r22697, %r22721, %r22657, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+208], {%r30962, %r30963}; + // begin inline asm + // chi + lop3.b32 %r30954, %r22717, %r22653, %r22725, 0xD2; + lop3.b32 %r30955, %r22721, %r22657, %r22729, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+216], {%r30954, %r30955}; + mul.wide.s32 %rd985, %r31004, 8; + add.s64 %rd984, %rd1275, %rd985; + // begin inline asm + ld.global.nc.v2.u32 {%r23005,%r23006}, [%rd984]; + // end inline asm + xor.b32 %r30990, %r22805, %r23005; + xor.b32 %r30991, %r22806, %r23006; + add.s32 %r31004, %r31004, 1; + setp.lt.u32 %p42, %r31004, 23; + @%p42 bra $L__BB2_73; + + mov.u32 %r29785, 3; + mov.u32 %r29784, 21; + mov.u32 %r29783, 28; + mov.u32 %r29782, 45; + mov.u32 %r29781, 14; + mov.u32 %r29780, 43; + mov.u32 %r29779, 61; + mov.u32 %r29778, 20; + mov.u32 %r29777, 44; + mov.u64 %rd1279, keccak_round_constants; + cvta.const.u64 %rd1278, %rd1279; + add.s64 %rd1277, %rd1278, 184; + mov.u32 %r23116, 1; + st.local.v2.u32 [%rd908+32], {%r31002, %r31003}; + st.local.v2.u32 [%rd908+72], {%r31000, %r31001}; + st.local.v2.u32 [%rd908+40], {%r30998, %r30999}; + st.local.v2.u32 [%rd908+80], {%r30996, %r30997}; + st.local.v2.u32 [%rd908+48], {%r30994, %r30995}; + st.local.v2.u32 [%rd908+56], {%r30992, %r30993}; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + // begin inline asm + // xor5 + lop3.b32 %r23017, %r30990, %r30988, %r30986, 0x96; + lop3.b32 %r23017, %r23017, %r30984, %r30982, 0x96; + lop3.b32 %r23018, %r30991, %r30989, %r30987, 0x96; + lop3.b32 %r23018, %r23018, %r30985, %r30983, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23029, %r31002, %r31000, %r30980, 0x96; + lop3.b32 %r23029, %r23029, %r30978, %r30976, 0x96; + lop3.b32 %r23030, %r31003, %r31001, %r30981, 0x96; + lop3.b32 %r23030, %r23030, %r30979, %r30977, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23041, %r30998, %r30996, %r30974, 0x96; + lop3.b32 %r23041, %r23041, %r30972, %r30970, 0x96; + lop3.b32 %r23042, %r30999, %r30997, %r30975, 0x96; + lop3.b32 %r23042, %r23042, %r30973, %r30971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23053, %r30994, %r30968, %r30966, 0x96; + lop3.b32 %r23053, %r23053, %r30964, %r30962, 0x96; + lop3.b32 %r23054, %r30995, %r30969, %r30967, 0x96; + lop3.b32 %r23054, %r23054, %r30965, %r30963, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23065, %r30992, %r30960, %r30958, 0x96; + lop3.b32 %r23065, %r23065, %r30956, %r30954, 0x96; + lop3.b32 %r23066, %r30993, %r30961, %r30959, 0x96; + lop3.b32 %r23066, %r23066, %r30957, %r30955, 0x96; + // end inline asm + // begin inline 
asm + shf.l.wrap.b32 %r23077, %r23030, %r23029, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23081, %r23029, %r23030, %r23116; + // end inline asm + xor.b32 %r23255, %r23077, %r23065; + xor.b32 %r23256, %r23081, %r23066; + xor.b32 %r23224, %r30990, %r23255; + xor.b32 %r23227, %r30991, %r23256; + xor.b32 %r23187, %r30987, %r23256; + xor.b32 %r23186, %r30986, %r23255; + st.local.v2.u32 [%rd908+104], {%r23186, %r23187}; + // begin inline asm + shf.l.wrap.b32 %r23085, %r23042, %r23041, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23089, %r23041, %r23042, %r23116; + // end inline asm + xor.b32 %r23257, %r23085, %r23017; + xor.b32 %r23258, %r23089, %r23018; + xor.b32 %r23123, %r31000, %r23257; + xor.b32 %r23122, %r31001, %r23258; + xor.b32 %r23162, %r30979, %r23258; + xor.b32 %r23163, %r30978, %r23257; + st.local.v2.u32 [%rd908+152], {%r23163, %r23162}; + // begin inline asm + shf.l.wrap.b32 %r23093, %r23054, %r23053, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23097, %r23053, %r23054, %r23116; + // end inline asm + xor.b32 %r23259, %r23093, %r23029; + xor.b32 %r23260, %r23097, %r23030; + xor.b32 %r23146, %r30975, %r23260; + xor.b32 %r23147, %r30974, %r23259; + st.local.v2.u32 [%rd908+120], {%r23147, %r23146}; + xor.b32 %r23138, %r30971, %r23260; + xor.b32 %r23139, %r30970, %r23259; + st.local.v2.u32 [%rd908+200], {%r23139, %r23138}; + // begin inline asm + shf.l.wrap.b32 %r23101, %r23066, %r23065, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23105, %r23065, %r23066, %r23116; + // end inline asm + xor.b32 %r23261, %r23101, %r23041; + xor.b32 %r23262, %r23105, %r23042; + xor.b32 %r23170, %r30994, %r23261; + xor.b32 %r23171, %r30995, %r23262; + xor.b32 %r23179, %r30965, %r23262; + xor.b32 %r23178, %r30964, %r23261; + st.local.v2.u32 [%rd908+168], {%r23178, %r23179}; + // begin inline asm + shf.l.wrap.b32 %r23109, %r23018, %r23017, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23113, %r23017, %r23018, %r23116; + // end inline asm + xor.b32 %r23263, %r23109, %r23053; + xor.b32 %r23264, %r23113, %r23054; + xor.b32 %r23130, %r30960, %r23263; + xor.b32 %r23131, %r30961, %r23264; + xor.b32 %r23155, %r30955, %r23264; + xor.b32 %r23154, %r30954, %r23263; + st.local.v2.u32 [%rd908+216], {%r23154, %r23155}; + // begin inline asm + shf.l.wrap.b32 %r23117, %r23123, %r23122, %r29777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23121, %r23122, %r23123, %r29777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23125, %r23131, %r23130, %r29778; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23129, %r23130, %r23131, %r29778; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23137, %r23138, %r23139, %r29779; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23133, %r23139, %r23138, %r29779; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r23133, %r23137}; + // begin inline asm + shf.l.wrap.b32 %r23141, %r23147, %r23146, %r29780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23145, %r23146, %r23147, %r29780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23149, %r23155, %r23154, %r29781; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23153, %r23154, %r23155, %r29781; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23161, %r23162, %r23163, %r29782; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23157, %r23163, %r23162, %r29782; + // end inline asm + 
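// Note on the surrounding block: this is the tail of an unrolled
+ // Keccak-f[1600] permutation. The @%p42 back-edge to $L__BB2_73 above
+ // runs 23 rounds, and everything from "mov.u64 %rd1279,
+ // keccak_round_constants" onward is a peeled 24th round whose iota
+ // constant is loaded from keccak_round_constants+184 (= 23 * 8 bytes).
+ // A hedged C sketch of one round (names illustrative, not from this
+ // source):
+ //
+ //   for (int x = 0; x < 5; x++)                 // theta: column parity
+ //       c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20];
+ //   for (int x = 0; x < 5; x++) {
+ //       uint64_t d = c[(x+4)%5] ^ rotl64(c[(x+1)%5], 1);
+ //       for (int y = 0; y < 5; y++) a[x + 5*y] ^= d;
+ //   }
+ //   // rho/pi: rotate each lane by a fixed offset; the mov.u32 of
+ //   // 44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45,
+ //   // 36, 28, 21, 15, 10, 6, 3 above load exactly these offsets
+ //   // chi:  a[x] = b[x] ^ (~b[x+1] & b[x+2]), per row
+ //   // iota: a[0] ^= round_constant[r]
+ //
+ // The lop3.b32 immediates encode these steps as single lookup-table
+ // ops: 0x96 is a ^ b ^ c (two chained lop3s form the five-way theta
+ // parity) and 0xD2 is a ^ (~b & c), i.e. chi. Each 64-bit rotation is
+ // a pair of shf.l.wrap.b32 funnel shifts over the 32-bit halves, with
+ // the halves swapped when the rotation amount is >= 32. The $L__BB2_86
+ // block further down mixes the squeezed state with an FNV-1-style step
+ // -- (a * 16777619) ^ b, where 16777619 = 0x01000193 is the 32-bit FNV
+ // prime -- and feeds sixteen 64-bit multiply-accumulates per pass of a
+ // 32-iteration loop (the @%p48 back-edge to $L__BB2_11).
+ 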
st.local.v2.u32 [%rd908+88], {%r23157, %r23161}; + // begin inline asm + shf.l.wrap.b32 %r23165, %r23171, %r23170, %r29783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23169, %r23170, %r23171, %r29783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23173, %r23179, %r23178, %r29784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23177, %r23178, %r23179, %r29784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23181, %r23187, %r23186, %r29785; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23185, %r23186, %r23187, %r29785; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23189, %r23224, %r23117, %r23141, 0xD2; + lop3.b32 %r23190, %r23227, %r23121, %r23145, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23197, %r23117, %r23141, %r23173, 0xD2; + lop3.b32 %r23198, %r23121, %r23145, %r23177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+32], {%r23197, %r23198}; + // begin inline asm + // chi + lop3.b32 %r23205, %r23141, %r23173, %r23149, 0xD2; + lop3.b32 %r23206, %r23145, %r23177, %r23153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+40], {%r23205, %r23206}; + // begin inline asm + // chi + lop3.b32 %r23213, %r23173, %r23149, %r23224, 0xD2; + lop3.b32 %r23214, %r23177, %r23153, %r23227, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+48], {%r23213, %r23214}; + // begin inline asm + // chi + lop3.b32 %r23221, %r23149, %r23224, %r23117, 0xD2; + lop3.b32 %r23222, %r23153, %r23227, %r23121, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+56], {%r23221, %r23222}; + // begin inline asm + // chi + lop3.b32 %r23229, %r23165, %r23125, %r23181, 0xD2; + lop3.b32 %r23230, %r23169, %r23129, %r23185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+64], {%r23229, %r23230}; + // begin inline asm + // chi + lop3.b32 %r23237, %r23125, %r23181, %r23157, 0xD2; + lop3.b32 %r23238, %r23129, %r23185, %r23161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+72], {%r23237, %r23238}; + // begin inline asm + // chi + lop3.b32 %r23245, %r23181, %r23157, %r23133, 0xD2; + lop3.b32 %r23246, %r23185, %r23161, %r23137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+80], {%r23245, %r23246}; + // begin inline asm + ld.global.nc.v2.u32 {%r23253,%r23254}, [%rd1277]; + // end inline asm + xor.b32 %r23265, %r23190, %r23254; + xor.b32 %r23266, %r23189, %r23253; + st.local.v2.u32 [%rd908+24], {%r23266, %r23265}; + mov.b64 %rd1358, {%r23197, %r23198}; + mov.b64 %rd1359, {%r23205, %r23206}; + mov.b64 %rd1362, {%r23229, %r23230}; + mov.b64 %rd1363, {%r23237, %r23238}; + mov.b64 %rd1364, {%r23245, %r23246}; + mov.b64 %rd1357, {%r23266, %r23265}; + mov.b64 %rd1360, {%r23213, %r23214}; + mov.b64 %rd1361, {%r23221, %r23222}; + st.global.u64 [%rd222], %rd1349; + st.global.u64 [%rd222+8], %rd1350; + st.global.u64 [%rd222+16], %rd1351; + st.global.u64 [%rd222+24], %rd1352; + st.global.u64 [%rd222+32], %rd1353; + st.global.u64 [%rd222+40], %rd1354; + st.global.u64 [%rd222+48], %rd1355; + st.global.u64 [%rd222+56], %rd1356; + st.global.v2.u32 [%rd222+64], {%r23266, %r23265}; + st.global.v2.u32 [%rd222+72], {%r23197, %r23198}; + st.global.v2.u32 [%rd222+80], {%r23205, %r23206}; + st.global.v2.u32 [%rd222+88], {%r23213, %r23214}; + st.global.v2.u32 [%rd222+96], {%r23221, %r23222}; + st.global.v2.u32 [%rd222+104], {%r23229, %r23230}; + st.global.v2.u32 [%rd222+112], {%r23237, %r23238}; + st.global.v2.u32 [%rd222+120], {%r23245, %r23246}; + +$L__BB2_86: + mul.lo.s32 %r26551, %r12, 16777619; + mov.b64 {%r26552, 
%r26553}, %rd1333; + mul.lo.s32 %r26554, %r13, 16777619; + xor.b32 %r26555, %r26551, %r26552; + xor.b32 %r26556, %r26554, %r26553; + mov.b64 %rd1099, {%r26555, %r26556}; + mov.b64 {%r26557, %r26558}, %rd1349; + xor.b32 %r26559, %r26558, %r13; + xor.b32 %r26560, %r26557, %r12; + mov.b64 %rd1100, {%r26560, %r26559}; + mul.lo.s32 %r26561, %r14, 16777619; + mov.b64 {%r26562, %r26563}, %rd1334; + mul.lo.s32 %r26564, %r15, 16777619; + xor.b32 %r26565, %r26564, %r26563; + xor.b32 %r26566, %r26561, %r26562; + mov.b64 %rd1101, {%r26566, %r26565}; + mov.b64 {%r26567, %r26568}, %rd1350; + xor.b32 %r26569, %r26568, %r15; + xor.b32 %r26570, %r26567, %r14; + mov.b64 %rd1102, {%r26570, %r26569}; + mul.lo.s32 %r26571, %r16, 16777619; + mov.b64 {%r26572, %r26573}, %rd1335; + mul.lo.s32 %r26574, %r17, 16777619; + xor.b32 %r26575, %r26574, %r26573; + xor.b32 %r26576, %r26571, %r26572; + mov.b64 %rd1103, {%r26576, %r26575}; + mov.b64 {%r26577, %r26578}, %rd1351; + xor.b32 %r26579, %r26578, %r17; + xor.b32 %r26580, %r26577, %r16; + mov.b64 %rd1104, {%r26580, %r26579}; + mul.lo.s32 %r26581, %r18, 16777619; + mov.b64 {%r26582, %r26583}, %rd1336; + mul.lo.s32 %r26584, %r19, 16777619; + xor.b32 %r26585, %r26584, %r26583; + xor.b32 %r26586, %r26581, %r26582; + mov.b64 %rd1105, {%r26586, %r26585}; + mov.b64 {%r26587, %r26588}, %rd1352; + xor.b32 %r26589, %r26588, %r19; + xor.b32 %r26590, %r26587, %r18; + mov.b64 %rd1106, {%r26590, %r26589}; + mul.lo.s32 %r26591, %r20, 16777619; + mov.b64 {%r26592, %r26593}, %rd1337; + mul.lo.s32 %r26594, %r21, 16777619; + xor.b32 %r26595, %r26594, %r26593; + xor.b32 %r26596, %r26591, %r26592; + mov.b64 %rd1107, {%r26596, %r26595}; + mov.b64 {%r26597, %r26598}, %rd1353; + xor.b32 %r26599, %r26598, %r21; + xor.b32 %r26600, %r26597, %r20; + mov.b64 %rd1108, {%r26600, %r26599}; + mul.lo.s32 %r26601, %r22, 16777619; + mov.b64 {%r26602, %r26603}, %rd1338; + mul.lo.s32 %r26604, %r23, 16777619; + xor.b32 %r26605, %r26604, %r26603; + xor.b32 %r26606, %r26601, %r26602; + mov.b64 %rd1109, {%r26606, %r26605}; + mov.b64 {%r26607, %r26608}, %rd1354; + xor.b32 %r26609, %r26608, %r23; + xor.b32 %r26610, %r26607, %r22; + mov.b64 %rd1110, {%r26610, %r26609}; + mul.lo.s32 %r26611, %r24, 16777619; + mov.b64 {%r26612, %r26613}, %rd1339; + mul.lo.s32 %r26614, %r25, 16777619; + xor.b32 %r26615, %r26614, %r26613; + xor.b32 %r26616, %r26611, %r26612; + mov.b64 %rd1111, {%r26616, %r26615}; + mov.b64 {%r26617, %r26618}, %rd1355; + xor.b32 %r26619, %r26618, %r25; + xor.b32 %r26620, %r26617, %r24; + mov.b64 %rd1112, {%r26620, %r26619}; + mul.lo.s32 %r26621, %r26, 16777619; + mov.b64 {%r26622, %r26623}, %rd1340; + mul.lo.s32 %r26624, %r27, 16777619; + xor.b32 %r26625, %r26624, %r26623; + xor.b32 %r26626, %r26621, %r26622; + mov.b64 %rd1113, {%r26626, %r26625}; + mov.b64 {%r26627, %r26628}, %rd1356; + xor.b32 %r26629, %r26628, %r27; + xor.b32 %r26630, %r26627, %r26; + mov.b64 %rd1114, {%r26630, %r26629}; + mul.lo.s32 %r26631, %r28, 16777619; + mov.b64 {%r26632, %r26633}, %rd1341; + mul.lo.s32 %r26634, %r29, 16777619; + xor.b32 %r26635, %r26634, %r26633; + xor.b32 %r26636, %r26631, %r26632; + mov.b64 %rd1115, {%r26636, %r26635}; + mov.b64 {%r26637, %r26638}, %rd1357; + xor.b32 %r26639, %r26638, %r29; + xor.b32 %r26640, %r26637, %r28; + mov.b64 %rd1116, {%r26640, %r26639}; + mul.lo.s32 %r26641, %r30, 16777619; + mov.b64 {%r26642, %r26643}, %rd1342; + mul.lo.s32 %r26644, %r31, 16777619; + xor.b32 %r26645, %r26644, %r26643; + xor.b32 %r26646, %r26641, %r26642; + mov.b64 %rd1117, {%r26646, %r26645}; + mov.b64 
{%r26647, %r26648}, %rd1358; + xor.b32 %r26649, %r26648, %r31; + xor.b32 %r26650, %r26647, %r30; + mov.b64 %rd1118, {%r26650, %r26649}; + mul.lo.s32 %r26651, %r32, 16777619; + mov.b64 {%r26652, %r26653}, %rd1343; + mul.lo.s32 %r26654, %r33, 16777619; + xor.b32 %r26655, %r26654, %r26653; + xor.b32 %r26656, %r26651, %r26652; + mov.b64 %rd1119, {%r26656, %r26655}; + mov.b64 {%r26657, %r26658}, %rd1359; + xor.b32 %r26659, %r26658, %r33; + xor.b32 %r26660, %r26657, %r32; + mov.b64 %rd1120, {%r26660, %r26659}; + mul.lo.s32 %r26661, %r34, 16777619; + mov.b64 {%r26662, %r26663}, %rd1344; + mul.lo.s32 %r26664, %r35, 16777619; + xor.b32 %r26665, %r26664, %r26663; + xor.b32 %r26666, %r26661, %r26662; + mov.b64 %rd1121, {%r26666, %r26665}; + mov.b64 {%r26667, %r26668}, %rd1360; + xor.b32 %r26669, %r26668, %r35; + xor.b32 %r26670, %r26667, %r34; + mov.b64 %rd1122, {%r26670, %r26669}; + mul.lo.s32 %r26671, %r36, 16777619; + mov.b64 {%r26672, %r26673}, %rd1345; + mul.lo.s32 %r26674, %r37, 16777619; + xor.b32 %r26675, %r26674, %r26673; + xor.b32 %r26676, %r26671, %r26672; + mov.b64 %rd1123, {%r26676, %r26675}; + mov.b64 {%r26677, %r26678}, %rd1361; + xor.b32 %r26679, %r26678, %r37; + xor.b32 %r26680, %r26677, %r36; + mov.b64 %rd1124, {%r26680, %r26679}; + mul.lo.s32 %r26681, %r38, 16777619; + mov.b64 {%r26682, %r26683}, %rd1346; + mul.lo.s32 %r26684, %r39, 16777619; + xor.b32 %r26685, %r26684, %r26683; + xor.b32 %r26686, %r26681, %r26682; + mov.b64 %rd1125, {%r26686, %r26685}; + mov.b64 {%r26687, %r26688}, %rd1362; + xor.b32 %r26689, %r26688, %r39; + xor.b32 %r26690, %r26687, %r38; + mov.b64 %rd1126, {%r26690, %r26689}; + mul.lo.s32 %r26691, %r40, 16777619; + mov.b64 {%r26692, %r26693}, %rd1347; + mul.lo.s32 %r26694, %r41, 16777619; + xor.b32 %r26695, %r26694, %r26693; + xor.b32 %r26696, %r26691, %r26692; + mov.b64 %rd1127, {%r26696, %r26695}; + mov.b64 {%r26697, %r26698}, %rd1363; + xor.b32 %r26699, %r26698, %r41; + xor.b32 %r26700, %r26697, %r40; + mov.b64 %rd1128, {%r26700, %r26699}; + mul.lo.s32 %r26701, %r42, 16777619; + mov.b64 {%r26702, %r26703}, %rd1348; + mul.lo.s32 %r26704, %r43, 16777619; + xor.b32 %r26705, %r26704, %r26703; + xor.b32 %r26706, %r26701, %r26702; + mov.b64 %rd1129, {%r26706, %r26705}; + mov.b64 {%r26707, %r26708}, %rd1364; + xor.b32 %r26709, %r26708, %r43; + xor.b32 %r26710, %r26707, %r42; + mov.b64 %rd1130, {%r26710, %r26709}; + mul.lo.s64 %rd1131, %rd1317, %rd1099; + add.s64 %rd1316, %rd1131, %rd1100; + mul.lo.s64 %rd1132, %rd1318, %rd1101; + add.s64 %rd1315, %rd1132, %rd1102; + mul.lo.s64 %rd1133, %rd1319, %rd1103; + add.s64 %rd1314, %rd1133, %rd1104; + mul.lo.s64 %rd1134, %rd1320, %rd1105; + add.s64 %rd1313, %rd1134, %rd1106; + mul.lo.s64 %rd1135, %rd1321, %rd1107; + add.s64 %rd1312, %rd1135, %rd1108; + mul.lo.s64 %rd1136, %rd1322, %rd1109; + add.s64 %rd1311, %rd1136, %rd1110; + mul.lo.s64 %rd1137, %rd1323, %rd1111; + add.s64 %rd1310, %rd1137, %rd1112; + mul.lo.s64 %rd1138, %rd1324, %rd1113; + add.s64 %rd1309, %rd1138, %rd1114; + mul.lo.s64 %rd1139, %rd1325, %rd1115; + add.s64 %rd1308, %rd1139, %rd1116; + mul.lo.s64 %rd1140, %rd1326, %rd1117; + add.s64 %rd1307, %rd1140, %rd1118; + mul.lo.s64 %rd1141, %rd1327, %rd1119; + add.s64 %rd1306, %rd1141, %rd1120; + mul.lo.s64 %rd1142, %rd1328, %rd1121; + add.s64 %rd1305, %rd1142, %rd1122; + mul.lo.s64 %rd1143, %rd1329, %rd1123; + add.s64 %rd1304, %rd1143, %rd1124; + mul.lo.s64 %rd1144, %rd1330, %rd1125; + add.s64 %rd1303, %rd1144, %rd1126; + mul.lo.s64 %rd1145, %rd1331, %rd1127; + add.s64 %rd1302, %rd1145, %rd1128; + mul.lo.s64 
%rd1146, %rd1332, %rd1129; + add.s64 %rd1301, %rd1146, %rd1130; + add.s32 %r29819, %r29819, 1; + setp.lt.u32 %p48, %r29819, 32; + @%p48 bra $L__BB2_11; + + add.u64 %rd1259, %SPL, 2000; + add.u64 %rd1256, %SP, 2000; + add.u64 %rd1255, %SP, 0; + mov.u64 %rd1147, 0; + mov.b64 {%r26711, %r26712}, %rd1316; + mul.lo.s32 %r26713, %r26711, 16777619; + xor.b32 %r26714, %r26713, %r26712; + mul.lo.s32 %r26715, %r26714, 16777619; + mov.b64 {%r26716, %r26717}, %rd1315; + xor.b32 %r26718, %r26715, %r26716; + mul.lo.s32 %r26719, %r26718, 16777619; + mov.b64 {%r26720, %r26721}, %rd1314; + mul.lo.s32 %r26722, %r26720, 16777619; + xor.b32 %r26723, %r26722, %r26721; + mul.lo.s32 %r26724, %r26723, 16777619; + mov.b64 {%r26725, %r26726}, %rd1313; + xor.b32 %r26727, %r26724, %r26725; + mul.lo.s32 %r26728, %r26727, 16777619; + mov.b64 {%r26729, %r26730}, %rd1312; + mul.lo.s32 %r26731, %r26729, 16777619; + xor.b32 %r26732, %r26731, %r26730; + mul.lo.s32 %r26733, %r26732, 16777619; + mov.b64 {%r26734, %r26735}, %rd1311; + xor.b32 %r26736, %r26733, %r26734; + mul.lo.s32 %r26737, %r26736, 16777619; + mov.b64 {%r26738, %r26739}, %rd1310; + mul.lo.s32 %r26740, %r26738, 16777619; + xor.b32 %r26741, %r26740, %r26739; + mul.lo.s32 %r26742, %r26741, 16777619; + mov.b64 {%r26743, %r26744}, %rd1309; + xor.b32 %r26745, %r26742, %r26743; + mul.lo.s32 %r26746, %r26745, 16777619; + mov.b64 {%r26747, %r26748}, %rd1308; + mul.lo.s32 %r26749, %r26747, 16777619; + xor.b32 %r26750, %r26749, %r26748; + mul.lo.s32 %r26751, %r26750, 16777619; + mov.b64 {%r26752, %r26753}, %rd1307; + xor.b32 %r26754, %r26751, %r26752; + mul.lo.s32 %r26755, %r26754, 16777619; + mov.b64 {%r26756, %r26757}, %rd1306; + mul.lo.s32 %r26758, %r26756, 16777619; + xor.b32 %r26759, %r26758, %r26757; + mul.lo.s32 %r26760, %r26759, 16777619; + mov.b64 {%r26761, %r26762}, %rd1305; + xor.b32 %r26763, %r26760, %r26761; + mul.lo.s32 %r26764, %r26763, 16777619; + mov.b64 {%r26765, %r26766}, %rd1304; + mul.lo.s32 %r26767, %r26765, 16777619; + xor.b32 %r26768, %r26767, %r26766; + mul.lo.s32 %r26769, %r26768, 16777619; + mov.b64 {%r26770, %r26771}, %rd1303; + xor.b32 %r26772, %r26769, %r26770; + mul.lo.s32 %r26773, %r26772, 16777619; + mov.b64 {%r26774, %r26775}, %rd1302; + mul.lo.s32 %r26776, %r26774, 16777619; + xor.b32 %r26777, %r26776, %r26775; + mul.lo.s32 %r26778, %r26777, 16777619; + mov.b64 {%r26779, %r26780}, %rd1301; + xor.b32 %r26781, %r26778, %r26779; + mul.lo.s32 %r26782, %r26781, 16777619; + mov.u32 %r26783, 0; + st.local.v4.u32 [%rd1259+32], {%r26783, %r26783, %r26783, %r26783}; + st.local.v4.u32 [%rd1259+48], {%r26783, %r26783, %r26783, %r26783}; + st.local.v4.u32 [%rd1259+64], {%r26783, %r26783, %r26783, %r26783}; + xor.b32 %r26784, %r26746, %r26744; + xor.b32 %r26785, %r26737, %r26735; + xor.b32 %r26786, %r26728, %r26726; + xor.b32 %r26787, %r26719, %r26717; + st.local.v4.u32 [%rd1259], {%r26787, %r26786, %r26785, %r26784}; + xor.b32 %r26788, %r26782, %r26780; + xor.b32 %r26789, %r26773, %r26771; + xor.b32 %r26790, %r26764, %r26762; + xor.b32 %r26791, %r26755, %r26753; + st.local.v4.u32 [%rd1259+16], {%r26791, %r26790, %r26789, %r26788}; + mov.u32 %r26792, -1150833019; + mov.u32 %r26793, 1779033703; + st.local.v2.u32 [%rd2], {%r26793, %r26792}; + mov.u32 %r26794, -1521486534; + mov.u32 %r26795, 1013904242; + st.local.v2.u32 [%rd2+8], {%r26795, %r26794}; + mov.u32 %r26796, -1694144372; + mov.u32 %r26797, 1359893119; + st.local.v2.u32 [%rd2+16], {%r26797, %r26796}; + mov.u32 %r26798, 1541459225; + mov.u32 %r26799, 528734635; + st.local.v2.u32 [%rd2+24], 
{%r26799, %r26798}; + st.local.v2.u32 [%rd2+32], {%r26793, %r26792}; + st.local.v2.u32 [%rd2+40], {%r26795, %r26794}; + st.local.v2.u32 [%rd2+48], {%r26797, %r26796}; + st.local.v2.u32 [%rd2+56], {%r26799, %r26798}; + st.local.u64 [%rd2+64], %rd1147; + st.local.v2.u32 [%rd2+72], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+80], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+88], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+96], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+104], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+112], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+120], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+128], {%r26783, %r26783}; + mov.u16 %rs498, 0; + st.local.v2.u8 [%rd2+136], {%rs498, %rs498}; + st.local.u8 [%rd2+138], %rs498; + st.local.u8 [%rd2+144], %rs498; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1255; + .param .b64 param1; + st.param.b64 [param1+0], %rd1256; + call.uni + _Z20blake3_hasher_updateP13blake3_hasherPKvy, + ( + param0, + param1 + ); + } // callseq 3 + ld.local.u8 %rd1367, [%rd2+144]; + setp.eq.s64 %p49, %rd1367, 0; + @%p49 bra $L__BB2_95; + + ld.local.v2.u8 {%rs862, %rs500}, [%rd2+136]; + cvt.u32.u16 %r26800, %rs500; + mul.wide.u32 %rd1151, %r26800, 64; + cvt.u64.u16 %rd1152, %rs862; + neg.s64 %rd1153, %rd1152; + setp.eq.s64 %p50, %rd1151, %rd1153; + @%p50 bra $L__BB2_90; + bra.uni $L__BB2_89; + +$L__BB2_90: + add.s64 %rd1367, %rd1367, -2; + shl.b64 %rd1155, %rd1367, 5; + add.s64 %rd1158, %rd2, %rd1155; + ld.local.u8 %rs665, [%rd2+138]; + mov.u64 %rd1368, 0; + or.b16 %rs732, %rs665, 4; + ld.local.v2.u32 {%r31257, %r31256}, [%rd2]; + ld.local.v2.u32 {%r31255, %r31254}, [%rd2+8]; + ld.local.v2.u32 {%r31253, %r31252}, [%rd2+16]; + ld.local.v2.u32 {%r31251, %r31250}, [%rd2+24]; + ld.local.u8 %rs798, [%rd1158+145]; + ld.local.u8 %rs799, [%rd1158+146]; + ld.local.u8 %rs800, [%rd1158+147]; + ld.local.u8 %rs801, [%rd1158+148]; + ld.local.u8 %rs802, [%rd1158+149]; + ld.local.u8 %rs803, [%rd1158+150]; + ld.local.u8 %rs804, [%rd1158+151]; + ld.local.u8 %rs805, [%rd1158+152]; + ld.local.u8 %rs806, [%rd1158+153]; + ld.local.u8 %rs807, [%rd1158+154]; + ld.local.u8 %rs808, [%rd1158+155]; + ld.local.u8 %rs809, [%rd1158+156]; + ld.local.u8 %rs810, [%rd1158+157]; + ld.local.u8 %rs811, [%rd1158+158]; + ld.local.u8 %rs812, [%rd1158+159]; + ld.local.u8 %rs813, [%rd1158+160]; + ld.local.u8 %rs814, [%rd1158+161]; + ld.local.u8 %rs815, [%rd1158+162]; + ld.local.u8 %rs816, [%rd1158+163]; + ld.local.u8 %rs817, [%rd1158+164]; + ld.local.u8 %rs818, [%rd1158+165]; + ld.local.u8 %rs819, [%rd1158+166]; + ld.local.u8 %rs820, [%rd1158+167]; + ld.local.u8 %rs821, [%rd1158+168]; + ld.local.u8 %rs822, [%rd1158+169]; + ld.local.u8 %rs823, [%rd1158+170]; + ld.local.u8 %rs824, [%rd1158+171]; + ld.local.u8 %rs825, [%rd1158+172]; + ld.local.u8 %rs826, [%rd1158+173]; + ld.local.u8 %rs827, [%rd1158+174]; + ld.local.u8 %rs828, [%rd1158+175]; + ld.local.u8 %rs829, [%rd1158+176]; + ld.local.u8 %rs830, [%rd1158+177]; + ld.local.u8 %rs831, [%rd1158+178]; + ld.local.u8 %rs832, [%rd1158+179]; + ld.local.u8 %rs833, [%rd1158+180]; + ld.local.u8 %rs834, [%rd1158+181]; + ld.local.u8 %rs835, [%rd1158+182]; + ld.local.u8 %rs836, [%rd1158+183]; + ld.local.u8 %rs837, [%rd1158+184]; + ld.local.u8 %rs838, [%rd1158+185]; + ld.local.u8 %rs839, [%rd1158+186]; + ld.local.u8 %rs840, [%rd1158+187]; + ld.local.u8 %rs841, [%rd1158+188]; + ld.local.u8 %rs842, [%rd1158+189]; + ld.local.u8 %rs843, [%rd1158+190]; + ld.local.u8 %rs844, [%rd1158+191]; + ld.local.u8 %rs845, 
[%rd1158+192]; + ld.local.u8 %rs846, [%rd1158+193]; + ld.local.u8 %rs847, [%rd1158+194]; + ld.local.u8 %rs848, [%rd1158+195]; + ld.local.u8 %rs849, [%rd1158+196]; + ld.local.u8 %rs850, [%rd1158+197]; + ld.local.u8 %rs851, [%rd1158+198]; + ld.local.u8 %rs852, [%rd1158+199]; + ld.local.v4.u16 {%rs853, %rs855, %rs857, %rs859}, [%rd1158+200]; + shr.u16 %rs854, %rs853, 8; + shr.u16 %rs856, %rs855, 8; + shr.u16 %rs858, %rs857, 8; + shr.u16 %rs860, %rs859, 8; + ld.local.u8 %rs861, [%rd1158+208]; + mov.u16 %rs862, 64; + bra.uni $L__BB2_91; + +$L__BB2_95: + ld.local.v4.u8 {%rs568, %rs569, %rs570, %rs571}, [%rd2+136]; + setp.eq.s16 %p54, %rs569, 0; + selp.u16 %rs573, 1, 0, %p54; + ld.local.v2.u32 {%r28817, %r28818}, [%rd2+32]; + ld.local.v2.u32 {%r28821, %r28822}, [%rd2+40]; + ld.local.v2.u32 {%r28825, %r28826}, [%rd2+48]; + ld.local.v2.u32 {%r28829, %r28830}, [%rd2+56]; + ld.local.v4.u16 {%rs574, %rs575, %rs576, %rs577}, [%rd2+72]; + shr.u16 %rs579, %rs574, 8; + shr.u16 %rs581, %rs575, 8; + shr.u16 %rs583, %rs576, 8; + shr.u16 %rs585, %rs577, 8; + ld.local.v4.u16 {%rs586, %rs587, %rs588, %rs589}, [%rd2+80]; + shr.u16 %rs591, %rs586, 8; + shr.u16 %rs593, %rs587, 8; + shr.u16 %rs595, %rs588, 8; + shr.u16 %rs597, %rs589, 8; + ld.local.v4.u16 {%rs598, %rs599, %rs600, %rs601}, [%rd2+88]; + shr.u16 %rs603, %rs598, 8; + shr.u16 %rs605, %rs599, 8; + shr.u16 %rs607, %rs600, 8; + shr.u16 %rs609, %rs601, 8; + ld.local.v4.u16 {%rs610, %rs611, %rs612, %rs613}, [%rd2+96]; + shr.u16 %rs615, %rs610, 8; + shr.u16 %rs617, %rs611, 8; + shr.u16 %rs619, %rs612, 8; + shr.u16 %rs621, %rs613, 8; + ld.local.v4.u16 {%rs622, %rs623, %rs624, %rs625}, [%rd2+104]; + shr.u16 %rs627, %rs622, 8; + shr.u16 %rs629, %rs623, 8; + shr.u16 %rs631, %rs624, 8; + shr.u16 %rs633, %rs625, 8; + ld.local.v4.u16 {%rs634, %rs635, %rs636, %rs637}, [%rd2+112]; + shr.u16 %rs639, %rs634, 8; + shr.u16 %rs641, %rs635, 8; + shr.u16 %rs643, %rs636, 8; + shr.u16 %rs645, %rs637, 8; + ld.local.v4.u16 {%rs646, %rs647, %rs648, %rs649}, [%rd2+120]; + shr.u16 %rs651, %rs646, 8; + shr.u16 %rs653, %rs647, 8; + ld.local.v2.u8 {%rs655, %rs656}, [%rd2+126]; + ld.local.u16 %r28833, [%rd2+132]; + ld.local.v2.u8 {%rs659, %rs660}, [%rd2+134]; + or.b16 %rs663, %rs570, %rs573; + or.b16 %rs664, %rs663, 10; + cvt.u32.u16 %r28834, %rs574; + and.b32 %r28835, %r28834, 255; + cvt.u32.u16 %r28836, %rs579; + prmt.b32 %r28837, %r28836, %r28835, 30212; + cvt.u32.u16 %r28838, %rs575; + prmt.b32 %r28839, %r28838, %r28837, 28756; + cvt.u32.u16 %r28840, %rs581; + prmt.b32 %r28841, %r28840, %r28839, 1620; + cvt.u32.u16 %r28842, %rs576; + and.b32 %r28843, %r28842, 255; + cvt.u32.u16 %r28844, %rs583; + prmt.b32 %r28845, %r28844, %r28843, 30212; + cvt.u32.u16 %r28846, %rs577; + prmt.b32 %r28847, %r28846, %r28845, 28756; + cvt.u32.u16 %r28848, %rs585; + prmt.b32 %r28849, %r28848, %r28847, 1620; + cvt.u32.u16 %r28850, %rs586; + and.b32 %r28851, %r28850, 255; + cvt.u32.u16 %r28852, %rs591; + prmt.b32 %r28853, %r28852, %r28851, 30212; + cvt.u32.u16 %r28854, %rs587; + prmt.b32 %r28855, %r28854, %r28853, 28756; + cvt.u32.u16 %r28856, %rs593; + prmt.b32 %r28857, %r28856, %r28855, 1620; + cvt.u32.u16 %r28858, %rs588; + and.b32 %r28859, %r28858, 255; + cvt.u32.u16 %r28860, %rs595; + prmt.b32 %r28861, %r28860, %r28859, 30212; + cvt.u32.u16 %r28862, %rs589; + prmt.b32 %r28863, %r28862, %r28861, 28756; + cvt.u32.u16 %r28864, %rs597; + prmt.b32 %r28865, %r28864, %r28863, 1620; + cvt.u32.u16 %r28866, %rs598; + and.b32 %r28867, %r28866, 255; + cvt.u32.u16 %r28868, %rs603; + prmt.b32 %r28869, %r28868, 
%r28867, 30212; + cvt.u32.u16 %r28870, %rs599; + prmt.b32 %r28871, %r28870, %r28869, 28756; + cvt.u32.u16 %r28872, %rs605; + prmt.b32 %r28873, %r28872, %r28871, 1620; + cvt.u32.u16 %r28874, %rs600; + and.b32 %r28875, %r28874, 255; + cvt.u32.u16 %r28876, %rs607; + prmt.b32 %r28877, %r28876, %r28875, 30212; + cvt.u32.u16 %r28878, %rs601; + prmt.b32 %r28879, %r28878, %r28877, 28756; + cvt.u32.u16 %r28880, %rs609; + prmt.b32 %r28881, %r28880, %r28879, 1620; + cvt.u32.u16 %r28882, %rs610; + and.b32 %r28883, %r28882, 255; + cvt.u32.u16 %r28884, %rs615; + prmt.b32 %r28885, %r28884, %r28883, 30212; + cvt.u32.u16 %r28886, %rs611; + prmt.b32 %r28887, %r28886, %r28885, 28756; + cvt.u32.u16 %r28888, %rs617; + prmt.b32 %r28889, %r28888, %r28887, 1620; + cvt.u32.u16 %r28890, %rs612; + and.b32 %r28891, %r28890, 255; + cvt.u32.u16 %r28892, %rs619; + prmt.b32 %r28893, %r28892, %r28891, 30212; + cvt.u32.u16 %r28894, %rs613; + prmt.b32 %r28895, %r28894, %r28893, 28756; + cvt.u32.u16 %r28896, %rs621; + prmt.b32 %r28897, %r28896, %r28895, 1620; + cvt.u32.u16 %r28898, %rs622; + and.b32 %r28899, %r28898, 255; + cvt.u32.u16 %r28900, %rs627; + prmt.b32 %r28901, %r28900, %r28899, 30212; + cvt.u32.u16 %r28902, %rs623; + prmt.b32 %r28903, %r28902, %r28901, 28756; + cvt.u32.u16 %r28904, %rs629; + prmt.b32 %r28905, %r28904, %r28903, 1620; + cvt.u32.u16 %r28906, %rs624; + and.b32 %r28907, %r28906, 255; + cvt.u32.u16 %r28908, %rs631; + prmt.b32 %r28909, %r28908, %r28907, 30212; + cvt.u32.u16 %r28910, %rs625; + prmt.b32 %r28911, %r28910, %r28909, 28756; + cvt.u32.u16 %r28912, %rs633; + prmt.b32 %r28913, %r28912, %r28911, 1620; + cvt.u32.u16 %r28914, %rs634; + and.b32 %r28915, %r28914, 255; + cvt.u32.u16 %r28916, %rs639; + prmt.b32 %r28917, %r28916, %r28915, 30212; + cvt.u32.u16 %r28918, %rs635; + prmt.b32 %r28919, %r28918, %r28917, 28756; + cvt.u32.u16 %r28920, %rs641; + prmt.b32 %r28921, %r28920, %r28919, 1620; + cvt.u32.u16 %r28922, %rs636; + and.b32 %r28923, %r28922, 255; + cvt.u32.u16 %r28924, %rs643; + prmt.b32 %r28925, %r28924, %r28923, 30212; + cvt.u32.u16 %r28926, %rs637; + prmt.b32 %r28927, %r28926, %r28925, 28756; + cvt.u32.u16 %r28928, %rs645; + prmt.b32 %r28929, %r28928, %r28927, 1620; + cvt.u32.u16 %r28930, %rs646; + and.b32 %r28931, %r28930, 255; + cvt.u32.u16 %r28932, %rs651; + prmt.b32 %r28933, %r28932, %r28931, 30212; + cvt.u32.u16 %r28934, %rs647; + prmt.b32 %r28935, %r28934, %r28933, 28756; + cvt.u32.u16 %r28936, %rs653; + prmt.b32 %r28937, %r28936, %r28935, 1620; + cvt.u32.u16 %r28938, %rs648; + and.b32 %r28939, %r28938, 255; + ld.local.u8 %r28940, [%rd2+125]; + prmt.b32 %r28941, %r28940, %r28939, 30212; + cvt.u32.u16 %r28942, %rs655; + prmt.b32 %r28943, %r28942, %r28941, 28756; + cvt.u32.u16 %r28944, %rs656; + prmt.b32 %r28945, %r28944, %r28943, 1620; + ld.local.u32 %r28946, [%rd2+128]; + cvt.u32.u16 %r28947, %rs659; + prmt.b32 %r28948, %r28947, %r28833, 28756; + cvt.u32.u16 %r28949, %rs660; + prmt.b32 %r28950, %r28949, %r28948, 1620; + cvt.u32.u16 %r28951, %rs568; + cvt.u32.u16 %r28952, %rs664; + and.b32 %r28953, %r28952, 255; + add.s32 %r28954, %r28825, %r28817; + add.s32 %r28955, %r28954, %r28841; + add.s32 %r28956, %r28849, %r28955; + add.s32 %r28957, %r28826, %r28818; + add.s32 %r28958, %r28957, %r28857; + add.s32 %r28959, %r28865, %r28958; + add.s32 %r28960, %r28829, %r28821; + add.s32 %r28961, %r28960, %r28873; + xor.b32 %r28962, %r28961, %r28951; + shr.u32 %r28963, %r28961, 16; + shl.b32 %r28964, %r28962, 16; + or.b32 %r28965, %r28964, %r28963; + add.s32 %r28966, %r28965, 1013904242; + xor.b32 
%r28967, %r28966, %r28829; + shf.l.wrap.b32 %r28968, %r28967, %r28967, 20; + add.s32 %r28969, %r28881, %r28961; + add.s32 %r28970, %r28969, %r28968; + xor.b32 %r28971, %r28970, %r28965; + shf.l.wrap.b32 %r28972, %r28971, %r28971, 24; + add.s32 %r28973, %r28972, %r28966; + xor.b32 %r28974, %r28973, %r28968; + shf.l.wrap.b32 %r28975, %r28974, %r28974, 25; + add.s32 %r28976, %r28830, %r28822; + add.s32 %r28977, %r28976, %r28889; + xor.b32 %r28978, %r28977, %r28953; + shr.u32 %r28979, %r28977, 16; + shl.b32 %r28980, %r28978, 16; + or.b32 %r28981, %r28980, %r28979; + add.s32 %r28982, %r28981, -1521486534; + xor.b32 %r28983, %r28982, %r28830; + shf.l.wrap.b32 %r28984, %r28983, %r28983, 20; + add.s32 %r28985, %r28897, %r28977; + add.s32 %r28986, %r28985, %r28984; + xor.b32 %r28987, %r28986, %r28981; + shf.l.wrap.b32 %r28988, %r28987, %r28987, 24; + add.s32 %r28989, %r28988, %r28982; + xor.b32 %r28990, %r28989, %r28984; + shf.l.wrap.b32 %r28991, %r28990, %r28990, 25; + add.s32 %r28992, %r28921, %r28975; + add.s32 %r28993, %r28991, %r28970; + add.s32 %r28994, %r28993, %r28937; + add.s32 %r28995, %r28945, %r28994; + add.s32 %r28996, %r28946, %r28986; + shf.l.wrap.b32 %r28997, %r28955, %r28955, 16; + add.s32 %r28998, %r28997, 1779033703; + xor.b32 %r28999, %r28998, %r28825; + shf.l.wrap.b32 %r29000, %r28999, %r28999, 20; + add.s32 %r29001, %r28956, %r29000; + xor.b32 %r29002, %r29001, %r28997; + shf.l.wrap.b32 %r29003, %r29002, %r29002, 24; + add.s32 %r29004, %r29003, %r28998; + xor.b32 %r29005, %r29004, %r29000; + shf.l.wrap.b32 %r29006, %r29005, %r29005, 25; + shf.l.wrap.b32 %r29007, %r28958, %r28958, 16; + add.s32 %r29008, %r29007, -1150833019; + xor.b32 %r29009, %r29008, %r28826; + shf.l.wrap.b32 %r29010, %r29009, %r29009, 20; + add.s32 %r29011, %r28959, %r29010; + xor.b32 %r29012, %r29011, %r29007; + shf.l.wrap.b32 %r29013, %r29012, %r29012, 24; + add.s32 %r29014, %r29013, %r29008; + xor.b32 %r29015, %r29014, %r29010; + shf.l.wrap.b32 %r29016, %r29015, %r29015, 25; + add.s32 %r29017, %r29001, %r28905; + add.s32 %r29018, %r29017, %r29016; + xor.b32 %r29019, %r29018, %r28988; + shf.l.wrap.b32 %r29020, %r29019, %r29019, 16; + add.s32 %r29021, %r29020, %r28973; + xor.b32 %r29022, %r29021, %r29016; + shf.l.wrap.b32 %r29023, %r29022, %r29022, 20; + add.s32 %r29024, %r29018, %r28913; + add.s32 %r29025, %r29024, %r29023; + xor.b32 %r29026, %r29025, %r29020; + shf.l.wrap.b32 %r29027, %r29026, %r29026, 24; + add.s32 %r29028, %r29027, %r29021; + xor.b32 %r29029, %r29028, %r29023; + shf.l.wrap.b32 %r29030, %r29029, %r29029, 25; + add.s32 %r29031, %r28992, %r29011; + xor.b32 %r29032, %r29003, %r29031; + shf.l.wrap.b32 %r29033, %r29032, %r29032, 16; + add.s32 %r29034, %r29033, %r28989; + xor.b32 %r29035, %r29034, %r28975; + shf.l.wrap.b32 %r29036, %r29035, %r29035, 20; + add.s32 %r29037, %r29031, %r28929; + add.s32 %r29038, %r29037, %r29036; + xor.b32 %r29039, %r29038, %r29033; + shf.l.wrap.b32 %r29040, %r29039, %r29039, 24; + add.s32 %r29041, %r29040, %r29034; + xor.b32 %r29042, %r29041, %r29036; + shf.l.wrap.b32 %r29043, %r29042, %r29042, 25; + xor.b32 %r29044, %r29013, %r28994; + shf.l.wrap.b32 %r29045, %r29044, %r29044, 16; + add.s32 %r29046, %r29045, %r29004; + xor.b32 %r29047, %r29046, %r28991; + shf.l.wrap.b32 %r29048, %r29047, %r29047, 20; + add.s32 %r29049, %r28995, %r29048; + xor.b32 %r29050, %r29049, %r29045; + shf.l.wrap.b32 %r29051, %r29050, %r29050, 24; + add.s32 %r29052, %r29051, %r29046; + xor.b32 %r29053, %r29052, %r29048; + shf.l.wrap.b32 %r29054, %r29053, %r29053, 25; + add.s32 %r29055, 
%r28996, %r29006; + xor.b32 %r29056, %r29055, %r28972; + shf.l.wrap.b32 %r29057, %r29056, %r29056, 16; + add.s32 %r29058, %r29057, %r29014; + xor.b32 %r29059, %r29058, %r29006; + shf.l.wrap.b32 %r29060, %r29059, %r29059, 20; + add.s32 %r29061, %r29055, %r28950; + add.s32 %r29062, %r29061, %r29060; + xor.b32 %r29063, %r29062, %r29057; + shf.l.wrap.b32 %r29064, %r29063, %r29063, 24; + add.s32 %r29065, %r29064, %r29058; + xor.b32 %r29066, %r29065, %r29060; + shf.l.wrap.b32 %r29067, %r29066, %r29066, 25; + add.s32 %r29068, %r29025, %r28857; + add.s32 %r29069, %r29068, %r29067; + xor.b32 %r29070, %r29069, %r29040; + shf.l.wrap.b32 %r29071, %r29070, %r29070, 16; + add.s32 %r29072, %r29071, %r29052; + xor.b32 %r29073, %r29072, %r29067; + shf.l.wrap.b32 %r29074, %r29073, %r29073, 20; + add.s32 %r29075, %r29069, %r28889; + add.s32 %r29076, %r29075, %r29074; + xor.b32 %r29077, %r29076, %r29071; + shf.l.wrap.b32 %r29078, %r29077, %r29077, 24; + add.s32 %r29079, %r29078, %r29072; + xor.b32 %r29080, %r29079, %r29074; + shf.l.wrap.b32 %r29081, %r29080, %r29080, 25; + add.s32 %r29082, %r29038, %r28865; + add.s32 %r29083, %r29082, %r29030; + xor.b32 %r29084, %r29083, %r29051; + shf.l.wrap.b32 %r29085, %r29084, %r29084, 16; + add.s32 %r29086, %r29085, %r29065; + xor.b32 %r29087, %r29086, %r29030; + shf.l.wrap.b32 %r29088, %r29087, %r29087, 20; + add.s32 %r29089, %r29083, %r28921; + add.s32 %r29090, %r29089, %r29088; + xor.b32 %r29091, %r29090, %r29085; + shf.l.wrap.b32 %r29092, %r29091, %r29091, 24; + add.s32 %r29093, %r29092, %r29086; + xor.b32 %r29094, %r29093, %r29088; + shf.l.wrap.b32 %r29095, %r29094, %r29094, 25; + add.s32 %r29096, %r29049, %r28897; + add.s32 %r29097, %r29096, %r29043; + xor.b32 %r29098, %r29064, %r29097; + shf.l.wrap.b32 %r29099, %r29098, %r29098, 16; + add.s32 %r29100, %r29099, %r29028; + xor.b32 %r29101, %r29100, %r29043; + shf.l.wrap.b32 %r29102, %r29101, %r29101, 20; + add.s32 %r29103, %r29097, %r28841; + add.s32 %r29104, %r29103, %r29102; + xor.b32 %r29105, %r29104, %r29099; + shf.l.wrap.b32 %r29106, %r29105, %r29105, 24; + add.s32 %r29107, %r29106, %r29100; + xor.b32 %r29108, %r29107, %r29102; + shf.l.wrap.b32 %r29109, %r29108, %r29108, 25; + add.s32 %r29110, %r29062, %r28873; + add.s32 %r29111, %r29110, %r29054; + xor.b32 %r29112, %r29027, %r29111; + shf.l.wrap.b32 %r29113, %r29112, %r29112, 16; + add.s32 %r29114, %r29113, %r29041; + xor.b32 %r29115, %r29114, %r29054; + shf.l.wrap.b32 %r29116, %r29115, %r29115, 20; + add.s32 %r29117, %r29111, %r28945; + add.s32 %r29118, %r29117, %r29116; + xor.b32 %r29119, %r29118, %r29113; + shf.l.wrap.b32 %r29120, %r29119, %r29119, 24; + add.s32 %r29121, %r29120, %r29114; + xor.b32 %r29122, %r29121, %r29116; + shf.l.wrap.b32 %r29123, %r29122, %r29122, 25; + add.s32 %r29124, %r29076, %r28849; + add.s32 %r29125, %r29124, %r29095; + xor.b32 %r29126, %r29125, %r29120; + shf.l.wrap.b32 %r29127, %r29126, %r29126, 16; + add.s32 %r29128, %r29127, %r29107; + xor.b32 %r29129, %r29128, %r29095; + shf.l.wrap.b32 %r29130, %r29129, %r29129, 20; + add.s32 %r29131, %r29125, %r28929; + add.s32 %r29132, %r29131, %r29130; + xor.b32 %r29133, %r29132, %r29127; + shf.l.wrap.b32 %r29134, %r29133, %r29133, 24; + add.s32 %r29135, %r29134, %r29128; + xor.b32 %r29136, %r29135, %r29130; + shf.l.wrap.b32 %r29137, %r29136, %r29136, 25; + add.s32 %r29138, %r29109, %r28937; + add.s32 %r29139, %r29138, %r29090; + xor.b32 %r29140, %r29078, %r29139; + shf.l.wrap.b32 %r29141, %r29140, %r29140, 16; + add.s32 %r29142, %r29141, %r29121; + xor.b32 %r29143, %r29142, %r29109; + 
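// The stretch from $L__BB2_95 above down to the bfi.b64 byte-packing
+ // near the end of this kernel is an inlined, fully unrolled BLAKE3
+ // compression (callseq 3 above invokes the mangled
+ // _Z20blake3_hasher_updateP13blake3_hasherPKvy, i.e.
+ // blake3_hasher_update(blake3_hasher*, const void*, unsigned long
+ // long); this path appears to be the final-chunk output compress).
+ // The prmt.b32 chains assemble message bytes into little-endian 32-bit
+ // words, and the constants 1779033703, -1150833019, 1013904242,
+ // -1521486534, 1359893119, -1694144372, 528734635, 1541459225 are the
+ // IV words 0x6A09E667 ... 0x5BE0CD19. Every add/xor/shf.l.wrap quartet
+ // is one half of the G quarter-round; hedged C sketch (names
+ // illustrative, not from this source):
+ //
+ //   #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+ //   static void g(uint32_t s[16], int a, int b, int c, int d,
+ //                 uint32_t mx, uint32_t my) {
+ //       s[a] = s[a] + s[b] + mx;  s[d] = ROTR32(s[d] ^ s[a], 16);
+ //       s[c] = s[c] + s[d];       s[b] = ROTR32(s[b] ^ s[c], 12);
+ //       s[a] = s[a] + s[b] + my;  s[d] = ROTR32(s[d] ^ s[a],  8);
+ //       s[c] = s[c] + s[d];       s[b] = ROTR32(s[b] ^ s[c],  7);
+ //   }
+ //
+ // PTX has no rotate-right, so the right-rotations by 16/12/8/7 appear
+ // here as shf.l.wrap.b32 left-rotations by 16, 20, 24 and 25.
+ 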
shf.l.wrap.b32 %r29144, %r29143, %r29143, 20; + add.s32 %r29145, %r29139, %r28881; + add.s32 %r29146, %r29145, %r29144; + xor.b32 %r29147, %r29146, %r29141; + shf.l.wrap.b32 %r29148, %r29147, %r29147, 24; + add.s32 %r29149, %r29148, %r29142; + xor.b32 %r29150, %r29149, %r29144; + shf.l.wrap.b32 %r29151, %r29150, %r29150, 25; + add.s32 %r29152, %r29104, %r28913; + add.s32 %r29153, %r29152, %r29123; + xor.b32 %r29154, %r29092, %r29153; + shf.l.wrap.b32 %r29155, %r29154, %r29154, 16; + add.s32 %r29156, %r29155, %r29079; + xor.b32 %r29157, %r29156, %r29123; + shf.l.wrap.b32 %r29158, %r29157, %r29157, 20; + add.s32 %r29159, %r29153, %r28946; + add.s32 %r29160, %r29159, %r29158; + xor.b32 %r29161, %r29160, %r29155; + shf.l.wrap.b32 %r29162, %r29161, %r29161, 24; + add.s32 %r29163, %r29162, %r29156; + xor.b32 %r29164, %r29163, %r29158; + shf.l.wrap.b32 %r29165, %r29164, %r29164, 25; + add.s32 %r29166, %r29118, %r28950; + add.s32 %r29167, %r29166, %r29081; + xor.b32 %r29168, %r29167, %r29106; + shf.l.wrap.b32 %r29169, %r29168, %r29168, 16; + add.s32 %r29170, %r29169, %r29093; + xor.b32 %r29171, %r29170, %r29081; + shf.l.wrap.b32 %r29172, %r29171, %r29171, 20; + add.s32 %r29173, %r29167, %r28905; + add.s32 %r29174, %r29173, %r29172; + xor.b32 %r29175, %r29174, %r29169; + shf.l.wrap.b32 %r29176, %r29175, %r29175, 24; + add.s32 %r29177, %r29176, %r29170; + xor.b32 %r29178, %r29177, %r29172; + shf.l.wrap.b32 %r29179, %r29178, %r29178, 25; + add.s32 %r29180, %r29132, %r28865; + add.s32 %r29181, %r29180, %r29179; + xor.b32 %r29182, %r29181, %r29148; + shf.l.wrap.b32 %r29183, %r29182, %r29182, 16; + add.s32 %r29184, %r29183, %r29163; + xor.b32 %r29185, %r29184, %r29179; + shf.l.wrap.b32 %r29186, %r29185, %r29185, 20; + add.s32 %r29187, %r29181, %r28873; + add.s32 %r29188, %r29187, %r29186; + xor.b32 %r29189, %r29188, %r29183; + shf.l.wrap.b32 %r29190, %r29189, %r29189, 24; + add.s32 %r29191, %r29190, %r29184; + xor.b32 %r29192, %r29191, %r29186; + shf.l.wrap.b32 %r29193, %r29192, %r29192, 25; + add.s32 %r29194, %r29146, %r28921; + add.s32 %r29195, %r29194, %r29137; + xor.b32 %r29196, %r29195, %r29162; + shf.l.wrap.b32 %r29197, %r29196, %r29196, 16; + add.s32 %r29198, %r29197, %r29177; + xor.b32 %r29199, %r29198, %r29137; + shf.l.wrap.b32 %r29200, %r29199, %r29199, 20; + add.s32 %r29201, %r29195, %r28937; + add.s32 %r29202, %r29201, %r29200; + xor.b32 %r29203, %r29202, %r29197; + shf.l.wrap.b32 %r29204, %r29203, %r29203, 24; + add.s32 %r29205, %r29204, %r29198; + xor.b32 %r29206, %r29205, %r29200; + shf.l.wrap.b32 %r29207, %r29206, %r29206, 25; + add.s32 %r29208, %r29160, %r28945; + add.s32 %r29209, %r29208, %r29151; + xor.b32 %r29210, %r29176, %r29209; + shf.l.wrap.b32 %r29211, %r29210, %r29210, 16; + add.s32 %r29212, %r29211, %r29135; + xor.b32 %r29213, %r29212, %r29151; + shf.l.wrap.b32 %r29214, %r29213, %r29213, 20; + add.s32 %r29215, %r29209, %r28857; + add.s32 %r29216, %r29215, %r29214; + xor.b32 %r29217, %r29216, %r29211; + shf.l.wrap.b32 %r29218, %r29217, %r29217, 24; + add.s32 %r29219, %r29218, %r29212; + xor.b32 %r29220, %r29219, %r29214; + shf.l.wrap.b32 %r29221, %r29220, %r29220, 25; + add.s32 %r29222, %r29174, %r28897; + add.s32 %r29223, %r29222, %r29165; + xor.b32 %r29224, %r29134, %r29223; + shf.l.wrap.b32 %r29225, %r29224, %r29224, 16; + add.s32 %r29226, %r29225, %r29149; + xor.b32 %r29227, %r29226, %r29165; + shf.l.wrap.b32 %r29228, %r29227, %r29227, 20; + add.s32 %r29229, %r29223, %r28946; + add.s32 %r29230, %r29229, %r29228; + xor.b32 %r29231, %r29230, %r29225; + shf.l.wrap.b32 %r29232, 
%r29231, %r29231, 24; + add.s32 %r29233, %r29232, %r29226; + xor.b32 %r29234, %r29233, %r29228; + shf.l.wrap.b32 %r29235, %r29234, %r29234, 25; + add.s32 %r29236, %r29188, %r28889; + add.s32 %r29237, %r29236, %r29207; + xor.b32 %r29238, %r29237, %r29232; + shf.l.wrap.b32 %r29239, %r29238, %r29238, 16; + add.s32 %r29240, %r29239, %r29219; + xor.b32 %r29241, %r29240, %r29207; + shf.l.wrap.b32 %r29242, %r29241, %r29241, 20; + add.s32 %r29243, %r29237, %r28881; + add.s32 %r29244, %r29243, %r29242; + xor.b32 %r29245, %r29244, %r29239; + shf.l.wrap.b32 %r29246, %r29245, %r29245, 24; + add.s32 %r29247, %r29246, %r29240; + xor.b32 %r29248, %r29247, %r29242; + shf.l.wrap.b32 %r29249, %r29248, %r29248, 25; + add.s32 %r29250, %r29221, %r28913; + add.s32 %r29251, %r29250, %r29202; + xor.b32 %r29252, %r29190, %r29251; + shf.l.wrap.b32 %r29253, %r29252, %r29252, 16; + add.s32 %r29254, %r29253, %r29233; + xor.b32 %r29255, %r29254, %r29221; + shf.l.wrap.b32 %r29256, %r29255, %r29255, 20; + add.s32 %r29257, %r29251, %r28841; + add.s32 %r29258, %r29257, %r29256; + xor.b32 %r29259, %r29258, %r29253; + shf.l.wrap.b32 %r29260, %r29259, %r29259, 24; + add.s32 %r29261, %r29260, %r29254; + xor.b32 %r29262, %r29261, %r29256; + shf.l.wrap.b32 %r29263, %r29262, %r29262, 25; + add.s32 %r29264, %r29216, %r28929; + add.s32 %r29265, %r29264, %r29235; + xor.b32 %r29266, %r29204, %r29265; + shf.l.wrap.b32 %r29267, %r29266, %r29266, 16; + add.s32 %r29268, %r29267, %r29191; + xor.b32 %r29269, %r29268, %r29235; + shf.l.wrap.b32 %r29270, %r29269, %r29269, 20; + add.s32 %r29271, %r29265, %r28950; + add.s32 %r29272, %r29271, %r29270; + xor.b32 %r29273, %r29272, %r29267; + shf.l.wrap.b32 %r29274, %r29273, %r29273, 24; + add.s32 %r29275, %r29274, %r29268; + xor.b32 %r29276, %r29275, %r29270; + shf.l.wrap.b32 %r29277, %r29276, %r29276, 25; + add.s32 %r29278, %r29230, %r28905; + add.s32 %r29279, %r29278, %r29193; + xor.b32 %r29280, %r29279, %r29218; + shf.l.wrap.b32 %r29281, %r29280, %r29280, 16; + add.s32 %r29282, %r29281, %r29205; + xor.b32 %r29283, %r29282, %r29193; + shf.l.wrap.b32 %r29284, %r29283, %r29283, 20; + add.s32 %r29285, %r29279, %r28849; + add.s32 %r29286, %r29285, %r29284; + xor.b32 %r29287, %r29286, %r29281; + shf.l.wrap.b32 %r29288, %r29287, %r29287, 24; + add.s32 %r29289, %r29288, %r29282; + xor.b32 %r29290, %r29289, %r29284; + shf.l.wrap.b32 %r29291, %r29290, %r29290, 25; + add.s32 %r29292, %r29244, %r28921; + add.s32 %r29293, %r29292, %r29291; + xor.b32 %r29294, %r29293, %r29260; + shf.l.wrap.b32 %r29295, %r29294, %r29294, 16; + add.s32 %r29296, %r29295, %r29275; + xor.b32 %r29297, %r29296, %r29291; + shf.l.wrap.b32 %r29298, %r29297, %r29297, 20; + add.s32 %r29299, %r29293, %r28897; + add.s32 %r29300, %r29299, %r29298; + xor.b32 %r29301, %r29300, %r29295; + shf.l.wrap.b32 %r29302, %r29301, %r29301, 24; + add.s32 %r29303, %r29302, %r29296; + xor.b32 %r29304, %r29303, %r29298; + shf.l.wrap.b32 %r29305, %r29304, %r29304, 25; + add.s32 %r29306, %r29258, %r28937; + add.s32 %r29307, %r29306, %r29249; + xor.b32 %r29308, %r29307, %r29274; + shf.l.wrap.b32 %r29309, %r29308, %r29308, 16; + add.s32 %r29310, %r29309, %r29289; + xor.b32 %r29311, %r29310, %r29249; + shf.l.wrap.b32 %r29312, %r29311, %r29311, 20; + add.s32 %r29313, %r29307, %r28913; + add.s32 %r29314, %r29313, %r29312; + xor.b32 %r29315, %r29314, %r29309; + shf.l.wrap.b32 %r29316, %r29315, %r29315, 24; + add.s32 %r29317, %r29316, %r29310; + xor.b32 %r29318, %r29317, %r29312; + shf.l.wrap.b32 %r29319, %r29318, %r29318, 25; + add.s32 %r29320, %r29272, %r28946; + 
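// Each BLAKE3 round applies G eight times: four column steps on state
+ // indices (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), then four
+ // diagonal steps on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
+ // Seven such rounds run back to back, with the sixteen message words
+ // re-permuted between rounds -- which is why the same add/xor/rotate
+ // pattern keeps repeating with the %r288xx message registers
+ // reshuffled. Hedged sketch (names illustrative):
+ //
+ //   static const uint8_t PERM[16] =
+ //       {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8};
+ //   for (int r = 0; r < 7; r++) {
+ //       round_fn(state, m);              // the eight G calls above
+ //       if (r < 6) permute(m, PERM);     // m[i] = old_m[PERM[i]]
+ //   }
+ 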
add.s32 %r29321, %r29320, %r29263; + xor.b32 %r29322, %r29288, %r29321; + shf.l.wrap.b32 %r29323, %r29322, %r29322, 16; + add.s32 %r29324, %r29323, %r29247; + xor.b32 %r29325, %r29324, %r29263; + shf.l.wrap.b32 %r29326, %r29325, %r29325, 20; + add.s32 %r29327, %r29321, %r28865; + add.s32 %r29328, %r29327, %r29326; + xor.b32 %r29329, %r29328, %r29323; + shf.l.wrap.b32 %r29330, %r29329, %r29329, 24; + add.s32 %r29331, %r29330, %r29324; + xor.b32 %r29332, %r29331, %r29326; + shf.l.wrap.b32 %r29333, %r29332, %r29332, 25; + add.s32 %r29334, %r29286, %r28945; + add.s32 %r29335, %r29334, %r29277; + xor.b32 %r29336, %r29246, %r29335; + shf.l.wrap.b32 %r29337, %r29336, %r29336, 16; + add.s32 %r29338, %r29337, %r29261; + xor.b32 %r29339, %r29338, %r29277; + shf.l.wrap.b32 %r29340, %r29339, %r29339, 20; + add.s32 %r29341, %r29335, %r28950; + add.s32 %r29342, %r29341, %r29340; + xor.b32 %r29343, %r29342, %r29337; + shf.l.wrap.b32 %r29344, %r29343, %r29343, 24; + add.s32 %r29345, %r29344, %r29338; + xor.b32 %r29346, %r29345, %r29340; + shf.l.wrap.b32 %r29347, %r29346, %r29346, 25; + add.s32 %r29348, %r29300, %r28873; + add.s32 %r29349, %r29348, %r29319; + xor.b32 %r29350, %r29349, %r29344; + shf.l.wrap.b32 %r29351, %r29350, %r29350, 16; + add.s32 %r29352, %r29351, %r29331; + xor.b32 %r29353, %r29352, %r29319; + shf.l.wrap.b32 %r29354, %r29353, %r29353, 20; + add.s32 %r29355, %r29349, %r28841; + add.s32 %r29356, %r29355, %r29354; + xor.b32 %r29357, %r29356, %r29351; + shf.l.wrap.b32 %r29358, %r29357, %r29357, 24; + add.s32 %r29359, %r29358, %r29352; + xor.b32 %r29360, %r29359, %r29354; + shf.l.wrap.b32 %r29361, %r29360, %r29360, 25; + add.s32 %r29362, %r29333, %r28929; + add.s32 %r29363, %r29362, %r29314; + xor.b32 %r29364, %r29302, %r29363; + shf.l.wrap.b32 %r29365, %r29364, %r29364, 16; + add.s32 %r29366, %r29365, %r29345; + xor.b32 %r29367, %r29366, %r29333; + shf.l.wrap.b32 %r29368, %r29367, %r29367, 20; + add.s32 %r29369, %r29363, %r28857; + add.s32 %r29370, %r29369, %r29368; + xor.b32 %r29371, %r29370, %r29365; + shf.l.wrap.b32 %r29372, %r29371, %r29371, 24; + add.s32 %r29373, %r29372, %r29366; + xor.b32 %r29374, %r29373, %r29368; + shf.l.wrap.b32 %r29375, %r29374, %r29374, 25; + add.s32 %r29376, %r29328, %r28881; + add.s32 %r29377, %r29376, %r29347; + xor.b32 %r29378, %r29316, %r29377; + shf.l.wrap.b32 %r29379, %r29378, %r29378, 16; + add.s32 %r29380, %r29379, %r29303; + xor.b32 %r29381, %r29380, %r29347; + shf.l.wrap.b32 %r29382, %r29381, %r29381, 20; + add.s32 %r29383, %r29377, %r28905; + add.s32 %r29384, %r29383, %r29382; + xor.b32 %r29385, %r29384, %r29379; + shf.l.wrap.b32 %r29386, %r29385, %r29385, 24; + add.s32 %r29387, %r29386, %r29380; + xor.b32 %r29388, %r29387, %r29382; + shf.l.wrap.b32 %r29389, %r29388, %r29388, 25; + add.s32 %r29390, %r29342, %r28849; + add.s32 %r29391, %r29390, %r29305; + xor.b32 %r29392, %r29391, %r29330; + shf.l.wrap.b32 %r29393, %r29392, %r29392, 16; + add.s32 %r29394, %r29393, %r29317; + xor.b32 %r29395, %r29394, %r29305; + shf.l.wrap.b32 %r29396, %r29395, %r29395, 20; + add.s32 %r29397, %r29391, %r28889; + add.s32 %r29398, %r29397, %r29396; + xor.b32 %r29399, %r29398, %r29393; + shf.l.wrap.b32 %r29400, %r29399, %r29399, 24; + add.s32 %r29401, %r29400, %r29394; + xor.b32 %r29402, %r29401, %r29396; + shf.l.wrap.b32 %r29403, %r29402, %r29402, 25; + add.s32 %r29404, %r29356, %r28937; + add.s32 %r29405, %r29404, %r29403; + xor.b32 %r29406, %r29405, %r29372; + shf.l.wrap.b32 %r29407, %r29406, %r29406, 16; + add.s32 %r29408, %r29407, %r29387; + xor.b32 %r29409, 
%r29408, %r29403; + shf.l.wrap.b32 %r29410, %r29409, %r29409, 20; + add.s32 %r29411, %r29405, %r28945; + add.s32 %r29412, %r29411, %r29410; + xor.b32 %r29413, %r29412, %r29407; + shf.l.wrap.b32 %r29414, %r29413, %r29413, 24; + add.s32 %r29415, %r29414, %r29408; + xor.b32 %r29416, %r29415, %r29410; + shf.l.wrap.b32 %r29417, %r29416, %r29416, 25; + add.s32 %r29418, %r29370, %r28913; + add.s32 %r29419, %r29418, %r29361; + xor.b32 %r29420, %r29419, %r29386; + shf.l.wrap.b32 %r29421, %r29420, %r29420, 16; + add.s32 %r29422, %r29421, %r29401; + xor.b32 %r29423, %r29422, %r29361; + shf.l.wrap.b32 %r29424, %r29423, %r29423, 20; + add.s32 %r29425, %r29419, %r28929; + add.s32 %r29426, %r29425, %r29424; + xor.b32 %r29427, %r29426, %r29421; + shf.l.wrap.b32 %r29428, %r29427, %r29427, 24; + add.s32 %r29429, %r29428, %r29422; + xor.b32 %r29430, %r29429, %r29424; + shf.l.wrap.b32 %r29431, %r29430, %r29430, 25; + add.s32 %r29432, %r29384, %r28950; + add.s32 %r29433, %r29432, %r29375; + xor.b32 %r29434, %r29400, %r29433; + shf.l.wrap.b32 %r29435, %r29434, %r29434, 16; + add.s32 %r29436, %r29435, %r29359; + xor.b32 %r29437, %r29436, %r29375; + shf.l.wrap.b32 %r29438, %r29437, %r29437, 20; + add.s32 %r29439, %r29433, %r28921; + add.s32 %r29440, %r29439, %r29438; + xor.b32 %r29441, %r29440, %r29435; + shf.l.wrap.b32 %r29442, %r29441, %r29441, 24; + add.s32 %r29443, %r29442, %r29436; + xor.b32 %r29444, %r29443, %r29438; + shf.l.wrap.b32 %r29445, %r29444, %r29444, 25; + add.s32 %r29446, %r29398, %r28946; + add.s32 %r29447, %r29446, %r29389; + xor.b32 %r29448, %r29358, %r29447; + shf.l.wrap.b32 %r29449, %r29448, %r29448, 16; + add.s32 %r29450, %r29449, %r29373; + xor.b32 %r29451, %r29450, %r29389; + shf.l.wrap.b32 %r29452, %r29451, %r29451, 20; + add.s32 %r29453, %r29447, %r28905; + add.s32 %r29454, %r29453, %r29452; + xor.b32 %r29455, %r29454, %r29449; + shf.l.wrap.b32 %r29456, %r29455, %r29455, 24; + add.s32 %r29457, %r29456, %r29450; + xor.b32 %r29458, %r29457, %r29452; + shf.l.wrap.b32 %r29459, %r29458, %r29458, 25; + add.s32 %r29460, %r29412, %r28897; + add.s32 %r29461, %r29460, %r29431; + xor.b32 %r29462, %r29461, %r29456; + shf.l.wrap.b32 %r29463, %r29462, %r29462, 16; + add.s32 %r29464, %r29463, %r29443; + xor.b32 %r29465, %r29464, %r29431; + shf.l.wrap.b32 %r29466, %r29465, %r29465, 20; + add.s32 %r29467, %r29461, %r28857; + add.s32 %r29468, %r29467, %r29466; + xor.b32 %r29469, %r29468, %r29463; + shf.l.wrap.b32 %r29470, %r29469, %r29469, 24; + add.s32 %r29471, %r29470, %r29464; + xor.b32 %r29472, %r29471, %r29466; + shf.l.wrap.b32 %r29473, %r29472, %r29472, 25; + add.s32 %r29474, %r29445, %r28881; + add.s32 %r29475, %r29474, %r29426; + xor.b32 %r29476, %r29414, %r29475; + shf.l.wrap.b32 %r29477, %r29476, %r29476, 16; + add.s32 %r29478, %r29477, %r29457; + xor.b32 %r29479, %r29478, %r29445; + shf.l.wrap.b32 %r29480, %r29479, %r29479, 20; + add.s32 %r29481, %r29475, %r28865; + add.s32 %r29482, %r29481, %r29480; + xor.b32 %r29483, %r29482, %r29477; + shf.l.wrap.b32 %r29484, %r29483, %r29483, 24; + add.s32 %r29485, %r29484, %r29478; + xor.b32 %r29486, %r29485, %r29480; + shf.l.wrap.b32 %r29487, %r29486, %r29486, 25; + add.s32 %r29488, %r29440, %r28841; + add.s32 %r29489, %r29488, %r29459; + xor.b32 %r29490, %r29428, %r29489; + shf.l.wrap.b32 %r29491, %r29490, %r29490, 16; + add.s32 %r29492, %r29491, %r29415; + xor.b32 %r29493, %r29492, %r29459; + shf.l.wrap.b32 %r29494, %r29493, %r29493, 20; + add.s32 %r29495, %r29489, %r28849; + add.s32 %r29496, %r29495, %r29494; + xor.b32 %r29497, %r29496, %r29491; + 
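// After the seventh round the compression output is the feed-forward
+ // xor of the two state halves, out[i] = s[i] ^ s[i+8] for i in 0..7;
+ // that is the run of paired xor.b32 ops feeding the cvt/shl/bfi.b64
+ // sequence at the end of this block, where the masks 280375465082880
+ // (0x0000FF0000000000) and 71776119061217280 (0x00FF000000000000)
+ // place bytes of the high word into the packed little-endian 64-bit
+ // result. Hedged sketch (names illustrative):
+ //
+ //   for (int i = 0; i < 8; i++)
+ //       out_words[i] = s[i] ^ s[i + 8];   // serialized LE below
+ 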
shf.l.wrap.b32 %r29498, %r29497, %r29497, 24; + add.s32 %r29499, %r29498, %r29492; + xor.b32 %r29500, %r29499, %r29494; + shf.l.wrap.b32 %r29501, %r29500, %r29500, 25; + add.s32 %r29502, %r29454, %r28889; + add.s32 %r29503, %r29502, %r29417; + xor.b32 %r29504, %r29503, %r29442; + shf.l.wrap.b32 %r29505, %r29504, %r29504, 16; + add.s32 %r29506, %r29505, %r29429; + xor.b32 %r29507, %r29506, %r29417; + shf.l.wrap.b32 %r29508, %r29507, %r29507, 20; + add.s32 %r29509, %r29503, %r28873; + add.s32 %r29510, %r29509, %r29508; + xor.b32 %r29511, %r29510, %r29505; + shf.l.wrap.b32 %r29512, %r29511, %r29511, 24; + add.s32 %r29513, %r29512, %r29506; + xor.b32 %r29514, %r29513, %r29508; + shf.l.wrap.b32 %r29515, %r29514, %r29514, 25; + add.s32 %r29516, %r29468, %r28913; + add.s32 %r29517, %r29516, %r29515; + xor.b32 %r29518, %r29517, %r29484; + shf.l.wrap.b32 %r29519, %r29518, %r29518, 16; + add.s32 %r29520, %r29519, %r29499; + xor.b32 %r29521, %r29520, %r29515; + shf.l.wrap.b32 %r29522, %r29521, %r29521, 20; + add.s32 %r29523, %r29517, %r28946; + add.s32 %r29524, %r29523, %r29522; + xor.b32 %r29525, %r29524, %r29519; + shf.l.wrap.b32 %r29526, %r29525, %r29525, 24; + add.s32 %r29527, %r29526, %r29520; + xor.b32 %r29528, %r29527, %r29522; + shf.l.wrap.b32 %r29529, %r29528, %r29528, 25; + add.s32 %r29530, %r29482, %r28929; + add.s32 %r29531, %r29530, %r29473; + xor.b32 %r29532, %r29531, %r29498; + shf.l.wrap.b32 %r29533, %r29532, %r29532, 16; + add.s32 %r29534, %r29533, %r29513; + xor.b32 %r29535, %r29534, %r29473; + shf.l.wrap.b32 %r29536, %r29535, %r29535, 20; + add.s32 %r29537, %r29531, %r28881; + add.s32 %r29538, %r29537, %r29536; + xor.b32 %r29539, %r29538, %r29533; + shf.l.wrap.b32 %r29540, %r29539, %r29539, 24; + add.s32 %r29541, %r29540, %r29534; + xor.b32 %r29542, %r29541, %r29536; + shf.l.wrap.b32 %r29543, %r29542, %r29542, 25; + add.s32 %r29544, %r29496, %r28905; + add.s32 %r29545, %r29544, %r29487; + xor.b32 %r29546, %r29512, %r29545; + shf.l.wrap.b32 %r29547, %r29546, %r29546, 16; + add.s32 %r29548, %r29547, %r29471; + xor.b32 %r29549, %r29548, %r29487; + shf.l.wrap.b32 %r29550, %r29549, %r29549, 20; + add.s32 %r29551, %r29545, %r28937; + add.s32 %r29552, %r29551, %r29550; + xor.b32 %r29553, %r29552, %r29547; + shf.l.wrap.b32 %r29554, %r29553, %r29553, 24; + add.s32 %r29555, %r29554, %r29548; + xor.b32 %r29556, %r29555, %r29550; + shf.l.wrap.b32 %r29557, %r29556, %r29556, 25; + add.s32 %r29558, %r29510, %r28950; + add.s32 %r29559, %r29558, %r29501; + xor.b32 %r29560, %r29470, %r29559; + shf.l.wrap.b32 %r29561, %r29560, %r29560, 16; + add.s32 %r29562, %r29561, %r29485; + xor.b32 %r29563, %r29562, %r29501; + shf.l.wrap.b32 %r29564, %r29563, %r29563, 20; + add.s32 %r29565, %r29559, %r28849; + add.s32 %r29566, %r29565, %r29564; + xor.b32 %r29567, %r29566, %r29561; + shf.l.wrap.b32 %r29568, %r29567, %r29567, 24; + add.s32 %r29569, %r29568, %r29562; + xor.b32 %r29570, %r29569, %r29564; + shf.l.wrap.b32 %r29571, %r29570, %r29570, 25; + add.s32 %r29572, %r29524, %r28945; + add.s32 %r29573, %r29572, %r29543; + xor.b32 %r29574, %r29573, %r29568; + shf.l.wrap.b32 %r29575, %r29574, %r29574, 16; + add.s32 %r29576, %r29575, %r29555; + xor.b32 %r29577, %r29576, %r29543; + shf.l.wrap.b32 %r29578, %r29577, %r29577, 20; + add.s32 %r29579, %r29573, %r28865; + add.s32 %r29580, %r29579, %r29578; + xor.b32 %r29581, %r29580, %r29575; + shf.l.wrap.b32 %r29582, %r29581, %r29581, 24; + add.s32 %r29583, %r29582, %r29576; + xor.b32 %r29584, %r29583, %r29578; + shf.l.wrap.b32 %r29585, %r29584, %r29584, 25; + add.s32 
%r29586, %r29557, %r28841; + add.s32 %r29587, %r29586, %r29538; + xor.b32 %r29588, %r29526, %r29587; + shf.l.wrap.b32 %r29589, %r29588, %r29588, 16; + add.s32 %r29590, %r29589, %r29569; + xor.b32 %r29591, %r29590, %r29557; + shf.l.wrap.b32 %r29592, %r29591, %r29591, 20; + add.s32 %r29593, %r29587, %r28921; + add.s32 %r29594, %r29593, %r29592; + xor.b32 %r29595, %r29594, %r29589; + shf.l.wrap.b32 %r29596, %r29595, %r29595, 24; + add.s32 %r29597, %r29596, %r29590; + xor.b32 %r29598, %r29597, %r29592; + shf.l.wrap.b32 %r29599, %r29598, %r29598, 25; + add.s32 %r29600, %r29552, %r28857; + add.s32 %r29601, %r29600, %r29571; + xor.b32 %r29602, %r29540, %r29601; + shf.l.wrap.b32 %r29603, %r29602, %r29602, 16; + add.s32 %r29604, %r29603, %r29527; + xor.b32 %r29605, %r29604, %r29571; + shf.l.wrap.b32 %r29606, %r29605, %r29605, 20; + add.s32 %r29607, %r29601, %r28889; + add.s32 %r29608, %r29607, %r29606; + xor.b32 %r29609, %r29608, %r29603; + shf.l.wrap.b32 %r29610, %r29609, %r29609, 24; + add.s32 %r29611, %r29610, %r29604; + xor.b32 %r29612, %r29611, %r29606; + shf.l.wrap.b32 %r29613, %r29612, %r29612, 25; + add.s32 %r29614, %r29566, %r28873; + add.s32 %r29615, %r29614, %r29529; + xor.b32 %r29616, %r29615, %r29554; + shf.l.wrap.b32 %r29617, %r29616, %r29616, 16; + add.s32 %r29618, %r29617, %r29541; + xor.b32 %r29619, %r29618, %r29529; + shf.l.wrap.b32 %r29620, %r29619, %r29619, 20; + add.s32 %r29621, %r29615, %r28897; + add.s32 %r29622, %r29621, %r29620; + xor.b32 %r29623, %r29622, %r29617; + shf.l.wrap.b32 %r29624, %r29623, %r29623, 24; + add.s32 %r29625, %r29624, %r29618; + xor.b32 %r29626, %r29625, %r29620; + shf.l.wrap.b32 %r29627, %r29626, %r29626, 25; + add.s32 %r29628, %r29580, %r28929; + add.s32 %r29629, %r29628, %r29627; + xor.b32 %r29630, %r29629, %r29596; + shf.l.wrap.b32 %r29631, %r29630, %r29630, 16; + add.s32 %r29632, %r29631, %r29611; + xor.b32 %r29633, %r29632, %r29627; + shf.l.wrap.b32 %r29634, %r29633, %r29633, 20; + add.s32 %r29635, %r29629, %r28950; + add.s32 %r29636, %r29635, %r29634; + xor.b32 %r29637, %r29636, %r29631; + shf.l.wrap.b32 %r29638, %r29637, %r29637, 24; + add.s32 %r29639, %r29638, %r29632; + xor.b32 %r29640, %r29639, %r29634; + shf.l.wrap.b32 %r29641, %r29640, %r29640, 25; + add.s32 %r29642, %r29594, %r28881; + add.s32 %r29643, %r29642, %r29585; + xor.b32 %r29644, %r29643, %r29610; + shf.l.wrap.b32 %r29645, %r29644, %r29644, 16; + add.s32 %r29646, %r29645, %r29625; + xor.b32 %r29647, %r29646, %r29585; + shf.l.wrap.b32 %r29648, %r29647, %r29647, 20; + add.s32 %r29649, %r29643, %r28841; + add.s32 %r29650, %r29649, %r29648; + xor.b32 %r29651, %r29650, %r29645; + shf.l.wrap.b32 %r29652, %r29651, %r29651, 24; + add.s32 %r29653, %r29652, %r29646; + xor.b32 %r29654, %r29653, %r29648; + shf.l.wrap.b32 %r29655, %r29654, %r29654, 25; + add.s32 %r29656, %r29608, %r28849; + add.s32 %r29657, %r29656, %r29599; + xor.b32 %r29658, %r29624, %r29657; + shf.l.wrap.b32 %r29659, %r29658, %r29658, 16; + add.s32 %r29660, %r29659, %r29583; + xor.b32 %r29661, %r29660, %r29599; + shf.l.wrap.b32 %r29662, %r29661, %r29661, 20; + add.s32 %r29663, %r29657, %r28913; + add.s32 %r29664, %r29663, %r29662; + xor.b32 %r29665, %r29664, %r29659; + shf.l.wrap.b32 %r29666, %r29665, %r29665, 24; + add.s32 %r29667, %r29666, %r29660; + xor.b32 %r29668, %r29667, %r29662; + shf.l.wrap.b32 %r29669, %r29668, %r29668, 25; + add.s32 %r29670, %r29622, %r28905; + add.s32 %r29671, %r29670, %r29613; + xor.b32 %r29672, %r29582, %r29671; + shf.l.wrap.b32 %r29673, %r29672, %r29672, 16; + add.s32 %r29674, %r29673, 
%r29597; + xor.b32 %r29675, %r29674, %r29613; + shf.l.wrap.b32 %r29676, %r29675, %r29675, 20; + add.s32 %r29677, %r29671, %r28889; + add.s32 %r29678, %r29677, %r29676; + xor.b32 %r29679, %r29678, %r29673; + shf.l.wrap.b32 %r29680, %r29679, %r29679, 24; + add.s32 %r29681, %r29680, %r29674; + xor.b32 %r29682, %r29681, %r29676; + shf.l.wrap.b32 %r29683, %r29682, %r29682, 25; + add.s32 %r29684, %r29636, %r28946; + add.s32 %r29685, %r29684, %r29655; + xor.b32 %r29686, %r29685, %r29680; + shf.l.wrap.b32 %r29687, %r29686, %r29686, 16; + add.s32 %r29688, %r29687, %r29667; + xor.b32 %r29689, %r29688, %r29655; + shf.l.wrap.b32 %r29690, %r29689, %r29689, 20; + add.s32 %r29691, %r29685, %r28921; + add.s32 %r29692, %r29691, %r29690; + xor.b32 %r29693, %r29692, %r29687; + shf.l.wrap.b32 %r29694, %r29693, %r29693, 24; + add.s32 %r29695, %r29694, %r29688; + xor.b32 %r29696, %r29695, %r29690; + shf.l.wrap.b32 %r29697, %r29696, %r29696, 25; + add.s32 %r29698, %r29669, %r28857; + add.s32 %r29699, %r29698, %r29650; + xor.b32 %r29700, %r29638, %r29699; + shf.l.wrap.b32 %r29701, %r29700, %r29700, 16; + add.s32 %r29702, %r29701, %r29681; + xor.b32 %r29703, %r29702, %r29669; + shf.l.wrap.b32 %r29704, %r29703, %r29703, 20; + add.s32 %r29705, %r29699, %r28937; + add.s32 %r29706, %r29705, %r29704; + xor.b32 %r29707, %r29706, %r29701; + shf.l.wrap.b32 %r29708, %r29707, %r29707, 24; + add.s32 %r29709, %r29708, %r29702; + xor.b32 %r29710, %r29709, %r29704; + shf.l.wrap.b32 %r29711, %r29710, %r29710, 25; + add.s32 %r29712, %r29664, %r28865; + add.s32 %r29713, %r29712, %r29683; + xor.b32 %r29714, %r29652, %r29713; + shf.l.wrap.b32 %r29715, %r29714, %r29714, 16; + add.s32 %r29716, %r29715, %r29639; + xor.b32 %r29717, %r29716, %r29683; + shf.l.wrap.b32 %r29718, %r29717, %r29717, 20; + add.s32 %r29719, %r29713, %r28873; + add.s32 %r29720, %r29719, %r29718; + xor.b32 %r29721, %r29720, %r29715; + shf.l.wrap.b32 %r29722, %r29721, %r29721, 24; + add.s32 %r29723, %r29722, %r29716; + xor.b32 %r29724, %r29723, %r29718; + shf.l.wrap.b32 %r29725, %r29724, %r29724, 25; + add.s32 %r29726, %r29678, %r28897; + add.s32 %r29727, %r29726, %r29641; + xor.b32 %r29728, %r29727, %r29666; + shf.l.wrap.b32 %r29729, %r29728, %r29728, 16; + add.s32 %r29730, %r29729, %r29653; + xor.b32 %r29731, %r29730, %r29641; + shf.l.wrap.b32 %r29732, %r29731, %r29731, 20; + add.s32 %r29733, %r29727, %r28945; + add.s32 %r29734, %r29733, %r29732; + xor.b32 %r29735, %r29734, %r29729; + shf.l.wrap.b32 %r29736, %r29735, %r29735, 24; + add.s32 %r29737, %r29736, %r29730; + xor.b32 %r29738, %r29737, %r29732; + shf.l.wrap.b32 %r29739, %r29738, %r29738, 25; + xor.b32 %r29740, %r29692, %r29723; + cvt.u64.u32 %rd1207, %r29740; + xor.b32 %r29741, %r29737, %r29706; + and.b32 %r29742, %r29741, 255; + cvt.u64.u32 %rd1208, %r29742; + cvt.u64.u32 %rd1209, %r29741; + shl.b64 %rd1210, %rd1209, 32; + and.b64 %rd1211, %rd1210, 280375465082880; + and.b64 %rd1212, %rd1210, 71776119061217280; + shr.u32 %r29743, %r29741, 24; + cvt.u64.u32 %rd1213, %r29743; + shl.b64 %rd1214, %rd1213, 56; + bfi.b64 %rd1215, %rd1208, %rd1207, 32, 32; + or.b64 %rd1216, %rd1215, %rd1211; + or.b64 %rd1217, %rd1216, %rd1212; + or.b64 %rd353, %rd1217, %rd1214; + xor.b32 %r29744, %r29695, %r29720; + cvt.u64.u32 %rd1218, %r29744; + xor.b32 %r29745, %r29734, %r29709; + and.b32 %r29746, %r29745, 255; + cvt.u64.u32 %rd1219, %r29746; + cvt.u64.u32 %rd1220, %r29745; + shl.b64 %rd1221, %rd1220, 32; + and.b64 %rd1222, %rd1221, 280375465082880; + and.b64 %rd1223, %rd1221, 71776119061217280; + shr.u32 %r29747, %r29745, 
24; + cvt.u64.u32 %rd1224, %r29747; + shl.b64 %rd1225, %rd1224, 56; + bfi.b64 %rd1226, %rd1219, %rd1218, 32, 32; + or.b64 %rd1227, %rd1226, %rd1222; + or.b64 %rd1228, %rd1227, %rd1223; + or.b64 %rd352, %rd1228, %rd1225; + xor.b32 %r29748, %r29739, %r29708; + cvt.u64.u32 %rd1229, %r29748; + xor.b32 %r29749, %r29697, %r29722; + and.b32 %r29750, %r29749, 255; + cvt.u64.u32 %rd1230, %r29750; + cvt.u64.u32 %rd1231, %r29749; + shl.b64 %rd1232, %rd1231, 32; + and.b64 %rd1233, %rd1232, 280375465082880; + and.b64 %rd1234, %rd1232, 71776119061217280; + shr.u32 %r29751, %r29749, 24; + cvt.u64.u32 %rd1235, %r29751; + shl.b64 %rd1236, %rd1235, 56; + bfi.b64 %rd1237, %rd1230, %rd1229, 32, 32; + or.b64 %rd1238, %rd1237, %rd1233; + or.b64 %rd1239, %rd1238, %rd1234; + or.b64 %rd1370, %rd1239, %rd1236; + xor.b32 %r29752, %r29736, %r29711; + cvt.u64.u32 %rd1240, %r29752; + xor.b32 %r29753, %r29694, %r29725; + and.b32 %r29754, %r29753, 255; + cvt.u64.u32 %rd1241, %r29754; + cvt.u64.u32 %rd1242, %r29753; + shl.b64 %rd1243, %rd1242, 32; + and.b64 %rd1244, %rd1243, 280375465082880; + and.b64 %rd1245, %rd1243, 71776119061217280; + shr.u32 %r29755, %r29753, 24; + cvt.u64.u32 %rd1246, %r29755; + shl.b64 %rd1247, %rd1246, 56; + bfi.b64 %rd1248, %rd1241, %rd1240, 32, 32; + or.b64 %rd1249, %rd1248, %rd1244; + or.b64 %rd1250, %rd1249, %rd1245; + or.b64 %rd1369, %rd1250, %rd1247; + bra.uni $L__BB2_96; + +$L__BB2_89: + setp.eq.s16 %p51, %rs500, 0; + selp.u16 %rs502, 1, 0, %p51; + ld.local.u8 %rs665, [%rd2+138]; + or.b16 %rs503, %rs665, %rs502; + or.b16 %rs732, %rs503, 2; + ld.local.u64 %rd1368, [%rd2+64]; + ld.local.v2.u32 {%r31257, %r31256}, [%rd2+32]; + ld.local.v2.u32 {%r31255, %r31254}, [%rd2+40]; + ld.local.v2.u32 {%r31253, %r31252}, [%rd2+48]; + ld.local.v2.u32 {%r31251, %r31250}, [%rd2+56]; + ld.local.v4.u16 {%rs798, %rs800, %rs802, %rs804}, [%rd2+72]; + shr.u16 %rs799, %rs798, 8; + shr.u16 %rs801, %rs800, 8; + shr.u16 %rs803, %rs802, 8; + shr.u16 %rs805, %rs804, 8; + ld.local.v4.u16 {%rs806, %rs808, %rs810, %rs812}, [%rd2+80]; + shr.u16 %rs807, %rs806, 8; + shr.u16 %rs809, %rs808, 8; + shr.u16 %rs811, %rs810, 8; + shr.u16 %rs813, %rs812, 8; + ld.local.v4.u16 {%rs814, %rs816, %rs818, %rs820}, [%rd2+88]; + shr.u16 %rs815, %rs814, 8; + shr.u16 %rs817, %rs816, 8; + shr.u16 %rs819, %rs818, 8; + shr.u16 %rs821, %rs820, 8; + ld.local.v4.u16 {%rs822, %rs824, %rs826, %rs828}, [%rd2+96]; + shr.u16 %rs823, %rs822, 8; + shr.u16 %rs825, %rs824, 8; + shr.u16 %rs827, %rs826, 8; + shr.u16 %rs829, %rs828, 8; + ld.local.v4.u16 {%rs830, %rs832, %rs834, %rs836}, [%rd2+104]; + shr.u16 %rs831, %rs830, 8; + shr.u16 %rs833, %rs832, 8; + shr.u16 %rs835, %rs834, 8; + shr.u16 %rs837, %rs836, 8; + ld.local.v4.u16 {%rs838, %rs840, %rs842, %rs844}, [%rd2+112]; + shr.u16 %rs839, %rs838, 8; + shr.u16 %rs841, %rs840, 8; + shr.u16 %rs843, %rs842, 8; + shr.u16 %rs845, %rs844, 8; + ld.local.v4.u8 {%rs846, %rs847, %rs848, %rs849}, [%rd2+120]; + ld.local.v2.u8 {%rs850, %rs851}, [%rd2+124]; + ld.local.v2.u8 {%rs852, %rs853}, [%rd2+126]; + ld.local.v4.u8 {%rs854, %rs855, %rs856, %rs857}, [%rd2+128]; + ld.local.v2.u8 {%rs858, %rs859}, [%rd2+132]; + ld.local.v2.u8 {%rs860, %rs861}, [%rd2+134]; + +$L__BB2_91: + setp.eq.s64 %p52, %rd1367, 0; + mov.u32 %r31258, %r31257; + mov.u32 %r31259, %r31256; + mov.u32 %r31260, %r31255; + mov.u32 %r31261, %r31254; + mov.u32 %r31262, %r31253; + mov.u32 %r31263, %r31252; + mov.u32 %r31264, %r31251; + mov.u32 %r31265, %r31250; + mov.u16 %rs863, %rs732; + @%p52 bra $L__BB2_94; + + or.b16 %rs863, %rs665, 4; + 
ld.local.v2.u32 {%r31258, %r31259}, [%rd2]; + ld.local.v2.u32 {%r31260, %r31261}, [%rd2+8]; + ld.local.v2.u32 {%r31262, %r31263}, [%rd2+16]; + ld.local.v2.u32 {%r31264, %r31265}, [%rd2+24]; + mov.u16 %rs766, %rs829; + mov.u16 %rs767, %rs828; + mov.u16 %rs768, %rs827; + mov.u16 %rs769, %rs826; + mov.u16 %rs770, %rs825; + mov.u16 %rs771, %rs824; + mov.u16 %rs772, %rs823; + mov.u16 %rs773, %rs822; + mov.u16 %rs774, %rs821; + mov.u16 %rs775, %rs820; + mov.u16 %rs776, %rs819; + mov.u16 %rs777, %rs818; + mov.u16 %rs778, %rs817; + mov.u16 %rs779, %rs816; + mov.u16 %rs780, %rs815; + mov.u16 %rs781, %rs814; + mov.u16 %rs782, %rs813; + mov.u16 %rs783, %rs812; + mov.u16 %rs784, %rs811; + mov.u16 %rs785, %rs810; + mov.u16 %rs786, %rs809; + mov.u16 %rs787, %rs808; + mov.u16 %rs788, %rs807; + mov.u16 %rs789, %rs806; + mov.u16 %rs790, %rs805; + mov.u16 %rs791, %rs804; + mov.u16 %rs792, %rs803; + mov.u16 %rs793, %rs802; + mov.u16 %rs794, %rs801; + mov.u16 %rs795, %rs800; + mov.u16 %rs796, %rs799; + mov.u16 %rs797, %rs798; + +$L__BB2_93: + add.s64 %rd1367, %rd1367, -1; + shl.b64 %rd1160, %rd1367, 5; + add.s64 %rd1161, %rd2, %rd1160; + ld.local.u8 %rs798, [%rd1161+145]; + mov.u64 %rd1159, 0; + ld.local.u8 %rs799, [%rd1161+146]; + ld.local.u8 %rs800, [%rd1161+147]; + ld.local.u8 %rs801, [%rd1161+148]; + ld.local.u8 %rs802, [%rd1161+149]; + ld.local.u8 %rs803, [%rd1161+150]; + ld.local.u8 %rs804, [%rd1161+151]; + ld.local.u8 %rs805, [%rd1161+152]; + ld.local.u8 %rs806, [%rd1161+153]; + ld.local.u8 %rs807, [%rd1161+154]; + ld.local.u8 %rs808, [%rd1161+155]; + ld.local.u8 %rs809, [%rd1161+156]; + ld.local.u8 %rs810, [%rd1161+157]; + ld.local.u8 %rs811, [%rd1161+158]; + ld.local.u8 %rs812, [%rd1161+159]; + ld.local.u8 %rs813, [%rd1161+160]; + ld.local.u8 %rs814, [%rd1161+161]; + ld.local.u8 %rs815, [%rd1161+162]; + ld.local.u8 %rs816, [%rd1161+163]; + ld.local.u8 %rs817, [%rd1161+164]; + ld.local.u8 %rs818, [%rd1161+165]; + ld.local.u8 %rs819, [%rd1161+166]; + ld.local.u8 %rs820, [%rd1161+167]; + ld.local.u8 %rs821, [%rd1161+168]; + ld.local.u8 %rs822, [%rd1161+169]; + ld.local.u8 %rs823, [%rd1161+170]; + ld.local.u8 %rs824, [%rd1161+171]; + ld.local.u8 %rs825, [%rd1161+172]; + ld.local.u8 %rs826, [%rd1161+173]; + ld.local.u8 %rs827, [%rd1161+174]; + ld.local.u8 %rs828, [%rd1161+175]; + ld.local.u8 %rs829, [%rd1161+176]; + cvt.u32.u16 %r26825, %rs797; + and.b32 %r26826, %r26825, 255; + cvt.u32.u16 %r26827, %rs796; + prmt.b32 %r26828, %r26827, %r26826, 30212; + cvt.u32.u16 %r26829, %rs795; + shl.b32 %r26830, %r26829, 16; + and.b32 %r26831, %r26830, 16711680; + or.b32 %r26832, %r26828, %r26831; + cvt.u32.u16 %r26833, %rs794; + shl.b32 %r26834, %r26833, 24; + or.b32 %r26835, %r26832, %r26834; + cvt.u32.u16 %r26836, %rs793; + and.b32 %r26837, %r26836, 255; + cvt.u32.u16 %r26838, %rs792; + prmt.b32 %r26839, %r26838, %r26837, 30212; + cvt.u32.u16 %r26840, %rs791; + shl.b32 %r26841, %r26840, 16; + and.b32 %r26842, %r26841, 16711680; + or.b32 %r26843, %r26839, %r26842; + cvt.u32.u16 %r26844, %rs790; + shl.b32 %r26845, %r26844, 24; + or.b32 %r26846, %r26843, %r26845; + cvt.u32.u16 %r26847, %rs789; + and.b32 %r26848, %r26847, 255; + cvt.u32.u16 %r26849, %rs788; + prmt.b32 %r26850, %r26849, %r26848, 30212; + cvt.u32.u16 %r26851, %rs787; + shl.b32 %r26852, %r26851, 16; + and.b32 %r26853, %r26852, 16711680; + or.b32 %r26854, %r26850, %r26853; + cvt.u32.u16 %r26855, %rs786; + shl.b32 %r26856, %r26855, 24; + or.b32 %r26857, %r26854, %r26856; + cvt.u32.u16 %r26858, %rs785; + and.b32 %r26859, %r26858, 255; + cvt.u32.u16 %r26860, 
%rs784; + prmt.b32 %r26861, %r26860, %r26859, 30212; + cvt.u32.u16 %r26862, %rs783; + shl.b32 %r26863, %r26862, 16; + and.b32 %r26864, %r26863, 16711680; + or.b32 %r26865, %r26861, %r26864; + cvt.u32.u16 %r26866, %rs782; + shl.b32 %r26867, %r26866, 24; + or.b32 %r26868, %r26865, %r26867; + cvt.u32.u16 %r26869, %rs781; + and.b32 %r26870, %r26869, 255; + cvt.u32.u16 %r26871, %rs780; + prmt.b32 %r26872, %r26871, %r26870, 30212; + cvt.u32.u16 %r26873, %rs779; + shl.b32 %r26874, %r26873, 16; + and.b32 %r26875, %r26874, 16711680; + or.b32 %r26876, %r26872, %r26875; + cvt.u32.u16 %r26877, %rs778; + shl.b32 %r26878, %r26877, 24; + or.b32 %r26879, %r26876, %r26878; + cvt.u32.u16 %r26880, %rs777; + and.b32 %r26881, %r26880, 255; + cvt.u32.u16 %r26882, %rs776; + prmt.b32 %r26883, %r26882, %r26881, 30212; + cvt.u32.u16 %r26884, %rs775; + shl.b32 %r26885, %r26884, 16; + and.b32 %r26886, %r26885, 16711680; + or.b32 %r26887, %r26883, %r26886; + cvt.u32.u16 %r26888, %rs774; + shl.b32 %r26889, %r26888, 24; + or.b32 %r26890, %r26887, %r26889; + cvt.u32.u16 %r26891, %rs773; + and.b32 %r26892, %r26891, 255; + cvt.u32.u16 %r26893, %rs772; + prmt.b32 %r26894, %r26893, %r26892, 30212; + cvt.u32.u16 %r26895, %rs771; + shl.b32 %r26896, %r26895, 16; + and.b32 %r26897, %r26896, 16711680; + or.b32 %r26898, %r26894, %r26897; + cvt.u32.u16 %r26899, %rs770; + shl.b32 %r26900, %r26899, 24; + or.b32 %r26901, %r26898, %r26900; + cvt.u32.u16 %r26902, %rs769; + and.b32 %r26903, %r26902, 255; + cvt.u32.u16 %r26904, %rs768; + prmt.b32 %r26905, %r26904, %r26903, 30212; + cvt.u32.u16 %r26906, %rs767; + shl.b32 %r26907, %r26906, 16; + and.b32 %r26908, %r26907, 16711680; + or.b32 %r26909, %r26905, %r26908; + cvt.u32.u16 %r26910, %rs766; + shl.b32 %r26911, %r26910, 24; + or.b32 %r26912, %r26909, %r26911; + cvt.u32.u16 %r26913, %rs830; + and.b32 %r26914, %r26913, 255; + cvt.u32.u16 %r26915, %rs831; + prmt.b32 %r26916, %r26915, %r26914, 30212; + cvt.u32.u16 %r26917, %rs832; + shl.b32 %r26918, %r26917, 16; + and.b32 %r26919, %r26918, 16711680; + or.b32 %r26920, %r26916, %r26919; + cvt.u32.u16 %r26921, %rs833; + shl.b32 %r26922, %r26921, 24; + or.b32 %r26923, %r26920, %r26922; + cvt.u32.u16 %r26924, %rs834; + and.b32 %r26925, %r26924, 255; + cvt.u32.u16 %r26926, %rs835; + prmt.b32 %r26927, %r26926, %r26925, 30212; + cvt.u32.u16 %r26928, %rs836; + shl.b32 %r26929, %r26928, 16; + and.b32 %r26930, %r26929, 16711680; + or.b32 %r26931, %r26927, %r26930; + cvt.u32.u16 %r26932, %rs837; + shl.b32 %r26933, %r26932, 24; + or.b32 %r26934, %r26931, %r26933; + cvt.u32.u16 %r26935, %rs838; + and.b32 %r26936, %r26935, 255; + cvt.u32.u16 %r26937, %rs839; + prmt.b32 %r26938, %r26937, %r26936, 30212; + cvt.u32.u16 %r26939, %rs840; + shl.b32 %r26940, %r26939, 16; + and.b32 %r26941, %r26940, 16711680; + or.b32 %r26942, %r26938, %r26941; + cvt.u32.u16 %r26943, %rs841; + shl.b32 %r26944, %r26943, 24; + or.b32 %r26945, %r26942, %r26944; + cvt.u32.u16 %r26946, %rs842; + and.b32 %r26947, %r26946, 255; + cvt.u32.u16 %r26948, %rs843; + prmt.b32 %r26949, %r26948, %r26947, 30212; + cvt.u32.u16 %r26950, %rs844; + shl.b32 %r26951, %r26950, 16; + and.b32 %r26952, %r26951, 16711680; + or.b32 %r26953, %r26949, %r26952; + cvt.u32.u16 %r26954, %rs845; + shl.b32 %r26955, %r26954, 24; + or.b32 %r26956, %r26953, %r26955; + cvt.u32.u16 %r26957, %rs846; + and.b32 %r26958, %r26957, 255; + cvt.u32.u16 %r26959, %rs847; + prmt.b32 %r26960, %r26959, %r26958, 30212; + cvt.u32.u16 %r26961, %rs848; + shl.b32 %r26962, %r26961, 16; + and.b32 %r26963, %r26962, 16711680; + or.b32 
%r26964, %r26960, %r26963; + cvt.u32.u16 %r26965, %rs849; + shl.b32 %r26966, %r26965, 24; + or.b32 %r26967, %r26964, %r26966; + cvt.u32.u16 %r26968, %rs850; + and.b32 %r26969, %r26968, 255; + cvt.u32.u16 %r26970, %rs851; + prmt.b32 %r26971, %r26970, %r26969, 30212; + cvt.u32.u16 %r26972, %rs852; + shl.b32 %r26973, %r26972, 16; + and.b32 %r26974, %r26973, 16711680; + or.b32 %r26975, %r26971, %r26974; + cvt.u32.u16 %r26976, %rs853; + shl.b32 %r26977, %r26976, 24; + or.b32 %r26978, %r26975, %r26977; + cvt.u32.u16 %r26979, %rs854; + and.b32 %r26980, %r26979, 255; + cvt.u32.u16 %r26981, %rs855; + prmt.b32 %r26982, %r26981, %r26980, 30212; + cvt.u32.u16 %r26983, %rs856; + shl.b32 %r26984, %r26983, 16; + and.b32 %r26985, %r26984, 16711680; + or.b32 %r26986, %r26982, %r26985; + cvt.u32.u16 %r26987, %rs857; + shl.b32 %r26988, %r26987, 24; + or.b32 %r26989, %r26986, %r26988; + cvt.u32.u16 %r26990, %rs858; + and.b32 %r26991, %r26990, 255; + cvt.u32.u16 %r26992, %rs859; + prmt.b32 %r26993, %r26992, %r26991, 30212; + cvt.u32.u16 %r26994, %rs860; + shl.b32 %r26995, %r26994, 16; + and.b32 %r26996, %r26995, 16711680; + or.b32 %r26997, %r26993, %r26996; + cvt.u32.u16 %r26998, %rs861; + shl.b32 %r26999, %r26998, 24; + or.b32 %r27000, %r26997, %r26999; + shr.u64 %rd1162, %rd1368, 32; + cvt.u32.u64 %r27001, %rd1162; + add.s32 %r27002, %r31257, %r26835; + add.s32 %r27003, %r27002, %r31253; + cvt.u32.u64 %r27004, %rd1368; + xor.b32 %r27005, %r27003, %r27004; + shf.l.wrap.b32 %r27006, %r27005, %r27005, 16; + add.s32 %r27007, %r27006, 1779033703; + xor.b32 %r27008, %r27007, %r31253; + shf.l.wrap.b32 %r27009, %r27008, %r27008, 20; + add.s32 %r27010, %r27003, %r26846; + add.s32 %r27011, %r27010, %r27009; + xor.b32 %r27012, %r27011, %r27006; + shf.l.wrap.b32 %r27013, %r27012, %r27012, 24; + add.s32 %r27014, %r27013, %r27007; + xor.b32 %r27015, %r27014, %r27009; + shf.l.wrap.b32 %r27016, %r27015, %r27015, 25; + add.s32 %r27017, %r31256, %r26857; + add.s32 %r27018, %r27017, %r31252; + xor.b32 %r27019, %r27018, %r27001; + shf.l.wrap.b32 %r27020, %r27019, %r27019, 16; + add.s32 %r27021, %r27020, -1150833019; + xor.b32 %r27022, %r27021, %r31252; + shf.l.wrap.b32 %r27023, %r27022, %r27022, 20; + add.s32 %r27024, %r27018, %r26868; + add.s32 %r27025, %r27024, %r27023; + xor.b32 %r27026, %r27025, %r27020; + shf.l.wrap.b32 %r27027, %r27026, %r27026, 24; + add.s32 %r27028, %r27027, %r27021; + xor.b32 %r27029, %r27028, %r27023; + shf.l.wrap.b32 %r27030, %r27029, %r27029, 25; + add.s32 %r27031, %r31255, %r26879; + add.s32 %r27032, %r27031, %r31251; + cvt.u32.u16 %r27033, %rs862; + and.b32 %r27034, %r27033, 255; + xor.b32 %r27035, %r27032, %r27034; + shr.u32 %r27036, %r27032, 16; + shl.b32 %r27037, %r27035, 16; + or.b32 %r27038, %r27037, %r27036; + add.s32 %r27039, %r27038, 1013904242; + xor.b32 %r27040, %r27039, %r31251; + shf.l.wrap.b32 %r27041, %r27040, %r27040, 20; + add.s32 %r27042, %r27032, %r26890; + add.s32 %r27043, %r27042, %r27041; + xor.b32 %r27044, %r27043, %r27038; + shf.l.wrap.b32 %r27045, %r27044, %r27044, 24; + add.s32 %r27046, %r27045, %r27039; + xor.b32 %r27047, %r27046, %r27041; + shf.l.wrap.b32 %r27048, %r27047, %r27047, 25; + add.s32 %r27049, %r31254, %r26901; + add.s32 %r27050, %r27049, %r31250; + cvt.u32.u16 %r27051, %rs732; + and.b32 %r27052, %r27051, 255; + xor.b32 %r27053, %r27050, %r27052; + shr.u32 %r27054, %r27050, 16; + shl.b32 %r27055, %r27053, 16; + or.b32 %r27056, %r27055, %r27054; + add.s32 %r27057, %r27056, -1521486534; + xor.b32 %r27058, %r27057, %r31250; + shf.l.wrap.b32 %r27059, %r27058, 
%r27058, 20; + add.s32 %r27060, %r27050, %r26912; + add.s32 %r27061, %r27060, %r27059; + xor.b32 %r27062, %r27061, %r27056; + shf.l.wrap.b32 %r27063, %r27062, %r27062, 24; + add.s32 %r27064, %r27063, %r27057; + xor.b32 %r27065, %r27064, %r27059; + shf.l.wrap.b32 %r27066, %r27065, %r27065, 25; + add.s32 %r27067, %r27011, %r26923; + add.s32 %r27068, %r27067, %r27030; + xor.b32 %r27069, %r27068, %r27063; + shf.l.wrap.b32 %r27070, %r27069, %r27069, 16; + add.s32 %r27071, %r27070, %r27046; + xor.b32 %r27072, %r27071, %r27030; + shf.l.wrap.b32 %r27073, %r27072, %r27072, 20; + add.s32 %r27074, %r27068, %r26934; + add.s32 %r27075, %r27074, %r27073; + xor.b32 %r27076, %r27075, %r27070; + shf.l.wrap.b32 %r27077, %r27076, %r27076, 24; + add.s32 %r27078, %r27077, %r27071; + xor.b32 %r27079, %r27078, %r27073; + shf.l.wrap.b32 %r27080, %r27079, %r27079, 25; + add.s32 %r27081, %r27025, %r26945; + add.s32 %r27082, %r27081, %r27048; + xor.b32 %r27083, %r27082, %r27013; + shf.l.wrap.b32 %r27084, %r27083, %r27083, 16; + add.s32 %r27085, %r27084, %r27064; + xor.b32 %r27086, %r27085, %r27048; + shf.l.wrap.b32 %r27087, %r27086, %r27086, 20; + add.s32 %r27088, %r27082, %r26956; + add.s32 %r27089, %r27088, %r27087; + xor.b32 %r27090, %r27089, %r27084; + shf.l.wrap.b32 %r27091, %r27090, %r27090, 24; + add.s32 %r27092, %r27091, %r27085; + xor.b32 %r27093, %r27092, %r27087; + shf.l.wrap.b32 %r27094, %r27093, %r27093, 25; + add.s32 %r27095, %r27043, %r26967; + add.s32 %r27096, %r27095, %r27066; + xor.b32 %r27097, %r27096, %r27027; + shf.l.wrap.b32 %r27098, %r27097, %r27097, 16; + add.s32 %r27099, %r27098, %r27014; + xor.b32 %r27100, %r27099, %r27066; + shf.l.wrap.b32 %r27101, %r27100, %r27100, 20; + add.s32 %r27102, %r27096, %r26978; + add.s32 %r27103, %r27102, %r27101; + xor.b32 %r27104, %r27103, %r27098; + shf.l.wrap.b32 %r27105, %r27104, %r27104, 24; + add.s32 %r27106, %r27105, %r27099; + xor.b32 %r27107, %r27106, %r27101; + shf.l.wrap.b32 %r27108, %r27107, %r27107, 25; + add.s32 %r27109, %r27061, %r26989; + add.s32 %r27110, %r27109, %r27016; + xor.b32 %r27111, %r27110, %r27045; + shf.l.wrap.b32 %r27112, %r27111, %r27111, 16; + add.s32 %r27113, %r27112, %r27028; + xor.b32 %r27114, %r27113, %r27016; + shf.l.wrap.b32 %r27115, %r27114, %r27114, 20; + add.s32 %r27116, %r27110, %r27000; + add.s32 %r27117, %r27116, %r27115; + xor.b32 %r27118, %r27117, %r27112; + shf.l.wrap.b32 %r27119, %r27118, %r27118, 24; + add.s32 %r27120, %r27119, %r27113; + xor.b32 %r27121, %r27120, %r27115; + shf.l.wrap.b32 %r27122, %r27121, %r27121, 25; + add.s32 %r27123, %r27075, %r26857; + add.s32 %r27124, %r27123, %r27122; + xor.b32 %r27125, %r27124, %r27091; + shf.l.wrap.b32 %r27126, %r27125, %r27125, 16; + add.s32 %r27127, %r27126, %r27106; + xor.b32 %r27128, %r27127, %r27122; + shf.l.wrap.b32 %r27129, %r27128, %r27128, 20; + add.s32 %r27130, %r27124, %r26901; + add.s32 %r27131, %r27130, %r27129; + xor.b32 %r27132, %r27131, %r27126; + shf.l.wrap.b32 %r27133, %r27132, %r27132, 24; + add.s32 %r27134, %r27133, %r27127; + xor.b32 %r27135, %r27134, %r27129; + shf.l.wrap.b32 %r27136, %r27135, %r27135, 25; + add.s32 %r27137, %r27089, %r26868; + add.s32 %r27138, %r27137, %r27080; + xor.b32 %r27139, %r27138, %r27105; + shf.l.wrap.b32 %r27140, %r27139, %r27139, 16; + add.s32 %r27141, %r27140, %r27120; + xor.b32 %r27142, %r27141, %r27080; + shf.l.wrap.b32 %r27143, %r27142, %r27142, 20; + add.s32 %r27144, %r27138, %r26945; + add.s32 %r27145, %r27144, %r27143; + xor.b32 %r27146, %r27145, %r27140; + shf.l.wrap.b32 %r27147, %r27146, %r27146, 24; + add.s32 
%r27148, %r27147, %r27141; + xor.b32 %r27149, %r27148, %r27143; + shf.l.wrap.b32 %r27150, %r27149, %r27149, 25; + add.s32 %r27151, %r27103, %r26912; + add.s32 %r27152, %r27151, %r27094; + xor.b32 %r27153, %r27152, %r27119; + shf.l.wrap.b32 %r27154, %r27153, %r27153, 16; + add.s32 %r27155, %r27154, %r27078; + xor.b32 %r27156, %r27155, %r27094; + shf.l.wrap.b32 %r27157, %r27156, %r27156, 20; + add.s32 %r27158, %r27152, %r26835; + add.s32 %r27159, %r27158, %r27157; + xor.b32 %r27160, %r27159, %r27154; + shf.l.wrap.b32 %r27161, %r27160, %r27160, 24; + add.s32 %r27162, %r27161, %r27155; + xor.b32 %r27163, %r27162, %r27157; + shf.l.wrap.b32 %r27164, %r27163, %r27163, 25; + add.s32 %r27165, %r27117, %r26879; + add.s32 %r27166, %r27165, %r27108; + xor.b32 %r27167, %r27166, %r27077; + shf.l.wrap.b32 %r27168, %r27167, %r27167, 16; + add.s32 %r27169, %r27168, %r27092; + xor.b32 %r27170, %r27169, %r27108; + shf.l.wrap.b32 %r27171, %r27170, %r27170, 20; + add.s32 %r27172, %r27166, %r26978; + add.s32 %r27173, %r27172, %r27171; + xor.b32 %r27174, %r27173, %r27168; + shf.l.wrap.b32 %r27175, %r27174, %r27174, 24; + add.s32 %r27176, %r27175, %r27169; + xor.b32 %r27177, %r27176, %r27171; + shf.l.wrap.b32 %r27178, %r27177, %r27177, 25; + add.s32 %r27179, %r27131, %r26846; + add.s32 %r27180, %r27179, %r27150; + xor.b32 %r27181, %r27180, %r27175; + shf.l.wrap.b32 %r27182, %r27181, %r27181, 16; + add.s32 %r27183, %r27182, %r27162; + xor.b32 %r27184, %r27183, %r27150; + shf.l.wrap.b32 %r27185, %r27184, %r27184, 20; + add.s32 %r27186, %r27180, %r26956; + add.s32 %r27187, %r27186, %r27185; + xor.b32 %r27188, %r27187, %r27182; + shf.l.wrap.b32 %r27189, %r27188, %r27188, 24; + add.s32 %r27190, %r27189, %r27183; + xor.b32 %r27191, %r27190, %r27185; + shf.l.wrap.b32 %r27192, %r27191, %r27191, 25; + add.s32 %r27193, %r27145, %r26967; + add.s32 %r27194, %r27193, %r27164; + xor.b32 %r27195, %r27194, %r27133; + shf.l.wrap.b32 %r27196, %r27195, %r27195, 16; + add.s32 %r27197, %r27196, %r27176; + xor.b32 %r27198, %r27197, %r27164; + shf.l.wrap.b32 %r27199, %r27198, %r27198, 20; + add.s32 %r27200, %r27194, %r26890; + add.s32 %r27201, %r27200, %r27199; + xor.b32 %r27202, %r27201, %r27196; + shf.l.wrap.b32 %r27203, %r27202, %r27202, 24; + add.s32 %r27204, %r27203, %r27197; + xor.b32 %r27205, %r27204, %r27199; + shf.l.wrap.b32 %r27206, %r27205, %r27205, 25; + add.s32 %r27207, %r27159, %r26934; + add.s32 %r27208, %r27207, %r27178; + xor.b32 %r27209, %r27208, %r27147; + shf.l.wrap.b32 %r27210, %r27209, %r27209, 16; + add.s32 %r27211, %r27210, %r27134; + xor.b32 %r27212, %r27211, %r27178; + shf.l.wrap.b32 %r27213, %r27212, %r27212, 20; + add.s32 %r27214, %r27208, %r26989; + add.s32 %r27215, %r27214, %r27213; + xor.b32 %r27216, %r27215, %r27210; + shf.l.wrap.b32 %r27217, %r27216, %r27216, 24; + add.s32 %r27218, %r27217, %r27211; + xor.b32 %r27219, %r27218, %r27213; + shf.l.wrap.b32 %r27220, %r27219, %r27219, 25; + add.s32 %r27221, %r27173, %r27000; + add.s32 %r27222, %r27221, %r27136; + xor.b32 %r27223, %r27222, %r27161; + shf.l.wrap.b32 %r27224, %r27223, %r27223, 16; + add.s32 %r27225, %r27224, %r27148; + xor.b32 %r27226, %r27225, %r27136; + shf.l.wrap.b32 %r27227, %r27226, %r27226, 20; + add.s32 %r27228, %r27222, %r26923; + add.s32 %r27229, %r27228, %r27227; + xor.b32 %r27230, %r27229, %r27224; + shf.l.wrap.b32 %r27231, %r27230, %r27230, 24; + add.s32 %r27232, %r27231, %r27225; + xor.b32 %r27233, %r27232, %r27227; + shf.l.wrap.b32 %r27234, %r27233, %r27233, 25; + add.s32 %r27235, %r27187, %r26868; + add.s32 %r27236, %r27235, 
%r27234; + xor.b32 %r27237, %r27236, %r27203; + shf.l.wrap.b32 %r27238, %r27237, %r27237, 16; + add.s32 %r27239, %r27238, %r27218; + xor.b32 %r27240, %r27239, %r27234; + shf.l.wrap.b32 %r27241, %r27240, %r27240, 20; + add.s32 %r27242, %r27236, %r26879; + add.s32 %r27243, %r27242, %r27241; + xor.b32 %r27244, %r27243, %r27238; + shf.l.wrap.b32 %r27245, %r27244, %r27244, 24; + add.s32 %r27246, %r27245, %r27239; + xor.b32 %r27247, %r27246, %r27241; + shf.l.wrap.b32 %r27248, %r27247, %r27247, 25; + add.s32 %r27249, %r27201, %r26945; + add.s32 %r27250, %r27249, %r27192; + xor.b32 %r27251, %r27250, %r27217; + shf.l.wrap.b32 %r27252, %r27251, %r27251, 16; + add.s32 %r27253, %r27252, %r27232; + xor.b32 %r27254, %r27253, %r27192; + shf.l.wrap.b32 %r27255, %r27254, %r27254, 20; + add.s32 %r27256, %r27250, %r26967; + add.s32 %r27257, %r27256, %r27255; + xor.b32 %r27258, %r27257, %r27252; + shf.l.wrap.b32 %r27259, %r27258, %r27258, 24; + add.s32 %r27260, %r27259, %r27253; + xor.b32 %r27261, %r27260, %r27255; + shf.l.wrap.b32 %r27262, %r27261, %r27261, 25; + add.s32 %r27263, %r27215, %r26978; + add.s32 %r27264, %r27263, %r27206; + xor.b32 %r27265, %r27264, %r27231; + shf.l.wrap.b32 %r27266, %r27265, %r27265, 16; + add.s32 %r27267, %r27266, %r27190; + xor.b32 %r27268, %r27267, %r27206; + shf.l.wrap.b32 %r27269, %r27268, %r27268, 20; + add.s32 %r27270, %r27264, %r26857; + add.s32 %r27271, %r27270, %r27269; + xor.b32 %r27272, %r27271, %r27266; + shf.l.wrap.b32 %r27273, %r27272, %r27272, 24; + add.s32 %r27274, %r27273, %r27267; + xor.b32 %r27275, %r27274, %r27269; + shf.l.wrap.b32 %r27276, %r27275, %r27275, 25; + add.s32 %r27277, %r27229, %r26912; + add.s32 %r27278, %r27277, %r27220; + xor.b32 %r27279, %r27278, %r27189; + shf.l.wrap.b32 %r27280, %r27279, %r27279, 16; + add.s32 %r27281, %r27280, %r27204; + xor.b32 %r27282, %r27281, %r27220; + shf.l.wrap.b32 %r27283, %r27282, %r27282, 20; + add.s32 %r27284, %r27278, %r26989; + add.s32 %r27285, %r27284, %r27283; + xor.b32 %r27286, %r27285, %r27280; + shf.l.wrap.b32 %r27287, %r27286, %r27286, 24; + add.s32 %r27288, %r27287, %r27281; + xor.b32 %r27289, %r27288, %r27283; + shf.l.wrap.b32 %r27290, %r27289, %r27289, 25; + add.s32 %r27291, %r27243, %r26901; + add.s32 %r27292, %r27291, %r27262; + xor.b32 %r27293, %r27292, %r27287; + shf.l.wrap.b32 %r27294, %r27293, %r27293, 16; + add.s32 %r27295, %r27294, %r27274; + xor.b32 %r27296, %r27295, %r27262; + shf.l.wrap.b32 %r27297, %r27296, %r27296, 20; + add.s32 %r27298, %r27292, %r26890; + add.s32 %r27299, %r27298, %r27297; + xor.b32 %r27300, %r27299, %r27294; + shf.l.wrap.b32 %r27301, %r27300, %r27300, 24; + add.s32 %r27302, %r27301, %r27295; + xor.b32 %r27303, %r27302, %r27297; + shf.l.wrap.b32 %r27304, %r27303, %r27303, 25; + add.s32 %r27305, %r27257, %r26934; + add.s32 %r27306, %r27305, %r27276; + xor.b32 %r27307, %r27306, %r27245; + shf.l.wrap.b32 %r27308, %r27307, %r27307, 16; + add.s32 %r27309, %r27308, %r27288; + xor.b32 %r27310, %r27309, %r27276; + shf.l.wrap.b32 %r27311, %r27310, %r27310, 20; + add.s32 %r27312, %r27306, %r26835; + add.s32 %r27313, %r27312, %r27311; + xor.b32 %r27314, %r27313, %r27308; + shf.l.wrap.b32 %r27315, %r27314, %r27314, 24; + add.s32 %r27316, %r27315, %r27309; + xor.b32 %r27317, %r27316, %r27311; + shf.l.wrap.b32 %r27318, %r27317, %r27317, 25; + add.s32 %r27319, %r27271, %r26956; + add.s32 %r27320, %r27319, %r27290; + xor.b32 %r27321, %r27320, %r27259; + shf.l.wrap.b32 %r27322, %r27321, %r27321, 16; + add.s32 %r27323, %r27322, %r27246; + xor.b32 %r27324, %r27323, %r27290; + 
shf.l.wrap.b32 %r27325, %r27324, %r27324, 20; + add.s32 %r27326, %r27320, %r27000; + add.s32 %r27327, %r27326, %r27325; + xor.b32 %r27328, %r27327, %r27322; + shf.l.wrap.b32 %r27329, %r27328, %r27328, 24; + add.s32 %r27330, %r27329, %r27323; + xor.b32 %r27331, %r27330, %r27325; + shf.l.wrap.b32 %r27332, %r27331, %r27331, 25; + add.s32 %r27333, %r27285, %r26923; + add.s32 %r27334, %r27333, %r27248; + xor.b32 %r27335, %r27334, %r27273; + shf.l.wrap.b32 %r27336, %r27335, %r27335, 16; + add.s32 %r27337, %r27336, %r27260; + xor.b32 %r27338, %r27337, %r27248; + shf.l.wrap.b32 %r27339, %r27338, %r27338, 20; + add.s32 %r27340, %r27334, %r26846; + add.s32 %r27341, %r27340, %r27339; + xor.b32 %r27342, %r27341, %r27336; + shf.l.wrap.b32 %r27343, %r27342, %r27342, 24; + add.s32 %r27344, %r27343, %r27337; + xor.b32 %r27345, %r27344, %r27339; + shf.l.wrap.b32 %r27346, %r27345, %r27345, 25; + add.s32 %r27347, %r27299, %r26945; + add.s32 %r27348, %r27347, %r27346; + xor.b32 %r27349, %r27348, %r27315; + shf.l.wrap.b32 %r27350, %r27349, %r27349, 16; + add.s32 %r27351, %r27350, %r27330; + xor.b32 %r27352, %r27351, %r27346; + shf.l.wrap.b32 %r27353, %r27352, %r27352, 20; + add.s32 %r27354, %r27348, %r26912; + add.s32 %r27355, %r27354, %r27353; + xor.b32 %r27356, %r27355, %r27350; + shf.l.wrap.b32 %r27357, %r27356, %r27356, 24; + add.s32 %r27358, %r27357, %r27351; + xor.b32 %r27359, %r27358, %r27353; + shf.l.wrap.b32 %r27360, %r27359, %r27359, 25; + add.s32 %r27361, %r27313, %r26967; + add.s32 %r27362, %r27361, %r27304; + xor.b32 %r27363, %r27362, %r27329; + shf.l.wrap.b32 %r27364, %r27363, %r27363, 16; + add.s32 %r27365, %r27364, %r27344; + xor.b32 %r27366, %r27365, %r27304; + shf.l.wrap.b32 %r27367, %r27366, %r27366, 20; + add.s32 %r27368, %r27362, %r26934; + add.s32 %r27369, %r27368, %r27367; + xor.b32 %r27370, %r27369, %r27364; + shf.l.wrap.b32 %r27371, %r27370, %r27370, 24; + add.s32 %r27372, %r27371, %r27365; + xor.b32 %r27373, %r27372, %r27367; + shf.l.wrap.b32 %r27374, %r27373, %r27373, 25; + add.s32 %r27375, %r27327, %r26989; + add.s32 %r27376, %r27375, %r27318; + xor.b32 %r27377, %r27376, %r27343; + shf.l.wrap.b32 %r27378, %r27377, %r27377, 16; + add.s32 %r27379, %r27378, %r27302; + xor.b32 %r27380, %r27379, %r27318; + shf.l.wrap.b32 %r27381, %r27380, %r27380, 20; + add.s32 %r27382, %r27376, %r26868; + add.s32 %r27383, %r27382, %r27381; + xor.b32 %r27384, %r27383, %r27378; + shf.l.wrap.b32 %r27385, %r27384, %r27384, 24; + add.s32 %r27386, %r27385, %r27379; + xor.b32 %r27387, %r27386, %r27381; + shf.l.wrap.b32 %r27388, %r27387, %r27387, 25; + add.s32 %r27389, %r27341, %r26978; + add.s32 %r27390, %r27389, %r27332; + xor.b32 %r27391, %r27390, %r27301; + shf.l.wrap.b32 %r27392, %r27391, %r27391, 16; + add.s32 %r27393, %r27392, %r27316; + xor.b32 %r27394, %r27393, %r27332; + shf.l.wrap.b32 %r27395, %r27394, %r27394, 20; + add.s32 %r27396, %r27390, %r27000; + add.s32 %r27397, %r27396, %r27395; + xor.b32 %r27398, %r27397, %r27392; + shf.l.wrap.b32 %r27399, %r27398, %r27398, 24; + add.s32 %r27400, %r27399, %r27393; + xor.b32 %r27401, %r27400, %r27395; + shf.l.wrap.b32 %r27402, %r27401, %r27401, 25; + add.s32 %r27403, %r27355, %r26879; + add.s32 %r27404, %r27403, %r27374; + xor.b32 %r27405, %r27404, %r27399; + shf.l.wrap.b32 %r27406, %r27405, %r27405, 16; + add.s32 %r27407, %r27406, %r27386; + xor.b32 %r27408, %r27407, %r27374; + shf.l.wrap.b32 %r27409, %r27408, %r27408, 20; + add.s32 %r27410, %r27404, %r26835; + add.s32 %r27411, %r27410, %r27409; + xor.b32 %r27412, %r27411, %r27406; + shf.l.wrap.b32 %r27413, 
%r27412, %r27412, 24; + add.s32 %r27414, %r27413, %r27407; + xor.b32 %r27415, %r27414, %r27409; + shf.l.wrap.b32 %r27416, %r27415, %r27415, 25; + add.s32 %r27417, %r27369, %r26956; + add.s32 %r27418, %r27417, %r27388; + xor.b32 %r27419, %r27418, %r27357; + shf.l.wrap.b32 %r27420, %r27419, %r27419, 16; + add.s32 %r27421, %r27420, %r27400; + xor.b32 %r27422, %r27421, %r27388; + shf.l.wrap.b32 %r27423, %r27422, %r27422, 20; + add.s32 %r27424, %r27418, %r26857; + add.s32 %r27425, %r27424, %r27423; + xor.b32 %r27426, %r27425, %r27420; + shf.l.wrap.b32 %r27427, %r27426, %r27426, 24; + add.s32 %r27428, %r27427, %r27421; + xor.b32 %r27429, %r27428, %r27423; + shf.l.wrap.b32 %r27430, %r27429, %r27429, 25; + add.s32 %r27431, %r27383, %r26890; + add.s32 %r27432, %r27431, %r27402; + xor.b32 %r27433, %r27432, %r27371; + shf.l.wrap.b32 %r27434, %r27433, %r27433, 16; + add.s32 %r27435, %r27434, %r27358; + xor.b32 %r27436, %r27435, %r27402; + shf.l.wrap.b32 %r27437, %r27436, %r27436, 20; + add.s32 %r27438, %r27432, %r26923; + add.s32 %r27439, %r27438, %r27437; + xor.b32 %r27440, %r27439, %r27434; + shf.l.wrap.b32 %r27441, %r27440, %r27440, 24; + add.s32 %r27442, %r27441, %r27435; + xor.b32 %r27443, %r27442, %r27437; + shf.l.wrap.b32 %r27444, %r27443, %r27443, 25; + add.s32 %r27445, %r27397, %r26846; + add.s32 %r27446, %r27445, %r27360; + xor.b32 %r27447, %r27446, %r27385; + shf.l.wrap.b32 %r27448, %r27447, %r27447, 16; + add.s32 %r27449, %r27448, %r27372; + xor.b32 %r27450, %r27449, %r27360; + shf.l.wrap.b32 %r27451, %r27450, %r27450, 20; + add.s32 %r27452, %r27446, %r26901; + add.s32 %r27453, %r27452, %r27451; + xor.b32 %r27454, %r27453, %r27448; + shf.l.wrap.b32 %r27455, %r27454, %r27454, 24; + add.s32 %r27456, %r27455, %r27449; + xor.b32 %r27457, %r27456, %r27451; + shf.l.wrap.b32 %r27458, %r27457, %r27457, 25; + add.s32 %r27459, %r27411, %r26967; + add.s32 %r27460, %r27459, %r27458; + xor.b32 %r27461, %r27460, %r27427; + shf.l.wrap.b32 %r27462, %r27461, %r27461, 16; + add.s32 %r27463, %r27462, %r27442; + xor.b32 %r27464, %r27463, %r27458; + shf.l.wrap.b32 %r27465, %r27464, %r27464, 20; + add.s32 %r27466, %r27460, %r26978; + add.s32 %r27467, %r27466, %r27465; + xor.b32 %r27468, %r27467, %r27462; + shf.l.wrap.b32 %r27469, %r27468, %r27468, 24; + add.s32 %r27470, %r27469, %r27463; + xor.b32 %r27471, %r27470, %r27465; + shf.l.wrap.b32 %r27472, %r27471, %r27471, 25; + add.s32 %r27473, %r27425, %r26934; + add.s32 %r27474, %r27473, %r27416; + xor.b32 %r27475, %r27474, %r27441; + shf.l.wrap.b32 %r27476, %r27475, %r27475, 16; + add.s32 %r27477, %r27476, %r27456; + xor.b32 %r27478, %r27477, %r27416; + shf.l.wrap.b32 %r27479, %r27478, %r27478, 20; + add.s32 %r27480, %r27474, %r26956; + add.s32 %r27481, %r27480, %r27479; + xor.b32 %r27482, %r27481, %r27476; + shf.l.wrap.b32 %r27483, %r27482, %r27482, 24; + add.s32 %r27484, %r27483, %r27477; + xor.b32 %r27485, %r27484, %r27479; + shf.l.wrap.b32 %r27486, %r27485, %r27485, 25; + add.s32 %r27487, %r27439, %r27000; + add.s32 %r27488, %r27487, %r27430; + xor.b32 %r27489, %r27488, %r27455; + shf.l.wrap.b32 %r27490, %r27489, %r27489, 16; + add.s32 %r27491, %r27490, %r27414; + xor.b32 %r27492, %r27491, %r27430; + shf.l.wrap.b32 %r27493, %r27492, %r27492, 20; + add.s32 %r27494, %r27488, %r26945; + add.s32 %r27495, %r27494, %r27493; + xor.b32 %r27496, %r27495, %r27490; + shf.l.wrap.b32 %r27497, %r27496, %r27496, 24; + add.s32 %r27498, %r27497, %r27491; + xor.b32 %r27499, %r27498, %r27493; + shf.l.wrap.b32 %r27500, %r27499, %r27499, 25; + add.s32 %r27501, %r27453, %r26989; + 
add.s32 %r27502, %r27501, %r27444; + xor.b32 %r27503, %r27502, %r27413; + shf.l.wrap.b32 %r27504, %r27503, %r27503, 16; + add.s32 %r27505, %r27504, %r27428; + xor.b32 %r27506, %r27505, %r27444; + shf.l.wrap.b32 %r27507, %r27506, %r27506, 20; + add.s32 %r27508, %r27502, %r26923; + add.s32 %r27509, %r27508, %r27507; + xor.b32 %r27510, %r27509, %r27504; + shf.l.wrap.b32 %r27511, %r27510, %r27510, 24; + add.s32 %r27512, %r27511, %r27505; + xor.b32 %r27513, %r27512, %r27507; + shf.l.wrap.b32 %r27514, %r27513, %r27513, 25; + add.s32 %r27515, %r27467, %r26912; + add.s32 %r27516, %r27515, %r27486; + xor.b32 %r27517, %r27516, %r27511; + shf.l.wrap.b32 %r27518, %r27517, %r27517, 16; + add.s32 %r27519, %r27518, %r27498; + xor.b32 %r27520, %r27519, %r27486; + shf.l.wrap.b32 %r27521, %r27520, %r27520, 20; + add.s32 %r27522, %r27516, %r26857; + add.s32 %r27523, %r27522, %r27521; + xor.b32 %r27524, %r27523, %r27518; + shf.l.wrap.b32 %r27525, %r27524, %r27524, 24; + add.s32 %r27526, %r27525, %r27519; + xor.b32 %r27527, %r27526, %r27521; + shf.l.wrap.b32 %r27528, %r27527, %r27527, 25; + add.s32 %r27529, %r27481, %r26890; + add.s32 %r27530, %r27529, %r27500; + xor.b32 %r27531, %r27530, %r27469; + shf.l.wrap.b32 %r27532, %r27531, %r27531, 16; + add.s32 %r27533, %r27532, %r27512; + xor.b32 %r27534, %r27533, %r27500; + shf.l.wrap.b32 %r27535, %r27534, %r27534, 20; + add.s32 %r27536, %r27530, %r26868; + add.s32 %r27537, %r27536, %r27535; + xor.b32 %r27538, %r27537, %r27532; + shf.l.wrap.b32 %r27539, %r27538, %r27538, 24; + add.s32 %r27540, %r27539, %r27533; + xor.b32 %r27541, %r27540, %r27535; + shf.l.wrap.b32 %r27542, %r27541, %r27541, 25; + add.s32 %r27543, %r27495, %r26835; + add.s32 %r27544, %r27543, %r27514; + xor.b32 %r27545, %r27544, %r27483; + shf.l.wrap.b32 %r27546, %r27545, %r27545, 16; + add.s32 %r27547, %r27546, %r27470; + xor.b32 %r27548, %r27547, %r27514; + shf.l.wrap.b32 %r27549, %r27548, %r27548, 20; + add.s32 %r27550, %r27544, %r26846; + add.s32 %r27551, %r27550, %r27549; + xor.b32 %r27552, %r27551, %r27546; + shf.l.wrap.b32 %r27553, %r27552, %r27552, 24; + add.s32 %r27554, %r27553, %r27547; + xor.b32 %r27555, %r27554, %r27549; + shf.l.wrap.b32 %r27556, %r27555, %r27555, 25; + add.s32 %r27557, %r27509, %r26901; + add.s32 %r27558, %r27557, %r27472; + xor.b32 %r27559, %r27558, %r27497; + shf.l.wrap.b32 %r27560, %r27559, %r27559, 16; + add.s32 %r27561, %r27560, %r27484; + xor.b32 %r27562, %r27561, %r27472; + shf.l.wrap.b32 %r27563, %r27562, %r27562, 20; + add.s32 %r27564, %r27558, %r26879; + add.s32 %r27565, %r27564, %r27563; + xor.b32 %r27566, %r27565, %r27560; + shf.l.wrap.b32 %r27567, %r27566, %r27566, 24; + add.s32 %r27568, %r27567, %r27561; + xor.b32 %r27569, %r27568, %r27563; + shf.l.wrap.b32 %r27570, %r27569, %r27569, 25; + add.s32 %r27571, %r27523, %r26934; + add.s32 %r27572, %r27571, %r27570; + xor.b32 %r27573, %r27572, %r27539; + shf.l.wrap.b32 %r27574, %r27573, %r27573, 16; + add.s32 %r27575, %r27574, %r27554; + xor.b32 %r27576, %r27575, %r27570; + shf.l.wrap.b32 %r27577, %r27576, %r27576, 20; + add.s32 %r27578, %r27572, %r26989; + add.s32 %r27579, %r27578, %r27577; + xor.b32 %r27580, %r27579, %r27574; + shf.l.wrap.b32 %r27581, %r27580, %r27580, 24; + add.s32 %r27582, %r27581, %r27575; + xor.b32 %r27583, %r27582, %r27577; + shf.l.wrap.b32 %r27584, %r27583, %r27583, 25; + add.s32 %r27585, %r27537, %r26956; + add.s32 %r27586, %r27585, %r27528; + xor.b32 %r27587, %r27586, %r27553; + shf.l.wrap.b32 %r27588, %r27587, %r27587, 16; + add.s32 %r27589, %r27588, %r27568; + xor.b32 %r27590, 
%r27589, %r27528; + shf.l.wrap.b32 %r27591, %r27590, %r27590, 20; + add.s32 %r27592, %r27586, %r26890; + add.s32 %r27593, %r27592, %r27591; + xor.b32 %r27594, %r27593, %r27588; + shf.l.wrap.b32 %r27595, %r27594, %r27594, 24; + add.s32 %r27596, %r27595, %r27589; + xor.b32 %r27597, %r27596, %r27591; + shf.l.wrap.b32 %r27598, %r27597, %r27597, 25; + add.s32 %r27599, %r27551, %r26923; + add.s32 %r27600, %r27599, %r27542; + xor.b32 %r27601, %r27600, %r27567; + shf.l.wrap.b32 %r27602, %r27601, %r27601, 16; + add.s32 %r27603, %r27602, %r27526; + xor.b32 %r27604, %r27603, %r27542; + shf.l.wrap.b32 %r27605, %r27604, %r27604, 20; + add.s32 %r27606, %r27600, %r26967; + add.s32 %r27607, %r27606, %r27605; + xor.b32 %r27608, %r27607, %r27602; + shf.l.wrap.b32 %r27609, %r27608, %r27608, 24; + add.s32 %r27610, %r27609, %r27603; + xor.b32 %r27611, %r27610, %r27605; + shf.l.wrap.b32 %r27612, %r27611, %r27611, 25; + add.s32 %r27613, %r27565, %r27000; + add.s32 %r27614, %r27613, %r27556; + xor.b32 %r27615, %r27614, %r27525; + shf.l.wrap.b32 %r27616, %r27615, %r27615, 16; + add.s32 %r27617, %r27616, %r27540; + xor.b32 %r27618, %r27617, %r27556; + shf.l.wrap.b32 %r27619, %r27618, %r27618, 20; + add.s32 %r27620, %r27614, %r26846; + add.s32 %r27621, %r27620, %r27619; + xor.b32 %r27622, %r27621, %r27616; + shf.l.wrap.b32 %r27623, %r27622, %r27622, 24; + add.s32 %r27624, %r27623, %r27617; + xor.b32 %r27625, %r27624, %r27619; + shf.l.wrap.b32 %r27626, %r27625, %r27625, 25; + add.s32 %r27627, %r27579, %r26978; + add.s32 %r27628, %r27627, %r27598; + xor.b32 %r27629, %r27628, %r27623; + shf.l.wrap.b32 %r27630, %r27629, %r27629, 16; + add.s32 %r27631, %r27630, %r27610; + xor.b32 %r27632, %r27631, %r27598; + shf.l.wrap.b32 %r27633, %r27632, %r27632, 20; + add.s32 %r27634, %r27628, %r26868; + add.s32 %r27635, %r27634, %r27633; + xor.b32 %r27636, %r27635, %r27630; + shf.l.wrap.b32 %r27637, %r27636, %r27636, 24; + add.s32 %r27638, %r27637, %r27631; + xor.b32 %r27639, %r27638, %r27633; + shf.l.wrap.b32 %r27640, %r27639, %r27639, 25; + add.s32 %r27641, %r27593, %r26835; + add.s32 %r27642, %r27641, %r27612; + xor.b32 %r27643, %r27642, %r27581; + shf.l.wrap.b32 %r27644, %r27643, %r27643, 16; + add.s32 %r27645, %r27644, %r27624; + xor.b32 %r27646, %r27645, %r27612; + shf.l.wrap.b32 %r27647, %r27646, %r27646, 20; + add.s32 %r27648, %r27642, %r26945; + add.s32 %r27649, %r27648, %r27647; + xor.b32 %r27650, %r27649, %r27644; + shf.l.wrap.b32 %r27651, %r27650, %r27650, 24; + add.s32 %r27652, %r27651, %r27645; + xor.b32 %r27653, %r27652, %r27647; + shf.l.wrap.b32 %r27654, %r27653, %r27653, 25; + add.s32 %r27655, %r27607, %r26857; + add.s32 %r27656, %r27655, %r27626; + xor.b32 %r27657, %r27656, %r27595; + shf.l.wrap.b32 %r27658, %r27657, %r27657, 16; + add.s32 %r27659, %r27658, %r27582; + xor.b32 %r27660, %r27659, %r27626; + shf.l.wrap.b32 %r27661, %r27660, %r27660, 20; + add.s32 %r27662, %r27656, %r26901; + add.s32 %r27663, %r27662, %r27661; + xor.b32 %r27664, %r27663, %r27658; + shf.l.wrap.b32 %r27665, %r27664, %r27664, 24; + add.s32 %r27666, %r27665, %r27659; + xor.b32 %r27667, %r27666, %r27661; + shf.l.wrap.b32 %r27668, %r27667, %r27667, 25; + add.s32 %r27669, %r27621, %r26879; + add.s32 %r27670, %r27669, %r27584; + xor.b32 %r27671, %r27670, %r27609; + shf.l.wrap.b32 %r27672, %r27671, %r27671, 16; + add.s32 %r27673, %r27672, %r27596; + xor.b32 %r27674, %r27673, %r27584; + shf.l.wrap.b32 %r27675, %r27674, %r27674, 20; + add.s32 %r27676, %r27670, %r26912; + add.s32 %r27677, %r27676, %r27675; + xor.b32 %r27678, %r27677, %r27672; + 
shf.l.wrap.b32 %r27679, %r27678, %r27678, 24; + add.s32 %r27680, %r27679, %r27673; + xor.b32 %r27681, %r27680, %r27675; + shf.l.wrap.b32 %r27682, %r27681, %r27681, 25; + add.s32 %r27683, %r27635, %r26956; + add.s32 %r27684, %r27683, %r27682; + xor.b32 %r27685, %r27684, %r27651; + shf.l.wrap.b32 %r27686, %r27685, %r27685, 16; + add.s32 %r27687, %r27686, %r27666; + xor.b32 %r27688, %r27687, %r27682; + shf.l.wrap.b32 %r27689, %r27688, %r27688, 20; + add.s32 %r27690, %r27684, %r27000; + add.s32 %r27691, %r27690, %r27689; + xor.b32 %r27692, %r27691, %r27686; + shf.l.wrap.b32 %r27693, %r27692, %r27692, 24; + add.s32 %r27694, %r27693, %r27687; + xor.b32 %r27695, %r27694, %r27689; + shf.l.wrap.b32 %r27696, %r27695, %r27695, 25; + add.s32 %r27697, %r27649, %r26890; + add.s32 %r27698, %r27697, %r27640; + xor.b32 %r27699, %r27698, %r27665; + shf.l.wrap.b32 %r27700, %r27699, %r27699, 16; + add.s32 %r27701, %r27700, %r27680; + xor.b32 %r27702, %r27701, %r27640; + shf.l.wrap.b32 %r27703, %r27702, %r27702, 20; + add.s32 %r27704, %r27698, %r26835; + add.s32 %r27705, %r27704, %r27703; + xor.b32 %r27706, %r27705, %r27700; + shf.l.wrap.b32 %r27707, %r27706, %r27706, 24; + add.s32 %r27708, %r27707, %r27701; + xor.b32 %r27709, %r27708, %r27703; + shf.l.wrap.b32 %r27710, %r27709, %r27709, 25; + add.s32 %r27711, %r27663, %r26846; + add.s32 %r27712, %r27711, %r27654; + xor.b32 %r27713, %r27712, %r27679; + shf.l.wrap.b32 %r27714, %r27713, %r27713, 16; + add.s32 %r27715, %r27714, %r27638; + xor.b32 %r27716, %r27715, %r27654; + shf.l.wrap.b32 %r27717, %r27716, %r27716, 20; + add.s32 %r27718, %r27712, %r26934; + add.s32 %r27719, %r27718, %r27717; + xor.b32 %r27720, %r27719, %r27714; + shf.l.wrap.b32 %r27721, %r27720, %r27720, 24; + add.s32 %r27722, %r27721, %r27715; + xor.b32 %r27723, %r27722, %r27717; + shf.l.wrap.b32 %r27724, %r27723, %r27723, 25; + add.s32 %r27725, %r27677, %r26923; + add.s32 %r27726, %r27725, %r27668; + xor.b32 %r27727, %r27726, %r27637; + shf.l.wrap.b32 %r27728, %r27727, %r27727, 16; + add.s32 %r27729, %r27728, %r27652; + xor.b32 %r27730, %r27729, %r27668; + shf.l.wrap.b32 %r27731, %r27730, %r27730, 20; + add.s32 %r27732, %r27726, %r26901; + add.s32 %r27733, %r27732, %r27731; + xor.b32 %r27734, %r27733, %r27728; + shf.l.wrap.b32 %r27735, %r27734, %r27734, 24; + add.s32 %r27736, %r27735, %r27729; + xor.b32 %r27737, %r27736, %r27731; + shf.l.wrap.b32 %r27738, %r27737, %r27737, 25; + add.s32 %r27739, %r27691, %r26989; + add.s32 %r27740, %r27739, %r27710; + xor.b32 %r27741, %r27740, %r27735; + shf.l.wrap.b32 %r27742, %r27741, %r27741, 16; + add.s32 %r27743, %r27742, %r27722; + xor.b32 %r27744, %r27743, %r27710; + shf.l.wrap.b32 %r27745, %r27744, %r27744, 20; + add.s32 %r27746, %r27740, %r26945; + add.s32 %r27747, %r27746, %r27745; + xor.b32 %r27748, %r27747, %r27742; + shr.u32 %r27749, %r27748, 8; + shf.l.wrap.b32 %r27750, %r27748, %r27748, 24; + add.s32 %r27751, %r27750, %r27743; + xor.b32 %r27752, %r27751, %r27745; + shr.u32 %r27753, %r27752, 7; + shf.l.wrap.b32 %r27754, %r27752, %r27752, 25; + add.s32 %r27755, %r27705, %r26857; + add.s32 %r27756, %r27755, %r27724; + xor.b32 %r27757, %r27756, %r27693; + shf.l.wrap.b32 %r27758, %r27757, %r27757, 16; + add.s32 %r27759, %r27758, %r27736; + xor.b32 %r27760, %r27759, %r27724; + shf.l.wrap.b32 %r27761, %r27760, %r27760, 20; + add.s32 %r27762, %r27756, %r26967; + add.s32 %r27763, %r27762, %r27761; + xor.b32 %r27764, %r27763, %r27758; + shr.u32 %r27765, %r27764, 8; + shf.l.wrap.b32 %r27766, %r27764, %r27764, 24; + add.s32 %r27767, %r27766, %r27759; + 
xor.b32 %r27768, %r27767, %r27761; + shr.u32 %r27769, %r27768, 7; + shf.l.wrap.b32 %r27770, %r27768, %r27768, 25; + add.s32 %r27771, %r27719, %r26868; + add.s32 %r27772, %r27771, %r27738; + xor.b32 %r27773, %r27772, %r27707; + shf.l.wrap.b32 %r27774, %r27773, %r27773, 16; + add.s32 %r27775, %r27774, %r27694; + xor.b32 %r27776, %r27775, %r27738; + shf.l.wrap.b32 %r27777, %r27776, %r27776, 20; + add.s32 %r27778, %r27772, %r26879; + add.s32 %r27779, %r27778, %r27777; + xor.b32 %r27780, %r27779, %r27774; + shr.u32 %r27781, %r27780, 8; + shf.l.wrap.b32 %r27782, %r27780, %r27780, 24; + add.s32 %r27783, %r27782, %r27775; + xor.b32 %r27784, %r27783, %r27777; + shr.u32 %r27785, %r27784, 7; + shf.l.wrap.b32 %r27786, %r27784, %r27784, 25; + add.s32 %r27787, %r27733, %r26912; + add.s32 %r27788, %r27787, %r27696; + xor.b32 %r27789, %r27788, %r27721; + shf.l.wrap.b32 %r27790, %r27789, %r27789, 16; + add.s32 %r27791, %r27790, %r27708; + xor.b32 %r27792, %r27791, %r27696; + shf.l.wrap.b32 %r27793, %r27792, %r27792, 20; + add.s32 %r27794, %r27788, %r26978; + add.s32 %r27795, %r27794, %r27793; + xor.b32 %r27796, %r27795, %r27790; + shr.u32 %r27797, %r27796, 8; + shf.l.wrap.b32 %r27798, %r27796, %r27796, 24; + add.s32 %r27799, %r27798, %r27791; + xor.b32 %r27800, %r27799, %r27793; + shr.u32 %r27801, %r27800, 7; + shf.l.wrap.b32 %r27802, %r27800, %r27800, 25; + xor.b32 %r27803, %r27783, %r27747; + xor.b32 %r27804, %r27799, %r27763; + xor.b32 %r27805, %r27751, %r27779; + xor.b32 %r27806, %r27767, %r27795; + xor.b32 %r27807, %r27802, %r27766; + xor.b32 %r27808, %r27754, %r27782; + xor.b32 %r27809, %r27770, %r27798; + xor.b32 %r27810, %r27786, %r27750; + cvt.u16.u32 %rs551, %r27783; + cvt.u16.u32 %rs552, %r27747; + xor.b16 %rs830, %rs551, %rs552; + shr.u32 %r27811, %r27803, 8; + cvt.u16.u32 %rs831, %r27811; + shr.u32 %r27812, %r27803, 16; + cvt.u16.u32 %rs832, %r27812; + shr.u32 %r27813, %r27803, 24; + cvt.u16.u32 %rs833, %r27813; + cvt.u16.u32 %rs553, %r27799; + cvt.u16.u32 %rs554, %r27763; + xor.b16 %rs834, %rs553, %rs554; + shr.u32 %r27814, %r27804, 8; + cvt.u16.u32 %rs835, %r27814; + shr.u32 %r27815, %r27804, 16; + cvt.u16.u32 %rs836, %r27815; + shr.u32 %r27816, %r27804, 24; + cvt.u16.u32 %rs837, %r27816; + cvt.u16.u32 %rs555, %r27779; + cvt.u16.u32 %rs556, %r27751; + xor.b16 %rs838, %rs556, %rs555; + shr.u32 %r27817, %r27805, 8; + cvt.u16.u32 %rs839, %r27817; + shr.u32 %r27818, %r27805, 16; + cvt.u16.u32 %rs840, %r27818; + shr.u32 %r27819, %r27805, 24; + cvt.u16.u32 %rs841, %r27819; + cvt.u16.u32 %rs557, %r27767; + cvt.u16.u32 %rs558, %r27795; + xor.b16 %rs842, %rs557, %rs558; + shr.u32 %r27820, %r27806, 8; + cvt.u16.u32 %rs843, %r27820; + shr.u32 %r27821, %r27806, 16; + cvt.u16.u32 %rs844, %r27821; + shr.u32 %r27822, %r27806, 24; + cvt.u16.u32 %rs845, %r27822; + cvt.u16.u32 %rs559, %r27801; + cvt.u16.u32 %rs560, %r27765; + xor.b16 %rs846, %rs559, %rs560; + shr.u32 %r27823, %r27807, 8; + cvt.u16.u32 %rs847, %r27823; + shr.u32 %r27824, %r27807, 16; + cvt.u16.u32 %rs848, %r27824; + shr.u32 %r27825, %r27807, 24; + cvt.u16.u32 %rs849, %r27825; + cvt.u16.u32 %rs561, %r27781; + cvt.u16.u32 %rs562, %r27753; + xor.b16 %rs850, %rs562, %rs561; + shr.u32 %r27826, %r27808, 8; + cvt.u16.u32 %rs851, %r27826; + shr.u32 %r27827, %r27808, 16; + cvt.u16.u32 %rs852, %r27827; + shr.u32 %r27828, %r27808, 24; + cvt.u16.u32 %rs853, %r27828; + cvt.u16.u32 %rs563, %r27797; + cvt.u16.u32 %rs564, %r27769; + xor.b16 %rs854, %rs564, %rs563; + shr.u32 %r27829, %r27809, 8; + cvt.u16.u32 %rs855, %r27829; + shr.u32 %r27830, %r27809, 16; + 
cvt.u16.u32 %rs856, %r27830; + shr.u32 %r27831, %r27809, 24; + cvt.u16.u32 %rs857, %r27831; + cvt.u16.u32 %rs565, %r27749; + cvt.u16.u32 %rs566, %r27785; + xor.b16 %rs858, %rs566, %rs565; + shr.u32 %r27832, %r27810, 8; + cvt.u16.u32 %rs859, %r27832; + shr.u32 %r27833, %r27810, 16; + cvt.u16.u32 %rs860, %r27833; + shr.u32 %r27834, %r27810, 24; + cvt.u16.u32 %rs861, %r27834; + setp.ne.s64 %p53, %rd1367, 0; + mov.u16 %rs862, 64; + mov.u16 %rs732, %rs863; + mov.u16 %rs766, %rs829; + mov.u16 %rs767, %rs828; + mov.u16 %rs768, %rs827; + mov.u16 %rs769, %rs826; + mov.u16 %rs770, %rs825; + mov.u16 %rs771, %rs824; + mov.u16 %rs772, %rs823; + mov.u16 %rs773, %rs822; + mov.u16 %rs774, %rs821; + mov.u16 %rs775, %rs820; + mov.u16 %rs776, %rs819; + mov.u16 %rs777, %rs818; + mov.u16 %rs778, %rs817; + mov.u16 %rs779, %rs816; + mov.u16 %rs780, %rs815; + mov.u16 %rs781, %rs814; + mov.u16 %rs782, %rs813; + mov.u16 %rs783, %rs812; + mov.u16 %rs784, %rs811; + mov.u16 %rs785, %rs810; + mov.u16 %rs786, %rs809; + mov.u16 %rs787, %rs808; + mov.u16 %rs788, %rs807; + mov.u16 %rs789, %rs806; + mov.u16 %rs790, %rs805; + mov.u16 %rs791, %rs804; + mov.u16 %rs792, %rs803; + mov.u16 %rs793, %rs802; + mov.u16 %rs794, %rs801; + mov.u16 %rs795, %rs800; + mov.u16 %rs796, %rs799; + mov.u16 %rs797, %rs798; + mov.u64 %rd1368, %rd1159; + mov.u32 %r31250, %r31265; + mov.u32 %r31251, %r31264; + mov.u32 %r31252, %r31263; + mov.u32 %r31253, %r31262; + mov.u32 %r31254, %r31261; + mov.u32 %r31255, %r31260; + mov.u32 %r31256, %r31259; + mov.u32 %r31257, %r31258; + @%p53 bra $L__BB2_93; + +$L__BB2_94: + cvt.u32.u16 %r27835, %rs798; + and.b32 %r27836, %r27835, 255; + cvt.u32.u16 %r27837, %rs799; + prmt.b32 %r27838, %r27837, %r27836, 30212; + cvt.u32.u16 %r27839, %rs800; + shl.b32 %r27840, %r27839, 16; + and.b32 %r27841, %r27840, 16711680; + or.b32 %r27842, %r27838, %r27841; + cvt.u32.u16 %r27843, %rs801; + shl.b32 %r27844, %r27843, 24; + or.b32 %r27845, %r27842, %r27844; + cvt.u32.u16 %r27846, %rs802; + and.b32 %r27847, %r27846, 255; + cvt.u32.u16 %r27848, %rs803; + prmt.b32 %r27849, %r27848, %r27847, 30212; + cvt.u32.u16 %r27850, %rs804; + shl.b32 %r27851, %r27850, 16; + and.b32 %r27852, %r27851, 16711680; + or.b32 %r27853, %r27849, %r27852; + cvt.u32.u16 %r27854, %rs805; + shl.b32 %r27855, %r27854, 24; + or.b32 %r27856, %r27853, %r27855; + cvt.u32.u16 %r27857, %rs806; + and.b32 %r27858, %r27857, 255; + cvt.u32.u16 %r27859, %rs807; + prmt.b32 %r27860, %r27859, %r27858, 30212; + cvt.u32.u16 %r27861, %rs808; + shl.b32 %r27862, %r27861, 16; + and.b32 %r27863, %r27862, 16711680; + or.b32 %r27864, %r27860, %r27863; + cvt.u32.u16 %r27865, %rs809; + shl.b32 %r27866, %r27865, 24; + or.b32 %r27867, %r27864, %r27866; + cvt.u32.u16 %r27868, %rs810; + and.b32 %r27869, %r27868, 255; + cvt.u32.u16 %r27870, %rs811; + prmt.b32 %r27871, %r27870, %r27869, 30212; + cvt.u32.u16 %r27872, %rs812; + shl.b32 %r27873, %r27872, 16; + and.b32 %r27874, %r27873, 16711680; + or.b32 %r27875, %r27871, %r27874; + cvt.u32.u16 %r27876, %rs813; + shl.b32 %r27877, %r27876, 24; + or.b32 %r27878, %r27875, %r27877; + cvt.u32.u16 %r27879, %rs814; + and.b32 %r27880, %r27879, 255; + cvt.u32.u16 %r27881, %rs815; + prmt.b32 %r27882, %r27881, %r27880, 30212; + cvt.u32.u16 %r27883, %rs816; + shl.b32 %r27884, %r27883, 16; + and.b32 %r27885, %r27884, 16711680; + or.b32 %r27886, %r27882, %r27885; + cvt.u32.u16 %r27887, %rs817; + shl.b32 %r27888, %r27887, 24; + or.b32 %r27889, %r27886, %r27888; + cvt.u32.u16 %r27890, %rs818; + and.b32 %r27891, %r27890, 255; + cvt.u32.u16 %r27892, %rs819; 
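+ // The cvt.u32.u16 / prmt.b32 (selector 30212 = 0x7604) / shl / or chains in this
+ // region repack message bytes held in 16-bit registers into little-endian 32-bit
+ // words, i.e. word = b0 | b1 << 8 | b2 << 16 | b3 << 24, before the next rounds.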
+ prmt.b32 %r27893, %r27892, %r27891, 30212; + cvt.u32.u16 %r27894, %rs820; + shl.b32 %r27895, %r27894, 16; + and.b32 %r27896, %r27895, 16711680; + or.b32 %r27897, %r27893, %r27896; + cvt.u32.u16 %r27898, %rs821; + shl.b32 %r27899, %r27898, 24; + or.b32 %r27900, %r27897, %r27899; + cvt.u32.u16 %r27901, %rs822; + and.b32 %r27902, %r27901, 255; + cvt.u32.u16 %r27903, %rs823; + prmt.b32 %r27904, %r27903, %r27902, 30212; + cvt.u32.u16 %r27905, %rs824; + shl.b32 %r27906, %r27905, 16; + and.b32 %r27907, %r27906, 16711680; + or.b32 %r27908, %r27904, %r27907; + cvt.u32.u16 %r27909, %rs825; + shl.b32 %r27910, %r27909, 24; + or.b32 %r27911, %r27908, %r27910; + cvt.u32.u16 %r27912, %rs826; + and.b32 %r27913, %r27912, 255; + cvt.u32.u16 %r27914, %rs827; + prmt.b32 %r27915, %r27914, %r27913, 30212; + cvt.u32.u16 %r27916, %rs828; + shl.b32 %r27917, %r27916, 16; + and.b32 %r27918, %r27917, 16711680; + or.b32 %r27919, %r27915, %r27918; + cvt.u32.u16 %r27920, %rs829; + shl.b32 %r27921, %r27920, 24; + or.b32 %r27922, %r27919, %r27921; + cvt.u32.u16 %r27923, %rs830; + and.b32 %r27924, %r27923, 255; + cvt.u32.u16 %r27925, %rs831; + prmt.b32 %r27926, %r27925, %r27924, 30212; + cvt.u32.u16 %r27927, %rs832; + shl.b32 %r27928, %r27927, 16; + and.b32 %r27929, %r27928, 16711680; + or.b32 %r27930, %r27926, %r27929; + cvt.u32.u16 %r27931, %rs833; + shl.b32 %r27932, %r27931, 24; + or.b32 %r27933, %r27930, %r27932; + cvt.u32.u16 %r27934, %rs834; + and.b32 %r27935, %r27934, 255; + cvt.u32.u16 %r27936, %rs835; + prmt.b32 %r27937, %r27936, %r27935, 30212; + cvt.u32.u16 %r27938, %rs836; + shl.b32 %r27939, %r27938, 16; + and.b32 %r27940, %r27939, 16711680; + or.b32 %r27941, %r27937, %r27940; + cvt.u32.u16 %r27942, %rs837; + shl.b32 %r27943, %r27942, 24; + or.b32 %r27944, %r27941, %r27943; + cvt.u32.u16 %r27945, %rs838; + and.b32 %r27946, %r27945, 255; + cvt.u32.u16 %r27947, %rs839; + prmt.b32 %r27948, %r27947, %r27946, 30212; + cvt.u32.u16 %r27949, %rs840; + shl.b32 %r27950, %r27949, 16; + and.b32 %r27951, %r27950, 16711680; + or.b32 %r27952, %r27948, %r27951; + cvt.u32.u16 %r27953, %rs841; + shl.b32 %r27954, %r27953, 24; + or.b32 %r27955, %r27952, %r27954; + cvt.u32.u16 %r27956, %rs842; + and.b32 %r27957, %r27956, 255; + cvt.u32.u16 %r27958, %rs843; + prmt.b32 %r27959, %r27958, %r27957, 30212; + cvt.u32.u16 %r27960, %rs844; + shl.b32 %r27961, %r27960, 16; + and.b32 %r27962, %r27961, 16711680; + or.b32 %r27963, %r27959, %r27962; + cvt.u32.u16 %r27964, %rs845; + shl.b32 %r27965, %r27964, 24; + or.b32 %r27966, %r27963, %r27965; + cvt.u32.u16 %r27967, %rs846; + and.b32 %r27968, %r27967, 255; + cvt.u32.u16 %r27969, %rs847; + prmt.b32 %r27970, %r27969, %r27968, 30212; + cvt.u32.u16 %r27971, %rs848; + shl.b32 %r27972, %r27971, 16; + and.b32 %r27973, %r27972, 16711680; + or.b32 %r27974, %r27970, %r27973; + cvt.u32.u16 %r27975, %rs849; + shl.b32 %r27976, %r27975, 24; + or.b32 %r27977, %r27974, %r27976; + cvt.u32.u16 %r27978, %rs850; + and.b32 %r27979, %r27978, 255; + cvt.u32.u16 %r27980, %rs851; + prmt.b32 %r27981, %r27980, %r27979, 30212; + cvt.u32.u16 %r27982, %rs852; + shl.b32 %r27983, %r27982, 16; + and.b32 %r27984, %r27983, 16711680; + or.b32 %r27985, %r27981, %r27984; + cvt.u32.u16 %r27986, %rs853; + shl.b32 %r27987, %r27986, 24; + or.b32 %r27988, %r27985, %r27987; + cvt.u32.u16 %r27989, %rs854; + and.b32 %r27990, %r27989, 255; + cvt.u32.u16 %r27991, %rs855; + prmt.b32 %r27992, %r27991, %r27990, 30212; + cvt.u32.u16 %r27993, %rs856; + shl.b32 %r27994, %r27993, 16; + and.b32 %r27995, %r27994, 16711680; + or.b32 %r27996, 
%r27992, %r27995; + cvt.u32.u16 %r27997, %rs857; + shl.b32 %r27998, %r27997, 24; + or.b32 %r27999, %r27996, %r27998; + cvt.u32.u16 %r28000, %rs858; + and.b32 %r28001, %r28000, 255; + cvt.u32.u16 %r28002, %rs859; + prmt.b32 %r28003, %r28002, %r28001, 30212; + cvt.u32.u16 %r28004, %rs860; + shl.b32 %r28005, %r28004, 16; + and.b32 %r28006, %r28005, 16711680; + or.b32 %r28007, %r28003, %r28006; + cvt.u32.u16 %r28008, %rs861; + shl.b32 %r28009, %r28008, 24; + or.b32 %r28010, %r28007, %r28009; + or.b16 %rs567, %rs863, 8; + cvt.u32.u16 %r28011, %rs567; + and.b32 %r28012, %r28011, 255; + add.s32 %r28013, %r31262, %r31258; + add.s32 %r28014, %r28013, %r27845; + add.s32 %r28015, %r27856, %r28014; + add.s32 %r28016, %r31263, %r31259; + add.s32 %r28017, %r28016, %r27867; + add.s32 %r28018, %r27878, %r28017; + add.s32 %r28019, %r31264, %r31260; + add.s32 %r28020, %r28019, %r27889; + cvt.u32.u16 %r28021, %rs862; + and.b32 %r28022, %r28021, 255; + xor.b32 %r28023, %r28020, %r28022; + shr.u32 %r28024, %r28020, 16; + shl.b32 %r28025, %r28023, 16; + or.b32 %r28026, %r28025, %r28024; + add.s32 %r28027, %r28026, 1013904242; + xor.b32 %r28028, %r28027, %r31264; + shf.l.wrap.b32 %r28029, %r28028, %r28028, 20; + add.s32 %r28030, %r27900, %r28020; + add.s32 %r28031, %r28030, %r28029; + xor.b32 %r28032, %r28031, %r28026; + shf.l.wrap.b32 %r28033, %r28032, %r28032, 24; + add.s32 %r28034, %r28033, %r28027; + xor.b32 %r28035, %r28034, %r28029; + shf.l.wrap.b32 %r28036, %r28035, %r28035, 25; + add.s32 %r28037, %r31265, %r31261; + add.s32 %r28038, %r28037, %r27911; + xor.b32 %r28039, %r28038, %r28012; + shr.u32 %r28040, %r28038, 16; + shl.b32 %r28041, %r28039, 16; + or.b32 %r28042, %r28041, %r28040; + add.s32 %r28043, %r28042, -1521486534; + xor.b32 %r28044, %r28043, %r31265; + shf.l.wrap.b32 %r28045, %r28044, %r28044, 20; + add.s32 %r28046, %r27922, %r28038; + add.s32 %r28047, %r28046, %r28045; + xor.b32 %r28048, %r28047, %r28042; + shf.l.wrap.b32 %r28049, %r28048, %r28048, 24; + add.s32 %r28050, %r28049, %r28043; + xor.b32 %r28051, %r28050, %r28045; + shf.l.wrap.b32 %r28052, %r28051, %r28051, 25; + add.s32 %r28053, %r28036, %r27955; + add.s32 %r28054, %r28031, %r27977; + add.s32 %r28055, %r28054, %r28052; + add.s32 %r28056, %r28055, %r27988; + add.s32 %r28057, %r28047, %r27999; + shf.l.wrap.b32 %r28058, %r28014, %r28014, 16; + add.s32 %r28059, %r28058, 1779033703; + xor.b32 %r28060, %r28059, %r31262; + shf.l.wrap.b32 %r28061, %r28060, %r28060, 20; + add.s32 %r28062, %r28015, %r28061; + xor.b32 %r28063, %r28062, %r28058; + shf.l.wrap.b32 %r28064, %r28063, %r28063, 24; + add.s32 %r28065, %r28064, %r28059; + xor.b32 %r28066, %r28065, %r28061; + shf.l.wrap.b32 %r28067, %r28066, %r28066, 25; + shf.l.wrap.b32 %r28068, %r28017, %r28017, 16; + add.s32 %r28069, %r28068, -1150833019; + xor.b32 %r28070, %r28069, %r31263; + shf.l.wrap.b32 %r28071, %r28070, %r28070, 20; + add.s32 %r28072, %r28018, %r28071; + xor.b32 %r28073, %r28072, %r28068; + shf.l.wrap.b32 %r28074, %r28073, %r28073, 24; + add.s32 %r28075, %r28074, %r28069; + xor.b32 %r28076, %r28075, %r28071; + shf.l.wrap.b32 %r28077, %r28076, %r28076, 25; + add.s32 %r28078, %r28062, %r27933; + add.s32 %r28079, %r28078, %r28077; + xor.b32 %r28080, %r28079, %r28049; + shf.l.wrap.b32 %r28081, %r28080, %r28080, 16; + add.s32 %r28082, %r28081, %r28034; + xor.b32 %r28083, %r28082, %r28077; + shf.l.wrap.b32 %r28084, %r28083, %r28083, 20; + add.s32 %r28085, %r28079, %r27944; + add.s32 %r28086, %r28085, %r28084; + xor.b32 %r28087, %r28086, %r28081; + shf.l.wrap.b32 %r28088, %r28087, 
%r28087, 24; + add.s32 %r28089, %r28088, %r28082; + xor.b32 %r28090, %r28089, %r28084; + shf.l.wrap.b32 %r28091, %r28090, %r28090, 25; + add.s32 %r28092, %r28053, %r28072; + xor.b32 %r28093, %r28064, %r28092; + shf.l.wrap.b32 %r28094, %r28093, %r28093, 16; + add.s32 %r28095, %r28094, %r28050; + xor.b32 %r28096, %r28095, %r28036; + shf.l.wrap.b32 %r28097, %r28096, %r28096, 20; + add.s32 %r28098, %r28092, %r27966; + add.s32 %r28099, %r28098, %r28097; + xor.b32 %r28100, %r28099, %r28094; + shf.l.wrap.b32 %r28101, %r28100, %r28100, 24; + add.s32 %r28102, %r28101, %r28095; + xor.b32 %r28103, %r28102, %r28097; + shf.l.wrap.b32 %r28104, %r28103, %r28103, 25; + xor.b32 %r28105, %r28074, %r28055; + shf.l.wrap.b32 %r28106, %r28105, %r28105, 16; + add.s32 %r28107, %r28106, %r28065; + xor.b32 %r28108, %r28107, %r28052; + shf.l.wrap.b32 %r28109, %r28108, %r28108, 20; + add.s32 %r28110, %r28056, %r28109; + xor.b32 %r28111, %r28110, %r28106; + shf.l.wrap.b32 %r28112, %r28111, %r28111, 24; + add.s32 %r28113, %r28112, %r28107; + xor.b32 %r28114, %r28113, %r28109; + shf.l.wrap.b32 %r28115, %r28114, %r28114, 25; + add.s32 %r28116, %r28057, %r28067; + xor.b32 %r28117, %r28116, %r28033; + shf.l.wrap.b32 %r28118, %r28117, %r28117, 16; + add.s32 %r28119, %r28118, %r28075; + xor.b32 %r28120, %r28119, %r28067; + shf.l.wrap.b32 %r28121, %r28120, %r28120, 20; + add.s32 %r28122, %r28116, %r28010; + add.s32 %r28123, %r28122, %r28121; + xor.b32 %r28124, %r28123, %r28118; + shf.l.wrap.b32 %r28125, %r28124, %r28124, 24; + add.s32 %r28126, %r28125, %r28119; + xor.b32 %r28127, %r28126, %r28121; + shf.l.wrap.b32 %r28128, %r28127, %r28127, 25; + add.s32 %r28129, %r28086, %r27867; + add.s32 %r28130, %r28129, %r28128; + xor.b32 %r28131, %r28130, %r28101; + shf.l.wrap.b32 %r28132, %r28131, %r28131, 16; + add.s32 %r28133, %r28132, %r28113; + xor.b32 %r28134, %r28133, %r28128; + shf.l.wrap.b32 %r28135, %r28134, %r28134, 20; + add.s32 %r28136, %r28130, %r27911; + add.s32 %r28137, %r28136, %r28135; + xor.b32 %r28138, %r28137, %r28132; + shf.l.wrap.b32 %r28139, %r28138, %r28138, 24; + add.s32 %r28140, %r28139, %r28133; + xor.b32 %r28141, %r28140, %r28135; + shf.l.wrap.b32 %r28142, %r28141, %r28141, 25; + add.s32 %r28143, %r28099, %r27878; + add.s32 %r28144, %r28143, %r28091; + xor.b32 %r28145, %r28144, %r28112; + shf.l.wrap.b32 %r28146, %r28145, %r28145, 16; + add.s32 %r28147, %r28146, %r28126; + xor.b32 %r28148, %r28147, %r28091; + shf.l.wrap.b32 %r28149, %r28148, %r28148, 20; + add.s32 %r28150, %r28144, %r27955; + add.s32 %r28151, %r28150, %r28149; + xor.b32 %r28152, %r28151, %r28146; + shf.l.wrap.b32 %r28153, %r28152, %r28152, 24; + add.s32 %r28154, %r28153, %r28147; + xor.b32 %r28155, %r28154, %r28149; + shf.l.wrap.b32 %r28156, %r28155, %r28155, 25; + add.s32 %r28157, %r28110, %r27922; + add.s32 %r28158, %r28157, %r28104; + xor.b32 %r28159, %r28125, %r28158; + shf.l.wrap.b32 %r28160, %r28159, %r28159, 16; + add.s32 %r28161, %r28160, %r28089; + xor.b32 %r28162, %r28161, %r28104; + shf.l.wrap.b32 %r28163, %r28162, %r28162, 20; + add.s32 %r28164, %r28158, %r27845; + add.s32 %r28165, %r28164, %r28163; + xor.b32 %r28166, %r28165, %r28160; + shf.l.wrap.b32 %r28167, %r28166, %r28166, 24; + add.s32 %r28168, %r28167, %r28161; + xor.b32 %r28169, %r28168, %r28163; + shf.l.wrap.b32 %r28170, %r28169, %r28169, 25; + add.s32 %r28171, %r28123, %r27889; + add.s32 %r28172, %r28171, %r28115; + xor.b32 %r28173, %r28088, %r28172; + shf.l.wrap.b32 %r28174, %r28173, %r28173, 16; + add.s32 %r28175, %r28174, %r28102; + xor.b32 %r28176, %r28175, %r28115; 
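+ // Annotation: BLAKE3 G-function mixing — 32-bit add, xor, then rotations
+ // emulated with shf.l.wrap.b32 by 16/20/24/25 (i.e. the BLAKE3 right-rotates
+ // by 16/12/8/7); the IV constants 1779033703, -1150833019, 1013904242 and
+ // -1521486534 above are the first four BLAKE3 IV words.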
+ shf.l.wrap.b32 %r28177, %r28176, %r28176, 20; + add.s32 %r28178, %r28172, %r27988; + add.s32 %r28179, %r28178, %r28177; + xor.b32 %r28180, %r28179, %r28174; + shf.l.wrap.b32 %r28181, %r28180, %r28180, 24; + add.s32 %r28182, %r28181, %r28175; + xor.b32 %r28183, %r28182, %r28177; + shf.l.wrap.b32 %r28184, %r28183, %r28183, 25; + add.s32 %r28185, %r28137, %r27856; + add.s32 %r28186, %r28185, %r28156; + xor.b32 %r28187, %r28186, %r28181; + shf.l.wrap.b32 %r28188, %r28187, %r28187, 16; + add.s32 %r28189, %r28188, %r28168; + xor.b32 %r28190, %r28189, %r28156; + shf.l.wrap.b32 %r28191, %r28190, %r28190, 20; + add.s32 %r28192, %r28186, %r27966; + add.s32 %r28193, %r28192, %r28191; + xor.b32 %r28194, %r28193, %r28188; + shf.l.wrap.b32 %r28195, %r28194, %r28194, 24; + add.s32 %r28196, %r28195, %r28189; + xor.b32 %r28197, %r28196, %r28191; + shf.l.wrap.b32 %r28198, %r28197, %r28197, 25; + add.s32 %r28199, %r28170, %r27977; + add.s32 %r28200, %r28199, %r28151; + xor.b32 %r28201, %r28139, %r28200; + shf.l.wrap.b32 %r28202, %r28201, %r28201, 16; + add.s32 %r28203, %r28202, %r28182; + xor.b32 %r28204, %r28203, %r28170; + shf.l.wrap.b32 %r28205, %r28204, %r28204, 20; + add.s32 %r28206, %r28200, %r27900; + add.s32 %r28207, %r28206, %r28205; + xor.b32 %r28208, %r28207, %r28202; + shf.l.wrap.b32 %r28209, %r28208, %r28208, 24; + add.s32 %r28210, %r28209, %r28203; + xor.b32 %r28211, %r28210, %r28205; + shf.l.wrap.b32 %r28212, %r28211, %r28211, 25; + add.s32 %r28213, %r28165, %r27944; + add.s32 %r28214, %r28213, %r28184; + xor.b32 %r28215, %r28153, %r28214; + shf.l.wrap.b32 %r28216, %r28215, %r28215, 16; + add.s32 %r28217, %r28216, %r28140; + xor.b32 %r28218, %r28217, %r28184; + shf.l.wrap.b32 %r28219, %r28218, %r28218, 20; + add.s32 %r28220, %r28214, %r27999; + add.s32 %r28221, %r28220, %r28219; + xor.b32 %r28222, %r28221, %r28216; + shf.l.wrap.b32 %r28223, %r28222, %r28222, 24; + add.s32 %r28224, %r28223, %r28217; + xor.b32 %r28225, %r28224, %r28219; + shf.l.wrap.b32 %r28226, %r28225, %r28225, 25; + add.s32 %r28227, %r28179, %r28010; + add.s32 %r28228, %r28227, %r28142; + xor.b32 %r28229, %r28228, %r28167; + shf.l.wrap.b32 %r28230, %r28229, %r28229, 16; + add.s32 %r28231, %r28230, %r28154; + xor.b32 %r28232, %r28231, %r28142; + shf.l.wrap.b32 %r28233, %r28232, %r28232, 20; + add.s32 %r28234, %r28228, %r27933; + add.s32 %r28235, %r28234, %r28233; + xor.b32 %r28236, %r28235, %r28230; + shf.l.wrap.b32 %r28237, %r28236, %r28236, 24; + add.s32 %r28238, %r28237, %r28231; + xor.b32 %r28239, %r28238, %r28233; + shf.l.wrap.b32 %r28240, %r28239, %r28239, 25; + add.s32 %r28241, %r28193, %r27878; + add.s32 %r28242, %r28241, %r28240; + xor.b32 %r28243, %r28242, %r28209; + shf.l.wrap.b32 %r28244, %r28243, %r28243, 16; + add.s32 %r28245, %r28244, %r28224; + xor.b32 %r28246, %r28245, %r28240; + shf.l.wrap.b32 %r28247, %r28246, %r28246, 20; + add.s32 %r28248, %r28242, %r27889; + add.s32 %r28249, %r28248, %r28247; + xor.b32 %r28250, %r28249, %r28244; + shf.l.wrap.b32 %r28251, %r28250, %r28250, 24; + add.s32 %r28252, %r28251, %r28245; + xor.b32 %r28253, %r28252, %r28247; + shf.l.wrap.b32 %r28254, %r28253, %r28253, 25; + add.s32 %r28255, %r28207, %r27955; + add.s32 %r28256, %r28255, %r28198; + xor.b32 %r28257, %r28256, %r28223; + shf.l.wrap.b32 %r28258, %r28257, %r28257, 16; + add.s32 %r28259, %r28258, %r28238; + xor.b32 %r28260, %r28259, %r28198; + shf.l.wrap.b32 %r28261, %r28260, %r28260, 20; + add.s32 %r28262, %r28256, %r27977; + add.s32 %r28263, %r28262, %r28261; + xor.b32 %r28264, %r28263, %r28258; + shf.l.wrap.b32 
%r28265, %r28264, %r28264, 24; + add.s32 %r28266, %r28265, %r28259; + xor.b32 %r28267, %r28266, %r28261; + shf.l.wrap.b32 %r28268, %r28267, %r28267, 25; + add.s32 %r28269, %r28221, %r27988; + add.s32 %r28270, %r28269, %r28212; + xor.b32 %r28271, %r28237, %r28270; + shf.l.wrap.b32 %r28272, %r28271, %r28271, 16; + add.s32 %r28273, %r28272, %r28196; + xor.b32 %r28274, %r28273, %r28212; + shf.l.wrap.b32 %r28275, %r28274, %r28274, 20; + add.s32 %r28276, %r28270, %r27867; + add.s32 %r28277, %r28276, %r28275; + xor.b32 %r28278, %r28277, %r28272; + shf.l.wrap.b32 %r28279, %r28278, %r28278, 24; + add.s32 %r28280, %r28279, %r28273; + xor.b32 %r28281, %r28280, %r28275; + shf.l.wrap.b32 %r28282, %r28281, %r28281, 25; + add.s32 %r28283, %r28235, %r27922; + add.s32 %r28284, %r28283, %r28226; + xor.b32 %r28285, %r28195, %r28284; + shf.l.wrap.b32 %r28286, %r28285, %r28285, 16; + add.s32 %r28287, %r28286, %r28210; + xor.b32 %r28288, %r28287, %r28226; + shf.l.wrap.b32 %r28289, %r28288, %r28288, 20; + add.s32 %r28290, %r28284, %r27999; + add.s32 %r28291, %r28290, %r28289; + xor.b32 %r28292, %r28291, %r28286; + shf.l.wrap.b32 %r28293, %r28292, %r28292, 24; + add.s32 %r28294, %r28293, %r28287; + xor.b32 %r28295, %r28294, %r28289; + shf.l.wrap.b32 %r28296, %r28295, %r28295, 25; + add.s32 %r28297, %r28249, %r27911; + add.s32 %r28298, %r28297, %r28268; + xor.b32 %r28299, %r28298, %r28293; + shf.l.wrap.b32 %r28300, %r28299, %r28299, 16; + add.s32 %r28301, %r28300, %r28280; + xor.b32 %r28302, %r28301, %r28268; + shf.l.wrap.b32 %r28303, %r28302, %r28302, 20; + add.s32 %r28304, %r28298, %r27900; + add.s32 %r28305, %r28304, %r28303; + xor.b32 %r28306, %r28305, %r28300; + shf.l.wrap.b32 %r28307, %r28306, %r28306, 24; + add.s32 %r28308, %r28307, %r28301; + xor.b32 %r28309, %r28308, %r28303; + shf.l.wrap.b32 %r28310, %r28309, %r28309, 25; + add.s32 %r28311, %r28282, %r27944; + add.s32 %r28312, %r28311, %r28263; + xor.b32 %r28313, %r28251, %r28312; + shf.l.wrap.b32 %r28314, %r28313, %r28313, 16; + add.s32 %r28315, %r28314, %r28294; + xor.b32 %r28316, %r28315, %r28282; + shf.l.wrap.b32 %r28317, %r28316, %r28316, 20; + add.s32 %r28318, %r28312, %r27845; + add.s32 %r28319, %r28318, %r28317; + xor.b32 %r28320, %r28319, %r28314; + shf.l.wrap.b32 %r28321, %r28320, %r28320, 24; + add.s32 %r28322, %r28321, %r28315; + xor.b32 %r28323, %r28322, %r28317; + shf.l.wrap.b32 %r28324, %r28323, %r28323, 25; + add.s32 %r28325, %r28277, %r27966; + add.s32 %r28326, %r28325, %r28296; + xor.b32 %r28327, %r28265, %r28326; + shf.l.wrap.b32 %r28328, %r28327, %r28327, 16; + add.s32 %r28329, %r28328, %r28252; + xor.b32 %r28330, %r28329, %r28296; + shf.l.wrap.b32 %r28331, %r28330, %r28330, 20; + add.s32 %r28332, %r28326, %r28010; + add.s32 %r28333, %r28332, %r28331; + xor.b32 %r28334, %r28333, %r28328; + shf.l.wrap.b32 %r28335, %r28334, %r28334, 24; + add.s32 %r28336, %r28335, %r28329; + xor.b32 %r28337, %r28336, %r28331; + shf.l.wrap.b32 %r28338, %r28337, %r28337, 25; + add.s32 %r28339, %r28291, %r27933; + add.s32 %r28340, %r28339, %r28254; + xor.b32 %r28341, %r28340, %r28279; + shf.l.wrap.b32 %r28342, %r28341, %r28341, 16; + add.s32 %r28343, %r28342, %r28266; + xor.b32 %r28344, %r28343, %r28254; + shf.l.wrap.b32 %r28345, %r28344, %r28344, 20; + add.s32 %r28346, %r28340, %r27856; + add.s32 %r28347, %r28346, %r28345; + xor.b32 %r28348, %r28347, %r28342; + shf.l.wrap.b32 %r28349, %r28348, %r28348, 24; + add.s32 %r28350, %r28349, %r28343; + xor.b32 %r28351, %r28350, %r28345; + shf.l.wrap.b32 %r28352, %r28351, %r28351, 25; + add.s32 %r28353, %r28305, 
%r27955; + add.s32 %r28354, %r28353, %r28352; + xor.b32 %r28355, %r28354, %r28321; + shf.l.wrap.b32 %r28356, %r28355, %r28355, 16; + add.s32 %r28357, %r28356, %r28336; + xor.b32 %r28358, %r28357, %r28352; + shf.l.wrap.b32 %r28359, %r28358, %r28358, 20; + add.s32 %r28360, %r28354, %r27922; + add.s32 %r28361, %r28360, %r28359; + xor.b32 %r28362, %r28361, %r28356; + shf.l.wrap.b32 %r28363, %r28362, %r28362, 24; + add.s32 %r28364, %r28363, %r28357; + xor.b32 %r28365, %r28364, %r28359; + shf.l.wrap.b32 %r28366, %r28365, %r28365, 25; + add.s32 %r28367, %r28319, %r27977; + add.s32 %r28368, %r28367, %r28310; + xor.b32 %r28369, %r28368, %r28335; + shf.l.wrap.b32 %r28370, %r28369, %r28369, 16; + add.s32 %r28371, %r28370, %r28350; + xor.b32 %r28372, %r28371, %r28310; + shf.l.wrap.b32 %r28373, %r28372, %r28372, 20; + add.s32 %r28374, %r28368, %r27944; + add.s32 %r28375, %r28374, %r28373; + xor.b32 %r28376, %r28375, %r28370; + shf.l.wrap.b32 %r28377, %r28376, %r28376, 24; + add.s32 %r28378, %r28377, %r28371; + xor.b32 %r28379, %r28378, %r28373; + shf.l.wrap.b32 %r28380, %r28379, %r28379, 25; + add.s32 %r28381, %r28333, %r27999; + add.s32 %r28382, %r28381, %r28324; + xor.b32 %r28383, %r28349, %r28382; + shf.l.wrap.b32 %r28384, %r28383, %r28383, 16; + add.s32 %r28385, %r28384, %r28308; + xor.b32 %r28386, %r28385, %r28324; + shf.l.wrap.b32 %r28387, %r28386, %r28386, 20; + add.s32 %r28388, %r28382, %r27878; + add.s32 %r28389, %r28388, %r28387; + xor.b32 %r28390, %r28389, %r28384; + shf.l.wrap.b32 %r28391, %r28390, %r28390, 24; + add.s32 %r28392, %r28391, %r28385; + xor.b32 %r28393, %r28392, %r28387; + shf.l.wrap.b32 %r28394, %r28393, %r28393, 25; + add.s32 %r28395, %r28347, %r27988; + add.s32 %r28396, %r28395, %r28338; + xor.b32 %r28397, %r28307, %r28396; + shf.l.wrap.b32 %r28398, %r28397, %r28397, 16; + add.s32 %r28399, %r28398, %r28322; + xor.b32 %r28400, %r28399, %r28338; + shf.l.wrap.b32 %r28401, %r28400, %r28400, 20; + add.s32 %r28402, %r28396, %r28010; + add.s32 %r28403, %r28402, %r28401; + xor.b32 %r28404, %r28403, %r28398; + shf.l.wrap.b32 %r28405, %r28404, %r28404, 24; + add.s32 %r28406, %r28405, %r28399; + xor.b32 %r28407, %r28406, %r28401; + shf.l.wrap.b32 %r28408, %r28407, %r28407, 25; + add.s32 %r28409, %r28361, %r27889; + add.s32 %r28410, %r28409, %r28380; + xor.b32 %r28411, %r28410, %r28405; + shf.l.wrap.b32 %r28412, %r28411, %r28411, 16; + add.s32 %r28413, %r28412, %r28392; + xor.b32 %r28414, %r28413, %r28380; + shf.l.wrap.b32 %r28415, %r28414, %r28414, 20; + add.s32 %r28416, %r28410, %r27845; + add.s32 %r28417, %r28416, %r28415; + xor.b32 %r28418, %r28417, %r28412; + shf.l.wrap.b32 %r28419, %r28418, %r28418, 24; + add.s32 %r28420, %r28419, %r28413; + xor.b32 %r28421, %r28420, %r28415; + shf.l.wrap.b32 %r28422, %r28421, %r28421, 25; + add.s32 %r28423, %r28394, %r27966; + add.s32 %r28424, %r28423, %r28375; + xor.b32 %r28425, %r28363, %r28424; + shf.l.wrap.b32 %r28426, %r28425, %r28425, 16; + add.s32 %r28427, %r28426, %r28406; + xor.b32 %r28428, %r28427, %r28394; + shf.l.wrap.b32 %r28429, %r28428, %r28428, 20; + add.s32 %r28430, %r28424, %r27867; + add.s32 %r28431, %r28430, %r28429; + xor.b32 %r28432, %r28431, %r28426; + shf.l.wrap.b32 %r28433, %r28432, %r28432, 24; + add.s32 %r28434, %r28433, %r28427; + xor.b32 %r28435, %r28434, %r28429; + shf.l.wrap.b32 %r28436, %r28435, %r28435, 25; + add.s32 %r28437, %r28389, %r27900; + add.s32 %r28438, %r28437, %r28408; + xor.b32 %r28439, %r28377, %r28438; + shf.l.wrap.b32 %r28440, %r28439, %r28439, 16; + add.s32 %r28441, %r28440, %r28364; + xor.b32 
%r28442, %r28441, %r28408; + shf.l.wrap.b32 %r28443, %r28442, %r28442, 20; + add.s32 %r28444, %r28438, %r27933; + add.s32 %r28445, %r28444, %r28443; + xor.b32 %r28446, %r28445, %r28440; + shf.l.wrap.b32 %r28447, %r28446, %r28446, 24; + add.s32 %r28448, %r28447, %r28441; + xor.b32 %r28449, %r28448, %r28443; + shf.l.wrap.b32 %r28450, %r28449, %r28449, 25; + add.s32 %r28451, %r28403, %r27856; + add.s32 %r28452, %r28451, %r28366; + xor.b32 %r28453, %r28452, %r28391; + shf.l.wrap.b32 %r28454, %r28453, %r28453, 16; + add.s32 %r28455, %r28454, %r28378; + xor.b32 %r28456, %r28455, %r28366; + shf.l.wrap.b32 %r28457, %r28456, %r28456, 20; + add.s32 %r28458, %r28452, %r27911; + add.s32 %r28459, %r28458, %r28457; + xor.b32 %r28460, %r28459, %r28454; + shf.l.wrap.b32 %r28461, %r28460, %r28460, 24; + add.s32 %r28462, %r28461, %r28455; + xor.b32 %r28463, %r28462, %r28457; + shf.l.wrap.b32 %r28464, %r28463, %r28463, 25; + add.s32 %r28465, %r28417, %r27977; + add.s32 %r28466, %r28465, %r28464; + xor.b32 %r28467, %r28466, %r28433; + shf.l.wrap.b32 %r28468, %r28467, %r28467, 16; + add.s32 %r28469, %r28468, %r28448; + xor.b32 %r28470, %r28469, %r28464; + shf.l.wrap.b32 %r28471, %r28470, %r28470, 20; + add.s32 %r28472, %r28466, %r27988; + add.s32 %r28473, %r28472, %r28471; + xor.b32 %r28474, %r28473, %r28468; + shf.l.wrap.b32 %r28475, %r28474, %r28474, 24; + add.s32 %r28476, %r28475, %r28469; + xor.b32 %r28477, %r28476, %r28471; + shf.l.wrap.b32 %r28478, %r28477, %r28477, 25; + add.s32 %r28479, %r28431, %r27944; + add.s32 %r28480, %r28479, %r28422; + xor.b32 %r28481, %r28480, %r28447; + shf.l.wrap.b32 %r28482, %r28481, %r28481, 16; + add.s32 %r28483, %r28482, %r28462; + xor.b32 %r28484, %r28483, %r28422; + shf.l.wrap.b32 %r28485, %r28484, %r28484, 20; + add.s32 %r28486, %r28480, %r27966; + add.s32 %r28487, %r28486, %r28485; + xor.b32 %r28488, %r28487, %r28482; + shf.l.wrap.b32 %r28489, %r28488, %r28488, 24; + add.s32 %r28490, %r28489, %r28483; + xor.b32 %r28491, %r28490, %r28485; + shf.l.wrap.b32 %r28492, %r28491, %r28491, 25; + add.s32 %r28493, %r28445, %r28010; + add.s32 %r28494, %r28493, %r28436; + xor.b32 %r28495, %r28461, %r28494; + shf.l.wrap.b32 %r28496, %r28495, %r28495, 16; + add.s32 %r28497, %r28496, %r28420; + xor.b32 %r28498, %r28497, %r28436; + shf.l.wrap.b32 %r28499, %r28498, %r28498, 20; + add.s32 %r28500, %r28494, %r27955; + add.s32 %r28501, %r28500, %r28499; + xor.b32 %r28502, %r28501, %r28496; + shf.l.wrap.b32 %r28503, %r28502, %r28502, 24; + add.s32 %r28504, %r28503, %r28497; + xor.b32 %r28505, %r28504, %r28499; + shf.l.wrap.b32 %r28506, %r28505, %r28505, 25; + add.s32 %r28507, %r28459, %r27999; + add.s32 %r28508, %r28507, %r28450; + xor.b32 %r28509, %r28419, %r28508; + shf.l.wrap.b32 %r28510, %r28509, %r28509, 16; + add.s32 %r28511, %r28510, %r28434; + xor.b32 %r28512, %r28511, %r28450; + shf.l.wrap.b32 %r28513, %r28512, %r28512, 20; + add.s32 %r28514, %r28508, %r27933; + add.s32 %r28515, %r28514, %r28513; + xor.b32 %r28516, %r28515, %r28510; + shf.l.wrap.b32 %r28517, %r28516, %r28516, 24; + add.s32 %r28518, %r28517, %r28511; + xor.b32 %r28519, %r28518, %r28513; + shf.l.wrap.b32 %r28520, %r28519, %r28519, 25; + add.s32 %r28521, %r28473, %r27922; + add.s32 %r28522, %r28521, %r28492; + xor.b32 %r28523, %r28522, %r28517; + shf.l.wrap.b32 %r28524, %r28523, %r28523, 16; + add.s32 %r28525, %r28524, %r28504; + xor.b32 %r28526, %r28525, %r28492; + shf.l.wrap.b32 %r28527, %r28526, %r28526, 20; + add.s32 %r28528, %r28522, %r27867; + add.s32 %r28529, %r28528, %r28527; + xor.b32 %r28530, %r28529, 
%r28524; + shf.l.wrap.b32 %r28531, %r28530, %r28530, 24; + add.s32 %r28532, %r28531, %r28525; + xor.b32 %r28533, %r28532, %r28527; + shf.l.wrap.b32 %r28534, %r28533, %r28533, 25; + add.s32 %r28535, %r28506, %r27900; + add.s32 %r28536, %r28535, %r28487; + xor.b32 %r28537, %r28475, %r28536; + shf.l.wrap.b32 %r28538, %r28537, %r28537, 16; + add.s32 %r28539, %r28538, %r28518; + xor.b32 %r28540, %r28539, %r28506; + shf.l.wrap.b32 %r28541, %r28540, %r28540, 20; + add.s32 %r28542, %r28536, %r27878; + add.s32 %r28543, %r28542, %r28541; + xor.b32 %r28544, %r28543, %r28538; + shf.l.wrap.b32 %r28545, %r28544, %r28544, 24; + add.s32 %r28546, %r28545, %r28539; + xor.b32 %r28547, %r28546, %r28541; + shf.l.wrap.b32 %r28548, %r28547, %r28547, 25; + add.s32 %r28549, %r28501, %r27845; + add.s32 %r28550, %r28549, %r28520; + xor.b32 %r28551, %r28489, %r28550; + shf.l.wrap.b32 %r28552, %r28551, %r28551, 16; + add.s32 %r28553, %r28552, %r28476; + xor.b32 %r28554, %r28553, %r28520; + shf.l.wrap.b32 %r28555, %r28554, %r28554, 20; + add.s32 %r28556, %r28550, %r27856; + add.s32 %r28557, %r28556, %r28555; + xor.b32 %r28558, %r28557, %r28552; + shf.l.wrap.b32 %r28559, %r28558, %r28558, 24; + add.s32 %r28560, %r28559, %r28553; + xor.b32 %r28561, %r28560, %r28555; + shf.l.wrap.b32 %r28562, %r28561, %r28561, 25; + add.s32 %r28563, %r28515, %r27911; + add.s32 %r28564, %r28563, %r28478; + xor.b32 %r28565, %r28564, %r28503; + shf.l.wrap.b32 %r28566, %r28565, %r28565, 16; + add.s32 %r28567, %r28566, %r28490; + xor.b32 %r28568, %r28567, %r28478; + shf.l.wrap.b32 %r28569, %r28568, %r28568, 20; + add.s32 %r28570, %r28564, %r27889; + add.s32 %r28571, %r28570, %r28569; + xor.b32 %r28572, %r28571, %r28566; + shf.l.wrap.b32 %r28573, %r28572, %r28572, 24; + add.s32 %r28574, %r28573, %r28567; + xor.b32 %r28575, %r28574, %r28569; + shf.l.wrap.b32 %r28576, %r28575, %r28575, 25; + add.s32 %r28577, %r28529, %r27944; + add.s32 %r28578, %r28577, %r28576; + xor.b32 %r28579, %r28578, %r28545; + shf.l.wrap.b32 %r28580, %r28579, %r28579, 16; + add.s32 %r28581, %r28580, %r28560; + xor.b32 %r28582, %r28581, %r28576; + shf.l.wrap.b32 %r28583, %r28582, %r28582, 20; + add.s32 %r28584, %r28578, %r27999; + add.s32 %r28585, %r28584, %r28583; + xor.b32 %r28586, %r28585, %r28580; + shf.l.wrap.b32 %r28587, %r28586, %r28586, 24; + add.s32 %r28588, %r28587, %r28581; + xor.b32 %r28589, %r28588, %r28583; + shf.l.wrap.b32 %r28590, %r28589, %r28589, 25; + add.s32 %r28591, %r28543, %r27966; + add.s32 %r28592, %r28591, %r28534; + xor.b32 %r28593, %r28592, %r28559; + shf.l.wrap.b32 %r28594, %r28593, %r28593, 16; + add.s32 %r28595, %r28594, %r28574; + xor.b32 %r28596, %r28595, %r28534; + shf.l.wrap.b32 %r28597, %r28596, %r28596, 20; + add.s32 %r28598, %r28592, %r27900; + add.s32 %r28599, %r28598, %r28597; + xor.b32 %r28600, %r28599, %r28594; + shf.l.wrap.b32 %r28601, %r28600, %r28600, 24; + add.s32 %r28602, %r28601, %r28595; + xor.b32 %r28603, %r28602, %r28597; + shf.l.wrap.b32 %r28604, %r28603, %r28603, 25; + add.s32 %r28605, %r28557, %r27933; + add.s32 %r28606, %r28605, %r28548; + xor.b32 %r28607, %r28573, %r28606; + shf.l.wrap.b32 %r28608, %r28607, %r28607, 16; + add.s32 %r28609, %r28608, %r28532; + xor.b32 %r28610, %r28609, %r28548; + shf.l.wrap.b32 %r28611, %r28610, %r28610, 20; + add.s32 %r28612, %r28606, %r27977; + add.s32 %r28613, %r28612, %r28611; + xor.b32 %r28614, %r28613, %r28608; + shf.l.wrap.b32 %r28615, %r28614, %r28614, 24; + add.s32 %r28616, %r28615, %r28609; + xor.b32 %r28617, %r28616, %r28611; + shf.l.wrap.b32 %r28618, %r28617, %r28617, 25; + 
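+ // Annotation: subsequent rounds repeat the same register dataflow; only the
+ // sixteen message-word operands are permuted between rounds, following the
+ // BLAKE3 message schedule.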
add.s32 %r28619, %r28571, %r28010; + add.s32 %r28620, %r28619, %r28562; + xor.b32 %r28621, %r28531, %r28620; + shf.l.wrap.b32 %r28622, %r28621, %r28621, 16; + add.s32 %r28623, %r28622, %r28546; + xor.b32 %r28624, %r28623, %r28562; + shf.l.wrap.b32 %r28625, %r28624, %r28624, 20; + add.s32 %r28626, %r28620, %r27856; + add.s32 %r28627, %r28626, %r28625; + xor.b32 %r28628, %r28627, %r28622; + shf.l.wrap.b32 %r28629, %r28628, %r28628, 24; + add.s32 %r28630, %r28629, %r28623; + xor.b32 %r28631, %r28630, %r28625; + shf.l.wrap.b32 %r28632, %r28631, %r28631, 25; + add.s32 %r28633, %r28585, %r27988; + add.s32 %r28634, %r28633, %r28604; + xor.b32 %r28635, %r28634, %r28629; + shf.l.wrap.b32 %r28636, %r28635, %r28635, 16; + add.s32 %r28637, %r28636, %r28616; + xor.b32 %r28638, %r28637, %r28604; + shf.l.wrap.b32 %r28639, %r28638, %r28638, 20; + add.s32 %r28640, %r28634, %r27878; + add.s32 %r28641, %r28640, %r28639; + xor.b32 %r28642, %r28641, %r28636; + shf.l.wrap.b32 %r28643, %r28642, %r28642, 24; + add.s32 %r28644, %r28643, %r28637; + xor.b32 %r28645, %r28644, %r28639; + shf.l.wrap.b32 %r28646, %r28645, %r28645, 25; + add.s32 %r28647, %r28618, %r27845; + add.s32 %r28648, %r28647, %r28599; + xor.b32 %r28649, %r28587, %r28648; + shf.l.wrap.b32 %r28650, %r28649, %r28649, 16; + add.s32 %r28651, %r28650, %r28630; + xor.b32 %r28652, %r28651, %r28618; + shf.l.wrap.b32 %r28653, %r28652, %r28652, 20; + add.s32 %r28654, %r28648, %r27955; + add.s32 %r28655, %r28654, %r28653; + xor.b32 %r28656, %r28655, %r28650; + shf.l.wrap.b32 %r28657, %r28656, %r28656, 24; + add.s32 %r28658, %r28657, %r28651; + xor.b32 %r28659, %r28658, %r28653; + shf.l.wrap.b32 %r28660, %r28659, %r28659, 25; + add.s32 %r28661, %r28613, %r27867; + add.s32 %r28662, %r28661, %r28632; + xor.b32 %r28663, %r28601, %r28662; + shf.l.wrap.b32 %r28664, %r28663, %r28663, 16; + add.s32 %r28665, %r28664, %r28588; + xor.b32 %r28666, %r28665, %r28632; + shf.l.wrap.b32 %r28667, %r28666, %r28666, 20; + add.s32 %r28668, %r28662, %r27911; + add.s32 %r28669, %r28668, %r28667; + xor.b32 %r28670, %r28669, %r28664; + shf.l.wrap.b32 %r28671, %r28670, %r28670, 24; + add.s32 %r28672, %r28671, %r28665; + xor.b32 %r28673, %r28672, %r28667; + shf.l.wrap.b32 %r28674, %r28673, %r28673, 25; + add.s32 %r28675, %r28627, %r27889; + add.s32 %r28676, %r28675, %r28590; + xor.b32 %r28677, %r28676, %r28615; + shf.l.wrap.b32 %r28678, %r28677, %r28677, 16; + add.s32 %r28679, %r28678, %r28602; + xor.b32 %r28680, %r28679, %r28590; + shf.l.wrap.b32 %r28681, %r28680, %r28680, 20; + add.s32 %r28682, %r28676, %r27922; + add.s32 %r28683, %r28682, %r28681; + xor.b32 %r28684, %r28683, %r28678; + shf.l.wrap.b32 %r28685, %r28684, %r28684, 24; + add.s32 %r28686, %r28685, %r28679; + xor.b32 %r28687, %r28686, %r28681; + shf.l.wrap.b32 %r28688, %r28687, %r28687, 25; + add.s32 %r28689, %r28641, %r27966; + add.s32 %r28690, %r28689, %r28688; + xor.b32 %r28691, %r28690, %r28657; + shf.l.wrap.b32 %r28692, %r28691, %r28691, 16; + add.s32 %r28693, %r28692, %r28672; + xor.b32 %r28694, %r28693, %r28688; + shf.l.wrap.b32 %r28695, %r28694, %r28694, 20; + add.s32 %r28696, %r28690, %r28010; + add.s32 %r28697, %r28696, %r28695; + xor.b32 %r28698, %r28697, %r28692; + shf.l.wrap.b32 %r28699, %r28698, %r28698, 24; + add.s32 %r28700, %r28699, %r28693; + xor.b32 %r28701, %r28700, %r28695; + shf.l.wrap.b32 %r28702, %r28701, %r28701, 25; + add.s32 %r28703, %r28655, %r27900; + add.s32 %r28704, %r28703, %r28646; + xor.b32 %r28705, %r28704, %r28671; + shf.l.wrap.b32 %r28706, %r28705, %r28705, 16; + add.s32 %r28707, 
%r28706, %r28686; + xor.b32 %r28708, %r28707, %r28646; + shf.l.wrap.b32 %r28709, %r28708, %r28708, 20; + add.s32 %r28710, %r28704, %r27845; + add.s32 %r28711, %r28710, %r28709; + xor.b32 %r28712, %r28711, %r28706; + shf.l.wrap.b32 %r28713, %r28712, %r28712, 24; + add.s32 %r28714, %r28713, %r28707; + xor.b32 %r28715, %r28714, %r28709; + shf.l.wrap.b32 %r28716, %r28715, %r28715, 25; + add.s32 %r28717, %r28669, %r27856; + add.s32 %r28718, %r28717, %r28660; + xor.b32 %r28719, %r28685, %r28718; + shf.l.wrap.b32 %r28720, %r28719, %r28719, 16; + add.s32 %r28721, %r28720, %r28644; + xor.b32 %r28722, %r28721, %r28660; + shf.l.wrap.b32 %r28723, %r28722, %r28722, 20; + add.s32 %r28724, %r28718, %r27944; + add.s32 %r28725, %r28724, %r28723; + xor.b32 %r28726, %r28725, %r28720; + shf.l.wrap.b32 %r28727, %r28726, %r28726, 24; + add.s32 %r28728, %r28727, %r28721; + xor.b32 %r28729, %r28728, %r28723; + shf.l.wrap.b32 %r28730, %r28729, %r28729, 25; + add.s32 %r28731, %r28683, %r27933; + add.s32 %r28732, %r28731, %r28674; + xor.b32 %r28733, %r28643, %r28732; + shf.l.wrap.b32 %r28734, %r28733, %r28733, 16; + add.s32 %r28735, %r28734, %r28658; + xor.b32 %r28736, %r28735, %r28674; + shf.l.wrap.b32 %r28737, %r28736, %r28736, 20; + add.s32 %r28738, %r28732, %r27911; + add.s32 %r28739, %r28738, %r28737; + xor.b32 %r28740, %r28739, %r28734; + shf.l.wrap.b32 %r28741, %r28740, %r28740, 24; + add.s32 %r28742, %r28741, %r28735; + xor.b32 %r28743, %r28742, %r28737; + shf.l.wrap.b32 %r28744, %r28743, %r28743, 25; + add.s32 %r28745, %r28697, %r27999; + add.s32 %r28746, %r28745, %r28716; + xor.b32 %r28747, %r28746, %r28741; + shf.l.wrap.b32 %r28748, %r28747, %r28747, 16; + add.s32 %r28749, %r28748, %r28728; + xor.b32 %r28750, %r28749, %r28716; + shf.l.wrap.b32 %r28751, %r28750, %r28750, 20; + add.s32 %r28752, %r28746, %r27955; + add.s32 %r28753, %r28752, %r28751; + xor.b32 %r28754, %r28753, %r28748; + shf.l.wrap.b32 %r28755, %r28754, %r28754, 24; + add.s32 %r28756, %r28755, %r28749; + xor.b32 %r28757, %r28756, %r28751; + shf.l.wrap.b32 %r28758, %r28757, %r28757, 25; + add.s32 %r28759, %r28730, %r27867; + add.s32 %r28760, %r28759, %r28711; + xor.b32 %r28761, %r28699, %r28760; + shf.l.wrap.b32 %r28762, %r28761, %r28761, 16; + add.s32 %r28763, %r28762, %r28742; + xor.b32 %r28764, %r28763, %r28730; + shf.l.wrap.b32 %r28765, %r28764, %r28764, 20; + add.s32 %r28766, %r28760, %r27977; + add.s32 %r28767, %r28766, %r28765; + xor.b32 %r28768, %r28767, %r28762; + shf.l.wrap.b32 %r28769, %r28768, %r28768, 24; + add.s32 %r28770, %r28769, %r28763; + xor.b32 %r28771, %r28770, %r28765; + shf.l.wrap.b32 %r28772, %r28771, %r28771, 25; + add.s32 %r28773, %r28725, %r27878; + add.s32 %r28774, %r28773, %r28744; + xor.b32 %r28775, %r28713, %r28774; + shf.l.wrap.b32 %r28776, %r28775, %r28775, 16; + add.s32 %r28777, %r28776, %r28700; + xor.b32 %r28778, %r28777, %r28744; + shf.l.wrap.b32 %r28779, %r28778, %r28778, 20; + add.s32 %r28780, %r28774, %r27889; + add.s32 %r28781, %r28780, %r28779; + xor.b32 %r28782, %r28781, %r28776; + shf.l.wrap.b32 %r28783, %r28782, %r28782, 24; + add.s32 %r28784, %r28783, %r28777; + xor.b32 %r28785, %r28784, %r28779; + shf.l.wrap.b32 %r28786, %r28785, %r28785, 25; + add.s32 %r28787, %r28739, %r27922; + add.s32 %r28788, %r28787, %r28702; + xor.b32 %r28789, %r28788, %r28727; + shf.l.wrap.b32 %r28790, %r28789, %r28789, 16; + add.s32 %r28791, %r28790, %r28714; + xor.b32 %r28792, %r28791, %r28702; + shf.l.wrap.b32 %r28793, %r28792, %r28792, 20; + add.s32 %r28794, %r28788, %r27988; + add.s32 %r28795, %r28794, %r28793; + 
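+ // Annotation: after the final round the two state halves are xored together
+ // (the BLAKE3 output rule cv[i] = v[i] ^ v[i+8]) and the eight 32-bit words
+ // are byte-packed into four 64-bit lanes (%rd353, %rd352, %rd1370, %rd1369),
+ // which $L__BB2_96..$L__BB2_103 below compare limb by limb against [target];
+ // only a hash strictly below the target reaches the atom.global.cas.b64 that
+ // publishes the winning nonce.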
xor.b32 %r28796, %r28795, %r28790; + shf.l.wrap.b32 %r28797, %r28796, %r28796, 24; + add.s32 %r28798, %r28797, %r28791; + xor.b32 %r28799, %r28798, %r28793; + shf.l.wrap.b32 %r28800, %r28799, %r28799, 25; + xor.b32 %r28801, %r28753, %r28784; + cvt.u64.u32 %rd1163, %r28801; + xor.b32 %r28802, %r28798, %r28767; + and.b32 %r28803, %r28802, 255; + cvt.u64.u32 %rd1164, %r28803; + cvt.u64.u32 %rd1165, %r28802; + shl.b64 %rd1166, %rd1165, 32; + and.b64 %rd1167, %rd1166, 280375465082880; + and.b64 %rd1168, %rd1166, 71776119061217280; + shr.u32 %r28804, %r28802, 24; + cvt.u64.u32 %rd1169, %r28804; + shl.b64 %rd1170, %rd1169, 56; + bfi.b64 %rd1171, %rd1164, %rd1163, 32, 32; + or.b64 %rd1172, %rd1171, %rd1167; + or.b64 %rd1173, %rd1172, %rd1168; + or.b64 %rd353, %rd1173, %rd1170; + xor.b32 %r28805, %r28756, %r28781; + cvt.u64.u32 %rd1174, %r28805; + xor.b32 %r28806, %r28795, %r28770; + and.b32 %r28807, %r28806, 255; + cvt.u64.u32 %rd1175, %r28807; + cvt.u64.u32 %rd1176, %r28806; + shl.b64 %rd1177, %rd1176, 32; + and.b64 %rd1178, %rd1177, 280375465082880; + and.b64 %rd1179, %rd1177, 71776119061217280; + shr.u32 %r28808, %r28806, 24; + cvt.u64.u32 %rd1180, %r28808; + shl.b64 %rd1181, %rd1180, 56; + bfi.b64 %rd1182, %rd1175, %rd1174, 32, 32; + or.b64 %rd1183, %rd1182, %rd1178; + or.b64 %rd1184, %rd1183, %rd1179; + or.b64 %rd352, %rd1184, %rd1181; + xor.b32 %r28809, %r28800, %r28769; + cvt.u64.u32 %rd1185, %r28809; + xor.b32 %r28810, %r28758, %r28783; + and.b32 %r28811, %r28810, 255; + cvt.u64.u32 %rd1186, %r28811; + cvt.u64.u32 %rd1187, %r28810; + shl.b64 %rd1188, %rd1187, 32; + and.b64 %rd1189, %rd1188, 280375465082880; + and.b64 %rd1190, %rd1188, 71776119061217280; + shr.u32 %r28812, %r28810, 24; + cvt.u64.u32 %rd1191, %r28812; + shl.b64 %rd1192, %rd1191, 56; + bfi.b64 %rd1193, %rd1186, %rd1185, 32, 32; + or.b64 %rd1194, %rd1193, %rd1189; + or.b64 %rd1195, %rd1194, %rd1190; + or.b64 %rd1370, %rd1195, %rd1192; + xor.b32 %r28813, %r28797, %r28772; + cvt.u64.u32 %rd1196, %r28813; + xor.b32 %r28814, %r28755, %r28786; + and.b32 %r28815, %r28814, 255; + cvt.u64.u32 %rd1197, %r28815; + cvt.u64.u32 %rd1198, %r28814; + shl.b64 %rd1199, %rd1198, 32; + and.b64 %rd1200, %rd1199, 280375465082880; + and.b64 %rd1201, %rd1199, 71776119061217280; + shr.u32 %r28816, %r28814, 24; + cvt.u64.u32 %rd1202, %r28816; + shl.b64 %rd1203, %rd1202, 56; + bfi.b64 %rd1204, %rd1197, %rd1196, 32, 32; + or.b64 %rd1205, %rd1204, %rd1200; + or.b64 %rd1206, %rd1205, %rd1201; + or.b64 %rd1369, %rd1206, %rd1203; + +$L__BB2_96: + ld.const.u64 %rd354, [target+24]; + setp.eq.s64 %p55, %rd1369, %rd354; + @%p55 bra $L__BB2_98; + bra.uni $L__BB2_97; + +$L__BB2_98: + ld.const.u64 %rd355, [target+16]; + setp.eq.s64 %p56, %rd1370, %rd355; + @%p56 bra $L__BB2_100; + bra.uni $L__BB2_99; + +$L__BB2_100: + ld.const.u64 %rd356, [target+8]; + setp.eq.s64 %p57, %rd352, %rd356; + @%p57 bra $L__BB2_102; + bra.uni $L__BB2_101; + +$L__BB2_102: + ld.const.u64 %rd1251, [target]; + setp.lt.u64 %p59, %rd353, %rd1251; + bra.uni $L__BB2_103; + +$L__BB2_97: + setp.lt.u64 %p59, %rd1369, %rd354; + bra.uni $L__BB2_103; + +$L__BB2_99: + setp.lt.u64 %p59, %rd1370, %rd355; + bra.uni $L__BB2_103; + +$L__BB2_101: + setp.lt.u64 %p59, %rd352, %rd356; + +$L__BB2_103: + not.pred %p58, %p59; + @%p58 bra $L__BB2_105; + + ld.param.u64 %rd1264, [heavy_hash_param_0]; + ld.param.u64 %rd1263, [heavy_hash_param_1]; + and.b64 %rd1262, %rd1299, %rd1264; + or.b64 %rd1261, %rd1262, %rd1263; + ld.param.u64 %rd1257, [heavy_hash_param_5]; + cvta.to.global.u64 %rd1252, %rd1257; + mov.u64 
%rd1253, 0; + atom.global.cas.b64 %rd1254, [%rd1252], %rd1253, %rd1261; + +$L__BB2_105: + ret; + +} + diff --git a/plugins/cuda/resources/karlsen-cuda-sm86.ptx b/plugins/cuda/resources/karlsen-cuda-sm86.ptx new file mode 100644 index 0000000..bdda387 --- /dev/null +++ b/plugins/cuda/resources/karlsen-cuda-sm86.ptx @@ -0,0 +1,42131 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-31833905 +// Cuda compilation tools, release 11.8, V11.8.89 +// Based on NVVM 7.0.1 +// + +.version 7.8 +.target sm_86 +.address_size 64 + +.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; +.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; +.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; +.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; +.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; +.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; +.const .align 8 .b8 target[32]; + +.func 
(.param .b64 func_retval0) _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh( + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3, + .param .b32 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5 +) +{ + .local .align 16 .b8 __local_depot0[224]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<28>; + .reg .b16 %rs<233>; + .reg .b32 %r<3965>; + .reg .b64 %rd<175>; + + + mov.u64 %SPL, __local_depot0; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs75, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4]; + ld.param.u64 %rd69, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd171, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + ld.param.u64 %rd71, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2]; + ld.param.u64 %rd165, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + ld.param.u64 %rd73, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd155, %rd73; + cvta.to.local.u64 %rd2, %rd71; + add.u64 %rd153, %SPL, 16; + add.u64 %rd149, %SP, 96; + cvta.to.local.u64 %rd4, %rd149; + setp.lt.u64 %p1, %rd171, 1025; + @%p1 bra $L__BB0_14; + bra.uni $L__BB0_1; + +$L__BB0_14: + add.u64 %rd162, %SPL, 0; + setp.ne.s64 %p16, %rd171, 1024; + mov.u64 %rd159, 0; + mov.u64 %rd151, %rd159; + @%p16 bra $L__BB0_16; + + mov.u64 %rd171, 0; + st.local.u64 [%rd162], %rd69; + mov.u64 %rd151, 1; + mov.u64 %rd159, 1024; + +$L__BB0_16: + setp.eq.s64 %p17, %rd151, 0; + @%p17 bra $L__BB0_21; + + or.b16 %rs1, %rs75, 1; + mov.u64 %rd163, %rd151; + +$L__BB0_18: + ld.local.u64 %rd166, [%rd162]; + ld.local.u8 %r1060, [%rd2]; + ld.local.u8 %r1061, [%rd2+1]; + prmt.b32 %r1062, %r1061, %r1060, 30212; + ld.local.u8 %r1063, [%rd2+2]; + ld.local.u8 %r1064, [%rd2+3]; + prmt.b32 %r1065, %r1064, %r1063, 30212; + prmt.b32 %r3948, %r1065, %r1062, 4180; + ld.local.u8 %r1066, [%rd2+4]; + ld.local.u8 %r1067, [%rd2+5]; + prmt.b32 %r1068, %r1067, %r1066, 30212; + ld.local.u8 %r1069, [%rd2+6]; + ld.local.u8 %r1070, [%rd2+7]; + prmt.b32 %r1071, %r1070, %r1069, 30212; + prmt.b32 %r3947, %r1071, %r1068, 4180; + ld.local.u8 %r1072, [%rd2+8]; + ld.local.u8 %r1073, [%rd2+9]; + prmt.b32 %r1074, %r1073, %r1072, 30212; + ld.local.u8 %r1075, [%rd2+10]; + ld.local.u8 %r1076, [%rd2+11]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + prmt.b32 %r3946, %r1077, %r1074, 4180; + ld.local.u8 %r1078, [%rd2+12]; + ld.local.u8 %r1079, [%rd2+13]; + prmt.b32 %r1080, %r1079, %r1078, 30212; + ld.local.u8 %r1081, [%rd2+14]; + ld.local.u8 %r1082, [%rd2+15]; + prmt.b32 %r1083, %r1082, %r1081, 30212; + prmt.b32 %r3945, %r1083, %r1080, 4180; + mov.u64 %rd167, 16; + ld.local.u8 %r1084, [%rd2+16]; + 
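+ // Annotation: the eight ld.local.u8/prmt groups in $L__BB0_18 assemble the
+ // 32-byte input chaining value into words %r3948..%r3941; the loads from
+ // [%rd2+16] onward cover its second half, bytes 16..31.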
ld.local.u8 %r1085, [%rd2+17]; + prmt.b32 %r1086, %r1085, %r1084, 30212; + ld.local.u8 %r1087, [%rd2+18]; + ld.local.u8 %r1088, [%rd2+19]; + prmt.b32 %r1089, %r1088, %r1087, 30212; + prmt.b32 %r3944, %r1089, %r1086, 4180; + ld.local.u8 %r1090, [%rd2+20]; + ld.local.u8 %r1091, [%rd2+21]; + prmt.b32 %r1092, %r1091, %r1090, 30212; + ld.local.u8 %r1093, [%rd2+22]; + ld.local.u8 %r1094, [%rd2+23]; + prmt.b32 %r1095, %r1094, %r1093, 30212; + prmt.b32 %r3943, %r1095, %r1092, 4180; + ld.local.u8 %r1096, [%rd2+24]; + ld.local.u8 %r1097, [%rd2+25]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd2+26]; + ld.local.u8 %r1100, [%rd2+27]; + prmt.b32 %r1101, %r1100, %r1099, 30212; + prmt.b32 %r3942, %r1101, %r1098, 4180; + ld.local.u8 %r1102, [%rd2+28]; + ld.local.u8 %r1103, [%rd2+29]; + prmt.b32 %r1104, %r1103, %r1102, 30212; + ld.local.u8 %r1105, [%rd2+30]; + ld.local.u8 %r1106, [%rd2+31]; + prmt.b32 %r1107, %r1106, %r1105, 30212; + prmt.b32 %r3941, %r1107, %r1104, 4180; + mov.u16 %rs197, %rs1; + +$L__BB0_19: + shr.u64 %rd143, %rd165, 32; + cvt.u32.u64 %r3940, %rd143; + cvt.u32.u64 %r3939, %rd165; + setp.eq.s64 %p18, %rd167, 1; + selp.b16 %rs79, 2, 0, %p18; + or.b16 %rs80, %rs79, %rs197; + ld.u8 %r1108, [%rd166]; + ld.u8 %r1109, [%rd166+1]; + prmt.b32 %r1110, %r1109, %r1108, 30212; + ld.u8 %r1111, [%rd166+2]; + prmt.b32 %r1112, %r1111, %r1110, 28756; + ld.u8 %r1113, [%rd166+3]; + prmt.b32 %r1114, %r1113, %r1112, 1620; + ld.u8 %r1115, [%rd166+4]; + ld.u8 %r1116, [%rd166+5]; + prmt.b32 %r1117, %r1116, %r1115, 30212; + ld.u8 %r1118, [%rd166+6]; + prmt.b32 %r1119, %r1118, %r1117, 28756; + ld.u8 %r1120, [%rd166+7]; + prmt.b32 %r1121, %r1120, %r1119, 1620; + ld.u8 %r1122, [%rd166+8]; + ld.u8 %r1123, [%rd166+9]; + prmt.b32 %r1124, %r1123, %r1122, 30212; + ld.u8 %r1125, [%rd166+10]; + prmt.b32 %r1126, %r1125, %r1124, 28756; + ld.u8 %r1127, [%rd166+11]; + prmt.b32 %r1128, %r1127, %r1126, 1620; + ld.u8 %r1129, [%rd166+12]; + ld.u8 %r1130, [%rd166+13]; + prmt.b32 %r1131, %r1130, %r1129, 30212; + ld.u8 %r1132, [%rd166+14]; + prmt.b32 %r1133, %r1132, %r1131, 28756; + ld.u8 %r1134, [%rd166+15]; + prmt.b32 %r1135, %r1134, %r1133, 1620; + ld.u8 %r1136, [%rd166+16]; + ld.u8 %r1137, [%rd166+17]; + prmt.b32 %r1138, %r1137, %r1136, 30212; + ld.u8 %r1139, [%rd166+18]; + prmt.b32 %r1140, %r1139, %r1138, 28756; + ld.u8 %r1141, [%rd166+19]; + prmt.b32 %r1142, %r1141, %r1140, 1620; + ld.u8 %r1143, [%rd166+20]; + ld.u8 %r1144, [%rd166+21]; + prmt.b32 %r1145, %r1144, %r1143, 30212; + ld.u8 %r1146, [%rd166+22]; + prmt.b32 %r1147, %r1146, %r1145, 28756; + ld.u8 %r1148, [%rd166+23]; + prmt.b32 %r1149, %r1148, %r1147, 1620; + ld.u8 %r1150, [%rd166+24]; + ld.u8 %r1151, [%rd166+25]; + prmt.b32 %r1152, %r1151, %r1150, 30212; + ld.u8 %r1153, [%rd166+26]; + prmt.b32 %r1154, %r1153, %r1152, 28756; + ld.u8 %r1155, [%rd166+27]; + prmt.b32 %r1156, %r1155, %r1154, 1620; + ld.u8 %r1157, [%rd166+28]; + ld.u8 %r1158, [%rd166+29]; + prmt.b32 %r1159, %r1158, %r1157, 30212; + ld.u8 %r1160, [%rd166+30]; + prmt.b32 %r1161, %r1160, %r1159, 28756; + ld.u8 %r1162, [%rd166+31]; + prmt.b32 %r1163, %r1162, %r1161, 1620; + ld.u8 %r1164, [%rd166+32]; + ld.u8 %r1165, [%rd166+33]; + prmt.b32 %r1166, %r1165, %r1164, 30212; + ld.u8 %r1167, [%rd166+34]; + prmt.b32 %r1168, %r1167, %r1166, 28756; + ld.u8 %r1169, [%rd166+35]; + prmt.b32 %r1170, %r1169, %r1168, 1620; + ld.u8 %r1171, [%rd166+36]; + ld.u8 %r1172, [%rd166+37]; + prmt.b32 %r1173, %r1172, %r1171, 30212; + ld.u8 %r1174, [%rd166+38]; + prmt.b32 %r1175, %r1174, %r1173, 28756; + ld.u8 
%r1176, [%rd166+39]; + prmt.b32 %r1177, %r1176, %r1175, 1620; + ld.u8 %r1178, [%rd166+40]; + ld.u8 %r1179, [%rd166+41]; + prmt.b32 %r1180, %r1179, %r1178, 30212; + ld.u8 %r1181, [%rd166+42]; + prmt.b32 %r1182, %r1181, %r1180, 28756; + ld.u8 %r1183, [%rd166+43]; + prmt.b32 %r1184, %r1183, %r1182, 1620; + ld.u8 %r1185, [%rd166+44]; + ld.u8 %r1186, [%rd166+45]; + prmt.b32 %r1187, %r1186, %r1185, 30212; + ld.u8 %r1188, [%rd166+46]; + prmt.b32 %r1189, %r1188, %r1187, 28756; + ld.u8 %r1190, [%rd166+47]; + prmt.b32 %r1191, %r1190, %r1189, 1620; + ld.u8 %r1192, [%rd166+48]; + ld.u8 %r1193, [%rd166+49]; + prmt.b32 %r1194, %r1193, %r1192, 30212; + ld.u8 %r1195, [%rd166+50]; + prmt.b32 %r1196, %r1195, %r1194, 28756; + ld.u8 %r1197, [%rd166+51]; + prmt.b32 %r1198, %r1197, %r1196, 1620; + ld.u8 %r1199, [%rd166+52]; + ld.u8 %r1200, [%rd166+53]; + prmt.b32 %r1201, %r1200, %r1199, 30212; + ld.u8 %r1202, [%rd166+54]; + prmt.b32 %r1203, %r1202, %r1201, 28756; + ld.u8 %r1204, [%rd166+55]; + prmt.b32 %r1205, %r1204, %r1203, 1620; + ld.u8 %r1206, [%rd166+56]; + ld.u8 %r1207, [%rd166+57]; + prmt.b32 %r1208, %r1207, %r1206, 30212; + ld.u8 %r1209, [%rd166+58]; + prmt.b32 %r1210, %r1209, %r1208, 28756; + ld.u8 %r1211, [%rd166+59]; + prmt.b32 %r1212, %r1211, %r1210, 1620; + ld.u8 %r1213, [%rd166+60]; + ld.u8 %r1214, [%rd166+61]; + prmt.b32 %r1215, %r1214, %r1213, 30212; + ld.u8 %r1216, [%rd166+62]; + prmt.b32 %r1217, %r1216, %r1215, 28756; + ld.u8 %r1218, [%rd166+63]; + prmt.b32 %r1219, %r1218, %r1217, 1620; + cvt.u32.u16 %r1220, %rs80; + and.b32 %r1221, %r1220, 255; + add.s32 %r1222, %r3944, %r3948; + add.s32 %r1223, %r1222, %r1114; + xor.b32 %r1224, %r1223, %r3939; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 16; + add.s32 %r1226, %r1225, 1779033703; + xor.b32 %r1227, %r1226, %r3944; + shf.l.wrap.b32 %r1228, %r1227, %r1227, 20; + add.s32 %r1229, %r1121, %r1223; + add.s32 %r1230, %r1229, %r1228; + xor.b32 %r1231, %r1230, %r1225; + shf.l.wrap.b32 %r1232, %r1231, %r1231, 24; + add.s32 %r1233, %r1232, %r1226; + xor.b32 %r1234, %r1233, %r1228; + shf.l.wrap.b32 %r1235, %r1234, %r1234, 25; + add.s32 %r1236, %r3943, %r3947; + add.s32 %r1237, %r1236, %r1128; + xor.b32 %r1238, %r1237, %r3940; + shf.l.wrap.b32 %r1239, %r1238, %r1238, 16; + add.s32 %r1240, %r1239, -1150833019; + xor.b32 %r1241, %r1240, %r3943; + shf.l.wrap.b32 %r1242, %r1241, %r1241, 20; + add.s32 %r1243, %r1135, %r1237; + add.s32 %r1244, %r1243, %r1242; + xor.b32 %r1245, %r1244, %r1239; + shf.l.wrap.b32 %r1246, %r1245, %r1245, 24; + add.s32 %r1247, %r1246, %r1240; + xor.b32 %r1248, %r1247, %r1242; + shf.l.wrap.b32 %r1249, %r1248, %r1248, 25; + add.s32 %r1250, %r3942, %r3946; + add.s32 %r1251, %r1250, %r1142; + shr.u32 %r1252, %r1251, 16; + shl.b32 %r1253, %r1251, 16; + xor.b32 %r1254, %r1253, 4194304; + or.b32 %r1255, %r1254, %r1252; + add.s32 %r1256, %r1255, 1013904242; + xor.b32 %r1257, %r1256, %r3942; + shf.l.wrap.b32 %r1258, %r1257, %r1257, 20; + add.s32 %r1259, %r1149, %r1251; + add.s32 %r1260, %r1259, %r1258; + xor.b32 %r1261, %r1260, %r1255; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 24; + add.s32 %r1263, %r1262, %r1256; + xor.b32 %r1264, %r1263, %r1258; + shf.l.wrap.b32 %r1265, %r1264, %r1264, 25; + add.s32 %r1266, %r3941, %r3945; + add.s32 %r1267, %r1266, %r1156; + xor.b32 %r1268, %r1267, %r1221; + shr.u32 %r1269, %r1267, 16; + shl.b32 %r1270, %r1268, 16; + or.b32 %r1271, %r1270, %r1269; + add.s32 %r1272, %r1271, -1521486534; + xor.b32 %r1273, %r1272, %r3941; + shf.l.wrap.b32 %r1274, %r1273, %r1273, 20; + add.s32 %r1275, %r1163, %r1267; + add.s32 %r1276, 
%r1275, %r1274; + xor.b32 %r1277, %r1276, %r1271; + shf.l.wrap.b32 %r1278, %r1277, %r1277, 24; + add.s32 %r1279, %r1278, %r1272; + xor.b32 %r1280, %r1279, %r1274; + shf.l.wrap.b32 %r1281, %r1280, %r1280, 25; + add.s32 %r1282, %r1249, %r1230; + add.s32 %r1283, %r1282, %r1170; + xor.b32 %r1284, %r1278, %r1283; + shf.l.wrap.b32 %r1285, %r1284, %r1284, 16; + add.s32 %r1286, %r1285, %r1263; + xor.b32 %r1287, %r1286, %r1249; + shf.l.wrap.b32 %r1288, %r1287, %r1287, 20; + add.s32 %r1289, %r1177, %r1283; + add.s32 %r1290, %r1289, %r1288; + xor.b32 %r1291, %r1290, %r1285; + shf.l.wrap.b32 %r1292, %r1291, %r1291, 24; + add.s32 %r1293, %r1292, %r1286; + xor.b32 %r1294, %r1293, %r1288; + shf.l.wrap.b32 %r1295, %r1294, %r1294, 25; + add.s32 %r1296, %r1265, %r1244; + add.s32 %r1297, %r1296, %r1184; + xor.b32 %r1298, %r1297, %r1232; + shf.l.wrap.b32 %r1299, %r1298, %r1298, 16; + add.s32 %r1300, %r1299, %r1279; + xor.b32 %r1301, %r1300, %r1265; + shf.l.wrap.b32 %r1302, %r1301, %r1301, 20; + add.s32 %r1303, %r1191, %r1297; + add.s32 %r1304, %r1303, %r1302; + xor.b32 %r1305, %r1304, %r1299; + shf.l.wrap.b32 %r1306, %r1305, %r1305, 24; + add.s32 %r1307, %r1306, %r1300; + xor.b32 %r1308, %r1307, %r1302; + shf.l.wrap.b32 %r1309, %r1308, %r1308, 25; + add.s32 %r1310, %r1281, %r1260; + add.s32 %r1311, %r1310, %r1198; + xor.b32 %r1312, %r1311, %r1246; + shf.l.wrap.b32 %r1313, %r1312, %r1312, 16; + add.s32 %r1314, %r1313, %r1233; + xor.b32 %r1315, %r1314, %r1281; + shf.l.wrap.b32 %r1316, %r1315, %r1315, 20; + add.s32 %r1317, %r1205, %r1311; + add.s32 %r1318, %r1317, %r1316; + xor.b32 %r1319, %r1318, %r1313; + shf.l.wrap.b32 %r1320, %r1319, %r1319, 24; + add.s32 %r1321, %r1320, %r1314; + xor.b32 %r1322, %r1321, %r1316; + shf.l.wrap.b32 %r1323, %r1322, %r1322, 25; + add.s32 %r1324, %r1276, %r1235; + add.s32 %r1325, %r1324, %r1212; + xor.b32 %r1326, %r1325, %r1262; + shf.l.wrap.b32 %r1327, %r1326, %r1326, 16; + add.s32 %r1328, %r1327, %r1247; + xor.b32 %r1329, %r1328, %r1235; + shf.l.wrap.b32 %r1330, %r1329, %r1329, 20; + add.s32 %r1331, %r1219, %r1325; + add.s32 %r1332, %r1331, %r1330; + xor.b32 %r1333, %r1332, %r1327; + shf.l.wrap.b32 %r1334, %r1333, %r1333, 24; + add.s32 %r1335, %r1334, %r1328; + xor.b32 %r1336, %r1335, %r1330; + shf.l.wrap.b32 %r1337, %r1336, %r1336, 25; + add.s32 %r1338, %r1290, %r1128; + add.s32 %r1339, %r1338, %r1337; + xor.b32 %r1340, %r1339, %r1306; + shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; + add.s32 %r1342, %r1341, %r1321; + xor.b32 %r1343, %r1342, %r1337; + shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; + add.s32 %r1345, %r1339, %r1156; + add.s32 %r1346, %r1345, %r1344; + xor.b32 %r1347, %r1346, %r1341; + shf.l.wrap.b32 %r1348, %r1347, %r1347, 24; + add.s32 %r1349, %r1348, %r1342; + xor.b32 %r1350, %r1349, %r1344; + shf.l.wrap.b32 %r1351, %r1350, %r1350, 25; + add.s32 %r1352, %r1304, %r1135; + add.s32 %r1353, %r1352, %r1295; + xor.b32 %r1354, %r1320, %r1353; + shf.l.wrap.b32 %r1355, %r1354, %r1354, 16; + add.s32 %r1356, %r1335, %r1355; + xor.b32 %r1357, %r1356, %r1295; + shf.l.wrap.b32 %r1358, %r1357, %r1357, 20; + add.s32 %r1359, %r1353, %r1184; + add.s32 %r1360, %r1359, %r1358; + xor.b32 %r1361, %r1360, %r1355; + shf.l.wrap.b32 %r1362, %r1361, %r1361, 24; + add.s32 %r1363, %r1362, %r1356; + xor.b32 %r1364, %r1363, %r1358; + shf.l.wrap.b32 %r1365, %r1364, %r1364, 25; + add.s32 %r1366, %r1309, %r1163; + add.s32 %r1367, %r1366, %r1318; + xor.b32 %r1368, %r1334, %r1367; + shf.l.wrap.b32 %r1369, %r1368, %r1368, 16; + add.s32 %r1370, %r1369, %r1293; + xor.b32 %r1371, %r1370, %r1309; + 
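+ // Annotation: fully unrolled BLAKE3 compression of one 64-byte block, the
+ // same structure as the kernel body earlier in this patch; %rs80 above holds
+ // what appear to be the block flags (CHUNK_START or'd in as 1, CHUNK_END
+ // selected as 2 on the chunk's last block).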
shf.l.wrap.b32 %r1372, %r1371, %r1371, 20; + add.s32 %r1373, %r1367, %r1114; + add.s32 %r1374, %r1373, %r1372; + xor.b32 %r1375, %r1374, %r1369; + shf.l.wrap.b32 %r1376, %r1375, %r1375, 24; + add.s32 %r1377, %r1376, %r1370; + xor.b32 %r1378, %r1377, %r1372; + shf.l.wrap.b32 %r1379, %r1378, %r1378, 25; + add.s32 %r1380, %r1323, %r1142; + add.s32 %r1381, %r1380, %r1332; + xor.b32 %r1382, %r1381, %r1292; + shf.l.wrap.b32 %r1383, %r1382, %r1382, 16; + add.s32 %r1384, %r1383, %r1307; + xor.b32 %r1385, %r1384, %r1323; + shf.l.wrap.b32 %r1386, %r1385, %r1385, 20; + add.s32 %r1387, %r1381, %r1205; + add.s32 %r1388, %r1387, %r1386; + xor.b32 %r1389, %r1388, %r1383; + shf.l.wrap.b32 %r1390, %r1389, %r1389, 24; + add.s32 %r1391, %r1390, %r1384; + xor.b32 %r1392, %r1391, %r1386; + shf.l.wrap.b32 %r1393, %r1392, %r1392, 25; + add.s32 %r1394, %r1365, %r1121; + add.s32 %r1395, %r1394, %r1346; + xor.b32 %r1396, %r1395, %r1390; + shf.l.wrap.b32 %r1397, %r1396, %r1396, 16; + add.s32 %r1398, %r1397, %r1377; + xor.b32 %r1399, %r1398, %r1365; + shf.l.wrap.b32 %r1400, %r1399, %r1399, 20; + add.s32 %r1401, %r1395, %r1191; + add.s32 %r1402, %r1401, %r1400; + xor.b32 %r1403, %r1402, %r1397; + shf.l.wrap.b32 %r1404, %r1403, %r1403, 24; + add.s32 %r1405, %r1404, %r1398; + xor.b32 %r1406, %r1405, %r1400; + shf.l.wrap.b32 %r1407, %r1406, %r1406, 25; + add.s32 %r1408, %r1360, %r1198; + add.s32 %r1409, %r1408, %r1379; + xor.b32 %r1410, %r1348, %r1409; + shf.l.wrap.b32 %r1411, %r1410, %r1410, 16; + add.s32 %r1412, %r1411, %r1391; + xor.b32 %r1413, %r1412, %r1379; + shf.l.wrap.b32 %r1414, %r1413, %r1413, 20; + add.s32 %r1415, %r1409, %r1149; + add.s32 %r1416, %r1415, %r1414; + xor.b32 %r1417, %r1416, %r1411; + shf.l.wrap.b32 %r1418, %r1417, %r1417, 24; + add.s32 %r1419, %r1418, %r1412; + xor.b32 %r1420, %r1419, %r1414; + shf.l.wrap.b32 %r1421, %r1420, %r1420, 25; + add.s32 %r1422, %r1374, %r1177; + add.s32 %r1423, %r1422, %r1393; + xor.b32 %r1424, %r1423, %r1362; + shf.l.wrap.b32 %r1425, %r1424, %r1424, 16; + add.s32 %r1426, %r1425, %r1349; + xor.b32 %r1427, %r1426, %r1393; + shf.l.wrap.b32 %r1428, %r1427, %r1427, 20; + add.s32 %r1429, %r1423, %r1212; + add.s32 %r1430, %r1429, %r1428; + xor.b32 %r1431, %r1430, %r1425; + shf.l.wrap.b32 %r1432, %r1431, %r1431, 24; + add.s32 %r1433, %r1432, %r1426; + xor.b32 %r1434, %r1433, %r1428; + shf.l.wrap.b32 %r1435, %r1434, %r1434, 25; + add.s32 %r1436, %r1388, %r1219; + add.s32 %r1437, %r1436, %r1351; + xor.b32 %r1438, %r1437, %r1376; + shf.l.wrap.b32 %r1439, %r1438, %r1438, 16; + add.s32 %r1440, %r1439, %r1363; + xor.b32 %r1441, %r1440, %r1351; + shf.l.wrap.b32 %r1442, %r1441, %r1441, 20; + add.s32 %r1443, %r1437, %r1170; + add.s32 %r1444, %r1443, %r1442; + xor.b32 %r1445, %r1444, %r1439; + shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; + add.s32 %r1447, %r1446, %r1440; + xor.b32 %r1448, %r1447, %r1442; + shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; + add.s32 %r1450, %r1402, %r1135; + add.s32 %r1451, %r1450, %r1449; + xor.b32 %r1452, %r1451, %r1418; + shf.l.wrap.b32 %r1453, %r1452, %r1452, 16; + add.s32 %r1454, %r1453, %r1433; + xor.b32 %r1455, %r1454, %r1449; + shf.l.wrap.b32 %r1456, %r1455, %r1455, 20; + add.s32 %r1457, %r1451, %r1142; + add.s32 %r1458, %r1457, %r1456; + xor.b32 %r1459, %r1458, %r1453; + shf.l.wrap.b32 %r1460, %r1459, %r1459, 24; + add.s32 %r1461, %r1460, %r1454; + xor.b32 %r1462, %r1461, %r1456; + shf.l.wrap.b32 %r1463, %r1462, %r1462, 25; + add.s32 %r1464, %r1416, %r1184; + add.s32 %r1465, %r1464, %r1407; + xor.b32 %r1466, %r1465, %r1432; + shf.l.wrap.b32 %r1467, 
%r1466, %r1466, 16; + add.s32 %r1468, %r1467, %r1447; + xor.b32 %r1469, %r1468, %r1407; + shf.l.wrap.b32 %r1470, %r1469, %r1469, 20; + add.s32 %r1471, %r1465, %r1198; + add.s32 %r1472, %r1471, %r1470; + xor.b32 %r1473, %r1472, %r1467; + shf.l.wrap.b32 %r1474, %r1473, %r1473, 24; + add.s32 %r1475, %r1474, %r1468; + xor.b32 %r1476, %r1475, %r1470; + shf.l.wrap.b32 %r1477, %r1476, %r1476, 25; + add.s32 %r1478, %r1430, %r1205; + add.s32 %r1479, %r1478, %r1421; + xor.b32 %r1480, %r1446, %r1479; + shf.l.wrap.b32 %r1481, %r1480, %r1480, 16; + add.s32 %r1482, %r1481, %r1405; + xor.b32 %r1483, %r1482, %r1421; + shf.l.wrap.b32 %r1484, %r1483, %r1483, 20; + add.s32 %r1485, %r1479, %r1128; + add.s32 %r1486, %r1485, %r1484; + xor.b32 %r1487, %r1486, %r1481; + shf.l.wrap.b32 %r1488, %r1487, %r1487, 24; + add.s32 %r1489, %r1488, %r1482; + xor.b32 %r1490, %r1489, %r1484; + shf.l.wrap.b32 %r1491, %r1490, %r1490, 25; + add.s32 %r1492, %r1435, %r1163; + add.s32 %r1493, %r1492, %r1444; + xor.b32 %r1494, %r1493, %r1404; + shf.l.wrap.b32 %r1495, %r1494, %r1494, 16; + add.s32 %r1496, %r1495, %r1419; + xor.b32 %r1497, %r1496, %r1435; + shf.l.wrap.b32 %r1498, %r1497, %r1497, 20; + add.s32 %r1499, %r1493, %r1212; + add.s32 %r1500, %r1499, %r1498; + xor.b32 %r1501, %r1500, %r1495; + shf.l.wrap.b32 %r1502, %r1501, %r1501, 24; + add.s32 %r1503, %r1502, %r1496; + xor.b32 %r1504, %r1503, %r1498; + shf.l.wrap.b32 %r1505, %r1504, %r1504, 25; + add.s32 %r1506, %r1477, %r1156; + add.s32 %r1507, %r1506, %r1458; + xor.b32 %r1508, %r1507, %r1502; + shf.l.wrap.b32 %r1509, %r1508, %r1508, 16; + add.s32 %r1510, %r1509, %r1489; + xor.b32 %r1511, %r1510, %r1477; + shf.l.wrap.b32 %r1512, %r1511, %r1511, 20; + add.s32 %r1513, %r1507, %r1149; + add.s32 %r1514, %r1513, %r1512; + xor.b32 %r1515, %r1514, %r1509; + shf.l.wrap.b32 %r1516, %r1515, %r1515, 24; + add.s32 %r1517, %r1516, %r1510; + xor.b32 %r1518, %r1517, %r1512; + shf.l.wrap.b32 %r1519, %r1518, %r1518, 25; + add.s32 %r1520, %r1472, %r1177; + add.s32 %r1521, %r1520, %r1491; + xor.b32 %r1522, %r1460, %r1521; + shf.l.wrap.b32 %r1523, %r1522, %r1522, 16; + add.s32 %r1524, %r1523, %r1503; + xor.b32 %r1525, %r1524, %r1491; + shf.l.wrap.b32 %r1526, %r1525, %r1525, 20; + add.s32 %r1527, %r1521, %r1114; + add.s32 %r1528, %r1527, %r1526; + xor.b32 %r1529, %r1528, %r1523; + shf.l.wrap.b32 %r1530, %r1529, %r1529, 24; + add.s32 %r1531, %r1530, %r1524; + xor.b32 %r1532, %r1531, %r1526; + shf.l.wrap.b32 %r1533, %r1532, %r1532, 25; + add.s32 %r1534, %r1486, %r1191; + add.s32 %r1535, %r1534, %r1505; + xor.b32 %r1536, %r1535, %r1474; + shf.l.wrap.b32 %r1537, %r1536, %r1536, 16; + add.s32 %r1538, %r1537, %r1461; + xor.b32 %r1539, %r1538, %r1505; + shf.l.wrap.b32 %r1540, %r1539, %r1539, 20; + add.s32 %r1541, %r1535, %r1219; + add.s32 %r1542, %r1541, %r1540; + xor.b32 %r1543, %r1542, %r1537; + shf.l.wrap.b32 %r1544, %r1543, %r1543, 24; + add.s32 %r1545, %r1544, %r1538; + xor.b32 %r1546, %r1545, %r1540; + shf.l.wrap.b32 %r1547, %r1546, %r1546, 25; + add.s32 %r1548, %r1500, %r1170; + add.s32 %r1549, %r1548, %r1463; + xor.b32 %r1550, %r1549, %r1488; + shf.l.wrap.b32 %r1551, %r1550, %r1550, 16; + add.s32 %r1552, %r1551, %r1475; + xor.b32 %r1553, %r1552, %r1463; + shf.l.wrap.b32 %r1554, %r1553, %r1553, 20; + add.s32 %r1555, %r1549, %r1121; + add.s32 %r1556, %r1555, %r1554; + xor.b32 %r1557, %r1556, %r1551; + shf.l.wrap.b32 %r1558, %r1557, %r1557, 24; + add.s32 %r1559, %r1558, %r1552; + xor.b32 %r1560, %r1559, %r1554; + shf.l.wrap.b32 %r1561, %r1560, %r1560, 25; + add.s32 %r1562, %r1514, %r1184; + 
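+ // Annotation: remaining rounds — identical G-function structure with each
+ // round's message-word permutation applied.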
add.s32 %r1563, %r1562, %r1561; + xor.b32 %r1564, %r1563, %r1530; + shf.l.wrap.b32 %r1565, %r1564, %r1564, 16; + add.s32 %r1566, %r1565, %r1545; + xor.b32 %r1567, %r1566, %r1561; + shf.l.wrap.b32 %r1568, %r1567, %r1567, 20; + add.s32 %r1569, %r1563, %r1163; + add.s32 %r1570, %r1569, %r1568; + xor.b32 %r1571, %r1570, %r1565; + shf.l.wrap.b32 %r1572, %r1571, %r1571, 24; + add.s32 %r1573, %r1572, %r1566; + xor.b32 %r1574, %r1573, %r1568; + shf.l.wrap.b32 %r1575, %r1574, %r1574, 25; + add.s32 %r1576, %r1528, %r1198; + add.s32 %r1577, %r1576, %r1519; + xor.b32 %r1578, %r1577, %r1544; + shf.l.wrap.b32 %r1579, %r1578, %r1578, 16; + add.s32 %r1580, %r1579, %r1559; + xor.b32 %r1581, %r1580, %r1519; + shf.l.wrap.b32 %r1582, %r1581, %r1581, 20; + add.s32 %r1583, %r1577, %r1177; + add.s32 %r1584, %r1583, %r1582; + xor.b32 %r1585, %r1584, %r1579; + shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; + add.s32 %r1587, %r1586, %r1580; + xor.b32 %r1588, %r1587, %r1582; + shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; + add.s32 %r1590, %r1542, %r1212; + add.s32 %r1591, %r1590, %r1533; + xor.b32 %r1592, %r1558, %r1591; + shf.l.wrap.b32 %r1593, %r1592, %r1592, 16; + add.s32 %r1594, %r1593, %r1517; + xor.b32 %r1595, %r1594, %r1533; + shf.l.wrap.b32 %r1596, %r1595, %r1595, 20; + add.s32 %r1597, %r1591, %r1135; + add.s32 %r1598, %r1597, %r1596; + xor.b32 %r1599, %r1598, %r1593; + shf.l.wrap.b32 %r1600, %r1599, %r1599, 24; + add.s32 %r1601, %r1600, %r1594; + xor.b32 %r1602, %r1601, %r1596; + shf.l.wrap.b32 %r1603, %r1602, %r1602, 25; + add.s32 %r1604, %r1547, %r1205; + add.s32 %r1605, %r1604, %r1556; + xor.b32 %r1606, %r1605, %r1516; + shf.l.wrap.b32 %r1607, %r1606, %r1606, 16; + add.s32 %r1608, %r1607, %r1531; + xor.b32 %r1609, %r1608, %r1547; + shf.l.wrap.b32 %r1610, %r1609, %r1609, 20; + add.s32 %r1611, %r1605, %r1219; + add.s32 %r1612, %r1611, %r1610; + xor.b32 %r1613, %r1612, %r1607; + shf.l.wrap.b32 %r1614, %r1613, %r1613, 24; + add.s32 %r1615, %r1614, %r1608; + xor.b32 %r1616, %r1615, %r1610; + shf.l.wrap.b32 %r1617, %r1616, %r1616, 25; + add.s32 %r1618, %r1589, %r1142; + add.s32 %r1619, %r1618, %r1570; + xor.b32 %r1620, %r1619, %r1614; + shf.l.wrap.b32 %r1621, %r1620, %r1620, 16; + add.s32 %r1622, %r1621, %r1601; + xor.b32 %r1623, %r1622, %r1589; + shf.l.wrap.b32 %r1624, %r1623, %r1623, 20; + add.s32 %r1625, %r1619, %r1114; + add.s32 %r1626, %r1625, %r1624; + xor.b32 %r1627, %r1626, %r1621; + shf.l.wrap.b32 %r1628, %r1627, %r1627, 24; + add.s32 %r1629, %r1628, %r1622; + xor.b32 %r1630, %r1629, %r1624; + shf.l.wrap.b32 %r1631, %r1630, %r1630, 25; + add.s32 %r1632, %r1584, %r1191; + add.s32 %r1633, %r1632, %r1603; + xor.b32 %r1634, %r1572, %r1633; + shf.l.wrap.b32 %r1635, %r1634, %r1634, 16; + add.s32 %r1636, %r1635, %r1615; + xor.b32 %r1637, %r1636, %r1603; + shf.l.wrap.b32 %r1638, %r1637, %r1637, 20; + add.s32 %r1639, %r1633, %r1128; + add.s32 %r1640, %r1639, %r1638; + xor.b32 %r1641, %r1640, %r1635; + shf.l.wrap.b32 %r1642, %r1641, %r1641, 24; + add.s32 %r1643, %r1642, %r1636; + xor.b32 %r1644, %r1643, %r1638; + shf.l.wrap.b32 %r1645, %r1644, %r1644, 25; + add.s32 %r1646, %r1598, %r1149; + add.s32 %r1647, %r1646, %r1617; + xor.b32 %r1648, %r1647, %r1586; + shf.l.wrap.b32 %r1649, %r1648, %r1648, 16; + add.s32 %r1650, %r1649, %r1573; + xor.b32 %r1651, %r1650, %r1617; + shf.l.wrap.b32 %r1652, %r1651, %r1651, 20; + add.s32 %r1653, %r1647, %r1170; + add.s32 %r1654, %r1653, %r1652; + xor.b32 %r1655, %r1654, %r1649; + shf.l.wrap.b32 %r1656, %r1655, %r1655, 24; + add.s32 %r1657, %r1656, %r1650; + xor.b32 %r1658, %r1657, 
%r1652; + shf.l.wrap.b32 %r1659, %r1658, %r1658, 25; + add.s32 %r1660, %r1612, %r1121; + add.s32 %r1661, %r1660, %r1575; + xor.b32 %r1662, %r1661, %r1600; + shf.l.wrap.b32 %r1663, %r1662, %r1662, 16; + add.s32 %r1664, %r1663, %r1587; + xor.b32 %r1665, %r1664, %r1575; + shf.l.wrap.b32 %r1666, %r1665, %r1665, 20; + add.s32 %r1667, %r1661, %r1156; + add.s32 %r1668, %r1667, %r1666; + xor.b32 %r1669, %r1668, %r1663; + shf.l.wrap.b32 %r1670, %r1669, %r1669, 24; + add.s32 %r1671, %r1670, %r1664; + xor.b32 %r1672, %r1671, %r1666; + shf.l.wrap.b32 %r1673, %r1672, %r1672, 25; + add.s32 %r1674, %r1626, %r1198; + add.s32 %r1675, %r1674, %r1673; + xor.b32 %r1676, %r1675, %r1642; + shf.l.wrap.b32 %r1677, %r1676, %r1676, 16; + add.s32 %r1678, %r1677, %r1657; + xor.b32 %r1679, %r1678, %r1673; + shf.l.wrap.b32 %r1680, %r1679, %r1679, 20; + add.s32 %r1681, %r1675, %r1205; + add.s32 %r1682, %r1681, %r1680; + xor.b32 %r1683, %r1682, %r1677; + shf.l.wrap.b32 %r1684, %r1683, %r1683, 24; + add.s32 %r1685, %r1684, %r1678; + xor.b32 %r1686, %r1685, %r1680; + shf.l.wrap.b32 %r1687, %r1686, %r1686, 25; + add.s32 %r1688, %r1640, %r1177; + add.s32 %r1689, %r1688, %r1631; + xor.b32 %r1690, %r1689, %r1656; + shf.l.wrap.b32 %r1691, %r1690, %r1690, 16; + add.s32 %r1692, %r1691, %r1671; + xor.b32 %r1693, %r1692, %r1631; + shf.l.wrap.b32 %r1694, %r1693, %r1693, 20; + add.s32 %r1695, %r1689, %r1191; + add.s32 %r1696, %r1695, %r1694; + xor.b32 %r1697, %r1696, %r1691; + shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; + add.s32 %r1699, %r1698, %r1692; + xor.b32 %r1700, %r1699, %r1694; + shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; + add.s32 %r1702, %r1654, %r1219; + add.s32 %r1703, %r1702, %r1645; + xor.b32 %r1704, %r1670, %r1703; + shf.l.wrap.b32 %r1705, %r1704, %r1704, 16; + add.s32 %r1706, %r1705, %r1629; + xor.b32 %r1707, %r1706, %r1645; + shf.l.wrap.b32 %r1708, %r1707, %r1707, 20; + add.s32 %r1709, %r1703, %r1184; + add.s32 %r1710, %r1709, %r1708; + xor.b32 %r1711, %r1710, %r1705; + shf.l.wrap.b32 %r1712, %r1711, %r1711, 24; + add.s32 %r1713, %r1712, %r1706; + xor.b32 %r1714, %r1713, %r1708; + shf.l.wrap.b32 %r1715, %r1714, %r1714, 25; + add.s32 %r1716, %r1659, %r1212; + add.s32 %r1717, %r1716, %r1668; + xor.b32 %r1718, %r1717, %r1628; + shf.l.wrap.b32 %r1719, %r1718, %r1718, 16; + add.s32 %r1720, %r1719, %r1643; + xor.b32 %r1721, %r1720, %r1659; + shf.l.wrap.b32 %r1722, %r1721, %r1721, 20; + add.s32 %r1723, %r1717, %r1170; + add.s32 %r1724, %r1723, %r1722; + xor.b32 %r1725, %r1724, %r1719; + shf.l.wrap.b32 %r1726, %r1725, %r1725, 24; + add.s32 %r1727, %r1726, %r1720; + xor.b32 %r1728, %r1727, %r1722; + shf.l.wrap.b32 %r1729, %r1728, %r1728, 25; + add.s32 %r1730, %r1701, %r1163; + add.s32 %r1731, %r1730, %r1682; + xor.b32 %r1732, %r1731, %r1726; + shf.l.wrap.b32 %r1733, %r1732, %r1732, 16; + add.s32 %r1734, %r1733, %r1713; + xor.b32 %r1735, %r1734, %r1701; + shf.l.wrap.b32 %r1736, %r1735, %r1735, 20; + add.s32 %r1737, %r1731, %r1128; + add.s32 %r1738, %r1737, %r1736; + xor.b32 %r1739, %r1738, %r1733; + shf.l.wrap.b32 %r1740, %r1739, %r1739, 24; + add.s32 %r1741, %r1740, %r1734; + xor.b32 %r1742, %r1741, %r1736; + shf.l.wrap.b32 %r1743, %r1742, %r1742, 25; + add.s32 %r1744, %r1696, %r1149; + add.s32 %r1745, %r1744, %r1715; + xor.b32 %r1746, %r1684, %r1745; + shf.l.wrap.b32 %r1747, %r1746, %r1746, 16; + add.s32 %r1748, %r1747, %r1727; + xor.b32 %r1749, %r1748, %r1715; + shf.l.wrap.b32 %r1750, %r1749, %r1749, 20; + add.s32 %r1751, %r1745, %r1135; + add.s32 %r1752, %r1751, %r1750; + xor.b32 %r1753, %r1752, %r1747; + shf.l.wrap.b32 
%r1754, %r1753, %r1753, 24; + add.s32 %r1755, %r1754, %r1748; + xor.b32 %r1756, %r1755, %r1750; + shf.l.wrap.b32 %r1757, %r1756, %r1756, 25; + add.s32 %r1758, %r1710, %r1114; + add.s32 %r1759, %r1758, %r1729; + xor.b32 %r1760, %r1759, %r1698; + shf.l.wrap.b32 %r1761, %r1760, %r1760, 16; + add.s32 %r1762, %r1761, %r1685; + xor.b32 %r1763, %r1762, %r1729; + shf.l.wrap.b32 %r1764, %r1763, %r1763, 20; + add.s32 %r1765, %r1759, %r1121; + add.s32 %r1766, %r1765, %r1764; + xor.b32 %r1767, %r1766, %r1761; + shf.l.wrap.b32 %r1768, %r1767, %r1767, 24; + add.s32 %r1769, %r1768, %r1762; + xor.b32 %r1770, %r1769, %r1764; + shf.l.wrap.b32 %r1771, %r1770, %r1770, 25; + add.s32 %r1772, %r1724, %r1156; + add.s32 %r1773, %r1772, %r1687; + xor.b32 %r1774, %r1773, %r1712; + shf.l.wrap.b32 %r1775, %r1774, %r1774, 16; + add.s32 %r1776, %r1775, %r1699; + xor.b32 %r1777, %r1776, %r1687; + shf.l.wrap.b32 %r1778, %r1777, %r1777, 20; + add.s32 %r1779, %r1773, %r1142; + add.s32 %r1780, %r1779, %r1778; + xor.b32 %r1781, %r1780, %r1775; + shf.l.wrap.b32 %r1782, %r1781, %r1781, 24; + add.s32 %r1783, %r1782, %r1776; + xor.b32 %r1784, %r1783, %r1778; + shf.l.wrap.b32 %r1785, %r1784, %r1784, 25; + add.s32 %r1786, %r1738, %r1177; + add.s32 %r1787, %r1786, %r1785; + xor.b32 %r1788, %r1787, %r1754; + shf.l.wrap.b32 %r1789, %r1788, %r1788, 16; + add.s32 %r1790, %r1789, %r1769; + xor.b32 %r1791, %r1790, %r1785; + shf.l.wrap.b32 %r1792, %r1791, %r1791, 20; + add.s32 %r1793, %r1787, %r1212; + add.s32 %r1794, %r1793, %r1792; + xor.b32 %r1795, %r1794, %r1789; + shf.l.wrap.b32 %r1796, %r1795, %r1795, 24; + add.s32 %r1797, %r1796, %r1790; + xor.b32 %r1798, %r1797, %r1792; + shf.l.wrap.b32 %r1799, %r1798, %r1798, 25; + add.s32 %r1800, %r1752, %r1191; + add.s32 %r1801, %r1800, %r1743; + xor.b32 %r1802, %r1801, %r1768; + shf.l.wrap.b32 %r1803, %r1802, %r1802, 16; + add.s32 %r1804, %r1803, %r1783; + xor.b32 %r1805, %r1804, %r1743; + shf.l.wrap.b32 %r1806, %r1805, %r1805, 20; + add.s32 %r1807, %r1801, %r1149; + add.s32 %r1808, %r1807, %r1806; + xor.b32 %r1809, %r1808, %r1803; + shf.l.wrap.b32 %r1810, %r1809, %r1809, 24; + add.s32 %r1811, %r1810, %r1804; + xor.b32 %r1812, %r1811, %r1806; + shf.l.wrap.b32 %r1813, %r1812, %r1812, 25; + add.s32 %r1814, %r1766, %r1170; + add.s32 %r1815, %r1814, %r1757; + xor.b32 %r1816, %r1782, %r1815; + shf.l.wrap.b32 %r1817, %r1816, %r1816, 16; + add.s32 %r1818, %r1817, %r1741; + xor.b32 %r1819, %r1818, %r1757; + shf.l.wrap.b32 %r1820, %r1819, %r1819, 20; + add.s32 %r1821, %r1815, %r1198; + add.s32 %r1822, %r1821, %r1820; + xor.b32 %r1823, %r1822, %r1817; + shf.l.wrap.b32 %r1824, %r1823, %r1823, 24; + add.s32 %r1825, %r1824, %r1818; + xor.b32 %r1826, %r1825, %r1820; + shf.l.wrap.b32 %r1827, %r1826, %r1826, 25; + add.s32 %r1828, %r1771, %r1219; + add.s32 %r1829, %r1828, %r1780; + xor.b32 %r1830, %r1829, %r1740; + shf.l.wrap.b32 %r1831, %r1830, %r1830, 16; + add.s32 %r1832, %r1831, %r1755; + xor.b32 %r1833, %r1832, %r1771; + shf.l.wrap.b32 %r1834, %r1833, %r1833, 20; + add.s32 %r1835, %r1829, %r1121; + add.s32 %r1836, %r1835, %r1834; + xor.b32 %r1837, %r1836, %r1831; + shf.l.wrap.b32 %r1838, %r1837, %r1837, 24; + add.s32 %r1839, %r1838, %r1832; + xor.b32 %r1840, %r1839, %r1834; + shf.l.wrap.b32 %r1841, %r1840, %r1840, 25; + add.s32 %r1842, %r1813, %r1205; + add.s32 %r1843, %r1842, %r1794; + xor.b32 %r1844, %r1843, %r1838; + shf.l.wrap.b32 %r1845, %r1844, %r1844, 16; + add.s32 %r1846, %r1845, %r1825; + xor.b32 %r1847, %r1846, %r1813; + shf.l.wrap.b32 %r1848, %r1847, %r1847, 20; + add.s32 %r1849, %r1843, %r1135; 
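+ // The message-word registers (%r1114-%r1219) are read in a different order each
+ // round, following BLAKE3's fixed message permutation across its seven rounds.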
+ add.s32 %r1850, %r1849, %r1848; + xor.b32 %r1851, %r1850, %r1845; + shf.l.wrap.b32 %r1852, %r1851, %r1851, 24; + add.s32 %r1853, %r1852, %r1846; + xor.b32 %r1854, %r1853, %r1848; + shf.l.wrap.b32 %r1855, %r1854, %r1854, 25; + add.s32 %r1856, %r1808, %r1114; + add.s32 %r1857, %r1856, %r1827; + xor.b32 %r1858, %r1796, %r1857; + shf.l.wrap.b32 %r1859, %r1858, %r1858, 16; + add.s32 %r1860, %r1859, %r1839; + xor.b32 %r1861, %r1860, %r1827; + shf.l.wrap.b32 %r1862, %r1861, %r1861, 20; + add.s32 %r1863, %r1857, %r1184; + add.s32 %r1864, %r1863, %r1862; + xor.b32 %r1865, %r1864, %r1859; + shf.l.wrap.b32 %r1866, %r1865, %r1865, 24; + add.s32 %r1867, %r1866, %r1860; + xor.b32 %r1868, %r1867, %r1862; + shf.l.wrap.b32 %r1869, %r1868, %r1868, 25; + add.s32 %r1870, %r1822, %r1128; + add.s32 %r1871, %r1870, %r1841; + xor.b32 %r1872, %r1871, %r1810; + shf.l.wrap.b32 %r1873, %r1872, %r1872, 16; + add.s32 %r1874, %r1873, %r1797; + xor.b32 %r1875, %r1874, %r1841; + shf.l.wrap.b32 %r1876, %r1875, %r1875, 20; + add.s32 %r1877, %r1871, %r1156; + add.s32 %r1878, %r1877, %r1876; + xor.b32 %r1879, %r1878, %r1873; + shf.l.wrap.b32 %r1880, %r1879, %r1879, 24; + add.s32 %r1881, %r1880, %r1874; + xor.b32 %r1882, %r1881, %r1876; + shf.l.wrap.b32 %r1883, %r1882, %r1882, 25; + add.s32 %r1884, %r1836, %r1142; + add.s32 %r1885, %r1884, %r1799; + xor.b32 %r1886, %r1885, %r1824; + shf.l.wrap.b32 %r1887, %r1886, %r1886, 16; + add.s32 %r1888, %r1887, %r1811; + xor.b32 %r1889, %r1888, %r1799; + shf.l.wrap.b32 %r1890, %r1889, %r1889, 20; + add.s32 %r1891, %r1885, %r1163; + add.s32 %r1892, %r1891, %r1890; + xor.b32 %r1893, %r1892, %r1887; + shf.l.wrap.b32 %r1894, %r1893, %r1893, 24; + add.s32 %r1895, %r1894, %r1888; + xor.b32 %r1896, %r1895, %r1890; + shf.l.wrap.b32 %r1897, %r1896, %r1896, 25; + add.s32 %r1898, %r1850, %r1191; + add.s32 %r1899, %r1898, %r1897; + xor.b32 %r1900, %r1899, %r1866; + shf.l.wrap.b32 %r1901, %r1900, %r1900, 16; + add.s32 %r1902, %r1901, %r1881; + xor.b32 %r1903, %r1902, %r1897; + shf.l.wrap.b32 %r1904, %r1903, %r1903, 20; + add.s32 %r1905, %r1899, %r1219; + add.s32 %r1906, %r1905, %r1904; + xor.b32 %r1907, %r1906, %r1901; + shf.l.wrap.b32 %r1908, %r1907, %r1907, 24; + add.s32 %r1909, %r1908, %r1902; + xor.b32 %r1910, %r1909, %r1904; + shf.l.wrap.b32 %r1911, %r1910, %r1910, 25; + add.s32 %r1912, %r1864, %r1149; + add.s32 %r1913, %r1912, %r1855; + xor.b32 %r1914, %r1913, %r1880; + shf.l.wrap.b32 %r1915, %r1914, %r1914, 16; + add.s32 %r1916, %r1915, %r1895; + xor.b32 %r1917, %r1916, %r1855; + shf.l.wrap.b32 %r1918, %r1917, %r1917, 20; + add.s32 %r1919, %r1913, %r1114; + add.s32 %r1920, %r1919, %r1918; + xor.b32 %r1921, %r1920, %r1915; + shf.l.wrap.b32 %r1922, %r1921, %r1921, 24; + add.s32 %r1923, %r1922, %r1916; + xor.b32 %r1924, %r1923, %r1918; + shf.l.wrap.b32 %r1925, %r1924, %r1924, 25; + add.s32 %r1926, %r1878, %r1121; + add.s32 %r1927, %r1926, %r1869; + xor.b32 %r1928, %r1894, %r1927; + shf.l.wrap.b32 %r1929, %r1928, %r1928, 16; + add.s32 %r1930, %r1929, %r1853; + xor.b32 %r1931, %r1930, %r1869; + shf.l.wrap.b32 %r1932, %r1931, %r1931, 20; + add.s32 %r1933, %r1927, %r1177; + add.s32 %r1934, %r1933, %r1932; + xor.b32 %r1935, %r1934, %r1929; + shf.l.wrap.b32 %r1936, %r1935, %r1935, 24; + add.s32 %r1937, %r1936, %r1930; + xor.b32 %r1938, %r1937, %r1932; + shf.l.wrap.b32 %r1939, %r1938, %r1938, 25; + add.s32 %r1940, %r1883, %r1170; + add.s32 %r1941, %r1940, %r1892; + xor.b32 %r1942, %r1941, %r1852; + shf.l.wrap.b32 %r1943, %r1942, %r1942, 16; + add.s32 %r1944, %r1943, %r1867; + xor.b32 %r1945, %r1944, 
%r1883; + shf.l.wrap.b32 %r1946, %r1945, %r1945, 20; + add.s32 %r1947, %r1941, %r1156; + add.s32 %r1948, %r1947, %r1946; + xor.b32 %r1949, %r1948, %r1943; + shf.l.wrap.b32 %r1950, %r1949, %r1949, 24; + add.s32 %r1951, %r1950, %r1944; + xor.b32 %r1952, %r1951, %r1946; + shf.l.wrap.b32 %r1953, %r1952, %r1952, 25; + add.s32 %r1954, %r1925, %r1212; + add.s32 %r1955, %r1954, %r1906; + xor.b32 %r1956, %r1955, %r1950; + shf.l.wrap.b32 %r1957, %r1956, %r1956, 16; + add.s32 %r1958, %r1957, %r1937; + xor.b32 %r1959, %r1958, %r1925; + shf.l.wrap.b32 %r1960, %r1959, %r1959, 20; + add.s32 %r1961, %r1955, %r1184; + add.s32 %r1962, %r1961, %r1960; + xor.b32 %r1963, %r1962, %r1957; + shf.l.wrap.b32 %r1964, %r1963, %r1963, 24; + add.s32 %r1965, %r1964, %r1958; + xor.b32 %r1966, %r1965, %r1960; + shf.l.wrap.b32 %r1967, %r1966, %r1966, 25; + add.s32 %r1968, %r1920, %r1128; + add.s32 %r1969, %r1968, %r1939; + xor.b32 %r1970, %r1908, %r1969; + shf.l.wrap.b32 %r1971, %r1970, %r1970, 16; + add.s32 %r1972, %r1971, %r1951; + xor.b32 %r1973, %r1972, %r1939; + shf.l.wrap.b32 %r1974, %r1973, %r1973, 20; + add.s32 %r1975, %r1969, %r1198; + add.s32 %r1976, %r1975, %r1974; + xor.b32 %r1977, %r1976, %r1971; + shf.l.wrap.b32 %r1978, %r1977, %r1977, 24; + add.s32 %r1979, %r1978, %r1972; + xor.b32 %r1980, %r1979, %r1974; + shf.l.wrap.b32 %r1981, %r1980, %r1980, 25; + add.s32 %r1982, %r1934, %r1135; + add.s32 %r1983, %r1982, %r1953; + xor.b32 %r1984, %r1983, %r1922; + shf.l.wrap.b32 %r1985, %r1984, %r1984, 16; + add.s32 %r1986, %r1985, %r1909; + xor.b32 %r1987, %r1986, %r1953; + shf.l.wrap.b32 %r1988, %r1987, %r1987, 20; + add.s32 %r1989, %r1983, %r1142; + add.s32 %r1990, %r1989, %r1988; + xor.b32 %r1991, %r1990, %r1985; + shf.l.wrap.b32 %r1992, %r1991, %r1991, 24; + add.s32 %r1993, %r1992, %r1986; + xor.b32 %r1994, %r1993, %r1988; + shf.l.wrap.b32 %r1995, %r1994, %r1994, 25; + add.s32 %r1996, %r1948, %r1163; + add.s32 %r1997, %r1996, %r1911; + xor.b32 %r1998, %r1997, %r1936; + shf.l.wrap.b32 %r1999, %r1998, %r1998, 16; + add.s32 %r2000, %r1999, %r1923; + xor.b32 %r2001, %r2000, %r1911; + shf.l.wrap.b32 %r2002, %r2001, %r2001, 20; + add.s32 %r2003, %r1997, %r1205; + add.s32 %r2004, %r2003, %r2002; + xor.b32 %r2005, %r2004, %r1999; + shf.l.wrap.b32 %r2006, %r2005, %r2005, 24; + add.s32 %r2007, %r2006, %r2000; + xor.b32 %r2008, %r2007, %r2002; + shf.l.wrap.b32 %r2009, %r2008, %r2008, 25; + xor.b32 %r3948, %r1993, %r1962; + xor.b32 %r3947, %r2007, %r1976; + xor.b32 %r3946, %r1965, %r1990; + xor.b32 %r3945, %r2004, %r1979; + xor.b32 %r3944, %r2009, %r1978; + xor.b32 %r3943, %r1967, %r1992; + xor.b32 %r3942, %r2006, %r1981; + xor.b32 %r3941, %r1995, %r1964; + add.s64 %rd166, %rd166, 64; + add.s64 %rd167, %rd167, -1; + setp.ne.s64 %p19, %rd167, 0; + mov.u16 %rs197, %rs75; + @%p19 bra $L__BB0_19; + + st.local.u8 [%rd155], %r3948; + shr.u32 %r2010, %r3948, 8; + st.local.u8 [%rd155+1], %r2010; + shr.u32 %r2011, %r3948, 16; + st.local.u8 [%rd155+2], %r2011; + shr.u32 %r2012, %r3948, 24; + st.local.u8 [%rd155+3], %r2012; + st.local.u8 [%rd155+4], %r3947; + shr.u32 %r2013, %r3947, 8; + st.local.u8 [%rd155+5], %r2013; + shr.u32 %r2014, %r3947, 16; + st.local.u8 [%rd155+6], %r2014; + shr.u32 %r2015, %r3947, 24; + st.local.u8 [%rd155+7], %r2015; + st.local.u8 [%rd155+8], %r3946; + shr.u32 %r2016, %r3946, 8; + st.local.u8 [%rd155+9], %r2016; + shr.u32 %r2017, %r3946, 16; + st.local.u8 [%rd155+10], %r2017; + shr.u32 %r2018, %r3946, 24; + st.local.u8 [%rd155+11], %r2018; + st.local.u8 [%rd155+12], %r3945; + shr.u32 %r2019, %r3945, 8; + 
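+ // Unrolled little-endian store continues: chaining-value words 3-7 are written
+ // a byte at a time (via shr.u32) into the 32-byte output slot at %rd155.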
st.local.u8 [%rd155+13], %r2019; + shr.u32 %r2020, %r3945, 16; + st.local.u8 [%rd155+14], %r2020; + shr.u32 %r2021, %r3945, 24; + st.local.u8 [%rd155+15], %r2021; + st.local.u8 [%rd155+16], %r3944; + shr.u32 %r2022, %r3944, 8; + st.local.u8 [%rd155+17], %r2022; + shr.u32 %r2023, %r3944, 16; + st.local.u8 [%rd155+18], %r2023; + shr.u32 %r2024, %r3944, 24; + st.local.u8 [%rd155+19], %r2024; + st.local.u8 [%rd155+20], %r3943; + shr.u32 %r2025, %r3943, 8; + st.local.u8 [%rd155+21], %r2025; + shr.u32 %r2026, %r3943, 16; + st.local.u8 [%rd155+22], %r2026; + shr.u32 %r2027, %r3943, 24; + st.local.u8 [%rd155+23], %r2027; + st.local.u8 [%rd155+24], %r3942; + shr.u32 %r2028, %r3942, 8; + st.local.u8 [%rd155+25], %r2028; + shr.u32 %r2029, %r3942, 16; + st.local.u8 [%rd155+26], %r2029; + shr.u32 %r2030, %r3942, 24; + st.local.u8 [%rd155+27], %r2030; + st.local.u8 [%rd155+28], %r3941; + shr.u32 %r2031, %r3941, 8; + st.local.u8 [%rd155+29], %r2031; + shr.u32 %r2032, %r3941, 16; + st.local.u8 [%rd155+30], %r2032; + shr.u32 %r2033, %r3941, 24; + st.local.u8 [%rd155+31], %r2033; + add.s64 %rd165, %rd165, 1; + add.s64 %rd162, %rd162, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd163, %rd163, -1; + setp.ne.s64 %p20, %rd163, 0; + @%p20 bra $L__BB0_18; + +$L__BB0_21: + ld.param.u64 %rd139, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + setp.ge.u64 %p21, %rd159, %rd139; + @%p21 bra $L__BB0_30; + + ld.param.u64 %rd140, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd135, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + add.s64 %rd127, %rd151, %rd135; + ld.local.u8 %r2034, [%rd2]; + ld.local.u8 %r2035, [%rd2+1]; + prmt.b32 %r2036, %r2035, %r2034, 30212; + ld.local.u8 %r2037, [%rd2+2]; + ld.local.u8 %r2038, [%rd2+3]; + prmt.b32 %r2039, %r2038, %r2037, 30212; + prmt.b32 %r3964, %r2039, %r2036, 4180; + ld.local.u8 %r2040, [%rd2+4]; + ld.local.u8 %r2041, [%rd2+5]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd2+6]; + ld.local.u8 %r2044, [%rd2+7]; + prmt.b32 %r2045, %r2044, %r2043, 30212; + prmt.b32 %r3963, %r2045, %r2042, 4180; + ld.local.u8 %r2046, [%rd2+8]; + ld.local.u8 %r2047, [%rd2+9]; + prmt.b32 %r2048, %r2047, %r2046, 30212; + ld.local.u8 %r2049, [%rd2+10]; + ld.local.u8 %r2050, [%rd2+11]; + prmt.b32 %r2051, %r2050, %r2049, 30212; + prmt.b32 %r3962, %r2051, %r2048, 4180; + ld.local.u8 %r2052, [%rd2+12]; + ld.local.u8 %r2053, [%rd2+13]; + prmt.b32 %r2054, %r2053, %r2052, 30212; + ld.local.u8 %r2055, [%rd2+14]; + ld.local.u8 %r2056, [%rd2+15]; + prmt.b32 %r2057, %r2056, %r2055, 30212; + prmt.b32 %r3961, %r2057, %r2054, 4180; + ld.local.u8 %r2058, [%rd2+16]; + ld.local.u8 %r2059, [%rd2+17]; + prmt.b32 %r2060, %r2059, %r2058, 30212; + ld.local.u8 %r2061, [%rd2+18]; + ld.local.u8 %r2062, [%rd2+19]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + prmt.b32 %r3960, %r2063, %r2060, 4180; + ld.local.u8 %r2064, [%rd2+20]; + ld.local.u8 %r2065, [%rd2+21]; + prmt.b32 %r2066, %r2065, %r2064, 30212; + ld.local.u8 %r2067, [%rd2+22]; + ld.local.u8 %r2068, [%rd2+23]; + prmt.b32 %r2069, %r2068, %r2067, 30212; + prmt.b32 %r3959, %r2069, %r2066, 4180; + ld.local.u8 %r2070, [%rd2+24]; + ld.local.u8 %r2071, [%rd2+25]; + prmt.b32 %r2072, %r2071, %r2070, 30212; + ld.local.u8 %r2073, [%rd2+26]; + ld.local.u8 %r2074, [%rd2+27]; + prmt.b32 %r2075, %r2074, %r2073, 30212; + prmt.b32 %r3958, %r2075, %r2072, 4180; + ld.local.u8 
%r2076, [%rd2+28]; + ld.local.u8 %r2077, [%rd2+29]; + prmt.b32 %r2078, %r2077, %r2076, 30212; + ld.local.u8 %r2079, [%rd2+30]; + ld.local.u8 %r2080, [%rd2+31]; + prmt.b32 %r2081, %r2080, %r2079, 30212; + prmt.b32 %r3957, %r2081, %r2078, 4180; + add.u64 %rd53, %SPL, 16; + mov.u32 %r2082, 0; + st.local.v2.u32 [%rd53], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+8], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+16], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+24], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+32], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+40], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+48], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+56], {%r2082, %r2082}; + mov.u16 %rs199, 0; + st.local.v2.u8 [%rd53+64], {%rs199, %rs199}; + st.local.u8 [%rd53+66], %rs75; + add.s64 %rd170, %rd140, %rd159; + cvt.u32.u64 %r36, %rd127; + shr.u64 %rd129, %rd127, 32; + cvt.u32.u64 %r37, %rd129; + setp.lt.u64 %p22, %rd171, 65; + @%p22 bra $L__BB0_25; + + add.s64 %rd56, %rd53, 64; + mov.u16 %rs198, 0; + +$L__BB0_24: + and.b16 %rs83, %rs198, 255; + setp.eq.s16 %p23, %rs83, 0; + selp.u16 %rs84, 1, 0, %p23; + or.b16 %rs85, %rs84, %rs75; + ld.u8 %r2083, [%rd170]; + ld.u8 %r2084, [%rd170+1]; + prmt.b32 %r2085, %r2084, %r2083, 30212; + ld.u8 %r2086, [%rd170+2]; + prmt.b32 %r2087, %r2086, %r2085, 28756; + ld.u8 %r2088, [%rd170+3]; + prmt.b32 %r2089, %r2088, %r2087, 1620; + ld.u8 %r2090, [%rd170+4]; + ld.u8 %r2091, [%rd170+5]; + prmt.b32 %r2092, %r2091, %r2090, 30212; + ld.u8 %r2093, [%rd170+6]; + prmt.b32 %r2094, %r2093, %r2092, 28756; + ld.u8 %r2095, [%rd170+7]; + prmt.b32 %r2096, %r2095, %r2094, 1620; + ld.u8 %r2097, [%rd170+8]; + ld.u8 %r2098, [%rd170+9]; + prmt.b32 %r2099, %r2098, %r2097, 30212; + ld.u8 %r2100, [%rd170+10]; + prmt.b32 %r2101, %r2100, %r2099, 28756; + ld.u8 %r2102, [%rd170+11]; + prmt.b32 %r2103, %r2102, %r2101, 1620; + ld.u8 %r2104, [%rd170+12]; + ld.u8 %r2105, [%rd170+13]; + prmt.b32 %r2106, %r2105, %r2104, 30212; + ld.u8 %r2107, [%rd170+14]; + prmt.b32 %r2108, %r2107, %r2106, 28756; + ld.u8 %r2109, [%rd170+15]; + prmt.b32 %r2110, %r2109, %r2108, 1620; + ld.u8 %r2111, [%rd170+16]; + ld.u8 %r2112, [%rd170+17]; + prmt.b32 %r2113, %r2112, %r2111, 30212; + ld.u8 %r2114, [%rd170+18]; + prmt.b32 %r2115, %r2114, %r2113, 28756; + ld.u8 %r2116, [%rd170+19]; + prmt.b32 %r2117, %r2116, %r2115, 1620; + ld.u8 %r2118, [%rd170+20]; + ld.u8 %r2119, [%rd170+21]; + prmt.b32 %r2120, %r2119, %r2118, 30212; + ld.u8 %r2121, [%rd170+22]; + prmt.b32 %r2122, %r2121, %r2120, 28756; + ld.u8 %r2123, [%rd170+23]; + prmt.b32 %r2124, %r2123, %r2122, 1620; + ld.u8 %r2125, [%rd170+24]; + ld.u8 %r2126, [%rd170+25]; + prmt.b32 %r2127, %r2126, %r2125, 30212; + ld.u8 %r2128, [%rd170+26]; + prmt.b32 %r2129, %r2128, %r2127, 28756; + ld.u8 %r2130, [%rd170+27]; + prmt.b32 %r2131, %r2130, %r2129, 1620; + ld.u8 %r2132, [%rd170+28]; + ld.u8 %r2133, [%rd170+29]; + prmt.b32 %r2134, %r2133, %r2132, 30212; + ld.u8 %r2135, [%rd170+30]; + prmt.b32 %r2136, %r2135, %r2134, 28756; + ld.u8 %r2137, [%rd170+31]; + prmt.b32 %r2138, %r2137, %r2136, 1620; + ld.u8 %r2139, [%rd170+32]; + ld.u8 %r2140, [%rd170+33]; + prmt.b32 %r2141, %r2140, %r2139, 30212; + ld.u8 %r2142, [%rd170+34]; + prmt.b32 %r2143, %r2142, %r2141, 28756; + ld.u8 %r2144, [%rd170+35]; + prmt.b32 %r2145, %r2144, %r2143, 1620; + ld.u8 %r2146, [%rd170+36]; + ld.u8 %r2147, [%rd170+37]; + prmt.b32 %r2148, %r2147, %r2146, 30212; + ld.u8 %r2149, [%rd170+38]; + prmt.b32 %r2150, %r2149, %r2148, 28756; + ld.u8 %r2151, [%rd170+39]; + prmt.b32 %r2152, %r2151, %r2150, 1620; + ld.u8 %r2153, 
[%rd170+40]; + ld.u8 %r2154, [%rd170+41]; + prmt.b32 %r2155, %r2154, %r2153, 30212; + ld.u8 %r2156, [%rd170+42]; + prmt.b32 %r2157, %r2156, %r2155, 28756; + ld.u8 %r2158, [%rd170+43]; + prmt.b32 %r2159, %r2158, %r2157, 1620; + ld.u8 %r2160, [%rd170+44]; + ld.u8 %r2161, [%rd170+45]; + prmt.b32 %r2162, %r2161, %r2160, 30212; + ld.u8 %r2163, [%rd170+46]; + prmt.b32 %r2164, %r2163, %r2162, 28756; + ld.u8 %r2165, [%rd170+47]; + prmt.b32 %r2166, %r2165, %r2164, 1620; + ld.u8 %r2167, [%rd170+48]; + ld.u8 %r2168, [%rd170+49]; + prmt.b32 %r2169, %r2168, %r2167, 30212; + ld.u8 %r2170, [%rd170+50]; + prmt.b32 %r2171, %r2170, %r2169, 28756; + ld.u8 %r2172, [%rd170+51]; + prmt.b32 %r2173, %r2172, %r2171, 1620; + ld.u8 %r2174, [%rd170+52]; + ld.u8 %r2175, [%rd170+53]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.u8 %r2177, [%rd170+54]; + prmt.b32 %r2178, %r2177, %r2176, 28756; + ld.u8 %r2179, [%rd170+55]; + prmt.b32 %r2180, %r2179, %r2178, 1620; + ld.u8 %r2181, [%rd170+56]; + ld.u8 %r2182, [%rd170+57]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.u8 %r2184, [%rd170+58]; + prmt.b32 %r2185, %r2184, %r2183, 28756; + ld.u8 %r2186, [%rd170+59]; + prmt.b32 %r2187, %r2186, %r2185, 1620; + ld.u8 %r2188, [%rd170+60]; + ld.u8 %r2189, [%rd170+61]; + prmt.b32 %r2190, %r2189, %r2188, 30212; + ld.u8 %r2191, [%rd170+62]; + prmt.b32 %r2192, %r2191, %r2190, 28756; + ld.u8 %r2193, [%rd170+63]; + prmt.b32 %r2194, %r2193, %r2192, 1620; + cvt.u32.u16 %r2195, %rs85; + add.s32 %r2196, %r3964, %r2089; + add.s32 %r2197, %r2196, %r3960; + xor.b32 %r2198, %r2197, %r36; + shf.l.wrap.b32 %r2199, %r2198, %r2198, 16; + add.s32 %r2200, %r2199, 1779033703; + xor.b32 %r2201, %r2200, %r3960; + shf.l.wrap.b32 %r2202, %r2201, %r2201, 20; + add.s32 %r2203, %r2197, %r2096; + add.s32 %r2204, %r2203, %r2202; + xor.b32 %r2205, %r2204, %r2199; + shf.l.wrap.b32 %r2206, %r2205, %r2205, 24; + add.s32 %r2207, %r2206, %r2200; + xor.b32 %r2208, %r2207, %r2202; + shf.l.wrap.b32 %r2209, %r2208, %r2208, 25; + add.s32 %r2210, %r3963, %r2103; + add.s32 %r2211, %r2210, %r3959; + xor.b32 %r2212, %r2211, %r37; + shf.l.wrap.b32 %r2213, %r2212, %r2212, 16; + add.s32 %r2214, %r2213, -1150833019; + xor.b32 %r2215, %r2214, %r3959; + shf.l.wrap.b32 %r2216, %r2215, %r2215, 20; + add.s32 %r2217, %r2211, %r2110; + add.s32 %r2218, %r2217, %r2216; + xor.b32 %r2219, %r2218, %r2213; + shf.l.wrap.b32 %r2220, %r2219, %r2219, 24; + add.s32 %r2221, %r2220, %r2214; + xor.b32 %r2222, %r2221, %r2216; + shf.l.wrap.b32 %r2223, %r2222, %r2222, 25; + add.s32 %r2224, %r3962, %r2117; + add.s32 %r2225, %r2224, %r3958; + shr.u32 %r2226, %r2225, 16; + shl.b32 %r2227, %r2225, 16; + xor.b32 %r2228, %r2227, 4194304; + or.b32 %r2229, %r2228, %r2226; + add.s32 %r2230, %r2229, 1013904242; + xor.b32 %r2231, %r2230, %r3958; + shf.l.wrap.b32 %r2232, %r2231, %r2231, 20; + add.s32 %r2233, %r2225, %r2124; + add.s32 %r2234, %r2233, %r2232; + xor.b32 %r2235, %r2234, %r2229; + shf.l.wrap.b32 %r2236, %r2235, %r2235, 24; + add.s32 %r2237, %r2236, %r2230; + xor.b32 %r2238, %r2237, %r2232; + shf.l.wrap.b32 %r2239, %r2238, %r2238, 25; + add.s32 %r2240, %r3961, %r2131; + add.s32 %r2241, %r2240, %r3957; + xor.b32 %r2242, %r2241, %r2195; + shr.u32 %r2243, %r2241, 16; + shl.b32 %r2244, %r2242, 16; + or.b32 %r2245, %r2244, %r2243; + add.s32 %r2246, %r2245, -1521486534; + xor.b32 %r2247, %r2246, %r3957; + shf.l.wrap.b32 %r2248, %r2247, %r2247, 20; + add.s32 %r2249, %r2241, %r2138; + add.s32 %r2250, %r2249, %r2248; + xor.b32 %r2251, %r2250, %r2245; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 24; + add.s32 %r2253, 
%r2252, %r2246; + xor.b32 %r2254, %r2253, %r2248; + shf.l.wrap.b32 %r2255, %r2254, %r2254, 25; + add.s32 %r2256, %r2204, %r2145; + add.s32 %r2257, %r2256, %r2223; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 16; + add.s32 %r2260, %r2259, %r2237; + xor.b32 %r2261, %r2260, %r2223; + shf.l.wrap.b32 %r2262, %r2261, %r2261, 20; + add.s32 %r2263, %r2257, %r2152; + add.s32 %r2264, %r2263, %r2262; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 24; + add.s32 %r2267, %r2266, %r2260; + xor.b32 %r2268, %r2267, %r2262; + shf.l.wrap.b32 %r2269, %r2268, %r2268, 25; + add.s32 %r2270, %r2218, %r2159; + add.s32 %r2271, %r2270, %r2239; + xor.b32 %r2272, %r2271, %r2206; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 16; + add.s32 %r2274, %r2273, %r2253; + xor.b32 %r2275, %r2274, %r2239; + shf.l.wrap.b32 %r2276, %r2275, %r2275, 20; + add.s32 %r2277, %r2271, %r2166; + add.s32 %r2278, %r2277, %r2276; + xor.b32 %r2279, %r2278, %r2273; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 24; + add.s32 %r2281, %r2280, %r2274; + xor.b32 %r2282, %r2281, %r2276; + shf.l.wrap.b32 %r2283, %r2282, %r2282, 25; + add.s32 %r2284, %r2234, %r2173; + add.s32 %r2285, %r2284, %r2255; + xor.b32 %r2286, %r2285, %r2220; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 16; + add.s32 %r2288, %r2287, %r2207; + xor.b32 %r2289, %r2288, %r2255; + shf.l.wrap.b32 %r2290, %r2289, %r2289, 20; + add.s32 %r2291, %r2285, %r2180; + add.s32 %r2292, %r2291, %r2290; + xor.b32 %r2293, %r2292, %r2287; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 24; + add.s32 %r2295, %r2294, %r2288; + xor.b32 %r2296, %r2295, %r2290; + shf.l.wrap.b32 %r2297, %r2296, %r2296, 25; + add.s32 %r2298, %r2250, %r2187; + add.s32 %r2299, %r2298, %r2209; + xor.b32 %r2300, %r2299, %r2236; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 16; + add.s32 %r2302, %r2301, %r2221; + xor.b32 %r2303, %r2302, %r2209; + shf.l.wrap.b32 %r2304, %r2303, %r2303, 20; + add.s32 %r2305, %r2299, %r2194; + add.s32 %r2306, %r2305, %r2304; + xor.b32 %r2307, %r2306, %r2301; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 24; + add.s32 %r2309, %r2308, %r2302; + xor.b32 %r2310, %r2309, %r2304; + shf.l.wrap.b32 %r2311, %r2310, %r2310, 25; + add.s32 %r2312, %r2264, %r2103; + add.s32 %r2313, %r2312, %r2311; + xor.b32 %r2314, %r2313, %r2280; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 16; + add.s32 %r2316, %r2315, %r2295; + xor.b32 %r2317, %r2316, %r2311; + shf.l.wrap.b32 %r2318, %r2317, %r2317, 20; + add.s32 %r2319, %r2313, %r2131; + add.s32 %r2320, %r2319, %r2318; + xor.b32 %r2321, %r2320, %r2315; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 24; + add.s32 %r2323, %r2322, %r2316; + xor.b32 %r2324, %r2323, %r2318; + shf.l.wrap.b32 %r2325, %r2324, %r2324, 25; + add.s32 %r2326, %r2278, %r2110; + add.s32 %r2327, %r2326, %r2269; + xor.b32 %r2328, %r2327, %r2294; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 16; + add.s32 %r2330, %r2329, %r2309; + xor.b32 %r2331, %r2330, %r2269; + shf.l.wrap.b32 %r2332, %r2331, %r2331, 20; + add.s32 %r2333, %r2327, %r2159; + add.s32 %r2334, %r2333, %r2332; + xor.b32 %r2335, %r2334, %r2329; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 24; + add.s32 %r2337, %r2336, %r2330; + xor.b32 %r2338, %r2337, %r2332; + shf.l.wrap.b32 %r2339, %r2338, %r2338, 25; + add.s32 %r2340, %r2292, %r2138; + add.s32 %r2341, %r2340, %r2283; + xor.b32 %r2342, %r2341, %r2308; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 16; + add.s32 %r2344, %r2343, %r2267; + xor.b32 %r2345, %r2344, %r2283; + shf.l.wrap.b32 %r2346, %r2345, %r2345, 20; + add.s32 %r2347, %r2341, %r2089; + add.s32 %r2348, %r2347, %r2346; + xor.b32 
%r2349, %r2348, %r2343; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 24; + add.s32 %r2351, %r2350, %r2344; + xor.b32 %r2352, %r2351, %r2346; + shf.l.wrap.b32 %r2353, %r2352, %r2352, 25; + add.s32 %r2354, %r2306, %r2117; + add.s32 %r2355, %r2354, %r2297; + xor.b32 %r2356, %r2355, %r2266; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 16; + add.s32 %r2358, %r2357, %r2281; + xor.b32 %r2359, %r2358, %r2297; + shf.l.wrap.b32 %r2360, %r2359, %r2359, 20; + add.s32 %r2361, %r2355, %r2180; + add.s32 %r2362, %r2361, %r2360; + xor.b32 %r2363, %r2362, %r2357; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 24; + add.s32 %r2365, %r2364, %r2358; + xor.b32 %r2366, %r2365, %r2360; + shf.l.wrap.b32 %r2367, %r2366, %r2366, 25; + add.s32 %r2368, %r2320, %r2096; + add.s32 %r2369, %r2368, %r2339; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 16; + add.s32 %r2372, %r2371, %r2351; + xor.b32 %r2373, %r2372, %r2339; + shf.l.wrap.b32 %r2374, %r2373, %r2373, 20; + add.s32 %r2375, %r2369, %r2166; + add.s32 %r2376, %r2375, %r2374; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 24; + add.s32 %r2379, %r2378, %r2372; + xor.b32 %r2380, %r2379, %r2374; + shf.l.wrap.b32 %r2381, %r2380, %r2380, 25; + add.s32 %r2382, %r2334, %r2173; + add.s32 %r2383, %r2382, %r2353; + xor.b32 %r2384, %r2383, %r2322; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 16; + add.s32 %r2386, %r2385, %r2365; + xor.b32 %r2387, %r2386, %r2353; + shf.l.wrap.b32 %r2388, %r2387, %r2387, 20; + add.s32 %r2389, %r2383, %r2124; + add.s32 %r2390, %r2389, %r2388; + xor.b32 %r2391, %r2390, %r2385; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 24; + add.s32 %r2393, %r2392, %r2386; + xor.b32 %r2394, %r2393, %r2388; + shf.l.wrap.b32 %r2395, %r2394, %r2394, 25; + add.s32 %r2396, %r2348, %r2152; + add.s32 %r2397, %r2396, %r2367; + xor.b32 %r2398, %r2397, %r2336; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 16; + add.s32 %r2400, %r2399, %r2323; + xor.b32 %r2401, %r2400, %r2367; + shf.l.wrap.b32 %r2402, %r2401, %r2401, 20; + add.s32 %r2403, %r2397, %r2187; + add.s32 %r2404, %r2403, %r2402; + xor.b32 %r2405, %r2404, %r2399; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 24; + add.s32 %r2407, %r2406, %r2400; + xor.b32 %r2408, %r2407, %r2402; + shf.l.wrap.b32 %r2409, %r2408, %r2408, 25; + add.s32 %r2410, %r2362, %r2194; + add.s32 %r2411, %r2410, %r2325; + xor.b32 %r2412, %r2411, %r2350; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 16; + add.s32 %r2414, %r2413, %r2337; + xor.b32 %r2415, %r2414, %r2325; + shf.l.wrap.b32 %r2416, %r2415, %r2415, 20; + add.s32 %r2417, %r2411, %r2145; + add.s32 %r2418, %r2417, %r2416; + xor.b32 %r2419, %r2418, %r2413; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 24; + add.s32 %r2421, %r2420, %r2414; + xor.b32 %r2422, %r2421, %r2416; + shf.l.wrap.b32 %r2423, %r2422, %r2422, 25; + add.s32 %r2424, %r2376, %r2110; + add.s32 %r2425, %r2424, %r2423; + xor.b32 %r2426, %r2425, %r2392; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 16; + add.s32 %r2428, %r2427, %r2407; + xor.b32 %r2429, %r2428, %r2423; + shf.l.wrap.b32 %r2430, %r2429, %r2429, 20; + add.s32 %r2431, %r2425, %r2117; + add.s32 %r2432, %r2431, %r2430; + xor.b32 %r2433, %r2432, %r2427; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 24; + add.s32 %r2435, %r2434, %r2428; + xor.b32 %r2436, %r2435, %r2430; + shf.l.wrap.b32 %r2437, %r2436, %r2436, 25; + add.s32 %r2438, %r2390, %r2159; + add.s32 %r2439, %r2438, %r2381; + xor.b32 %r2440, %r2439, %r2406; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 16; + add.s32 %r2442, %r2441, %r2421; + xor.b32 %r2443, %r2442, %r2381; + shf.l.wrap.b32 %r2444, %r2443, 
%r2443, 20; + add.s32 %r2445, %r2439, %r2173; + add.s32 %r2446, %r2445, %r2444; + xor.b32 %r2447, %r2446, %r2441; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 24; + add.s32 %r2449, %r2448, %r2442; + xor.b32 %r2450, %r2449, %r2444; + shf.l.wrap.b32 %r2451, %r2450, %r2450, 25; + add.s32 %r2452, %r2404, %r2180; + add.s32 %r2453, %r2452, %r2395; + xor.b32 %r2454, %r2453, %r2420; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 16; + add.s32 %r2456, %r2455, %r2379; + xor.b32 %r2457, %r2456, %r2395; + shf.l.wrap.b32 %r2458, %r2457, %r2457, 20; + add.s32 %r2459, %r2453, %r2103; + add.s32 %r2460, %r2459, %r2458; + xor.b32 %r2461, %r2460, %r2455; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 24; + add.s32 %r2463, %r2462, %r2456; + xor.b32 %r2464, %r2463, %r2458; + shf.l.wrap.b32 %r2465, %r2464, %r2464, 25; + add.s32 %r2466, %r2418, %r2138; + add.s32 %r2467, %r2466, %r2409; + xor.b32 %r2468, %r2467, %r2378; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 16; + add.s32 %r2470, %r2469, %r2393; + xor.b32 %r2471, %r2470, %r2409; + shf.l.wrap.b32 %r2472, %r2471, %r2471, 20; + add.s32 %r2473, %r2467, %r2187; + add.s32 %r2474, %r2473, %r2472; + xor.b32 %r2475, %r2474, %r2469; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 24; + add.s32 %r2477, %r2476, %r2470; + xor.b32 %r2478, %r2477, %r2472; + shf.l.wrap.b32 %r2479, %r2478, %r2478, 25; + add.s32 %r2480, %r2432, %r2131; + add.s32 %r2481, %r2480, %r2451; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 16; + add.s32 %r2484, %r2483, %r2463; + xor.b32 %r2485, %r2484, %r2451; + shf.l.wrap.b32 %r2486, %r2485, %r2485, 20; + add.s32 %r2487, %r2481, %r2124; + add.s32 %r2488, %r2487, %r2486; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 24; + add.s32 %r2491, %r2490, %r2484; + xor.b32 %r2492, %r2491, %r2486; + shf.l.wrap.b32 %r2493, %r2492, %r2492, 25; + add.s32 %r2494, %r2446, %r2152; + add.s32 %r2495, %r2494, %r2465; + xor.b32 %r2496, %r2495, %r2434; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 16; + add.s32 %r2498, %r2497, %r2477; + xor.b32 %r2499, %r2498, %r2465; + shf.l.wrap.b32 %r2500, %r2499, %r2499, 20; + add.s32 %r2501, %r2495, %r2089; + add.s32 %r2502, %r2501, %r2500; + xor.b32 %r2503, %r2502, %r2497; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 24; + add.s32 %r2505, %r2504, %r2498; + xor.b32 %r2506, %r2505, %r2500; + shf.l.wrap.b32 %r2507, %r2506, %r2506, 25; + add.s32 %r2508, %r2460, %r2166; + add.s32 %r2509, %r2508, %r2479; + xor.b32 %r2510, %r2509, %r2448; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 16; + add.s32 %r2512, %r2511, %r2435; + xor.b32 %r2513, %r2512, %r2479; + shf.l.wrap.b32 %r2514, %r2513, %r2513, 20; + add.s32 %r2515, %r2509, %r2194; + add.s32 %r2516, %r2515, %r2514; + xor.b32 %r2517, %r2516, %r2511; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 24; + add.s32 %r2519, %r2518, %r2512; + xor.b32 %r2520, %r2519, %r2514; + shf.l.wrap.b32 %r2521, %r2520, %r2520, 25; + add.s32 %r2522, %r2474, %r2145; + add.s32 %r2523, %r2522, %r2437; + xor.b32 %r2524, %r2523, %r2462; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 16; + add.s32 %r2526, %r2525, %r2449; + xor.b32 %r2527, %r2526, %r2437; + shf.l.wrap.b32 %r2528, %r2527, %r2527, 20; + add.s32 %r2529, %r2523, %r2096; + add.s32 %r2530, %r2529, %r2528; + xor.b32 %r2531, %r2530, %r2525; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 24; + add.s32 %r2533, %r2532, %r2526; + xor.b32 %r2534, %r2533, %r2528; + shf.l.wrap.b32 %r2535, %r2534, %r2534, 25; + add.s32 %r2536, %r2488, %r2159; + add.s32 %r2537, %r2536, %r2535; + xor.b32 %r2538, %r2537, %r2504; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 16; + add.s32 
%r2540, %r2539, %r2519; + xor.b32 %r2541, %r2540, %r2535; + shf.l.wrap.b32 %r2542, %r2541, %r2541, 20; + add.s32 %r2543, %r2537, %r2138; + add.s32 %r2544, %r2543, %r2542; + xor.b32 %r2545, %r2544, %r2539; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 24; + add.s32 %r2547, %r2546, %r2540; + xor.b32 %r2548, %r2547, %r2542; + shf.l.wrap.b32 %r2549, %r2548, %r2548, 25; + add.s32 %r2550, %r2502, %r2173; + add.s32 %r2551, %r2550, %r2493; + xor.b32 %r2552, %r2551, %r2518; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 16; + add.s32 %r2554, %r2553, %r2533; + xor.b32 %r2555, %r2554, %r2493; + shf.l.wrap.b32 %r2556, %r2555, %r2555, 20; + add.s32 %r2557, %r2551, %r2152; + add.s32 %r2558, %r2557, %r2556; + xor.b32 %r2559, %r2558, %r2553; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 24; + add.s32 %r2561, %r2560, %r2554; + xor.b32 %r2562, %r2561, %r2556; + shf.l.wrap.b32 %r2563, %r2562, %r2562, 25; + add.s32 %r2564, %r2516, %r2187; + add.s32 %r2565, %r2564, %r2507; + xor.b32 %r2566, %r2565, %r2532; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 16; + add.s32 %r2568, %r2567, %r2491; + xor.b32 %r2569, %r2568, %r2507; + shf.l.wrap.b32 %r2570, %r2569, %r2569, 20; + add.s32 %r2571, %r2565, %r2110; + add.s32 %r2572, %r2571, %r2570; + xor.b32 %r2573, %r2572, %r2567; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 24; + add.s32 %r2575, %r2574, %r2568; + xor.b32 %r2576, %r2575, %r2570; + shf.l.wrap.b32 %r2577, %r2576, %r2576, 25; + add.s32 %r2578, %r2530, %r2180; + add.s32 %r2579, %r2578, %r2521; + xor.b32 %r2580, %r2579, %r2490; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 16; + add.s32 %r2582, %r2581, %r2505; + xor.b32 %r2583, %r2582, %r2521; + shf.l.wrap.b32 %r2584, %r2583, %r2583, 20; + add.s32 %r2585, %r2579, %r2194; + add.s32 %r2586, %r2585, %r2584; + xor.b32 %r2587, %r2586, %r2581; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 24; + add.s32 %r2589, %r2588, %r2582; + xor.b32 %r2590, %r2589, %r2584; + shf.l.wrap.b32 %r2591, %r2590, %r2590, 25; + add.s32 %r2592, %r2544, %r2117; + add.s32 %r2593, %r2592, %r2563; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 16; + add.s32 %r2596, %r2595, %r2575; + xor.b32 %r2597, %r2596, %r2563; + shf.l.wrap.b32 %r2598, %r2597, %r2597, 20; + add.s32 %r2599, %r2593, %r2089; + add.s32 %r2600, %r2599, %r2598; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 24; + add.s32 %r2603, %r2602, %r2596; + xor.b32 %r2604, %r2603, %r2598; + shf.l.wrap.b32 %r2605, %r2604, %r2604, 25; + add.s32 %r2606, %r2558, %r2166; + add.s32 %r2607, %r2606, %r2577; + xor.b32 %r2608, %r2607, %r2546; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 16; + add.s32 %r2610, %r2609, %r2589; + xor.b32 %r2611, %r2610, %r2577; + shf.l.wrap.b32 %r2612, %r2611, %r2611, 20; + add.s32 %r2613, %r2607, %r2103; + add.s32 %r2614, %r2613, %r2612; + xor.b32 %r2615, %r2614, %r2609; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 24; + add.s32 %r2617, %r2616, %r2610; + xor.b32 %r2618, %r2617, %r2612; + shf.l.wrap.b32 %r2619, %r2618, %r2618, 25; + add.s32 %r2620, %r2572, %r2124; + add.s32 %r2621, %r2620, %r2591; + xor.b32 %r2622, %r2621, %r2560; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 16; + add.s32 %r2624, %r2623, %r2547; + xor.b32 %r2625, %r2624, %r2591; + shf.l.wrap.b32 %r2626, %r2625, %r2625, 20; + add.s32 %r2627, %r2621, %r2145; + add.s32 %r2628, %r2627, %r2626; + xor.b32 %r2629, %r2628, %r2623; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 24; + add.s32 %r2631, %r2630, %r2624; + xor.b32 %r2632, %r2631, %r2626; + shf.l.wrap.b32 %r2633, %r2632, %r2632, 25; + add.s32 %r2634, %r2586, %r2096; + add.s32 %r2635, %r2634, %r2549; + 
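+ // Still inside the $L__BB0_24 full-block loop: seven rounds per 64-byte block,
+ // with CHUNK_START (flag 1) OR'd in only while %rs198 (blocks compressed) is 0.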
xor.b32 %r2636, %r2635, %r2574; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 16; + add.s32 %r2638, %r2637, %r2561; + xor.b32 %r2639, %r2638, %r2549; + shf.l.wrap.b32 %r2640, %r2639, %r2639, 20; + add.s32 %r2641, %r2635, %r2131; + add.s32 %r2642, %r2641, %r2640; + xor.b32 %r2643, %r2642, %r2637; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 24; + add.s32 %r2645, %r2644, %r2638; + xor.b32 %r2646, %r2645, %r2640; + shf.l.wrap.b32 %r2647, %r2646, %r2646, 25; + add.s32 %r2648, %r2600, %r2173; + add.s32 %r2649, %r2648, %r2647; + xor.b32 %r2650, %r2649, %r2616; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 16; + add.s32 %r2652, %r2651, %r2631; + xor.b32 %r2653, %r2652, %r2647; + shf.l.wrap.b32 %r2654, %r2653, %r2653, 20; + add.s32 %r2655, %r2649, %r2180; + add.s32 %r2656, %r2655, %r2654; + xor.b32 %r2657, %r2656, %r2651; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 24; + add.s32 %r2659, %r2658, %r2652; + xor.b32 %r2660, %r2659, %r2654; + shf.l.wrap.b32 %r2661, %r2660, %r2660, 25; + add.s32 %r2662, %r2614, %r2152; + add.s32 %r2663, %r2662, %r2605; + xor.b32 %r2664, %r2663, %r2630; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 16; + add.s32 %r2666, %r2665, %r2645; + xor.b32 %r2667, %r2666, %r2605; + shf.l.wrap.b32 %r2668, %r2667, %r2667, 20; + add.s32 %r2669, %r2663, %r2166; + add.s32 %r2670, %r2669, %r2668; + xor.b32 %r2671, %r2670, %r2665; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 24; + add.s32 %r2673, %r2672, %r2666; + xor.b32 %r2674, %r2673, %r2668; + shf.l.wrap.b32 %r2675, %r2674, %r2674, 25; + add.s32 %r2676, %r2628, %r2194; + add.s32 %r2677, %r2676, %r2619; + xor.b32 %r2678, %r2677, %r2644; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 16; + add.s32 %r2680, %r2679, %r2603; + xor.b32 %r2681, %r2680, %r2619; + shf.l.wrap.b32 %r2682, %r2681, %r2681, 20; + add.s32 %r2683, %r2677, %r2159; + add.s32 %r2684, %r2683, %r2682; + xor.b32 %r2685, %r2684, %r2679; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 24; + add.s32 %r2687, %r2686, %r2680; + xor.b32 %r2688, %r2687, %r2682; + shf.l.wrap.b32 %r2689, %r2688, %r2688, 25; + add.s32 %r2690, %r2642, %r2187; + add.s32 %r2691, %r2690, %r2633; + xor.b32 %r2692, %r2691, %r2602; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 16; + add.s32 %r2694, %r2693, %r2617; + xor.b32 %r2695, %r2694, %r2633; + shf.l.wrap.b32 %r2696, %r2695, %r2695, 20; + add.s32 %r2697, %r2691, %r2145; + add.s32 %r2698, %r2697, %r2696; + xor.b32 %r2699, %r2698, %r2693; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 24; + add.s32 %r2701, %r2700, %r2694; + xor.b32 %r2702, %r2701, %r2696; + shf.l.wrap.b32 %r2703, %r2702, %r2702, 25; + add.s32 %r2704, %r2656, %r2138; + add.s32 %r2705, %r2704, %r2675; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 16; + add.s32 %r2708, %r2707, %r2687; + xor.b32 %r2709, %r2708, %r2675; + shf.l.wrap.b32 %r2710, %r2709, %r2709, 20; + add.s32 %r2711, %r2705, %r2103; + add.s32 %r2712, %r2711, %r2710; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 24; + add.s32 %r2715, %r2714, %r2708; + xor.b32 %r2716, %r2715, %r2710; + shf.l.wrap.b32 %r2717, %r2716, %r2716, 25; + add.s32 %r2718, %r2670, %r2124; + add.s32 %r2719, %r2718, %r2689; + xor.b32 %r2720, %r2719, %r2658; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 16; + add.s32 %r2722, %r2721, %r2701; + xor.b32 %r2723, %r2722, %r2689; + shf.l.wrap.b32 %r2724, %r2723, %r2723, 20; + add.s32 %r2725, %r2719, %r2110; + add.s32 %r2726, %r2725, %r2724; + xor.b32 %r2727, %r2726, %r2721; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 24; + add.s32 %r2729, %r2728, %r2722; + xor.b32 %r2730, %r2729, %r2724; + shf.l.wrap.b32 %r2731, 
%r2730, %r2730, 25; + add.s32 %r2732, %r2684, %r2089; + add.s32 %r2733, %r2732, %r2703; + xor.b32 %r2734, %r2733, %r2672; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 16; + add.s32 %r2736, %r2735, %r2659; + xor.b32 %r2737, %r2736, %r2703; + shf.l.wrap.b32 %r2738, %r2737, %r2737, 20; + add.s32 %r2739, %r2733, %r2096; + add.s32 %r2740, %r2739, %r2738; + xor.b32 %r2741, %r2740, %r2735; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 24; + add.s32 %r2743, %r2742, %r2736; + xor.b32 %r2744, %r2743, %r2738; + shf.l.wrap.b32 %r2745, %r2744, %r2744, 25; + add.s32 %r2746, %r2698, %r2131; + add.s32 %r2747, %r2746, %r2661; + xor.b32 %r2748, %r2747, %r2686; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 16; + add.s32 %r2750, %r2749, %r2673; + xor.b32 %r2751, %r2750, %r2661; + shf.l.wrap.b32 %r2752, %r2751, %r2751, 20; + add.s32 %r2753, %r2747, %r2117; + add.s32 %r2754, %r2753, %r2752; + xor.b32 %r2755, %r2754, %r2749; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 24; + add.s32 %r2757, %r2756, %r2750; + xor.b32 %r2758, %r2757, %r2752; + shf.l.wrap.b32 %r2759, %r2758, %r2758, 25; + add.s32 %r2760, %r2712, %r2152; + add.s32 %r2761, %r2760, %r2759; + xor.b32 %r2762, %r2761, %r2728; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 16; + add.s32 %r2764, %r2763, %r2743; + xor.b32 %r2765, %r2764, %r2759; + shf.l.wrap.b32 %r2766, %r2765, %r2765, 20; + add.s32 %r2767, %r2761, %r2187; + add.s32 %r2768, %r2767, %r2766; + xor.b32 %r2769, %r2768, %r2763; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 24; + add.s32 %r2771, %r2770, %r2764; + xor.b32 %r2772, %r2771, %r2766; + shf.l.wrap.b32 %r2773, %r2772, %r2772, 25; + add.s32 %r2774, %r2726, %r2166; + add.s32 %r2775, %r2774, %r2717; + xor.b32 %r2776, %r2775, %r2742; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 16; + add.s32 %r2778, %r2777, %r2757; + xor.b32 %r2779, %r2778, %r2717; + shf.l.wrap.b32 %r2780, %r2779, %r2779, 20; + add.s32 %r2781, %r2775, %r2124; + add.s32 %r2782, %r2781, %r2780; + xor.b32 %r2783, %r2782, %r2777; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 24; + add.s32 %r2785, %r2784, %r2778; + xor.b32 %r2786, %r2785, %r2780; + shf.l.wrap.b32 %r2787, %r2786, %r2786, 25; + add.s32 %r2788, %r2740, %r2145; + add.s32 %r2789, %r2788, %r2731; + xor.b32 %r2790, %r2789, %r2756; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 16; + add.s32 %r2792, %r2791, %r2715; + xor.b32 %r2793, %r2792, %r2731; + shf.l.wrap.b32 %r2794, %r2793, %r2793, 20; + add.s32 %r2795, %r2789, %r2173; + add.s32 %r2796, %r2795, %r2794; + xor.b32 %r2797, %r2796, %r2791; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 24; + add.s32 %r2799, %r2798, %r2792; + xor.b32 %r2800, %r2799, %r2794; + shf.l.wrap.b32 %r2801, %r2800, %r2800, 25; + add.s32 %r2802, %r2754, %r2194; + add.s32 %r2803, %r2802, %r2745; + xor.b32 %r2804, %r2803, %r2714; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 16; + add.s32 %r2806, %r2805, %r2729; + xor.b32 %r2807, %r2806, %r2745; + shf.l.wrap.b32 %r2808, %r2807, %r2807, 20; + add.s32 %r2809, %r2803, %r2096; + add.s32 %r2810, %r2809, %r2808; + xor.b32 %r2811, %r2810, %r2805; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 24; + add.s32 %r2813, %r2812, %r2806; + xor.b32 %r2814, %r2813, %r2808; + shf.l.wrap.b32 %r2815, %r2814, %r2814, 25; + add.s32 %r2816, %r2768, %r2180; + add.s32 %r2817, %r2816, %r2787; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 16; + add.s32 %r2820, %r2819, %r2799; + xor.b32 %r2821, %r2820, %r2787; + shf.l.wrap.b32 %r2822, %r2821, %r2821, 20; + add.s32 %r2823, %r2817, %r2110; + add.s32 %r2824, %r2823, %r2822; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 24; + 
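+ // Remaining rounds follow; at the loop tail, v[i] ^ v[i+8] forms the new
+ // chaining value in %r3957-%r3964 before the %p24 back-branch to $L__BB0_24.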
add.s32 %r2827, %r2826, %r2820; + xor.b32 %r2828, %r2827, %r2822; + shf.l.wrap.b32 %r2829, %r2828, %r2828, 25; + add.s32 %r2830, %r2782, %r2089; + add.s32 %r2831, %r2830, %r2801; + xor.b32 %r2832, %r2831, %r2770; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 16; + add.s32 %r2834, %r2833, %r2813; + xor.b32 %r2835, %r2834, %r2801; + shf.l.wrap.b32 %r2836, %r2835, %r2835, 20; + add.s32 %r2837, %r2831, %r2159; + add.s32 %r2838, %r2837, %r2836; + xor.b32 %r2839, %r2838, %r2833; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 24; + add.s32 %r2841, %r2840, %r2834; + xor.b32 %r2842, %r2841, %r2836; + shf.l.wrap.b32 %r2843, %r2842, %r2842, 25; + add.s32 %r2844, %r2796, %r2103; + add.s32 %r2845, %r2844, %r2815; + xor.b32 %r2846, %r2845, %r2784; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 16; + add.s32 %r2848, %r2847, %r2771; + xor.b32 %r2849, %r2848, %r2815; + shf.l.wrap.b32 %r2850, %r2849, %r2849, 20; + add.s32 %r2851, %r2845, %r2131; + add.s32 %r2852, %r2851, %r2850; + xor.b32 %r2853, %r2852, %r2847; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 24; + add.s32 %r2855, %r2854, %r2848; + xor.b32 %r2856, %r2855, %r2850; + shf.l.wrap.b32 %r2857, %r2856, %r2856, 25; + add.s32 %r2858, %r2810, %r2117; + add.s32 %r2859, %r2858, %r2773; + xor.b32 %r2860, %r2859, %r2798; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 16; + add.s32 %r2862, %r2861, %r2785; + xor.b32 %r2863, %r2862, %r2773; + shf.l.wrap.b32 %r2864, %r2863, %r2863, 20; + add.s32 %r2865, %r2859, %r2138; + add.s32 %r2866, %r2865, %r2864; + xor.b32 %r2867, %r2866, %r2861; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 24; + add.s32 %r2869, %r2868, %r2862; + xor.b32 %r2870, %r2869, %r2864; + shf.l.wrap.b32 %r2871, %r2870, %r2870, 25; + add.s32 %r2872, %r2824, %r2166; + add.s32 %r2873, %r2872, %r2871; + xor.b32 %r2874, %r2873, %r2840; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 16; + add.s32 %r2876, %r2875, %r2855; + xor.b32 %r2877, %r2876, %r2871; + shf.l.wrap.b32 %r2878, %r2877, %r2877, 20; + add.s32 %r2879, %r2873, %r2194; + add.s32 %r2880, %r2879, %r2878; + xor.b32 %r2881, %r2880, %r2875; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 24; + add.s32 %r2883, %r2882, %r2876; + xor.b32 %r2884, %r2883, %r2878; + shf.l.wrap.b32 %r2885, %r2884, %r2884, 25; + add.s32 %r2886, %r2838, %r2124; + add.s32 %r2887, %r2886, %r2829; + xor.b32 %r2888, %r2887, %r2854; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 16; + add.s32 %r2890, %r2889, %r2869; + xor.b32 %r2891, %r2890, %r2829; + shf.l.wrap.b32 %r2892, %r2891, %r2891, 20; + add.s32 %r2893, %r2887, %r2089; + add.s32 %r2894, %r2893, %r2892; + xor.b32 %r2895, %r2894, %r2889; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 24; + add.s32 %r2897, %r2896, %r2890; + xor.b32 %r2898, %r2897, %r2892; + shf.l.wrap.b32 %r2899, %r2898, %r2898, 25; + add.s32 %r2900, %r2852, %r2096; + add.s32 %r2901, %r2900, %r2843; + xor.b32 %r2902, %r2901, %r2868; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 16; + add.s32 %r2904, %r2903, %r2827; + xor.b32 %r2905, %r2904, %r2843; + shf.l.wrap.b32 %r2906, %r2905, %r2905, 20; + add.s32 %r2907, %r2901, %r2152; + add.s32 %r2908, %r2907, %r2906; + xor.b32 %r2909, %r2908, %r2903; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 24; + add.s32 %r2911, %r2910, %r2904; + xor.b32 %r2912, %r2911, %r2906; + shf.l.wrap.b32 %r2913, %r2912, %r2912, 25; + add.s32 %r2914, %r2866, %r2145; + add.s32 %r2915, %r2914, %r2857; + xor.b32 %r2916, %r2915, %r2826; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 16; + add.s32 %r2918, %r2917, %r2841; + xor.b32 %r2919, %r2918, %r2857; + shf.l.wrap.b32 %r2920, %r2919, %r2919, 20; + add.s32 %r2921, %r2915, %r2131; + add.s32 %r2922, %r2921, 
%r2920; + xor.b32 %r2923, %r2922, %r2917; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 24; + add.s32 %r2925, %r2924, %r2918; + xor.b32 %r2926, %r2925, %r2920; + shf.l.wrap.b32 %r2927, %r2926, %r2926, 25; + add.s32 %r2928, %r2880, %r2187; + add.s32 %r2929, %r2928, %r2899; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 16; + add.s32 %r2932, %r2931, %r2911; + xor.b32 %r2933, %r2932, %r2899; + shf.l.wrap.b32 %r2934, %r2933, %r2933, 20; + add.s32 %r2935, %r2929, %r2159; + add.s32 %r2936, %r2935, %r2934; + xor.b32 %r2937, %r2936, %r2931; + shf.l.wrap.b32 %r2938, %r2937, %r2937, 24; + add.s32 %r2939, %r2938, %r2932; + xor.b32 %r2940, %r2939, %r2934; + shf.l.wrap.b32 %r2941, %r2940, %r2940, 25; + add.s32 %r2942, %r2894, %r2103; + add.s32 %r2943, %r2942, %r2913; + xor.b32 %r2944, %r2943, %r2882; + shf.l.wrap.b32 %r2945, %r2944, %r2944, 16; + add.s32 %r2946, %r2945, %r2925; + xor.b32 %r2947, %r2946, %r2913; + shf.l.wrap.b32 %r2948, %r2947, %r2947, 20; + add.s32 %r2949, %r2943, %r2173; + add.s32 %r2950, %r2949, %r2948; + xor.b32 %r2951, %r2950, %r2945; + shf.l.wrap.b32 %r2952, %r2951, %r2951, 24; + add.s32 %r2953, %r2952, %r2946; + xor.b32 %r2954, %r2953, %r2948; + shf.l.wrap.b32 %r2955, %r2954, %r2954, 25; + add.s32 %r2956, %r2908, %r2110; + add.s32 %r2957, %r2956, %r2927; + xor.b32 %r2958, %r2957, %r2896; + shf.l.wrap.b32 %r2959, %r2958, %r2958, 16; + add.s32 %r2960, %r2959, %r2883; + xor.b32 %r2961, %r2960, %r2927; + shf.l.wrap.b32 %r2962, %r2961, %r2961, 20; + add.s32 %r2963, %r2957, %r2117; + add.s32 %r2964, %r2963, %r2962; + xor.b32 %r2965, %r2964, %r2959; + shf.l.wrap.b32 %r2966, %r2965, %r2965, 24; + add.s32 %r2967, %r2966, %r2960; + xor.b32 %r2968, %r2967, %r2962; + shf.l.wrap.b32 %r2969, %r2968, %r2968, 25; + add.s32 %r2970, %r2922, %r2138; + add.s32 %r2971, %r2970, %r2885; + xor.b32 %r2972, %r2971, %r2910; + shf.l.wrap.b32 %r2973, %r2972, %r2972, 16; + add.s32 %r2974, %r2973, %r2897; + xor.b32 %r2975, %r2974, %r2885; + shf.l.wrap.b32 %r2976, %r2975, %r2975, 20; + add.s32 %r2977, %r2971, %r2180; + add.s32 %r2978, %r2977, %r2976; + xor.b32 %r2979, %r2978, %r2973; + shf.l.wrap.b32 %r2980, %r2979, %r2979, 24; + add.s32 %r2981, %r2980, %r2974; + xor.b32 %r2982, %r2981, %r2976; + shf.l.wrap.b32 %r2983, %r2982, %r2982, 25; + xor.b32 %r3964, %r2967, %r2936; + xor.b32 %r3963, %r2981, %r2950; + xor.b32 %r3962, %r2939, %r2964; + xor.b32 %r3961, %r2953, %r2978; + xor.b32 %r3960, %r2983, %r2952; + xor.b32 %r3959, %r2941, %r2966; + xor.b32 %r3958, %r2955, %r2980; + xor.b32 %r3957, %r2969, %r2938; + add.s16 %rs198, %rs198, 1; + st.local.u8 [%rd56+1], %rs198; + add.s64 %rd170, %rd170, 64; + add.s64 %rd171, %rd171, -64; + setp.gt.u64 %p24, %rd171, 64; + @%p24 bra $L__BB0_24; + +$L__BB0_25: + min.u64 %rd63, %rd171, 64; + setp.eq.s64 %p25, %rd63, 0; + mov.u16 %rs200, %rs199; + mov.u16 %rs201, %rs199; + mov.u16 %rs202, %rs199; + mov.u16 %rs203, %rs199; + mov.u16 %rs204, %rs199; + mov.u16 %rs205, %rs199; + mov.u16 %rs206, %rs199; + mov.u16 %rs207, %rs199; + mov.u16 %rs208, %rs199; + mov.u16 %rs209, %rs199; + mov.u16 %rs210, %rs199; + mov.u16 %rs211, %rs199; + mov.u16 %rs212, %rs199; + mov.u16 %rs213, %rs199; + mov.u16 %rs214, %rs199; + mov.u16 %rs215, %rs199; + mov.u16 %rs216, %rs199; + mov.u16 %rs217, %rs199; + mov.u16 %rs218, %rs199; + mov.u16 %rs219, %rs199; + mov.u16 %rs220, %rs199; + mov.u16 %rs221, %rs199; + mov.u16 %rs222, %rs199; + mov.u16 %rs223, %rs199; + mov.u16 %rs224, %rs199; + mov.u16 %rs225, %rs199; + mov.u16 %rs226, %rs199; + mov.u16 %rs227, %rs199; + mov.u16 %rs228, 
%rs199; + mov.u16 %rs229, %rs199; + mov.u16 %rs230, %rs199; + mov.u16 %rs231, %rs199; + mov.u16 %rs232, %rs199; + @%p25 bra $L__BB0_29; + + mov.u64 %rd172, 0; + +$L__BB0_27: + add.s64 %rd131, %rd170, %rd172; + ld.u8 %rs121, [%rd131]; + add.s64 %rd132, %rd53, %rd172; + st.local.u8 [%rd132], %rs121; + add.s64 %rd172, %rd172, 1; + setp.lt.u64 %p26, %rd172, %rd63; + @%p26 bra $L__BB0_27; + + ld.local.v4.u16 {%rs229, %rs230, %rs231, %rs232}, [%rd53]; + ld.local.v4.u16 {%rs225, %rs226, %rs227, %rs228}, [%rd53+8]; + ld.local.v4.u16 {%rs221, %rs222, %rs223, %rs224}, [%rd53+16]; + ld.local.v4.u16 {%rs217, %rs218, %rs219, %rs220}, [%rd53+24]; + ld.local.v4.u16 {%rs213, %rs214, %rs215, %rs216}, [%rd53+32]; + ld.local.v4.u16 {%rs209, %rs210, %rs211, %rs212}, [%rd53+40]; + ld.local.v4.u16 {%rs205, %rs206, %rs207, %rs208}, [%rd53+48]; + ld.local.v4.u16 {%rs202, %rs203, %rs204, %rs153}, [%rd53+56]; + ld.local.u8 %rs201, [%rd53+61]; + ld.local.v2.u8 {%rs199, %rs200}, [%rd53+62]; + +$L__BB0_29: + ld.param.u64 %rd138, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd137, %rd138; + ld.local.v4.u8 {%rs156, %rs157, %rs158, %rs159}, [%rd53+64]; + cvt.u16.u64 %rs162, %rd63; + add.s16 %rs163, %rs156, %rs162; + st.local.u8 [%rd53+64], %rs163; + setp.eq.s16 %p27, %rs157, 0; + selp.u16 %rs164, 1, 0, %p27; + or.b16 %rs165, %rs158, %rs164; + or.b16 %rs166, %rs165, 2; + shr.u16 %rs167, %rs229, 8; + shr.u16 %rs168, %rs230, 8; + shr.u16 %rs169, %rs231, 8; + shr.u16 %rs170, %rs232, 8; + shr.u16 %rs171, %rs225, 8; + shr.u16 %rs172, %rs226, 8; + shr.u16 %rs173, %rs227, 8; + shr.u16 %rs174, %rs228, 8; + shr.u16 %rs175, %rs221, 8; + shr.u16 %rs176, %rs222, 8; + shr.u16 %rs177, %rs223, 8; + shr.u16 %rs178, %rs224, 8; + shr.u16 %rs179, %rs217, 8; + shr.u16 %rs180, %rs218, 8; + shr.u16 %rs181, %rs219, 8; + shr.u16 %rs182, %rs220, 8; + shr.u16 %rs183, %rs213, 8; + shr.u16 %rs184, %rs214, 8; + shr.u16 %rs185, %rs215, 8; + shr.u16 %rs186, %rs216, 8; + shr.u16 %rs187, %rs209, 8; + shr.u16 %rs188, %rs210, 8; + shr.u16 %rs189, %rs211, 8; + shr.u16 %rs190, %rs212, 8; + shr.u16 %rs191, %rs205, 8; + shr.u16 %rs192, %rs206, 8; + shr.u16 %rs193, %rs207, 8; + shr.u16 %rs194, %rs208, 8; + shr.u16 %rs195, %rs202, 8; + shr.u16 %rs196, %rs203, 8; + shl.b64 %rd133, %rd151, 5; + add.s64 %rd134, %rd137, %rd133; + cvt.u32.u16 %r2984, %rs229; + and.b32 %r2985, %r2984, 255; + cvt.u32.u16 %r2986, %rs167; + prmt.b32 %r2987, %r2986, %r2985, 30212; + cvt.u32.u16 %r2988, %rs230; + prmt.b32 %r2989, %r2988, %r2987, 28756; + cvt.u32.u16 %r2990, %rs168; + prmt.b32 %r2991, %r2990, %r2989, 1620; + cvt.u32.u16 %r2992, %rs231; + and.b32 %r2993, %r2992, 255; + cvt.u32.u16 %r2994, %rs169; + prmt.b32 %r2995, %r2994, %r2993, 30212; + cvt.u32.u16 %r2996, %rs232; + prmt.b32 %r2997, %r2996, %r2995, 28756; + cvt.u32.u16 %r2998, %rs170; + prmt.b32 %r2999, %r2998, %r2997, 1620; + cvt.u32.u16 %r3000, %rs225; + and.b32 %r3001, %r3000, 255; + cvt.u32.u16 %r3002, %rs171; + prmt.b32 %r3003, %r3002, %r3001, 30212; + cvt.u32.u16 %r3004, %rs226; + prmt.b32 %r3005, %r3004, %r3003, 28756; + cvt.u32.u16 %r3006, %rs172; + prmt.b32 %r3007, %r3006, %r3005, 1620; + cvt.u32.u16 %r3008, %rs227; + and.b32 %r3009, %r3008, 255; + cvt.u32.u16 %r3010, %rs173; + prmt.b32 %r3011, %r3010, %r3009, 30212; + cvt.u32.u16 %r3012, %rs228; + prmt.b32 %r3013, %r3012, %r3011, 28756; + cvt.u32.u16 %r3014, %rs174; + prmt.b32 %r3015, %r3014, %r3013, 1620; + cvt.u32.u16 %r3016, %rs221; + and.b32 %r3017, %r3016, 255; + cvt.u32.u16 
%r3018, %rs175; + prmt.b32 %r3019, %r3018, %r3017, 30212; + cvt.u32.u16 %r3020, %rs222; + prmt.b32 %r3021, %r3020, %r3019, 28756; + cvt.u32.u16 %r3022, %rs176; + prmt.b32 %r3023, %r3022, %r3021, 1620; + cvt.u32.u16 %r3024, %rs223; + and.b32 %r3025, %r3024, 255; + cvt.u32.u16 %r3026, %rs177; + prmt.b32 %r3027, %r3026, %r3025, 30212; + cvt.u32.u16 %r3028, %rs224; + prmt.b32 %r3029, %r3028, %r3027, 28756; + cvt.u32.u16 %r3030, %rs178; + prmt.b32 %r3031, %r3030, %r3029, 1620; + cvt.u32.u16 %r3032, %rs217; + and.b32 %r3033, %r3032, 255; + cvt.u32.u16 %r3034, %rs179; + prmt.b32 %r3035, %r3034, %r3033, 30212; + cvt.u32.u16 %r3036, %rs218; + prmt.b32 %r3037, %r3036, %r3035, 28756; + cvt.u32.u16 %r3038, %rs180; + prmt.b32 %r3039, %r3038, %r3037, 1620; + cvt.u32.u16 %r3040, %rs219; + and.b32 %r3041, %r3040, 255; + cvt.u32.u16 %r3042, %rs181; + prmt.b32 %r3043, %r3042, %r3041, 30212; + cvt.u32.u16 %r3044, %rs220; + prmt.b32 %r3045, %r3044, %r3043, 28756; + cvt.u32.u16 %r3046, %rs182; + prmt.b32 %r3047, %r3046, %r3045, 1620; + cvt.u32.u16 %r3048, %rs213; + and.b32 %r3049, %r3048, 255; + cvt.u32.u16 %r3050, %rs183; + prmt.b32 %r3051, %r3050, %r3049, 30212; + cvt.u32.u16 %r3052, %rs214; + prmt.b32 %r3053, %r3052, %r3051, 28756; + cvt.u32.u16 %r3054, %rs184; + prmt.b32 %r3055, %r3054, %r3053, 1620; + cvt.u32.u16 %r3056, %rs215; + and.b32 %r3057, %r3056, 255; + cvt.u32.u16 %r3058, %rs185; + prmt.b32 %r3059, %r3058, %r3057, 30212; + cvt.u32.u16 %r3060, %rs216; + prmt.b32 %r3061, %r3060, %r3059, 28756; + cvt.u32.u16 %r3062, %rs186; + prmt.b32 %r3063, %r3062, %r3061, 1620; + cvt.u32.u16 %r3064, %rs209; + and.b32 %r3065, %r3064, 255; + cvt.u32.u16 %r3066, %rs187; + prmt.b32 %r3067, %r3066, %r3065, 30212; + cvt.u32.u16 %r3068, %rs210; + prmt.b32 %r3069, %r3068, %r3067, 28756; + cvt.u32.u16 %r3070, %rs188; + prmt.b32 %r3071, %r3070, %r3069, 1620; + cvt.u32.u16 %r3072, %rs211; + and.b32 %r3073, %r3072, 255; + cvt.u32.u16 %r3074, %rs189; + prmt.b32 %r3075, %r3074, %r3073, 30212; + cvt.u32.u16 %r3076, %rs212; + prmt.b32 %r3077, %r3076, %r3075, 28756; + cvt.u32.u16 %r3078, %rs190; + prmt.b32 %r3079, %r3078, %r3077, 1620; + cvt.u32.u16 %r3080, %rs205; + and.b32 %r3081, %r3080, 255; + cvt.u32.u16 %r3082, %rs191; + prmt.b32 %r3083, %r3082, %r3081, 30212; + cvt.u32.u16 %r3084, %rs206; + prmt.b32 %r3085, %r3084, %r3083, 28756; + cvt.u32.u16 %r3086, %rs192; + prmt.b32 %r3087, %r3086, %r3085, 1620; + cvt.u32.u16 %r3088, %rs207; + and.b32 %r3089, %r3088, 255; + cvt.u32.u16 %r3090, %rs193; + prmt.b32 %r3091, %r3090, %r3089, 30212; + cvt.u32.u16 %r3092, %rs208; + prmt.b32 %r3093, %r3092, %r3091, 28756; + cvt.u32.u16 %r3094, %rs194; + prmt.b32 %r3095, %r3094, %r3093, 1620; + cvt.u32.u16 %r3096, %rs202; + and.b32 %r3097, %r3096, 255; + cvt.u32.u16 %r3098, %rs195; + prmt.b32 %r3099, %r3098, %r3097, 30212; + cvt.u32.u16 %r3100, %rs203; + prmt.b32 %r3101, %r3100, %r3099, 28756; + cvt.u32.u16 %r3102, %rs196; + prmt.b32 %r3103, %r3102, %r3101, 1620; + cvt.u32.u16 %r3104, %rs204; + and.b32 %r3105, %r3104, 255; + cvt.u32.u16 %r3106, %rs201; + prmt.b32 %r3107, %r3106, %r3105, 30212; + cvt.u32.u16 %r3108, %rs199; + shl.b32 %r3109, %r3108, 16; + and.b32 %r3110, %r3109, 16711680; + or.b32 %r3111, %r3107, %r3110; + cvt.u32.u16 %r3112, %rs200; + shl.b32 %r3113, %r3112, 24; + or.b32 %r3114, %r3111, %r3113; + cvt.u32.u16 %r3115, %rs163; + and.b32 %r3116, %r3115, 255; + cvt.u32.u16 %r3117, %rs166; + and.b32 %r3118, %r3117, 255; + add.s32 %r3119, %r3960, %r3964; + add.s32 %r3120, %r3119, %r2991; + xor.b32 %r3121, %r3120, %r36; + shf.l.wrap.b32 
%r3122, %r3121, %r3121, 16; + add.s32 %r3123, %r3122, 1779033703; + xor.b32 %r3124, %r3123, %r3960; + shf.l.wrap.b32 %r3125, %r3124, %r3124, 20; + add.s32 %r3126, %r2999, %r3120; + add.s32 %r3127, %r3126, %r3125; + xor.b32 %r3128, %r3127, %r3122; + shf.l.wrap.b32 %r3129, %r3128, %r3128, 24; + add.s32 %r3130, %r3129, %r3123; + xor.b32 %r3131, %r3130, %r3125; + shf.l.wrap.b32 %r3132, %r3131, %r3131, 25; + add.s32 %r3133, %r3959, %r3963; + add.s32 %r3134, %r3133, %r3007; + xor.b32 %r3135, %r3134, %r37; + shf.l.wrap.b32 %r3136, %r3135, %r3135, 16; + add.s32 %r3137, %r3136, -1150833019; + xor.b32 %r3138, %r3137, %r3959; + shf.l.wrap.b32 %r3139, %r3138, %r3138, 20; + add.s32 %r3140, %r3015, %r3134; + add.s32 %r3141, %r3140, %r3139; + xor.b32 %r3142, %r3141, %r3136; + shf.l.wrap.b32 %r3143, %r3142, %r3142, 24; + add.s32 %r3144, %r3143, %r3137; + xor.b32 %r3145, %r3144, %r3139; + shf.l.wrap.b32 %r3146, %r3145, %r3145, 25; + add.s32 %r3147, %r3958, %r3962; + add.s32 %r3148, %r3147, %r3023; + xor.b32 %r3149, %r3148, %r3116; + shr.u32 %r3150, %r3148, 16; + shl.b32 %r3151, %r3149, 16; + or.b32 %r3152, %r3151, %r3150; + add.s32 %r3153, %r3152, 1013904242; + xor.b32 %r3154, %r3153, %r3958; + shf.l.wrap.b32 %r3155, %r3154, %r3154, 20; + add.s32 %r3156, %r3031, %r3148; + add.s32 %r3157, %r3156, %r3155; + xor.b32 %r3158, %r3157, %r3152; + shf.l.wrap.b32 %r3159, %r3158, %r3158, 24; + add.s32 %r3160, %r3159, %r3153; + xor.b32 %r3161, %r3160, %r3155; + shf.l.wrap.b32 %r3162, %r3161, %r3161, 25; + add.s32 %r3163, %r3957, %r3961; + add.s32 %r3164, %r3163, %r3039; + xor.b32 %r3165, %r3164, %r3118; + shr.u32 %r3166, %r3164, 16; + shl.b32 %r3167, %r3165, 16; + or.b32 %r3168, %r3167, %r3166; + add.s32 %r3169, %r3168, -1521486534; + xor.b32 %r3170, %r3169, %r3957; + shf.l.wrap.b32 %r3171, %r3170, %r3170, 20; + add.s32 %r3172, %r3047, %r3164; + add.s32 %r3173, %r3172, %r3171; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 24; + add.s32 %r3176, %r3175, %r3169; + xor.b32 %r3177, %r3176, %r3171; + shf.l.wrap.b32 %r3178, %r3177, %r3177, 25; + add.s32 %r3179, %r3146, %r3127; + add.s32 %r3180, %r3179, %r3055; + xor.b32 %r3181, %r3175, %r3180; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 16; + add.s32 %r3183, %r3182, %r3160; + xor.b32 %r3184, %r3183, %r3146; + shf.l.wrap.b32 %r3185, %r3184, %r3184, 20; + add.s32 %r3186, %r3063, %r3180; + add.s32 %r3187, %r3186, %r3185; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 24; + add.s32 %r3190, %r3189, %r3183; + xor.b32 %r3191, %r3190, %r3185; + shf.l.wrap.b32 %r3192, %r3191, %r3191, 25; + add.s32 %r3193, %r3162, %r3141; + add.s32 %r3194, %r3193, %r3071; + xor.b32 %r3195, %r3194, %r3129; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 16; + add.s32 %r3197, %r3196, %r3176; + xor.b32 %r3198, %r3197, %r3162; + shf.l.wrap.b32 %r3199, %r3198, %r3198, 20; + add.s32 %r3200, %r3079, %r3194; + add.s32 %r3201, %r3200, %r3199; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 24; + add.s32 %r3204, %r3203, %r3197; + xor.b32 %r3205, %r3204, %r3199; + shf.l.wrap.b32 %r3206, %r3205, %r3205, 25; + add.s32 %r3207, %r3178, %r3157; + add.s32 %r3208, %r3207, %r3087; + xor.b32 %r3209, %r3208, %r3143; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 16; + add.s32 %r3211, %r3210, %r3130; + xor.b32 %r3212, %r3211, %r3178; + shf.l.wrap.b32 %r3213, %r3212, %r3212, 20; + add.s32 %r3214, %r3095, %r3208; + add.s32 %r3215, %r3214, %r3213; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 24; + add.s32 %r3218, %r3217, %r3211; 
+ xor.b32 %r3219, %r3218, %r3213; + shf.l.wrap.b32 %r3220, %r3219, %r3219, 25; + add.s32 %r3221, %r3173, %r3132; + add.s32 %r3222, %r3221, %r3103; + xor.b32 %r3223, %r3222, %r3159; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 16; + add.s32 %r3225, %r3224, %r3144; + xor.b32 %r3226, %r3225, %r3132; + shf.l.wrap.b32 %r3227, %r3226, %r3226, 20; + add.s32 %r3228, %r3114, %r3222; + add.s32 %r3229, %r3228, %r3227; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 24; + add.s32 %r3232, %r3231, %r3225; + xor.b32 %r3233, %r3232, %r3227; + shf.l.wrap.b32 %r3234, %r3233, %r3233, 25; + add.s32 %r3235, %r3187, %r3007; + add.s32 %r3236, %r3235, %r3234; + xor.b32 %r3237, %r3236, %r3203; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 16; + add.s32 %r3239, %r3238, %r3218; + xor.b32 %r3240, %r3239, %r3234; + shf.l.wrap.b32 %r3241, %r3240, %r3240, 20; + add.s32 %r3242, %r3236, %r3039; + add.s32 %r3243, %r3242, %r3241; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 24; + add.s32 %r3246, %r3245, %r3239; + xor.b32 %r3247, %r3246, %r3241; + shf.l.wrap.b32 %r3248, %r3247, %r3247, 25; + add.s32 %r3249, %r3201, %r3015; + add.s32 %r3250, %r3249, %r3192; + xor.b32 %r3251, %r3217, %r3250; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 16; + add.s32 %r3253, %r3232, %r3252; + xor.b32 %r3254, %r3253, %r3192; + shf.l.wrap.b32 %r3255, %r3254, %r3254, 20; + add.s32 %r3256, %r3250, %r3071; + add.s32 %r3257, %r3256, %r3255; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 24; + add.s32 %r3260, %r3259, %r3253; + xor.b32 %r3261, %r3260, %r3255; + shf.l.wrap.b32 %r3262, %r3261, %r3261, 25; + add.s32 %r3263, %r3206, %r3047; + add.s32 %r3264, %r3263, %r3215; + xor.b32 %r3265, %r3231, %r3264; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 16; + add.s32 %r3267, %r3266, %r3190; + xor.b32 %r3268, %r3267, %r3206; + shf.l.wrap.b32 %r3269, %r3268, %r3268, 20; + add.s32 %r3270, %r3264, %r2991; + add.s32 %r3271, %r3270, %r3269; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 24; + add.s32 %r3274, %r3273, %r3267; + xor.b32 %r3275, %r3274, %r3269; + shf.l.wrap.b32 %r3276, %r3275, %r3275, 25; + add.s32 %r3277, %r3220, %r3023; + add.s32 %r3278, %r3277, %r3229; + xor.b32 %r3279, %r3278, %r3189; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 16; + add.s32 %r3281, %r3280, %r3204; + xor.b32 %r3282, %r3281, %r3220; + shf.l.wrap.b32 %r3283, %r3282, %r3282, 20; + add.s32 %r3284, %r3278, %r3095; + add.s32 %r3285, %r3284, %r3283; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 24; + add.s32 %r3288, %r3287, %r3281; + xor.b32 %r3289, %r3288, %r3283; + shf.l.wrap.b32 %r3290, %r3289, %r3289, 25; + add.s32 %r3291, %r3262, %r2999; + add.s32 %r3292, %r3291, %r3243; + xor.b32 %r3293, %r3292, %r3287; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 16; + add.s32 %r3295, %r3294, %r3274; + xor.b32 %r3296, %r3295, %r3262; + shf.l.wrap.b32 %r3297, %r3296, %r3296, 20; + add.s32 %r3298, %r3292, %r3079; + add.s32 %r3299, %r3298, %r3297; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 24; + add.s32 %r3302, %r3301, %r3295; + xor.b32 %r3303, %r3302, %r3297; + shf.l.wrap.b32 %r3304, %r3303, %r3303, 25; + add.s32 %r3305, %r3257, %r3087; + add.s32 %r3306, %r3305, %r3276; + xor.b32 %r3307, %r3245, %r3306; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 16; + add.s32 %r3309, %r3308, %r3288; + xor.b32 %r3310, %r3309, %r3276; + shf.l.wrap.b32 %r3311, %r3310, %r3310, 20; + add.s32 %r3312, %r3306, %r3031; + add.s32 %r3313, %r3312, %r3311; + xor.b32 %r3314, %r3313, 
%r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 24; + add.s32 %r3316, %r3315, %r3309; + xor.b32 %r3317, %r3316, %r3311; + shf.l.wrap.b32 %r3318, %r3317, %r3317, 25; + add.s32 %r3319, %r3271, %r3063; + add.s32 %r3320, %r3319, %r3290; + xor.b32 %r3321, %r3320, %r3259; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 16; + add.s32 %r3323, %r3322, %r3246; + xor.b32 %r3324, %r3323, %r3290; + shf.l.wrap.b32 %r3325, %r3324, %r3324, 20; + add.s32 %r3326, %r3320, %r3103; + add.s32 %r3327, %r3326, %r3325; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 24; + add.s32 %r3330, %r3329, %r3323; + xor.b32 %r3331, %r3330, %r3325; + shf.l.wrap.b32 %r3332, %r3331, %r3331, 25; + add.s32 %r3333, %r3285, %r3114; + add.s32 %r3334, %r3333, %r3248; + xor.b32 %r3335, %r3334, %r3273; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 16; + add.s32 %r3337, %r3336, %r3260; + xor.b32 %r3338, %r3337, %r3248; + shf.l.wrap.b32 %r3339, %r3338, %r3338, 20; + add.s32 %r3340, %r3334, %r3055; + add.s32 %r3341, %r3340, %r3339; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 24; + add.s32 %r3344, %r3343, %r3337; + xor.b32 %r3345, %r3344, %r3339; + shf.l.wrap.b32 %r3346, %r3345, %r3345, 25; + add.s32 %r3347, %r3299, %r3015; + add.s32 %r3348, %r3347, %r3346; + xor.b32 %r3349, %r3348, %r3315; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 16; + add.s32 %r3351, %r3350, %r3330; + xor.b32 %r3352, %r3351, %r3346; + shf.l.wrap.b32 %r3353, %r3352, %r3352, 20; + add.s32 %r3354, %r3348, %r3023; + add.s32 %r3355, %r3354, %r3353; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 24; + add.s32 %r3358, %r3357, %r3351; + xor.b32 %r3359, %r3358, %r3353; + shf.l.wrap.b32 %r3360, %r3359, %r3359, 25; + add.s32 %r3361, %r3313, %r3071; + add.s32 %r3362, %r3361, %r3304; + xor.b32 %r3363, %r3362, %r3329; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 16; + add.s32 %r3365, %r3364, %r3344; + xor.b32 %r3366, %r3365, %r3304; + shf.l.wrap.b32 %r3367, %r3366, %r3366, 20; + add.s32 %r3368, %r3362, %r3087; + add.s32 %r3369, %r3368, %r3367; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 24; + add.s32 %r3372, %r3371, %r3365; + xor.b32 %r3373, %r3372, %r3367; + shf.l.wrap.b32 %r3374, %r3373, %r3373, 25; + add.s32 %r3375, %r3327, %r3095; + add.s32 %r3376, %r3375, %r3318; + xor.b32 %r3377, %r3343, %r3376; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 16; + add.s32 %r3379, %r3378, %r3302; + xor.b32 %r3380, %r3379, %r3318; + shf.l.wrap.b32 %r3381, %r3380, %r3380, 20; + add.s32 %r3382, %r3376, %r3007; + add.s32 %r3383, %r3382, %r3381; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 24; + add.s32 %r3386, %r3385, %r3379; + xor.b32 %r3387, %r3386, %r3381; + shf.l.wrap.b32 %r3388, %r3387, %r3387, 25; + add.s32 %r3389, %r3332, %r3047; + add.s32 %r3390, %r3389, %r3341; + xor.b32 %r3391, %r3390, %r3301; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 16; + add.s32 %r3393, %r3392, %r3316; + xor.b32 %r3394, %r3393, %r3332; + shf.l.wrap.b32 %r3395, %r3394, %r3394, 20; + add.s32 %r3396, %r3390, %r3103; + add.s32 %r3397, %r3396, %r3395; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 24; + add.s32 %r3400, %r3399, %r3393; + xor.b32 %r3401, %r3400, %r3395; + shf.l.wrap.b32 %r3402, %r3401, %r3401, 25; + add.s32 %r3403, %r3374, %r3039; + add.s32 %r3404, %r3403, %r3355; + xor.b32 %r3405, %r3404, %r3399; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 16; + add.s32 %r3407, %r3406, %r3386; + xor.b32 %r3408, %r3407, %r3374; + shf.l.wrap.b32 %r3409, %r3408, %r3408, 20; + 
add.s32 %r3410, %r3404, %r3031; + add.s32 %r3411, %r3410, %r3409; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 24; + add.s32 %r3414, %r3413, %r3407; + xor.b32 %r3415, %r3414, %r3409; + shf.l.wrap.b32 %r3416, %r3415, %r3415, 25; + add.s32 %r3417, %r3369, %r3063; + add.s32 %r3418, %r3417, %r3388; + xor.b32 %r3419, %r3357, %r3418; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 16; + add.s32 %r3421, %r3420, %r3400; + xor.b32 %r3422, %r3421, %r3388; + shf.l.wrap.b32 %r3423, %r3422, %r3422, 20; + add.s32 %r3424, %r3418, %r2991; + add.s32 %r3425, %r3424, %r3423; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 24; + add.s32 %r3428, %r3427, %r3421; + xor.b32 %r3429, %r3428, %r3423; + shf.l.wrap.b32 %r3430, %r3429, %r3429, 25; + add.s32 %r3431, %r3383, %r3079; + add.s32 %r3432, %r3431, %r3402; + xor.b32 %r3433, %r3432, %r3371; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 16; + add.s32 %r3435, %r3434, %r3358; + xor.b32 %r3436, %r3435, %r3402; + shf.l.wrap.b32 %r3437, %r3436, %r3436, 20; + add.s32 %r3438, %r3432, %r3114; + add.s32 %r3439, %r3438, %r3437; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 24; + add.s32 %r3442, %r3441, %r3435; + xor.b32 %r3443, %r3442, %r3437; + shf.l.wrap.b32 %r3444, %r3443, %r3443, 25; + add.s32 %r3445, %r3397, %r3055; + add.s32 %r3446, %r3445, %r3360; + xor.b32 %r3447, %r3446, %r3385; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 16; + add.s32 %r3449, %r3448, %r3372; + xor.b32 %r3450, %r3449, %r3360; + shf.l.wrap.b32 %r3451, %r3450, %r3450, 20; + add.s32 %r3452, %r3446, %r2999; + add.s32 %r3453, %r3452, %r3451; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 24; + add.s32 %r3456, %r3455, %r3449; + xor.b32 %r3457, %r3456, %r3451; + shf.l.wrap.b32 %r3458, %r3457, %r3457, 25; + add.s32 %r3459, %r3411, %r3071; + add.s32 %r3460, %r3459, %r3458; + xor.b32 %r3461, %r3460, %r3427; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 16; + add.s32 %r3463, %r3462, %r3442; + xor.b32 %r3464, %r3463, %r3458; + shf.l.wrap.b32 %r3465, %r3464, %r3464, 20; + add.s32 %r3466, %r3460, %r3047; + add.s32 %r3467, %r3466, %r3465; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 24; + add.s32 %r3470, %r3469, %r3463; + xor.b32 %r3471, %r3470, %r3465; + shf.l.wrap.b32 %r3472, %r3471, %r3471, 25; + add.s32 %r3473, %r3425, %r3087; + add.s32 %r3474, %r3473, %r3416; + xor.b32 %r3475, %r3474, %r3441; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 16; + add.s32 %r3477, %r3476, %r3456; + xor.b32 %r3478, %r3477, %r3416; + shf.l.wrap.b32 %r3479, %r3478, %r3478, 20; + add.s32 %r3480, %r3474, %r3063; + add.s32 %r3481, %r3480, %r3479; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 24; + add.s32 %r3484, %r3483, %r3477; + xor.b32 %r3485, %r3484, %r3479; + shf.l.wrap.b32 %r3486, %r3485, %r3485, 25; + add.s32 %r3487, %r3439, %r3103; + add.s32 %r3488, %r3487, %r3430; + xor.b32 %r3489, %r3455, %r3488; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 16; + add.s32 %r3491, %r3490, %r3414; + xor.b32 %r3492, %r3491, %r3430; + shf.l.wrap.b32 %r3493, %r3492, %r3492, 20; + add.s32 %r3494, %r3488, %r3015; + add.s32 %r3495, %r3494, %r3493; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 24; + add.s32 %r3498, %r3497, %r3491; + xor.b32 %r3499, %r3498, %r3493; + shf.l.wrap.b32 %r3500, %r3499, %r3499, 25; + add.s32 %r3501, %r3444, %r3095; + add.s32 %r3502, %r3501, %r3453; + xor.b32 %r3503, %r3502, %r3413; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 16; + add.s32 %r3505, %r3504, 
%r3428; + xor.b32 %r3506, %r3505, %r3444; + shf.l.wrap.b32 %r3507, %r3506, %r3506, 20; + add.s32 %r3508, %r3502, %r3114; + add.s32 %r3509, %r3508, %r3507; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 24; + add.s32 %r3512, %r3511, %r3505; + xor.b32 %r3513, %r3512, %r3507; + shf.l.wrap.b32 %r3514, %r3513, %r3513, 25; + add.s32 %r3515, %r3486, %r3023; + add.s32 %r3516, %r3515, %r3467; + xor.b32 %r3517, %r3516, %r3511; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 16; + add.s32 %r3519, %r3518, %r3498; + xor.b32 %r3520, %r3519, %r3486; + shf.l.wrap.b32 %r3521, %r3520, %r3520, 20; + add.s32 %r3522, %r3516, %r2991; + add.s32 %r3523, %r3522, %r3521; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 24; + add.s32 %r3526, %r3525, %r3519; + xor.b32 %r3527, %r3526, %r3521; + shf.l.wrap.b32 %r3528, %r3527, %r3527, 25; + add.s32 %r3529, %r3481, %r3079; + add.s32 %r3530, %r3529, %r3500; + xor.b32 %r3531, %r3469, %r3530; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 16; + add.s32 %r3533, %r3532, %r3512; + xor.b32 %r3534, %r3533, %r3500; + shf.l.wrap.b32 %r3535, %r3534, %r3534, 20; + add.s32 %r3536, %r3530, %r3007; + add.s32 %r3537, %r3536, %r3535; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 24; + add.s32 %r3540, %r3539, %r3533; + xor.b32 %r3541, %r3540, %r3535; + shf.l.wrap.b32 %r3542, %r3541, %r3541, 25; + add.s32 %r3543, %r3495, %r3031; + add.s32 %r3544, %r3543, %r3514; + xor.b32 %r3545, %r3544, %r3483; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 16; + add.s32 %r3547, %r3546, %r3470; + xor.b32 %r3548, %r3547, %r3514; + shf.l.wrap.b32 %r3549, %r3548, %r3548, 20; + add.s32 %r3550, %r3544, %r3055; + add.s32 %r3551, %r3550, %r3549; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 24; + add.s32 %r3554, %r3553, %r3547; + xor.b32 %r3555, %r3554, %r3549; + shf.l.wrap.b32 %r3556, %r3555, %r3555, 25; + add.s32 %r3557, %r3509, %r2999; + add.s32 %r3558, %r3557, %r3472; + xor.b32 %r3559, %r3558, %r3497; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 16; + add.s32 %r3561, %r3560, %r3484; + xor.b32 %r3562, %r3561, %r3472; + shf.l.wrap.b32 %r3563, %r3562, %r3562, 20; + add.s32 %r3564, %r3558, %r3039; + add.s32 %r3565, %r3564, %r3563; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 24; + add.s32 %r3568, %r3567, %r3561; + xor.b32 %r3569, %r3568, %r3563; + shf.l.wrap.b32 %r3570, %r3569, %r3569, 25; + add.s32 %r3571, %r3523, %r3087; + add.s32 %r3572, %r3571, %r3570; + xor.b32 %r3573, %r3572, %r3539; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 16; + add.s32 %r3575, %r3574, %r3554; + xor.b32 %r3576, %r3575, %r3570; + shf.l.wrap.b32 %r3577, %r3576, %r3576, 20; + add.s32 %r3578, %r3572, %r3095; + add.s32 %r3579, %r3578, %r3577; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 24; + add.s32 %r3582, %r3581, %r3575; + xor.b32 %r3583, %r3582, %r3577; + shf.l.wrap.b32 %r3584, %r3583, %r3583, 25; + add.s32 %r3585, %r3537, %r3063; + add.s32 %r3586, %r3585, %r3528; + xor.b32 %r3587, %r3586, %r3553; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 16; + add.s32 %r3589, %r3588, %r3568; + xor.b32 %r3590, %r3589, %r3528; + shf.l.wrap.b32 %r3591, %r3590, %r3590, 20; + add.s32 %r3592, %r3586, %r3079; + add.s32 %r3593, %r3592, %r3591; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 24; + add.s32 %r3596, %r3595, %r3589; + xor.b32 %r3597, %r3596, %r3591; + shf.l.wrap.b32 %r3598, %r3597, %r3597, 25; + add.s32 %r3599, %r3551, %r3114; + add.s32 %r3600, %r3599, %r3542; + xor.b32 %r3601, 
%r3567, %r3600; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 16; + add.s32 %r3603, %r3602, %r3526; + xor.b32 %r3604, %r3603, %r3542; + shf.l.wrap.b32 %r3605, %r3604, %r3604, 20; + add.s32 %r3606, %r3600, %r3071; + add.s32 %r3607, %r3606, %r3605; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 24; + add.s32 %r3610, %r3609, %r3603; + xor.b32 %r3611, %r3610, %r3605; + shf.l.wrap.b32 %r3612, %r3611, %r3611, 25; + add.s32 %r3613, %r3556, %r3103; + add.s32 %r3614, %r3613, %r3565; + xor.b32 %r3615, %r3614, %r3525; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 16; + add.s32 %r3617, %r3616, %r3540; + xor.b32 %r3618, %r3617, %r3556; + shf.l.wrap.b32 %r3619, %r3618, %r3618, 20; + add.s32 %r3620, %r3614, %r3055; + add.s32 %r3621, %r3620, %r3619; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 24; + add.s32 %r3624, %r3623, %r3617; + xor.b32 %r3625, %r3624, %r3619; + shf.l.wrap.b32 %r3626, %r3625, %r3625, 25; + add.s32 %r3627, %r3598, %r3047; + add.s32 %r3628, %r3627, %r3579; + xor.b32 %r3629, %r3628, %r3623; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 16; + add.s32 %r3631, %r3630, %r3610; + xor.b32 %r3632, %r3631, %r3598; + shf.l.wrap.b32 %r3633, %r3632, %r3632, 20; + add.s32 %r3634, %r3628, %r3007; + add.s32 %r3635, %r3634, %r3633; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 24; + add.s32 %r3638, %r3637, %r3631; + xor.b32 %r3639, %r3638, %r3633; + shf.l.wrap.b32 %r3640, %r3639, %r3639, 25; + add.s32 %r3641, %r3593, %r3031; + add.s32 %r3642, %r3641, %r3612; + xor.b32 %r3643, %r3581, %r3642; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 16; + add.s32 %r3645, %r3644, %r3624; + xor.b32 %r3646, %r3645, %r3612; + shf.l.wrap.b32 %r3647, %r3646, %r3646, 20; + add.s32 %r3648, %r3642, %r3015; + add.s32 %r3649, %r3648, %r3647; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 24; + add.s32 %r3652, %r3651, %r3645; + xor.b32 %r3653, %r3652, %r3647; + shf.l.wrap.b32 %r3654, %r3653, %r3653, 25; + add.s32 %r3655, %r3607, %r2991; + add.s32 %r3656, %r3655, %r3626; + xor.b32 %r3657, %r3656, %r3595; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 16; + add.s32 %r3659, %r3658, %r3582; + xor.b32 %r3660, %r3659, %r3626; + shf.l.wrap.b32 %r3661, %r3660, %r3660, 20; + add.s32 %r3662, %r3656, %r2999; + add.s32 %r3663, %r3662, %r3661; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 24; + add.s32 %r3666, %r3665, %r3659; + xor.b32 %r3667, %r3666, %r3661; + shf.l.wrap.b32 %r3668, %r3667, %r3667, 25; + add.s32 %r3669, %r3621, %r3039; + add.s32 %r3670, %r3669, %r3584; + xor.b32 %r3671, %r3670, %r3609; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 16; + add.s32 %r3673, %r3672, %r3596; + xor.b32 %r3674, %r3673, %r3584; + shf.l.wrap.b32 %r3675, %r3674, %r3674, 20; + add.s32 %r3676, %r3670, %r3023; + add.s32 %r3677, %r3676, %r3675; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 24; + add.s32 %r3680, %r3679, %r3673; + xor.b32 %r3681, %r3680, %r3675; + shf.l.wrap.b32 %r3682, %r3681, %r3681, 25; + add.s32 %r3683, %r3635, %r3063; + add.s32 %r3684, %r3683, %r3682; + xor.b32 %r3685, %r3684, %r3651; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 16; + add.s32 %r3687, %r3686, %r3666; + xor.b32 %r3688, %r3687, %r3682; + shf.l.wrap.b32 %r3689, %r3688, %r3688, 20; + add.s32 %r3690, %r3684, %r3103; + add.s32 %r3691, %r3690, %r3689; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 24; + add.s32 %r3694, %r3693, %r3687; + xor.b32 %r3695, %r3694, %r3689; + shf.l.wrap.b32 %r3696, %r3695, %r3695, 25; 
+ add.s32 %r3697, %r3649, %r3079; + add.s32 %r3698, %r3697, %r3640; + xor.b32 %r3699, %r3698, %r3665; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 16; + add.s32 %r3701, %r3700, %r3680; + xor.b32 %r3702, %r3701, %r3640; + shf.l.wrap.b32 %r3703, %r3702, %r3702, 20; + add.s32 %r3704, %r3698, %r3031; + add.s32 %r3705, %r3704, %r3703; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 24; + add.s32 %r3708, %r3707, %r3701; + xor.b32 %r3709, %r3708, %r3703; + shf.l.wrap.b32 %r3710, %r3709, %r3709, 25; + add.s32 %r3711, %r3663, %r3055; + add.s32 %r3712, %r3711, %r3654; + xor.b32 %r3713, %r3679, %r3712; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 16; + add.s32 %r3715, %r3714, %r3638; + xor.b32 %r3716, %r3715, %r3654; + shf.l.wrap.b32 %r3717, %r3716, %r3716, 20; + add.s32 %r3718, %r3712, %r3087; + add.s32 %r3719, %r3718, %r3717; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 24; + add.s32 %r3722, %r3721, %r3715; + xor.b32 %r3723, %r3722, %r3717; + shf.l.wrap.b32 %r3724, %r3723, %r3723, 25; + add.s32 %r3725, %r3668, %r3114; + add.s32 %r3726, %r3725, %r3677; + xor.b32 %r3727, %r3726, %r3637; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 16; + add.s32 %r3729, %r3728, %r3652; + xor.b32 %r3730, %r3729, %r3668; + shf.l.wrap.b32 %r3731, %r3730, %r3730, 20; + add.s32 %r3732, %r3726, %r2999; + add.s32 %r3733, %r3732, %r3731; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 24; + add.s32 %r3736, %r3735, %r3729; + xor.b32 %r3737, %r3736, %r3731; + shf.l.wrap.b32 %r3738, %r3737, %r3737, 25; + add.s32 %r3739, %r3710, %r3095; + add.s32 %r3740, %r3739, %r3691; + xor.b32 %r3741, %r3740, %r3735; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 16; + add.s32 %r3743, %r3742, %r3722; + xor.b32 %r3744, %r3743, %r3710; + shf.l.wrap.b32 %r3745, %r3744, %r3744, 20; + add.s32 %r3746, %r3740, %r3015; + add.s32 %r3747, %r3746, %r3745; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 24; + add.s32 %r3750, %r3749, %r3743; + xor.b32 %r3751, %r3750, %r3745; + shf.l.wrap.b32 %r3752, %r3751, %r3751, 25; + add.s32 %r3753, %r3705, %r2991; + add.s32 %r3754, %r3753, %r3724; + xor.b32 %r3755, %r3693, %r3754; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 16; + add.s32 %r3757, %r3756, %r3736; + xor.b32 %r3758, %r3757, %r3724; + shf.l.wrap.b32 %r3759, %r3758, %r3758, 20; + add.s32 %r3760, %r3754, %r3071; + add.s32 %r3761, %r3760, %r3759; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 24; + add.s32 %r3764, %r3763, %r3757; + xor.b32 %r3765, %r3764, %r3759; + shf.l.wrap.b32 %r3766, %r3765, %r3765, 25; + add.s32 %r3767, %r3719, %r3007; + add.s32 %r3768, %r3767, %r3738; + xor.b32 %r3769, %r3768, %r3707; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 16; + add.s32 %r3771, %r3770, %r3694; + xor.b32 %r3772, %r3771, %r3738; + shf.l.wrap.b32 %r3773, %r3772, %r3772, 20; + add.s32 %r3774, %r3768, %r3039; + add.s32 %r3775, %r3774, %r3773; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 24; + add.s32 %r3778, %r3777, %r3771; + xor.b32 %r3779, %r3778, %r3773; + shf.l.wrap.b32 %r3780, %r3779, %r3779, 25; + add.s32 %r3781, %r3733, %r3023; + add.s32 %r3782, %r3781, %r3696; + xor.b32 %r3783, %r3782, %r3721; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 16; + add.s32 %r3785, %r3784, %r3708; + xor.b32 %r3786, %r3785, %r3696; + shf.l.wrap.b32 %r3787, %r3786, %r3786, 20; + add.s32 %r3788, %r3782, %r3047; + add.s32 %r3789, %r3788, %r3787; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 24; + add.s32 %r3792, %r3791, 
%r3785; + xor.b32 %r3793, %r3792, %r3787; + shf.l.wrap.b32 %r3794, %r3793, %r3793, 25; + add.s32 %r3795, %r3747, %r3079; + add.s32 %r3796, %r3795, %r3794; + xor.b32 %r3797, %r3796, %r3763; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 16; + add.s32 %r3799, %r3798, %r3778; + xor.b32 %r3800, %r3799, %r3794; + shf.l.wrap.b32 %r3801, %r3800, %r3800, 20; + add.s32 %r3802, %r3796, %r3114; + add.s32 %r3803, %r3802, %r3801; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 24; + add.s32 %r3806, %r3805, %r3799; + xor.b32 %r3807, %r3806, %r3801; + shf.l.wrap.b32 %r3808, %r3807, %r3807, 25; + add.s32 %r3809, %r3761, %r3031; + add.s32 %r3810, %r3809, %r3752; + xor.b32 %r3811, %r3810, %r3777; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 16; + add.s32 %r3813, %r3812, %r3792; + xor.b32 %r3814, %r3813, %r3752; + shf.l.wrap.b32 %r3815, %r3814, %r3814, 20; + add.s32 %r3816, %r3810, %r2991; + add.s32 %r3817, %r3816, %r3815; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 24; + add.s32 %r3820, %r3819, %r3813; + xor.b32 %r3821, %r3820, %r3815; + shf.l.wrap.b32 %r3822, %r3821, %r3821, 25; + add.s32 %r3823, %r3775, %r2999; + add.s32 %r3824, %r3823, %r3766; + xor.b32 %r3825, %r3791, %r3824; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 16; + add.s32 %r3827, %r3826, %r3750; + xor.b32 %r3828, %r3827, %r3766; + shf.l.wrap.b32 %r3829, %r3828, %r3828, 20; + add.s32 %r3830, %r3824, %r3063; + add.s32 %r3831, %r3830, %r3829; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 24; + add.s32 %r3834, %r3833, %r3827; + xor.b32 %r3835, %r3834, %r3829; + shf.l.wrap.b32 %r3836, %r3835, %r3835, 25; + add.s32 %r3837, %r3780, %r3055; + add.s32 %r3838, %r3837, %r3789; + xor.b32 %r3839, %r3838, %r3749; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 16; + add.s32 %r3841, %r3840, %r3764; + xor.b32 %r3842, %r3841, %r3780; + shf.l.wrap.b32 %r3843, %r3842, %r3842, 20; + add.s32 %r3844, %r3838, %r3039; + add.s32 %r3845, %r3844, %r3843; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 24; + add.s32 %r3848, %r3847, %r3841; + xor.b32 %r3849, %r3848, %r3843; + shf.l.wrap.b32 %r3850, %r3849, %r3849, 25; + add.s32 %r3851, %r3822, %r3103; + add.s32 %r3852, %r3851, %r3803; + xor.b32 %r3853, %r3852, %r3847; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 16; + add.s32 %r3855, %r3854, %r3834; + xor.b32 %r3856, %r3855, %r3822; + shf.l.wrap.b32 %r3857, %r3856, %r3856, 20; + add.s32 %r3858, %r3852, %r3071; + add.s32 %r3859, %r3858, %r3857; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 24; + add.s32 %r3862, %r3861, %r3855; + xor.b32 %r3863, %r3862, %r3857; + shf.l.wrap.b32 %r3864, %r3863, %r3863, 25; + add.s32 %r3865, %r3817, %r3007; + add.s32 %r3866, %r3865, %r3836; + xor.b32 %r3867, %r3805, %r3866; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 16; + add.s32 %r3869, %r3868, %r3848; + xor.b32 %r3870, %r3869, %r3836; + shf.l.wrap.b32 %r3871, %r3870, %r3870, 20; + add.s32 %r3872, %r3866, %r3087; + add.s32 %r3873, %r3872, %r3871; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 24; + add.s32 %r3876, %r3875, %r3869; + xor.b32 %r3877, %r3876, %r3871; + shf.l.wrap.b32 %r3878, %r3877, %r3877, 25; + add.s32 %r3879, %r3831, %r3015; + add.s32 %r3880, %r3879, %r3850; + xor.b32 %r3881, %r3880, %r3819; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 16; + add.s32 %r3883, %r3882, %r3806; + xor.b32 %r3884, %r3883, %r3850; + shf.l.wrap.b32 %r3885, %r3884, %r3884, 20; + add.s32 %r3886, %r3880, %r3023; + add.s32 %r3887, %r3886, %r3885; + xor.b32 %r3888, 
%r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 24; + add.s32 %r3890, %r3889, %r3883; + xor.b32 %r3891, %r3890, %r3885; + shf.l.wrap.b32 %r3892, %r3891, %r3891, 25; + add.s32 %r3893, %r3845, %r3047; + add.s32 %r3894, %r3893, %r3808; + xor.b32 %r3895, %r3894, %r3833; + shf.l.wrap.b32 %r3896, %r3895, %r3895, 16; + add.s32 %r3897, %r3896, %r3820; + xor.b32 %r3898, %r3897, %r3808; + shf.l.wrap.b32 %r3899, %r3898, %r3898, 20; + add.s32 %r3900, %r3894, %r3095; + add.s32 %r3901, %r3900, %r3899; + xor.b32 %r3902, %r3901, %r3896; + shf.l.wrap.b32 %r3903, %r3902, %r3902, 24; + add.s32 %r3904, %r3903, %r3897; + xor.b32 %r3905, %r3904, %r3899; + shf.l.wrap.b32 %r3906, %r3905, %r3905, 25; + xor.b32 %r3907, %r3890, %r3859; + xor.b32 %r3908, %r3904, %r3873; + xor.b32 %r3909, %r3862, %r3887; + xor.b32 %r3910, %r3901, %r3876; + xor.b32 %r3911, %r3906, %r3875; + xor.b32 %r3912, %r3864, %r3889; + xor.b32 %r3913, %r3903, %r3878; + xor.b32 %r3914, %r3892, %r3861; + st.local.u8 [%rd134], %r3907; + shr.u32 %r3915, %r3907, 8; + st.local.u8 [%rd134+1], %r3915; + shr.u32 %r3916, %r3907, 16; + st.local.u8 [%rd134+2], %r3916; + shr.u32 %r3917, %r3907, 24; + st.local.u8 [%rd134+3], %r3917; + st.local.u8 [%rd134+4], %r3908; + shr.u32 %r3918, %r3908, 8; + st.local.u8 [%rd134+5], %r3918; + shr.u32 %r3919, %r3908, 16; + st.local.u8 [%rd134+6], %r3919; + shr.u32 %r3920, %r3908, 24; + st.local.u8 [%rd134+7], %r3920; + st.local.u8 [%rd134+8], %r3909; + shr.u32 %r3921, %r3909, 8; + st.local.u8 [%rd134+9], %r3921; + shr.u32 %r3922, %r3909, 16; + st.local.u8 [%rd134+10], %r3922; + shr.u32 %r3923, %r3909, 24; + st.local.u8 [%rd134+11], %r3923; + st.local.u8 [%rd134+12], %r3910; + shr.u32 %r3924, %r3910, 8; + st.local.u8 [%rd134+13], %r3924; + shr.u32 %r3925, %r3910, 16; + st.local.u8 [%rd134+14], %r3925; + shr.u32 %r3926, %r3910, 24; + st.local.u8 [%rd134+15], %r3926; + st.local.u8 [%rd134+16], %r3911; + shr.u32 %r3927, %r3911, 8; + st.local.u8 [%rd134+17], %r3927; + shr.u32 %r3928, %r3911, 16; + st.local.u8 [%rd134+18], %r3928; + shr.u32 %r3929, %r3911, 24; + st.local.u8 [%rd134+19], %r3929; + st.local.u8 [%rd134+20], %r3912; + shr.u32 %r3930, %r3912, 8; + st.local.u8 [%rd134+21], %r3930; + shr.u32 %r3931, %r3912, 16; + st.local.u8 [%rd134+22], %r3931; + shr.u32 %r3932, %r3912, 24; + st.local.u8 [%rd134+23], %r3932; + st.local.u8 [%rd134+24], %r3913; + shr.u32 %r3933, %r3913, 8; + st.local.u8 [%rd134+25], %r3933; + shr.u32 %r3934, %r3913, 16; + st.local.u8 [%rd134+26], %r3934; + shr.u32 %r3935, %r3913, 24; + st.local.u8 [%rd134+27], %r3935; + st.local.u8 [%rd134+28], %r3914; + shr.u32 %r3936, %r3914, 8; + st.local.u8 [%rd134+29], %r3936; + shr.u32 %r3937, %r3914, 16; + st.local.u8 [%rd134+30], %r3937; + shr.u32 %r3938, %r3914, 24; + st.local.u8 [%rd134+31], %r3938; + add.s64 %rd151, %rd151, 1; + bra.uni $L__BB0_30; + +$L__BB0_1: + add.s64 %rd76, %rd171, -1; + shr.u64 %rd77, %rd76, 10; + or.b64 %rd78, %rd77, 1; + setp.gt.u64 %p2, %rd78, 4294967295; + shr.u64 %rd79, %rd76, 42; + selp.b64 %rd80, %rd79, %rd78, %p2; + selp.b32 %r62, 32, 0, %p2; + and.b64 %rd81, %rd80, 4294901760; + setp.ne.s64 %p3, %rd81, 0; + shr.u64 %rd82, %rd80, 16; + or.b32 %r63, %r62, 16; + selp.b64 %rd83, %rd82, %rd80, %p3; + selp.b32 %r64, %r63, %r62, %p3; + and.b64 %rd84, %rd83, 65280; + setp.ne.s64 %p4, %rd84, 0; + shr.u64 %rd85, %rd83, 8; + or.b32 %r65, %r64, 8; + selp.b64 %rd86, %rd85, %rd83, %p4; + selp.b32 %r66, %r65, %r64, %p4; + and.b64 %rd87, %rd86, 240; + setp.ne.s64 %p5, %rd87, 0; + shr.u64 %rd88, %rd86, 4; + or.b32 %r67, %r66, 4; + 
selp.b64 %rd89, %rd88, %rd86, %p5; + selp.b32 %r68, %r67, %r66, %p5; + and.b64 %rd90, %rd89, 12; + setp.ne.s64 %p6, %rd90, 0; + shr.u64 %rd91, %rd89, 2; + add.s32 %r69, %r68, 2; + selp.b64 %rd92, %rd91, %rd89, %p6; + selp.b32 %r70, %r69, %r68, %p6; + and.b64 %rd93, %rd92, 2; + shr.u64 %rd94, %rd93, 1; + cvt.u32.u64 %r71, %rd94; + add.s32 %r72, %r70, %r71; + mov.u64 %rd95, 1024; + shl.b64 %rd96, %rd95, %r72; + sub.s64 %rd97, %rd171, %rd96; + add.s64 %rd98, %rd69, %rd96; + shr.u64 %rd99, %rd96, 10; + add.s64 %rd100, %rd99, %rd165; + setp.gt.u64 %p7, %rd96, 1024; + selp.b64 %rd101, 64, 32, %p7; + add.s64 %rd103, %rd149, %rd101; + cvt.u32.u16 %r73, %rs75; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd69; + .param .b64 param1; + st.param.b64 [param1+0], %rd96; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd165; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd149; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd5, [retval0+0]; + } // callseq 0 + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd98; + .param .b64 param1; + st.param.b64 [param1+0], %rd97; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd100; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd103; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd6, [retval0+0]; + } // callseq 1 + setp.eq.s64 %p8, %rd5, 1; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_2; + +$L__BB0_12: + mov.u64 %rd158, 0; + +$L__BB0_13: + add.s64 %rd117, %rd4, %rd158; + ld.local.u8 %rs78, [%rd117]; + add.s64 %rd118, %rd155, %rd158; + st.local.u8 [%rd118], %rs78; + add.s64 %rd158, %rd158, 1; + setp.lt.u64 %p15, %rd158, 64; + mov.u64 %rd151, 2; + @%p15 bra $L__BB0_13; + bra.uni $L__BB0_30; + +$L__BB0_2: + add.s64 %rd7, %rd6, %rd5; + setp.lt.u64 %p9, %rd7, 2; + mov.u64 %rd151, 0; + mov.u64 %rd152, %rd151; + @%p9 bra $L__BB0_5; + + mov.u64 %rd146, %rd153; + mov.u64 %rd147, %rd7; + +$L__BB0_4: + st.local.u64 [%rd146], %rd149; + add.s64 %rd151, %rd151, 1; + add.s64 %rd149, %rd149, 64; + add.s64 %rd152, %rd152, 2; + add.s64 %rd146, %rd146, 8; + add.s64 %rd147, %rd147, -2; + setp.gt.u64 %p10, %rd147, 1; + @%p10 bra $L__BB0_4; + +$L__BB0_5: + setp.eq.s64 %p11, %rd151, 0; + @%p11 bra $L__BB0_8; + + or.b16 %rs76, %rs75, 4; + cvt.u32.u16 %r1, %rs76; + mov.u64 %rd154, %rd151; + +$L__BB0_7: + ld.local.u64 %rd109, [%rd153]; + ld.u8 %r74, [%rd109]; + ld.u8 %r75, [%rd109+1]; + prmt.b32 %r76, %r75, %r74, 30212; + ld.u8 %r77, [%rd109+2]; + prmt.b32 %r78, %r77, %r76, 28756; + ld.u8 %r79, [%rd109+3]; + prmt.b32 %r80, %r79, %r78, 1620; + ld.u8 %r81, [%rd109+4]; + ld.u8 %r82, [%rd109+5]; + prmt.b32 %r83, %r82, %r81, 30212; + ld.u8 %r84, [%rd109+6]; + prmt.b32 %r85, %r84, %r83, 28756; + ld.u8 %r86, [%rd109+7]; + prmt.b32 %r87, %r86, %r85, 1620; + ld.u8 %r88, [%rd109+8]; + ld.u8 %r89, [%rd109+9]; + prmt.b32 %r90, %r89, %r88, 30212; + ld.u8 %r91, [%rd109+10]; + prmt.b32 %r92, %r91, %r90, 28756; + ld.u8 %r93, 
[%rd109+11]; + prmt.b32 %r94, %r93, %r92, 1620; + ld.u8 %r95, [%rd109+12]; + ld.u8 %r96, [%rd109+13]; + prmt.b32 %r97, %r96, %r95, 30212; + ld.u8 %r98, [%rd109+14]; + prmt.b32 %r99, %r98, %r97, 28756; + ld.u8 %r100, [%rd109+15]; + prmt.b32 %r101, %r100, %r99, 1620; + ld.u8 %r102, [%rd109+16]; + ld.u8 %r103, [%rd109+17]; + prmt.b32 %r104, %r103, %r102, 30212; + ld.u8 %r105, [%rd109+18]; + prmt.b32 %r106, %r105, %r104, 28756; + ld.u8 %r107, [%rd109+19]; + prmt.b32 %r108, %r107, %r106, 1620; + ld.u8 %r109, [%rd109+20]; + ld.u8 %r110, [%rd109+21]; + prmt.b32 %r111, %r110, %r109, 30212; + ld.u8 %r112, [%rd109+22]; + prmt.b32 %r113, %r112, %r111, 28756; + ld.u8 %r114, [%rd109+23]; + prmt.b32 %r115, %r114, %r113, 1620; + ld.u8 %r116, [%rd109+24]; + ld.u8 %r117, [%rd109+25]; + prmt.b32 %r118, %r117, %r116, 30212; + ld.u8 %r119, [%rd109+26]; + prmt.b32 %r120, %r119, %r118, 28756; + ld.u8 %r121, [%rd109+27]; + prmt.b32 %r122, %r121, %r120, 1620; + ld.u8 %r123, [%rd109+28]; + ld.u8 %r124, [%rd109+29]; + prmt.b32 %r125, %r124, %r123, 30212; + ld.u8 %r126, [%rd109+30]; + prmt.b32 %r127, %r126, %r125, 28756; + ld.u8 %r128, [%rd109+31]; + prmt.b32 %r129, %r128, %r127, 1620; + ld.u8 %r130, [%rd109+32]; + ld.u8 %r131, [%rd109+33]; + prmt.b32 %r132, %r131, %r130, 30212; + ld.u8 %r133, [%rd109+34]; + prmt.b32 %r134, %r133, %r132, 28756; + ld.u8 %r135, [%rd109+35]; + prmt.b32 %r136, %r135, %r134, 1620; + ld.u8 %r137, [%rd109+36]; + ld.u8 %r138, [%rd109+37]; + prmt.b32 %r139, %r138, %r137, 30212; + ld.u8 %r140, [%rd109+38]; + prmt.b32 %r141, %r140, %r139, 28756; + ld.u8 %r142, [%rd109+39]; + prmt.b32 %r143, %r142, %r141, 1620; + ld.u8 %r144, [%rd109+40]; + ld.u8 %r145, [%rd109+41]; + prmt.b32 %r146, %r145, %r144, 30212; + ld.u8 %r147, [%rd109+42]; + prmt.b32 %r148, %r147, %r146, 28756; + ld.u8 %r149, [%rd109+43]; + prmt.b32 %r150, %r149, %r148, 1620; + ld.u8 %r151, [%rd109+44]; + ld.u8 %r152, [%rd109+45]; + prmt.b32 %r153, %r152, %r151, 30212; + ld.u8 %r154, [%rd109+46]; + prmt.b32 %r155, %r154, %r153, 28756; + ld.u8 %r156, [%rd109+47]; + prmt.b32 %r157, %r156, %r155, 1620; + ld.u8 %r158, [%rd109+48]; + ld.u8 %r159, [%rd109+49]; + prmt.b32 %r160, %r159, %r158, 30212; + ld.u8 %r161, [%rd109+50]; + prmt.b32 %r162, %r161, %r160, 28756; + ld.u8 %r163, [%rd109+51]; + prmt.b32 %r164, %r163, %r162, 1620; + ld.u8 %r165, [%rd109+52]; + ld.u8 %r166, [%rd109+53]; + prmt.b32 %r167, %r166, %r165, 30212; + ld.u8 %r168, [%rd109+54]; + prmt.b32 %r169, %r168, %r167, 28756; + ld.u8 %r170, [%rd109+55]; + prmt.b32 %r171, %r170, %r169, 1620; + ld.u8 %r172, [%rd109+56]; + ld.u8 %r173, [%rd109+57]; + prmt.b32 %r174, %r173, %r172, 30212; + ld.u8 %r175, [%rd109+58]; + prmt.b32 %r176, %r175, %r174, 28756; + ld.u8 %r177, [%rd109+59]; + prmt.b32 %r178, %r177, %r176, 1620; + ld.u8 %r179, [%rd109+60]; + ld.u8 %r180, [%rd109+61]; + prmt.b32 %r181, %r180, %r179, 30212; + ld.u8 %r182, [%rd109+62]; + prmt.b32 %r183, %r182, %r181, 28756; + ld.u8 %r184, [%rd109+63]; + prmt.b32 %r185, %r184, %r183, 1620; + ld.local.u8 %r186, [%rd2+16]; + ld.local.u8 %r187, [%rd2+17]; + prmt.b32 %r188, %r187, %r186, 30212; + ld.local.u8 %r189, [%rd2+18]; + ld.local.u8 %r190, [%rd2+19]; + prmt.b32 %r191, %r190, %r189, 30212; + prmt.b32 %r192, %r191, %r188, 4180; + ld.local.u8 %r193, [%rd2]; + ld.local.u8 %r194, [%rd2+1]; + prmt.b32 %r195, %r194, %r193, 30212; + ld.local.u8 %r196, [%rd2+2]; + ld.local.u8 %r197, [%rd2+3]; + prmt.b32 %r198, %r197, %r196, 30212; + prmt.b32 %r199, %r198, %r195, 4180; + add.s32 %r200, %r192, %r199; + add.s32 %r201, %r200, %r80; + 
shf.l.wrap.b32 %r202, %r201, %r201, 16; + add.s32 %r203, %r202, 1779033703; + xor.b32 %r204, %r203, %r192; + shf.l.wrap.b32 %r205, %r204, %r204, 20; + add.s32 %r206, %r87, %r201; + add.s32 %r207, %r206, %r205; + xor.b32 %r208, %r207, %r202; + shf.l.wrap.b32 %r209, %r208, %r208, 24; + add.s32 %r210, %r209, %r203; + xor.b32 %r211, %r210, %r205; + shf.l.wrap.b32 %r212, %r211, %r211, 25; + ld.local.u8 %r213, [%rd2+20]; + ld.local.u8 %r214, [%rd2+21]; + prmt.b32 %r215, %r214, %r213, 30212; + ld.local.u8 %r216, [%rd2+22]; + ld.local.u8 %r217, [%rd2+23]; + prmt.b32 %r218, %r217, %r216, 30212; + prmt.b32 %r219, %r218, %r215, 4180; + ld.local.u8 %r220, [%rd2+4]; + ld.local.u8 %r221, [%rd2+5]; + prmt.b32 %r222, %r221, %r220, 30212; + ld.local.u8 %r223, [%rd2+6]; + ld.local.u8 %r224, [%rd2+7]; + prmt.b32 %r225, %r224, %r223, 30212; + prmt.b32 %r226, %r225, %r222, 4180; + add.s32 %r227, %r219, %r226; + add.s32 %r228, %r227, %r94; + shf.l.wrap.b32 %r229, %r228, %r228, 16; + add.s32 %r230, %r229, -1150833019; + xor.b32 %r231, %r230, %r219; + shf.l.wrap.b32 %r232, %r231, %r231, 20; + add.s32 %r233, %r101, %r228; + add.s32 %r234, %r233, %r232; + xor.b32 %r235, %r234, %r229; + shf.l.wrap.b32 %r236, %r235, %r235, 24; + add.s32 %r237, %r236, %r230; + xor.b32 %r238, %r237, %r232; + shf.l.wrap.b32 %r239, %r238, %r238, 25; + ld.local.u8 %r240, [%rd2+24]; + ld.local.u8 %r241, [%rd2+25]; + prmt.b32 %r242, %r241, %r240, 30212; + ld.local.u8 %r243, [%rd2+26]; + ld.local.u8 %r244, [%rd2+27]; + prmt.b32 %r245, %r244, %r243, 30212; + prmt.b32 %r246, %r245, %r242, 4180; + ld.local.u8 %r247, [%rd2+8]; + ld.local.u8 %r248, [%rd2+9]; + prmt.b32 %r249, %r248, %r247, 30212; + ld.local.u8 %r250, [%rd2+10]; + ld.local.u8 %r251, [%rd2+11]; + prmt.b32 %r252, %r251, %r250, 30212; + prmt.b32 %r253, %r252, %r249, 4180; + add.s32 %r254, %r246, %r253; + add.s32 %r255, %r254, %r108; + shr.u32 %r256, %r255, 16; + shl.b32 %r257, %r255, 16; + xor.b32 %r258, %r257, 4194304; + or.b32 %r259, %r258, %r256; + add.s32 %r260, %r259, 1013904242; + xor.b32 %r261, %r260, %r246; + shf.l.wrap.b32 %r262, %r261, %r261, 20; + add.s32 %r263, %r115, %r255; + add.s32 %r264, %r263, %r262; + xor.b32 %r265, %r264, %r259; + shf.l.wrap.b32 %r266, %r265, %r265, 24; + add.s32 %r267, %r266, %r260; + xor.b32 %r268, %r267, %r262; + shf.l.wrap.b32 %r269, %r268, %r268, 25; + ld.local.u8 %r270, [%rd2+28]; + ld.local.u8 %r271, [%rd2+29]; + prmt.b32 %r272, %r271, %r270, 30212; + ld.local.u8 %r273, [%rd2+30]; + ld.local.u8 %r274, [%rd2+31]; + prmt.b32 %r275, %r274, %r273, 30212; + prmt.b32 %r276, %r275, %r272, 4180; + ld.local.u8 %r277, [%rd2+12]; + ld.local.u8 %r278, [%rd2+13]; + prmt.b32 %r279, %r278, %r277, 30212; + ld.local.u8 %r280, [%rd2+14]; + ld.local.u8 %r281, [%rd2+15]; + prmt.b32 %r282, %r281, %r280, 30212; + prmt.b32 %r283, %r282, %r279, 4180; + add.s32 %r284, %r276, %r283; + add.s32 %r285, %r284, %r122; + xor.b32 %r286, %r285, %r1; + shr.u32 %r287, %r285, 16; + shl.b32 %r288, %r286, 16; + or.b32 %r289, %r288, %r287; + add.s32 %r290, %r289, -1521486534; + xor.b32 %r291, %r290, %r276; + shf.l.wrap.b32 %r292, %r291, %r291, 20; + add.s32 %r293, %r129, %r285; + add.s32 %r294, %r293, %r292; + xor.b32 %r295, %r294, %r289; + shf.l.wrap.b32 %r296, %r295, %r295, 24; + add.s32 %r297, %r296, %r290; + xor.b32 %r298, %r297, %r292; + shf.l.wrap.b32 %r299, %r298, %r298, 25; + add.s32 %r300, %r239, %r207; + add.s32 %r301, %r300, %r136; + xor.b32 %r302, %r296, %r301; + shf.l.wrap.b32 %r303, %r302, %r302, 16; + add.s32 %r304, %r303, %r267; + xor.b32 %r305, %r304, %r239; + 
shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r143, %r301; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + add.s32 %r314, %r269, %r234; + add.s32 %r315, %r314, %r150; + xor.b32 %r316, %r315, %r209; + shf.l.wrap.b32 %r317, %r316, %r316, 16; + add.s32 %r318, %r317, %r297; + xor.b32 %r319, %r318, %r269; + shf.l.wrap.b32 %r320, %r319, %r319, 20; + add.s32 %r321, %r157, %r315; + add.s32 %r322, %r321, %r320; + xor.b32 %r323, %r322, %r317; + shf.l.wrap.b32 %r324, %r323, %r323, 24; + add.s32 %r325, %r324, %r318; + xor.b32 %r326, %r325, %r320; + shf.l.wrap.b32 %r327, %r326, %r326, 25; + add.s32 %r328, %r299, %r264; + add.s32 %r329, %r328, %r164; + xor.b32 %r330, %r329, %r236; + shf.l.wrap.b32 %r331, %r330, %r330, 16; + add.s32 %r332, %r331, %r210; + xor.b32 %r333, %r332, %r299; + shf.l.wrap.b32 %r334, %r333, %r333, 20; + add.s32 %r335, %r171, %r329; + add.s32 %r336, %r335, %r334; + xor.b32 %r337, %r336, %r331; + shf.l.wrap.b32 %r338, %r337, %r337, 24; + add.s32 %r339, %r338, %r332; + xor.b32 %r340, %r339, %r334; + shf.l.wrap.b32 %r341, %r340, %r340, 25; + add.s32 %r342, %r294, %r212; + add.s32 %r343, %r342, %r178; + xor.b32 %r344, %r343, %r266; + shf.l.wrap.b32 %r345, %r344, %r344, 16; + add.s32 %r346, %r345, %r237; + xor.b32 %r347, %r346, %r212; + shf.l.wrap.b32 %r348, %r347, %r347, 20; + add.s32 %r349, %r185, %r343; + add.s32 %r350, %r349, %r348; + xor.b32 %r351, %r350, %r345; + shf.l.wrap.b32 %r352, %r351, %r351, 24; + add.s32 %r353, %r352, %r346; + xor.b32 %r354, %r353, %r348; + shf.l.wrap.b32 %r355, %r354, %r354, 25; + add.s32 %r356, %r308, %r94; + add.s32 %r357, %r356, %r355; + xor.b32 %r358, %r357, %r324; + shf.l.wrap.b32 %r359, %r358, %r358, 16; + add.s32 %r360, %r359, %r339; + xor.b32 %r361, %r360, %r355; + shf.l.wrap.b32 %r362, %r361, %r361, 20; + add.s32 %r363, %r357, %r122; + add.s32 %r364, %r363, %r362; + xor.b32 %r365, %r364, %r359; + shf.l.wrap.b32 %r366, %r365, %r365, 24; + add.s32 %r367, %r366, %r360; + xor.b32 %r368, %r367, %r362; + shf.l.wrap.b32 %r369, %r368, %r368, 25; + add.s32 %r370, %r322, %r101; + add.s32 %r371, %r370, %r313; + xor.b32 %r372, %r338, %r371; + shf.l.wrap.b32 %r373, %r372, %r372, 16; + add.s32 %r374, %r353, %r373; + xor.b32 %r375, %r374, %r313; + shf.l.wrap.b32 %r376, %r375, %r375, 20; + add.s32 %r377, %r371, %r150; + add.s32 %r378, %r377, %r376; + xor.b32 %r379, %r378, %r373; + shf.l.wrap.b32 %r380, %r379, %r379, 24; + add.s32 %r381, %r380, %r374; + xor.b32 %r382, %r381, %r376; + shf.l.wrap.b32 %r383, %r382, %r382, 25; + add.s32 %r384, %r327, %r129; + add.s32 %r385, %r384, %r336; + xor.b32 %r386, %r352, %r385; + shf.l.wrap.b32 %r387, %r386, %r386, 16; + add.s32 %r388, %r387, %r311; + xor.b32 %r389, %r388, %r327; + shf.l.wrap.b32 %r390, %r389, %r389, 20; + add.s32 %r391, %r385, %r80; + add.s32 %r392, %r391, %r390; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 24; + add.s32 %r395, %r394, %r388; + xor.b32 %r396, %r395, %r390; + shf.l.wrap.b32 %r397, %r396, %r396, 25; + add.s32 %r398, %r341, %r108; + add.s32 %r399, %r398, %r350; + xor.b32 %r400, %r399, %r310; + shf.l.wrap.b32 %r401, %r400, %r400, 16; + add.s32 %r402, %r401, %r325; + xor.b32 %r403, %r402, %r341; + shf.l.wrap.b32 %r404, %r403, %r403, 20; + add.s32 %r405, %r399, %r171; + add.s32 %r406, %r405, %r404; + xor.b32 %r407, %r406, %r401; + shf.l.wrap.b32 %r408, %r407, %r407, 24; + add.s32 %r409, %r408, %r402; + 
xor.b32 %r410, %r409, %r404; + shf.l.wrap.b32 %r411, %r410, %r410, 25; + add.s32 %r412, %r383, %r87; + add.s32 %r413, %r412, %r364; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 16; + add.s32 %r416, %r415, %r395; + xor.b32 %r417, %r416, %r383; + shf.l.wrap.b32 %r418, %r417, %r417, 20; + add.s32 %r419, %r413, %r157; + add.s32 %r420, %r419, %r418; + xor.b32 %r421, %r420, %r415; + shf.l.wrap.b32 %r422, %r421, %r421, 24; + add.s32 %r423, %r422, %r416; + xor.b32 %r424, %r423, %r418; + shf.l.wrap.b32 %r425, %r424, %r424, 25; + add.s32 %r426, %r378, %r164; + add.s32 %r427, %r426, %r397; + xor.b32 %r428, %r366, %r427; + shf.l.wrap.b32 %r429, %r428, %r428, 16; + add.s32 %r430, %r429, %r409; + xor.b32 %r431, %r430, %r397; + shf.l.wrap.b32 %r432, %r431, %r431, 20; + add.s32 %r433, %r427, %r115; + add.s32 %r434, %r433, %r432; + xor.b32 %r435, %r434, %r429; + shf.l.wrap.b32 %r436, %r435, %r435, 24; + add.s32 %r437, %r436, %r430; + xor.b32 %r438, %r437, %r432; + shf.l.wrap.b32 %r439, %r438, %r438, 25; + add.s32 %r440, %r392, %r143; + add.s32 %r441, %r440, %r411; + xor.b32 %r442, %r441, %r380; + shf.l.wrap.b32 %r443, %r442, %r442, 16; + add.s32 %r444, %r443, %r367; + xor.b32 %r445, %r444, %r411; + shf.l.wrap.b32 %r446, %r445, %r445, 20; + add.s32 %r447, %r441, %r178; + add.s32 %r448, %r447, %r446; + xor.b32 %r449, %r448, %r443; + shf.l.wrap.b32 %r450, %r449, %r449, 24; + add.s32 %r451, %r450, %r444; + xor.b32 %r452, %r451, %r446; + shf.l.wrap.b32 %r453, %r452, %r452, 25; + add.s32 %r454, %r406, %r185; + add.s32 %r455, %r454, %r369; + xor.b32 %r456, %r455, %r394; + shf.l.wrap.b32 %r457, %r456, %r456, 16; + add.s32 %r458, %r457, %r381; + xor.b32 %r459, %r458, %r369; + shf.l.wrap.b32 %r460, %r459, %r459, 20; + add.s32 %r461, %r455, %r136; + add.s32 %r462, %r461, %r460; + xor.b32 %r463, %r462, %r457; + shf.l.wrap.b32 %r464, %r463, %r463, 24; + add.s32 %r465, %r464, %r458; + xor.b32 %r466, %r465, %r460; + shf.l.wrap.b32 %r467, %r466, %r466, 25; + add.s32 %r468, %r420, %r101; + add.s32 %r469, %r468, %r467; + xor.b32 %r470, %r469, %r436; + shf.l.wrap.b32 %r471, %r470, %r470, 16; + add.s32 %r472, %r471, %r451; + xor.b32 %r473, %r472, %r467; + shf.l.wrap.b32 %r474, %r473, %r473, 20; + add.s32 %r475, %r469, %r108; + add.s32 %r476, %r475, %r474; + xor.b32 %r477, %r476, %r471; + shf.l.wrap.b32 %r478, %r477, %r477, 24; + add.s32 %r479, %r478, %r472; + xor.b32 %r480, %r479, %r474; + shf.l.wrap.b32 %r481, %r480, %r480, 25; + add.s32 %r482, %r434, %r150; + add.s32 %r483, %r482, %r425; + xor.b32 %r484, %r483, %r450; + shf.l.wrap.b32 %r485, %r484, %r484, 16; + add.s32 %r486, %r485, %r465; + xor.b32 %r487, %r486, %r425; + shf.l.wrap.b32 %r488, %r487, %r487, 20; + add.s32 %r489, %r483, %r164; + add.s32 %r490, %r489, %r488; + xor.b32 %r491, %r490, %r485; + shf.l.wrap.b32 %r492, %r491, %r491, 24; + add.s32 %r493, %r492, %r486; + xor.b32 %r494, %r493, %r488; + shf.l.wrap.b32 %r495, %r494, %r494, 25; + add.s32 %r496, %r448, %r171; + add.s32 %r497, %r496, %r439; + xor.b32 %r498, %r464, %r497; + shf.l.wrap.b32 %r499, %r498, %r498, 16; + add.s32 %r500, %r499, %r423; + xor.b32 %r501, %r500, %r439; + shf.l.wrap.b32 %r502, %r501, %r501, 20; + add.s32 %r503, %r497, %r94; + add.s32 %r504, %r503, %r502; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 24; + add.s32 %r507, %r506, %r500; + xor.b32 %r508, %r507, %r502; + shf.l.wrap.b32 %r509, %r508, %r508, 25; + add.s32 %r510, %r453, %r129; + add.s32 %r511, %r510, %r462; + xor.b32 %r512, %r511, %r422; + shf.l.wrap.b32 %r513, %r512, %r512, 16; + 
add.s32 %r514, %r513, %r437; + xor.b32 %r515, %r514, %r453; + shf.l.wrap.b32 %r516, %r515, %r515, 20; + add.s32 %r517, %r511, %r178; + add.s32 %r518, %r517, %r516; + xor.b32 %r519, %r518, %r513; + shf.l.wrap.b32 %r520, %r519, %r519, 24; + add.s32 %r521, %r520, %r514; + xor.b32 %r522, %r521, %r516; + shf.l.wrap.b32 %r523, %r522, %r522, 25; + add.s32 %r524, %r495, %r122; + add.s32 %r525, %r524, %r476; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 16; + add.s32 %r528, %r527, %r507; + xor.b32 %r529, %r528, %r495; + shf.l.wrap.b32 %r530, %r529, %r529, 20; + add.s32 %r531, %r525, %r115; + add.s32 %r532, %r531, %r530; + xor.b32 %r533, %r532, %r527; + shf.l.wrap.b32 %r534, %r533, %r533, 24; + add.s32 %r535, %r534, %r528; + xor.b32 %r536, %r535, %r530; + shf.l.wrap.b32 %r537, %r536, %r536, 25; + add.s32 %r538, %r490, %r143; + add.s32 %r539, %r538, %r509; + xor.b32 %r540, %r478, %r539; + shf.l.wrap.b32 %r541, %r540, %r540, 16; + add.s32 %r542, %r541, %r521; + xor.b32 %r543, %r542, %r509; + shf.l.wrap.b32 %r544, %r543, %r543, 20; + add.s32 %r545, %r539, %r80; + add.s32 %r546, %r545, %r544; + xor.b32 %r547, %r546, %r541; + shf.l.wrap.b32 %r548, %r547, %r547, 24; + add.s32 %r549, %r548, %r542; + xor.b32 %r550, %r549, %r544; + shf.l.wrap.b32 %r551, %r550, %r550, 25; + add.s32 %r552, %r504, %r157; + add.s32 %r553, %r552, %r523; + xor.b32 %r554, %r553, %r492; + shf.l.wrap.b32 %r555, %r554, %r554, 16; + add.s32 %r556, %r555, %r479; + xor.b32 %r557, %r556, %r523; + shf.l.wrap.b32 %r558, %r557, %r557, 20; + add.s32 %r559, %r553, %r185; + add.s32 %r560, %r559, %r558; + xor.b32 %r561, %r560, %r555; + shf.l.wrap.b32 %r562, %r561, %r561, 24; + add.s32 %r563, %r562, %r556; + xor.b32 %r564, %r563, %r558; + shf.l.wrap.b32 %r565, %r564, %r564, 25; + add.s32 %r566, %r518, %r136; + add.s32 %r567, %r566, %r481; + xor.b32 %r568, %r567, %r506; + shf.l.wrap.b32 %r569, %r568, %r568, 16; + add.s32 %r570, %r569, %r493; + xor.b32 %r571, %r570, %r481; + shf.l.wrap.b32 %r572, %r571, %r571, 20; + add.s32 %r573, %r567, %r87; + add.s32 %r574, %r573, %r572; + xor.b32 %r575, %r574, %r569; + shf.l.wrap.b32 %r576, %r575, %r575, 24; + add.s32 %r577, %r576, %r570; + xor.b32 %r578, %r577, %r572; + shf.l.wrap.b32 %r579, %r578, %r578, 25; + add.s32 %r580, %r532, %r150; + add.s32 %r581, %r580, %r579; + xor.b32 %r582, %r581, %r548; + shf.l.wrap.b32 %r583, %r582, %r582, 16; + add.s32 %r584, %r583, %r563; + xor.b32 %r585, %r584, %r579; + shf.l.wrap.b32 %r586, %r585, %r585, 20; + add.s32 %r587, %r581, %r129; + add.s32 %r588, %r587, %r586; + xor.b32 %r589, %r588, %r583; + shf.l.wrap.b32 %r590, %r589, %r589, 24; + add.s32 %r591, %r590, %r584; + xor.b32 %r592, %r591, %r586; + shf.l.wrap.b32 %r593, %r592, %r592, 25; + add.s32 %r594, %r546, %r164; + add.s32 %r595, %r594, %r537; + xor.b32 %r596, %r595, %r562; + shf.l.wrap.b32 %r597, %r596, %r596, 16; + add.s32 %r598, %r597, %r577; + xor.b32 %r599, %r598, %r537; + shf.l.wrap.b32 %r600, %r599, %r599, 20; + add.s32 %r601, %r595, %r143; + add.s32 %r602, %r601, %r600; + xor.b32 %r603, %r602, %r597; + shf.l.wrap.b32 %r604, %r603, %r603, 24; + add.s32 %r605, %r604, %r598; + xor.b32 %r606, %r605, %r600; + shf.l.wrap.b32 %r607, %r606, %r606, 25; + add.s32 %r608, %r560, %r178; + add.s32 %r609, %r608, %r551; + xor.b32 %r610, %r576, %r609; + shf.l.wrap.b32 %r611, %r610, %r610, 16; + add.s32 %r612, %r611, %r535; + xor.b32 %r613, %r612, %r551; + shf.l.wrap.b32 %r614, %r613, %r613, 20; + add.s32 %r615, %r609, %r101; + add.s32 %r616, %r615, %r614; + xor.b32 %r617, %r616, %r611; + 
shf.l.wrap.b32 %r618, %r617, %r617, 24; + add.s32 %r619, %r618, %r612; + xor.b32 %r620, %r619, %r614; + shf.l.wrap.b32 %r621, %r620, %r620, 25; + add.s32 %r622, %r565, %r171; + add.s32 %r623, %r622, %r574; + xor.b32 %r624, %r623, %r534; + shf.l.wrap.b32 %r625, %r624, %r624, 16; + add.s32 %r626, %r625, %r549; + xor.b32 %r627, %r626, %r565; + shf.l.wrap.b32 %r628, %r627, %r627, 20; + add.s32 %r629, %r623, %r185; + add.s32 %r630, %r629, %r628; + xor.b32 %r631, %r630, %r625; + shf.l.wrap.b32 %r632, %r631, %r631, 24; + add.s32 %r633, %r632, %r626; + xor.b32 %r634, %r633, %r628; + shf.l.wrap.b32 %r635, %r634, %r634, 25; + add.s32 %r636, %r607, %r108; + add.s32 %r637, %r636, %r588; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 16; + add.s32 %r640, %r639, %r619; + xor.b32 %r641, %r640, %r607; + shf.l.wrap.b32 %r642, %r641, %r641, 20; + add.s32 %r643, %r637, %r80; + add.s32 %r644, %r643, %r642; + xor.b32 %r645, %r644, %r639; + shf.l.wrap.b32 %r646, %r645, %r645, 24; + add.s32 %r647, %r646, %r640; + xor.b32 %r648, %r647, %r642; + shf.l.wrap.b32 %r649, %r648, %r648, 25; + add.s32 %r650, %r602, %r157; + add.s32 %r651, %r650, %r621; + xor.b32 %r652, %r590, %r651; + shf.l.wrap.b32 %r653, %r652, %r652, 16; + add.s32 %r654, %r653, %r633; + xor.b32 %r655, %r654, %r621; + shf.l.wrap.b32 %r656, %r655, %r655, 20; + add.s32 %r657, %r651, %r94; + add.s32 %r658, %r657, %r656; + xor.b32 %r659, %r658, %r653; + shf.l.wrap.b32 %r660, %r659, %r659, 24; + add.s32 %r661, %r660, %r654; + xor.b32 %r662, %r661, %r656; + shf.l.wrap.b32 %r663, %r662, %r662, 25; + add.s32 %r664, %r616, %r115; + add.s32 %r665, %r664, %r635; + xor.b32 %r666, %r665, %r604; + shf.l.wrap.b32 %r667, %r666, %r666, 16; + add.s32 %r668, %r667, %r591; + xor.b32 %r669, %r668, %r635; + shf.l.wrap.b32 %r670, %r669, %r669, 20; + add.s32 %r671, %r665, %r136; + add.s32 %r672, %r671, %r670; + xor.b32 %r673, %r672, %r667; + shf.l.wrap.b32 %r674, %r673, %r673, 24; + add.s32 %r675, %r674, %r668; + xor.b32 %r676, %r675, %r670; + shf.l.wrap.b32 %r677, %r676, %r676, 25; + add.s32 %r678, %r630, %r87; + add.s32 %r679, %r678, %r593; + xor.b32 %r680, %r679, %r618; + shf.l.wrap.b32 %r681, %r680, %r680, 16; + add.s32 %r682, %r681, %r605; + xor.b32 %r683, %r682, %r593; + shf.l.wrap.b32 %r684, %r683, %r683, 20; + add.s32 %r685, %r679, %r122; + add.s32 %r686, %r685, %r684; + xor.b32 %r687, %r686, %r681; + shf.l.wrap.b32 %r688, %r687, %r687, 24; + add.s32 %r689, %r688, %r682; + xor.b32 %r690, %r689, %r684; + shf.l.wrap.b32 %r691, %r690, %r690, 25; + add.s32 %r692, %r644, %r164; + add.s32 %r693, %r692, %r691; + xor.b32 %r694, %r693, %r660; + shf.l.wrap.b32 %r695, %r694, %r694, 16; + add.s32 %r696, %r695, %r675; + xor.b32 %r697, %r696, %r691; + shf.l.wrap.b32 %r698, %r697, %r697, 20; + add.s32 %r699, %r693, %r171; + add.s32 %r700, %r699, %r698; + xor.b32 %r701, %r700, %r695; + shf.l.wrap.b32 %r702, %r701, %r701, 24; + add.s32 %r703, %r702, %r696; + xor.b32 %r704, %r703, %r698; + shf.l.wrap.b32 %r705, %r704, %r704, 25; + add.s32 %r706, %r658, %r143; + add.s32 %r707, %r706, %r649; + xor.b32 %r708, %r707, %r674; + shf.l.wrap.b32 %r709, %r708, %r708, 16; + add.s32 %r710, %r709, %r689; + xor.b32 %r711, %r710, %r649; + shf.l.wrap.b32 %r712, %r711, %r711, 20; + add.s32 %r713, %r707, %r157; + add.s32 %r714, %r713, %r712; + xor.b32 %r715, %r714, %r709; + shf.l.wrap.b32 %r716, %r715, %r715, 24; + add.s32 %r717, %r716, %r710; + xor.b32 %r718, %r717, %r712; + shf.l.wrap.b32 %r719, %r718, %r718, 25; + add.s32 %r720, %r672, %r185; + add.s32 %r721, %r720, %r663; + 
xor.b32 %r722, %r688, %r721; + shf.l.wrap.b32 %r723, %r722, %r722, 16; + add.s32 %r724, %r723, %r647; + xor.b32 %r725, %r724, %r663; + shf.l.wrap.b32 %r726, %r725, %r725, 20; + add.s32 %r727, %r721, %r150; + add.s32 %r728, %r727, %r726; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 24; + add.s32 %r731, %r730, %r724; + xor.b32 %r732, %r731, %r726; + shf.l.wrap.b32 %r733, %r732, %r732, 25; + add.s32 %r734, %r677, %r178; + add.s32 %r735, %r734, %r686; + xor.b32 %r736, %r735, %r646; + shf.l.wrap.b32 %r737, %r736, %r736, 16; + add.s32 %r738, %r737, %r661; + xor.b32 %r739, %r738, %r677; + shf.l.wrap.b32 %r740, %r739, %r739, 20; + add.s32 %r741, %r735, %r136; + add.s32 %r742, %r741, %r740; + xor.b32 %r743, %r742, %r737; + shf.l.wrap.b32 %r744, %r743, %r743, 24; + add.s32 %r745, %r744, %r738; + xor.b32 %r746, %r745, %r740; + shf.l.wrap.b32 %r747, %r746, %r746, 25; + add.s32 %r748, %r719, %r129; + add.s32 %r749, %r748, %r700; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 16; + add.s32 %r752, %r751, %r731; + xor.b32 %r753, %r752, %r719; + shf.l.wrap.b32 %r754, %r753, %r753, 20; + add.s32 %r755, %r749, %r94; + add.s32 %r756, %r755, %r754; + xor.b32 %r757, %r756, %r751; + shf.l.wrap.b32 %r758, %r757, %r757, 24; + add.s32 %r759, %r758, %r752; + xor.b32 %r760, %r759, %r754; + shf.l.wrap.b32 %r761, %r760, %r760, 25; + add.s32 %r762, %r714, %r115; + add.s32 %r763, %r762, %r733; + xor.b32 %r764, %r702, %r763; + shf.l.wrap.b32 %r765, %r764, %r764, 16; + add.s32 %r766, %r765, %r745; + xor.b32 %r767, %r766, %r733; + shf.l.wrap.b32 %r768, %r767, %r767, 20; + add.s32 %r769, %r763, %r101; + add.s32 %r770, %r769, %r768; + xor.b32 %r771, %r770, %r765; + shf.l.wrap.b32 %r772, %r771, %r771, 24; + add.s32 %r773, %r772, %r766; + xor.b32 %r774, %r773, %r768; + shf.l.wrap.b32 %r775, %r774, %r774, 25; + add.s32 %r776, %r728, %r80; + add.s32 %r777, %r776, %r747; + xor.b32 %r778, %r777, %r716; + shf.l.wrap.b32 %r779, %r778, %r778, 16; + add.s32 %r780, %r779, %r703; + xor.b32 %r781, %r780, %r747; + shf.l.wrap.b32 %r782, %r781, %r781, 20; + add.s32 %r783, %r777, %r87; + add.s32 %r784, %r783, %r782; + xor.b32 %r785, %r784, %r779; + shf.l.wrap.b32 %r786, %r785, %r785, 24; + add.s32 %r787, %r786, %r780; + xor.b32 %r788, %r787, %r782; + shf.l.wrap.b32 %r789, %r788, %r788, 25; + add.s32 %r790, %r742, %r122; + add.s32 %r791, %r790, %r705; + xor.b32 %r792, %r791, %r730; + shf.l.wrap.b32 %r793, %r792, %r792, 16; + add.s32 %r794, %r793, %r717; + xor.b32 %r795, %r794, %r705; + shf.l.wrap.b32 %r796, %r795, %r795, 20; + add.s32 %r797, %r791, %r108; + add.s32 %r798, %r797, %r796; + xor.b32 %r799, %r798, %r793; + shf.l.wrap.b32 %r800, %r799, %r799, 24; + add.s32 %r801, %r800, %r794; + xor.b32 %r802, %r801, %r796; + shf.l.wrap.b32 %r803, %r802, %r802, 25; + add.s32 %r804, %r756, %r143; + add.s32 %r805, %r804, %r803; + xor.b32 %r806, %r805, %r772; + shf.l.wrap.b32 %r807, %r806, %r806, 16; + add.s32 %r808, %r807, %r787; + xor.b32 %r809, %r808, %r803; + shf.l.wrap.b32 %r810, %r809, %r809, 20; + add.s32 %r811, %r805, %r178; + add.s32 %r812, %r811, %r810; + xor.b32 %r813, %r812, %r807; + shf.l.wrap.b32 %r814, %r813, %r813, 24; + add.s32 %r815, %r814, %r808; + xor.b32 %r816, %r815, %r810; + shf.l.wrap.b32 %r817, %r816, %r816, 25; + add.s32 %r818, %r770, %r157; + add.s32 %r819, %r818, %r761; + xor.b32 %r820, %r819, %r786; + shf.l.wrap.b32 %r821, %r820, %r820, 16; + add.s32 %r822, %r821, %r801; + xor.b32 %r823, %r822, %r761; + shf.l.wrap.b32 %r824, %r823, %r823, 20; + add.s32 %r825, %r819, %r115; + 
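+ // All seven compression rounds appear fully unrolled: there is no round loop
+ // in this PTX, each round repeating the eight G mixes (four column, four
+ // diagonal) on fresh SSA registers.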
add.s32 %r826, %r825, %r824; + xor.b32 %r827, %r826, %r821; + shf.l.wrap.b32 %r828, %r827, %r827, 24; + add.s32 %r829, %r828, %r822; + xor.b32 %r830, %r829, %r824; + shf.l.wrap.b32 %r831, %r830, %r830, 25; + add.s32 %r832, %r784, %r136; + add.s32 %r833, %r832, %r775; + xor.b32 %r834, %r800, %r833; + shf.l.wrap.b32 %r835, %r834, %r834, 16; + add.s32 %r836, %r835, %r759; + xor.b32 %r837, %r836, %r775; + shf.l.wrap.b32 %r838, %r837, %r837, 20; + add.s32 %r839, %r833, %r164; + add.s32 %r840, %r839, %r838; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 24; + add.s32 %r843, %r842, %r836; + xor.b32 %r844, %r843, %r838; + shf.l.wrap.b32 %r845, %r844, %r844, 25; + add.s32 %r846, %r789, %r185; + add.s32 %r847, %r846, %r798; + xor.b32 %r848, %r847, %r758; + shf.l.wrap.b32 %r849, %r848, %r848, 16; + add.s32 %r850, %r849, %r773; + xor.b32 %r851, %r850, %r789; + shf.l.wrap.b32 %r852, %r851, %r851, 20; + add.s32 %r853, %r847, %r87; + add.s32 %r854, %r853, %r852; + xor.b32 %r855, %r854, %r849; + shf.l.wrap.b32 %r856, %r855, %r855, 24; + add.s32 %r857, %r856, %r850; + xor.b32 %r858, %r857, %r852; + shf.l.wrap.b32 %r859, %r858, %r858, 25; + add.s32 %r860, %r831, %r171; + add.s32 %r861, %r860, %r812; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 16; + add.s32 %r864, %r863, %r843; + xor.b32 %r865, %r864, %r831; + shf.l.wrap.b32 %r866, %r865, %r865, 20; + add.s32 %r867, %r861, %r101; + add.s32 %r868, %r867, %r866; + xor.b32 %r869, %r868, %r863; + shf.l.wrap.b32 %r870, %r869, %r869, 24; + add.s32 %r871, %r870, %r864; + xor.b32 %r872, %r871, %r866; + shf.l.wrap.b32 %r873, %r872, %r872, 25; + add.s32 %r874, %r826, %r80; + add.s32 %r875, %r874, %r845; + xor.b32 %r876, %r814, %r875; + shf.l.wrap.b32 %r877, %r876, %r876, 16; + add.s32 %r878, %r877, %r857; + xor.b32 %r879, %r878, %r845; + shf.l.wrap.b32 %r880, %r879, %r879, 20; + add.s32 %r881, %r875, %r150; + add.s32 %r882, %r881, %r880; + xor.b32 %r883, %r882, %r877; + shf.l.wrap.b32 %r884, %r883, %r883, 24; + add.s32 %r885, %r884, %r878; + xor.b32 %r886, %r885, %r880; + shf.l.wrap.b32 %r887, %r886, %r886, 25; + add.s32 %r888, %r840, %r94; + add.s32 %r889, %r888, %r859; + xor.b32 %r890, %r889, %r828; + shf.l.wrap.b32 %r891, %r890, %r890, 16; + add.s32 %r892, %r891, %r815; + xor.b32 %r893, %r892, %r859; + shf.l.wrap.b32 %r894, %r893, %r893, 20; + add.s32 %r895, %r889, %r122; + add.s32 %r896, %r895, %r894; + xor.b32 %r897, %r896, %r891; + shf.l.wrap.b32 %r898, %r897, %r897, 24; + add.s32 %r899, %r898, %r892; + xor.b32 %r900, %r899, %r894; + shf.l.wrap.b32 %r901, %r900, %r900, 25; + add.s32 %r902, %r854, %r108; + add.s32 %r903, %r902, %r817; + xor.b32 %r904, %r903, %r842; + shf.l.wrap.b32 %r905, %r904, %r904, 16; + add.s32 %r906, %r905, %r829; + xor.b32 %r907, %r906, %r817; + shf.l.wrap.b32 %r908, %r907, %r907, 20; + add.s32 %r909, %r903, %r129; + add.s32 %r910, %r909, %r908; + xor.b32 %r911, %r910, %r905; + shf.l.wrap.b32 %r912, %r911, %r911, 24; + add.s32 %r913, %r912, %r906; + xor.b32 %r914, %r913, %r908; + shf.l.wrap.b32 %r915, %r914, %r914, 25; + add.s32 %r916, %r868, %r157; + add.s32 %r917, %r916, %r915; + xor.b32 %r918, %r917, %r884; + shf.l.wrap.b32 %r919, %r918, %r918, 16; + add.s32 %r920, %r919, %r899; + xor.b32 %r921, %r920, %r915; + shf.l.wrap.b32 %r922, %r921, %r921, 20; + add.s32 %r923, %r917, %r185; + add.s32 %r924, %r923, %r922; + xor.b32 %r925, %r924, %r919; + shf.l.wrap.b32 %r926, %r925, %r925, 24; + add.s32 %r927, %r926, %r920; + xor.b32 %r928, %r927, %r922; + shf.l.wrap.b32 %r929, %r928, %r928, 25; + 
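+ // The rounds below finish this block: the pairwise xors into %r1028-%r1035
+ // fold the 16-word state into the 8-word output chaining value, and the
+ // st.local.u8 run after them stores those words little-endian at [%rd155].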
add.s32 %r930, %r882, %r115; + add.s32 %r931, %r930, %r873; + xor.b32 %r932, %r931, %r898; + shf.l.wrap.b32 %r933, %r932, %r932, 16; + add.s32 %r934, %r933, %r913; + xor.b32 %r935, %r934, %r873; + shf.l.wrap.b32 %r936, %r935, %r935, 20; + add.s32 %r937, %r931, %r80; + add.s32 %r938, %r937, %r936; + xor.b32 %r939, %r938, %r933; + shf.l.wrap.b32 %r940, %r939, %r939, 24; + add.s32 %r941, %r940, %r934; + xor.b32 %r942, %r941, %r936; + shf.l.wrap.b32 %r943, %r942, %r942, 25; + add.s32 %r944, %r896, %r87; + add.s32 %r945, %r944, %r887; + xor.b32 %r946, %r912, %r945; + shf.l.wrap.b32 %r947, %r946, %r946, 16; + add.s32 %r948, %r947, %r871; + xor.b32 %r949, %r948, %r887; + shf.l.wrap.b32 %r950, %r949, %r949, 20; + add.s32 %r951, %r945, %r143; + add.s32 %r952, %r951, %r950; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 24; + add.s32 %r955, %r954, %r948; + xor.b32 %r956, %r955, %r950; + shf.l.wrap.b32 %r957, %r956, %r956, 25; + add.s32 %r958, %r901, %r136; + add.s32 %r959, %r958, %r910; + xor.b32 %r960, %r959, %r870; + shf.l.wrap.b32 %r961, %r960, %r960, 16; + add.s32 %r962, %r961, %r885; + xor.b32 %r963, %r962, %r901; + shf.l.wrap.b32 %r964, %r963, %r963, 20; + add.s32 %r965, %r959, %r122; + add.s32 %r966, %r965, %r964; + xor.b32 %r967, %r966, %r961; + shf.l.wrap.b32 %r968, %r967, %r967, 24; + add.s32 %r969, %r968, %r962; + xor.b32 %r970, %r969, %r964; + shf.l.wrap.b32 %r971, %r970, %r970, 25; + add.s32 %r972, %r943, %r178; + add.s32 %r973, %r972, %r924; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 16; + add.s32 %r976, %r975, %r955; + xor.b32 %r977, %r976, %r943; + shf.l.wrap.b32 %r978, %r977, %r977, 20; + add.s32 %r979, %r973, %r150; + add.s32 %r980, %r979, %r978; + xor.b32 %r981, %r980, %r975; + shf.l.wrap.b32 %r982, %r981, %r981, 24; + add.s32 %r983, %r982, %r976; + xor.b32 %r984, %r983, %r978; + shf.l.wrap.b32 %r985, %r984, %r984, 25; + add.s32 %r986, %r938, %r94; + add.s32 %r987, %r986, %r957; + xor.b32 %r988, %r926, %r987; + shf.l.wrap.b32 %r989, %r988, %r988, 16; + add.s32 %r990, %r989, %r969; + xor.b32 %r991, %r990, %r957; + shf.l.wrap.b32 %r992, %r991, %r991, 20; + add.s32 %r993, %r987, %r164; + add.s32 %r994, %r993, %r992; + xor.b32 %r995, %r994, %r989; + shf.l.wrap.b32 %r996, %r995, %r995, 24; + add.s32 %r997, %r996, %r990; + xor.b32 %r998, %r997, %r992; + shf.l.wrap.b32 %r999, %r998, %r998, 25; + add.s32 %r1000, %r952, %r101; + add.s32 %r1001, %r1000, %r971; + xor.b32 %r1002, %r1001, %r940; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 16; + add.s32 %r1004, %r1003, %r927; + xor.b32 %r1005, %r1004, %r971; + shf.l.wrap.b32 %r1006, %r1005, %r1005, 20; + add.s32 %r1007, %r1001, %r108; + add.s32 %r1008, %r1007, %r1006; + xor.b32 %r1009, %r1008, %r1003; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 24; + add.s32 %r1011, %r1010, %r1004; + xor.b32 %r1012, %r1011, %r1006; + shf.l.wrap.b32 %r1013, %r1012, %r1012, 25; + add.s32 %r1014, %r966, %r129; + add.s32 %r1015, %r1014, %r929; + xor.b32 %r1016, %r1015, %r954; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 16; + add.s32 %r1018, %r1017, %r941; + xor.b32 %r1019, %r1018, %r929; + shf.l.wrap.b32 %r1020, %r1019, %r1019, 20; + add.s32 %r1021, %r1015, %r171; + add.s32 %r1022, %r1021, %r1020; + xor.b32 %r1023, %r1022, %r1017; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 24; + add.s32 %r1025, %r1024, %r1018; + xor.b32 %r1026, %r1025, %r1020; + shf.l.wrap.b32 %r1027, %r1026, %r1026, 25; + xor.b32 %r1028, %r1011, %r980; + xor.b32 %r1029, %r1025, %r994; + xor.b32 %r1030, %r983, %r1008; + xor.b32 %r1031, %r1022, %r997; + xor.b32 
%r1032, %r1027, %r996;
+ xor.b32 %r1033, %r985, %r1010;
+ xor.b32 %r1034, %r1024, %r999;
+ xor.b32 %r1035, %r1013, %r982;
+ // Serialize the eight chaining-value words %r1028-%r1035 to bytes at [%rd155].
+ st.local.u8 [%rd155], %r1028;
+ shr.u32 %r1036, %r1028, 8;
+ st.local.u8 [%rd155+1], %r1036;
+ shr.u32 %r1037, %r1028, 16;
+ st.local.u8 [%rd155+2], %r1037;
+ shr.u32 %r1038, %r1028, 24;
+ st.local.u8 [%rd155+3], %r1038;
+ st.local.u8 [%rd155+4], %r1029;
+ shr.u32 %r1039, %r1029, 8;
+ st.local.u8 [%rd155+5], %r1039;
+ shr.u32 %r1040, %r1029, 16;
+ st.local.u8 [%rd155+6], %r1040;
+ shr.u32 %r1041, %r1029, 24;
+ st.local.u8 [%rd155+7], %r1041;
+ st.local.u8 [%rd155+8], %r1030;
+ shr.u32 %r1042, %r1030, 8;
+ st.local.u8 [%rd155+9], %r1042;
+ shr.u32 %r1043, %r1030, 16;
+ st.local.u8 [%rd155+10], %r1043;
+ shr.u32 %r1044, %r1030, 24;
+ st.local.u8 [%rd155+11], %r1044;
+ st.local.u8 [%rd155+12], %r1031;
+ shr.u32 %r1045, %r1031, 8;
+ st.local.u8 [%rd155+13], %r1045;
+ shr.u32 %r1046, %r1031, 16;
+ st.local.u8 [%rd155+14], %r1046;
+ shr.u32 %r1047, %r1031, 24;
+ st.local.u8 [%rd155+15], %r1047;
+ st.local.u8 [%rd155+16], %r1032;
+ shr.u32 %r1048, %r1032, 8;
+ st.local.u8 [%rd155+17], %r1048;
+ shr.u32 %r1049, %r1032, 16;
+ st.local.u8 [%rd155+18], %r1049;
+ shr.u32 %r1050, %r1032, 24;
+ st.local.u8 [%rd155+19], %r1050;
+ st.local.u8 [%rd155+20], %r1033;
+ shr.u32 %r1051, %r1033, 8;
+ st.local.u8 [%rd155+21], %r1051;
+ shr.u32 %r1052, %r1033, 16;
+ st.local.u8 [%rd155+22], %r1052;
+ shr.u32 %r1053, %r1033, 24;
+ st.local.u8 [%rd155+23], %r1053;
+ st.local.u8 [%rd155+24], %r1034;
+ shr.u32 %r1054, %r1034, 8;
+ st.local.u8 [%rd155+25], %r1054;
+ shr.u32 %r1055, %r1034, 16;
+ st.local.u8 [%rd155+26], %r1055;
+ shr.u32 %r1056, %r1034, 24;
+ st.local.u8 [%rd155+27], %r1056;
+ st.local.u8 [%rd155+28], %r1035;
+ shr.u32 %r1057, %r1035, 8;
+ st.local.u8 [%rd155+29], %r1057;
+ shr.u32 %r1058, %r1035, 16;
+ st.local.u8 [%rd155+30], %r1058;
+ shr.u32 %r1059, %r1035, 24;
+ st.local.u8 [%rd155+31], %r1059;
+ // Advance cursors (%rd153 += 8, %rd155 += 32), decrement the remaining
+ // count %rd154, and loop back while work remains.
+ add.s64 %rd153, %rd153, 8;
+ add.s64 %rd155, %rd155, 32;
+ add.s64 %rd154, %rd154, -1;
+ setp.ne.s64 %p12, %rd154, 0;
+ @%p12 bra $L__BB0_7;
+
+$L__BB0_8:
+ setp.le.u64 %p13, %rd7, %rd152;
+ @%p13 bra $L__BB0_30;
+
+ add.u64 %rd144, %SPL, 96;
+ ld.param.u64 %rd142, [_ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5];
+ cvta.to.local.u64 %rd141, %rd142;
+ shl.b64 %rd111, %rd151, 6;
+ shl.b64 %rd112, %rd151, 5;
+ add.s64 %rd27, %rd141, %rd112;
+ add.s64 %rd28, %rd144, %rd111;
+ mov.u64 %rd156, 0;
+
+$L__BB0_10:
+ // Byte-copy one 32-byte chaining value from the stack buffer to the
+ // caller-supplied output buffer (param_5).
+ add.s64 %rd113, %rd28, %rd156;
+ ld.local.u8 %rs77, [%rd113];
+ add.s64 %rd114, %rd27, %rd156;
+ st.local.u8 [%rd114], %rs77;
+ add.s64 %rd156, %rd156, 1;
+ setp.lt.u64 %p14, %rd156, 32;
+ @%p14 bra $L__BB0_10;
+
+ add.s64 %rd151, %rd151, 1;
+
+$L__BB0_30:
+ // Return value: the number of chaining values produced.
+ st.param.b64 [func_retval0+0], %rd151;
+ ret;
+
+}
+// Demangled: blake3_hasher_update(blake3_hasher*, void const*, unsigned long long).
+.func _Z20blake3_hasher_updateP13blake3_hasherPKvy(
+ .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0,
+ .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1
+)
+{
+ .local .align 16 .b8 __local_depot1[144];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<54>;
+ .reg .b16 %rs<393>;
+ .reg .b32 %r<11690>;
+ .reg .b64 %rd<273>;
+
+
+ // %rd2 = hasher state, %rd261 = input cursor; %rd3 points 136 bytes into the
+ // hasher, where two byte-sized counters are kept (likely the buffered-byte
+ // and compressed-block counts).
+ mov.u64 %SPL, __local_depot1;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u64 %rd98, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0];
+ ld.param.u64 %rd254, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1];
+ cvta.to.local.u64 %rd261, %rd254;
+ cvta.to.local.u64 %rd2, %rd98;
+ add.s64 %rd3, %rd2, 136;
+ mov.u64 %rd262, 32;
+ ld.local.v2.u8 {%rs102,
%rs103}, [%rd2+136]; + cvt.u64.u16 %rd4, %rs103; + cvt.u32.u16 %r144, %rs103; + mul.wide.u32 %rd101, %r144, 64; + cvt.u64.u16 %rd5, %rs102; + neg.s64 %rd102, %rd5; + setp.eq.s64 %p1, %rd101, %rd102; + @%p1 bra $L__BB1_24; + + shl.b64 %rd103, %rd4, 6; + mov.u64 %rd104, 1024; + sub.s64 %rd105, %rd104, %rd5; + sub.s64 %rd106, %rd105, %rd103; + min.u64 %rd6, %rd106, 32; + setp.eq.s16 %p2, %rs102, 0; + mov.u16 %rs351, 0; + mov.u64 %rd244, %rd6; + @%p2 bra $L__BB1_9; + + cvt.u32.u16 %r145, %rs102; + prmt.b32 %r147, %r144, %r145, 30212; + cvt.u16.u32 %rs350, %r147; + mov.u64 %rd107, 64; + sub.s64 %rd108, %rd107, %rd5; + min.u64 %rd7, %rd108, %rd6; + setp.eq.s64 %p3, %rd7, 0; + @%p3 bra $L__BB1_6; + + add.s64 %rd110, %rd2, %rd5; + add.s64 %rd8, %rd110, 72; + mov.u64 %rd237, 0; + +$L__BB1_4: + add.s64 %rd111, %rd261, %rd237; + ld.local.u8 %rs107, [%rd111]; + add.s64 %rd112, %rd8, %rd237; + st.local.u8 [%rd112], %rs107; + add.s64 %rd237, %rd237, 1; + setp.lt.u64 %p4, %rd237, %rd7; + @%p4 bra $L__BB1_4; + + ld.local.u8 %rs350, [%rd3]; + +$L__BB1_6: + cvt.u16.u64 %rs108, %rd7; + add.s16 %rs351, %rs350, %rs108; + mov.u64 %rd244, 0; + st.local.u8 [%rd3], %rs351; + add.s64 %rd261, %rd261, %rd7; + sub.s64 %rd12, %rd6, %rd7; + setp.eq.s64 %p5, %rd12, 0; + @%p5 bra $L__BB1_9; + + add.s64 %rd13, %rd2, 72; + ld.local.u8 %rs109, [%rd3+1]; + mov.u64 %rd238, 0; + setp.eq.s16 %p6, %rs109, 0; + mov.u16 %rs351, 0; + selp.u16 %rs111, 1, 0, %p6; + ld.local.u8 %rs112, [%rd3+2]; + or.b16 %rs113, %rs112, %rs111; + ld.local.u8 %r148, [%rd3+-64]; + ld.local.u8 %r149, [%rd3+-63]; + prmt.b32 %r150, %r149, %r148, 30212; + ld.local.u8 %r151, [%rd3+-62]; + prmt.b32 %r152, %r151, %r150, 28756; + ld.local.u8 %r153, [%rd3+-61]; + prmt.b32 %r154, %r153, %r152, 1620; + ld.local.u8 %r155, [%rd3+-60]; + ld.local.u8 %r156, [%rd3+-59]; + prmt.b32 %r157, %r156, %r155, 30212; + ld.local.u8 %r158, [%rd3+-58]; + prmt.b32 %r159, %r158, %r157, 28756; + ld.local.u8 %r160, [%rd3+-57]; + prmt.b32 %r161, %r160, %r159, 1620; + ld.local.u8 %r162, [%rd3+-56]; + ld.local.u8 %r163, [%rd3+-55]; + prmt.b32 %r164, %r163, %r162, 30212; + ld.local.u8 %r165, [%rd3+-54]; + prmt.b32 %r166, %r165, %r164, 28756; + ld.local.u8 %r167, [%rd3+-53]; + prmt.b32 %r168, %r167, %r166, 1620; + ld.local.u8 %r169, [%rd3+-52]; + ld.local.u8 %r170, [%rd3+-51]; + prmt.b32 %r171, %r170, %r169, 30212; + ld.local.u8 %r172, [%rd3+-50]; + prmt.b32 %r173, %r172, %r171, 28756; + ld.local.u8 %r174, [%rd3+-49]; + prmt.b32 %r175, %r174, %r173, 1620; + ld.local.u8 %r176, [%rd3+-48]; + ld.local.u8 %r177, [%rd3+-47]; + prmt.b32 %r178, %r177, %r176, 30212; + ld.local.u8 %r179, [%rd3+-46]; + prmt.b32 %r180, %r179, %r178, 28756; + ld.local.u8 %r181, [%rd3+-45]; + prmt.b32 %r182, %r181, %r180, 1620; + ld.local.u8 %r183, [%rd3+-44]; + ld.local.u8 %r184, [%rd3+-43]; + prmt.b32 %r185, %r184, %r183, 30212; + ld.local.u8 %r186, [%rd3+-42]; + prmt.b32 %r187, %r186, %r185, 28756; + ld.local.u8 %r188, [%rd3+-41]; + prmt.b32 %r189, %r188, %r187, 1620; + ld.local.u8 %r190, [%rd3+-40]; + ld.local.u8 %r191, [%rd3+-39]; + prmt.b32 %r192, %r191, %r190, 30212; + ld.local.u8 %r193, [%rd3+-38]; + prmt.b32 %r194, %r193, %r192, 28756; + ld.local.u8 %r195, [%rd3+-37]; + prmt.b32 %r196, %r195, %r194, 1620; + ld.local.u8 %r197, [%rd3+-36]; + ld.local.u8 %r198, [%rd3+-35]; + prmt.b32 %r199, %r198, %r197, 30212; + ld.local.u8 %r200, [%rd3+-34]; + prmt.b32 %r201, %r200, %r199, 28756; + ld.local.u8 %r202, [%rd3+-33]; + prmt.b32 %r203, %r202, %r201, 1620; + ld.local.u8 %r204, [%rd3+-32]; + ld.local.u8 %r205, 
[%rd3+-31]; + prmt.b32 %r206, %r205, %r204, 30212; + ld.local.u8 %r207, [%rd3+-30]; + prmt.b32 %r208, %r207, %r206, 28756; + ld.local.u8 %r209, [%rd3+-29]; + prmt.b32 %r210, %r209, %r208, 1620; + ld.local.u8 %r211, [%rd3+-28]; + ld.local.u8 %r212, [%rd3+-27]; + prmt.b32 %r213, %r212, %r211, 30212; + ld.local.u8 %r214, [%rd3+-26]; + prmt.b32 %r215, %r214, %r213, 28756; + ld.local.u8 %r216, [%rd3+-25]; + prmt.b32 %r217, %r216, %r215, 1620; + ld.local.u8 %r218, [%rd3+-24]; + ld.local.u8 %r219, [%rd3+-23]; + prmt.b32 %r220, %r219, %r218, 30212; + ld.local.u8 %r221, [%rd3+-22]; + prmt.b32 %r222, %r221, %r220, 28756; + ld.local.u8 %r223, [%rd3+-21]; + prmt.b32 %r224, %r223, %r222, 1620; + ld.local.u8 %r225, [%rd3+-20]; + ld.local.u8 %r226, [%rd3+-19]; + prmt.b32 %r227, %r226, %r225, 30212; + ld.local.u8 %r228, [%rd3+-18]; + prmt.b32 %r229, %r228, %r227, 28756; + ld.local.u8 %r230, [%rd3+-17]; + prmt.b32 %r231, %r230, %r229, 1620; + ld.local.u8 %r232, [%rd3+-16]; + ld.local.u8 %r233, [%rd3+-15]; + prmt.b32 %r234, %r233, %r232, 30212; + ld.local.u8 %r235, [%rd3+-14]; + prmt.b32 %r236, %r235, %r234, 28756; + ld.local.u8 %r237, [%rd3+-13]; + prmt.b32 %r238, %r237, %r236, 1620; + ld.local.u8 %r239, [%rd3+-12]; + ld.local.u8 %r240, [%rd3+-11]; + prmt.b32 %r241, %r240, %r239, 30212; + ld.local.u8 %r242, [%rd3+-10]; + prmt.b32 %r243, %r242, %r241, 28756; + ld.local.u8 %r244, [%rd3+-9]; + prmt.b32 %r245, %r244, %r243, 1620; + ld.local.u8 %r246, [%rd3+-8]; + ld.local.u8 %r247, [%rd3+-7]; + prmt.b32 %r248, %r247, %r246, 30212; + ld.local.u8 %r249, [%rd3+-6]; + prmt.b32 %r250, %r249, %r248, 28756; + ld.local.u8 %r251, [%rd3+-5]; + prmt.b32 %r252, %r251, %r250, 1620; + ld.local.u8 %r253, [%rd3+-4]; + ld.local.u8 %r254, [%rd3+-3]; + prmt.b32 %r255, %r254, %r253, 30212; + ld.local.u8 %r256, [%rd3+-2]; + prmt.b32 %r257, %r256, %r255, 28756; + ld.local.u8 %r258, [%rd3+-1]; + prmt.b32 %r259, %r258, %r257, 1620; + ld.local.u64 %rd115, [%rd3+-72]; + cvt.u32.u64 %r260, %rd115; + shr.u64 %rd116, %rd115, 32; + cvt.u32.u64 %r261, %rd116; + cvt.u32.u16 %r262, %rs113; + and.b32 %r263, %r262, 255; + ld.local.u32 %r264, [%rd3+-104]; + add.s32 %r265, %r264, %r154; + ld.local.u32 %r266, [%rd3+-88]; + add.s32 %r267, %r265, %r266; + xor.b32 %r268, %r267, %r260; + shf.l.wrap.b32 %r269, %r268, %r268, 16; + add.s32 %r270, %r269, 1779033703; + xor.b32 %r271, %r270, %r266; + shf.l.wrap.b32 %r272, %r271, %r271, 20; + add.s32 %r273, %r267, %r161; + add.s32 %r274, %r273, %r272; + xor.b32 %r275, %r274, %r269; + shf.l.wrap.b32 %r276, %r275, %r275, 24; + add.s32 %r277, %r276, %r270; + xor.b32 %r278, %r277, %r272; + shf.l.wrap.b32 %r279, %r278, %r278, 25; + ld.local.u32 %r280, [%rd3+-100]; + add.s32 %r281, %r280, %r168; + ld.local.u32 %r282, [%rd3+-84]; + add.s32 %r283, %r281, %r282; + xor.b32 %r284, %r283, %r261; + shf.l.wrap.b32 %r285, %r284, %r284, 16; + add.s32 %r286, %r285, -1150833019; + xor.b32 %r287, %r286, %r282; + shf.l.wrap.b32 %r288, %r287, %r287, 20; + add.s32 %r289, %r283, %r175; + add.s32 %r290, %r289, %r288; + xor.b32 %r291, %r290, %r285; + shf.l.wrap.b32 %r292, %r291, %r291, 24; + add.s32 %r293, %r292, %r286; + xor.b32 %r294, %r293, %r288; + shf.l.wrap.b32 %r295, %r294, %r294, 25; + ld.local.u32 %r296, [%rd3+-96]; + add.s32 %r297, %r296, %r182; + ld.local.u32 %r298, [%rd3+-80]; + add.s32 %r299, %r297, %r298; + shr.u32 %r300, %r299, 16; + shl.b32 %r301, %r299, 16; + xor.b32 %r302, %r301, 4194304; + or.b32 %r303, %r302, %r300; + add.s32 %r304, %r303, 1013904242; + xor.b32 %r305, %r304, %r298; + shf.l.wrap.b32 %r306, %r305, 
%r305, 20; + add.s32 %r307, %r299, %r189; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + ld.local.u32 %r314, [%rd3+-92]; + add.s32 %r315, %r314, %r196; + ld.local.u32 %r316, [%rd3+-76]; + add.s32 %r317, %r315, %r316; + xor.b32 %r318, %r317, %r263; + shr.u32 %r319, %r317, 16; + shl.b32 %r320, %r318, 16; + or.b32 %r321, %r320, %r319; + add.s32 %r322, %r321, -1521486534; + xor.b32 %r323, %r322, %r316; + shf.l.wrap.b32 %r324, %r323, %r323, 20; + add.s32 %r325, %r317, %r203; + add.s32 %r326, %r325, %r324; + xor.b32 %r327, %r326, %r321; + shf.l.wrap.b32 %r328, %r327, %r327, 24; + add.s32 %r329, %r328, %r322; + xor.b32 %r330, %r329, %r324; + shf.l.wrap.b32 %r331, %r330, %r330, 25; + add.s32 %r332, %r274, %r210; + add.s32 %r333, %r332, %r295; + xor.b32 %r334, %r333, %r328; + shf.l.wrap.b32 %r335, %r334, %r334, 16; + add.s32 %r336, %r335, %r311; + xor.b32 %r337, %r336, %r295; + shf.l.wrap.b32 %r338, %r337, %r337, 20; + add.s32 %r339, %r333, %r217; + add.s32 %r340, %r339, %r338; + xor.b32 %r341, %r340, %r335; + shf.l.wrap.b32 %r342, %r341, %r341, 24; + add.s32 %r343, %r342, %r336; + xor.b32 %r344, %r343, %r338; + shf.l.wrap.b32 %r345, %r344, %r344, 25; + add.s32 %r346, %r290, %r224; + add.s32 %r347, %r346, %r313; + xor.b32 %r348, %r347, %r276; + shf.l.wrap.b32 %r349, %r348, %r348, 16; + add.s32 %r350, %r349, %r329; + xor.b32 %r351, %r350, %r313; + shf.l.wrap.b32 %r352, %r351, %r351, 20; + add.s32 %r353, %r347, %r231; + add.s32 %r354, %r353, %r352; + xor.b32 %r355, %r354, %r349; + shf.l.wrap.b32 %r356, %r355, %r355, 24; + add.s32 %r357, %r356, %r350; + xor.b32 %r358, %r357, %r352; + shf.l.wrap.b32 %r359, %r358, %r358, 25; + add.s32 %r360, %r308, %r238; + add.s32 %r361, %r360, %r331; + xor.b32 %r362, %r361, %r292; + shf.l.wrap.b32 %r363, %r362, %r362, 16; + add.s32 %r364, %r363, %r277; + xor.b32 %r365, %r364, %r331; + shf.l.wrap.b32 %r366, %r365, %r365, 20; + add.s32 %r367, %r361, %r245; + add.s32 %r368, %r367, %r366; + xor.b32 %r369, %r368, %r363; + shf.l.wrap.b32 %r370, %r369, %r369, 24; + add.s32 %r371, %r370, %r364; + xor.b32 %r372, %r371, %r366; + shf.l.wrap.b32 %r373, %r372, %r372, 25; + add.s32 %r374, %r326, %r252; + add.s32 %r375, %r374, %r279; + xor.b32 %r376, %r375, %r310; + shf.l.wrap.b32 %r377, %r376, %r376, 16; + add.s32 %r378, %r377, %r293; + xor.b32 %r379, %r378, %r279; + shf.l.wrap.b32 %r380, %r379, %r379, 20; + add.s32 %r381, %r375, %r259; + add.s32 %r382, %r381, %r380; + xor.b32 %r383, %r382, %r377; + shf.l.wrap.b32 %r384, %r383, %r383, 24; + add.s32 %r385, %r384, %r378; + xor.b32 %r386, %r385, %r380; + shf.l.wrap.b32 %r387, %r386, %r386, 25; + add.s32 %r388, %r340, %r168; + add.s32 %r389, %r388, %r387; + xor.b32 %r390, %r389, %r356; + shf.l.wrap.b32 %r391, %r390, %r390, 16; + add.s32 %r392, %r391, %r371; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 20; + add.s32 %r395, %r389, %r196; + add.s32 %r396, %r395, %r394; + xor.b32 %r397, %r396, %r391; + shf.l.wrap.b32 %r398, %r397, %r397, 24; + add.s32 %r399, %r398, %r392; + xor.b32 %r400, %r399, %r394; + shf.l.wrap.b32 %r401, %r400, %r400, 25; + add.s32 %r402, %r354, %r175; + add.s32 %r403, %r402, %r345; + xor.b32 %r404, %r403, %r370; + shf.l.wrap.b32 %r405, %r404, %r404, 16; + add.s32 %r406, %r405, %r385; + xor.b32 %r407, %r406, %r345; + shf.l.wrap.b32 %r408, %r407, %r407, 20; + add.s32 %r409, %r403, %r224; + add.s32 %r410, %r409, %r408; + xor.b32 %r411, 
%r410, %r405; + shf.l.wrap.b32 %r412, %r411, %r411, 24; + add.s32 %r413, %r412, %r406; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 25; + add.s32 %r416, %r368, %r203; + add.s32 %r417, %r416, %r359; + xor.b32 %r418, %r417, %r384; + shf.l.wrap.b32 %r419, %r418, %r418, 16; + add.s32 %r420, %r419, %r343; + xor.b32 %r421, %r420, %r359; + shf.l.wrap.b32 %r422, %r421, %r421, 20; + add.s32 %r423, %r417, %r154; + add.s32 %r424, %r423, %r422; + xor.b32 %r425, %r424, %r419; + shf.l.wrap.b32 %r426, %r425, %r425, 24; + add.s32 %r427, %r426, %r420; + xor.b32 %r428, %r427, %r422; + shf.l.wrap.b32 %r429, %r428, %r428, 25; + add.s32 %r430, %r382, %r182; + add.s32 %r431, %r430, %r373; + xor.b32 %r432, %r431, %r342; + shf.l.wrap.b32 %r433, %r432, %r432, 16; + add.s32 %r434, %r433, %r357; + xor.b32 %r435, %r434, %r373; + shf.l.wrap.b32 %r436, %r435, %r435, 20; + add.s32 %r437, %r431, %r245; + add.s32 %r438, %r437, %r436; + xor.b32 %r439, %r438, %r433; + shf.l.wrap.b32 %r440, %r439, %r439, 24; + add.s32 %r441, %r440, %r434; + xor.b32 %r442, %r441, %r436; + shf.l.wrap.b32 %r443, %r442, %r442, 25; + add.s32 %r444, %r396, %r161; + add.s32 %r445, %r444, %r415; + xor.b32 %r446, %r445, %r440; + shf.l.wrap.b32 %r447, %r446, %r446, 16; + add.s32 %r448, %r447, %r427; + xor.b32 %r449, %r448, %r415; + shf.l.wrap.b32 %r450, %r449, %r449, 20; + add.s32 %r451, %r445, %r231; + add.s32 %r452, %r451, %r450; + xor.b32 %r453, %r452, %r447; + shf.l.wrap.b32 %r454, %r453, %r453, 24; + add.s32 %r455, %r454, %r448; + xor.b32 %r456, %r455, %r450; + shf.l.wrap.b32 %r457, %r456, %r456, 25; + add.s32 %r458, %r410, %r238; + add.s32 %r459, %r458, %r429; + xor.b32 %r460, %r459, %r398; + shf.l.wrap.b32 %r461, %r460, %r460, 16; + add.s32 %r462, %r461, %r441; + xor.b32 %r463, %r462, %r429; + shf.l.wrap.b32 %r464, %r463, %r463, 20; + add.s32 %r465, %r459, %r189; + add.s32 %r466, %r465, %r464; + xor.b32 %r467, %r466, %r461; + shf.l.wrap.b32 %r468, %r467, %r467, 24; + add.s32 %r469, %r468, %r462; + xor.b32 %r470, %r469, %r464; + shf.l.wrap.b32 %r471, %r470, %r470, 25; + add.s32 %r472, %r424, %r217; + add.s32 %r473, %r472, %r443; + xor.b32 %r474, %r473, %r412; + shf.l.wrap.b32 %r475, %r474, %r474, 16; + add.s32 %r476, %r475, %r399; + xor.b32 %r477, %r476, %r443; + shf.l.wrap.b32 %r478, %r477, %r477, 20; + add.s32 %r479, %r473, %r252; + add.s32 %r480, %r479, %r478; + xor.b32 %r481, %r480, %r475; + shf.l.wrap.b32 %r482, %r481, %r481, 24; + add.s32 %r483, %r482, %r476; + xor.b32 %r484, %r483, %r478; + shf.l.wrap.b32 %r485, %r484, %r484, 25; + add.s32 %r486, %r438, %r259; + add.s32 %r487, %r486, %r401; + xor.b32 %r488, %r487, %r426; + shf.l.wrap.b32 %r489, %r488, %r488, 16; + add.s32 %r490, %r489, %r413; + xor.b32 %r491, %r490, %r401; + shf.l.wrap.b32 %r492, %r491, %r491, 20; + add.s32 %r493, %r487, %r210; + add.s32 %r494, %r493, %r492; + xor.b32 %r495, %r494, %r489; + shf.l.wrap.b32 %r496, %r495, %r495, 24; + add.s32 %r497, %r496, %r490; + xor.b32 %r498, %r497, %r492; + shf.l.wrap.b32 %r499, %r498, %r498, 25; + add.s32 %r500, %r452, %r175; + add.s32 %r501, %r500, %r499; + xor.b32 %r502, %r501, %r468; + shf.l.wrap.b32 %r503, %r502, %r502, 16; + add.s32 %r504, %r503, %r483; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 20; + add.s32 %r507, %r501, %r182; + add.s32 %r508, %r507, %r506; + xor.b32 %r509, %r508, %r503; + shf.l.wrap.b32 %r510, %r509, %r509, 24; + add.s32 %r511, %r510, %r504; + xor.b32 %r512, %r511, %r506; + shf.l.wrap.b32 %r513, %r512, %r512, 25; + add.s32 %r514, %r466, %r224; + add.s32 %r515, 
%r514, %r457; + xor.b32 %r516, %r515, %r482; + shf.l.wrap.b32 %r517, %r516, %r516, 16; + add.s32 %r518, %r517, %r497; + xor.b32 %r519, %r518, %r457; + shf.l.wrap.b32 %r520, %r519, %r519, 20; + add.s32 %r521, %r515, %r238; + add.s32 %r522, %r521, %r520; + xor.b32 %r523, %r522, %r517; + shf.l.wrap.b32 %r524, %r523, %r523, 24; + add.s32 %r525, %r524, %r518; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 25; + add.s32 %r528, %r480, %r245; + add.s32 %r529, %r528, %r471; + xor.b32 %r530, %r529, %r496; + shf.l.wrap.b32 %r531, %r530, %r530, 16; + add.s32 %r532, %r531, %r455; + xor.b32 %r533, %r532, %r471; + shf.l.wrap.b32 %r534, %r533, %r533, 20; + add.s32 %r535, %r529, %r168; + add.s32 %r536, %r535, %r534; + xor.b32 %r537, %r536, %r531; + shf.l.wrap.b32 %r538, %r537, %r537, 24; + add.s32 %r539, %r538, %r532; + xor.b32 %r540, %r539, %r534; + shf.l.wrap.b32 %r541, %r540, %r540, 25; + add.s32 %r542, %r494, %r203; + add.s32 %r543, %r542, %r485; + xor.b32 %r544, %r543, %r454; + shf.l.wrap.b32 %r545, %r544, %r544, 16; + add.s32 %r546, %r545, %r469; + xor.b32 %r547, %r546, %r485; + shf.l.wrap.b32 %r548, %r547, %r547, 20; + add.s32 %r549, %r543, %r252; + add.s32 %r550, %r549, %r548; + xor.b32 %r551, %r550, %r545; + shf.l.wrap.b32 %r552, %r551, %r551, 24; + add.s32 %r553, %r552, %r546; + xor.b32 %r554, %r553, %r548; + shf.l.wrap.b32 %r555, %r554, %r554, 25; + add.s32 %r556, %r508, %r196; + add.s32 %r557, %r556, %r527; + xor.b32 %r558, %r557, %r552; + shf.l.wrap.b32 %r559, %r558, %r558, 16; + add.s32 %r560, %r559, %r539; + xor.b32 %r561, %r560, %r527; + shf.l.wrap.b32 %r562, %r561, %r561, 20; + add.s32 %r563, %r557, %r189; + add.s32 %r564, %r563, %r562; + xor.b32 %r565, %r564, %r559; + shf.l.wrap.b32 %r566, %r565, %r565, 24; + add.s32 %r567, %r566, %r560; + xor.b32 %r568, %r567, %r562; + shf.l.wrap.b32 %r569, %r568, %r568, 25; + add.s32 %r570, %r522, %r217; + add.s32 %r571, %r570, %r541; + xor.b32 %r572, %r571, %r510; + shf.l.wrap.b32 %r573, %r572, %r572, 16; + add.s32 %r574, %r573, %r553; + xor.b32 %r575, %r574, %r541; + shf.l.wrap.b32 %r576, %r575, %r575, 20; + add.s32 %r577, %r571, %r154; + add.s32 %r578, %r577, %r576; + xor.b32 %r579, %r578, %r573; + shf.l.wrap.b32 %r580, %r579, %r579, 24; + add.s32 %r581, %r580, %r574; + xor.b32 %r582, %r581, %r576; + shf.l.wrap.b32 %r583, %r582, %r582, 25; + add.s32 %r584, %r536, %r231; + add.s32 %r585, %r584, %r555; + xor.b32 %r586, %r585, %r524; + shf.l.wrap.b32 %r587, %r586, %r586, 16; + add.s32 %r588, %r587, %r511; + xor.b32 %r589, %r588, %r555; + shf.l.wrap.b32 %r590, %r589, %r589, 20; + add.s32 %r591, %r585, %r259; + add.s32 %r592, %r591, %r590; + xor.b32 %r593, %r592, %r587; + shf.l.wrap.b32 %r594, %r593, %r593, 24; + add.s32 %r595, %r594, %r588; + xor.b32 %r596, %r595, %r590; + shf.l.wrap.b32 %r597, %r596, %r596, 25; + add.s32 %r598, %r550, %r210; + add.s32 %r599, %r598, %r513; + xor.b32 %r600, %r599, %r538; + shf.l.wrap.b32 %r601, %r600, %r600, 16; + add.s32 %r602, %r601, %r525; + xor.b32 %r603, %r602, %r513; + shf.l.wrap.b32 %r604, %r603, %r603, 20; + add.s32 %r605, %r599, %r161; + add.s32 %r606, %r605, %r604; + xor.b32 %r607, %r606, %r601; + shf.l.wrap.b32 %r608, %r607, %r607, 24; + add.s32 %r609, %r608, %r602; + xor.b32 %r610, %r609, %r604; + shf.l.wrap.b32 %r611, %r610, %r610, 25; + add.s32 %r612, %r564, %r224; + add.s32 %r613, %r612, %r611; + xor.b32 %r614, %r613, %r580; + shf.l.wrap.b32 %r615, %r614, %r614, 16; + add.s32 %r616, %r615, %r595; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 20; + add.s32 %r619, 
%r613, %r203; + add.s32 %r620, %r619, %r618; + xor.b32 %r621, %r620, %r615; + shf.l.wrap.b32 %r622, %r621, %r621, 24; + add.s32 %r623, %r622, %r616; + xor.b32 %r624, %r623, %r618; + shf.l.wrap.b32 %r625, %r624, %r624, 25; + add.s32 %r626, %r578, %r238; + add.s32 %r627, %r626, %r569; + xor.b32 %r628, %r627, %r594; + shf.l.wrap.b32 %r629, %r628, %r628, 16; + add.s32 %r630, %r629, %r609; + xor.b32 %r631, %r630, %r569; + shf.l.wrap.b32 %r632, %r631, %r631, 20; + add.s32 %r633, %r627, %r217; + add.s32 %r634, %r633, %r632; + xor.b32 %r635, %r634, %r629; + shf.l.wrap.b32 %r636, %r635, %r635, 24; + add.s32 %r637, %r636, %r630; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 25; + add.s32 %r640, %r592, %r252; + add.s32 %r641, %r640, %r583; + xor.b32 %r642, %r641, %r608; + shf.l.wrap.b32 %r643, %r642, %r642, 16; + add.s32 %r644, %r643, %r567; + xor.b32 %r645, %r644, %r583; + shf.l.wrap.b32 %r646, %r645, %r645, 20; + add.s32 %r647, %r641, %r175; + add.s32 %r648, %r647, %r646; + xor.b32 %r649, %r648, %r643; + shf.l.wrap.b32 %r650, %r649, %r649, 24; + add.s32 %r651, %r650, %r644; + xor.b32 %r652, %r651, %r646; + shf.l.wrap.b32 %r653, %r652, %r652, 25; + add.s32 %r654, %r606, %r245; + add.s32 %r655, %r654, %r597; + xor.b32 %r656, %r655, %r566; + shf.l.wrap.b32 %r657, %r656, %r656, 16; + add.s32 %r658, %r657, %r581; + xor.b32 %r659, %r658, %r597; + shf.l.wrap.b32 %r660, %r659, %r659, 20; + add.s32 %r661, %r655, %r259; + add.s32 %r662, %r661, %r660; + xor.b32 %r663, %r662, %r657; + shf.l.wrap.b32 %r664, %r663, %r663, 24; + add.s32 %r665, %r664, %r658; + xor.b32 %r666, %r665, %r660; + shf.l.wrap.b32 %r667, %r666, %r666, 25; + add.s32 %r668, %r620, %r182; + add.s32 %r669, %r668, %r639; + xor.b32 %r670, %r669, %r664; + shf.l.wrap.b32 %r671, %r670, %r670, 16; + add.s32 %r672, %r671, %r651; + xor.b32 %r673, %r672, %r639; + shf.l.wrap.b32 %r674, %r673, %r673, 20; + add.s32 %r675, %r669, %r154; + add.s32 %r676, %r675, %r674; + xor.b32 %r677, %r676, %r671; + shf.l.wrap.b32 %r678, %r677, %r677, 24; + add.s32 %r679, %r678, %r672; + xor.b32 %r680, %r679, %r674; + shf.l.wrap.b32 %r681, %r680, %r680, 25; + add.s32 %r682, %r634, %r231; + add.s32 %r683, %r682, %r653; + xor.b32 %r684, %r683, %r622; + shf.l.wrap.b32 %r685, %r684, %r684, 16; + add.s32 %r686, %r685, %r665; + xor.b32 %r687, %r686, %r653; + shf.l.wrap.b32 %r688, %r687, %r687, 20; + add.s32 %r689, %r683, %r168; + add.s32 %r690, %r689, %r688; + xor.b32 %r691, %r690, %r685; + shf.l.wrap.b32 %r692, %r691, %r691, 24; + add.s32 %r693, %r692, %r686; + xor.b32 %r694, %r693, %r688; + shf.l.wrap.b32 %r695, %r694, %r694, 25; + add.s32 %r696, %r648, %r189; + add.s32 %r697, %r696, %r667; + xor.b32 %r698, %r697, %r636; + shf.l.wrap.b32 %r699, %r698, %r698, 16; + add.s32 %r700, %r699, %r623; + xor.b32 %r701, %r700, %r667; + shf.l.wrap.b32 %r702, %r701, %r701, 20; + add.s32 %r703, %r697, %r210; + add.s32 %r704, %r703, %r702; + xor.b32 %r705, %r704, %r699; + shf.l.wrap.b32 %r706, %r705, %r705, 24; + add.s32 %r707, %r706, %r700; + xor.b32 %r708, %r707, %r702; + shf.l.wrap.b32 %r709, %r708, %r708, 25; + add.s32 %r710, %r662, %r161; + add.s32 %r711, %r710, %r625; + xor.b32 %r712, %r711, %r650; + shf.l.wrap.b32 %r713, %r712, %r712, 16; + add.s32 %r714, %r713, %r637; + xor.b32 %r715, %r714, %r625; + shf.l.wrap.b32 %r716, %r715, %r715, 20; + add.s32 %r717, %r711, %r196; + add.s32 %r718, %r717, %r716; + xor.b32 %r719, %r718, %r713; + shf.l.wrap.b32 %r720, %r719, %r719, 24; + add.s32 %r721, %r720, %r714; + xor.b32 %r722, %r721, %r716; + shf.l.wrap.b32 %r723, 
%r722, %r722, 25; + add.s32 %r724, %r676, %r238; + add.s32 %r725, %r724, %r723; + xor.b32 %r726, %r725, %r692; + shf.l.wrap.b32 %r727, %r726, %r726, 16; + add.s32 %r728, %r727, %r707; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 20; + add.s32 %r731, %r725, %r245; + add.s32 %r732, %r731, %r730; + xor.b32 %r733, %r732, %r727; + shf.l.wrap.b32 %r734, %r733, %r733, 24; + add.s32 %r735, %r734, %r728; + xor.b32 %r736, %r735, %r730; + shf.l.wrap.b32 %r737, %r736, %r736, 25; + add.s32 %r738, %r690, %r217; + add.s32 %r739, %r738, %r681; + xor.b32 %r740, %r739, %r706; + shf.l.wrap.b32 %r741, %r740, %r740, 16; + add.s32 %r742, %r741, %r721; + xor.b32 %r743, %r742, %r681; + shf.l.wrap.b32 %r744, %r743, %r743, 20; + add.s32 %r745, %r739, %r231; + add.s32 %r746, %r745, %r744; + xor.b32 %r747, %r746, %r741; + shf.l.wrap.b32 %r748, %r747, %r747, 24; + add.s32 %r749, %r748, %r742; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 25; + add.s32 %r752, %r704, %r259; + add.s32 %r753, %r752, %r695; + xor.b32 %r754, %r753, %r720; + shf.l.wrap.b32 %r755, %r754, %r754, 16; + add.s32 %r756, %r755, %r679; + xor.b32 %r757, %r756, %r695; + shf.l.wrap.b32 %r758, %r757, %r757, 20; + add.s32 %r759, %r753, %r224; + add.s32 %r760, %r759, %r758; + xor.b32 %r761, %r760, %r755; + shf.l.wrap.b32 %r762, %r761, %r761, 24; + add.s32 %r763, %r762, %r756; + xor.b32 %r764, %r763, %r758; + shf.l.wrap.b32 %r765, %r764, %r764, 25; + add.s32 %r766, %r718, %r252; + add.s32 %r767, %r766, %r709; + xor.b32 %r768, %r767, %r678; + shf.l.wrap.b32 %r769, %r768, %r768, 16; + add.s32 %r770, %r769, %r693; + xor.b32 %r771, %r770, %r709; + shf.l.wrap.b32 %r772, %r771, %r771, 20; + add.s32 %r773, %r767, %r210; + add.s32 %r774, %r773, %r772; + xor.b32 %r775, %r774, %r769; + shf.l.wrap.b32 %r776, %r775, %r775, 24; + add.s32 %r777, %r776, %r770; + xor.b32 %r778, %r777, %r772; + shf.l.wrap.b32 %r779, %r778, %r778, 25; + add.s32 %r780, %r732, %r203; + add.s32 %r781, %r780, %r751; + xor.b32 %r782, %r781, %r776; + shf.l.wrap.b32 %r783, %r782, %r782, 16; + add.s32 %r784, %r783, %r763; + xor.b32 %r785, %r784, %r751; + shf.l.wrap.b32 %r786, %r785, %r785, 20; + add.s32 %r787, %r781, %r168; + add.s32 %r788, %r787, %r786; + xor.b32 %r789, %r788, %r783; + shf.l.wrap.b32 %r790, %r789, %r789, 24; + add.s32 %r791, %r790, %r784; + xor.b32 %r792, %r791, %r786; + shf.l.wrap.b32 %r793, %r792, %r792, 25; + add.s32 %r794, %r746, %r189; + add.s32 %r795, %r794, %r765; + xor.b32 %r796, %r795, %r734; + shf.l.wrap.b32 %r797, %r796, %r796, 16; + add.s32 %r798, %r797, %r777; + xor.b32 %r799, %r798, %r765; + shf.l.wrap.b32 %r800, %r799, %r799, 20; + add.s32 %r801, %r795, %r175; + add.s32 %r802, %r801, %r800; + xor.b32 %r803, %r802, %r797; + shf.l.wrap.b32 %r804, %r803, %r803, 24; + add.s32 %r805, %r804, %r798; + xor.b32 %r806, %r805, %r800; + shf.l.wrap.b32 %r807, %r806, %r806, 25; + add.s32 %r808, %r760, %r154; + add.s32 %r809, %r808, %r779; + xor.b32 %r810, %r809, %r748; + shf.l.wrap.b32 %r811, %r810, %r810, 16; + add.s32 %r812, %r811, %r735; + xor.b32 %r813, %r812, %r779; + shf.l.wrap.b32 %r814, %r813, %r813, 20; + add.s32 %r815, %r809, %r161; + add.s32 %r816, %r815, %r814; + xor.b32 %r817, %r816, %r811; + shf.l.wrap.b32 %r818, %r817, %r817, 24; + add.s32 %r819, %r818, %r812; + xor.b32 %r820, %r819, %r814; + shf.l.wrap.b32 %r821, %r820, %r820, 25; + add.s32 %r822, %r774, %r196; + add.s32 %r823, %r822, %r737; + xor.b32 %r824, %r823, %r762; + shf.l.wrap.b32 %r825, %r824, %r824, 16; + add.s32 %r826, %r825, %r749; + xor.b32 %r827, %r826, 
%r737; + shf.l.wrap.b32 %r828, %r827, %r827, 20; + add.s32 %r829, %r823, %r182; + add.s32 %r830, %r829, %r828; + xor.b32 %r831, %r830, %r825; + shf.l.wrap.b32 %r832, %r831, %r831, 24; + add.s32 %r833, %r832, %r826; + xor.b32 %r834, %r833, %r828; + shf.l.wrap.b32 %r835, %r834, %r834, 25; + add.s32 %r836, %r788, %r217; + add.s32 %r837, %r836, %r835; + xor.b32 %r838, %r837, %r804; + shf.l.wrap.b32 %r839, %r838, %r838, 16; + add.s32 %r840, %r839, %r819; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 20; + add.s32 %r843, %r837, %r252; + add.s32 %r844, %r843, %r842; + xor.b32 %r845, %r844, %r839; + shf.l.wrap.b32 %r846, %r845, %r845, 24; + add.s32 %r847, %r846, %r840; + xor.b32 %r848, %r847, %r842; + shf.l.wrap.b32 %r849, %r848, %r848, 25; + add.s32 %r850, %r802, %r231; + add.s32 %r851, %r850, %r793; + xor.b32 %r852, %r851, %r818; + shf.l.wrap.b32 %r853, %r852, %r852, 16; + add.s32 %r854, %r853, %r833; + xor.b32 %r855, %r854, %r793; + shf.l.wrap.b32 %r856, %r855, %r855, 20; + add.s32 %r857, %r851, %r189; + add.s32 %r858, %r857, %r856; + xor.b32 %r859, %r858, %r853; + shf.l.wrap.b32 %r860, %r859, %r859, 24; + add.s32 %r861, %r860, %r854; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 25; + add.s32 %r864, %r816, %r210; + add.s32 %r865, %r864, %r807; + xor.b32 %r866, %r865, %r832; + shf.l.wrap.b32 %r867, %r866, %r866, 16; + add.s32 %r868, %r867, %r791; + xor.b32 %r869, %r868, %r807; + shf.l.wrap.b32 %r870, %r869, %r869, 20; + add.s32 %r871, %r865, %r238; + add.s32 %r872, %r871, %r870; + xor.b32 %r873, %r872, %r867; + shf.l.wrap.b32 %r874, %r873, %r873, 24; + add.s32 %r875, %r874, %r868; + xor.b32 %r876, %r875, %r870; + shf.l.wrap.b32 %r877, %r876, %r876, 25; + add.s32 %r878, %r830, %r259; + add.s32 %r879, %r878, %r821; + xor.b32 %r880, %r879, %r790; + shf.l.wrap.b32 %r881, %r880, %r880, 16; + add.s32 %r882, %r881, %r805; + xor.b32 %r883, %r882, %r821; + shf.l.wrap.b32 %r884, %r883, %r883, 20; + add.s32 %r885, %r879, %r161; + add.s32 %r886, %r885, %r884; + xor.b32 %r887, %r886, %r881; + shf.l.wrap.b32 %r888, %r887, %r887, 24; + add.s32 %r889, %r888, %r882; + xor.b32 %r890, %r889, %r884; + shf.l.wrap.b32 %r891, %r890, %r890, 25; + add.s32 %r892, %r844, %r245; + add.s32 %r893, %r892, %r863; + xor.b32 %r894, %r893, %r888; + shf.l.wrap.b32 %r895, %r894, %r894, 16; + add.s32 %r896, %r895, %r875; + xor.b32 %r897, %r896, %r863; + shf.l.wrap.b32 %r898, %r897, %r897, 20; + add.s32 %r899, %r893, %r175; + add.s32 %r900, %r899, %r898; + xor.b32 %r901, %r900, %r895; + shf.l.wrap.b32 %r902, %r901, %r901, 24; + add.s32 %r903, %r902, %r896; + xor.b32 %r904, %r903, %r898; + shf.l.wrap.b32 %r905, %r904, %r904, 25; + add.s32 %r906, %r858, %r154; + add.s32 %r907, %r906, %r877; + xor.b32 %r908, %r907, %r846; + shf.l.wrap.b32 %r909, %r908, %r908, 16; + add.s32 %r910, %r909, %r889; + xor.b32 %r911, %r910, %r877; + shf.l.wrap.b32 %r912, %r911, %r911, 20; + add.s32 %r913, %r907, %r224; + add.s32 %r914, %r913, %r912; + xor.b32 %r915, %r914, %r909; + shf.l.wrap.b32 %r916, %r915, %r915, 24; + add.s32 %r917, %r916, %r910; + xor.b32 %r918, %r917, %r912; + shf.l.wrap.b32 %r919, %r918, %r918, 25; + add.s32 %r920, %r872, %r168; + add.s32 %r921, %r920, %r891; + xor.b32 %r922, %r921, %r860; + shf.l.wrap.b32 %r923, %r922, %r922, 16; + add.s32 %r924, %r923, %r847; + xor.b32 %r925, %r924, %r891; + shf.l.wrap.b32 %r926, %r925, %r925, 20; + add.s32 %r927, %r921, %r196; + add.s32 %r928, %r927, %r926; + xor.b32 %r929, %r928, %r923; + shf.l.wrap.b32 %r930, %r929, %r929, 24; + add.s32 %r931, %r930, 
%r924; + xor.b32 %r932, %r931, %r926; + shf.l.wrap.b32 %r933, %r932, %r932, 25; + add.s32 %r934, %r886, %r182; + add.s32 %r935, %r934, %r849; + xor.b32 %r936, %r935, %r874; + shf.l.wrap.b32 %r937, %r936, %r936, 16; + add.s32 %r938, %r937, %r861; + xor.b32 %r939, %r938, %r849; + shf.l.wrap.b32 %r940, %r939, %r939, 20; + add.s32 %r941, %r935, %r203; + add.s32 %r942, %r941, %r940; + xor.b32 %r943, %r942, %r937; + shf.l.wrap.b32 %r944, %r943, %r943, 24; + add.s32 %r945, %r944, %r938; + xor.b32 %r946, %r945, %r940; + shf.l.wrap.b32 %r947, %r946, %r946, 25; + add.s32 %r948, %r900, %r231; + add.s32 %r949, %r948, %r947; + xor.b32 %r950, %r949, %r916; + shf.l.wrap.b32 %r951, %r950, %r950, 16; + add.s32 %r952, %r951, %r931; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 20; + add.s32 %r955, %r949, %r259; + add.s32 %r956, %r955, %r954; + xor.b32 %r957, %r956, %r951; + shf.l.wrap.b32 %r958, %r957, %r957, 24; + add.s32 %r959, %r958, %r952; + xor.b32 %r960, %r959, %r954; + shf.l.wrap.b32 %r961, %r960, %r960, 25; + add.s32 %r962, %r914, %r189; + add.s32 %r963, %r962, %r905; + xor.b32 %r964, %r963, %r930; + shf.l.wrap.b32 %r965, %r964, %r964, 16; + add.s32 %r966, %r965, %r945; + xor.b32 %r967, %r966, %r905; + shf.l.wrap.b32 %r968, %r967, %r967, 20; + add.s32 %r969, %r963, %r154; + add.s32 %r970, %r969, %r968; + xor.b32 %r971, %r970, %r965; + shf.l.wrap.b32 %r972, %r971, %r971, 24; + add.s32 %r973, %r972, %r966; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 25; + add.s32 %r976, %r928, %r161; + add.s32 %r977, %r976, %r919; + xor.b32 %r978, %r977, %r944; + shf.l.wrap.b32 %r979, %r978, %r978, 16; + add.s32 %r980, %r979, %r903; + xor.b32 %r981, %r980, %r919; + shf.l.wrap.b32 %r982, %r981, %r981, 20; + add.s32 %r983, %r977, %r217; + add.s32 %r984, %r983, %r982; + xor.b32 %r985, %r984, %r979; + shf.l.wrap.b32 %r986, %r985, %r985, 24; + add.s32 %r987, %r986, %r980; + xor.b32 %r988, %r987, %r982; + shf.l.wrap.b32 %r989, %r988, %r988, 25; + add.s32 %r990, %r942, %r210; + add.s32 %r991, %r990, %r933; + xor.b32 %r992, %r991, %r902; + shf.l.wrap.b32 %r993, %r992, %r992, 16; + add.s32 %r994, %r993, %r917; + xor.b32 %r995, %r994, %r933; + shf.l.wrap.b32 %r996, %r995, %r995, 20; + add.s32 %r997, %r991, %r196; + add.s32 %r998, %r997, %r996; + xor.b32 %r999, %r998, %r993; + shf.l.wrap.b32 %r1000, %r999, %r999, 24; + add.s32 %r1001, %r1000, %r994; + xor.b32 %r1002, %r1001, %r996; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 25; + add.s32 %r1004, %r956, %r252; + add.s32 %r1005, %r1004, %r975; + xor.b32 %r1006, %r1005, %r1000; + shf.l.wrap.b32 %r1007, %r1006, %r1006, 16; + add.s32 %r1008, %r1007, %r987; + xor.b32 %r1009, %r1008, %r975; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 20; + add.s32 %r1011, %r1005, %r224; + add.s32 %r1012, %r1011, %r1010; + xor.b32 %r1013, %r1012, %r1007; + shf.l.wrap.b32 %r1014, %r1013, %r1013, 24; + add.s32 %r1015, %r1014, %r1008; + xor.b32 %r1016, %r1015, %r1010; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 25; + add.s32 %r1018, %r970, %r168; + add.s32 %r1019, %r1018, %r989; + xor.b32 %r1020, %r1019, %r958; + shf.l.wrap.b32 %r1021, %r1020, %r1020, 16; + add.s32 %r1022, %r1021, %r1001; + xor.b32 %r1023, %r1022, %r989; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 20; + add.s32 %r1025, %r1019, %r238; + add.s32 %r1026, %r1025, %r1024; + xor.b32 %r1027, %r1026, %r1021; + shf.l.wrap.b32 %r1028, %r1027, %r1027, 24; + add.s32 %r1029, %r1028, %r1022; + xor.b32 %r1030, %r1029, %r1024; + shf.l.wrap.b32 %r1031, %r1030, %r1030, 25; + add.s32 %r1032, %r984, %r175; + add.s32 
%r1033, %r1032, %r1003; + xor.b32 %r1034, %r1033, %r972; + shf.l.wrap.b32 %r1035, %r1034, %r1034, 16; + add.s32 %r1036, %r1035, %r959; + xor.b32 %r1037, %r1036, %r1003; + shf.l.wrap.b32 %r1038, %r1037, %r1037, 20; + add.s32 %r1039, %r1033, %r182; + add.s32 %r1040, %r1039, %r1038; + xor.b32 %r1041, %r1040, %r1035; + shf.l.wrap.b32 %r1042, %r1041, %r1041, 24; + add.s32 %r1043, %r1042, %r1036; + xor.b32 %r1044, %r1043, %r1038; + shf.l.wrap.b32 %r1045, %r1044, %r1044, 25; + add.s32 %r1046, %r998, %r203; + add.s32 %r1047, %r1046, %r961; + xor.b32 %r1048, %r1047, %r986; + shf.l.wrap.b32 %r1049, %r1048, %r1048, 16; + add.s32 %r1050, %r1049, %r973; + xor.b32 %r1051, %r1050, %r961; + shf.l.wrap.b32 %r1052, %r1051, %r1051, 20; + add.s32 %r1053, %r1047, %r245; + add.s32 %r1054, %r1053, %r1052; + xor.b32 %r1055, %r1054, %r1049; + shf.l.wrap.b32 %r1056, %r1055, %r1055, 24; + add.s32 %r1057, %r1056, %r1050; + xor.b32 %r1058, %r1057, %r1052; + shf.l.wrap.b32 %r1059, %r1058, %r1058, 25; + xor.b32 %r1060, %r1043, %r1012; + st.local.u32 [%rd3+-104], %r1060; + xor.b32 %r1061, %r1057, %r1026; + st.local.u32 [%rd3+-100], %r1061; + xor.b32 %r1062, %r1015, %r1040; + st.local.u32 [%rd3+-96], %r1062; + xor.b32 %r1063, %r1029, %r1054; + st.local.u32 [%rd3+-92], %r1063; + xor.b32 %r1064, %r1059, %r1028; + st.local.u32 [%rd3+-88], %r1064; + xor.b32 %r1065, %r1017, %r1042; + st.local.u32 [%rd3+-84], %r1065; + xor.b32 %r1066, %r1031, %r1056; + st.local.u32 [%rd3+-80], %r1066; + xor.b32 %r1067, %r1045, %r1014; + st.local.u32 [%rd3+-76], %r1067; + add.s16 %rs114, %rs109, 1; + st.local.v2.u8 [%rd3], {%rs351, %rs114}; + +$L__BB1_8: + add.s64 %rd117, %rd13, %rd238; + st.local.u8 [%rd117], %rs351; + add.s64 %rd238, %rd238, 1; + setp.lt.u64 %p7, %rd238, 64; + mov.u64 %rd244, %rd12; + @%p7 bra $L__BB1_8; + +$L__BB1_9: + setp.lt.u64 %p8, %rd244, 65; + @%p8 bra $L__BB1_12; + + ld.local.u8 %rs9, [%rd3+2]; + ld.local.u8 %rs352, [%rd3+1]; + ld.local.u32 %r11657, [%rd3+-104]; + ld.local.u32 %r11656, [%rd3+-100]; + ld.local.u32 %r11655, [%rd3+-96]; + ld.local.u32 %r11654, [%rd3+-92]; + ld.local.u32 %r11653, [%rd3+-88]; + ld.local.u32 %r11652, [%rd3+-84]; + ld.local.u32 %r11651, [%rd3+-80]; + ld.local.u32 %r11650, [%rd3+-76]; + ld.local.u64 %rd118, [%rd3+-72]; + cvt.u32.u64 %r9, %rd118; + shr.u64 %rd119, %rd118, 32; + cvt.u32.u64 %r10, %rd119; + +$L__BB1_11: + and.b16 %rs116, %rs352, 255; + setp.eq.s16 %p9, %rs116, 0; + selp.u16 %rs117, 1, 0, %p9; + or.b16 %rs118, %rs9, %rs117; + ld.local.u8 %r1068, [%rd261]; + ld.local.u8 %r1069, [%rd261+1]; + prmt.b32 %r1070, %r1069, %r1068, 30212; + ld.local.u8 %r1071, [%rd261+2]; + prmt.b32 %r1072, %r1071, %r1070, 28756; + ld.local.u8 %r1073, [%rd261+3]; + prmt.b32 %r1074, %r1073, %r1072, 1620; + ld.local.u8 %r1075, [%rd261+4]; + ld.local.u8 %r1076, [%rd261+5]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + ld.local.u8 %r1078, [%rd261+6]; + prmt.b32 %r1079, %r1078, %r1077, 28756; + ld.local.u8 %r1080, [%rd261+7]; + prmt.b32 %r1081, %r1080, %r1079, 1620; + ld.local.u8 %r1082, [%rd261+8]; + ld.local.u8 %r1083, [%rd261+9]; + prmt.b32 %r1084, %r1083, %r1082, 30212; + ld.local.u8 %r1085, [%rd261+10]; + prmt.b32 %r1086, %r1085, %r1084, 28756; + ld.local.u8 %r1087, [%rd261+11]; + prmt.b32 %r1088, %r1087, %r1086, 1620; + ld.local.u8 %r1089, [%rd261+12]; + ld.local.u8 %r1090, [%rd261+13]; + prmt.b32 %r1091, %r1090, %r1089, 30212; + ld.local.u8 %r1092, [%rd261+14]; + prmt.b32 %r1093, %r1092, %r1091, 28756; + ld.local.u8 %r1094, [%rd261+15]; + prmt.b32 %r1095, %r1094, %r1093, 1620; + ld.local.u8 %r1096, 
[%rd261+16]; + ld.local.u8 %r1097, [%rd261+17]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd261+18]; + prmt.b32 %r1100, %r1099, %r1098, 28756; + ld.local.u8 %r1101, [%rd261+19]; + prmt.b32 %r1102, %r1101, %r1100, 1620; + ld.local.u8 %r1103, [%rd261+20]; + ld.local.u8 %r1104, [%rd261+21]; + prmt.b32 %r1105, %r1104, %r1103, 30212; + ld.local.u8 %r1106, [%rd261+22]; + prmt.b32 %r1107, %r1106, %r1105, 28756; + ld.local.u8 %r1108, [%rd261+23]; + prmt.b32 %r1109, %r1108, %r1107, 1620; + ld.local.u8 %r1110, [%rd261+24]; + ld.local.u8 %r1111, [%rd261+25]; + prmt.b32 %r1112, %r1111, %r1110, 30212; + ld.local.u8 %r1113, [%rd261+26]; + prmt.b32 %r1114, %r1113, %r1112, 28756; + ld.local.u8 %r1115, [%rd261+27]; + prmt.b32 %r1116, %r1115, %r1114, 1620; + ld.local.u8 %r1117, [%rd261+28]; + ld.local.u8 %r1118, [%rd261+29]; + prmt.b32 %r1119, %r1118, %r1117, 30212; + ld.local.u8 %r1120, [%rd261+30]; + prmt.b32 %r1121, %r1120, %r1119, 28756; + ld.local.u8 %r1122, [%rd261+31]; + prmt.b32 %r1123, %r1122, %r1121, 1620; + ld.local.u8 %r1124, [%rd261+32]; + ld.local.u8 %r1125, [%rd261+33]; + prmt.b32 %r1126, %r1125, %r1124, 30212; + ld.local.u8 %r1127, [%rd261+34]; + prmt.b32 %r1128, %r1127, %r1126, 28756; + ld.local.u8 %r1129, [%rd261+35]; + prmt.b32 %r1130, %r1129, %r1128, 1620; + ld.local.u8 %r1131, [%rd261+36]; + ld.local.u8 %r1132, [%rd261+37]; + prmt.b32 %r1133, %r1132, %r1131, 30212; + ld.local.u8 %r1134, [%rd261+38]; + prmt.b32 %r1135, %r1134, %r1133, 28756; + ld.local.u8 %r1136, [%rd261+39]; + prmt.b32 %r1137, %r1136, %r1135, 1620; + ld.local.u8 %r1138, [%rd261+40]; + ld.local.u8 %r1139, [%rd261+41]; + prmt.b32 %r1140, %r1139, %r1138, 30212; + ld.local.u8 %r1141, [%rd261+42]; + prmt.b32 %r1142, %r1141, %r1140, 28756; + ld.local.u8 %r1143, [%rd261+43]; + prmt.b32 %r1144, %r1143, %r1142, 1620; + ld.local.u8 %r1145, [%rd261+44]; + ld.local.u8 %r1146, [%rd261+45]; + prmt.b32 %r1147, %r1146, %r1145, 30212; + ld.local.u8 %r1148, [%rd261+46]; + prmt.b32 %r1149, %r1148, %r1147, 28756; + ld.local.u8 %r1150, [%rd261+47]; + prmt.b32 %r1151, %r1150, %r1149, 1620; + ld.local.u8 %r1152, [%rd261+48]; + ld.local.u8 %r1153, [%rd261+49]; + prmt.b32 %r1154, %r1153, %r1152, 30212; + ld.local.u8 %r1155, [%rd261+50]; + prmt.b32 %r1156, %r1155, %r1154, 28756; + ld.local.u8 %r1157, [%rd261+51]; + prmt.b32 %r1158, %r1157, %r1156, 1620; + ld.local.u8 %r1159, [%rd261+52]; + ld.local.u8 %r1160, [%rd261+53]; + prmt.b32 %r1161, %r1160, %r1159, 30212; + ld.local.u8 %r1162, [%rd261+54]; + prmt.b32 %r1163, %r1162, %r1161, 28756; + ld.local.u8 %r1164, [%rd261+55]; + prmt.b32 %r1165, %r1164, %r1163, 1620; + ld.local.u8 %r1166, [%rd261+56]; + ld.local.u8 %r1167, [%rd261+57]; + prmt.b32 %r1168, %r1167, %r1166, 30212; + ld.local.u8 %r1169, [%rd261+58]; + prmt.b32 %r1170, %r1169, %r1168, 28756; + ld.local.u8 %r1171, [%rd261+59]; + prmt.b32 %r1172, %r1171, %r1170, 1620; + ld.local.u8 %r1173, [%rd261+60]; + ld.local.u8 %r1174, [%rd261+61]; + prmt.b32 %r1175, %r1174, %r1173, 30212; + ld.local.u8 %r1176, [%rd261+62]; + prmt.b32 %r1177, %r1176, %r1175, 28756; + ld.local.u8 %r1178, [%rd261+63]; + prmt.b32 %r1179, %r1178, %r1177, 1620; + cvt.u32.u16 %r1180, %rs118; + and.b32 %r1181, %r1180, 255; + add.s32 %r1182, %r11657, %r1074; + add.s32 %r1183, %r1182, %r11653; + xor.b32 %r1184, %r1183, %r9; + shf.l.wrap.b32 %r1185, %r1184, %r1184, 16; + add.s32 %r1186, %r1185, 1779033703; + xor.b32 %r1187, %r1186, %r11653; + shf.l.wrap.b32 %r1188, %r1187, %r1187, 20; + add.s32 %r1189, %r1183, %r1081; + add.s32 %r1190, %r1189, %r1188; + 
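+ // The ld.local.u8/prmt.b32 runs above assemble the 64 input bytes at [%rd261]
+ // into sixteen little-endian 32-bit message words. The immediates folded in
+ // here, 1779033703 (0x6A09E667), -1150833019 (0xBB67AE85), 1013904242
+ // (0x3C6EF372) and -1521486534 (0xA54FF53A), are the first four BLAKE3 IV
+ // words (the same values SHA-256 uses).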
xor.b32 %r1191, %r1190, %r1185; + shf.l.wrap.b32 %r1192, %r1191, %r1191, 24; + add.s32 %r1193, %r1192, %r1186; + xor.b32 %r1194, %r1193, %r1188; + shf.l.wrap.b32 %r1195, %r1194, %r1194, 25; + add.s32 %r1196, %r11656, %r1088; + add.s32 %r1197, %r1196, %r11652; + xor.b32 %r1198, %r1197, %r10; + shf.l.wrap.b32 %r1199, %r1198, %r1198, 16; + add.s32 %r1200, %r1199, -1150833019; + xor.b32 %r1201, %r1200, %r11652; + shf.l.wrap.b32 %r1202, %r1201, %r1201, 20; + add.s32 %r1203, %r1197, %r1095; + add.s32 %r1204, %r1203, %r1202; + xor.b32 %r1205, %r1204, %r1199; + shf.l.wrap.b32 %r1206, %r1205, %r1205, 24; + add.s32 %r1207, %r1206, %r1200; + xor.b32 %r1208, %r1207, %r1202; + shf.l.wrap.b32 %r1209, %r1208, %r1208, 25; + add.s32 %r1210, %r11655, %r1102; + add.s32 %r1211, %r1210, %r11651; + shr.u32 %r1212, %r1211, 16; + shl.b32 %r1213, %r1211, 16; + xor.b32 %r1214, %r1213, 4194304; + or.b32 %r1215, %r1214, %r1212; + add.s32 %r1216, %r1215, 1013904242; + xor.b32 %r1217, %r1216, %r11651; + shf.l.wrap.b32 %r1218, %r1217, %r1217, 20; + add.s32 %r1219, %r1211, %r1109; + add.s32 %r1220, %r1219, %r1218; + xor.b32 %r1221, %r1220, %r1215; + shf.l.wrap.b32 %r1222, %r1221, %r1221, 24; + add.s32 %r1223, %r1222, %r1216; + xor.b32 %r1224, %r1223, %r1218; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 25; + add.s32 %r1226, %r11654, %r1116; + add.s32 %r1227, %r1226, %r11650; + xor.b32 %r1228, %r1227, %r1181; + shr.u32 %r1229, %r1227, 16; + shl.b32 %r1230, %r1228, 16; + or.b32 %r1231, %r1230, %r1229; + add.s32 %r1232, %r1231, -1521486534; + xor.b32 %r1233, %r1232, %r11650; + shf.l.wrap.b32 %r1234, %r1233, %r1233, 20; + add.s32 %r1235, %r1227, %r1123; + add.s32 %r1236, %r1235, %r1234; + xor.b32 %r1237, %r1236, %r1231; + shf.l.wrap.b32 %r1238, %r1237, %r1237, 24; + add.s32 %r1239, %r1238, %r1232; + xor.b32 %r1240, %r1239, %r1234; + shf.l.wrap.b32 %r1241, %r1240, %r1240, 25; + add.s32 %r1242, %r1190, %r1130; + add.s32 %r1243, %r1242, %r1209; + xor.b32 %r1244, %r1243, %r1238; + shf.l.wrap.b32 %r1245, %r1244, %r1244, 16; + add.s32 %r1246, %r1245, %r1223; + xor.b32 %r1247, %r1246, %r1209; + shf.l.wrap.b32 %r1248, %r1247, %r1247, 20; + add.s32 %r1249, %r1243, %r1137; + add.s32 %r1250, %r1249, %r1248; + xor.b32 %r1251, %r1250, %r1245; + shf.l.wrap.b32 %r1252, %r1251, %r1251, 24; + add.s32 %r1253, %r1252, %r1246; + xor.b32 %r1254, %r1253, %r1248; + shf.l.wrap.b32 %r1255, %r1254, %r1254, 25; + add.s32 %r1256, %r1204, %r1144; + add.s32 %r1257, %r1256, %r1225; + xor.b32 %r1258, %r1257, %r1192; + shf.l.wrap.b32 %r1259, %r1258, %r1258, 16; + add.s32 %r1260, %r1259, %r1239; + xor.b32 %r1261, %r1260, %r1225; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 20; + add.s32 %r1263, %r1257, %r1151; + add.s32 %r1264, %r1263, %r1262; + xor.b32 %r1265, %r1264, %r1259; + shf.l.wrap.b32 %r1266, %r1265, %r1265, 24; + add.s32 %r1267, %r1266, %r1260; + xor.b32 %r1268, %r1267, %r1262; + shf.l.wrap.b32 %r1269, %r1268, %r1268, 25; + add.s32 %r1270, %r1220, %r1158; + add.s32 %r1271, %r1270, %r1241; + xor.b32 %r1272, %r1271, %r1206; + shf.l.wrap.b32 %r1273, %r1272, %r1272, 16; + add.s32 %r1274, %r1273, %r1193; + xor.b32 %r1275, %r1274, %r1241; + shf.l.wrap.b32 %r1276, %r1275, %r1275, 20; + add.s32 %r1277, %r1271, %r1165; + add.s32 %r1278, %r1277, %r1276; + xor.b32 %r1279, %r1278, %r1273; + shf.l.wrap.b32 %r1280, %r1279, %r1279, 24; + add.s32 %r1281, %r1280, %r1274; + xor.b32 %r1282, %r1281, %r1276; + shf.l.wrap.b32 %r1283, %r1282, %r1282, 25; + add.s32 %r1284, %r1236, %r1172; + add.s32 %r1285, %r1284, %r1195; + xor.b32 %r1286, %r1285, %r1222; + shf.l.wrap.b32 %r1287, 
%r1286, %r1286, 16; + add.s32 %r1288, %r1287, %r1207; + xor.b32 %r1289, %r1288, %r1195; + shf.l.wrap.b32 %r1290, %r1289, %r1289, 20; + add.s32 %r1291, %r1285, %r1179; + add.s32 %r1292, %r1291, %r1290; + xor.b32 %r1293, %r1292, %r1287; + shf.l.wrap.b32 %r1294, %r1293, %r1293, 24; + add.s32 %r1295, %r1294, %r1288; + xor.b32 %r1296, %r1295, %r1290; + shf.l.wrap.b32 %r1297, %r1296, %r1296, 25; + add.s32 %r1298, %r1250, %r1088; + add.s32 %r1299, %r1298, %r1297; + xor.b32 %r1300, %r1299, %r1266; + shf.l.wrap.b32 %r1301, %r1300, %r1300, 16; + add.s32 %r1302, %r1301, %r1281; + xor.b32 %r1303, %r1302, %r1297; + shf.l.wrap.b32 %r1304, %r1303, %r1303, 20; + add.s32 %r1305, %r1299, %r1116; + add.s32 %r1306, %r1305, %r1304; + xor.b32 %r1307, %r1306, %r1301; + shf.l.wrap.b32 %r1308, %r1307, %r1307, 24; + add.s32 %r1309, %r1308, %r1302; + xor.b32 %r1310, %r1309, %r1304; + shf.l.wrap.b32 %r1311, %r1310, %r1310, 25; + add.s32 %r1312, %r1264, %r1095; + add.s32 %r1313, %r1312, %r1255; + xor.b32 %r1314, %r1313, %r1280; + shf.l.wrap.b32 %r1315, %r1314, %r1314, 16; + add.s32 %r1316, %r1315, %r1295; + xor.b32 %r1317, %r1316, %r1255; + shf.l.wrap.b32 %r1318, %r1317, %r1317, 20; + add.s32 %r1319, %r1313, %r1144; + add.s32 %r1320, %r1319, %r1318; + xor.b32 %r1321, %r1320, %r1315; + shf.l.wrap.b32 %r1322, %r1321, %r1321, 24; + add.s32 %r1323, %r1322, %r1316; + xor.b32 %r1324, %r1323, %r1318; + shf.l.wrap.b32 %r1325, %r1324, %r1324, 25; + add.s32 %r1326, %r1278, %r1123; + add.s32 %r1327, %r1326, %r1269; + xor.b32 %r1328, %r1327, %r1294; + shf.l.wrap.b32 %r1329, %r1328, %r1328, 16; + add.s32 %r1330, %r1329, %r1253; + xor.b32 %r1331, %r1330, %r1269; + shf.l.wrap.b32 %r1332, %r1331, %r1331, 20; + add.s32 %r1333, %r1327, %r1074; + add.s32 %r1334, %r1333, %r1332; + xor.b32 %r1335, %r1334, %r1329; + shf.l.wrap.b32 %r1336, %r1335, %r1335, 24; + add.s32 %r1337, %r1336, %r1330; + xor.b32 %r1338, %r1337, %r1332; + shf.l.wrap.b32 %r1339, %r1338, %r1338, 25; + add.s32 %r1340, %r1292, %r1102; + add.s32 %r1341, %r1340, %r1283; + xor.b32 %r1342, %r1341, %r1252; + shf.l.wrap.b32 %r1343, %r1342, %r1342, 16; + add.s32 %r1344, %r1343, %r1267; + xor.b32 %r1345, %r1344, %r1283; + shf.l.wrap.b32 %r1346, %r1345, %r1345, 20; + add.s32 %r1347, %r1341, %r1165; + add.s32 %r1348, %r1347, %r1346; + xor.b32 %r1349, %r1348, %r1343; + shf.l.wrap.b32 %r1350, %r1349, %r1349, 24; + add.s32 %r1351, %r1350, %r1344; + xor.b32 %r1352, %r1351, %r1346; + shf.l.wrap.b32 %r1353, %r1352, %r1352, 25; + add.s32 %r1354, %r1306, %r1081; + add.s32 %r1355, %r1354, %r1325; + xor.b32 %r1356, %r1355, %r1350; + shf.l.wrap.b32 %r1357, %r1356, %r1356, 16; + add.s32 %r1358, %r1357, %r1337; + xor.b32 %r1359, %r1358, %r1325; + shf.l.wrap.b32 %r1360, %r1359, %r1359, 20; + add.s32 %r1361, %r1355, %r1151; + add.s32 %r1362, %r1361, %r1360; + xor.b32 %r1363, %r1362, %r1357; + shf.l.wrap.b32 %r1364, %r1363, %r1363, 24; + add.s32 %r1365, %r1364, %r1358; + xor.b32 %r1366, %r1365, %r1360; + shf.l.wrap.b32 %r1367, %r1366, %r1366, 25; + add.s32 %r1368, %r1320, %r1158; + add.s32 %r1369, %r1368, %r1339; + xor.b32 %r1370, %r1369, %r1308; + shf.l.wrap.b32 %r1371, %r1370, %r1370, 16; + add.s32 %r1372, %r1371, %r1351; + xor.b32 %r1373, %r1372, %r1339; + shf.l.wrap.b32 %r1374, %r1373, %r1373, 20; + add.s32 %r1375, %r1369, %r1109; + add.s32 %r1376, %r1375, %r1374; + xor.b32 %r1377, %r1376, %r1371; + shf.l.wrap.b32 %r1378, %r1377, %r1377, 24; + add.s32 %r1379, %r1378, %r1372; + xor.b32 %r1380, %r1379, %r1374; + shf.l.wrap.b32 %r1381, %r1380, %r1380, 25; + add.s32 %r1382, %r1334, %r1137; + 
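+ // blake3_hasher_update inlines the same 7-round compression twice: once above
+ // to flush the hasher's internal 64-byte buffer, and here inside loop
+ // $L__BB1_11, which appears to consume one full 64-byte input block per
+ // iteration.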
add.s32 %r1383, %r1382, %r1353; + xor.b32 %r1384, %r1383, %r1322; + shf.l.wrap.b32 %r1385, %r1384, %r1384, 16; + add.s32 %r1386, %r1385, %r1309; + xor.b32 %r1387, %r1386, %r1353; + shf.l.wrap.b32 %r1388, %r1387, %r1387, 20; + add.s32 %r1389, %r1383, %r1172; + add.s32 %r1390, %r1389, %r1388; + xor.b32 %r1391, %r1390, %r1385; + shf.l.wrap.b32 %r1392, %r1391, %r1391, 24; + add.s32 %r1393, %r1392, %r1386; + xor.b32 %r1394, %r1393, %r1388; + shf.l.wrap.b32 %r1395, %r1394, %r1394, 25; + add.s32 %r1396, %r1348, %r1179; + add.s32 %r1397, %r1396, %r1311; + xor.b32 %r1398, %r1397, %r1336; + shf.l.wrap.b32 %r1399, %r1398, %r1398, 16; + add.s32 %r1400, %r1399, %r1323; + xor.b32 %r1401, %r1400, %r1311; + shf.l.wrap.b32 %r1402, %r1401, %r1401, 20; + add.s32 %r1403, %r1397, %r1130; + add.s32 %r1404, %r1403, %r1402; + xor.b32 %r1405, %r1404, %r1399; + shf.l.wrap.b32 %r1406, %r1405, %r1405, 24; + add.s32 %r1407, %r1406, %r1400; + xor.b32 %r1408, %r1407, %r1402; + shf.l.wrap.b32 %r1409, %r1408, %r1408, 25; + add.s32 %r1410, %r1362, %r1095; + add.s32 %r1411, %r1410, %r1409; + xor.b32 %r1412, %r1411, %r1378; + shf.l.wrap.b32 %r1413, %r1412, %r1412, 16; + add.s32 %r1414, %r1413, %r1393; + xor.b32 %r1415, %r1414, %r1409; + shf.l.wrap.b32 %r1416, %r1415, %r1415, 20; + add.s32 %r1417, %r1411, %r1102; + add.s32 %r1418, %r1417, %r1416; + xor.b32 %r1419, %r1418, %r1413; + shf.l.wrap.b32 %r1420, %r1419, %r1419, 24; + add.s32 %r1421, %r1420, %r1414; + xor.b32 %r1422, %r1421, %r1416; + shf.l.wrap.b32 %r1423, %r1422, %r1422, 25; + add.s32 %r1424, %r1376, %r1144; + add.s32 %r1425, %r1424, %r1367; + xor.b32 %r1426, %r1425, %r1392; + shf.l.wrap.b32 %r1427, %r1426, %r1426, 16; + add.s32 %r1428, %r1427, %r1407; + xor.b32 %r1429, %r1428, %r1367; + shf.l.wrap.b32 %r1430, %r1429, %r1429, 20; + add.s32 %r1431, %r1425, %r1158; + add.s32 %r1432, %r1431, %r1430; + xor.b32 %r1433, %r1432, %r1427; + shf.l.wrap.b32 %r1434, %r1433, %r1433, 24; + add.s32 %r1435, %r1434, %r1428; + xor.b32 %r1436, %r1435, %r1430; + shf.l.wrap.b32 %r1437, %r1436, %r1436, 25; + add.s32 %r1438, %r1390, %r1165; + add.s32 %r1439, %r1438, %r1381; + xor.b32 %r1440, %r1439, %r1406; + shf.l.wrap.b32 %r1441, %r1440, %r1440, 16; + add.s32 %r1442, %r1441, %r1365; + xor.b32 %r1443, %r1442, %r1381; + shf.l.wrap.b32 %r1444, %r1443, %r1443, 20; + add.s32 %r1445, %r1439, %r1088; + add.s32 %r1446, %r1445, %r1444; + xor.b32 %r1447, %r1446, %r1441; + shf.l.wrap.b32 %r1448, %r1447, %r1447, 24; + add.s32 %r1449, %r1448, %r1442; + xor.b32 %r1450, %r1449, %r1444; + shf.l.wrap.b32 %r1451, %r1450, %r1450, 25; + add.s32 %r1452, %r1404, %r1123; + add.s32 %r1453, %r1452, %r1395; + xor.b32 %r1454, %r1453, %r1364; + shf.l.wrap.b32 %r1455, %r1454, %r1454, 16; + add.s32 %r1456, %r1455, %r1379; + xor.b32 %r1457, %r1456, %r1395; + shf.l.wrap.b32 %r1458, %r1457, %r1457, 20; + add.s32 %r1459, %r1453, %r1172; + add.s32 %r1460, %r1459, %r1458; + xor.b32 %r1461, %r1460, %r1455; + shf.l.wrap.b32 %r1462, %r1461, %r1461, 24; + add.s32 %r1463, %r1462, %r1456; + xor.b32 %r1464, %r1463, %r1458; + shf.l.wrap.b32 %r1465, %r1464, %r1464, 25; + add.s32 %r1466, %r1418, %r1116; + add.s32 %r1467, %r1466, %r1437; + xor.b32 %r1468, %r1467, %r1462; + shf.l.wrap.b32 %r1469, %r1468, %r1468, 16; + add.s32 %r1470, %r1469, %r1449; + xor.b32 %r1471, %r1470, %r1437; + shf.l.wrap.b32 %r1472, %r1471, %r1471, 20; + add.s32 %r1473, %r1467, %r1109; + add.s32 %r1474, %r1473, %r1472; + xor.b32 %r1475, %r1474, %r1469; + shf.l.wrap.b32 %r1476, %r1475, %r1475, 24; + add.s32 %r1477, %r1476, %r1470; + xor.b32 %r1478, %r1477, 
%r1472; + shf.l.wrap.b32 %r1479, %r1478, %r1478, 25; + add.s32 %r1480, %r1432, %r1137; + add.s32 %r1481, %r1480, %r1451; + xor.b32 %r1482, %r1481, %r1420; + shf.l.wrap.b32 %r1483, %r1482, %r1482, 16; + add.s32 %r1484, %r1483, %r1463; + xor.b32 %r1485, %r1484, %r1451; + shf.l.wrap.b32 %r1486, %r1485, %r1485, 20; + add.s32 %r1487, %r1481, %r1074; + add.s32 %r1488, %r1487, %r1486; + xor.b32 %r1489, %r1488, %r1483; + shf.l.wrap.b32 %r1490, %r1489, %r1489, 24; + add.s32 %r1491, %r1490, %r1484; + xor.b32 %r1492, %r1491, %r1486; + shf.l.wrap.b32 %r1493, %r1492, %r1492, 25; + add.s32 %r1494, %r1446, %r1151; + add.s32 %r1495, %r1494, %r1465; + xor.b32 %r1496, %r1495, %r1434; + shf.l.wrap.b32 %r1497, %r1496, %r1496, 16; + add.s32 %r1498, %r1497, %r1421; + xor.b32 %r1499, %r1498, %r1465; + shf.l.wrap.b32 %r1500, %r1499, %r1499, 20; + add.s32 %r1501, %r1495, %r1179; + add.s32 %r1502, %r1501, %r1500; + xor.b32 %r1503, %r1502, %r1497; + shf.l.wrap.b32 %r1504, %r1503, %r1503, 24; + add.s32 %r1505, %r1504, %r1498; + xor.b32 %r1506, %r1505, %r1500; + shf.l.wrap.b32 %r1507, %r1506, %r1506, 25; + add.s32 %r1508, %r1460, %r1130; + add.s32 %r1509, %r1508, %r1423; + xor.b32 %r1510, %r1509, %r1448; + shf.l.wrap.b32 %r1511, %r1510, %r1510, 16; + add.s32 %r1512, %r1511, %r1435; + xor.b32 %r1513, %r1512, %r1423; + shf.l.wrap.b32 %r1514, %r1513, %r1513, 20; + add.s32 %r1515, %r1509, %r1081; + add.s32 %r1516, %r1515, %r1514; + xor.b32 %r1517, %r1516, %r1511; + shf.l.wrap.b32 %r1518, %r1517, %r1517, 24; + add.s32 %r1519, %r1518, %r1512; + xor.b32 %r1520, %r1519, %r1514; + shf.l.wrap.b32 %r1521, %r1520, %r1520, 25; + add.s32 %r1522, %r1474, %r1144; + add.s32 %r1523, %r1522, %r1521; + xor.b32 %r1524, %r1523, %r1490; + shf.l.wrap.b32 %r1525, %r1524, %r1524, 16; + add.s32 %r1526, %r1525, %r1505; + xor.b32 %r1527, %r1526, %r1521; + shf.l.wrap.b32 %r1528, %r1527, %r1527, 20; + add.s32 %r1529, %r1523, %r1123; + add.s32 %r1530, %r1529, %r1528; + xor.b32 %r1531, %r1530, %r1525; + shf.l.wrap.b32 %r1532, %r1531, %r1531, 24; + add.s32 %r1533, %r1532, %r1526; + xor.b32 %r1534, %r1533, %r1528; + shf.l.wrap.b32 %r1535, %r1534, %r1534, 25; + add.s32 %r1536, %r1488, %r1158; + add.s32 %r1537, %r1536, %r1479; + xor.b32 %r1538, %r1537, %r1504; + shf.l.wrap.b32 %r1539, %r1538, %r1538, 16; + add.s32 %r1540, %r1539, %r1519; + xor.b32 %r1541, %r1540, %r1479; + shf.l.wrap.b32 %r1542, %r1541, %r1541, 20; + add.s32 %r1543, %r1537, %r1137; + add.s32 %r1544, %r1543, %r1542; + xor.b32 %r1545, %r1544, %r1539; + shf.l.wrap.b32 %r1546, %r1545, %r1545, 24; + add.s32 %r1547, %r1546, %r1540; + xor.b32 %r1548, %r1547, %r1542; + shf.l.wrap.b32 %r1549, %r1548, %r1548, 25; + add.s32 %r1550, %r1502, %r1172; + add.s32 %r1551, %r1550, %r1493; + xor.b32 %r1552, %r1551, %r1518; + shf.l.wrap.b32 %r1553, %r1552, %r1552, 16; + add.s32 %r1554, %r1553, %r1477; + xor.b32 %r1555, %r1554, %r1493; + shf.l.wrap.b32 %r1556, %r1555, %r1555, 20; + add.s32 %r1557, %r1551, %r1095; + add.s32 %r1558, %r1557, %r1556; + xor.b32 %r1559, %r1558, %r1553; + shf.l.wrap.b32 %r1560, %r1559, %r1559, 24; + add.s32 %r1561, %r1560, %r1554; + xor.b32 %r1562, %r1561, %r1556; + shf.l.wrap.b32 %r1563, %r1562, %r1562, 25; + add.s32 %r1564, %r1516, %r1165; + add.s32 %r1565, %r1564, %r1507; + xor.b32 %r1566, %r1565, %r1476; + shf.l.wrap.b32 %r1567, %r1566, %r1566, 16; + add.s32 %r1568, %r1567, %r1491; + xor.b32 %r1569, %r1568, %r1507; + shf.l.wrap.b32 %r1570, %r1569, %r1569, 20; + add.s32 %r1571, %r1565, %r1179; + add.s32 %r1572, %r1571, %r1570; + xor.b32 %r1573, %r1572, %r1567; + shf.l.wrap.b32 
%r1574, %r1573, %r1573, 24; + add.s32 %r1575, %r1574, %r1568; + xor.b32 %r1576, %r1575, %r1570; + shf.l.wrap.b32 %r1577, %r1576, %r1576, 25; + add.s32 %r1578, %r1530, %r1102; + add.s32 %r1579, %r1578, %r1549; + xor.b32 %r1580, %r1579, %r1574; + shf.l.wrap.b32 %r1581, %r1580, %r1580, 16; + add.s32 %r1582, %r1581, %r1561; + xor.b32 %r1583, %r1582, %r1549; + shf.l.wrap.b32 %r1584, %r1583, %r1583, 20; + add.s32 %r1585, %r1579, %r1074; + add.s32 %r1586, %r1585, %r1584; + xor.b32 %r1587, %r1586, %r1581; + shf.l.wrap.b32 %r1588, %r1587, %r1587, 24; + add.s32 %r1589, %r1588, %r1582; + xor.b32 %r1590, %r1589, %r1584; + shf.l.wrap.b32 %r1591, %r1590, %r1590, 25; + add.s32 %r1592, %r1544, %r1151; + add.s32 %r1593, %r1592, %r1563; + xor.b32 %r1594, %r1593, %r1532; + shf.l.wrap.b32 %r1595, %r1594, %r1594, 16; + add.s32 %r1596, %r1595, %r1575; + xor.b32 %r1597, %r1596, %r1563; + shf.l.wrap.b32 %r1598, %r1597, %r1597, 20; + add.s32 %r1599, %r1593, %r1088; + add.s32 %r1600, %r1599, %r1598; + xor.b32 %r1601, %r1600, %r1595; + shf.l.wrap.b32 %r1602, %r1601, %r1601, 24; + add.s32 %r1603, %r1602, %r1596; + xor.b32 %r1604, %r1603, %r1598; + shf.l.wrap.b32 %r1605, %r1604, %r1604, 25; + add.s32 %r1606, %r1558, %r1109; + add.s32 %r1607, %r1606, %r1577; + xor.b32 %r1608, %r1607, %r1546; + shf.l.wrap.b32 %r1609, %r1608, %r1608, 16; + add.s32 %r1610, %r1609, %r1533; + xor.b32 %r1611, %r1610, %r1577; + shf.l.wrap.b32 %r1612, %r1611, %r1611, 20; + add.s32 %r1613, %r1607, %r1130; + add.s32 %r1614, %r1613, %r1612; + xor.b32 %r1615, %r1614, %r1609; + shf.l.wrap.b32 %r1616, %r1615, %r1615, 24; + add.s32 %r1617, %r1616, %r1610; + xor.b32 %r1618, %r1617, %r1612; + shf.l.wrap.b32 %r1619, %r1618, %r1618, 25; + add.s32 %r1620, %r1572, %r1081; + add.s32 %r1621, %r1620, %r1535; + xor.b32 %r1622, %r1621, %r1560; + shf.l.wrap.b32 %r1623, %r1622, %r1622, 16; + add.s32 %r1624, %r1623, %r1547; + xor.b32 %r1625, %r1624, %r1535; + shf.l.wrap.b32 %r1626, %r1625, %r1625, 20; + add.s32 %r1627, %r1621, %r1116; + add.s32 %r1628, %r1627, %r1626; + xor.b32 %r1629, %r1628, %r1623; + shf.l.wrap.b32 %r1630, %r1629, %r1629, 24; + add.s32 %r1631, %r1630, %r1624; + xor.b32 %r1632, %r1631, %r1626; + shf.l.wrap.b32 %r1633, %r1632, %r1632, 25; + add.s32 %r1634, %r1586, %r1158; + add.s32 %r1635, %r1634, %r1633; + xor.b32 %r1636, %r1635, %r1602; + shf.l.wrap.b32 %r1637, %r1636, %r1636, 16; + add.s32 %r1638, %r1637, %r1617; + xor.b32 %r1639, %r1638, %r1633; + shf.l.wrap.b32 %r1640, %r1639, %r1639, 20; + add.s32 %r1641, %r1635, %r1165; + add.s32 %r1642, %r1641, %r1640; + xor.b32 %r1643, %r1642, %r1637; + shf.l.wrap.b32 %r1644, %r1643, %r1643, 24; + add.s32 %r1645, %r1644, %r1638; + xor.b32 %r1646, %r1645, %r1640; + shf.l.wrap.b32 %r1647, %r1646, %r1646, 25; + add.s32 %r1648, %r1600, %r1137; + add.s32 %r1649, %r1648, %r1591; + xor.b32 %r1650, %r1649, %r1616; + shf.l.wrap.b32 %r1651, %r1650, %r1650, 16; + add.s32 %r1652, %r1651, %r1631; + xor.b32 %r1653, %r1652, %r1591; + shf.l.wrap.b32 %r1654, %r1653, %r1653, 20; + add.s32 %r1655, %r1649, %r1151; + add.s32 %r1656, %r1655, %r1654; + xor.b32 %r1657, %r1656, %r1651; + shf.l.wrap.b32 %r1658, %r1657, %r1657, 24; + add.s32 %r1659, %r1658, %r1652; + xor.b32 %r1660, %r1659, %r1654; + shf.l.wrap.b32 %r1661, %r1660, %r1660, 25; + add.s32 %r1662, %r1614, %r1179; + add.s32 %r1663, %r1662, %r1605; + xor.b32 %r1664, %r1663, %r1630; + shf.l.wrap.b32 %r1665, %r1664, %r1664, 16; + add.s32 %r1666, %r1665, %r1589; + xor.b32 %r1667, %r1666, %r1605; + shf.l.wrap.b32 %r1668, %r1667, %r1667, 20; + add.s32 %r1669, %r1663, %r1144; 
+ add.s32 %r1670, %r1669, %r1668; + xor.b32 %r1671, %r1670, %r1665; + shf.l.wrap.b32 %r1672, %r1671, %r1671, 24; + add.s32 %r1673, %r1672, %r1666; + xor.b32 %r1674, %r1673, %r1668; + shf.l.wrap.b32 %r1675, %r1674, %r1674, 25; + add.s32 %r1676, %r1628, %r1172; + add.s32 %r1677, %r1676, %r1619; + xor.b32 %r1678, %r1677, %r1588; + shf.l.wrap.b32 %r1679, %r1678, %r1678, 16; + add.s32 %r1680, %r1679, %r1603; + xor.b32 %r1681, %r1680, %r1619; + shf.l.wrap.b32 %r1682, %r1681, %r1681, 20; + add.s32 %r1683, %r1677, %r1130; + add.s32 %r1684, %r1683, %r1682; + xor.b32 %r1685, %r1684, %r1679; + shf.l.wrap.b32 %r1686, %r1685, %r1685, 24; + add.s32 %r1687, %r1686, %r1680; + xor.b32 %r1688, %r1687, %r1682; + shf.l.wrap.b32 %r1689, %r1688, %r1688, 25; + add.s32 %r1690, %r1642, %r1123; + add.s32 %r1691, %r1690, %r1661; + xor.b32 %r1692, %r1691, %r1686; + shf.l.wrap.b32 %r1693, %r1692, %r1692, 16; + add.s32 %r1694, %r1693, %r1673; + xor.b32 %r1695, %r1694, %r1661; + shf.l.wrap.b32 %r1696, %r1695, %r1695, 20; + add.s32 %r1697, %r1691, %r1088; + add.s32 %r1698, %r1697, %r1696; + xor.b32 %r1699, %r1698, %r1693; + shf.l.wrap.b32 %r1700, %r1699, %r1699, 24; + add.s32 %r1701, %r1700, %r1694; + xor.b32 %r1702, %r1701, %r1696; + shf.l.wrap.b32 %r1703, %r1702, %r1702, 25; + add.s32 %r1704, %r1656, %r1109; + add.s32 %r1705, %r1704, %r1675; + xor.b32 %r1706, %r1705, %r1644; + shf.l.wrap.b32 %r1707, %r1706, %r1706, 16; + add.s32 %r1708, %r1707, %r1687; + xor.b32 %r1709, %r1708, %r1675; + shf.l.wrap.b32 %r1710, %r1709, %r1709, 20; + add.s32 %r1711, %r1705, %r1095; + add.s32 %r1712, %r1711, %r1710; + xor.b32 %r1713, %r1712, %r1707; + shf.l.wrap.b32 %r1714, %r1713, %r1713, 24; + add.s32 %r1715, %r1714, %r1708; + xor.b32 %r1716, %r1715, %r1710; + shf.l.wrap.b32 %r1717, %r1716, %r1716, 25; + add.s32 %r1718, %r1670, %r1074; + add.s32 %r1719, %r1718, %r1689; + xor.b32 %r1720, %r1719, %r1658; + shf.l.wrap.b32 %r1721, %r1720, %r1720, 16; + add.s32 %r1722, %r1721, %r1645; + xor.b32 %r1723, %r1722, %r1689; + shf.l.wrap.b32 %r1724, %r1723, %r1723, 20; + add.s32 %r1725, %r1719, %r1081; + add.s32 %r1726, %r1725, %r1724; + xor.b32 %r1727, %r1726, %r1721; + shf.l.wrap.b32 %r1728, %r1727, %r1727, 24; + add.s32 %r1729, %r1728, %r1722; + xor.b32 %r1730, %r1729, %r1724; + shf.l.wrap.b32 %r1731, %r1730, %r1730, 25; + add.s32 %r1732, %r1684, %r1116; + add.s32 %r1733, %r1732, %r1647; + xor.b32 %r1734, %r1733, %r1672; + shf.l.wrap.b32 %r1735, %r1734, %r1734, 16; + add.s32 %r1736, %r1735, %r1659; + xor.b32 %r1737, %r1736, %r1647; + shf.l.wrap.b32 %r1738, %r1737, %r1737, 20; + add.s32 %r1739, %r1733, %r1102; + add.s32 %r1740, %r1739, %r1738; + xor.b32 %r1741, %r1740, %r1735; + shf.l.wrap.b32 %r1742, %r1741, %r1741, 24; + add.s32 %r1743, %r1742, %r1736; + xor.b32 %r1744, %r1743, %r1738; + shf.l.wrap.b32 %r1745, %r1744, %r1744, 25; + add.s32 %r1746, %r1698, %r1137; + add.s32 %r1747, %r1746, %r1745; + xor.b32 %r1748, %r1747, %r1714; + shf.l.wrap.b32 %r1749, %r1748, %r1748, 16; + add.s32 %r1750, %r1749, %r1729; + xor.b32 %r1751, %r1750, %r1745; + shf.l.wrap.b32 %r1752, %r1751, %r1751, 20; + add.s32 %r1753, %r1747, %r1172; + add.s32 %r1754, %r1753, %r1752; + xor.b32 %r1755, %r1754, %r1749; + shf.l.wrap.b32 %r1756, %r1755, %r1755, 24; + add.s32 %r1757, %r1756, %r1750; + xor.b32 %r1758, %r1757, %r1752; + shf.l.wrap.b32 %r1759, %r1758, %r1758, 25; + add.s32 %r1760, %r1712, %r1151; + add.s32 %r1761, %r1760, %r1703; + xor.b32 %r1762, %r1761, %r1728; + shf.l.wrap.b32 %r1763, %r1762, %r1762, 16; + add.s32 %r1764, %r1763, %r1743; + xor.b32 %r1765, %r1764, 
%r1703; + shf.l.wrap.b32 %r1766, %r1765, %r1765, 20; + add.s32 %r1767, %r1761, %r1109; + add.s32 %r1768, %r1767, %r1766; + xor.b32 %r1769, %r1768, %r1763; + shf.l.wrap.b32 %r1770, %r1769, %r1769, 24; + add.s32 %r1771, %r1770, %r1764; + xor.b32 %r1772, %r1771, %r1766; + shf.l.wrap.b32 %r1773, %r1772, %r1772, 25; + add.s32 %r1774, %r1726, %r1130; + add.s32 %r1775, %r1774, %r1717; + xor.b32 %r1776, %r1775, %r1742; + shf.l.wrap.b32 %r1777, %r1776, %r1776, 16; + add.s32 %r1778, %r1777, %r1701; + xor.b32 %r1779, %r1778, %r1717; + shf.l.wrap.b32 %r1780, %r1779, %r1779, 20; + add.s32 %r1781, %r1775, %r1158; + add.s32 %r1782, %r1781, %r1780; + xor.b32 %r1783, %r1782, %r1777; + shf.l.wrap.b32 %r1784, %r1783, %r1783, 24; + add.s32 %r1785, %r1784, %r1778; + xor.b32 %r1786, %r1785, %r1780; + shf.l.wrap.b32 %r1787, %r1786, %r1786, 25; + add.s32 %r1788, %r1740, %r1179; + add.s32 %r1789, %r1788, %r1731; + xor.b32 %r1790, %r1789, %r1700; + shf.l.wrap.b32 %r1791, %r1790, %r1790, 16; + add.s32 %r1792, %r1791, %r1715; + xor.b32 %r1793, %r1792, %r1731; + shf.l.wrap.b32 %r1794, %r1793, %r1793, 20; + add.s32 %r1795, %r1789, %r1081; + add.s32 %r1796, %r1795, %r1794; + xor.b32 %r1797, %r1796, %r1791; + shf.l.wrap.b32 %r1798, %r1797, %r1797, 24; + add.s32 %r1799, %r1798, %r1792; + xor.b32 %r1800, %r1799, %r1794; + shf.l.wrap.b32 %r1801, %r1800, %r1800, 25; + add.s32 %r1802, %r1754, %r1165; + add.s32 %r1803, %r1802, %r1773; + xor.b32 %r1804, %r1803, %r1798; + shf.l.wrap.b32 %r1805, %r1804, %r1804, 16; + add.s32 %r1806, %r1805, %r1785; + xor.b32 %r1807, %r1806, %r1773; + shf.l.wrap.b32 %r1808, %r1807, %r1807, 20; + add.s32 %r1809, %r1803, %r1095; + add.s32 %r1810, %r1809, %r1808; + xor.b32 %r1811, %r1810, %r1805; + shf.l.wrap.b32 %r1812, %r1811, %r1811, 24; + add.s32 %r1813, %r1812, %r1806; + xor.b32 %r1814, %r1813, %r1808; + shf.l.wrap.b32 %r1815, %r1814, %r1814, 25; + add.s32 %r1816, %r1768, %r1074; + add.s32 %r1817, %r1816, %r1787; + xor.b32 %r1818, %r1817, %r1756; + shf.l.wrap.b32 %r1819, %r1818, %r1818, 16; + add.s32 %r1820, %r1819, %r1799; + xor.b32 %r1821, %r1820, %r1787; + shf.l.wrap.b32 %r1822, %r1821, %r1821, 20; + add.s32 %r1823, %r1817, %r1144; + add.s32 %r1824, %r1823, %r1822; + xor.b32 %r1825, %r1824, %r1819; + shf.l.wrap.b32 %r1826, %r1825, %r1825, 24; + add.s32 %r1827, %r1826, %r1820; + xor.b32 %r1828, %r1827, %r1822; + shf.l.wrap.b32 %r1829, %r1828, %r1828, 25; + add.s32 %r1830, %r1782, %r1088; + add.s32 %r1831, %r1830, %r1801; + xor.b32 %r1832, %r1831, %r1770; + shf.l.wrap.b32 %r1833, %r1832, %r1832, 16; + add.s32 %r1834, %r1833, %r1757; + xor.b32 %r1835, %r1834, %r1801; + shf.l.wrap.b32 %r1836, %r1835, %r1835, 20; + add.s32 %r1837, %r1831, %r1116; + add.s32 %r1838, %r1837, %r1836; + xor.b32 %r1839, %r1838, %r1833; + shf.l.wrap.b32 %r1840, %r1839, %r1839, 24; + add.s32 %r1841, %r1840, %r1834; + xor.b32 %r1842, %r1841, %r1836; + shf.l.wrap.b32 %r1843, %r1842, %r1842, 25; + add.s32 %r1844, %r1796, %r1102; + add.s32 %r1845, %r1844, %r1759; + xor.b32 %r1846, %r1845, %r1784; + shf.l.wrap.b32 %r1847, %r1846, %r1846, 16; + add.s32 %r1848, %r1847, %r1771; + xor.b32 %r1849, %r1848, %r1759; + shf.l.wrap.b32 %r1850, %r1849, %r1849, 20; + add.s32 %r1851, %r1845, %r1123; + add.s32 %r1852, %r1851, %r1850; + xor.b32 %r1853, %r1852, %r1847; + shf.l.wrap.b32 %r1854, %r1853, %r1853, 24; + add.s32 %r1855, %r1854, %r1848; + xor.b32 %r1856, %r1855, %r1850; + shf.l.wrap.b32 %r1857, %r1856, %r1856, 25; + add.s32 %r1858, %r1810, %r1151; + add.s32 %r1859, %r1858, %r1857; + xor.b32 %r1860, %r1859, %r1826; + shf.l.wrap.b32 
%r1861, %r1860, %r1860, 16; + add.s32 %r1862, %r1861, %r1841; + xor.b32 %r1863, %r1862, %r1857; + shf.l.wrap.b32 %r1864, %r1863, %r1863, 20; + add.s32 %r1865, %r1859, %r1179; + add.s32 %r1866, %r1865, %r1864; + xor.b32 %r1867, %r1866, %r1861; + shf.l.wrap.b32 %r1868, %r1867, %r1867, 24; + add.s32 %r1869, %r1868, %r1862; + xor.b32 %r1870, %r1869, %r1864; + shf.l.wrap.b32 %r1871, %r1870, %r1870, 25; + add.s32 %r1872, %r1824, %r1109; + add.s32 %r1873, %r1872, %r1815; + xor.b32 %r1874, %r1873, %r1840; + shf.l.wrap.b32 %r1875, %r1874, %r1874, 16; + add.s32 %r1876, %r1875, %r1855; + xor.b32 %r1877, %r1876, %r1815; + shf.l.wrap.b32 %r1878, %r1877, %r1877, 20; + add.s32 %r1879, %r1873, %r1074; + add.s32 %r1880, %r1879, %r1878; + xor.b32 %r1881, %r1880, %r1875; + shf.l.wrap.b32 %r1882, %r1881, %r1881, 24; + add.s32 %r1883, %r1882, %r1876; + xor.b32 %r1884, %r1883, %r1878; + shf.l.wrap.b32 %r1885, %r1884, %r1884, 25; + add.s32 %r1886, %r1838, %r1081; + add.s32 %r1887, %r1886, %r1829; + xor.b32 %r1888, %r1887, %r1854; + shf.l.wrap.b32 %r1889, %r1888, %r1888, 16; + add.s32 %r1890, %r1889, %r1813; + xor.b32 %r1891, %r1890, %r1829; + shf.l.wrap.b32 %r1892, %r1891, %r1891, 20; + add.s32 %r1893, %r1887, %r1137; + add.s32 %r1894, %r1893, %r1892; + xor.b32 %r1895, %r1894, %r1889; + shf.l.wrap.b32 %r1896, %r1895, %r1895, 24; + add.s32 %r1897, %r1896, %r1890; + xor.b32 %r1898, %r1897, %r1892; + shf.l.wrap.b32 %r1899, %r1898, %r1898, 25; + add.s32 %r1900, %r1852, %r1130; + add.s32 %r1901, %r1900, %r1843; + xor.b32 %r1902, %r1901, %r1812; + shf.l.wrap.b32 %r1903, %r1902, %r1902, 16; + add.s32 %r1904, %r1903, %r1827; + xor.b32 %r1905, %r1904, %r1843; + shf.l.wrap.b32 %r1906, %r1905, %r1905, 20; + add.s32 %r1907, %r1901, %r1116; + add.s32 %r1908, %r1907, %r1906; + xor.b32 %r1909, %r1908, %r1903; + shf.l.wrap.b32 %r1910, %r1909, %r1909, 24; + add.s32 %r1911, %r1910, %r1904; + xor.b32 %r1912, %r1911, %r1906; + shf.l.wrap.b32 %r1913, %r1912, %r1912, 25; + add.s32 %r1914, %r1866, %r1172; + add.s32 %r1915, %r1914, %r1885; + xor.b32 %r1916, %r1915, %r1910; + shf.l.wrap.b32 %r1917, %r1916, %r1916, 16; + add.s32 %r1918, %r1917, %r1897; + xor.b32 %r1919, %r1918, %r1885; + shf.l.wrap.b32 %r1920, %r1919, %r1919, 20; + add.s32 %r1921, %r1915, %r1144; + add.s32 %r1922, %r1921, %r1920; + xor.b32 %r1923, %r1922, %r1917; + shf.l.wrap.b32 %r1924, %r1923, %r1923, 24; + add.s32 %r1925, %r1924, %r1918; + xor.b32 %r1926, %r1925, %r1920; + shf.l.wrap.b32 %r1927, %r1926, %r1926, 25; + add.s32 %r1928, %r1880, %r1088; + add.s32 %r1929, %r1928, %r1899; + xor.b32 %r1930, %r1929, %r1868; + shf.l.wrap.b32 %r1931, %r1930, %r1930, 16; + add.s32 %r1932, %r1931, %r1911; + xor.b32 %r1933, %r1932, %r1899; + shf.l.wrap.b32 %r1934, %r1933, %r1933, 20; + add.s32 %r1935, %r1929, %r1158; + add.s32 %r1936, %r1935, %r1934; + xor.b32 %r1937, %r1936, %r1931; + shf.l.wrap.b32 %r1938, %r1937, %r1937, 24; + add.s32 %r1939, %r1938, %r1932; + xor.b32 %r1940, %r1939, %r1934; + shf.l.wrap.b32 %r1941, %r1940, %r1940, 25; + add.s32 %r1942, %r1894, %r1095; + add.s32 %r1943, %r1942, %r1913; + xor.b32 %r1944, %r1943, %r1882; + shf.l.wrap.b32 %r1945, %r1944, %r1944, 16; + add.s32 %r1946, %r1945, %r1869; + xor.b32 %r1947, %r1946, %r1913; + shf.l.wrap.b32 %r1948, %r1947, %r1947, 20; + add.s32 %r1949, %r1943, %r1102; + add.s32 %r1950, %r1949, %r1948; + xor.b32 %r1951, %r1950, %r1945; + shf.l.wrap.b32 %r1952, %r1951, %r1951, 24; + add.s32 %r1953, %r1952, %r1946; + xor.b32 %r1954, %r1953, %r1948; + shf.l.wrap.b32 %r1955, %r1954, %r1954, 25; + add.s32 %r1956, %r1908, %r1123; 
+ add.s32 %r1957, %r1956, %r1871; + xor.b32 %r1958, %r1957, %r1896; + shf.l.wrap.b32 %r1959, %r1958, %r1958, 16; + add.s32 %r1960, %r1959, %r1883; + xor.b32 %r1961, %r1960, %r1871; + shf.l.wrap.b32 %r1962, %r1961, %r1961, 20; + add.s32 %r1963, %r1957, %r1165; + add.s32 %r1964, %r1963, %r1962; + xor.b32 %r1965, %r1964, %r1959; + shf.l.wrap.b32 %r1966, %r1965, %r1965, 24; + add.s32 %r1967, %r1966, %r1960; + xor.b32 %r1968, %r1967, %r1962; + shf.l.wrap.b32 %r1969, %r1968, %r1968, 25; + xor.b32 %r11657, %r1953, %r1922; + st.local.u32 [%rd3+-104], %r11657; + xor.b32 %r11656, %r1967, %r1936; + st.local.u32 [%rd3+-100], %r11656; + xor.b32 %r11655, %r1925, %r1950; + st.local.u32 [%rd3+-96], %r11655; + xor.b32 %r11654, %r1939, %r1964; + st.local.u32 [%rd3+-92], %r11654; + xor.b32 %r11653, %r1969, %r1938; + st.local.u32 [%rd3+-88], %r11653; + xor.b32 %r11652, %r1927, %r1952; + st.local.u32 [%rd3+-84], %r11652; + xor.b32 %r11651, %r1941, %r1966; + st.local.u32 [%rd3+-80], %r11651; + xor.b32 %r11650, %r1955, %r1924; + st.local.u32 [%rd3+-76], %r11650; + add.s16 %rs352, %rs352, 1; + st.local.u8 [%rd3+1], %rs352; + add.s64 %rd261, %rd261, 64; + add.s64 %rd244, %rd244, -64; + setp.gt.u64 %p10, %rd244, 64; + @%p10 bra $L__BB1_11; + +$L__BB1_12: + cvt.u64.u16 %rd120, %rs351; + and.b64 %rd24, %rd120, 255; + mov.u64 %rd121, 64; + sub.s64 %rd122, %rd121, %rd24; + min.u64 %rd25, %rd122, %rd244; + setp.eq.s64 %p11, %rd25, 0; + @%p11 bra $L__BB1_15; + + add.s64 %rd124, %rd2, %rd24; + add.s64 %rd26, %rd124, 72; + mov.u64 %rd245, 0; + +$L__BB1_14: + add.s64 %rd125, %rd261, %rd245; + ld.local.u8 %rs119, [%rd125]; + add.s64 %rd126, %rd26, %rd245; + st.local.u8 [%rd126], %rs119; + add.s64 %rd245, %rd245, 1; + setp.lt.u64 %p12, %rd245, %rd25; + @%p12 bra $L__BB1_14; + +$L__BB1_15: + cvt.u16.u64 %rs120, %rd25; + ld.local.u8 %rs121, [%rd3]; + add.s16 %rs13, %rs121, %rs120; + st.local.u8 [%rd3], %rs13; + mov.u64 %rd127, 32; + sub.s64 %rd29, %rd127, %rd6; + setp.eq.s64 %p13, %rd29, 0; + @%p13 bra $L__BB1_68; + + ld.local.u8 %rs122, [%rd3+1]; + setp.eq.s16 %p14, %rs122, 0; + selp.u16 %rs123, 1, 0, %p14; + ld.local.u8 %rs124, [%rd3+2]; + or.b16 %rs125, %rs124, %rs123; + or.b16 %rs126, %rs125, 2; + ld.local.u8 %r1970, [%rd3+-64]; + ld.local.u8 %r1971, [%rd3+-63]; + prmt.b32 %r1972, %r1971, %r1970, 30212; + ld.local.u8 %r1973, [%rd3+-62]; + prmt.b32 %r1974, %r1973, %r1972, 28756; + ld.local.u8 %r1975, [%rd3+-61]; + prmt.b32 %r1976, %r1975, %r1974, 1620; + ld.local.u8 %r1977, [%rd3+-60]; + ld.local.u8 %r1978, [%rd3+-59]; + prmt.b32 %r1979, %r1978, %r1977, 30212; + ld.local.u8 %r1980, [%rd3+-58]; + prmt.b32 %r1981, %r1980, %r1979, 28756; + ld.local.u8 %r1982, [%rd3+-57]; + prmt.b32 %r1983, %r1982, %r1981, 1620; + ld.local.u8 %r1984, [%rd3+-56]; + ld.local.u8 %r1985, [%rd3+-55]; + prmt.b32 %r1986, %r1985, %r1984, 30212; + ld.local.u8 %r1987, [%rd3+-54]; + prmt.b32 %r1988, %r1987, %r1986, 28756; + ld.local.u8 %r1989, [%rd3+-53]; + prmt.b32 %r1990, %r1989, %r1988, 1620; + ld.local.u8 %r1991, [%rd3+-52]; + ld.local.u8 %r1992, [%rd3+-51]; + prmt.b32 %r1993, %r1992, %r1991, 30212; + ld.local.u8 %r1994, [%rd3+-50]; + prmt.b32 %r1995, %r1994, %r1993, 28756; + ld.local.u8 %r1996, [%rd3+-49]; + prmt.b32 %r1997, %r1996, %r1995, 1620; + ld.local.u8 %r1998, [%rd3+-48]; + ld.local.u8 %r1999, [%rd3+-47]; + prmt.b32 %r2000, %r1999, %r1998, 30212; + ld.local.u8 %r2001, [%rd3+-46]; + prmt.b32 %r2002, %r2001, %r2000, 28756; + ld.local.u8 %r2003, [%rd3+-45]; + prmt.b32 %r2004, %r2003, %r2002, 1620; + ld.local.u8 %r2005, [%rd3+-44]; + ld.local.u8 
%r2006, [%rd3+-43]; + prmt.b32 %r2007, %r2006, %r2005, 30212; + ld.local.u8 %r2008, [%rd3+-42]; + prmt.b32 %r2009, %r2008, %r2007, 28756; + ld.local.u8 %r2010, [%rd3+-41]; + prmt.b32 %r2011, %r2010, %r2009, 1620; + ld.local.u8 %r2012, [%rd3+-40]; + ld.local.u8 %r2013, [%rd3+-39]; + prmt.b32 %r2014, %r2013, %r2012, 30212; + ld.local.u8 %r2015, [%rd3+-38]; + prmt.b32 %r2016, %r2015, %r2014, 28756; + ld.local.u8 %r2017, [%rd3+-37]; + prmt.b32 %r2018, %r2017, %r2016, 1620; + ld.local.u8 %r2019, [%rd3+-36]; + ld.local.u8 %r2020, [%rd3+-35]; + prmt.b32 %r2021, %r2020, %r2019, 30212; + ld.local.u8 %r2022, [%rd3+-34]; + prmt.b32 %r2023, %r2022, %r2021, 28756; + ld.local.u8 %r2024, [%rd3+-33]; + prmt.b32 %r2025, %r2024, %r2023, 1620; + ld.local.u8 %r2026, [%rd3+-32]; + ld.local.u8 %r2027, [%rd3+-31]; + prmt.b32 %r2028, %r2027, %r2026, 30212; + ld.local.u8 %r2029, [%rd3+-30]; + prmt.b32 %r2030, %r2029, %r2028, 28756; + ld.local.u8 %r2031, [%rd3+-29]; + prmt.b32 %r2032, %r2031, %r2030, 1620; + ld.local.u8 %r2033, [%rd3+-28]; + ld.local.u8 %r2034, [%rd3+-27]; + prmt.b32 %r2035, %r2034, %r2033, 30212; + ld.local.u8 %r2036, [%rd3+-26]; + prmt.b32 %r2037, %r2036, %r2035, 28756; + ld.local.u8 %r2038, [%rd3+-25]; + prmt.b32 %r2039, %r2038, %r2037, 1620; + ld.local.u8 %r2040, [%rd3+-24]; + ld.local.u8 %r2041, [%rd3+-23]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd3+-22]; + prmt.b32 %r2044, %r2043, %r2042, 28756; + ld.local.u8 %r2045, [%rd3+-21]; + prmt.b32 %r2046, %r2045, %r2044, 1620; + ld.local.u8 %r2047, [%rd3+-20]; + ld.local.u8 %r2048, [%rd3+-19]; + prmt.b32 %r2049, %r2048, %r2047, 30212; + ld.local.u8 %r2050, [%rd3+-18]; + prmt.b32 %r2051, %r2050, %r2049, 28756; + ld.local.u8 %r2052, [%rd3+-17]; + prmt.b32 %r2053, %r2052, %r2051, 1620; + ld.local.u8 %r2054, [%rd3+-16]; + ld.local.u8 %r2055, [%rd3+-15]; + prmt.b32 %r2056, %r2055, %r2054, 30212; + ld.local.u8 %r2057, [%rd3+-14]; + prmt.b32 %r2058, %r2057, %r2056, 28756; + ld.local.u8 %r2059, [%rd3+-13]; + prmt.b32 %r2060, %r2059, %r2058, 1620; + ld.local.u8 %r2061, [%rd3+-12]; + ld.local.u8 %r2062, [%rd3+-11]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + ld.local.u8 %r2064, [%rd3+-10]; + prmt.b32 %r2065, %r2064, %r2063, 28756; + ld.local.u8 %r2066, [%rd3+-9]; + prmt.b32 %r2067, %r2066, %r2065, 1620; + ld.local.u8 %r2068, [%rd3+-8]; + ld.local.u8 %r2069, [%rd3+-7]; + prmt.b32 %r2070, %r2069, %r2068, 30212; + ld.local.u8 %r2071, [%rd3+-6]; + prmt.b32 %r2072, %r2071, %r2070, 28756; + ld.local.u8 %r2073, [%rd3+-5]; + prmt.b32 %r2074, %r2073, %r2072, 1620; + ld.local.u8 %r2075, [%rd3+-4]; + ld.local.u8 %r2076, [%rd3+-3]; + prmt.b32 %r2077, %r2076, %r2075, 30212; + ld.local.u8 %r2078, [%rd3+-2]; + prmt.b32 %r2079, %r2078, %r2077, 28756; + ld.local.u8 %r2080, [%rd3+-1]; + prmt.b32 %r2081, %r2080, %r2079, 1620; + ld.local.u64 %rd128, [%rd3+-72]; + cvt.u32.u64 %r2082, %rd128; + shr.u64 %rd129, %rd128, 32; + cvt.u32.u64 %r2083, %rd129; + cvt.u32.u16 %r2084, %rs126; + and.b32 %r2085, %r2084, 255; + ld.local.u8 %r2086, [%rd3+-88]; + ld.local.u8 %r2087, [%rd3+-87]; + prmt.b32 %r2088, %r2087, %r2086, 30212; + ld.local.u8 %r2089, [%rd3+-86]; + ld.local.u8 %r2090, [%rd3+-85]; + prmt.b32 %r2091, %r2090, %r2089, 30212; + prmt.b32 %r2092, %r2091, %r2088, 4180; + ld.local.u8 %r2093, [%rd3+-104]; + ld.local.u8 %r2094, [%rd3+-103]; + prmt.b32 %r2095, %r2094, %r2093, 30212; + ld.local.u8 %r2096, [%rd3+-102]; + ld.local.u8 %r2097, [%rd3+-101]; + prmt.b32 %r2098, %r2097, %r2096, 30212; + prmt.b32 %r2099, %r2098, %r2095, 4180; + add.s32 %r2100, %r2092, 
%r2099; + add.s32 %r2101, %r2100, %r1976; + xor.b32 %r2102, %r2101, %r2082; + shf.l.wrap.b32 %r2103, %r2102, %r2102, 16; + add.s32 %r2104, %r2103, 1779033703; + xor.b32 %r2105, %r2104, %r2092; + shf.l.wrap.b32 %r2106, %r2105, %r2105, 20; + add.s32 %r2107, %r1983, %r2101; + add.s32 %r2108, %r2107, %r2106; + xor.b32 %r2109, %r2108, %r2103; + shf.l.wrap.b32 %r2110, %r2109, %r2109, 24; + add.s32 %r2111, %r2110, %r2104; + xor.b32 %r2112, %r2111, %r2106; + shf.l.wrap.b32 %r2113, %r2112, %r2112, 25; + ld.local.u8 %r2114, [%rd3+-84]; + ld.local.u8 %r2115, [%rd3+-83]; + prmt.b32 %r2116, %r2115, %r2114, 30212; + ld.local.u8 %r2117, [%rd3+-82]; + ld.local.u8 %r2118, [%rd3+-81]; + prmt.b32 %r2119, %r2118, %r2117, 30212; + prmt.b32 %r2120, %r2119, %r2116, 4180; + ld.local.u8 %r2121, [%rd3+-100]; + ld.local.u8 %r2122, [%rd3+-99]; + prmt.b32 %r2123, %r2122, %r2121, 30212; + ld.local.u8 %r2124, [%rd3+-98]; + ld.local.u8 %r2125, [%rd3+-97]; + prmt.b32 %r2126, %r2125, %r2124, 30212; + prmt.b32 %r2127, %r2126, %r2123, 4180; + add.s32 %r2128, %r2120, %r2127; + add.s32 %r2129, %r2128, %r1990; + xor.b32 %r2130, %r2129, %r2083; + shf.l.wrap.b32 %r2131, %r2130, %r2130, 16; + add.s32 %r2132, %r2131, -1150833019; + xor.b32 %r2133, %r2132, %r2120; + shf.l.wrap.b32 %r2134, %r2133, %r2133, 20; + add.s32 %r2135, %r1997, %r2129; + add.s32 %r2136, %r2135, %r2134; + xor.b32 %r2137, %r2136, %r2131; + shf.l.wrap.b32 %r2138, %r2137, %r2137, 24; + add.s32 %r2139, %r2138, %r2132; + xor.b32 %r2140, %r2139, %r2134; + shf.l.wrap.b32 %r2141, %r2140, %r2140, 25; + ld.local.u8 %r2142, [%rd3+-80]; + ld.local.u8 %r2143, [%rd3+-79]; + prmt.b32 %r2144, %r2143, %r2142, 30212; + ld.local.u8 %r2145, [%rd3+-78]; + ld.local.u8 %r2146, [%rd3+-77]; + prmt.b32 %r2147, %r2146, %r2145, 30212; + prmt.b32 %r2148, %r2147, %r2144, 4180; + ld.local.u8 %r2149, [%rd3+-96]; + ld.local.u8 %r2150, [%rd3+-95]; + prmt.b32 %r2151, %r2150, %r2149, 30212; + ld.local.u8 %r2152, [%rd3+-94]; + ld.local.u8 %r2153, [%rd3+-93]; + prmt.b32 %r2154, %r2153, %r2152, 30212; + prmt.b32 %r2155, %r2154, %r2151, 4180; + add.s32 %r2156, %r2148, %r2155; + add.s32 %r2157, %r2156, %r2004; + cvt.u32.u16 %r2158, %rs13; + and.b32 %r2159, %r2158, 255; + xor.b32 %r2160, %r2157, %r2159; + shr.u32 %r2161, %r2157, 16; + shl.b32 %r2162, %r2160, 16; + or.b32 %r2163, %r2162, %r2161; + add.s32 %r2164, %r2163, 1013904242; + xor.b32 %r2165, %r2164, %r2148; + shf.l.wrap.b32 %r2166, %r2165, %r2165, 20; + add.s32 %r2167, %r2011, %r2157; + add.s32 %r2168, %r2167, %r2166; + xor.b32 %r2169, %r2168, %r2163; + shf.l.wrap.b32 %r2170, %r2169, %r2169, 24; + add.s32 %r2171, %r2170, %r2164; + xor.b32 %r2172, %r2171, %r2166; + shf.l.wrap.b32 %r2173, %r2172, %r2172, 25; + ld.local.u8 %r2174, [%rd3+-76]; + ld.local.u8 %r2175, [%rd3+-75]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.local.u8 %r2177, [%rd3+-74]; + ld.local.u8 %r2178, [%rd3+-73]; + prmt.b32 %r2179, %r2178, %r2177, 30212; + prmt.b32 %r2180, %r2179, %r2176, 4180; + ld.local.u8 %r2181, [%rd3+-92]; + ld.local.u8 %r2182, [%rd3+-91]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.local.u8 %r2184, [%rd3+-90]; + ld.local.u8 %r2185, [%rd3+-89]; + prmt.b32 %r2186, %r2185, %r2184, 30212; + prmt.b32 %r2187, %r2186, %r2183, 4180; + add.s32 %r2188, %r2180, %r2187; + add.s32 %r2189, %r2188, %r2018; + xor.b32 %r2190, %r2189, %r2085; + shr.u32 %r2191, %r2189, 16; + shl.b32 %r2192, %r2190, 16; + or.b32 %r2193, %r2192, %r2191; + add.s32 %r2194, %r2193, -1521486534; + xor.b32 %r2195, %r2194, %r2180; + shf.l.wrap.b32 %r2196, %r2195, %r2195, 20; + add.s32 %r2197, 
%r2025, %r2189; + add.s32 %r2198, %r2197, %r2196; + xor.b32 %r2199, %r2198, %r2193; + shf.l.wrap.b32 %r2200, %r2199, %r2199, 24; + add.s32 %r2201, %r2200, %r2194; + xor.b32 %r2202, %r2201, %r2196; + shf.l.wrap.b32 %r2203, %r2202, %r2202, 25; + add.s32 %r2204, %r2141, %r2108; + add.s32 %r2205, %r2204, %r2032; + xor.b32 %r2206, %r2200, %r2205; + shf.l.wrap.b32 %r2207, %r2206, %r2206, 16; + add.s32 %r2208, %r2207, %r2171; + xor.b32 %r2209, %r2208, %r2141; + shf.l.wrap.b32 %r2210, %r2209, %r2209, 20; + add.s32 %r2211, %r2039, %r2205; + add.s32 %r2212, %r2211, %r2210; + xor.b32 %r2213, %r2212, %r2207; + shf.l.wrap.b32 %r2214, %r2213, %r2213, 24; + add.s32 %r2215, %r2214, %r2208; + xor.b32 %r2216, %r2215, %r2210; + shf.l.wrap.b32 %r2217, %r2216, %r2216, 25; + add.s32 %r2218, %r2173, %r2136; + add.s32 %r2219, %r2218, %r2046; + xor.b32 %r2220, %r2219, %r2110; + shf.l.wrap.b32 %r2221, %r2220, %r2220, 16; + add.s32 %r2222, %r2221, %r2201; + xor.b32 %r2223, %r2222, %r2173; + shf.l.wrap.b32 %r2224, %r2223, %r2223, 20; + add.s32 %r2225, %r2053, %r2219; + add.s32 %r2226, %r2225, %r2224; + xor.b32 %r2227, %r2226, %r2221; + shf.l.wrap.b32 %r2228, %r2227, %r2227, 24; + add.s32 %r2229, %r2228, %r2222; + xor.b32 %r2230, %r2229, %r2224; + shf.l.wrap.b32 %r2231, %r2230, %r2230, 25; + add.s32 %r2232, %r2203, %r2168; + add.s32 %r2233, %r2232, %r2060; + xor.b32 %r2234, %r2233, %r2138; + shf.l.wrap.b32 %r2235, %r2234, %r2234, 16; + add.s32 %r2236, %r2235, %r2111; + xor.b32 %r2237, %r2236, %r2203; + shf.l.wrap.b32 %r2238, %r2237, %r2237, 20; + add.s32 %r2239, %r2067, %r2233; + add.s32 %r2240, %r2239, %r2238; + xor.b32 %r2241, %r2240, %r2235; + shf.l.wrap.b32 %r2242, %r2241, %r2241, 24; + add.s32 %r2243, %r2242, %r2236; + xor.b32 %r2244, %r2243, %r2238; + shf.l.wrap.b32 %r2245, %r2244, %r2244, 25; + add.s32 %r2246, %r2198, %r2113; + add.s32 %r2247, %r2246, %r2074; + xor.b32 %r2248, %r2247, %r2170; + shf.l.wrap.b32 %r2249, %r2248, %r2248, 16; + add.s32 %r2250, %r2249, %r2139; + xor.b32 %r2251, %r2250, %r2113; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 20; + add.s32 %r2253, %r2081, %r2247; + add.s32 %r2254, %r2253, %r2252; + xor.b32 %r2255, %r2254, %r2249; + shf.l.wrap.b32 %r2256, %r2255, %r2255, 24; + add.s32 %r2257, %r2256, %r2250; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 25; + add.s32 %r2260, %r2212, %r1990; + add.s32 %r2261, %r2260, %r2259; + xor.b32 %r2262, %r2261, %r2228; + shf.l.wrap.b32 %r2263, %r2262, %r2262, 16; + add.s32 %r2264, %r2263, %r2243; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 20; + add.s32 %r2267, %r2261, %r2018; + add.s32 %r2268, %r2267, %r2266; + xor.b32 %r2269, %r2268, %r2263; + shf.l.wrap.b32 %r2270, %r2269, %r2269, 24; + add.s32 %r2271, %r2270, %r2264; + xor.b32 %r2272, %r2271, %r2266; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 25; + add.s32 %r2274, %r2226, %r1997; + add.s32 %r2275, %r2274, %r2217; + xor.b32 %r2276, %r2242, %r2275; + shf.l.wrap.b32 %r2277, %r2276, %r2276, 16; + add.s32 %r2278, %r2257, %r2277; + xor.b32 %r2279, %r2278, %r2217; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 20; + add.s32 %r2281, %r2275, %r2046; + add.s32 %r2282, %r2281, %r2280; + xor.b32 %r2283, %r2282, %r2277; + shf.l.wrap.b32 %r2284, %r2283, %r2283, 24; + add.s32 %r2285, %r2284, %r2278; + xor.b32 %r2286, %r2285, %r2280; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 25; + add.s32 %r2288, %r2231, %r2025; + add.s32 %r2289, %r2288, %r2240; + xor.b32 %r2290, %r2256, %r2289; + shf.l.wrap.b32 %r2291, %r2290, %r2290, 16; + add.s32 %r2292, %r2291, %r2215; + xor.b32 
%r2293, %r2292, %r2231; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 20; + add.s32 %r2295, %r2289, %r1976; + add.s32 %r2296, %r2295, %r2294; + xor.b32 %r2297, %r2296, %r2291; + shf.l.wrap.b32 %r2298, %r2297, %r2297, 24; + add.s32 %r2299, %r2298, %r2292; + xor.b32 %r2300, %r2299, %r2294; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 25; + add.s32 %r2302, %r2245, %r2004; + add.s32 %r2303, %r2302, %r2254; + xor.b32 %r2304, %r2303, %r2214; + shf.l.wrap.b32 %r2305, %r2304, %r2304, 16; + add.s32 %r2306, %r2305, %r2229; + xor.b32 %r2307, %r2306, %r2245; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 20; + add.s32 %r2309, %r2303, %r2067; + add.s32 %r2310, %r2309, %r2308; + xor.b32 %r2311, %r2310, %r2305; + shf.l.wrap.b32 %r2312, %r2311, %r2311, 24; + add.s32 %r2313, %r2312, %r2306; + xor.b32 %r2314, %r2313, %r2308; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 25; + add.s32 %r2316, %r2287, %r1983; + add.s32 %r2317, %r2316, %r2268; + xor.b32 %r2318, %r2317, %r2312; + shf.l.wrap.b32 %r2319, %r2318, %r2318, 16; + add.s32 %r2320, %r2319, %r2299; + xor.b32 %r2321, %r2320, %r2287; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 20; + add.s32 %r2323, %r2317, %r2053; + add.s32 %r2324, %r2323, %r2322; + xor.b32 %r2325, %r2324, %r2319; + shf.l.wrap.b32 %r2326, %r2325, %r2325, 24; + add.s32 %r2327, %r2326, %r2320; + xor.b32 %r2328, %r2327, %r2322; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 25; + add.s32 %r2330, %r2282, %r2060; + add.s32 %r2331, %r2330, %r2301; + xor.b32 %r2332, %r2270, %r2331; + shf.l.wrap.b32 %r2333, %r2332, %r2332, 16; + add.s32 %r2334, %r2333, %r2313; + xor.b32 %r2335, %r2334, %r2301; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 20; + add.s32 %r2337, %r2331, %r2011; + add.s32 %r2338, %r2337, %r2336; + xor.b32 %r2339, %r2338, %r2333; + shf.l.wrap.b32 %r2340, %r2339, %r2339, 24; + add.s32 %r2341, %r2340, %r2334; + xor.b32 %r2342, %r2341, %r2336; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 25; + add.s32 %r2344, %r2296, %r2039; + add.s32 %r2345, %r2344, %r2315; + xor.b32 %r2346, %r2345, %r2284; + shf.l.wrap.b32 %r2347, %r2346, %r2346, 16; + add.s32 %r2348, %r2347, %r2271; + xor.b32 %r2349, %r2348, %r2315; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 20; + add.s32 %r2351, %r2345, %r2074; + add.s32 %r2352, %r2351, %r2350; + xor.b32 %r2353, %r2352, %r2347; + shf.l.wrap.b32 %r2354, %r2353, %r2353, 24; + add.s32 %r2355, %r2354, %r2348; + xor.b32 %r2356, %r2355, %r2350; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 25; + add.s32 %r2358, %r2310, %r2081; + add.s32 %r2359, %r2358, %r2273; + xor.b32 %r2360, %r2359, %r2298; + shf.l.wrap.b32 %r2361, %r2360, %r2360, 16; + add.s32 %r2362, %r2361, %r2285; + xor.b32 %r2363, %r2362, %r2273; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 20; + add.s32 %r2365, %r2359, %r2032; + add.s32 %r2366, %r2365, %r2364; + xor.b32 %r2367, %r2366, %r2361; + shf.l.wrap.b32 %r2368, %r2367, %r2367, 24; + add.s32 %r2369, %r2368, %r2362; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 25; + add.s32 %r2372, %r2324, %r1997; + add.s32 %r2373, %r2372, %r2371; + xor.b32 %r2374, %r2373, %r2340; + shf.l.wrap.b32 %r2375, %r2374, %r2374, 16; + add.s32 %r2376, %r2375, %r2355; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 20; + add.s32 %r2379, %r2373, %r2004; + add.s32 %r2380, %r2379, %r2378; + xor.b32 %r2381, %r2380, %r2375; + shf.l.wrap.b32 %r2382, %r2381, %r2381, 24; + add.s32 %r2383, %r2382, %r2376; + xor.b32 %r2384, %r2383, %r2378; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 25; + add.s32 %r2386, %r2338, %r2046; + add.s32 %r2387, %r2386, %r2329; + xor.b32 %r2388, %r2387, %r2354; + 
shf.l.wrap.b32 %r2389, %r2388, %r2388, 16; + add.s32 %r2390, %r2389, %r2369; + xor.b32 %r2391, %r2390, %r2329; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 20; + add.s32 %r2393, %r2387, %r2060; + add.s32 %r2394, %r2393, %r2392; + xor.b32 %r2395, %r2394, %r2389; + shf.l.wrap.b32 %r2396, %r2395, %r2395, 24; + add.s32 %r2397, %r2396, %r2390; + xor.b32 %r2398, %r2397, %r2392; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 25; + add.s32 %r2400, %r2352, %r2067; + add.s32 %r2401, %r2400, %r2343; + xor.b32 %r2402, %r2368, %r2401; + shf.l.wrap.b32 %r2403, %r2402, %r2402, 16; + add.s32 %r2404, %r2403, %r2327; + xor.b32 %r2405, %r2404, %r2343; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 20; + add.s32 %r2407, %r2401, %r1990; + add.s32 %r2408, %r2407, %r2406; + xor.b32 %r2409, %r2408, %r2403; + shf.l.wrap.b32 %r2410, %r2409, %r2409, 24; + add.s32 %r2411, %r2410, %r2404; + xor.b32 %r2412, %r2411, %r2406; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 25; + add.s32 %r2414, %r2357, %r2025; + add.s32 %r2415, %r2414, %r2366; + xor.b32 %r2416, %r2415, %r2326; + shf.l.wrap.b32 %r2417, %r2416, %r2416, 16; + add.s32 %r2418, %r2417, %r2341; + xor.b32 %r2419, %r2418, %r2357; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 20; + add.s32 %r2421, %r2415, %r2074; + add.s32 %r2422, %r2421, %r2420; + xor.b32 %r2423, %r2422, %r2417; + shf.l.wrap.b32 %r2424, %r2423, %r2423, 24; + add.s32 %r2425, %r2424, %r2418; + xor.b32 %r2426, %r2425, %r2420; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 25; + add.s32 %r2428, %r2399, %r2018; + add.s32 %r2429, %r2428, %r2380; + xor.b32 %r2430, %r2429, %r2424; + shf.l.wrap.b32 %r2431, %r2430, %r2430, 16; + add.s32 %r2432, %r2431, %r2411; + xor.b32 %r2433, %r2432, %r2399; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 20; + add.s32 %r2435, %r2429, %r2011; + add.s32 %r2436, %r2435, %r2434; + xor.b32 %r2437, %r2436, %r2431; + shf.l.wrap.b32 %r2438, %r2437, %r2437, 24; + add.s32 %r2439, %r2438, %r2432; + xor.b32 %r2440, %r2439, %r2434; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 25; + add.s32 %r2442, %r2394, %r2039; + add.s32 %r2443, %r2442, %r2413; + xor.b32 %r2444, %r2382, %r2443; + shf.l.wrap.b32 %r2445, %r2444, %r2444, 16; + add.s32 %r2446, %r2445, %r2425; + xor.b32 %r2447, %r2446, %r2413; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 20; + add.s32 %r2449, %r2443, %r1976; + add.s32 %r2450, %r2449, %r2448; + xor.b32 %r2451, %r2450, %r2445; + shf.l.wrap.b32 %r2452, %r2451, %r2451, 24; + add.s32 %r2453, %r2452, %r2446; + xor.b32 %r2454, %r2453, %r2448; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 25; + add.s32 %r2456, %r2408, %r2053; + add.s32 %r2457, %r2456, %r2427; + xor.b32 %r2458, %r2457, %r2396; + shf.l.wrap.b32 %r2459, %r2458, %r2458, 16; + add.s32 %r2460, %r2459, %r2383; + xor.b32 %r2461, %r2460, %r2427; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 20; + add.s32 %r2463, %r2457, %r2081; + add.s32 %r2464, %r2463, %r2462; + xor.b32 %r2465, %r2464, %r2459; + shf.l.wrap.b32 %r2466, %r2465, %r2465, 24; + add.s32 %r2467, %r2466, %r2460; + xor.b32 %r2468, %r2467, %r2462; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 25; + add.s32 %r2470, %r2422, %r2032; + add.s32 %r2471, %r2470, %r2385; + xor.b32 %r2472, %r2471, %r2410; + shf.l.wrap.b32 %r2473, %r2472, %r2472, 16; + add.s32 %r2474, %r2473, %r2397; + xor.b32 %r2475, %r2474, %r2385; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 20; + add.s32 %r2477, %r2471, %r1983; + add.s32 %r2478, %r2477, %r2476; + xor.b32 %r2479, %r2478, %r2473; + shf.l.wrap.b32 %r2480, %r2479, %r2479, 24; + add.s32 %r2481, %r2480, %r2474; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 25; + add.s32 %r2484, 
%r2436, %r2046; + add.s32 %r2485, %r2484, %r2483; + xor.b32 %r2486, %r2485, %r2452; + shf.l.wrap.b32 %r2487, %r2486, %r2486, 16; + add.s32 %r2488, %r2487, %r2467; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 20; + add.s32 %r2491, %r2485, %r2025; + add.s32 %r2492, %r2491, %r2490; + xor.b32 %r2493, %r2492, %r2487; + shf.l.wrap.b32 %r2494, %r2493, %r2493, 24; + add.s32 %r2495, %r2494, %r2488; + xor.b32 %r2496, %r2495, %r2490; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 25; + add.s32 %r2498, %r2450, %r2060; + add.s32 %r2499, %r2498, %r2441; + xor.b32 %r2500, %r2499, %r2466; + shf.l.wrap.b32 %r2501, %r2500, %r2500, 16; + add.s32 %r2502, %r2501, %r2481; + xor.b32 %r2503, %r2502, %r2441; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 20; + add.s32 %r2505, %r2499, %r2039; + add.s32 %r2506, %r2505, %r2504; + xor.b32 %r2507, %r2506, %r2501; + shf.l.wrap.b32 %r2508, %r2507, %r2507, 24; + add.s32 %r2509, %r2508, %r2502; + xor.b32 %r2510, %r2509, %r2504; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 25; + add.s32 %r2512, %r2464, %r2074; + add.s32 %r2513, %r2512, %r2455; + xor.b32 %r2514, %r2480, %r2513; + shf.l.wrap.b32 %r2515, %r2514, %r2514, 16; + add.s32 %r2516, %r2515, %r2439; + xor.b32 %r2517, %r2516, %r2455; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 20; + add.s32 %r2519, %r2513, %r1997; + add.s32 %r2520, %r2519, %r2518; + xor.b32 %r2521, %r2520, %r2515; + shf.l.wrap.b32 %r2522, %r2521, %r2521, 24; + add.s32 %r2523, %r2522, %r2516; + xor.b32 %r2524, %r2523, %r2518; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 25; + add.s32 %r2526, %r2469, %r2067; + add.s32 %r2527, %r2526, %r2478; + xor.b32 %r2528, %r2527, %r2438; + shf.l.wrap.b32 %r2529, %r2528, %r2528, 16; + add.s32 %r2530, %r2529, %r2453; + xor.b32 %r2531, %r2530, %r2469; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 20; + add.s32 %r2533, %r2527, %r2081; + add.s32 %r2534, %r2533, %r2532; + xor.b32 %r2535, %r2534, %r2529; + shf.l.wrap.b32 %r2536, %r2535, %r2535, 24; + add.s32 %r2537, %r2536, %r2530; + xor.b32 %r2538, %r2537, %r2532; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 25; + add.s32 %r2540, %r2511, %r2004; + add.s32 %r2541, %r2540, %r2492; + xor.b32 %r2542, %r2541, %r2536; + shf.l.wrap.b32 %r2543, %r2542, %r2542, 16; + add.s32 %r2544, %r2543, %r2523; + xor.b32 %r2545, %r2544, %r2511; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 20; + add.s32 %r2547, %r2541, %r1976; + add.s32 %r2548, %r2547, %r2546; + xor.b32 %r2549, %r2548, %r2543; + shf.l.wrap.b32 %r2550, %r2549, %r2549, 24; + add.s32 %r2551, %r2550, %r2544; + xor.b32 %r2552, %r2551, %r2546; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 25; + add.s32 %r2554, %r2506, %r2053; + add.s32 %r2555, %r2554, %r2525; + xor.b32 %r2556, %r2494, %r2555; + shf.l.wrap.b32 %r2557, %r2556, %r2556, 16; + add.s32 %r2558, %r2557, %r2537; + xor.b32 %r2559, %r2558, %r2525; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 20; + add.s32 %r2561, %r2555, %r1990; + add.s32 %r2562, %r2561, %r2560; + xor.b32 %r2563, %r2562, %r2557; + shf.l.wrap.b32 %r2564, %r2563, %r2563, 24; + add.s32 %r2565, %r2564, %r2558; + xor.b32 %r2566, %r2565, %r2560; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 25; + add.s32 %r2568, %r2520, %r2011; + add.s32 %r2569, %r2568, %r2539; + xor.b32 %r2570, %r2569, %r2508; + shf.l.wrap.b32 %r2571, %r2570, %r2570, 16; + add.s32 %r2572, %r2571, %r2495; + xor.b32 %r2573, %r2572, %r2539; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 20; + add.s32 %r2575, %r2569, %r2032; + add.s32 %r2576, %r2575, %r2574; + xor.b32 %r2577, %r2576, %r2571; + shf.l.wrap.b32 %r2578, %r2577, %r2577, 24; + add.s32 %r2579, %r2578, %r2572; + xor.b32 
%r2580, %r2579, %r2574; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 25; + add.s32 %r2582, %r2534, %r1983; + add.s32 %r2583, %r2582, %r2497; + xor.b32 %r2584, %r2583, %r2522; + shf.l.wrap.b32 %r2585, %r2584, %r2584, 16; + add.s32 %r2586, %r2585, %r2509; + xor.b32 %r2587, %r2586, %r2497; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 20; + add.s32 %r2589, %r2583, %r2018; + add.s32 %r2590, %r2589, %r2588; + xor.b32 %r2591, %r2590, %r2585; + shf.l.wrap.b32 %r2592, %r2591, %r2591, 24; + add.s32 %r2593, %r2592, %r2586; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 25; + add.s32 %r2596, %r2548, %r2060; + add.s32 %r2597, %r2596, %r2595; + xor.b32 %r2598, %r2597, %r2564; + shf.l.wrap.b32 %r2599, %r2598, %r2598, 16; + add.s32 %r2600, %r2599, %r2579; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 20; + add.s32 %r2603, %r2597, %r2067; + add.s32 %r2604, %r2603, %r2602; + xor.b32 %r2605, %r2604, %r2599; + shf.l.wrap.b32 %r2606, %r2605, %r2605, 24; + add.s32 %r2607, %r2606, %r2600; + xor.b32 %r2608, %r2607, %r2602; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 25; + add.s32 %r2610, %r2562, %r2039; + add.s32 %r2611, %r2610, %r2553; + xor.b32 %r2612, %r2611, %r2578; + shf.l.wrap.b32 %r2613, %r2612, %r2612, 16; + add.s32 %r2614, %r2613, %r2593; + xor.b32 %r2615, %r2614, %r2553; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 20; + add.s32 %r2617, %r2611, %r2053; + add.s32 %r2618, %r2617, %r2616; + xor.b32 %r2619, %r2618, %r2613; + shf.l.wrap.b32 %r2620, %r2619, %r2619, 24; + add.s32 %r2621, %r2620, %r2614; + xor.b32 %r2622, %r2621, %r2616; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 25; + add.s32 %r2624, %r2576, %r2081; + add.s32 %r2625, %r2624, %r2567; + xor.b32 %r2626, %r2592, %r2625; + shf.l.wrap.b32 %r2627, %r2626, %r2626, 16; + add.s32 %r2628, %r2627, %r2551; + xor.b32 %r2629, %r2628, %r2567; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 20; + add.s32 %r2631, %r2625, %r2046; + add.s32 %r2632, %r2631, %r2630; + xor.b32 %r2633, %r2632, %r2627; + shf.l.wrap.b32 %r2634, %r2633, %r2633, 24; + add.s32 %r2635, %r2634, %r2628; + xor.b32 %r2636, %r2635, %r2630; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 25; + add.s32 %r2638, %r2581, %r2074; + add.s32 %r2639, %r2638, %r2590; + xor.b32 %r2640, %r2639, %r2550; + shf.l.wrap.b32 %r2641, %r2640, %r2640, 16; + add.s32 %r2642, %r2641, %r2565; + xor.b32 %r2643, %r2642, %r2581; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 20; + add.s32 %r2645, %r2639, %r2032; + add.s32 %r2646, %r2645, %r2644; + xor.b32 %r2647, %r2646, %r2641; + shf.l.wrap.b32 %r2648, %r2647, %r2647, 24; + add.s32 %r2649, %r2648, %r2642; + xor.b32 %r2650, %r2649, %r2644; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 25; + add.s32 %r2652, %r2623, %r2025; + add.s32 %r2653, %r2652, %r2604; + xor.b32 %r2654, %r2653, %r2648; + shf.l.wrap.b32 %r2655, %r2654, %r2654, 16; + add.s32 %r2656, %r2655, %r2635; + xor.b32 %r2657, %r2656, %r2623; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 20; + add.s32 %r2659, %r2653, %r1990; + add.s32 %r2660, %r2659, %r2658; + xor.b32 %r2661, %r2660, %r2655; + shf.l.wrap.b32 %r2662, %r2661, %r2661, 24; + add.s32 %r2663, %r2662, %r2656; + xor.b32 %r2664, %r2663, %r2658; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 25; + add.s32 %r2666, %r2618, %r2011; + add.s32 %r2667, %r2666, %r2637; + xor.b32 %r2668, %r2606, %r2667; + shf.l.wrap.b32 %r2669, %r2668, %r2668, 16; + add.s32 %r2670, %r2669, %r2649; + xor.b32 %r2671, %r2670, %r2637; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 20; + add.s32 %r2673, %r2667, %r1997; + add.s32 %r2674, %r2673, %r2672; + xor.b32 %r2675, %r2674, %r2669; + 
shf.l.wrap.b32 %r2676, %r2675, %r2675, 24; + add.s32 %r2677, %r2676, %r2670; + xor.b32 %r2678, %r2677, %r2672; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 25; + add.s32 %r2680, %r2632, %r1976; + add.s32 %r2681, %r2680, %r2651; + xor.b32 %r2682, %r2681, %r2620; + shf.l.wrap.b32 %r2683, %r2682, %r2682, 16; + add.s32 %r2684, %r2683, %r2607; + xor.b32 %r2685, %r2684, %r2651; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 20; + add.s32 %r2687, %r2681, %r1983; + add.s32 %r2688, %r2687, %r2686; + xor.b32 %r2689, %r2688, %r2683; + shf.l.wrap.b32 %r2690, %r2689, %r2689, 24; + add.s32 %r2691, %r2690, %r2684; + xor.b32 %r2692, %r2691, %r2686; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 25; + add.s32 %r2694, %r2646, %r2018; + add.s32 %r2695, %r2694, %r2609; + xor.b32 %r2696, %r2695, %r2634; + shf.l.wrap.b32 %r2697, %r2696, %r2696, 16; + add.s32 %r2698, %r2697, %r2621; + xor.b32 %r2699, %r2698, %r2609; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 20; + add.s32 %r2701, %r2695, %r2004; + add.s32 %r2702, %r2701, %r2700; + xor.b32 %r2703, %r2702, %r2697; + shf.l.wrap.b32 %r2704, %r2703, %r2703, 24; + add.s32 %r2705, %r2704, %r2698; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 25; + add.s32 %r2708, %r2660, %r2039; + add.s32 %r2709, %r2708, %r2707; + xor.b32 %r2710, %r2709, %r2676; + shf.l.wrap.b32 %r2711, %r2710, %r2710, 16; + add.s32 %r2712, %r2711, %r2691; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 20; + add.s32 %r2715, %r2709, %r2074; + add.s32 %r2716, %r2715, %r2714; + xor.b32 %r2717, %r2716, %r2711; + shf.l.wrap.b32 %r2718, %r2717, %r2717, 24; + add.s32 %r2719, %r2718, %r2712; + xor.b32 %r2720, %r2719, %r2714; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 25; + add.s32 %r2722, %r2674, %r2053; + add.s32 %r2723, %r2722, %r2665; + xor.b32 %r2724, %r2723, %r2690; + shf.l.wrap.b32 %r2725, %r2724, %r2724, 16; + add.s32 %r2726, %r2725, %r2705; + xor.b32 %r2727, %r2726, %r2665; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 20; + add.s32 %r2729, %r2723, %r2011; + add.s32 %r2730, %r2729, %r2728; + xor.b32 %r2731, %r2730, %r2725; + shf.l.wrap.b32 %r2732, %r2731, %r2731, 24; + add.s32 %r2733, %r2732, %r2726; + xor.b32 %r2734, %r2733, %r2728; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 25; + add.s32 %r2736, %r2688, %r2032; + add.s32 %r2737, %r2736, %r2679; + xor.b32 %r2738, %r2704, %r2737; + shf.l.wrap.b32 %r2739, %r2738, %r2738, 16; + add.s32 %r2740, %r2739, %r2663; + xor.b32 %r2741, %r2740, %r2679; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 20; + add.s32 %r2743, %r2737, %r2060; + add.s32 %r2744, %r2743, %r2742; + xor.b32 %r2745, %r2744, %r2739; + shf.l.wrap.b32 %r2746, %r2745, %r2745, 24; + add.s32 %r2747, %r2746, %r2740; + xor.b32 %r2748, %r2747, %r2742; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 25; + add.s32 %r2750, %r2693, %r2081; + add.s32 %r2751, %r2750, %r2702; + xor.b32 %r2752, %r2751, %r2662; + shf.l.wrap.b32 %r2753, %r2752, %r2752, 16; + add.s32 %r2754, %r2753, %r2677; + xor.b32 %r2755, %r2754, %r2693; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 20; + add.s32 %r2757, %r2751, %r1983; + add.s32 %r2758, %r2757, %r2756; + xor.b32 %r2759, %r2758, %r2753; + shf.l.wrap.b32 %r2760, %r2759, %r2759, 24; + add.s32 %r2761, %r2760, %r2754; + xor.b32 %r2762, %r2761, %r2756; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 25; + add.s32 %r2764, %r2735, %r2067; + add.s32 %r2765, %r2764, %r2716; + xor.b32 %r2766, %r2765, %r2760; + shf.l.wrap.b32 %r2767, %r2766, %r2766, 16; + add.s32 %r2768, %r2767, %r2747; + xor.b32 %r2769, %r2768, %r2735; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 20; + add.s32 %r2771, 
[Several thousand lines of compiler-generated PTX for _Z20blake3_hasher_updateP13blake3_hasherPKvy (blake3_hasher_update), flattened in extraction: the remaining unrolled BLAKE3 compression rounds (add.s32/xor.b32/shf.l.wrap ladders and ld.local.u8/prmt.b32 little-endian word loads), the chaining-value stack merge loops at $L__BB1_18-$L__BB1_22, and the wide-input subtree path at $L__BB1_24-$L__BB1_38, including the subtree-size computation at $L__BB1_26/$L__BB1_27.]
%r6541; + shf.l.wrap.b32 %r6604, %r6603, %r6603, 20; + add.s32 %r6605, %r6599, %r6080; + add.s32 %r6606, %r6605, %r6604; + xor.b32 %r6607, %r6606, %r6601; + shf.l.wrap.b32 %r6608, %r6607, %r6607, 24; + add.s32 %r6609, %r6608, %r6602; + xor.b32 %r6610, %r6609, %r6604; + shf.l.wrap.b32 %r6611, %r6610, %r6610, 25; + add.s32 %r6612, %r6564, %r6108; + add.s32 %r6613, %r6612, %r6555; + xor.b32 %r6614, %r6613, %r6524; + shf.l.wrap.b32 %r6615, %r6614, %r6614, 16; + add.s32 %r6616, %r6615, %r6539; + xor.b32 %r6617, %r6616, %r6555; + shf.l.wrap.b32 %r6618, %r6617, %r6617, 20; + add.s32 %r6619, %r6613, %r6066; + add.s32 %r6620, %r6619, %r6618; + xor.b32 %r6621, %r6620, %r6615; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6616; + xor.b32 %r6624, %r6623, %r6618; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6578, %r6059; + add.s32 %r6627, %r6626, %r6597; + xor.b32 %r6628, %r6627, %r6622; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6609; + xor.b32 %r6631, %r6630, %r6597; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6024; + add.s32 %r6634, %r6633, %r6632; + xor.b32 %r6635, %r6634, %r6629; + shf.l.wrap.b32 %r6636, %r6635, %r6635, 24; + add.s32 %r6637, %r6636, %r6630; + xor.b32 %r6638, %r6637, %r6632; + shf.l.wrap.b32 %r6639, %r6638, %r6638, 25; + add.s32 %r6640, %r6592, %r6045; + add.s32 %r6641, %r6640, %r6611; + xor.b32 %r6642, %r6641, %r6580; + shf.l.wrap.b32 %r6643, %r6642, %r6642, 16; + add.s32 %r6644, %r6643, %r6623; + xor.b32 %r6645, %r6644, %r6611; + shf.l.wrap.b32 %r6646, %r6645, %r6645, 20; + add.s32 %r6647, %r6641, %r6031; + add.s32 %r6648, %r6647, %r6646; + xor.b32 %r6649, %r6648, %r6643; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 24; + add.s32 %r6651, %r6650, %r6644; + xor.b32 %r6652, %r6651, %r6646; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 25; + add.s32 %r6654, %r6606, %r6010; + add.s32 %r6655, %r6654, %r6625; + xor.b32 %r6656, %r6655, %r6594; + shf.l.wrap.b32 %r6657, %r6656, %r6656, 16; + add.s32 %r6658, %r6657, %r6581; + xor.b32 %r6659, %r6658, %r6625; + shf.l.wrap.b32 %r6660, %r6659, %r6659, 20; + add.s32 %r6661, %r6655, %r6017; + add.s32 %r6662, %r6661, %r6660; + xor.b32 %r6663, %r6662, %r6657; + shf.l.wrap.b32 %r6664, %r6663, %r6663, 24; + add.s32 %r6665, %r6664, %r6658; + xor.b32 %r6666, %r6665, %r6660; + shf.l.wrap.b32 %r6667, %r6666, %r6666, 25; + add.s32 %r6668, %r6620, %r6052; + add.s32 %r6669, %r6668, %r6583; + xor.b32 %r6670, %r6669, %r6608; + shf.l.wrap.b32 %r6671, %r6670, %r6670, 16; + add.s32 %r6672, %r6671, %r6595; + xor.b32 %r6673, %r6672, %r6583; + shf.l.wrap.b32 %r6674, %r6673, %r6673, 20; + add.s32 %r6675, %r6669, %r6038; + add.s32 %r6676, %r6675, %r6674; + xor.b32 %r6677, %r6676, %r6671; + shf.l.wrap.b32 %r6678, %r6677, %r6677, 24; + add.s32 %r6679, %r6678, %r6672; + xor.b32 %r6680, %r6679, %r6674; + shf.l.wrap.b32 %r6681, %r6680, %r6680, 25; + add.s32 %r6682, %r6634, %r6073; + add.s32 %r6683, %r6682, %r6681; + xor.b32 %r6684, %r6683, %r6650; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 16; + add.s32 %r6686, %r6685, %r6665; + xor.b32 %r6687, %r6686, %r6681; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 20; + add.s32 %r6689, %r6683, %r6108; + add.s32 %r6690, %r6689, %r6688; + xor.b32 %r6691, %r6690, %r6685; + shf.l.wrap.b32 %r6692, %r6691, %r6691, 24; + add.s32 %r6693, %r6692, %r6686; + xor.b32 %r6694, %r6693, %r6688; + shf.l.wrap.b32 %r6695, %r6694, %r6694, 25; + add.s32 %r6696, %r6648, %r6087; + add.s32 %r6697, %r6696, %r6639; + xor.b32 %r6698, %r6697, %r6664; + shf.l.wrap.b32 
%r6699, %r6698, %r6698, 16; + add.s32 %r6700, %r6699, %r6679; + xor.b32 %r6701, %r6700, %r6639; + shf.l.wrap.b32 %r6702, %r6701, %r6701, 20; + add.s32 %r6703, %r6697, %r6045; + add.s32 %r6704, %r6703, %r6702; + xor.b32 %r6705, %r6704, %r6699; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6700; + xor.b32 %r6708, %r6707, %r6702; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6662, %r6066; + add.s32 %r6711, %r6710, %r6653; + xor.b32 %r6712, %r6711, %r6678; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6637; + xor.b32 %r6715, %r6714, %r6653; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6094; + add.s32 %r6718, %r6717, %r6716; + xor.b32 %r6719, %r6718, %r6713; + shf.l.wrap.b32 %r6720, %r6719, %r6719, 24; + add.s32 %r6721, %r6720, %r6714; + xor.b32 %r6722, %r6721, %r6716; + shf.l.wrap.b32 %r6723, %r6722, %r6722, 25; + add.s32 %r6724, %r6676, %r6115; + add.s32 %r6725, %r6724, %r6667; + xor.b32 %r6726, %r6725, %r6636; + shf.l.wrap.b32 %r6727, %r6726, %r6726, 16; + add.s32 %r6728, %r6727, %r6651; + xor.b32 %r6729, %r6728, %r6667; + shf.l.wrap.b32 %r6730, %r6729, %r6729, 20; + add.s32 %r6731, %r6725, %r6017; + add.s32 %r6732, %r6731, %r6730; + xor.b32 %r6733, %r6732, %r6727; + shf.l.wrap.b32 %r6734, %r6733, %r6733, 24; + add.s32 %r6735, %r6734, %r6728; + xor.b32 %r6736, %r6735, %r6730; + shf.l.wrap.b32 %r6737, %r6736, %r6736, 25; + add.s32 %r6738, %r6690, %r6101; + add.s32 %r6739, %r6738, %r6709; + xor.b32 %r6740, %r6739, %r6734; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6721; + xor.b32 %r6743, %r6742, %r6709; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6031; + add.s32 %r6746, %r6745, %r6744; + xor.b32 %r6747, %r6746, %r6741; + shf.l.wrap.b32 %r6748, %r6747, %r6747, 24; + add.s32 %r6749, %r6748, %r6742; + xor.b32 %r6750, %r6749, %r6744; + shf.l.wrap.b32 %r6751, %r6750, %r6750, 25; + add.s32 %r6752, %r6704, %r6010; + add.s32 %r6753, %r6752, %r6723; + xor.b32 %r6754, %r6753, %r6692; + shf.l.wrap.b32 %r6755, %r6754, %r6754, 16; + add.s32 %r6756, %r6755, %r6735; + xor.b32 %r6757, %r6756, %r6723; + shf.l.wrap.b32 %r6758, %r6757, %r6757, 20; + add.s32 %r6759, %r6753, %r6080; + add.s32 %r6760, %r6759, %r6758; + xor.b32 %r6761, %r6760, %r6755; + shf.l.wrap.b32 %r6762, %r6761, %r6761, 24; + add.s32 %r6763, %r6762, %r6756; + xor.b32 %r6764, %r6763, %r6758; + shf.l.wrap.b32 %r6765, %r6764, %r6764, 25; + add.s32 %r6766, %r6718, %r6024; + add.s32 %r6767, %r6766, %r6737; + xor.b32 %r6768, %r6767, %r6706; + shf.l.wrap.b32 %r6769, %r6768, %r6768, 16; + add.s32 %r6770, %r6769, %r6693; + xor.b32 %r6771, %r6770, %r6737; + shf.l.wrap.b32 %r6772, %r6771, %r6771, 20; + add.s32 %r6773, %r6767, %r6052; + add.s32 %r6774, %r6773, %r6772; + xor.b32 %r6775, %r6774, %r6769; + shf.l.wrap.b32 %r6776, %r6775, %r6775, 24; + add.s32 %r6777, %r6776, %r6770; + xor.b32 %r6778, %r6777, %r6772; + shf.l.wrap.b32 %r6779, %r6778, %r6778, 25; + add.s32 %r6780, %r6732, %r6038; + add.s32 %r6781, %r6780, %r6695; + xor.b32 %r6782, %r6781, %r6720; + shf.l.wrap.b32 %r6783, %r6782, %r6782, 16; + add.s32 %r6784, %r6783, %r6707; + xor.b32 %r6785, %r6784, %r6695; + shf.l.wrap.b32 %r6786, %r6785, %r6785, 20; + add.s32 %r6787, %r6781, %r6059; + add.s32 %r6788, %r6787, %r6786; + xor.b32 %r6789, %r6788, %r6783; + shf.l.wrap.b32 %r6790, %r6789, %r6789, 24; + add.s32 %r6791, %r6790, %r6784; + xor.b32 %r6792, %r6791, %r6786; + shf.l.wrap.b32 %r6793, %r6792, %r6792, 25; + add.s32 %r6794, %r6746, %r6087; 
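+ // NOTE (annotation, not compiler output): the unrolled add.s32 / xor.b32 /
+ // shf.l.wrap.b32 pattern filling this hunk is consistent with the BLAKE2s/
+ // BLAKE3 quarter-round G. shf.l.wrap.b32 rotates left, so shift amounts of
+ // 16, 20, 24 and 25 realize rotr32 by 16, 12, 8 and 7:
+ //   a += b + mx;  d = rotr32(d ^ a, 16);
+ //   c += d;       b = rotr32(b ^ c, 12);
+ //   a += b + my;  d = rotr32(d ^ a, 8);
+ //   c += d;       b = rotr32(b ^ c, 7);
+ // Eight G calls (four columns, then four diagonals) form one round; the
+ // compiler has fully unrolled the rounds and renamed the 16-word state into
+ // fresh virtual registers at every step.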
+ add.s32 %r6795, %r6794, %r6793; + xor.b32 %r6796, %r6795, %r6762; + shf.l.wrap.b32 %r6797, %r6796, %r6796, 16; + add.s32 %r6798, %r6797, %r6777; + xor.b32 %r6799, %r6798, %r6793; + shf.l.wrap.b32 %r6800, %r6799, %r6799, 20; + add.s32 %r6801, %r6795, %r6115; + add.s32 %r6802, %r6801, %r6800; + xor.b32 %r6803, %r6802, %r6797; + shf.l.wrap.b32 %r6804, %r6803, %r6803, 24; + add.s32 %r6805, %r6804, %r6798; + xor.b32 %r6806, %r6805, %r6800; + shf.l.wrap.b32 %r6807, %r6806, %r6806, 25; + add.s32 %r6808, %r6760, %r6045; + add.s32 %r6809, %r6808, %r6751; + xor.b32 %r6810, %r6809, %r6776; + shf.l.wrap.b32 %r6811, %r6810, %r6810, 16; + add.s32 %r6812, %r6811, %r6791; + xor.b32 %r6813, %r6812, %r6751; + shf.l.wrap.b32 %r6814, %r6813, %r6813, 20; + add.s32 %r6815, %r6809, %r6010; + add.s32 %r6816, %r6815, %r6814; + xor.b32 %r6817, %r6816, %r6811; + shf.l.wrap.b32 %r6818, %r6817, %r6817, 24; + add.s32 %r6819, %r6818, %r6812; + xor.b32 %r6820, %r6819, %r6814; + shf.l.wrap.b32 %r6821, %r6820, %r6820, 25; + add.s32 %r6822, %r6774, %r6017; + add.s32 %r6823, %r6822, %r6765; + xor.b32 %r6824, %r6823, %r6790; + shf.l.wrap.b32 %r6825, %r6824, %r6824, 16; + add.s32 %r6826, %r6825, %r6749; + xor.b32 %r6827, %r6826, %r6765; + shf.l.wrap.b32 %r6828, %r6827, %r6827, 20; + add.s32 %r6829, %r6823, %r6073; + add.s32 %r6830, %r6829, %r6828; + xor.b32 %r6831, %r6830, %r6825; + shf.l.wrap.b32 %r6832, %r6831, %r6831, 24; + add.s32 %r6833, %r6832, %r6826; + xor.b32 %r6834, %r6833, %r6828; + shf.l.wrap.b32 %r6835, %r6834, %r6834, 25; + add.s32 %r6836, %r6788, %r6066; + add.s32 %r6837, %r6836, %r6779; + xor.b32 %r6838, %r6837, %r6748; + shf.l.wrap.b32 %r6839, %r6838, %r6838, 16; + add.s32 %r6840, %r6839, %r6763; + xor.b32 %r6841, %r6840, %r6779; + shf.l.wrap.b32 %r6842, %r6841, %r6841, 20; + add.s32 %r6843, %r6837, %r6052; + add.s32 %r6844, %r6843, %r6842; + xor.b32 %r6845, %r6844, %r6839; + shf.l.wrap.b32 %r6846, %r6845, %r6845, 24; + add.s32 %r6847, %r6846, %r6840; + xor.b32 %r6848, %r6847, %r6842; + shf.l.wrap.b32 %r6849, %r6848, %r6848, 25; + add.s32 %r6850, %r6802, %r6108; + add.s32 %r6851, %r6850, %r6821; + xor.b32 %r6852, %r6851, %r6846; + shf.l.wrap.b32 %r6853, %r6852, %r6852, 16; + add.s32 %r6854, %r6853, %r6833; + xor.b32 %r6855, %r6854, %r6821; + shf.l.wrap.b32 %r6856, %r6855, %r6855, 20; + add.s32 %r6857, %r6851, %r6080; + add.s32 %r6858, %r6857, %r6856; + xor.b32 %r6859, %r6858, %r6853; + shf.l.wrap.b32 %r6860, %r6859, %r6859, 24; + add.s32 %r6861, %r6860, %r6854; + xor.b32 %r6862, %r6861, %r6856; + shf.l.wrap.b32 %r6863, %r6862, %r6862, 25; + add.s32 %r6864, %r6816, %r6024; + add.s32 %r6865, %r6864, %r6835; + xor.b32 %r6866, %r6865, %r6804; + shf.l.wrap.b32 %r6867, %r6866, %r6866, 16; + add.s32 %r6868, %r6867, %r6847; + xor.b32 %r6869, %r6868, %r6835; + shf.l.wrap.b32 %r6870, %r6869, %r6869, 20; + add.s32 %r6871, %r6865, %r6094; + add.s32 %r6872, %r6871, %r6870; + xor.b32 %r6873, %r6872, %r6867; + shf.l.wrap.b32 %r6874, %r6873, %r6873, 24; + add.s32 %r6875, %r6874, %r6868; + xor.b32 %r6876, %r6875, %r6870; + shf.l.wrap.b32 %r6877, %r6876, %r6876, 25; + add.s32 %r6878, %r6830, %r6031; + add.s32 %r6879, %r6878, %r6849; + xor.b32 %r6880, %r6879, %r6818; + shf.l.wrap.b32 %r6881, %r6880, %r6880, 16; + add.s32 %r6882, %r6881, %r6805; + xor.b32 %r6883, %r6882, %r6849; + shf.l.wrap.b32 %r6884, %r6883, %r6883, 20; + add.s32 %r6885, %r6879, %r6038; + add.s32 %r6886, %r6885, %r6884; + xor.b32 %r6887, %r6886, %r6881; + shf.l.wrap.b32 %r6888, %r6887, %r6887, 24; + add.s32 %r6889, %r6888, %r6882; + xor.b32 %r6890, %r6889, 
%r6884; + shf.l.wrap.b32 %r6891, %r6890, %r6890, 25; + add.s32 %r6892, %r6844, %r6059; + add.s32 %r6893, %r6892, %r6807; + xor.b32 %r6894, %r6893, %r6832; + shf.l.wrap.b32 %r6895, %r6894, %r6894, 16; + add.s32 %r6896, %r6895, %r6819; + xor.b32 %r6897, %r6896, %r6807; + shf.l.wrap.b32 %r6898, %r6897, %r6897, 20; + add.s32 %r6899, %r6893, %r6101; + add.s32 %r6900, %r6899, %r6898; + xor.b32 %r6901, %r6900, %r6895; + shf.l.wrap.b32 %r6902, %r6901, %r6901, 24; + add.s32 %r6903, %r6902, %r6896; + xor.b32 %r6904, %r6903, %r6898; + shf.l.wrap.b32 %r6905, %r6904, %r6904, 25; + xor.b32 %r11679, %r6889, %r6858; + xor.b32 %r11678, %r6903, %r6872; + xor.b32 %r11677, %r6861, %r6886; + xor.b32 %r11676, %r6900, %r6875; + xor.b32 %r11675, %r6905, %r6874; + xor.b32 %r11674, %r6863, %r6888; + xor.b32 %r11673, %r6902, %r6877; + xor.b32 %r11672, %r6891, %r6860; + add.s16 %rs353, %rs353, 1; + st.local.u8 [%rd54+1], %rs353; + add.s64 %rd258, %rd258, 64; + add.s64 %rd259, %rd259, -64; + setp.gt.u64 %p33, %rd259, 64; + @%p33 bra $L__BB1_38; + +$L__BB1_39: + min.u64 %rd61, %rd259, 64; + setp.eq.s64 %p34, %rd61, 0; + mov.u16 %rs355, %rs354; + mov.u16 %rs356, %rs354; + mov.u16 %rs357, %rs354; + mov.u16 %rs358, %rs354; + mov.u16 %rs359, %rs354; + mov.u16 %rs360, %rs354; + mov.u16 %rs361, %rs354; + mov.u16 %rs362, %rs354; + mov.u16 %rs363, %rs354; + mov.u16 %rs364, %rs354; + mov.u16 %rs365, %rs354; + mov.u16 %rs366, %rs354; + mov.u16 %rs367, %rs354; + mov.u16 %rs368, %rs354; + mov.u16 %rs369, %rs354; + mov.u16 %rs370, %rs354; + mov.u16 %rs371, %rs354; + mov.u16 %rs372, %rs354; + mov.u16 %rs373, %rs354; + mov.u16 %rs374, %rs354; + mov.u16 %rs375, %rs354; + mov.u16 %rs376, %rs354; + mov.u16 %rs377, %rs354; + mov.u16 %rs378, %rs354; + mov.u16 %rs379, %rs354; + mov.u16 %rs380, %rs354; + mov.u16 %rs381, %rs354; + mov.u16 %rs382, %rs354; + mov.u16 %rs383, %rs354; + mov.u16 %rs384, %rs354; + mov.u16 %rs385, %rs354; + mov.u16 %rs386, %rs354; + mov.u16 %rs387, %rs354; + @%p34 bra $L__BB1_43; + + mov.u64 %rd260, 0; + +$L__BB1_41: + add.s64 %rd187, %rd258, %rd260; + ld.local.u8 %rs251, [%rd187]; + add.s64 %rd188, %rd53, %rd260; + st.local.u8 [%rd188], %rs251; + add.s64 %rd260, %rd260, 1; + setp.lt.u64 %p35, %rd260, %rd61; + @%p35 bra $L__BB1_41; + + ld.local.v4.u16 {%rs384, %rs385, %rs386, %rs387}, [%rd53]; + ld.local.v4.u16 {%rs380, %rs381, %rs382, %rs383}, [%rd53+8]; + ld.local.v4.u16 {%rs376, %rs377, %rs378, %rs379}, [%rd53+16]; + ld.local.v4.u16 {%rs372, %rs373, %rs374, %rs375}, [%rd53+24]; + ld.local.v4.u16 {%rs368, %rs369, %rs370, %rs371}, [%rd53+32]; + ld.local.v4.u16 {%rs364, %rs365, %rs366, %rs367}, [%rd53+40]; + ld.local.v4.u16 {%rs360, %rs361, %rs362, %rs363}, [%rd53+48]; + ld.local.v4.u16 {%rs357, %rs358, %rs359, %rs283}, [%rd53+56]; + ld.local.u8 %rs356, [%rd53+61]; + ld.local.v2.u8 {%rs354, %rs355}, [%rd53+62]; + +$L__BB1_43: + ld.local.v4.u8 {%rs286, %rs287, %rs288, %rs289}, [%rd53+64]; + cvt.u16.u64 %rs292, %rd61; + add.s16 %rs293, %rs286, %rs292; + st.local.u8 [%rd53+64], %rs293; + setp.eq.s16 %p36, %rs287, 0; + selp.u16 %rs294, 1, 0, %p36; + or.b16 %rs295, %rs288, %rs294; + or.b16 %rs296, %rs295, 2; + shr.u16 %rs297, %rs384, 8; + shr.u16 %rs298, %rs385, 8; + shr.u16 %rs299, %rs386, 8; + shr.u16 %rs300, %rs387, 8; + shr.u16 %rs301, %rs380, 8; + shr.u16 %rs302, %rs381, 8; + shr.u16 %rs303, %rs382, 8; + shr.u16 %rs304, %rs383, 8; + shr.u16 %rs305, %rs376, 8; + shr.u16 %rs306, %rs377, 8; + shr.u16 %rs307, %rs378, 8; + shr.u16 %rs308, %rs379, 8; + shr.u16 %rs309, %rs372, 8; + shr.u16 %rs310, %rs373, 8; + shr.u16 
%rs311, %rs374, 8; + shr.u16 %rs312, %rs375, 8; + shr.u16 %rs313, %rs368, 8; + shr.u16 %rs314, %rs369, 8; + shr.u16 %rs315, %rs370, 8; + shr.u16 %rs316, %rs371, 8; + shr.u16 %rs317, %rs364, 8; + shr.u16 %rs318, %rs365, 8; + shr.u16 %rs319, %rs366, 8; + shr.u16 %rs320, %rs367, 8; + shr.u16 %rs321, %rs360, 8; + shr.u16 %rs322, %rs361, 8; + shr.u16 %rs323, %rs362, 8; + shr.u16 %rs324, %rs363, 8; + shr.u16 %rs325, %rs357, 8; + shr.u16 %rs326, %rs358, 8; + cvt.u32.u16 %r6906, %rs384; + and.b32 %r6907, %r6906, 255; + cvt.u32.u16 %r6908, %rs297; + prmt.b32 %r6909, %r6908, %r6907, 30212; + cvt.u32.u16 %r6910, %rs385; + prmt.b32 %r6911, %r6910, %r6909, 28756; + cvt.u32.u16 %r6912, %rs298; + prmt.b32 %r6913, %r6912, %r6911, 1620; + cvt.u32.u16 %r6914, %rs386; + and.b32 %r6915, %r6914, 255; + cvt.u32.u16 %r6916, %rs299; + prmt.b32 %r6917, %r6916, %r6915, 30212; + cvt.u32.u16 %r6918, %rs387; + prmt.b32 %r6919, %r6918, %r6917, 28756; + cvt.u32.u16 %r6920, %rs300; + prmt.b32 %r6921, %r6920, %r6919, 1620; + cvt.u32.u16 %r6922, %rs380; + and.b32 %r6923, %r6922, 255; + cvt.u32.u16 %r6924, %rs301; + prmt.b32 %r6925, %r6924, %r6923, 30212; + cvt.u32.u16 %r6926, %rs381; + prmt.b32 %r6927, %r6926, %r6925, 28756; + cvt.u32.u16 %r6928, %rs302; + prmt.b32 %r6929, %r6928, %r6927, 1620; + cvt.u32.u16 %r6930, %rs382; + and.b32 %r6931, %r6930, 255; + cvt.u32.u16 %r6932, %rs303; + prmt.b32 %r6933, %r6932, %r6931, 30212; + cvt.u32.u16 %r6934, %rs383; + prmt.b32 %r6935, %r6934, %r6933, 28756; + cvt.u32.u16 %r6936, %rs304; + prmt.b32 %r6937, %r6936, %r6935, 1620; + cvt.u32.u16 %r6938, %rs376; + and.b32 %r6939, %r6938, 255; + cvt.u32.u16 %r6940, %rs305; + prmt.b32 %r6941, %r6940, %r6939, 30212; + cvt.u32.u16 %r6942, %rs377; + prmt.b32 %r6943, %r6942, %r6941, 28756; + cvt.u32.u16 %r6944, %rs306; + prmt.b32 %r6945, %r6944, %r6943, 1620; + cvt.u32.u16 %r6946, %rs378; + and.b32 %r6947, %r6946, 255; + cvt.u32.u16 %r6948, %rs307; + prmt.b32 %r6949, %r6948, %r6947, 30212; + cvt.u32.u16 %r6950, %rs379; + prmt.b32 %r6951, %r6950, %r6949, 28756; + cvt.u32.u16 %r6952, %rs308; + prmt.b32 %r6953, %r6952, %r6951, 1620; + cvt.u32.u16 %r6954, %rs372; + and.b32 %r6955, %r6954, 255; + cvt.u32.u16 %r6956, %rs309; + prmt.b32 %r6957, %r6956, %r6955, 30212; + cvt.u32.u16 %r6958, %rs373; + prmt.b32 %r6959, %r6958, %r6957, 28756; + cvt.u32.u16 %r6960, %rs310; + prmt.b32 %r6961, %r6960, %r6959, 1620; + cvt.u32.u16 %r6962, %rs374; + and.b32 %r6963, %r6962, 255; + cvt.u32.u16 %r6964, %rs311; + prmt.b32 %r6965, %r6964, %r6963, 30212; + cvt.u32.u16 %r6966, %rs375; + prmt.b32 %r6967, %r6966, %r6965, 28756; + cvt.u32.u16 %r6968, %rs312; + prmt.b32 %r6969, %r6968, %r6967, 1620; + cvt.u32.u16 %r6970, %rs368; + and.b32 %r6971, %r6970, 255; + cvt.u32.u16 %r6972, %rs313; + prmt.b32 %r6973, %r6972, %r6971, 30212; + cvt.u32.u16 %r6974, %rs369; + prmt.b32 %r6975, %r6974, %r6973, 28756; + cvt.u32.u16 %r6976, %rs314; + prmt.b32 %r6977, %r6976, %r6975, 1620; + cvt.u32.u16 %r6978, %rs370; + and.b32 %r6979, %r6978, 255; + cvt.u32.u16 %r6980, %rs315; + prmt.b32 %r6981, %r6980, %r6979, 30212; + cvt.u32.u16 %r6982, %rs371; + prmt.b32 %r6983, %r6982, %r6981, 28756; + cvt.u32.u16 %r6984, %rs316; + prmt.b32 %r6985, %r6984, %r6983, 1620; + cvt.u32.u16 %r6986, %rs364; + and.b32 %r6987, %r6986, 255; + cvt.u32.u16 %r6988, %rs317; + prmt.b32 %r6989, %r6988, %r6987, 30212; + cvt.u32.u16 %r6990, %rs365; + prmt.b32 %r6991, %r6990, %r6989, 28756; + cvt.u32.u16 %r6992, %rs318; + prmt.b32 %r6993, %r6992, %r6991, 1620; + cvt.u32.u16 %r6994, %rs366; + and.b32 %r6995, %r6994, 255; + 
cvt.u32.u16 %r6996, %rs319; + prmt.b32 %r6997, %r6996, %r6995, 30212; + cvt.u32.u16 %r6998, %rs367; + prmt.b32 %r6999, %r6998, %r6997, 28756; + cvt.u32.u16 %r7000, %rs320; + prmt.b32 %r7001, %r7000, %r6999, 1620; + cvt.u32.u16 %r7002, %rs360; + and.b32 %r7003, %r7002, 255; + cvt.u32.u16 %r7004, %rs321; + prmt.b32 %r7005, %r7004, %r7003, 30212; + cvt.u32.u16 %r7006, %rs361; + prmt.b32 %r7007, %r7006, %r7005, 28756; + cvt.u32.u16 %r7008, %rs322; + prmt.b32 %r7009, %r7008, %r7007, 1620; + cvt.u32.u16 %r7010, %rs362; + and.b32 %r7011, %r7010, 255; + cvt.u32.u16 %r7012, %rs323; + prmt.b32 %r7013, %r7012, %r7011, 30212; + cvt.u32.u16 %r7014, %rs363; + prmt.b32 %r7015, %r7014, %r7013, 28756; + cvt.u32.u16 %r7016, %rs324; + prmt.b32 %r7017, %r7016, %r7015, 1620; + cvt.u32.u16 %r7018, %rs357; + and.b32 %r7019, %r7018, 255; + cvt.u32.u16 %r7020, %rs325; + prmt.b32 %r7021, %r7020, %r7019, 30212; + cvt.u32.u16 %r7022, %rs358; + prmt.b32 %r7023, %r7022, %r7021, 28756; + cvt.u32.u16 %r7024, %rs326; + prmt.b32 %r7025, %r7024, %r7023, 1620; + cvt.u32.u16 %r7026, %rs359; + and.b32 %r7027, %r7026, 255; + cvt.u32.u16 %r7028, %rs356; + prmt.b32 %r7029, %r7028, %r7027, 30212; + cvt.u32.u16 %r7030, %rs354; + shl.b32 %r7031, %r7030, 16; + and.b32 %r7032, %r7031, 16711680; + or.b32 %r7033, %r7029, %r7032; + cvt.u32.u16 %r7034, %rs355; + shl.b32 %r7035, %r7034, 24; + or.b32 %r7036, %r7033, %r7035; + cvt.u32.u16 %r7037, %rs293; + and.b32 %r7038, %r7037, 255; + cvt.u32.u16 %r7039, %rs296; + and.b32 %r7040, %r7039, 255; + add.s32 %r7041, %r11675, %r11679; + add.s32 %r7042, %r7041, %r6913; + xor.b32 %r7043, %r7042, %r71; + shf.l.wrap.b32 %r7044, %r7043, %r7043, 16; + add.s32 %r7045, %r7044, 1779033703; + xor.b32 %r7046, %r7045, %r11675; + shf.l.wrap.b32 %r7047, %r7046, %r7046, 20; + add.s32 %r7048, %r6921, %r7042; + add.s32 %r7049, %r7048, %r7047; + xor.b32 %r7050, %r7049, %r7044; + shf.l.wrap.b32 %r7051, %r7050, %r7050, 24; + add.s32 %r7052, %r7051, %r7045; + xor.b32 %r7053, %r7052, %r7047; + shf.l.wrap.b32 %r7054, %r7053, %r7053, 25; + add.s32 %r7055, %r11674, %r11678; + add.s32 %r7056, %r7055, %r6929; + xor.b32 %r7057, %r7056, %r72; + shf.l.wrap.b32 %r7058, %r7057, %r7057, 16; + add.s32 %r7059, %r7058, -1150833019; + xor.b32 %r7060, %r7059, %r11674; + shf.l.wrap.b32 %r7061, %r7060, %r7060, 20; + add.s32 %r7062, %r6937, %r7056; + add.s32 %r7063, %r7062, %r7061; + xor.b32 %r7064, %r7063, %r7058; + shf.l.wrap.b32 %r7065, %r7064, %r7064, 24; + add.s32 %r7066, %r7065, %r7059; + xor.b32 %r7067, %r7066, %r7061; + shf.l.wrap.b32 %r7068, %r7067, %r7067, 25; + add.s32 %r7069, %r11673, %r11677; + add.s32 %r7070, %r7069, %r6945; + xor.b32 %r7071, %r7070, %r7038; + shr.u32 %r7072, %r7070, 16; + shl.b32 %r7073, %r7071, 16; + or.b32 %r7074, %r7073, %r7072; + add.s32 %r7075, %r7074, 1013904242; + xor.b32 %r7076, %r7075, %r11673; + shf.l.wrap.b32 %r7077, %r7076, %r7076, 20; + add.s32 %r7078, %r6953, %r7070; + add.s32 %r7079, %r7078, %r7077; + xor.b32 %r7080, %r7079, %r7074; + shf.l.wrap.b32 %r7081, %r7080, %r7080, 24; + add.s32 %r7082, %r7081, %r7075; + xor.b32 %r7083, %r7082, %r7077; + shf.l.wrap.b32 %r7084, %r7083, %r7083, 25; + add.s32 %r7085, %r11672, %r11676; + add.s32 %r7086, %r7085, %r6961; + xor.b32 %r7087, %r7086, %r7040; + shr.u32 %r7088, %r7086, 16; + shl.b32 %r7089, %r7087, 16; + or.b32 %r7090, %r7089, %r7088; + add.s32 %r7091, %r7090, -1521486534; + xor.b32 %r7092, %r7091, %r11672; + shf.l.wrap.b32 %r7093, %r7092, %r7092, 20; + add.s32 %r7094, %r6969, %r7086; + add.s32 %r7095, %r7094, %r7093; + xor.b32 %r7096, %r7095, 
%r7090; + shf.l.wrap.b32 %r7097, %r7096, %r7096, 24; + add.s32 %r7098, %r7097, %r7091; + xor.b32 %r7099, %r7098, %r7093; + shf.l.wrap.b32 %r7100, %r7099, %r7099, 25; + add.s32 %r7101, %r7068, %r7049; + add.s32 %r7102, %r7101, %r6977; + xor.b32 %r7103, %r7097, %r7102; + shf.l.wrap.b32 %r7104, %r7103, %r7103, 16; + add.s32 %r7105, %r7104, %r7082; + xor.b32 %r7106, %r7105, %r7068; + shf.l.wrap.b32 %r7107, %r7106, %r7106, 20; + add.s32 %r7108, %r6985, %r7102; + add.s32 %r7109, %r7108, %r7107; + xor.b32 %r7110, %r7109, %r7104; + shf.l.wrap.b32 %r7111, %r7110, %r7110, 24; + add.s32 %r7112, %r7111, %r7105; + xor.b32 %r7113, %r7112, %r7107; + shf.l.wrap.b32 %r7114, %r7113, %r7113, 25; + add.s32 %r7115, %r7084, %r7063; + add.s32 %r7116, %r7115, %r6993; + xor.b32 %r7117, %r7116, %r7051; + shf.l.wrap.b32 %r7118, %r7117, %r7117, 16; + add.s32 %r7119, %r7118, %r7098; + xor.b32 %r7120, %r7119, %r7084; + shf.l.wrap.b32 %r7121, %r7120, %r7120, 20; + add.s32 %r7122, %r7001, %r7116; + add.s32 %r7123, %r7122, %r7121; + xor.b32 %r7124, %r7123, %r7118; + shf.l.wrap.b32 %r7125, %r7124, %r7124, 24; + add.s32 %r7126, %r7125, %r7119; + xor.b32 %r7127, %r7126, %r7121; + shf.l.wrap.b32 %r7128, %r7127, %r7127, 25; + add.s32 %r7129, %r7100, %r7079; + add.s32 %r7130, %r7129, %r7009; + xor.b32 %r7131, %r7130, %r7065; + shf.l.wrap.b32 %r7132, %r7131, %r7131, 16; + add.s32 %r7133, %r7132, %r7052; + xor.b32 %r7134, %r7133, %r7100; + shf.l.wrap.b32 %r7135, %r7134, %r7134, 20; + add.s32 %r7136, %r7017, %r7130; + add.s32 %r7137, %r7136, %r7135; + xor.b32 %r7138, %r7137, %r7132; + shf.l.wrap.b32 %r7139, %r7138, %r7138, 24; + add.s32 %r7140, %r7139, %r7133; + xor.b32 %r7141, %r7140, %r7135; + shf.l.wrap.b32 %r7142, %r7141, %r7141, 25; + add.s32 %r7143, %r7095, %r7054; + add.s32 %r7144, %r7143, %r7025; + xor.b32 %r7145, %r7144, %r7081; + shf.l.wrap.b32 %r7146, %r7145, %r7145, 16; + add.s32 %r7147, %r7146, %r7066; + xor.b32 %r7148, %r7147, %r7054; + shf.l.wrap.b32 %r7149, %r7148, %r7148, 20; + add.s32 %r7150, %r7036, %r7144; + add.s32 %r7151, %r7150, %r7149; + xor.b32 %r7152, %r7151, %r7146; + shf.l.wrap.b32 %r7153, %r7152, %r7152, 24; + add.s32 %r7154, %r7153, %r7147; + xor.b32 %r7155, %r7154, %r7149; + shf.l.wrap.b32 %r7156, %r7155, %r7155, 25; + add.s32 %r7157, %r7109, %r6929; + add.s32 %r7158, %r7157, %r7156; + xor.b32 %r7159, %r7158, %r7125; + shf.l.wrap.b32 %r7160, %r7159, %r7159, 16; + add.s32 %r7161, %r7160, %r7140; + xor.b32 %r7162, %r7161, %r7156; + shf.l.wrap.b32 %r7163, %r7162, %r7162, 20; + add.s32 %r7164, %r7158, %r6961; + add.s32 %r7165, %r7164, %r7163; + xor.b32 %r7166, %r7165, %r7160; + shf.l.wrap.b32 %r7167, %r7166, %r7166, 24; + add.s32 %r7168, %r7167, %r7161; + xor.b32 %r7169, %r7168, %r7163; + shf.l.wrap.b32 %r7170, %r7169, %r7169, 25; + add.s32 %r7171, %r7123, %r6937; + add.s32 %r7172, %r7171, %r7114; + xor.b32 %r7173, %r7139, %r7172; + shf.l.wrap.b32 %r7174, %r7173, %r7173, 16; + add.s32 %r7175, %r7154, %r7174; + xor.b32 %r7176, %r7175, %r7114; + shf.l.wrap.b32 %r7177, %r7176, %r7176, 20; + add.s32 %r7178, %r7172, %r6993; + add.s32 %r7179, %r7178, %r7177; + xor.b32 %r7180, %r7179, %r7174; + shf.l.wrap.b32 %r7181, %r7180, %r7180, 24; + add.s32 %r7182, %r7181, %r7175; + xor.b32 %r7183, %r7182, %r7177; + shf.l.wrap.b32 %r7184, %r7183, %r7183, 25; + add.s32 %r7185, %r7128, %r6969; + add.s32 %r7186, %r7185, %r7137; + xor.b32 %r7187, %r7153, %r7186; + shf.l.wrap.b32 %r7188, %r7187, %r7187, 16; + add.s32 %r7189, %r7188, %r7112; + xor.b32 %r7190, %r7189, %r7128; + shf.l.wrap.b32 %r7191, %r7190, %r7190, 20; + 
add.s32 %r7192, %r7186, %r6913; + add.s32 %r7193, %r7192, %r7191; + xor.b32 %r7194, %r7193, %r7188; + shf.l.wrap.b32 %r7195, %r7194, %r7194, 24; + add.s32 %r7196, %r7195, %r7189; + xor.b32 %r7197, %r7196, %r7191; + shf.l.wrap.b32 %r7198, %r7197, %r7197, 25; + add.s32 %r7199, %r7142, %r6945; + add.s32 %r7200, %r7199, %r7151; + xor.b32 %r7201, %r7200, %r7111; + shf.l.wrap.b32 %r7202, %r7201, %r7201, 16; + add.s32 %r7203, %r7202, %r7126; + xor.b32 %r7204, %r7203, %r7142; + shf.l.wrap.b32 %r7205, %r7204, %r7204, 20; + add.s32 %r7206, %r7200, %r7017; + add.s32 %r7207, %r7206, %r7205; + xor.b32 %r7208, %r7207, %r7202; + shf.l.wrap.b32 %r7209, %r7208, %r7208, 24; + add.s32 %r7210, %r7209, %r7203; + xor.b32 %r7211, %r7210, %r7205; + shf.l.wrap.b32 %r7212, %r7211, %r7211, 25; + add.s32 %r7213, %r7184, %r6921; + add.s32 %r7214, %r7213, %r7165; + xor.b32 %r7215, %r7214, %r7209; + shf.l.wrap.b32 %r7216, %r7215, %r7215, 16; + add.s32 %r7217, %r7216, %r7196; + xor.b32 %r7218, %r7217, %r7184; + shf.l.wrap.b32 %r7219, %r7218, %r7218, 20; + add.s32 %r7220, %r7214, %r7001; + add.s32 %r7221, %r7220, %r7219; + xor.b32 %r7222, %r7221, %r7216; + shf.l.wrap.b32 %r7223, %r7222, %r7222, 24; + add.s32 %r7224, %r7223, %r7217; + xor.b32 %r7225, %r7224, %r7219; + shf.l.wrap.b32 %r7226, %r7225, %r7225, 25; + add.s32 %r7227, %r7179, %r7009; + add.s32 %r7228, %r7227, %r7198; + xor.b32 %r7229, %r7167, %r7228; + shf.l.wrap.b32 %r7230, %r7229, %r7229, 16; + add.s32 %r7231, %r7230, %r7210; + xor.b32 %r7232, %r7231, %r7198; + shf.l.wrap.b32 %r7233, %r7232, %r7232, 20; + add.s32 %r7234, %r7228, %r6953; + add.s32 %r7235, %r7234, %r7233; + xor.b32 %r7236, %r7235, %r7230; + shf.l.wrap.b32 %r7237, %r7236, %r7236, 24; + add.s32 %r7238, %r7237, %r7231; + xor.b32 %r7239, %r7238, %r7233; + shf.l.wrap.b32 %r7240, %r7239, %r7239, 25; + add.s32 %r7241, %r7193, %r6985; + add.s32 %r7242, %r7241, %r7212; + xor.b32 %r7243, %r7242, %r7181; + shf.l.wrap.b32 %r7244, %r7243, %r7243, 16; + add.s32 %r7245, %r7244, %r7168; + xor.b32 %r7246, %r7245, %r7212; + shf.l.wrap.b32 %r7247, %r7246, %r7246, 20; + add.s32 %r7248, %r7242, %r7025; + add.s32 %r7249, %r7248, %r7247; + xor.b32 %r7250, %r7249, %r7244; + shf.l.wrap.b32 %r7251, %r7250, %r7250, 24; + add.s32 %r7252, %r7251, %r7245; + xor.b32 %r7253, %r7252, %r7247; + shf.l.wrap.b32 %r7254, %r7253, %r7253, 25; + add.s32 %r7255, %r7207, %r7036; + add.s32 %r7256, %r7255, %r7170; + xor.b32 %r7257, %r7256, %r7195; + shf.l.wrap.b32 %r7258, %r7257, %r7257, 16; + add.s32 %r7259, %r7258, %r7182; + xor.b32 %r7260, %r7259, %r7170; + shf.l.wrap.b32 %r7261, %r7260, %r7260, 20; + add.s32 %r7262, %r7256, %r6977; + add.s32 %r7263, %r7262, %r7261; + xor.b32 %r7264, %r7263, %r7258; + shf.l.wrap.b32 %r7265, %r7264, %r7264, 24; + add.s32 %r7266, %r7265, %r7259; + xor.b32 %r7267, %r7266, %r7261; + shf.l.wrap.b32 %r7268, %r7267, %r7267, 25; + add.s32 %r7269, %r7221, %r6937; + add.s32 %r7270, %r7269, %r7268; + xor.b32 %r7271, %r7270, %r7237; + shf.l.wrap.b32 %r7272, %r7271, %r7271, 16; + add.s32 %r7273, %r7272, %r7252; + xor.b32 %r7274, %r7273, %r7268; + shf.l.wrap.b32 %r7275, %r7274, %r7274, 20; + add.s32 %r7276, %r7270, %r6945; + add.s32 %r7277, %r7276, %r7275; + xor.b32 %r7278, %r7277, %r7272; + shf.l.wrap.b32 %r7279, %r7278, %r7278, 24; + add.s32 %r7280, %r7279, %r7273; + xor.b32 %r7281, %r7280, %r7275; + shf.l.wrap.b32 %r7282, %r7281, %r7281, 25; + add.s32 %r7283, %r7235, %r6993; + add.s32 %r7284, %r7283, %r7226; + xor.b32 %r7285, %r7284, %r7251; + shf.l.wrap.b32 %r7286, %r7285, %r7285, 16; + add.s32 %r7287, %r7286, 
%r7266; + xor.b32 %r7288, %r7287, %r7226; + shf.l.wrap.b32 %r7289, %r7288, %r7288, 20; + add.s32 %r7290, %r7284, %r7009; + add.s32 %r7291, %r7290, %r7289; + xor.b32 %r7292, %r7291, %r7286; + shf.l.wrap.b32 %r7293, %r7292, %r7292, 24; + add.s32 %r7294, %r7293, %r7287; + xor.b32 %r7295, %r7294, %r7289; + shf.l.wrap.b32 %r7296, %r7295, %r7295, 25; + add.s32 %r7297, %r7249, %r7017; + add.s32 %r7298, %r7297, %r7240; + xor.b32 %r7299, %r7265, %r7298; + shf.l.wrap.b32 %r7300, %r7299, %r7299, 16; + add.s32 %r7301, %r7300, %r7224; + xor.b32 %r7302, %r7301, %r7240; + shf.l.wrap.b32 %r7303, %r7302, %r7302, 20; + add.s32 %r7304, %r7298, %r6929; + add.s32 %r7305, %r7304, %r7303; + xor.b32 %r7306, %r7305, %r7300; + shf.l.wrap.b32 %r7307, %r7306, %r7306, 24; + add.s32 %r7308, %r7307, %r7301; + xor.b32 %r7309, %r7308, %r7303; + shf.l.wrap.b32 %r7310, %r7309, %r7309, 25; + add.s32 %r7311, %r7254, %r6969; + add.s32 %r7312, %r7311, %r7263; + xor.b32 %r7313, %r7312, %r7223; + shf.l.wrap.b32 %r7314, %r7313, %r7313, 16; + add.s32 %r7315, %r7314, %r7238; + xor.b32 %r7316, %r7315, %r7254; + shf.l.wrap.b32 %r7317, %r7316, %r7316, 20; + add.s32 %r7318, %r7312, %r7025; + add.s32 %r7319, %r7318, %r7317; + xor.b32 %r7320, %r7319, %r7314; + shf.l.wrap.b32 %r7321, %r7320, %r7320, 24; + add.s32 %r7322, %r7321, %r7315; + xor.b32 %r7323, %r7322, %r7317; + shf.l.wrap.b32 %r7324, %r7323, %r7323, 25; + add.s32 %r7325, %r7277, %r6961; + add.s32 %r7326, %r7325, %r7296; + xor.b32 %r7327, %r7326, %r7321; + shf.l.wrap.b32 %r7328, %r7327, %r7327, 16; + add.s32 %r7329, %r7328, %r7308; + xor.b32 %r7330, %r7329, %r7296; + shf.l.wrap.b32 %r7331, %r7330, %r7330, 20; + add.s32 %r7332, %r7326, %r6953; + add.s32 %r7333, %r7332, %r7331; + xor.b32 %r7334, %r7333, %r7328; + shf.l.wrap.b32 %r7335, %r7334, %r7334, 24; + add.s32 %r7336, %r7335, %r7329; + xor.b32 %r7337, %r7336, %r7331; + shf.l.wrap.b32 %r7338, %r7337, %r7337, 25; + add.s32 %r7339, %r7291, %r6985; + add.s32 %r7340, %r7339, %r7310; + xor.b32 %r7341, %r7279, %r7340; + shf.l.wrap.b32 %r7342, %r7341, %r7341, 16; + add.s32 %r7343, %r7342, %r7322; + xor.b32 %r7344, %r7343, %r7310; + shf.l.wrap.b32 %r7345, %r7344, %r7344, 20; + add.s32 %r7346, %r7340, %r6913; + add.s32 %r7347, %r7346, %r7345; + xor.b32 %r7348, %r7347, %r7342; + shf.l.wrap.b32 %r7349, %r7348, %r7348, 24; + add.s32 %r7350, %r7349, %r7343; + xor.b32 %r7351, %r7350, %r7345; + shf.l.wrap.b32 %r7352, %r7351, %r7351, 25; + add.s32 %r7353, %r7305, %r7001; + add.s32 %r7354, %r7353, %r7324; + xor.b32 %r7355, %r7354, %r7293; + shf.l.wrap.b32 %r7356, %r7355, %r7355, 16; + add.s32 %r7357, %r7356, %r7280; + xor.b32 %r7358, %r7357, %r7324; + shf.l.wrap.b32 %r7359, %r7358, %r7358, 20; + add.s32 %r7360, %r7354, %r7036; + add.s32 %r7361, %r7360, %r7359; + xor.b32 %r7362, %r7361, %r7356; + shf.l.wrap.b32 %r7363, %r7362, %r7362, 24; + add.s32 %r7364, %r7363, %r7357; + xor.b32 %r7365, %r7364, %r7359; + shf.l.wrap.b32 %r7366, %r7365, %r7365, 25; + add.s32 %r7367, %r7319, %r6977; + add.s32 %r7368, %r7367, %r7282; + xor.b32 %r7369, %r7368, %r7307; + shf.l.wrap.b32 %r7370, %r7369, %r7369, 16; + add.s32 %r7371, %r7370, %r7294; + xor.b32 %r7372, %r7371, %r7282; + shf.l.wrap.b32 %r7373, %r7372, %r7372, 20; + add.s32 %r7374, %r7368, %r6921; + add.s32 %r7375, %r7374, %r7373; + xor.b32 %r7376, %r7375, %r7370; + shf.l.wrap.b32 %r7377, %r7376, %r7376, 24; + add.s32 %r7378, %r7377, %r7371; + xor.b32 %r7379, %r7378, %r7373; + shf.l.wrap.b32 %r7380, %r7379, %r7379, 25; + add.s32 %r7381, %r7333, %r6993; + add.s32 %r7382, %r7381, %r7380; + xor.b32 %r7383, 
%r7382, %r7349; + shf.l.wrap.b32 %r7384, %r7383, %r7383, 16; + add.s32 %r7385, %r7384, %r7364; + xor.b32 %r7386, %r7385, %r7380; + shf.l.wrap.b32 %r7387, %r7386, %r7386, 20; + add.s32 %r7388, %r7382, %r6969; + add.s32 %r7389, %r7388, %r7387; + xor.b32 %r7390, %r7389, %r7384; + shf.l.wrap.b32 %r7391, %r7390, %r7390, 24; + add.s32 %r7392, %r7391, %r7385; + xor.b32 %r7393, %r7392, %r7387; + shf.l.wrap.b32 %r7394, %r7393, %r7393, 25; + add.s32 %r7395, %r7347, %r7009; + add.s32 %r7396, %r7395, %r7338; + xor.b32 %r7397, %r7396, %r7363; + shf.l.wrap.b32 %r7398, %r7397, %r7397, 16; + add.s32 %r7399, %r7398, %r7378; + xor.b32 %r7400, %r7399, %r7338; + shf.l.wrap.b32 %r7401, %r7400, %r7400, 20; + add.s32 %r7402, %r7396, %r6985; + add.s32 %r7403, %r7402, %r7401; + xor.b32 %r7404, %r7403, %r7398; + shf.l.wrap.b32 %r7405, %r7404, %r7404, 24; + add.s32 %r7406, %r7405, %r7399; + xor.b32 %r7407, %r7406, %r7401; + shf.l.wrap.b32 %r7408, %r7407, %r7407, 25; + add.s32 %r7409, %r7361, %r7025; + add.s32 %r7410, %r7409, %r7352; + xor.b32 %r7411, %r7377, %r7410; + shf.l.wrap.b32 %r7412, %r7411, %r7411, 16; + add.s32 %r7413, %r7412, %r7336; + xor.b32 %r7414, %r7413, %r7352; + shf.l.wrap.b32 %r7415, %r7414, %r7414, 20; + add.s32 %r7416, %r7410, %r6937; + add.s32 %r7417, %r7416, %r7415; + xor.b32 %r7418, %r7417, %r7412; + shf.l.wrap.b32 %r7419, %r7418, %r7418, 24; + add.s32 %r7420, %r7419, %r7413; + xor.b32 %r7421, %r7420, %r7415; + shf.l.wrap.b32 %r7422, %r7421, %r7421, 25; + add.s32 %r7423, %r7375, %r7017; + add.s32 %r7424, %r7423, %r7366; + xor.b32 %r7425, %r7424, %r7335; + shf.l.wrap.b32 %r7426, %r7425, %r7425, 16; + add.s32 %r7427, %r7426, %r7350; + xor.b32 %r7428, %r7427, %r7366; + shf.l.wrap.b32 %r7429, %r7428, %r7428, 20; + add.s32 %r7430, %r7424, %r7036; + add.s32 %r7431, %r7430, %r7429; + xor.b32 %r7432, %r7431, %r7426; + shf.l.wrap.b32 %r7433, %r7432, %r7432, 24; + add.s32 %r7434, %r7433, %r7427; + xor.b32 %r7435, %r7434, %r7429; + shf.l.wrap.b32 %r7436, %r7435, %r7435, 25; + add.s32 %r7437, %r7389, %r6945; + add.s32 %r7438, %r7437, %r7408; + xor.b32 %r7439, %r7438, %r7433; + shf.l.wrap.b32 %r7440, %r7439, %r7439, 16; + add.s32 %r7441, %r7440, %r7420; + xor.b32 %r7442, %r7441, %r7408; + shf.l.wrap.b32 %r7443, %r7442, %r7442, 20; + add.s32 %r7444, %r7438, %r6913; + add.s32 %r7445, %r7444, %r7443; + xor.b32 %r7446, %r7445, %r7440; + shf.l.wrap.b32 %r7447, %r7446, %r7446, 24; + add.s32 %r7448, %r7447, %r7441; + xor.b32 %r7449, %r7448, %r7443; + shf.l.wrap.b32 %r7450, %r7449, %r7449, 25; + add.s32 %r7451, %r7403, %r7001; + add.s32 %r7452, %r7451, %r7422; + xor.b32 %r7453, %r7391, %r7452; + shf.l.wrap.b32 %r7454, %r7453, %r7453, 16; + add.s32 %r7455, %r7454, %r7434; + xor.b32 %r7456, %r7455, %r7422; + shf.l.wrap.b32 %r7457, %r7456, %r7456, 20; + add.s32 %r7458, %r7452, %r6929; + add.s32 %r7459, %r7458, %r7457; + xor.b32 %r7460, %r7459, %r7454; + shf.l.wrap.b32 %r7461, %r7460, %r7460, 24; + add.s32 %r7462, %r7461, %r7455; + xor.b32 %r7463, %r7462, %r7457; + shf.l.wrap.b32 %r7464, %r7463, %r7463, 25; + add.s32 %r7465, %r7417, %r6953; + add.s32 %r7466, %r7465, %r7436; + xor.b32 %r7467, %r7466, %r7405; + shf.l.wrap.b32 %r7468, %r7467, %r7467, 16; + add.s32 %r7469, %r7468, %r7392; + xor.b32 %r7470, %r7469, %r7436; + shf.l.wrap.b32 %r7471, %r7470, %r7470, 20; + add.s32 %r7472, %r7466, %r6977; + add.s32 %r7473, %r7472, %r7471; + xor.b32 %r7474, %r7473, %r7468; + shf.l.wrap.b32 %r7475, %r7474, %r7474, 24; + add.s32 %r7476, %r7475, %r7469; + xor.b32 %r7477, %r7476, %r7471; + shf.l.wrap.b32 %r7478, %r7477, %r7477, 25; 
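+ // NOTE (annotation, not compiler output): the compression entered after
+ // $L__BB1_43 handles the final (possibly partial) 64-byte block. Its first
+ // column mixes appear to follow BLAKE3's state setup: v[8..11] take the IV
+ // constants 1779033703, -1150833019, 1013904242, -1521486534 (0x6A09E667,
+ // 0xBB67AE85, 0x3C6EF372, 0xA54FF53A), v[12..13] take the counter words
+ // (%r71, %r72), v[14] the block length (%rs293), and v[15] the flags word
+ // (%rs296), which gains 1 (CHUNK_START) when no block has been compressed
+ // yet and 2 (CHUNK_END) for this last block.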
+ add.s32 %r7479, %r7431, %r6921; + add.s32 %r7480, %r7479, %r7394; + xor.b32 %r7481, %r7480, %r7419; + shf.l.wrap.b32 %r7482, %r7481, %r7481, 16; + add.s32 %r7483, %r7482, %r7406; + xor.b32 %r7484, %r7483, %r7394; + shf.l.wrap.b32 %r7485, %r7484, %r7484, 20; + add.s32 %r7486, %r7480, %r6961; + add.s32 %r7487, %r7486, %r7485; + xor.b32 %r7488, %r7487, %r7482; + shf.l.wrap.b32 %r7489, %r7488, %r7488, 24; + add.s32 %r7490, %r7489, %r7483; + xor.b32 %r7491, %r7490, %r7485; + shf.l.wrap.b32 %r7492, %r7491, %r7491, 25; + add.s32 %r7493, %r7445, %r7009; + add.s32 %r7494, %r7493, %r7492; + xor.b32 %r7495, %r7494, %r7461; + shf.l.wrap.b32 %r7496, %r7495, %r7495, 16; + add.s32 %r7497, %r7496, %r7476; + xor.b32 %r7498, %r7497, %r7492; + shf.l.wrap.b32 %r7499, %r7498, %r7498, 20; + add.s32 %r7500, %r7494, %r7017; + add.s32 %r7501, %r7500, %r7499; + xor.b32 %r7502, %r7501, %r7496; + shf.l.wrap.b32 %r7503, %r7502, %r7502, 24; + add.s32 %r7504, %r7503, %r7497; + xor.b32 %r7505, %r7504, %r7499; + shf.l.wrap.b32 %r7506, %r7505, %r7505, 25; + add.s32 %r7507, %r7459, %r6985; + add.s32 %r7508, %r7507, %r7450; + xor.b32 %r7509, %r7508, %r7475; + shf.l.wrap.b32 %r7510, %r7509, %r7509, 16; + add.s32 %r7511, %r7510, %r7490; + xor.b32 %r7512, %r7511, %r7450; + shf.l.wrap.b32 %r7513, %r7512, %r7512, 20; + add.s32 %r7514, %r7508, %r7001; + add.s32 %r7515, %r7514, %r7513; + xor.b32 %r7516, %r7515, %r7510; + shf.l.wrap.b32 %r7517, %r7516, %r7516, 24; + add.s32 %r7518, %r7517, %r7511; + xor.b32 %r7519, %r7518, %r7513; + shf.l.wrap.b32 %r7520, %r7519, %r7519, 25; + add.s32 %r7521, %r7473, %r7036; + add.s32 %r7522, %r7521, %r7464; + xor.b32 %r7523, %r7489, %r7522; + shf.l.wrap.b32 %r7524, %r7523, %r7523, 16; + add.s32 %r7525, %r7524, %r7448; + xor.b32 %r7526, %r7525, %r7464; + shf.l.wrap.b32 %r7527, %r7526, %r7526, 20; + add.s32 %r7528, %r7522, %r6993; + add.s32 %r7529, %r7528, %r7527; + xor.b32 %r7530, %r7529, %r7524; + shf.l.wrap.b32 %r7531, %r7530, %r7530, 24; + add.s32 %r7532, %r7531, %r7525; + xor.b32 %r7533, %r7532, %r7527; + shf.l.wrap.b32 %r7534, %r7533, %r7533, 25; + add.s32 %r7535, %r7487, %r7025; + add.s32 %r7536, %r7535, %r7478; + xor.b32 %r7537, %r7536, %r7447; + shf.l.wrap.b32 %r7538, %r7537, %r7537, 16; + add.s32 %r7539, %r7538, %r7462; + xor.b32 %r7540, %r7539, %r7478; + shf.l.wrap.b32 %r7541, %r7540, %r7540, 20; + add.s32 %r7542, %r7536, %r6977; + add.s32 %r7543, %r7542, %r7541; + xor.b32 %r7544, %r7543, %r7538; + shf.l.wrap.b32 %r7545, %r7544, %r7544, 24; + add.s32 %r7546, %r7545, %r7539; + xor.b32 %r7547, %r7546, %r7541; + shf.l.wrap.b32 %r7548, %r7547, %r7547, 25; + add.s32 %r7549, %r7501, %r6969; + add.s32 %r7550, %r7549, %r7520; + xor.b32 %r7551, %r7550, %r7545; + shf.l.wrap.b32 %r7552, %r7551, %r7551, 16; + add.s32 %r7553, %r7552, %r7532; + xor.b32 %r7554, %r7553, %r7520; + shf.l.wrap.b32 %r7555, %r7554, %r7554, 20; + add.s32 %r7556, %r7550, %r6929; + add.s32 %r7557, %r7556, %r7555; + xor.b32 %r7558, %r7557, %r7552; + shf.l.wrap.b32 %r7559, %r7558, %r7558, 24; + add.s32 %r7560, %r7559, %r7553; + xor.b32 %r7561, %r7560, %r7555; + shf.l.wrap.b32 %r7562, %r7561, %r7561, 25; + add.s32 %r7563, %r7515, %r6953; + add.s32 %r7564, %r7563, %r7534; + xor.b32 %r7565, %r7503, %r7564; + shf.l.wrap.b32 %r7566, %r7565, %r7565, 16; + add.s32 %r7567, %r7566, %r7546; + xor.b32 %r7568, %r7567, %r7534; + shf.l.wrap.b32 %r7569, %r7568, %r7568, 20; + add.s32 %r7570, %r7564, %r6937; + add.s32 %r7571, %r7570, %r7569; + xor.b32 %r7572, %r7571, %r7566; + shf.l.wrap.b32 %r7573, %r7572, %r7572, 24; + add.s32 %r7574, %r7573, 
%r7567; + xor.b32 %r7575, %r7574, %r7569; + shf.l.wrap.b32 %r7576, %r7575, %r7575, 25; + add.s32 %r7577, %r7529, %r6913; + add.s32 %r7578, %r7577, %r7548; + xor.b32 %r7579, %r7578, %r7517; + shf.l.wrap.b32 %r7580, %r7579, %r7579, 16; + add.s32 %r7581, %r7580, %r7504; + xor.b32 %r7582, %r7581, %r7548; + shf.l.wrap.b32 %r7583, %r7582, %r7582, 20; + add.s32 %r7584, %r7578, %r6921; + add.s32 %r7585, %r7584, %r7583; + xor.b32 %r7586, %r7585, %r7580; + shf.l.wrap.b32 %r7587, %r7586, %r7586, 24; + add.s32 %r7588, %r7587, %r7581; + xor.b32 %r7589, %r7588, %r7583; + shf.l.wrap.b32 %r7590, %r7589, %r7589, 25; + add.s32 %r7591, %r7543, %r6961; + add.s32 %r7592, %r7591, %r7506; + xor.b32 %r7593, %r7592, %r7531; + shf.l.wrap.b32 %r7594, %r7593, %r7593, 16; + add.s32 %r7595, %r7594, %r7518; + xor.b32 %r7596, %r7595, %r7506; + shf.l.wrap.b32 %r7597, %r7596, %r7596, 20; + add.s32 %r7598, %r7592, %r6945; + add.s32 %r7599, %r7598, %r7597; + xor.b32 %r7600, %r7599, %r7594; + shf.l.wrap.b32 %r7601, %r7600, %r7600, 24; + add.s32 %r7602, %r7601, %r7595; + xor.b32 %r7603, %r7602, %r7597; + shf.l.wrap.b32 %r7604, %r7603, %r7603, 25; + add.s32 %r7605, %r7557, %r6985; + add.s32 %r7606, %r7605, %r7604; + xor.b32 %r7607, %r7606, %r7573; + shf.l.wrap.b32 %r7608, %r7607, %r7607, 16; + add.s32 %r7609, %r7608, %r7588; + xor.b32 %r7610, %r7609, %r7604; + shf.l.wrap.b32 %r7611, %r7610, %r7610, 20; + add.s32 %r7612, %r7606, %r7025; + add.s32 %r7613, %r7612, %r7611; + xor.b32 %r7614, %r7613, %r7608; + shf.l.wrap.b32 %r7615, %r7614, %r7614, 24; + add.s32 %r7616, %r7615, %r7609; + xor.b32 %r7617, %r7616, %r7611; + shf.l.wrap.b32 %r7618, %r7617, %r7617, 25; + add.s32 %r7619, %r7571, %r7001; + add.s32 %r7620, %r7619, %r7562; + xor.b32 %r7621, %r7620, %r7587; + shf.l.wrap.b32 %r7622, %r7621, %r7621, 16; + add.s32 %r7623, %r7622, %r7602; + xor.b32 %r7624, %r7623, %r7562; + shf.l.wrap.b32 %r7625, %r7624, %r7624, 20; + add.s32 %r7626, %r7620, %r6953; + add.s32 %r7627, %r7626, %r7625; + xor.b32 %r7628, %r7627, %r7622; + shf.l.wrap.b32 %r7629, %r7628, %r7628, 24; + add.s32 %r7630, %r7629, %r7623; + xor.b32 %r7631, %r7630, %r7625; + shf.l.wrap.b32 %r7632, %r7631, %r7631, 25; + add.s32 %r7633, %r7585, %r6977; + add.s32 %r7634, %r7633, %r7576; + xor.b32 %r7635, %r7601, %r7634; + shf.l.wrap.b32 %r7636, %r7635, %r7635, 16; + add.s32 %r7637, %r7636, %r7560; + xor.b32 %r7638, %r7637, %r7576; + shf.l.wrap.b32 %r7639, %r7638, %r7638, 20; + add.s32 %r7640, %r7634, %r7009; + add.s32 %r7641, %r7640, %r7639; + xor.b32 %r7642, %r7641, %r7636; + shf.l.wrap.b32 %r7643, %r7642, %r7642, 24; + add.s32 %r7644, %r7643, %r7637; + xor.b32 %r7645, %r7644, %r7639; + shf.l.wrap.b32 %r7646, %r7645, %r7645, 25; + add.s32 %r7647, %r7599, %r7036; + add.s32 %r7648, %r7647, %r7590; + xor.b32 %r7649, %r7648, %r7559; + shf.l.wrap.b32 %r7650, %r7649, %r7649, 16; + add.s32 %r7651, %r7650, %r7574; + xor.b32 %r7652, %r7651, %r7590; + shf.l.wrap.b32 %r7653, %r7652, %r7652, 20; + add.s32 %r7654, %r7648, %r6921; + add.s32 %r7655, %r7654, %r7653; + xor.b32 %r7656, %r7655, %r7650; + shf.l.wrap.b32 %r7657, %r7656, %r7656, 24; + add.s32 %r7658, %r7657, %r7651; + xor.b32 %r7659, %r7658, %r7653; + shf.l.wrap.b32 %r7660, %r7659, %r7659, 25; + add.s32 %r7661, %r7613, %r7017; + add.s32 %r7662, %r7661, %r7632; + xor.b32 %r7663, %r7662, %r7657; + shf.l.wrap.b32 %r7664, %r7663, %r7663, 16; + add.s32 %r7665, %r7664, %r7644; + xor.b32 %r7666, %r7665, %r7632; + shf.l.wrap.b32 %r7667, %r7666, %r7666, 20; + add.s32 %r7668, %r7662, %r6937; + add.s32 %r7669, %r7668, %r7667; + xor.b32 %r7670, 
%r7669, %r7664; + shf.l.wrap.b32 %r7671, %r7670, %r7670, 24; + add.s32 %r7672, %r7671, %r7665; + xor.b32 %r7673, %r7672, %r7667; + shf.l.wrap.b32 %r7674, %r7673, %r7673, 25; + add.s32 %r7675, %r7627, %r6913; + add.s32 %r7676, %r7675, %r7646; + xor.b32 %r7677, %r7615, %r7676; + shf.l.wrap.b32 %r7678, %r7677, %r7677, 16; + add.s32 %r7679, %r7678, %r7658; + xor.b32 %r7680, %r7679, %r7646; + shf.l.wrap.b32 %r7681, %r7680, %r7680, 20; + add.s32 %r7682, %r7676, %r6993; + add.s32 %r7683, %r7682, %r7681; + xor.b32 %r7684, %r7683, %r7678; + shf.l.wrap.b32 %r7685, %r7684, %r7684, 24; + add.s32 %r7686, %r7685, %r7679; + xor.b32 %r7687, %r7686, %r7681; + shf.l.wrap.b32 %r7688, %r7687, %r7687, 25; + add.s32 %r7689, %r7641, %r6929; + add.s32 %r7690, %r7689, %r7660; + xor.b32 %r7691, %r7690, %r7629; + shf.l.wrap.b32 %r7692, %r7691, %r7691, 16; + add.s32 %r7693, %r7692, %r7616; + xor.b32 %r7694, %r7693, %r7660; + shf.l.wrap.b32 %r7695, %r7694, %r7694, 20; + add.s32 %r7696, %r7690, %r6961; + add.s32 %r7697, %r7696, %r7695; + xor.b32 %r7698, %r7697, %r7692; + shf.l.wrap.b32 %r7699, %r7698, %r7698, 24; + add.s32 %r7700, %r7699, %r7693; + xor.b32 %r7701, %r7700, %r7695; + shf.l.wrap.b32 %r7702, %r7701, %r7701, 25; + add.s32 %r7703, %r7655, %r6945; + add.s32 %r7704, %r7703, %r7618; + xor.b32 %r7705, %r7704, %r7643; + shf.l.wrap.b32 %r7706, %r7705, %r7705, 16; + add.s32 %r7707, %r7706, %r7630; + xor.b32 %r7708, %r7707, %r7618; + shf.l.wrap.b32 %r7709, %r7708, %r7708, 20; + add.s32 %r7710, %r7704, %r6969; + add.s32 %r7711, %r7710, %r7709; + xor.b32 %r7712, %r7711, %r7706; + shf.l.wrap.b32 %r7713, %r7712, %r7712, 24; + add.s32 %r7714, %r7713, %r7707; + xor.b32 %r7715, %r7714, %r7709; + shf.l.wrap.b32 %r7716, %r7715, %r7715, 25; + add.s32 %r7717, %r7669, %r7001; + add.s32 %r7718, %r7717, %r7716; + xor.b32 %r7719, %r7718, %r7685; + shf.l.wrap.b32 %r7720, %r7719, %r7719, 16; + add.s32 %r7721, %r7720, %r7700; + xor.b32 %r7722, %r7721, %r7716; + shf.l.wrap.b32 %r7723, %r7722, %r7722, 20; + add.s32 %r7724, %r7718, %r7036; + add.s32 %r7725, %r7724, %r7723; + xor.b32 %r7726, %r7725, %r7720; + shf.l.wrap.b32 %r7727, %r7726, %r7726, 24; + add.s32 %r7728, %r7727, %r7721; + xor.b32 %r7729, %r7728, %r7723; + shf.l.wrap.b32 %r7730, %r7729, %r7729, 25; + add.s32 %r7731, %r7683, %r6953; + add.s32 %r7732, %r7731, %r7674; + xor.b32 %r7733, %r7732, %r7699; + shf.l.wrap.b32 %r7734, %r7733, %r7733, 16; + add.s32 %r7735, %r7734, %r7714; + xor.b32 %r7736, %r7735, %r7674; + shf.l.wrap.b32 %r7737, %r7736, %r7736, 20; + add.s32 %r7738, %r7732, %r6913; + add.s32 %r7739, %r7738, %r7737; + xor.b32 %r7740, %r7739, %r7734; + shf.l.wrap.b32 %r7741, %r7740, %r7740, 24; + add.s32 %r7742, %r7741, %r7735; + xor.b32 %r7743, %r7742, %r7737; + shf.l.wrap.b32 %r7744, %r7743, %r7743, 25; + add.s32 %r7745, %r7697, %r6921; + add.s32 %r7746, %r7745, %r7688; + xor.b32 %r7747, %r7713, %r7746; + shf.l.wrap.b32 %r7748, %r7747, %r7747, 16; + add.s32 %r7749, %r7748, %r7672; + xor.b32 %r7750, %r7749, %r7688; + shf.l.wrap.b32 %r7751, %r7750, %r7750, 20; + add.s32 %r7752, %r7746, %r6985; + add.s32 %r7753, %r7752, %r7751; + xor.b32 %r7754, %r7753, %r7748; + shf.l.wrap.b32 %r7755, %r7754, %r7754, 24; + add.s32 %r7756, %r7755, %r7749; + xor.b32 %r7757, %r7756, %r7751; + shf.l.wrap.b32 %r7758, %r7757, %r7757, 25; + add.s32 %r7759, %r7711, %r6977; + add.s32 %r7760, %r7759, %r7702; + xor.b32 %r7761, %r7760, %r7671; + shf.l.wrap.b32 %r7762, %r7761, %r7761, 16; + add.s32 %r7763, %r7762, %r7686; + xor.b32 %r7764, %r7763, %r7702; + shf.l.wrap.b32 %r7765, %r7764, %r7764, 20; 
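+ // NOTE (annotation, not compiler output): the xor.b32 results %r97..%r104
+ // below are the output chaining value, h[i] = v[i] ^ v[i+8]. $L__BB1_45 then
+ // looks like the parent-node merge loop: or.b32 ..., 4 sets the PARENT flag,
+ // popc.b64 %rd251 counts chaining values on the CV stack (32 bytes each, cf.
+ // mul.wide.u16 ..., 32), and the ld.local.u8 / prmt.b32 runs repack stored
+ // CV bytes into the 32-bit message words for the next compression.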
+ add.s32 %r7766, %r7760, %r6961; + add.s32 %r7767, %r7766, %r7765; + xor.b32 %r7768, %r7767, %r7762; + shf.l.wrap.b32 %r7769, %r7768, %r7768, 24; + add.s32 %r7770, %r7769, %r7763; + xor.b32 %r7771, %r7770, %r7765; + shf.l.wrap.b32 %r7772, %r7771, %r7771, 25; + add.s32 %r7773, %r7725, %r7025; + add.s32 %r7774, %r7773, %r7744; + xor.b32 %r7775, %r7774, %r7769; + shf.l.wrap.b32 %r7776, %r7775, %r7775, 16; + add.s32 %r7777, %r7776, %r7756; + xor.b32 %r7778, %r7777, %r7744; + shf.l.wrap.b32 %r7779, %r7778, %r7778, 20; + add.s32 %r7780, %r7774, %r6993; + add.s32 %r7781, %r7780, %r7779; + xor.b32 %r7782, %r7781, %r7776; + shf.l.wrap.b32 %r7783, %r7782, %r7782, 24; + add.s32 %r7784, %r7783, %r7777; + xor.b32 %r7785, %r7784, %r7779; + shf.l.wrap.b32 %r7786, %r7785, %r7785, 25; + add.s32 %r7787, %r7739, %r6929; + add.s32 %r7788, %r7787, %r7758; + xor.b32 %r7789, %r7727, %r7788; + shf.l.wrap.b32 %r7790, %r7789, %r7789, 16; + add.s32 %r7791, %r7790, %r7770; + xor.b32 %r7792, %r7791, %r7758; + shf.l.wrap.b32 %r7793, %r7792, %r7792, 20; + add.s32 %r7794, %r7788, %r7009; + add.s32 %r7795, %r7794, %r7793; + xor.b32 %r7796, %r7795, %r7790; + shf.l.wrap.b32 %r7797, %r7796, %r7796, 24; + add.s32 %r7798, %r7797, %r7791; + xor.b32 %r7799, %r7798, %r7793; + shf.l.wrap.b32 %r7800, %r7799, %r7799, 25; + add.s32 %r7801, %r7753, %r6937; + add.s32 %r7802, %r7801, %r7772; + xor.b32 %r7803, %r7802, %r7741; + shf.l.wrap.b32 %r7804, %r7803, %r7803, 16; + add.s32 %r7805, %r7804, %r7728; + xor.b32 %r7806, %r7805, %r7772; + shf.l.wrap.b32 %r7807, %r7806, %r7806, 20; + add.s32 %r7808, %r7802, %r6945; + add.s32 %r7809, %r7808, %r7807; + xor.b32 %r7810, %r7809, %r7804; + shf.l.wrap.b32 %r7811, %r7810, %r7810, 24; + add.s32 %r7812, %r7811, %r7805; + xor.b32 %r7813, %r7812, %r7807; + shf.l.wrap.b32 %r7814, %r7813, %r7813, 25; + add.s32 %r7815, %r7767, %r6969; + add.s32 %r7816, %r7815, %r7730; + xor.b32 %r7817, %r7816, %r7755; + shf.l.wrap.b32 %r7818, %r7817, %r7817, 16; + add.s32 %r7819, %r7818, %r7742; + xor.b32 %r7820, %r7819, %r7730; + shf.l.wrap.b32 %r7821, %r7820, %r7820, 20; + add.s32 %r7822, %r7816, %r7017; + add.s32 %r7823, %r7822, %r7821; + xor.b32 %r7824, %r7823, %r7818; + shf.l.wrap.b32 %r7825, %r7824, %r7824, 24; + add.s32 %r7826, %r7825, %r7819; + xor.b32 %r7827, %r7826, %r7821; + shf.l.wrap.b32 %r7828, %r7827, %r7827, 25; + xor.b32 %r97, %r7812, %r7781; + xor.b32 %r98, %r7826, %r7795; + xor.b32 %r99, %r7784, %r7809; + xor.b32 %r100, %r7823, %r7798; + xor.b32 %r101, %r7828, %r7797; + xor.b32 %r102, %r7786, %r7811; + xor.b32 %r103, %r7825, %r7800; + xor.b32 %r104, %r7814, %r7783; + ld.local.u8 %rs327, [%rd3+8]; + cvt.u64.u16 %rd189, %rs327; + popc.b64 %r7829, %rd251; + cvt.u64.u32 %rd64, %r7829; + setp.ge.u64 %p37, %rd64, %rd189; + mul.wide.u16 %r11681, %rs327, 32; + @%p37 bra $L__BB1_46; + +$L__BB1_45: + popc.b64 %r11649, %rd251; + cvt.u64.u32 %rd230, %r11649; + add.s32 %r7830, %r11681, -64; + cvt.s64.s32 %rd190, %r7830; + add.s64 %rd191, %rd2, %rd190; + ld.local.u8 %r7831, [%rd3+2]; + ld.local.u8 %r7832, [%rd191+145]; + ld.local.u8 %r7833, [%rd191+146]; + prmt.b32 %r7834, %r7833, %r7832, 30212; + ld.local.u8 %r7835, [%rd191+147]; + prmt.b32 %r7836, %r7835, %r7834, 28756; + ld.local.u8 %r7837, [%rd191+148]; + prmt.b32 %r7838, %r7837, %r7836, 1620; + ld.local.u8 %r7839, [%rd191+149]; + ld.local.u8 %r7840, [%rd191+150]; + prmt.b32 %r7841, %r7840, %r7839, 30212; + ld.local.u8 %r7842, [%rd191+151]; + prmt.b32 %r7843, %r7842, %r7841, 28756; + ld.local.u8 %r7844, [%rd191+152]; + prmt.b32 %r7845, %r7844, %r7843, 1620; + 
ld.local.u8 %r7846, [%rd191+153]; + ld.local.u8 %r7847, [%rd191+154]; + prmt.b32 %r7848, %r7847, %r7846, 30212; + ld.local.u8 %r7849, [%rd191+155]; + prmt.b32 %r7850, %r7849, %r7848, 28756; + ld.local.u8 %r7851, [%rd191+156]; + prmt.b32 %r7852, %r7851, %r7850, 1620; + ld.local.u8 %r7853, [%rd191+157]; + ld.local.u8 %r7854, [%rd191+158]; + prmt.b32 %r7855, %r7854, %r7853, 30212; + ld.local.u8 %r7856, [%rd191+159]; + prmt.b32 %r7857, %r7856, %r7855, 28756; + ld.local.u8 %r7858, [%rd191+160]; + prmt.b32 %r7859, %r7858, %r7857, 1620; + ld.local.u8 %r7860, [%rd191+161]; + ld.local.u8 %r7861, [%rd191+162]; + prmt.b32 %r7862, %r7861, %r7860, 30212; + ld.local.u8 %r7863, [%rd191+163]; + prmt.b32 %r7864, %r7863, %r7862, 28756; + ld.local.u8 %r7865, [%rd191+164]; + prmt.b32 %r7866, %r7865, %r7864, 1620; + ld.local.u8 %r7867, [%rd191+165]; + ld.local.u8 %r7868, [%rd191+166]; + prmt.b32 %r7869, %r7868, %r7867, 30212; + ld.local.u8 %r7870, [%rd191+167]; + prmt.b32 %r7871, %r7870, %r7869, 28756; + ld.local.u8 %r7872, [%rd191+168]; + prmt.b32 %r7873, %r7872, %r7871, 1620; + ld.local.u8 %r7874, [%rd191+169]; + ld.local.u8 %r7875, [%rd191+170]; + prmt.b32 %r7876, %r7875, %r7874, 30212; + ld.local.u8 %r7877, [%rd191+171]; + prmt.b32 %r7878, %r7877, %r7876, 28756; + ld.local.u8 %r7879, [%rd191+172]; + prmt.b32 %r7880, %r7879, %r7878, 1620; + ld.local.u8 %r7881, [%rd191+173]; + ld.local.u8 %r7882, [%rd191+174]; + prmt.b32 %r7883, %r7882, %r7881, 30212; + ld.local.u8 %r7884, [%rd191+175]; + prmt.b32 %r7885, %r7884, %r7883, 28756; + ld.local.u8 %r7886, [%rd191+176]; + prmt.b32 %r7887, %r7886, %r7885, 1620; + ld.local.u8 %r7888, [%rd191+177]; + ld.local.u8 %r7889, [%rd191+178]; + prmt.b32 %r7890, %r7889, %r7888, 30212; + ld.local.u8 %r7891, [%rd191+179]; + prmt.b32 %r7892, %r7891, %r7890, 28756; + ld.local.u8 %r7893, [%rd191+180]; + prmt.b32 %r7894, %r7893, %r7892, 1620; + ld.local.u8 %r7895, [%rd191+181]; + ld.local.u8 %r7896, [%rd191+182]; + prmt.b32 %r7897, %r7896, %r7895, 30212; + ld.local.u8 %r7898, [%rd191+183]; + prmt.b32 %r7899, %r7898, %r7897, 28756; + ld.local.u8 %r7900, [%rd191+184]; + prmt.b32 %r7901, %r7900, %r7899, 1620; + ld.local.u8 %r7902, [%rd191+185]; + ld.local.u8 %r7903, [%rd191+186]; + prmt.b32 %r7904, %r7903, %r7902, 30212; + ld.local.u8 %r7905, [%rd191+187]; + prmt.b32 %r7906, %r7905, %r7904, 28756; + ld.local.u8 %r7907, [%rd191+188]; + prmt.b32 %r7908, %r7907, %r7906, 1620; + ld.local.u8 %r7909, [%rd191+189]; + ld.local.u8 %r7910, [%rd191+190]; + prmt.b32 %r7911, %r7910, %r7909, 30212; + ld.local.u8 %r7912, [%rd191+191]; + prmt.b32 %r7913, %r7912, %r7911, 28756; + ld.local.u8 %r7914, [%rd191+192]; + prmt.b32 %r7915, %r7914, %r7913, 1620; + ld.local.u8 %r7916, [%rd191+193]; + ld.local.u8 %r7917, [%rd191+194]; + prmt.b32 %r7918, %r7917, %r7916, 30212; + ld.local.u8 %r7919, [%rd191+195]; + prmt.b32 %r7920, %r7919, %r7918, 28756; + ld.local.u8 %r7921, [%rd191+196]; + prmt.b32 %r7922, %r7921, %r7920, 1620; + ld.local.u8 %r7923, [%rd191+197]; + ld.local.u8 %r7924, [%rd191+198]; + prmt.b32 %r7925, %r7924, %r7923, 30212; + ld.local.u8 %r7926, [%rd191+199]; + prmt.b32 %r7927, %r7926, %r7925, 28756; + ld.local.u8 %r7928, [%rd191+200]; + prmt.b32 %r7929, %r7928, %r7927, 1620; + ld.local.u8 %r7930, [%rd191+201]; + ld.local.u8 %r7931, [%rd191+202]; + prmt.b32 %r7932, %r7931, %r7930, 30212; + ld.local.u8 %r7933, [%rd191+203]; + prmt.b32 %r7934, %r7933, %r7932, 28756; + ld.local.u8 %r7935, [%rd191+204]; + prmt.b32 %r7936, %r7935, %r7934, 1620; + ld.local.u8 %r7937, [%rd191+205]; + ld.local.u8 %r7938, 
[%rd191+206]; + prmt.b32 %r7939, %r7938, %r7937, 30212; + ld.local.u8 %r7940, [%rd191+207]; + prmt.b32 %r7941, %r7940, %r7939, 28756; + ld.local.u8 %r7942, [%rd191+208]; + prmt.b32 %r7943, %r7942, %r7941, 1620; + or.b32 %r7944, %r7831, 4; + ld.local.u8 %r7945, [%rd3+-120]; + ld.local.u8 %r7946, [%rd3+-119]; + prmt.b32 %r7947, %r7946, %r7945, 30212; + ld.local.u8 %r7948, [%rd3+-118]; + ld.local.u8 %r7949, [%rd3+-117]; + prmt.b32 %r7950, %r7949, %r7948, 30212; + prmt.b32 %r7951, %r7950, %r7947, 4180; + ld.local.u8 %r7952, [%rd3+-136]; + ld.local.u8 %r7953, [%rd3+-135]; + prmt.b32 %r7954, %r7953, %r7952, 30212; + ld.local.u8 %r7955, [%rd3+-134]; + ld.local.u8 %r7956, [%rd3+-133]; + prmt.b32 %r7957, %r7956, %r7955, 30212; + prmt.b32 %r7958, %r7957, %r7954, 4180; + add.s32 %r7959, %r7951, %r7958; + add.s32 %r7960, %r7959, %r7838; + shf.l.wrap.b32 %r7961, %r7960, %r7960, 16; + add.s32 %r7962, %r7961, 1779033703; + xor.b32 %r7963, %r7962, %r7951; + shf.l.wrap.b32 %r7964, %r7963, %r7963, 20; + add.s32 %r7965, %r7845, %r7960; + add.s32 %r7966, %r7965, %r7964; + xor.b32 %r7967, %r7966, %r7961; + shf.l.wrap.b32 %r7968, %r7967, %r7967, 24; + add.s32 %r7969, %r7968, %r7962; + xor.b32 %r7970, %r7969, %r7964; + shf.l.wrap.b32 %r7971, %r7970, %r7970, 25; + ld.local.u8 %r7972, [%rd3+-116]; + ld.local.u8 %r7973, [%rd3+-115]; + prmt.b32 %r7974, %r7973, %r7972, 30212; + ld.local.u8 %r7975, [%rd3+-114]; + ld.local.u8 %r7976, [%rd3+-113]; + prmt.b32 %r7977, %r7976, %r7975, 30212; + prmt.b32 %r7978, %r7977, %r7974, 4180; + ld.local.u8 %r7979, [%rd3+-132]; + ld.local.u8 %r7980, [%rd3+-131]; + prmt.b32 %r7981, %r7980, %r7979, 30212; + ld.local.u8 %r7982, [%rd3+-130]; + ld.local.u8 %r7983, [%rd3+-129]; + prmt.b32 %r7984, %r7983, %r7982, 30212; + prmt.b32 %r7985, %r7984, %r7981, 4180; + add.s32 %r7986, %r7978, %r7985; + add.s32 %r7987, %r7986, %r7852; + shf.l.wrap.b32 %r7988, %r7987, %r7987, 16; + add.s32 %r7989, %r7988, -1150833019; + xor.b32 %r7990, %r7989, %r7978; + shf.l.wrap.b32 %r7991, %r7990, %r7990, 20; + add.s32 %r7992, %r7859, %r7987; + add.s32 %r7993, %r7992, %r7991; + xor.b32 %r7994, %r7993, %r7988; + shf.l.wrap.b32 %r7995, %r7994, %r7994, 24; + add.s32 %r7996, %r7995, %r7989; + xor.b32 %r7997, %r7996, %r7991; + shf.l.wrap.b32 %r7998, %r7997, %r7997, 25; + ld.local.u8 %r7999, [%rd3+-112]; + ld.local.u8 %r8000, [%rd3+-111]; + prmt.b32 %r8001, %r8000, %r7999, 30212; + ld.local.u8 %r8002, [%rd3+-110]; + ld.local.u8 %r8003, [%rd3+-109]; + prmt.b32 %r8004, %r8003, %r8002, 30212; + prmt.b32 %r8005, %r8004, %r8001, 4180; + ld.local.u8 %r8006, [%rd3+-128]; + ld.local.u8 %r8007, [%rd3+-127]; + prmt.b32 %r8008, %r8007, %r8006, 30212; + ld.local.u8 %r8009, [%rd3+-126]; + ld.local.u8 %r8010, [%rd3+-125]; + prmt.b32 %r8011, %r8010, %r8009, 30212; + prmt.b32 %r8012, %r8011, %r8008, 4180; + add.s32 %r8013, %r8005, %r8012; + add.s32 %r8014, %r8013, %r7866; + shr.u32 %r8015, %r8014, 16; + shl.b32 %r8016, %r8014, 16; + xor.b32 %r8017, %r8016, 4194304; + or.b32 %r8018, %r8017, %r8015; + add.s32 %r8019, %r8018, 1013904242; + xor.b32 %r8020, %r8019, %r8005; + shf.l.wrap.b32 %r8021, %r8020, %r8020, 20; + add.s32 %r8022, %r7873, %r8014; + add.s32 %r8023, %r8022, %r8021; + xor.b32 %r8024, %r8023, %r8018; + shf.l.wrap.b32 %r8025, %r8024, %r8024, 24; + add.s32 %r8026, %r8025, %r8019; + xor.b32 %r8027, %r8026, %r8021; + shf.l.wrap.b32 %r8028, %r8027, %r8027, 25; + ld.local.u8 %r8029, [%rd3+-108]; + ld.local.u8 %r8030, [%rd3+-107]; + prmt.b32 %r8031, %r8030, %r8029, 30212; + ld.local.u8 %r8032, [%rd3+-106]; + ld.local.u8 %r8033, 
[%rd3+-105]; + prmt.b32 %r8034, %r8033, %r8032, 30212; + prmt.b32 %r8035, %r8034, %r8031, 4180; + ld.local.u8 %r8036, [%rd3+-124]; + ld.local.u8 %r8037, [%rd3+-123]; + prmt.b32 %r8038, %r8037, %r8036, 30212; + ld.local.u8 %r8039, [%rd3+-122]; + ld.local.u8 %r8040, [%rd3+-121]; + prmt.b32 %r8041, %r8040, %r8039, 30212; + prmt.b32 %r8042, %r8041, %r8038, 4180; + add.s32 %r8043, %r8035, %r8042; + add.s32 %r8044, %r8043, %r7880; + xor.b32 %r8045, %r8044, %r7944; + shr.u32 %r8046, %r8044, 16; + shl.b32 %r8047, %r8045, 16; + or.b32 %r8048, %r8047, %r8046; + add.s32 %r8049, %r8048, -1521486534; + xor.b32 %r8050, %r8049, %r8035; + shf.l.wrap.b32 %r8051, %r8050, %r8050, 20; + add.s32 %r8052, %r7887, %r8044; + add.s32 %r8053, %r8052, %r8051; + xor.b32 %r8054, %r8053, %r8048; + shf.l.wrap.b32 %r8055, %r8054, %r8054, 24; + add.s32 %r8056, %r8055, %r8049; + xor.b32 %r8057, %r8056, %r8051; + shf.l.wrap.b32 %r8058, %r8057, %r8057, 25; + add.s32 %r8059, %r7998, %r7966; + add.s32 %r8060, %r8059, %r7894; + xor.b32 %r8061, %r8055, %r8060; + shf.l.wrap.b32 %r8062, %r8061, %r8061, 16; + add.s32 %r8063, %r8062, %r8026; + xor.b32 %r8064, %r8063, %r7998; + shf.l.wrap.b32 %r8065, %r8064, %r8064, 20; + add.s32 %r8066, %r7901, %r8060; + add.s32 %r8067, %r8066, %r8065; + xor.b32 %r8068, %r8067, %r8062; + shf.l.wrap.b32 %r8069, %r8068, %r8068, 24; + add.s32 %r8070, %r8069, %r8063; + xor.b32 %r8071, %r8070, %r8065; + shf.l.wrap.b32 %r8072, %r8071, %r8071, 25; + add.s32 %r8073, %r8028, %r7993; + add.s32 %r8074, %r8073, %r7908; + xor.b32 %r8075, %r8074, %r7968; + shf.l.wrap.b32 %r8076, %r8075, %r8075, 16; + add.s32 %r8077, %r8076, %r8056; + xor.b32 %r8078, %r8077, %r8028; + shf.l.wrap.b32 %r8079, %r8078, %r8078, 20; + add.s32 %r8080, %r7915, %r8074; + add.s32 %r8081, %r8080, %r8079; + xor.b32 %r8082, %r8081, %r8076; + shf.l.wrap.b32 %r8083, %r8082, %r8082, 24; + add.s32 %r8084, %r8083, %r8077; + xor.b32 %r8085, %r8084, %r8079; + shf.l.wrap.b32 %r8086, %r8085, %r8085, 25; + add.s32 %r8087, %r8058, %r8023; + add.s32 %r8088, %r8087, %r7922; + xor.b32 %r8089, %r8088, %r7995; + shf.l.wrap.b32 %r8090, %r8089, %r8089, 16; + add.s32 %r8091, %r8090, %r7969; + xor.b32 %r8092, %r8091, %r8058; + shf.l.wrap.b32 %r8093, %r8092, %r8092, 20; + add.s32 %r8094, %r7929, %r8088; + add.s32 %r8095, %r8094, %r8093; + xor.b32 %r8096, %r8095, %r8090; + shf.l.wrap.b32 %r8097, %r8096, %r8096, 24; + add.s32 %r8098, %r8097, %r8091; + xor.b32 %r8099, %r8098, %r8093; + shf.l.wrap.b32 %r8100, %r8099, %r8099, 25; + add.s32 %r8101, %r8053, %r7971; + add.s32 %r8102, %r8101, %r7936; + xor.b32 %r8103, %r8102, %r8025; + shf.l.wrap.b32 %r8104, %r8103, %r8103, 16; + add.s32 %r8105, %r8104, %r7996; + xor.b32 %r8106, %r8105, %r7971; + shf.l.wrap.b32 %r8107, %r8106, %r8106, 20; + add.s32 %r8108, %r7943, %r8102; + add.s32 %r8109, %r8108, %r8107; + xor.b32 %r8110, %r8109, %r8104; + shf.l.wrap.b32 %r8111, %r8110, %r8110, 24; + add.s32 %r8112, %r8111, %r8105; + xor.b32 %r8113, %r8112, %r8107; + shf.l.wrap.b32 %r8114, %r8113, %r8113, 25; + add.s32 %r8115, %r8067, %r7852; + add.s32 %r8116, %r8115, %r8114; + xor.b32 %r8117, %r8116, %r8083; + shf.l.wrap.b32 %r8118, %r8117, %r8117, 16; + add.s32 %r8119, %r8118, %r8098; + xor.b32 %r8120, %r8119, %r8114; + shf.l.wrap.b32 %r8121, %r8120, %r8120, 20; + add.s32 %r8122, %r8116, %r7880; + add.s32 %r8123, %r8122, %r8121; + xor.b32 %r8124, %r8123, %r8118; + shf.l.wrap.b32 %r8125, %r8124, %r8124, 24; + add.s32 %r8126, %r8125, %r8119; + xor.b32 %r8127, %r8126, %r8121; + shf.l.wrap.b32 %r8128, %r8127, %r8127, 25; + add.s32 %r8129, 
%r8081, %r7859; + add.s32 %r8130, %r8129, %r8072; + xor.b32 %r8131, %r8097, %r8130; + shf.l.wrap.b32 %r8132, %r8131, %r8131, 16; + add.s32 %r8133, %r8112, %r8132; + xor.b32 %r8134, %r8133, %r8072; + shf.l.wrap.b32 %r8135, %r8134, %r8134, 20; + add.s32 %r8136, %r8130, %r7908; + add.s32 %r8137, %r8136, %r8135; + xor.b32 %r8138, %r8137, %r8132; + shf.l.wrap.b32 %r8139, %r8138, %r8138, 24; + add.s32 %r8140, %r8139, %r8133; + xor.b32 %r8141, %r8140, %r8135; + shf.l.wrap.b32 %r8142, %r8141, %r8141, 25; + add.s32 %r8143, %r8086, %r7887; + add.s32 %r8144, %r8143, %r8095; + xor.b32 %r8145, %r8111, %r8144; + shf.l.wrap.b32 %r8146, %r8145, %r8145, 16; + add.s32 %r8147, %r8146, %r8070; + xor.b32 %r8148, %r8147, %r8086; + shf.l.wrap.b32 %r8149, %r8148, %r8148, 20; + add.s32 %r8150, %r8144, %r7838; + add.s32 %r8151, %r8150, %r8149; + xor.b32 %r8152, %r8151, %r8146; + shf.l.wrap.b32 %r8153, %r8152, %r8152, 24; + add.s32 %r8154, %r8153, %r8147; + xor.b32 %r8155, %r8154, %r8149; + shf.l.wrap.b32 %r8156, %r8155, %r8155, 25; + add.s32 %r8157, %r8100, %r7866; + add.s32 %r8158, %r8157, %r8109; + xor.b32 %r8159, %r8158, %r8069; + shf.l.wrap.b32 %r8160, %r8159, %r8159, 16; + add.s32 %r8161, %r8160, %r8084; + xor.b32 %r8162, %r8161, %r8100; + shf.l.wrap.b32 %r8163, %r8162, %r8162, 20; + add.s32 %r8164, %r8158, %r7929; + add.s32 %r8165, %r8164, %r8163; + xor.b32 %r8166, %r8165, %r8160; + shf.l.wrap.b32 %r8167, %r8166, %r8166, 24; + add.s32 %r8168, %r8167, %r8161; + xor.b32 %r8169, %r8168, %r8163; + shf.l.wrap.b32 %r8170, %r8169, %r8169, 25; + add.s32 %r8171, %r8142, %r7845; + add.s32 %r8172, %r8171, %r8123; + xor.b32 %r8173, %r8172, %r8167; + shf.l.wrap.b32 %r8174, %r8173, %r8173, 16; + add.s32 %r8175, %r8174, %r8154; + xor.b32 %r8176, %r8175, %r8142; + shf.l.wrap.b32 %r8177, %r8176, %r8176, 20; + add.s32 %r8178, %r8172, %r7915; + add.s32 %r8179, %r8178, %r8177; + xor.b32 %r8180, %r8179, %r8174; + shf.l.wrap.b32 %r8181, %r8180, %r8180, 24; + add.s32 %r8182, %r8181, %r8175; + xor.b32 %r8183, %r8182, %r8177; + shf.l.wrap.b32 %r8184, %r8183, %r8183, 25; + add.s32 %r8185, %r8137, %r7922; + add.s32 %r8186, %r8185, %r8156; + xor.b32 %r8187, %r8125, %r8186; + shf.l.wrap.b32 %r8188, %r8187, %r8187, 16; + add.s32 %r8189, %r8188, %r8168; + xor.b32 %r8190, %r8189, %r8156; + shf.l.wrap.b32 %r8191, %r8190, %r8190, 20; + add.s32 %r8192, %r8186, %r7873; + add.s32 %r8193, %r8192, %r8191; + xor.b32 %r8194, %r8193, %r8188; + shf.l.wrap.b32 %r8195, %r8194, %r8194, 24; + add.s32 %r8196, %r8195, %r8189; + xor.b32 %r8197, %r8196, %r8191; + shf.l.wrap.b32 %r8198, %r8197, %r8197, 25; + add.s32 %r8199, %r8151, %r7901; + add.s32 %r8200, %r8199, %r8170; + xor.b32 %r8201, %r8200, %r8139; + shf.l.wrap.b32 %r8202, %r8201, %r8201, 16; + add.s32 %r8203, %r8202, %r8126; + xor.b32 %r8204, %r8203, %r8170; + shf.l.wrap.b32 %r8205, %r8204, %r8204, 20; + add.s32 %r8206, %r8200, %r7936; + add.s32 %r8207, %r8206, %r8205; + xor.b32 %r8208, %r8207, %r8202; + shf.l.wrap.b32 %r8209, %r8208, %r8208, 24; + add.s32 %r8210, %r8209, %r8203; + xor.b32 %r8211, %r8210, %r8205; + shf.l.wrap.b32 %r8212, %r8211, %r8211, 25; + add.s32 %r8213, %r8165, %r7943; + add.s32 %r8214, %r8213, %r8128; + xor.b32 %r8215, %r8214, %r8153; + shf.l.wrap.b32 %r8216, %r8215, %r8215, 16; + add.s32 %r8217, %r8216, %r8140; + xor.b32 %r8218, %r8217, %r8128; + shf.l.wrap.b32 %r8219, %r8218, %r8218, 20; + add.s32 %r8220, %r8214, %r7894; + add.s32 %r8221, %r8220, %r8219; + xor.b32 %r8222, %r8221, %r8216; + shf.l.wrap.b32 %r8223, %r8222, %r8222, 24; + add.s32 %r8224, %r8223, %r8217; + xor.b32 
%r8225, %r8224, %r8219; + shf.l.wrap.b32 %r8226, %r8225, %r8225, 25; + add.s32 %r8227, %r8179, %r7859; + add.s32 %r8228, %r8227, %r8226; + xor.b32 %r8229, %r8228, %r8195; + shf.l.wrap.b32 %r8230, %r8229, %r8229, 16; + add.s32 %r8231, %r8230, %r8210; + xor.b32 %r8232, %r8231, %r8226; + shf.l.wrap.b32 %r8233, %r8232, %r8232, 20; + add.s32 %r8234, %r8228, %r7866; + add.s32 %r8235, %r8234, %r8233; + xor.b32 %r8236, %r8235, %r8230; + shf.l.wrap.b32 %r8237, %r8236, %r8236, 24; + add.s32 %r8238, %r8237, %r8231; + xor.b32 %r8239, %r8238, %r8233; + shf.l.wrap.b32 %r8240, %r8239, %r8239, 25; + add.s32 %r8241, %r8193, %r7908; + add.s32 %r8242, %r8241, %r8184; + xor.b32 %r8243, %r8242, %r8209; + shf.l.wrap.b32 %r8244, %r8243, %r8243, 16; + add.s32 %r8245, %r8244, %r8224; + xor.b32 %r8246, %r8245, %r8184; + shf.l.wrap.b32 %r8247, %r8246, %r8246, 20; + add.s32 %r8248, %r8242, %r7922; + add.s32 %r8249, %r8248, %r8247; + xor.b32 %r8250, %r8249, %r8244; + shf.l.wrap.b32 %r8251, %r8250, %r8250, 24; + add.s32 %r8252, %r8251, %r8245; + xor.b32 %r8253, %r8252, %r8247; + shf.l.wrap.b32 %r8254, %r8253, %r8253, 25; + add.s32 %r8255, %r8207, %r7929; + add.s32 %r8256, %r8255, %r8198; + xor.b32 %r8257, %r8223, %r8256; + shf.l.wrap.b32 %r8258, %r8257, %r8257, 16; + add.s32 %r8259, %r8258, %r8182; + xor.b32 %r8260, %r8259, %r8198; + shf.l.wrap.b32 %r8261, %r8260, %r8260, 20; + add.s32 %r8262, %r8256, %r7852; + add.s32 %r8263, %r8262, %r8261; + xor.b32 %r8264, %r8263, %r8258; + shf.l.wrap.b32 %r8265, %r8264, %r8264, 24; + add.s32 %r8266, %r8265, %r8259; + xor.b32 %r8267, %r8266, %r8261; + shf.l.wrap.b32 %r8268, %r8267, %r8267, 25; + add.s32 %r8269, %r8212, %r7887; + add.s32 %r8270, %r8269, %r8221; + xor.b32 %r8271, %r8270, %r8181; + shf.l.wrap.b32 %r8272, %r8271, %r8271, 16; + add.s32 %r8273, %r8272, %r8196; + xor.b32 %r8274, %r8273, %r8212; + shf.l.wrap.b32 %r8275, %r8274, %r8274, 20; + add.s32 %r8276, %r8270, %r7936; + add.s32 %r8277, %r8276, %r8275; + xor.b32 %r8278, %r8277, %r8272; + shf.l.wrap.b32 %r8279, %r8278, %r8278, 24; + add.s32 %r8280, %r8279, %r8273; + xor.b32 %r8281, %r8280, %r8275; + shf.l.wrap.b32 %r8282, %r8281, %r8281, 25; + add.s32 %r8283, %r8235, %r7880; + add.s32 %r8284, %r8283, %r8254; + xor.b32 %r8285, %r8284, %r8279; + shf.l.wrap.b32 %r8286, %r8285, %r8285, 16; + add.s32 %r8287, %r8286, %r8266; + xor.b32 %r8288, %r8287, %r8254; + shf.l.wrap.b32 %r8289, %r8288, %r8288, 20; + add.s32 %r8290, %r8284, %r7873; + add.s32 %r8291, %r8290, %r8289; + xor.b32 %r8292, %r8291, %r8286; + shf.l.wrap.b32 %r8293, %r8292, %r8292, 24; + add.s32 %r8294, %r8293, %r8287; + xor.b32 %r8295, %r8294, %r8289; + shf.l.wrap.b32 %r8296, %r8295, %r8295, 25; + add.s32 %r8297, %r8249, %r7901; + add.s32 %r8298, %r8297, %r8268; + xor.b32 %r8299, %r8237, %r8298; + shf.l.wrap.b32 %r8300, %r8299, %r8299, 16; + add.s32 %r8301, %r8300, %r8280; + xor.b32 %r8302, %r8301, %r8268; + shf.l.wrap.b32 %r8303, %r8302, %r8302, 20; + add.s32 %r8304, %r8298, %r7838; + add.s32 %r8305, %r8304, %r8303; + xor.b32 %r8306, %r8305, %r8300; + shf.l.wrap.b32 %r8307, %r8306, %r8306, 24; + add.s32 %r8308, %r8307, %r8301; + xor.b32 %r8309, %r8308, %r8303; + shf.l.wrap.b32 %r8310, %r8309, %r8309, 25; + add.s32 %r8311, %r8263, %r7915; + add.s32 %r8312, %r8311, %r8282; + xor.b32 %r8313, %r8312, %r8251; + shf.l.wrap.b32 %r8314, %r8313, %r8313, 16; + add.s32 %r8315, %r8314, %r8238; + xor.b32 %r8316, %r8315, %r8282; + shf.l.wrap.b32 %r8317, %r8316, %r8316, 20; + add.s32 %r8318, %r8312, %r7943; + add.s32 %r8319, %r8318, %r8317; + xor.b32 %r8320, %r8319, %r8314; + 
shf.l.wrap.b32 %r8321, %r8320, %r8320, 24; + add.s32 %r8322, %r8321, %r8315; + xor.b32 %r8323, %r8322, %r8317; + shf.l.wrap.b32 %r8324, %r8323, %r8323, 25; + add.s32 %r8325, %r8277, %r7894; + add.s32 %r8326, %r8325, %r8240; + xor.b32 %r8327, %r8326, %r8265; + shf.l.wrap.b32 %r8328, %r8327, %r8327, 16; + add.s32 %r8329, %r8328, %r8252; + xor.b32 %r8330, %r8329, %r8240; + shf.l.wrap.b32 %r8331, %r8330, %r8330, 20; + add.s32 %r8332, %r8326, %r7845; + add.s32 %r8333, %r8332, %r8331; + xor.b32 %r8334, %r8333, %r8328; + shf.l.wrap.b32 %r8335, %r8334, %r8334, 24; + add.s32 %r8336, %r8335, %r8329; + xor.b32 %r8337, %r8336, %r8331; + shf.l.wrap.b32 %r8338, %r8337, %r8337, 25; + add.s32 %r8339, %r8291, %r7908; + add.s32 %r8340, %r8339, %r8338; + xor.b32 %r8341, %r8340, %r8307; + shf.l.wrap.b32 %r8342, %r8341, %r8341, 16; + add.s32 %r8343, %r8342, %r8322; + xor.b32 %r8344, %r8343, %r8338; + shf.l.wrap.b32 %r8345, %r8344, %r8344, 20; + add.s32 %r8346, %r8340, %r7887; + add.s32 %r8347, %r8346, %r8345; + xor.b32 %r8348, %r8347, %r8342; + shf.l.wrap.b32 %r8349, %r8348, %r8348, 24; + add.s32 %r8350, %r8349, %r8343; + xor.b32 %r8351, %r8350, %r8345; + shf.l.wrap.b32 %r8352, %r8351, %r8351, 25; + add.s32 %r8353, %r8305, %r7922; + add.s32 %r8354, %r8353, %r8296; + xor.b32 %r8355, %r8354, %r8321; + shf.l.wrap.b32 %r8356, %r8355, %r8355, 16; + add.s32 %r8357, %r8356, %r8336; + xor.b32 %r8358, %r8357, %r8296; + shf.l.wrap.b32 %r8359, %r8358, %r8358, 20; + add.s32 %r8360, %r8354, %r7901; + add.s32 %r8361, %r8360, %r8359; + xor.b32 %r8362, %r8361, %r8356; + shf.l.wrap.b32 %r8363, %r8362, %r8362, 24; + add.s32 %r8364, %r8363, %r8357; + xor.b32 %r8365, %r8364, %r8359; + shf.l.wrap.b32 %r8366, %r8365, %r8365, 25; + add.s32 %r8367, %r8319, %r7936; + add.s32 %r8368, %r8367, %r8310; + xor.b32 %r8369, %r8335, %r8368; + shf.l.wrap.b32 %r8370, %r8369, %r8369, 16; + add.s32 %r8371, %r8370, %r8294; + xor.b32 %r8372, %r8371, %r8310; + shf.l.wrap.b32 %r8373, %r8372, %r8372, 20; + add.s32 %r8374, %r8368, %r7859; + add.s32 %r8375, %r8374, %r8373; + xor.b32 %r8376, %r8375, %r8370; + shf.l.wrap.b32 %r8377, %r8376, %r8376, 24; + add.s32 %r8378, %r8377, %r8371; + xor.b32 %r8379, %r8378, %r8373; + shf.l.wrap.b32 %r8380, %r8379, %r8379, 25; + add.s32 %r8381, %r8333, %r7929; + add.s32 %r8382, %r8381, %r8324; + xor.b32 %r8383, %r8382, %r8293; + shf.l.wrap.b32 %r8384, %r8383, %r8383, 16; + add.s32 %r8385, %r8384, %r8308; + xor.b32 %r8386, %r8385, %r8324; + shf.l.wrap.b32 %r8387, %r8386, %r8386, 20; + add.s32 %r8388, %r8382, %r7943; + add.s32 %r8389, %r8388, %r8387; + xor.b32 %r8390, %r8389, %r8384; + shf.l.wrap.b32 %r8391, %r8390, %r8390, 24; + add.s32 %r8392, %r8391, %r8385; + xor.b32 %r8393, %r8392, %r8387; + shf.l.wrap.b32 %r8394, %r8393, %r8393, 25; + add.s32 %r8395, %r8347, %r7866; + add.s32 %r8396, %r8395, %r8366; + xor.b32 %r8397, %r8396, %r8391; + shf.l.wrap.b32 %r8398, %r8397, %r8397, 16; + add.s32 %r8399, %r8398, %r8378; + xor.b32 %r8400, %r8399, %r8366; + shf.l.wrap.b32 %r8401, %r8400, %r8400, 20; + add.s32 %r8402, %r8396, %r7838; + add.s32 %r8403, %r8402, %r8401; + xor.b32 %r8404, %r8403, %r8398; + shf.l.wrap.b32 %r8405, %r8404, %r8404, 24; + add.s32 %r8406, %r8405, %r8399; + xor.b32 %r8407, %r8406, %r8401; + shf.l.wrap.b32 %r8408, %r8407, %r8407, 25; + add.s32 %r8409, %r8361, %r7915; + add.s32 %r8410, %r8409, %r8380; + xor.b32 %r8411, %r8349, %r8410; + shf.l.wrap.b32 %r8412, %r8411, %r8411, 16; + add.s32 %r8413, %r8412, %r8392; + xor.b32 %r8414, %r8413, %r8380; + shf.l.wrap.b32 %r8415, %r8414, %r8414, 20; + add.s32 %r8416, 
%r8410, %r7852; + add.s32 %r8417, %r8416, %r8415; + xor.b32 %r8418, %r8417, %r8412; + shf.l.wrap.b32 %r8419, %r8418, %r8418, 24; + add.s32 %r8420, %r8419, %r8413; + xor.b32 %r8421, %r8420, %r8415; + shf.l.wrap.b32 %r8422, %r8421, %r8421, 25; + add.s32 %r8423, %r8375, %r7873; + add.s32 %r8424, %r8423, %r8394; + xor.b32 %r8425, %r8424, %r8363; + shf.l.wrap.b32 %r8426, %r8425, %r8425, 16; + add.s32 %r8427, %r8426, %r8350; + xor.b32 %r8428, %r8427, %r8394; + shf.l.wrap.b32 %r8429, %r8428, %r8428, 20; + add.s32 %r8430, %r8424, %r7894; + add.s32 %r8431, %r8430, %r8429; + xor.b32 %r8432, %r8431, %r8426; + shf.l.wrap.b32 %r8433, %r8432, %r8432, 24; + add.s32 %r8434, %r8433, %r8427; + xor.b32 %r8435, %r8434, %r8429; + shf.l.wrap.b32 %r8436, %r8435, %r8435, 25; + add.s32 %r8437, %r8389, %r7845; + add.s32 %r8438, %r8437, %r8352; + xor.b32 %r8439, %r8438, %r8377; + shf.l.wrap.b32 %r8440, %r8439, %r8439, 16; + add.s32 %r8441, %r8440, %r8364; + xor.b32 %r8442, %r8441, %r8352; + shf.l.wrap.b32 %r8443, %r8442, %r8442, 20; + add.s32 %r8444, %r8438, %r7880; + add.s32 %r8445, %r8444, %r8443; + xor.b32 %r8446, %r8445, %r8440; + shf.l.wrap.b32 %r8447, %r8446, %r8446, 24; + add.s32 %r8448, %r8447, %r8441; + xor.b32 %r8449, %r8448, %r8443; + shf.l.wrap.b32 %r8450, %r8449, %r8449, 25; + add.s32 %r8451, %r8403, %r7922; + add.s32 %r8452, %r8451, %r8450; + xor.b32 %r8453, %r8452, %r8419; + shf.l.wrap.b32 %r8454, %r8453, %r8453, 16; + add.s32 %r8455, %r8454, %r8434; + xor.b32 %r8456, %r8455, %r8450; + shf.l.wrap.b32 %r8457, %r8456, %r8456, 20; + add.s32 %r8458, %r8452, %r7929; + add.s32 %r8459, %r8458, %r8457; + xor.b32 %r8460, %r8459, %r8454; + shf.l.wrap.b32 %r8461, %r8460, %r8460, 24; + add.s32 %r8462, %r8461, %r8455; + xor.b32 %r8463, %r8462, %r8457; + shf.l.wrap.b32 %r8464, %r8463, %r8463, 25; + add.s32 %r8465, %r8417, %r7901; + add.s32 %r8466, %r8465, %r8408; + xor.b32 %r8467, %r8466, %r8433; + shf.l.wrap.b32 %r8468, %r8467, %r8467, 16; + add.s32 %r8469, %r8468, %r8448; + xor.b32 %r8470, %r8469, %r8408; + shf.l.wrap.b32 %r8471, %r8470, %r8470, 20; + add.s32 %r8472, %r8466, %r7915; + add.s32 %r8473, %r8472, %r8471; + xor.b32 %r8474, %r8473, %r8468; + shf.l.wrap.b32 %r8475, %r8474, %r8474, 24; + add.s32 %r8476, %r8475, %r8469; + xor.b32 %r8477, %r8476, %r8471; + shf.l.wrap.b32 %r8478, %r8477, %r8477, 25; + add.s32 %r8479, %r8431, %r7943; + add.s32 %r8480, %r8479, %r8422; + xor.b32 %r8481, %r8447, %r8480; + shf.l.wrap.b32 %r8482, %r8481, %r8481, 16; + add.s32 %r8483, %r8482, %r8406; + xor.b32 %r8484, %r8483, %r8422; + shf.l.wrap.b32 %r8485, %r8484, %r8484, 20; + add.s32 %r8486, %r8480, %r7908; + add.s32 %r8487, %r8486, %r8485; + xor.b32 %r8488, %r8487, %r8482; + shf.l.wrap.b32 %r8489, %r8488, %r8488, 24; + add.s32 %r8490, %r8489, %r8483; + xor.b32 %r8491, %r8490, %r8485; + shf.l.wrap.b32 %r8492, %r8491, %r8491, 25; + add.s32 %r8493, %r8445, %r7936; + add.s32 %r8494, %r8493, %r8436; + xor.b32 %r8495, %r8494, %r8405; + shf.l.wrap.b32 %r8496, %r8495, %r8495, 16; + add.s32 %r8497, %r8496, %r8420; + xor.b32 %r8498, %r8497, %r8436; + shf.l.wrap.b32 %r8499, %r8498, %r8498, 20; + add.s32 %r8500, %r8494, %r7894; + add.s32 %r8501, %r8500, %r8499; + xor.b32 %r8502, %r8501, %r8496; + shf.l.wrap.b32 %r8503, %r8502, %r8502, 24; + add.s32 %r8504, %r8503, %r8497; + xor.b32 %r8505, %r8504, %r8499; + shf.l.wrap.b32 %r8506, %r8505, %r8505, 25; + add.s32 %r8507, %r8459, %r7887; + add.s32 %r8508, %r8507, %r8478; + xor.b32 %r8509, %r8508, %r8503; + shf.l.wrap.b32 %r8510, %r8509, %r8509, 16; + add.s32 %r8511, %r8510, %r8490; + xor.b32 
%r8512, %r8511, %r8478; + shf.l.wrap.b32 %r8513, %r8512, %r8512, 20; + add.s32 %r8514, %r8508, %r7852; + add.s32 %r8515, %r8514, %r8513; + xor.b32 %r8516, %r8515, %r8510; + shf.l.wrap.b32 %r8517, %r8516, %r8516, 24; + add.s32 %r8518, %r8517, %r8511; + xor.b32 %r8519, %r8518, %r8513; + shf.l.wrap.b32 %r8520, %r8519, %r8519, 25; + add.s32 %r8521, %r8473, %r7873; + add.s32 %r8522, %r8521, %r8492; + xor.b32 %r8523, %r8461, %r8522; + shf.l.wrap.b32 %r8524, %r8523, %r8523, 16; + add.s32 %r8525, %r8524, %r8504; + xor.b32 %r8526, %r8525, %r8492; + shf.l.wrap.b32 %r8527, %r8526, %r8526, 20; + add.s32 %r8528, %r8522, %r7859; + add.s32 %r8529, %r8528, %r8527; + xor.b32 %r8530, %r8529, %r8524; + shf.l.wrap.b32 %r8531, %r8530, %r8530, 24; + add.s32 %r8532, %r8531, %r8525; + xor.b32 %r8533, %r8532, %r8527; + shf.l.wrap.b32 %r8534, %r8533, %r8533, 25; + add.s32 %r8535, %r8487, %r7838; + add.s32 %r8536, %r8535, %r8506; + xor.b32 %r8537, %r8536, %r8475; + shf.l.wrap.b32 %r8538, %r8537, %r8537, 16; + add.s32 %r8539, %r8538, %r8462; + xor.b32 %r8540, %r8539, %r8506; + shf.l.wrap.b32 %r8541, %r8540, %r8540, 20; + add.s32 %r8542, %r8536, %r7845; + add.s32 %r8543, %r8542, %r8541; + xor.b32 %r8544, %r8543, %r8538; + shf.l.wrap.b32 %r8545, %r8544, %r8544, 24; + add.s32 %r8546, %r8545, %r8539; + xor.b32 %r8547, %r8546, %r8541; + shf.l.wrap.b32 %r8548, %r8547, %r8547, 25; + add.s32 %r8549, %r8501, %r7880; + add.s32 %r8550, %r8549, %r8464; + xor.b32 %r8551, %r8550, %r8489; + shf.l.wrap.b32 %r8552, %r8551, %r8551, 16; + add.s32 %r8553, %r8552, %r8476; + xor.b32 %r8554, %r8553, %r8464; + shf.l.wrap.b32 %r8555, %r8554, %r8554, 20; + add.s32 %r8556, %r8550, %r7866; + add.s32 %r8557, %r8556, %r8555; + xor.b32 %r8558, %r8557, %r8552; + shf.l.wrap.b32 %r8559, %r8558, %r8558, 24; + add.s32 %r8560, %r8559, %r8553; + xor.b32 %r8561, %r8560, %r8555; + shf.l.wrap.b32 %r8562, %r8561, %r8561, 25; + add.s32 %r8563, %r8515, %r7901; + add.s32 %r8564, %r8563, %r8562; + xor.b32 %r8565, %r8564, %r8531; + shf.l.wrap.b32 %r8566, %r8565, %r8565, 16; + add.s32 %r8567, %r8566, %r8546; + xor.b32 %r8568, %r8567, %r8562; + shf.l.wrap.b32 %r8569, %r8568, %r8568, 20; + add.s32 %r8570, %r8564, %r7936; + add.s32 %r8571, %r8570, %r8569; + xor.b32 %r8572, %r8571, %r8566; + shf.l.wrap.b32 %r8573, %r8572, %r8572, 24; + add.s32 %r8574, %r8573, %r8567; + xor.b32 %r8575, %r8574, %r8569; + shf.l.wrap.b32 %r8576, %r8575, %r8575, 25; + add.s32 %r8577, %r8529, %r7915; + add.s32 %r8578, %r8577, %r8520; + xor.b32 %r8579, %r8578, %r8545; + shf.l.wrap.b32 %r8580, %r8579, %r8579, 16; + add.s32 %r8581, %r8580, %r8560; + xor.b32 %r8582, %r8581, %r8520; + shf.l.wrap.b32 %r8583, %r8582, %r8582, 20; + add.s32 %r8584, %r8578, %r7873; + add.s32 %r8585, %r8584, %r8583; + xor.b32 %r8586, %r8585, %r8580; + shf.l.wrap.b32 %r8587, %r8586, %r8586, 24; + add.s32 %r8588, %r8587, %r8581; + xor.b32 %r8589, %r8588, %r8583; + shf.l.wrap.b32 %r8590, %r8589, %r8589, 25; + add.s32 %r8591, %r8543, %r7894; + add.s32 %r8592, %r8591, %r8534; + xor.b32 %r8593, %r8559, %r8592; + shf.l.wrap.b32 %r8594, %r8593, %r8593, 16; + add.s32 %r8595, %r8594, %r8518; + xor.b32 %r8596, %r8595, %r8534; + shf.l.wrap.b32 %r8597, %r8596, %r8596, 20; + add.s32 %r8598, %r8592, %r7922; + add.s32 %r8599, %r8598, %r8597; + xor.b32 %r8600, %r8599, %r8594; + shf.l.wrap.b32 %r8601, %r8600, %r8600, 24; + add.s32 %r8602, %r8601, %r8595; + xor.b32 %r8603, %r8602, %r8597; + shf.l.wrap.b32 %r8604, %r8603, %r8603, 25; + add.s32 %r8605, %r8557, %r7943; + add.s32 %r8606, %r8605, %r8548; + xor.b32 %r8607, %r8606, %r8517; + 
shf.l.wrap.b32 %r8608, %r8607, %r8607, 16; + add.s32 %r8609, %r8608, %r8532; + xor.b32 %r8610, %r8609, %r8548; + shf.l.wrap.b32 %r8611, %r8610, %r8610, 20; + add.s32 %r8612, %r8606, %r7845; + add.s32 %r8613, %r8612, %r8611; + xor.b32 %r8614, %r8613, %r8608; + shf.l.wrap.b32 %r8615, %r8614, %r8614, 24; + add.s32 %r8616, %r8615, %r8609; + xor.b32 %r8617, %r8616, %r8611; + shf.l.wrap.b32 %r8618, %r8617, %r8617, 25; + add.s32 %r8619, %r8571, %r7929; + add.s32 %r8620, %r8619, %r8590; + xor.b32 %r8621, %r8620, %r8615; + shf.l.wrap.b32 %r8622, %r8621, %r8621, 16; + add.s32 %r8623, %r8622, %r8602; + xor.b32 %r8624, %r8623, %r8590; + shf.l.wrap.b32 %r8625, %r8624, %r8624, 20; + add.s32 %r8626, %r8620, %r7859; + add.s32 %r8627, %r8626, %r8625; + xor.b32 %r8628, %r8627, %r8622; + shf.l.wrap.b32 %r8629, %r8628, %r8628, 24; + add.s32 %r8630, %r8629, %r8623; + xor.b32 %r8631, %r8630, %r8625; + shf.l.wrap.b32 %r8632, %r8631, %r8631, 25; + add.s32 %r8633, %r8585, %r7838; + add.s32 %r8634, %r8633, %r8604; + xor.b32 %r8635, %r8573, %r8634; + shf.l.wrap.b32 %r8636, %r8635, %r8635, 16; + add.s32 %r8637, %r8636, %r8616; + xor.b32 %r8638, %r8637, %r8604; + shf.l.wrap.b32 %r8639, %r8638, %r8638, 20; + add.s32 %r8640, %r8634, %r7908; + add.s32 %r8641, %r8640, %r8639; + xor.b32 %r8642, %r8641, %r8636; + shf.l.wrap.b32 %r8643, %r8642, %r8642, 24; + add.s32 %r8644, %r8643, %r8637; + xor.b32 %r8645, %r8644, %r8639; + shf.l.wrap.b32 %r8646, %r8645, %r8645, 25; + add.s32 %r8647, %r8599, %r7852; + add.s32 %r8648, %r8647, %r8618; + xor.b32 %r8649, %r8648, %r8587; + shf.l.wrap.b32 %r8650, %r8649, %r8649, 16; + add.s32 %r8651, %r8650, %r8574; + xor.b32 %r8652, %r8651, %r8618; + shf.l.wrap.b32 %r8653, %r8652, %r8652, 20; + add.s32 %r8654, %r8648, %r7880; + add.s32 %r8655, %r8654, %r8653; + xor.b32 %r8656, %r8655, %r8650; + shf.l.wrap.b32 %r8657, %r8656, %r8656, 24; + add.s32 %r8658, %r8657, %r8651; + xor.b32 %r8659, %r8658, %r8653; + shf.l.wrap.b32 %r8660, %r8659, %r8659, 25; + add.s32 %r8661, %r8613, %r7866; + add.s32 %r8662, %r8661, %r8576; + xor.b32 %r8663, %r8662, %r8601; + shf.l.wrap.b32 %r8664, %r8663, %r8663, 16; + add.s32 %r8665, %r8664, %r8588; + xor.b32 %r8666, %r8665, %r8576; + shf.l.wrap.b32 %r8667, %r8666, %r8666, 20; + add.s32 %r8668, %r8662, %r7887; + add.s32 %r8669, %r8668, %r8667; + xor.b32 %r8670, %r8669, %r8664; + shf.l.wrap.b32 %r8671, %r8670, %r8670, 24; + add.s32 %r8672, %r8671, %r8665; + xor.b32 %r8673, %r8672, %r8667; + shf.l.wrap.b32 %r8674, %r8673, %r8673, 25; + add.s32 %r8675, %r8627, %r7915; + add.s32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r8676, %r8643; + shf.l.wrap.b32 %r8678, %r8677, %r8677, 16; + add.s32 %r8679, %r8678, %r8658; + xor.b32 %r8680, %r8679, %r8674; + shf.l.wrap.b32 %r8681, %r8680, %r8680, 20; + add.s32 %r8682, %r8676, %r7943; + add.s32 %r8683, %r8682, %r8681; + xor.b32 %r8684, %r8683, %r8678; + shf.l.wrap.b32 %r8685, %r8684, %r8684, 24; + add.s32 %r8686, %r8685, %r8679; + xor.b32 %r8687, %r8686, %r8681; + shf.l.wrap.b32 %r8688, %r8687, %r8687, 25; + add.s32 %r8689, %r8641, %r7873; + add.s32 %r8690, %r8689, %r8632; + xor.b32 %r8691, %r8690, %r8657; + shf.l.wrap.b32 %r8692, %r8691, %r8691, 16; + add.s32 %r8693, %r8692, %r8672; + xor.b32 %r8694, %r8693, %r8632; + shf.l.wrap.b32 %r8695, %r8694, %r8694, 20; + add.s32 %r8696, %r8690, %r7838; + add.s32 %r8697, %r8696, %r8695; + xor.b32 %r8698, %r8697, %r8692; + shf.l.wrap.b32 %r8699, %r8698, %r8698, 24; + add.s32 %r8700, %r8699, %r8693; + xor.b32 %r8701, %r8700, %r8695; + shf.l.wrap.b32 %r8702, %r8701, %r8701, 25; + add.s32 %r8703, 
%r8655, %r7845; + add.s32 %r8704, %r8703, %r8646; + xor.b32 %r8705, %r8671, %r8704; + shf.l.wrap.b32 %r8706, %r8705, %r8705, 16; + add.s32 %r8707, %r8706, %r8630; + xor.b32 %r8708, %r8707, %r8646; + shf.l.wrap.b32 %r8709, %r8708, %r8708, 20; + add.s32 %r8710, %r8704, %r7901; + add.s32 %r8711, %r8710, %r8709; + xor.b32 %r8712, %r8711, %r8706; + shf.l.wrap.b32 %r8713, %r8712, %r8712, 24; + add.s32 %r8714, %r8713, %r8707; + xor.b32 %r8715, %r8714, %r8709; + shf.l.wrap.b32 %r8716, %r8715, %r8715, 25; + add.s32 %r8717, %r8669, %r7894; + add.s32 %r8718, %r8717, %r8660; + xor.b32 %r8719, %r8718, %r8629; + shf.l.wrap.b32 %r8720, %r8719, %r8719, 16; + add.s32 %r8721, %r8720, %r8644; + xor.b32 %r8722, %r8721, %r8660; + shf.l.wrap.b32 %r8723, %r8722, %r8722, 20; + add.s32 %r8724, %r8718, %r7880; + add.s32 %r8725, %r8724, %r8723; + xor.b32 %r8726, %r8725, %r8720; + shf.l.wrap.b32 %r8727, %r8726, %r8726, 24; + add.s32 %r8728, %r8727, %r8721; + xor.b32 %r8729, %r8728, %r8723; + shf.l.wrap.b32 %r8730, %r8729, %r8729, 25; + add.s32 %r8731, %r8683, %r7936; + add.s32 %r8732, %r8731, %r8702; + xor.b32 %r8733, %r8732, %r8727; + shf.l.wrap.b32 %r8734, %r8733, %r8733, 16; + add.s32 %r8735, %r8734, %r8714; + xor.b32 %r8736, %r8735, %r8702; + shf.l.wrap.b32 %r8737, %r8736, %r8736, 20; + add.s32 %r8738, %r8732, %r7908; + add.s32 %r8739, %r8738, %r8737; + xor.b32 %r8740, %r8739, %r8734; + shf.l.wrap.b32 %r8741, %r8740, %r8740, 24; + add.s32 %r8742, %r8741, %r8735; + xor.b32 %r8743, %r8742, %r8737; + shf.l.wrap.b32 %r8744, %r8743, %r8743, 25; + add.s32 %r8745, %r8697, %r7852; + add.s32 %r8746, %r8745, %r8716; + xor.b32 %r8747, %r8685, %r8746; + shf.l.wrap.b32 %r8748, %r8747, %r8747, 16; + add.s32 %r8749, %r8748, %r8728; + xor.b32 %r8750, %r8749, %r8716; + shf.l.wrap.b32 %r8751, %r8750, %r8750, 20; + add.s32 %r8752, %r8746, %r7922; + add.s32 %r8753, %r8752, %r8751; + xor.b32 %r8754, %r8753, %r8748; + shf.l.wrap.b32 %r8755, %r8754, %r8754, 24; + add.s32 %r8756, %r8755, %r8749; + xor.b32 %r8757, %r8756, %r8751; + shf.l.wrap.b32 %r8758, %r8757, %r8757, 25; + add.s32 %r8759, %r8711, %r7859; + add.s32 %r8760, %r8759, %r8730; + xor.b32 %r8761, %r8760, %r8699; + shf.l.wrap.b32 %r8762, %r8761, %r8761, 16; + add.s32 %r8763, %r8762, %r8686; + xor.b32 %r8764, %r8763, %r8730; + shf.l.wrap.b32 %r8765, %r8764, %r8764, 20; + add.s32 %r8766, %r8760, %r7866; + add.s32 %r8767, %r8766, %r8765; + xor.b32 %r8768, %r8767, %r8762; + shf.l.wrap.b32 %r8769, %r8768, %r8768, 24; + add.s32 %r8770, %r8769, %r8763; + xor.b32 %r8771, %r8770, %r8765; + shf.l.wrap.b32 %r8772, %r8771, %r8771, 25; + add.s32 %r8773, %r8725, %r7887; + add.s32 %r8774, %r8773, %r8688; + xor.b32 %r8775, %r8774, %r8713; + shf.l.wrap.b32 %r8776, %r8775, %r8775, 16; + add.s32 %r8777, %r8776, %r8700; + xor.b32 %r8778, %r8777, %r8688; + shf.l.wrap.b32 %r8779, %r8778, %r8778, 20; + add.s32 %r8780, %r8774, %r7929; + add.s32 %r8781, %r8780, %r8779; + xor.b32 %r8782, %r8781, %r8776; + shf.l.wrap.b32 %r8783, %r8782, %r8782, 24; + add.s32 %r8784, %r8783, %r8777; + xor.b32 %r8785, %r8784, %r8779; + shf.l.wrap.b32 %r8786, %r8785, %r8785, 25; + xor.b32 %r8787, %r8770, %r8739; + xor.b32 %r8788, %r8784, %r8753; + xor.b32 %r8789, %r8742, %r8767; + xor.b32 %r8790, %r8781, %r8756; + xor.b32 %r8791, %r8786, %r8755; + xor.b32 %r8792, %r8744, %r8769; + xor.b32 %r8793, %r8783, %r8758; + xor.b32 %r8794, %r8772, %r8741; + st.local.u8 [%rd191+145], %r8787; + shr.u32 %r8795, %r8787, 8; + st.local.u8 [%rd191+146], %r8795; + shr.u32 %r8796, %r8787, 16; + st.local.u8 [%rd191+147], %r8796; + shr.u32 
%r8797, %r8787, 24; + st.local.u8 [%rd191+148], %r8797; + st.local.u8 [%rd191+149], %r8788; + shr.u32 %r8798, %r8788, 8; + st.local.u8 [%rd191+150], %r8798; + shr.u32 %r8799, %r8788, 16; + st.local.u8 [%rd191+151], %r8799; + shr.u32 %r8800, %r8788, 24; + st.local.u8 [%rd191+152], %r8800; + st.local.u8 [%rd191+153], %r8789; + shr.u32 %r8801, %r8789, 8; + st.local.u8 [%rd191+154], %r8801; + shr.u32 %r8802, %r8789, 16; + st.local.u8 [%rd191+155], %r8802; + shr.u32 %r8803, %r8789, 24; + st.local.u8 [%rd191+156], %r8803; + st.local.u8 [%rd191+157], %r8790; + shr.u32 %r8804, %r8790, 8; + st.local.u8 [%rd191+158], %r8804; + shr.u32 %r8805, %r8790, 16; + st.local.u8 [%rd191+159], %r8805; + shr.u32 %r8806, %r8790, 24; + st.local.u8 [%rd191+160], %r8806; + st.local.u8 [%rd191+161], %r8791; + shr.u32 %r8807, %r8791, 8; + st.local.u8 [%rd191+162], %r8807; + shr.u32 %r8808, %r8791, 16; + st.local.u8 [%rd191+163], %r8808; + shr.u32 %r8809, %r8791, 24; + st.local.u8 [%rd191+164], %r8809; + st.local.u8 [%rd191+165], %r8792; + shr.u32 %r8810, %r8792, 8; + st.local.u8 [%rd191+166], %r8810; + shr.u32 %r8811, %r8792, 16; + st.local.u8 [%rd191+167], %r8811; + shr.u32 %r8812, %r8792, 24; + st.local.u8 [%rd191+168], %r8812; + st.local.u8 [%rd191+169], %r8793; + shr.u32 %r8813, %r8793, 8; + st.local.u8 [%rd191+170], %r8813; + shr.u32 %r8814, %r8793, 16; + st.local.u8 [%rd191+171], %r8814; + shr.u32 %r8815, %r8793, 24; + st.local.u8 [%rd191+172], %r8815; + st.local.u8 [%rd191+173], %r8794; + shr.u32 %r8816, %r8794, 8; + st.local.u8 [%rd191+174], %r8816; + shr.u32 %r8817, %r8794, 16; + st.local.u8 [%rd191+175], %r8817; + shr.u32 %r8818, %r8794, 24; + st.local.u8 [%rd191+176], %r8818; + ld.local.u8 %rs328, [%rd3+8]; + add.s16 %rs329, %rs328, -1; + st.local.u8 [%rd3+8], %rs329; + cvt.u64.u16 %rd192, %rs329; + and.b64 %rd193, %rd192, 255; + setp.lt.u64 %p38, %rd230, %rd193; + and.b16 %rs330, %rs329, 255; + mul.wide.u16 %r11681, %rs330, 32; + @%p38 bra $L__BB1_45; + +$L__BB1_46: + cvt.s64.s32 %rd194, %r11681; + add.s64 %rd195, %rd2, %rd194; + st.local.u8 [%rd195+145], %r97; + shr.u32 %r8819, %r97, 8; + st.local.u8 [%rd195+146], %r8819; + shr.u32 %r8820, %r97, 16; + st.local.u8 [%rd195+147], %r8820; + shr.u32 %r8821, %r97, 24; + st.local.u8 [%rd195+148], %r8821; + st.local.u8 [%rd195+149], %r98; + shr.u32 %r8822, %r98, 8; + st.local.u8 [%rd195+150], %r8822; + shr.u32 %r8823, %r98, 16; + st.local.u8 [%rd195+151], %r8823; + shr.u32 %r8824, %r98, 24; + st.local.u8 [%rd195+152], %r8824; + st.local.u8 [%rd195+153], %r99; + shr.u32 %r8825, %r99, 8; + st.local.u8 [%rd195+154], %r8825; + shr.u32 %r8826, %r99, 16; + st.local.u8 [%rd195+155], %r8826; + shr.u32 %r8827, %r99, 24; + st.local.u8 [%rd195+156], %r8827; + st.local.u8 [%rd195+157], %r100; + shr.u32 %r8828, %r100, 8; + st.local.u8 [%rd195+158], %r8828; + shr.u32 %r8829, %r100, 16; + st.local.u8 [%rd195+159], %r8829; + shr.u32 %r8830, %r100, 24; + st.local.u8 [%rd195+160], %r8830; + st.local.u8 [%rd195+161], %r101; + shr.u32 %r8831, %r101, 8; + st.local.u8 [%rd195+162], %r8831; + shr.u32 %r8832, %r101, 16; + st.local.u8 [%rd195+163], %r8832; + shr.u32 %r8833, %r101, 24; + st.local.u8 [%rd195+164], %r8833; + st.local.u8 [%rd195+165], %r102; + shr.u32 %r8834, %r102, 8; + st.local.u8 [%rd195+166], %r8834; + shr.u32 %r8835, %r102, 16; + st.local.u8 [%rd195+167], %r8835; + shr.u32 %r8836, %r102, 24; + st.local.u8 [%rd195+168], %r8836; + st.local.u8 [%rd195+169], %r103; + shr.u32 %r8837, %r103, 8; + st.local.u8 [%rd195+170], %r8837; + shr.u32 %r8838, %r103, 16; + st.local.u8 
[%rd195+171], %r8838; + shr.u32 %r8839, %r103, 24; + st.local.u8 [%rd195+172], %r8839; + st.local.u8 [%rd195+173], %r104; + shr.u32 %r8840, %r104, 8; + st.local.u8 [%rd195+174], %r8840; + shr.u32 %r8841, %r104, 16; + st.local.u8 [%rd195+175], %r8841; + shr.u32 %r8842, %r104, 24; + st.local.u8 [%rd195+176], %r8842; + ld.local.u8 %rs388, [%rd3+8]; + bra.uni $L__BB1_47; + +$L__BB1_29: + cvt.u32.u16 %r3957, %rs14; + and.b32 %r3958, %r3957, 255; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd254; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b64 param2; + st.param.b64 [param2+0], %rd98; + .param .b64 param3; + st.param.b64 [param3+0], %rd251; + .param .b32 param4; + st.param.b32 [param4+0], %r3958; + .param .b64 param5; + st.param.b64 [param5+0], %rd142; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_karlsen_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd164, [retval0+0]; + } // callseq 2 + ld.local.v4.u32 {%r3959, %r3960, %r3961, %r3962}, [%rd42]; + ld.local.v4.u32 {%r3963, %r3964, %r3965, %r3966}, [%rd42+16]; + ld.local.v4.u32 {%r3967, %r3968, %r3969, %r3970}, [%rd42+32]; + ld.local.v4.u32 {%r3971, %r3972, %r3973, %r3974}, [%rd42+48]; + ld.local.u64 %rd165, [%rd3+-72]; + popc.b64 %r3975, %rd165; + cvt.u64.u32 %rd51, %r3975; + ld.local.u8 %rs137, [%rd3+8]; + cvt.u64.u16 %rd166, %rs137; + setp.ge.u64 %p27, %rd51, %rd166; + mul.wide.u16 %r11661, %rs137, 32; + @%p27 bra $L__BB1_32; + +$L__BB1_31: + popc.b64 %r11647, %rd165; + cvt.u64.u32 %rd226, %r11647; + add.s32 %r3976, %r11661, -64; + cvt.s64.s32 %rd167, %r3976; + add.s64 %rd168, %rd2, %rd167; + ld.local.u8 %r3977, [%rd3+2]; + ld.local.u8 %r3978, [%rd168+145]; + ld.local.u8 %r3979, [%rd168+146]; + prmt.b32 %r3980, %r3979, %r3978, 30212; + ld.local.u8 %r3981, [%rd168+147]; + prmt.b32 %r3982, %r3981, %r3980, 28756; + ld.local.u8 %r3983, [%rd168+148]; + prmt.b32 %r3984, %r3983, %r3982, 1620; + ld.local.u8 %r3985, [%rd168+149]; + ld.local.u8 %r3986, [%rd168+150]; + prmt.b32 %r3987, %r3986, %r3985, 30212; + ld.local.u8 %r3988, [%rd168+151]; + prmt.b32 %r3989, %r3988, %r3987, 28756; + ld.local.u8 %r3990, [%rd168+152]; + prmt.b32 %r3991, %r3990, %r3989, 1620; + ld.local.u8 %r3992, [%rd168+153]; + ld.local.u8 %r3993, [%rd168+154]; + prmt.b32 %r3994, %r3993, %r3992, 30212; + ld.local.u8 %r3995, [%rd168+155]; + prmt.b32 %r3996, %r3995, %r3994, 28756; + ld.local.u8 %r3997, [%rd168+156]; + prmt.b32 %r3998, %r3997, %r3996, 1620; + ld.local.u8 %r3999, [%rd168+157]; + ld.local.u8 %r4000, [%rd168+158]; + prmt.b32 %r4001, %r4000, %r3999, 30212; + ld.local.u8 %r4002, [%rd168+159]; + prmt.b32 %r4003, %r4002, %r4001, 28756; + ld.local.u8 %r4004, [%rd168+160]; + prmt.b32 %r4005, %r4004, %r4003, 1620; + ld.local.u8 %r4006, [%rd168+161]; + ld.local.u8 %r4007, [%rd168+162]; + prmt.b32 %r4008, %r4007, %r4006, 30212; + ld.local.u8 %r4009, [%rd168+163]; + prmt.b32 %r4010, %r4009, %r4008, 28756; + ld.local.u8 %r4011, [%rd168+164]; + prmt.b32 %r4012, %r4011, %r4010, 1620; + ld.local.u8 %r4013, [%rd168+165]; + ld.local.u8 %r4014, [%rd168+166]; + prmt.b32 %r4015, %r4014, %r4013, 30212; + ld.local.u8 %r4016, [%rd168+167]; + prmt.b32 %r4017, %r4016, %r4015, 28756; + ld.local.u8 %r4018, [%rd168+168]; + prmt.b32 %r4019, %r4018, %r4017, 1620; + ld.local.u8 %r4020, [%rd168+169]; + ld.local.u8 %r4021, [%rd168+170]; + prmt.b32 %r4022, %r4021, %r4020, 30212; + ld.local.u8 %r4023, 
[%rd168+171]; + prmt.b32 %r4024, %r4023, %r4022, 28756; + ld.local.u8 %r4025, [%rd168+172]; + prmt.b32 %r4026, %r4025, %r4024, 1620; + ld.local.u8 %r4027, [%rd168+173]; + ld.local.u8 %r4028, [%rd168+174]; + prmt.b32 %r4029, %r4028, %r4027, 30212; + ld.local.u8 %r4030, [%rd168+175]; + prmt.b32 %r4031, %r4030, %r4029, 28756; + ld.local.u8 %r4032, [%rd168+176]; + prmt.b32 %r4033, %r4032, %r4031, 1620; + ld.local.u8 %r4034, [%rd168+177]; + ld.local.u8 %r4035, [%rd168+178]; + prmt.b32 %r4036, %r4035, %r4034, 30212; + ld.local.u8 %r4037, [%rd168+179]; + prmt.b32 %r4038, %r4037, %r4036, 28756; + ld.local.u8 %r4039, [%rd168+180]; + prmt.b32 %r4040, %r4039, %r4038, 1620; + ld.local.u8 %r4041, [%rd168+181]; + ld.local.u8 %r4042, [%rd168+182]; + prmt.b32 %r4043, %r4042, %r4041, 30212; + ld.local.u8 %r4044, [%rd168+183]; + prmt.b32 %r4045, %r4044, %r4043, 28756; + ld.local.u8 %r4046, [%rd168+184]; + prmt.b32 %r4047, %r4046, %r4045, 1620; + ld.local.u8 %r4048, [%rd168+185]; + ld.local.u8 %r4049, [%rd168+186]; + prmt.b32 %r4050, %r4049, %r4048, 30212; + ld.local.u8 %r4051, [%rd168+187]; + prmt.b32 %r4052, %r4051, %r4050, 28756; + ld.local.u8 %r4053, [%rd168+188]; + prmt.b32 %r4054, %r4053, %r4052, 1620; + ld.local.u8 %r4055, [%rd168+189]; + ld.local.u8 %r4056, [%rd168+190]; + prmt.b32 %r4057, %r4056, %r4055, 30212; + ld.local.u8 %r4058, [%rd168+191]; + prmt.b32 %r4059, %r4058, %r4057, 28756; + ld.local.u8 %r4060, [%rd168+192]; + prmt.b32 %r4061, %r4060, %r4059, 1620; + ld.local.u8 %r4062, [%rd168+193]; + ld.local.u8 %r4063, [%rd168+194]; + prmt.b32 %r4064, %r4063, %r4062, 30212; + ld.local.u8 %r4065, [%rd168+195]; + prmt.b32 %r4066, %r4065, %r4064, 28756; + ld.local.u8 %r4067, [%rd168+196]; + prmt.b32 %r4068, %r4067, %r4066, 1620; + ld.local.u8 %r4069, [%rd168+197]; + ld.local.u8 %r4070, [%rd168+198]; + prmt.b32 %r4071, %r4070, %r4069, 30212; + ld.local.u8 %r4072, [%rd168+199]; + prmt.b32 %r4073, %r4072, %r4071, 28756; + ld.local.u8 %r4074, [%rd168+200]; + prmt.b32 %r4075, %r4074, %r4073, 1620; + ld.local.u8 %r4076, [%rd168+201]; + ld.local.u8 %r4077, [%rd168+202]; + prmt.b32 %r4078, %r4077, %r4076, 30212; + ld.local.u8 %r4079, [%rd168+203]; + prmt.b32 %r4080, %r4079, %r4078, 28756; + ld.local.u8 %r4081, [%rd168+204]; + prmt.b32 %r4082, %r4081, %r4080, 1620; + ld.local.u8 %r4083, [%rd168+205]; + ld.local.u8 %r4084, [%rd168+206]; + prmt.b32 %r4085, %r4084, %r4083, 30212; + ld.local.u8 %r4086, [%rd168+207]; + prmt.b32 %r4087, %r4086, %r4085, 28756; + ld.local.u8 %r4088, [%rd168+208]; + prmt.b32 %r4089, %r4088, %r4087, 1620; + or.b32 %r4090, %r3977, 4; + ld.local.u8 %r4091, [%rd3+-120]; + ld.local.u8 %r4092, [%rd3+-119]; + prmt.b32 %r4093, %r4092, %r4091, 30212; + ld.local.u8 %r4094, [%rd3+-118]; + ld.local.u8 %r4095, [%rd3+-117]; + prmt.b32 %r4096, %r4095, %r4094, 30212; + prmt.b32 %r4097, %r4096, %r4093, 4180; + ld.local.u8 %r4098, [%rd3+-136]; + ld.local.u8 %r4099, [%rd3+-135]; + prmt.b32 %r4100, %r4099, %r4098, 30212; + ld.local.u8 %r4101, [%rd3+-134]; + ld.local.u8 %r4102, [%rd3+-133]; + prmt.b32 %r4103, %r4102, %r4101, 30212; + prmt.b32 %r4104, %r4103, %r4100, 4180; + add.s32 %r4105, %r4097, %r4104; + add.s32 %r4106, %r4105, %r3984; + shf.l.wrap.b32 %r4107, %r4106, %r4106, 16; + add.s32 %r4108, %r4107, 1779033703; + xor.b32 %r4109, %r4108, %r4097; + shf.l.wrap.b32 %r4110, %r4109, %r4109, 20; + add.s32 %r4111, %r3991, %r4106; + add.s32 %r4112, %r4111, %r4110; + xor.b32 %r4113, %r4112, %r4107; + shf.l.wrap.b32 %r4114, %r4113, %r4113, 24; + add.s32 %r4115, %r4114, %r4108; + xor.b32 %r4116, %r4115, 
%r4110; + shf.l.wrap.b32 %r4117, %r4116, %r4116, 25; + ld.local.u8 %r4118, [%rd3+-116]; + ld.local.u8 %r4119, [%rd3+-115]; + prmt.b32 %r4120, %r4119, %r4118, 30212; + ld.local.u8 %r4121, [%rd3+-114]; + ld.local.u8 %r4122, [%rd3+-113]; + prmt.b32 %r4123, %r4122, %r4121, 30212; + prmt.b32 %r4124, %r4123, %r4120, 4180; + ld.local.u8 %r4125, [%rd3+-132]; + ld.local.u8 %r4126, [%rd3+-131]; + prmt.b32 %r4127, %r4126, %r4125, 30212; + ld.local.u8 %r4128, [%rd3+-130]; + ld.local.u8 %r4129, [%rd3+-129]; + prmt.b32 %r4130, %r4129, %r4128, 30212; + prmt.b32 %r4131, %r4130, %r4127, 4180; + add.s32 %r4132, %r4124, %r4131; + add.s32 %r4133, %r4132, %r3998; + shf.l.wrap.b32 %r4134, %r4133, %r4133, 16; + add.s32 %r4135, %r4134, -1150833019; + xor.b32 %r4136, %r4135, %r4124; + shf.l.wrap.b32 %r4137, %r4136, %r4136, 20; + add.s32 %r4138, %r4005, %r4133; + add.s32 %r4139, %r4138, %r4137; + xor.b32 %r4140, %r4139, %r4134; + shf.l.wrap.b32 %r4141, %r4140, %r4140, 24; + add.s32 %r4142, %r4141, %r4135; + xor.b32 %r4143, %r4142, %r4137; + shf.l.wrap.b32 %r4144, %r4143, %r4143, 25; + ld.local.u8 %r4145, [%rd3+-112]; + ld.local.u8 %r4146, [%rd3+-111]; + prmt.b32 %r4147, %r4146, %r4145, 30212; + ld.local.u8 %r4148, [%rd3+-110]; + ld.local.u8 %r4149, [%rd3+-109]; + prmt.b32 %r4150, %r4149, %r4148, 30212; + prmt.b32 %r4151, %r4150, %r4147, 4180; + ld.local.u8 %r4152, [%rd3+-128]; + ld.local.u8 %r4153, [%rd3+-127]; + prmt.b32 %r4154, %r4153, %r4152, 30212; + ld.local.u8 %r4155, [%rd3+-126]; + ld.local.u8 %r4156, [%rd3+-125]; + prmt.b32 %r4157, %r4156, %r4155, 30212; + prmt.b32 %r4158, %r4157, %r4154, 4180; + add.s32 %r4159, %r4151, %r4158; + add.s32 %r4160, %r4159, %r4012; + shr.u32 %r4161, %r4160, 16; + shl.b32 %r4162, %r4160, 16; + xor.b32 %r4163, %r4162, 4194304; + or.b32 %r4164, %r4163, %r4161; + add.s32 %r4165, %r4164, 1013904242; + xor.b32 %r4166, %r4165, %r4151; + shf.l.wrap.b32 %r4167, %r4166, %r4166, 20; + add.s32 %r4168, %r4019, %r4160; + add.s32 %r4169, %r4168, %r4167; + xor.b32 %r4170, %r4169, %r4164; + shf.l.wrap.b32 %r4171, %r4170, %r4170, 24; + add.s32 %r4172, %r4171, %r4165; + xor.b32 %r4173, %r4172, %r4167; + shf.l.wrap.b32 %r4174, %r4173, %r4173, 25; + ld.local.u8 %r4175, [%rd3+-108]; + ld.local.u8 %r4176, [%rd3+-107]; + prmt.b32 %r4177, %r4176, %r4175, 30212; + ld.local.u8 %r4178, [%rd3+-106]; + ld.local.u8 %r4179, [%rd3+-105]; + prmt.b32 %r4180, %r4179, %r4178, 30212; + prmt.b32 %r4181, %r4180, %r4177, 4180; + ld.local.u8 %r4182, [%rd3+-124]; + ld.local.u8 %r4183, [%rd3+-123]; + prmt.b32 %r4184, %r4183, %r4182, 30212; + ld.local.u8 %r4185, [%rd3+-122]; + ld.local.u8 %r4186, [%rd3+-121]; + prmt.b32 %r4187, %r4186, %r4185, 30212; + prmt.b32 %r4188, %r4187, %r4184, 4180; + add.s32 %r4189, %r4181, %r4188; + add.s32 %r4190, %r4189, %r4026; + xor.b32 %r4191, %r4190, %r4090; + shr.u32 %r4192, %r4190, 16; + shl.b32 %r4193, %r4191, 16; + or.b32 %r4194, %r4193, %r4192; + add.s32 %r4195, %r4194, -1521486534; + xor.b32 %r4196, %r4195, %r4181; + shf.l.wrap.b32 %r4197, %r4196, %r4196, 20; + add.s32 %r4198, %r4033, %r4190; + add.s32 %r4199, %r4198, %r4197; + xor.b32 %r4200, %r4199, %r4194; + shf.l.wrap.b32 %r4201, %r4200, %r4200, 24; + add.s32 %r4202, %r4201, %r4195; + xor.b32 %r4203, %r4202, %r4197; + shf.l.wrap.b32 %r4204, %r4203, %r4203, 25; + add.s32 %r4205, %r4144, %r4112; + add.s32 %r4206, %r4205, %r4040; + xor.b32 %r4207, %r4201, %r4206; + shf.l.wrap.b32 %r4208, %r4207, %r4207, 16; + add.s32 %r4209, %r4208, %r4172; + xor.b32 %r4210, %r4209, %r4144; + shf.l.wrap.b32 %r4211, %r4210, %r4210, 20; + add.s32 
%r4212, %r4047, %r4206; + add.s32 %r4213, %r4212, %r4211; + xor.b32 %r4214, %r4213, %r4208; + shf.l.wrap.b32 %r4215, %r4214, %r4214, 24; + add.s32 %r4216, %r4215, %r4209; + xor.b32 %r4217, %r4216, %r4211; + shf.l.wrap.b32 %r4218, %r4217, %r4217, 25; + add.s32 %r4219, %r4174, %r4139; + add.s32 %r4220, %r4219, %r4054; + xor.b32 %r4221, %r4220, %r4114; + shf.l.wrap.b32 %r4222, %r4221, %r4221, 16; + add.s32 %r4223, %r4222, %r4202; + xor.b32 %r4224, %r4223, %r4174; + shf.l.wrap.b32 %r4225, %r4224, %r4224, 20; + add.s32 %r4226, %r4061, %r4220; + add.s32 %r4227, %r4226, %r4225; + xor.b32 %r4228, %r4227, %r4222; + shf.l.wrap.b32 %r4229, %r4228, %r4228, 24; + add.s32 %r4230, %r4229, %r4223; + xor.b32 %r4231, %r4230, %r4225; + shf.l.wrap.b32 %r4232, %r4231, %r4231, 25; + add.s32 %r4233, %r4204, %r4169; + add.s32 %r4234, %r4233, %r4068; + xor.b32 %r4235, %r4234, %r4141; + shf.l.wrap.b32 %r4236, %r4235, %r4235, 16; + add.s32 %r4237, %r4236, %r4115; + xor.b32 %r4238, %r4237, %r4204; + shf.l.wrap.b32 %r4239, %r4238, %r4238, 20; + add.s32 %r4240, %r4075, %r4234; + add.s32 %r4241, %r4240, %r4239; + xor.b32 %r4242, %r4241, %r4236; + shf.l.wrap.b32 %r4243, %r4242, %r4242, 24; + add.s32 %r4244, %r4243, %r4237; + xor.b32 %r4245, %r4244, %r4239; + shf.l.wrap.b32 %r4246, %r4245, %r4245, 25; + add.s32 %r4247, %r4199, %r4117; + add.s32 %r4248, %r4247, %r4082; + xor.b32 %r4249, %r4248, %r4171; + shf.l.wrap.b32 %r4250, %r4249, %r4249, 16; + add.s32 %r4251, %r4250, %r4142; + xor.b32 %r4252, %r4251, %r4117; + shf.l.wrap.b32 %r4253, %r4252, %r4252, 20; + add.s32 %r4254, %r4089, %r4248; + add.s32 %r4255, %r4254, %r4253; + xor.b32 %r4256, %r4255, %r4250; + shf.l.wrap.b32 %r4257, %r4256, %r4256, 24; + add.s32 %r4258, %r4257, %r4251; + xor.b32 %r4259, %r4258, %r4253; + shf.l.wrap.b32 %r4260, %r4259, %r4259, 25; + add.s32 %r4261, %r4213, %r3998; + add.s32 %r4262, %r4261, %r4260; + xor.b32 %r4263, %r4262, %r4229; + shf.l.wrap.b32 %r4264, %r4263, %r4263, 16; + add.s32 %r4265, %r4264, %r4244; + xor.b32 %r4266, %r4265, %r4260; + shf.l.wrap.b32 %r4267, %r4266, %r4266, 20; + add.s32 %r4268, %r4262, %r4026; + add.s32 %r4269, %r4268, %r4267; + xor.b32 %r4270, %r4269, %r4264; + shf.l.wrap.b32 %r4271, %r4270, %r4270, 24; + add.s32 %r4272, %r4271, %r4265; + xor.b32 %r4273, %r4272, %r4267; + shf.l.wrap.b32 %r4274, %r4273, %r4273, 25; + add.s32 %r4275, %r4227, %r4005; + add.s32 %r4276, %r4275, %r4218; + xor.b32 %r4277, %r4243, %r4276; + shf.l.wrap.b32 %r4278, %r4277, %r4277, 16; + add.s32 %r4279, %r4258, %r4278; + xor.b32 %r4280, %r4279, %r4218; + shf.l.wrap.b32 %r4281, %r4280, %r4280, 20; + add.s32 %r4282, %r4276, %r4054; + add.s32 %r4283, %r4282, %r4281; + xor.b32 %r4284, %r4283, %r4278; + shf.l.wrap.b32 %r4285, %r4284, %r4284, 24; + add.s32 %r4286, %r4285, %r4279; + xor.b32 %r4287, %r4286, %r4281; + shf.l.wrap.b32 %r4288, %r4287, %r4287, 25; + add.s32 %r4289, %r4232, %r4033; + add.s32 %r4290, %r4289, %r4241; + xor.b32 %r4291, %r4257, %r4290; + shf.l.wrap.b32 %r4292, %r4291, %r4291, 16; + add.s32 %r4293, %r4292, %r4216; + xor.b32 %r4294, %r4293, %r4232; + shf.l.wrap.b32 %r4295, %r4294, %r4294, 20; + add.s32 %r4296, %r4290, %r3984; + add.s32 %r4297, %r4296, %r4295; + xor.b32 %r4298, %r4297, %r4292; + shf.l.wrap.b32 %r4299, %r4298, %r4298, 24; + add.s32 %r4300, %r4299, %r4293; + xor.b32 %r4301, %r4300, %r4295; + shf.l.wrap.b32 %r4302, %r4301, %r4301, 25; + add.s32 %r4303, %r4246, %r4012; + add.s32 %r4304, %r4303, %r4255; + xor.b32 %r4305, %r4304, %r4215; + shf.l.wrap.b32 %r4306, %r4305, %r4305, 16; + add.s32 %r4307, %r4306, %r4230; + 
xor.b32 %r4308, %r4307, %r4246; + shf.l.wrap.b32 %r4309, %r4308, %r4308, 20; + add.s32 %r4310, %r4304, %r4075; + add.s32 %r4311, %r4310, %r4309; + xor.b32 %r4312, %r4311, %r4306; + shf.l.wrap.b32 %r4313, %r4312, %r4312, 24; + add.s32 %r4314, %r4313, %r4307; + xor.b32 %r4315, %r4314, %r4309; + shf.l.wrap.b32 %r4316, %r4315, %r4315, 25; + add.s32 %r4317, %r4288, %r3991; + add.s32 %r4318, %r4317, %r4269; + xor.b32 %r4319, %r4318, %r4313; + shf.l.wrap.b32 %r4320, %r4319, %r4319, 16; + add.s32 %r4321, %r4320, %r4300; + xor.b32 %r4322, %r4321, %r4288; + shf.l.wrap.b32 %r4323, %r4322, %r4322, 20; + add.s32 %r4324, %r4318, %r4061; + add.s32 %r4325, %r4324, %r4323; + xor.b32 %r4326, %r4325, %r4320; + shf.l.wrap.b32 %r4327, %r4326, %r4326, 24; + add.s32 %r4328, %r4327, %r4321; + xor.b32 %r4329, %r4328, %r4323; + shf.l.wrap.b32 %r4330, %r4329, %r4329, 25; + add.s32 %r4331, %r4283, %r4068; + add.s32 %r4332, %r4331, %r4302; + xor.b32 %r4333, %r4271, %r4332; + shf.l.wrap.b32 %r4334, %r4333, %r4333, 16; + add.s32 %r4335, %r4334, %r4314; + xor.b32 %r4336, %r4335, %r4302; + shf.l.wrap.b32 %r4337, %r4336, %r4336, 20; + add.s32 %r4338, %r4332, %r4019; + add.s32 %r4339, %r4338, %r4337; + xor.b32 %r4340, %r4339, %r4334; + shf.l.wrap.b32 %r4341, %r4340, %r4340, 24; + add.s32 %r4342, %r4341, %r4335; + xor.b32 %r4343, %r4342, %r4337; + shf.l.wrap.b32 %r4344, %r4343, %r4343, 25; + add.s32 %r4345, %r4297, %r4047; + add.s32 %r4346, %r4345, %r4316; + xor.b32 %r4347, %r4346, %r4285; + shf.l.wrap.b32 %r4348, %r4347, %r4347, 16; + add.s32 %r4349, %r4348, %r4272; + xor.b32 %r4350, %r4349, %r4316; + shf.l.wrap.b32 %r4351, %r4350, %r4350, 20; + add.s32 %r4352, %r4346, %r4082; + add.s32 %r4353, %r4352, %r4351; + xor.b32 %r4354, %r4353, %r4348; + shf.l.wrap.b32 %r4355, %r4354, %r4354, 24; + add.s32 %r4356, %r4355, %r4349; + xor.b32 %r4357, %r4356, %r4351; + shf.l.wrap.b32 %r4358, %r4357, %r4357, 25; + add.s32 %r4359, %r4311, %r4089; + add.s32 %r4360, %r4359, %r4274; + xor.b32 %r4361, %r4360, %r4299; + shf.l.wrap.b32 %r4362, %r4361, %r4361, 16; + add.s32 %r4363, %r4362, %r4286; + xor.b32 %r4364, %r4363, %r4274; + shf.l.wrap.b32 %r4365, %r4364, %r4364, 20; + add.s32 %r4366, %r4360, %r4040; + add.s32 %r4367, %r4366, %r4365; + xor.b32 %r4368, %r4367, %r4362; + shf.l.wrap.b32 %r4369, %r4368, %r4368, 24; + add.s32 %r4370, %r4369, %r4363; + xor.b32 %r4371, %r4370, %r4365; + shf.l.wrap.b32 %r4372, %r4371, %r4371, 25; + add.s32 %r4373, %r4325, %r4005; + add.s32 %r4374, %r4373, %r4372; + xor.b32 %r4375, %r4374, %r4341; + shf.l.wrap.b32 %r4376, %r4375, %r4375, 16; + add.s32 %r4377, %r4376, %r4356; + xor.b32 %r4378, %r4377, %r4372; + shf.l.wrap.b32 %r4379, %r4378, %r4378, 20; + add.s32 %r4380, %r4374, %r4012; + add.s32 %r4381, %r4380, %r4379; + xor.b32 %r4382, %r4381, %r4376; + shf.l.wrap.b32 %r4383, %r4382, %r4382, 24; + add.s32 %r4384, %r4383, %r4377; + xor.b32 %r4385, %r4384, %r4379; + shf.l.wrap.b32 %r4386, %r4385, %r4385, 25; + add.s32 %r4387, %r4339, %r4054; + add.s32 %r4388, %r4387, %r4330; + xor.b32 %r4389, %r4388, %r4355; + shf.l.wrap.b32 %r4390, %r4389, %r4389, 16; + add.s32 %r4391, %r4390, %r4370; + xor.b32 %r4392, %r4391, %r4330; + shf.l.wrap.b32 %r4393, %r4392, %r4392, 20; + add.s32 %r4394, %r4388, %r4068; + add.s32 %r4395, %r4394, %r4393; + xor.b32 %r4396, %r4395, %r4390; + shf.l.wrap.b32 %r4397, %r4396, %r4396, 24; + add.s32 %r4398, %r4397, %r4391; + xor.b32 %r4399, %r4398, %r4393; + shf.l.wrap.b32 %r4400, %r4399, %r4399, 25; + add.s32 %r4401, %r4353, %r4075; + add.s32 %r4402, %r4401, %r4344; + xor.b32 %r4403, %r4369, 
%r4402;
+ // the unrolled compression rounds continue here: each eight-instruction
+ // group of add.s32 / xor.b32 / shf.l.wrap.b32 is one quarter-round over
+ // the 16-word state (registers %r4403-%r4932), mixing the permuted
+ // message words %r3984-%r4089 with the rotate-by-16/20/24/25 pattern
+ xor.b32 %r4933, %r4916, %r4885;
+ xor.b32 %r4934, %r4930, %r4899;
+ xor.b32 %r4935, %r4888, %r4913;
+ xor.b32 %r4936, %r4927, %r4902;
+ xor.b32 %r4937, %r4932, %r4901;
+ xor.b32 %r4938, %r4890, %r4915;
+ xor.b32 %r4939, %r4929, %r4904;
+ xor.b32 %r4940, %r4918, %r4887;
+ // byte-wise stores of the folded words %r4933-%r4940 into the 32-byte CV
+ // slot [%rd168+145..176] (shr.u32 extracts %r4941-%r4964)
+ ld.local.u8 %rs138, [%rd3+8];
+ add.s16 %rs139, %rs138, -1;
+ st.local.u8 [%rd3+8], %rs139;
+ cvt.u64.u16 %rd169, %rs139;
+ and.b64 %rd170, %rd169, 255;
+ setp.lt.u64 %p28, %rd226, %rd170;
+ and.b16 %rs140, %rs139, 255;
+ mul.wide.u16 %r11661, %rs140, 32;
+ @%p28 bra $L__BB1_31;
+
+$L__BB1_32:
+ cvt.s64.s32 %rd171, %r11661;
+ add.s64 %rd172, %rd2, %rd171;
+ // the eight words %r3959-%r3966 are split into u16 halves (mov.b32
+ // {lo,hi}) and stored byte-wise to [%rd172+145..176]
+ ld.local.u8 %rs173, [%rd3+8];
+ add.s16 %rs174, %rs173, 1;
+ st.local.u8 [%rd3+8], %rs174;
+ shr.u64 %rd173, %rd49, 11;
+ ld.local.u64 %rd174, [%rd3+-72];
+ add.s64 %rd175, %rd174, %rd173;
+ popc.b64 %r4965, %rd175;
+ cvt.u64.u32 %rd52, %r4965;
+ cvt.u64.u16 %rd176, %rs174;
+ and.b64 %rd177, %rd176, 255;
+ setp.ge.u64 %p29, %rd52, %rd177;
+ and.b16 %rs175, %rs174, 255;
+ mul.wide.u16 %r11663, %rs175, 32;
+ @%p29 bra $L__BB1_35;
+
+$L__BB1_34:
+ shr.u64 %rd229, %rd49, 11;
+ add.s64 %rd228, %rd174, %rd229;
+ popc.b64 %r11648, %rd228;
+ cvt.u64.u32 %rd227, %r11648;
+ add.s32 %r4966, %r11663, -64;
+ cvt.s64.s32 %rd178, %r4966;
+ add.s64 %rd179, %rd2, %rd178;
+ ld.local.u8 %r4967, [%rd3+2];
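+ // --- annotation (reviewer sketch, not compiler output) -----------------
+ // The ld.local.u8 / prmt.b32 runs below assemble little-endian u32 words
+ // from individual bytes. Each prmt.b32 triple with selectors 0x7604,
+ // 0x7054, 0x0654 (30212, 28756, 1620) is the PTX lowering of CUDA's
+ // __byte_perm; a minimal equivalent, with a hypothetical helper name:
+ //
+ //   __device__ uint32_t load_le32(const uint8_t *p) {
+ //       uint32_t w = __byte_perm(p[1], p[0], 0x7604); // b1<<8  | b0
+ //       w          = __byte_perm(p[2], w,    0x7054); // b2<<16 | w
+ //       return       __byte_perm(p[3], w,    0x0654); // b3<<24 | w
+ //   }
+ //
+ // i.e. w == p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24.
+ // ------------------------------------------------------------------------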
+ // byte-gather of the two child CVs at [%rd179+145..208] into sixteen
+ // little-endian u32 message words %r4974-%r5079 (prmt.b32 triples)
+ or.b32 %r5080, %r4967, 4;
+ // the eight key words are reassembled the same way from [%rd3+-136..-105]
+ // (prmt.b32 pairs with selectors 30212/4180); the unrolled compression
+ // then runs again: round 1 folds in the IV constants 1779033703,
+ // -1150833019, 1013904242, -1521486534 (0x6A09E667, 0xBB67AE85,
+ // 0x3C6EF372, 0xA54FF53A), the 64-byte block length (xor with
+ // 4194304 == 64 << 16 inside the first rotate-by-16) and the flags word
+ // %r5080; the remaining rounds repeat the quarter-round pattern with the
+ // permuted message schedule over registers %r5081-%r5922
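+ // --- annotation (reviewer sketch, not compiler output) -----------------
+ // Every eight-instruction group in the unrolled rounds above is one ARX
+ // quarter-round; shf.l.wrap.b32 by 16/20/24/25 equals rotate-right by
+ // 16/12/8/7. A minimal CUDA sketch of the pattern, with hypothetical
+ // names:
+ //
+ //   __device__ uint32_t rotr32(uint32_t w, int c) {
+ //       return (w >> c) | (w << (32 - c));
+ //   }
+ //   __device__ void g(uint32_t v[16], int a, int b, int c, int d,
+ //                     uint32_t mx, uint32_t my) {
+ //       v[a] += v[b] + mx;  v[d] = rotr32(v[d] ^ v[a], 16);
+ //       v[c] += v[d];       v[b] = rotr32(v[b] ^ v[c], 12);
+ //       v[a] += v[b] + my;  v[d] = rotr32(v[d] ^ v[a],  8);
+ //       v[c] += v[d];       v[b] = rotr32(v[b] ^ v[c],  7);
+ //   }
+ // ------------------------------------------------------------------------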
+ xor.b32 %r5923, %r5906, %r5875;
+ xor.b32 %r5924, %r5920, %r5889;
+ xor.b32 %r5925, %r5878, %r5903;
+ xor.b32 %r5926, %r5917, %r5892;
+ xor.b32 %r5927, %r5922, %r5891;
+ xor.b32 %r5928, %r5880, %r5905;
+ xor.b32 %r5929, %r5919, %r5894;
+ xor.b32 %r5930, %r5908, %r5877;
+ // byte-wise stores of %r5923-%r5930 back to [%rd179+145..176]
+ // (shr.u32 extracts %r5931-%r5954)
+ ld.local.u8 %rs176, [%rd3+8];
+ add.s16 %rs177, %rs176, -1;
+ st.local.u8 [%rd3+8], %rs177;
+ cvt.u64.u16 %rd180, %rs177;
+ and.b64 %rd181, %rd180, 255;
+ setp.lt.u64 %p30, %rd227, %rd181;
+ and.b16 %rs178, %rs177, 255;
+ mul.wide.u16 %r11663, %rs178, 32;
+ @%p30 bra $L__BB1_34;
+
+$L__BB1_35:
+ cvt.s64.s32 %rd182, %r11663;
+ add.s64 %rd183, %rd2, %rd182;
+ // the eight words %r3967-%r3974 are split into u16 halves and stored
+ // byte-wise to [%rd183+145..176]
+ ld.local.u8 %rs388, [%rd3+8];
+
+$L__BB1_47:
+ add.s16 %rs331, %rs388, 1;
+ st.local.u8 [%rd3+8], %rs331;
+ ld.local.u64 %rd196, [%rd3+-72];
+ shr.u64 %rd197, %rd49, 10;
+ add.s64 %rd251, %rd196, %rd197;
+ st.local.u64 [%rd3+-72], %rd251;
+ add.s64 %rd261, %rd261, %rd49;
+ add.s64 %rd254, %rd254, %rd49;
+ sub.s64 %rd262, %rd262, %rd49;
+ setp.gt.u64 %p39, %rd262, 1024;
+ @%p39 bra $L__BB1_26;
+
+$L__BB1_48:
+ setp.eq.s64 %p40, %rd262, 0;
+ @%p40 bra $L__BB1_68;
+
+ ld.local.u8 %rs389, [%rd3];
+ cvt.u64.u16 %rd71, %rs389;
+ setp.eq.s16 %p41, %rs389, 0;
+ mov.u16 %rs390, 0;
+ mov.u64 %rd271, %rd262;
+ @%p41 bra $L__BB1_57;
+
+ mov.u64 %rd198, 64;
+ sub.s64 %rd199, %rd198, %rd71;
+ min.u64 %rd72, %rd199, %rd262;
+ setp.eq.s64 %p42, %rd72, 0;
+ @%p42 bra $L__BB1_54;
+
+ add.s64 %rd201, %rd2, %rd71;
+ add.s64 %rd73, %rd201, 72;
+ mov.u64 %rd263, 0;
+
+$L__BB1_52:
+ add.s64 %rd202, %rd261, %rd263;
+ ld.local.u8 %rs333, [%rd202];
+ add.s64 %rd203, %rd73, %rd263;
+ st.local.u8 [%rd203], %rs333;
+ add.s64 %rd263, %rd263, 1;
+ setp.lt.u64 %p43, %rd263, %rd72;
+ @%p43 bra $L__BB1_52;
+
+ ld.local.u8 %rs389, [%rd3];
+
+$L__BB1_54:
+ cvt.u16.u64 %rs334, %rd72;
+ add.s16 %rs390, %rs389, %rs334;
+ mov.u64 %rd271, 0;
+ st.local.u8 [%rd3], %rs390;
+ add.s64 %rd261, %rd261, %rd72;
+ sub.s64 %rd77, %rd262, %rd72;
+ setp.eq.s64 %p44, %rd77, 0;
+ @%p44 bra $L__BB1_57;
+
+ add.s64 %rd78, %rd2, 72;
+ ld.local.u8 %rs335, [%rd3+1];
+ mov.u64 %rd264, 0;
+ setp.eq.s16 %p45, %rs335, 0;
+ mov.u16 %rs390, 0;
+ selp.u16 %rs337, 1, 0, %p45;
+ ld.local.u8 %rs338, [%rd3+2];
+ or.b16 %rs339, %rs338, %rs337;
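+ // --- annotation (reviewer sketch, not compiler output) -----------------
+ // The $L__BB1_31/$L__BB1_34 loops above merge sibling CVs and
+ // $L__BB1_47/$L__BB1_48 advance the chunk state: [%rd3+8] holds the
+ // CV-stack height, [%rd3+-72] the chunk counter (bytes >> 10, i.e.
+ // 1024-byte chunks), and the popc.b64 / setp pair keeps merging while the
+ // stack is taller than popcount(chunk_counter). A hedged sketch with
+ // descriptive names not taken from this file:
+ //
+ //   cv_stack[len++] = chunk_cv;
+ //   while (len > __popcll(chunk_counter))            // cf. popc.b64
+ //       --len, cv_stack[len - 1] = parent_cv(cv_stack[len - 1],
+ //                                            cv_stack[len]);
+ // ------------------------------------------------------------------------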
[%rd3+-47]; + prmt.b32 %r8873, %r8872, %r8871, 30212; + ld.local.u8 %r8874, [%rd3+-46]; + prmt.b32 %r8875, %r8874, %r8873, 28756; + ld.local.u8 %r8876, [%rd3+-45]; + prmt.b32 %r8877, %r8876, %r8875, 1620; + ld.local.u8 %r8878, [%rd3+-44]; + ld.local.u8 %r8879, [%rd3+-43]; + prmt.b32 %r8880, %r8879, %r8878, 30212; + ld.local.u8 %r8881, [%rd3+-42]; + prmt.b32 %r8882, %r8881, %r8880, 28756; + ld.local.u8 %r8883, [%rd3+-41]; + prmt.b32 %r8884, %r8883, %r8882, 1620; + ld.local.u8 %r8885, [%rd3+-40]; + ld.local.u8 %r8886, [%rd3+-39]; + prmt.b32 %r8887, %r8886, %r8885, 30212; + ld.local.u8 %r8888, [%rd3+-38]; + prmt.b32 %r8889, %r8888, %r8887, 28756; + ld.local.u8 %r8890, [%rd3+-37]; + prmt.b32 %r8891, %r8890, %r8889, 1620; + ld.local.u8 %r8892, [%rd3+-36]; + ld.local.u8 %r8893, [%rd3+-35]; + prmt.b32 %r8894, %r8893, %r8892, 30212; + ld.local.u8 %r8895, [%rd3+-34]; + prmt.b32 %r8896, %r8895, %r8894, 28756; + ld.local.u8 %r8897, [%rd3+-33]; + prmt.b32 %r8898, %r8897, %r8896, 1620; + ld.local.u8 %r8899, [%rd3+-32]; + ld.local.u8 %r8900, [%rd3+-31]; + prmt.b32 %r8901, %r8900, %r8899, 30212; + ld.local.u8 %r8902, [%rd3+-30]; + prmt.b32 %r8903, %r8902, %r8901, 28756; + ld.local.u8 %r8904, [%rd3+-29]; + prmt.b32 %r8905, %r8904, %r8903, 1620; + ld.local.u8 %r8906, [%rd3+-28]; + ld.local.u8 %r8907, [%rd3+-27]; + prmt.b32 %r8908, %r8907, %r8906, 30212; + ld.local.u8 %r8909, [%rd3+-26]; + prmt.b32 %r8910, %r8909, %r8908, 28756; + ld.local.u8 %r8911, [%rd3+-25]; + prmt.b32 %r8912, %r8911, %r8910, 1620; + ld.local.u8 %r8913, [%rd3+-24]; + ld.local.u8 %r8914, [%rd3+-23]; + prmt.b32 %r8915, %r8914, %r8913, 30212; + ld.local.u8 %r8916, [%rd3+-22]; + prmt.b32 %r8917, %r8916, %r8915, 28756; + ld.local.u8 %r8918, [%rd3+-21]; + prmt.b32 %r8919, %r8918, %r8917, 1620; + ld.local.u8 %r8920, [%rd3+-20]; + ld.local.u8 %r8921, [%rd3+-19]; + prmt.b32 %r8922, %r8921, %r8920, 30212; + ld.local.u8 %r8923, [%rd3+-18]; + prmt.b32 %r8924, %r8923, %r8922, 28756; + ld.local.u8 %r8925, [%rd3+-17]; + prmt.b32 %r8926, %r8925, %r8924, 1620; + ld.local.u8 %r8927, [%rd3+-16]; + ld.local.u8 %r8928, [%rd3+-15]; + prmt.b32 %r8929, %r8928, %r8927, 30212; + ld.local.u8 %r8930, [%rd3+-14]; + prmt.b32 %r8931, %r8930, %r8929, 28756; + ld.local.u8 %r8932, [%rd3+-13]; + prmt.b32 %r8933, %r8932, %r8931, 1620; + ld.local.u8 %r8934, [%rd3+-12]; + ld.local.u8 %r8935, [%rd3+-11]; + prmt.b32 %r8936, %r8935, %r8934, 30212; + ld.local.u8 %r8937, [%rd3+-10]; + prmt.b32 %r8938, %r8937, %r8936, 28756; + ld.local.u8 %r8939, [%rd3+-9]; + prmt.b32 %r8940, %r8939, %r8938, 1620; + ld.local.u8 %r8941, [%rd3+-8]; + ld.local.u8 %r8942, [%rd3+-7]; + prmt.b32 %r8943, %r8942, %r8941, 30212; + ld.local.u8 %r8944, [%rd3+-6]; + prmt.b32 %r8945, %r8944, %r8943, 28756; + ld.local.u8 %r8946, [%rd3+-5]; + prmt.b32 %r8947, %r8946, %r8945, 1620; + ld.local.u8 %r8948, [%rd3+-4]; + ld.local.u8 %r8949, [%rd3+-3]; + prmt.b32 %r8950, %r8949, %r8948, 30212; + ld.local.u8 %r8951, [%rd3+-2]; + prmt.b32 %r8952, %r8951, %r8950, 28756; + ld.local.u8 %r8953, [%rd3+-1]; + prmt.b32 %r8954, %r8953, %r8952, 1620; + ld.local.u64 %rd206, [%rd3+-72]; + cvt.u32.u64 %r8955, %rd206; + shr.u64 %rd207, %rd206, 32; + cvt.u32.u64 %r8956, %rd207; + cvt.u32.u16 %r8957, %rs339; + and.b32 %r8958, %r8957, 255; + ld.local.u32 %r8959, [%rd3+-104]; + add.s32 %r8960, %r8959, %r8849; + ld.local.u32 %r8961, [%rd3+-88]; + add.s32 %r8962, %r8960, %r8961; + xor.b32 %r8963, %r8962, %r8955; + shf.l.wrap.b32 %r8964, %r8963, %r8963, 16; + add.s32 %r8965, %r8964, 1779033703; + xor.b32 %r8966, %r8965, %r8961; + 
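+ // Each ld.local.u8 / prmt.b32 triple above (selectors 30212, 28756, 1620)
+ // assembles four consecutive bytes into one little-endian 32-bit word, so
+ // %r8849..%r8954 hold a 64-byte message block as sixteen words. Note that
+ // shf.l.wrap.b32 d, a, a, n is the PTX idiom for a 32-bit rotate-left by n,
+ // so the rotations by 16, 20, 24 and 25 here are rotate-rights by 16, 12,
+ // 8 and 7 -- the G-function rotation amounts used by BLAKE2s/BLAKE3.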
shf.l.wrap.b32 %r8967, %r8966, %r8966, 20; + add.s32 %r8968, %r8962, %r8856; + add.s32 %r8969, %r8968, %r8967; + xor.b32 %r8970, %r8969, %r8964; + shf.l.wrap.b32 %r8971, %r8970, %r8970, 24; + add.s32 %r8972, %r8971, %r8965; + xor.b32 %r8973, %r8972, %r8967; + shf.l.wrap.b32 %r8974, %r8973, %r8973, 25; + ld.local.u32 %r8975, [%rd3+-100]; + add.s32 %r8976, %r8975, %r8863; + ld.local.u32 %r8977, [%rd3+-84]; + add.s32 %r8978, %r8976, %r8977; + xor.b32 %r8979, %r8978, %r8956; + shf.l.wrap.b32 %r8980, %r8979, %r8979, 16; + add.s32 %r8981, %r8980, -1150833019; + xor.b32 %r8982, %r8981, %r8977; + shf.l.wrap.b32 %r8983, %r8982, %r8982, 20; + add.s32 %r8984, %r8978, %r8870; + add.s32 %r8985, %r8984, %r8983; + xor.b32 %r8986, %r8985, %r8980; + shf.l.wrap.b32 %r8987, %r8986, %r8986, 24; + add.s32 %r8988, %r8987, %r8981; + xor.b32 %r8989, %r8988, %r8983; + shf.l.wrap.b32 %r8990, %r8989, %r8989, 25; + ld.local.u32 %r8991, [%rd3+-96]; + add.s32 %r8992, %r8991, %r8877; + ld.local.u32 %r8993, [%rd3+-80]; + add.s32 %r8994, %r8992, %r8993; + shr.u32 %r8995, %r8994, 16; + shl.b32 %r8996, %r8994, 16; + xor.b32 %r8997, %r8996, 4194304; + or.b32 %r8998, %r8997, %r8995; + add.s32 %r8999, %r8998, 1013904242; + xor.b32 %r9000, %r8999, %r8993; + shf.l.wrap.b32 %r9001, %r9000, %r9000, 20; + add.s32 %r9002, %r8994, %r8884; + add.s32 %r9003, %r9002, %r9001; + xor.b32 %r9004, %r9003, %r8998; + shf.l.wrap.b32 %r9005, %r9004, %r9004, 24; + add.s32 %r9006, %r9005, %r8999; + xor.b32 %r9007, %r9006, %r9001; + shf.l.wrap.b32 %r9008, %r9007, %r9007, 25; + ld.local.u32 %r9009, [%rd3+-92]; + add.s32 %r9010, %r9009, %r8891; + ld.local.u32 %r9011, [%rd3+-76]; + add.s32 %r9012, %r9010, %r9011; + xor.b32 %r9013, %r9012, %r8958; + shr.u32 %r9014, %r9012, 16; + shl.b32 %r9015, %r9013, 16; + or.b32 %r9016, %r9015, %r9014; + add.s32 %r9017, %r9016, -1521486534; + xor.b32 %r9018, %r9017, %r9011; + shf.l.wrap.b32 %r9019, %r9018, %r9018, 20; + add.s32 %r9020, %r9012, %r8898; + add.s32 %r9021, %r9020, %r9019; + xor.b32 %r9022, %r9021, %r9016; + shf.l.wrap.b32 %r9023, %r9022, %r9022, 24; + add.s32 %r9024, %r9023, %r9017; + xor.b32 %r9025, %r9024, %r9019; + shf.l.wrap.b32 %r9026, %r9025, %r9025, 25; + add.s32 %r9027, %r8969, %r8905; + add.s32 %r9028, %r9027, %r8990; + xor.b32 %r9029, %r9028, %r9023; + shf.l.wrap.b32 %r9030, %r9029, %r9029, 16; + add.s32 %r9031, %r9030, %r9006; + xor.b32 %r9032, %r9031, %r8990; + shf.l.wrap.b32 %r9033, %r9032, %r9032, 20; + add.s32 %r9034, %r9028, %r8912; + add.s32 %r9035, %r9034, %r9033; + xor.b32 %r9036, %r9035, %r9030; + shf.l.wrap.b32 %r9037, %r9036, %r9036, 24; + add.s32 %r9038, %r9037, %r9031; + xor.b32 %r9039, %r9038, %r9033; + shf.l.wrap.b32 %r9040, %r9039, %r9039, 25; + add.s32 %r9041, %r8985, %r8919; + add.s32 %r9042, %r9041, %r9008; + xor.b32 %r9043, %r9042, %r8971; + shf.l.wrap.b32 %r9044, %r9043, %r9043, 16; + add.s32 %r9045, %r9044, %r9024; + xor.b32 %r9046, %r9045, %r9008; + shf.l.wrap.b32 %r9047, %r9046, %r9046, 20; + add.s32 %r9048, %r9042, %r8926; + add.s32 %r9049, %r9048, %r9047; + xor.b32 %r9050, %r9049, %r9044; + shf.l.wrap.b32 %r9051, %r9050, %r9050, 24; + add.s32 %r9052, %r9051, %r9045; + xor.b32 %r9053, %r9052, %r9047; + shf.l.wrap.b32 %r9054, %r9053, %r9053, 25; + add.s32 %r9055, %r9003, %r8933; + add.s32 %r9056, %r9055, %r9026; + xor.b32 %r9057, %r9056, %r8987; + shf.l.wrap.b32 %r9058, %r9057, %r9057, 16; + add.s32 %r9059, %r9058, %r8972; + xor.b32 %r9060, %r9059, %r9026; + shf.l.wrap.b32 %r9061, %r9060, %r9060, 20; + add.s32 %r9062, %r9056, %r8940; + add.s32 %r9063, %r9062, %r9061; + 
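+ // The add-constants 1779033703, -1150833019, 1013904242 and -1521486534 are
+ // 0x6A09E667, 0xBB67AE85, 0x3C6EF372 and 0xA54FF53A, the first four IV words
+ // of BLAKE2s/BLAKE3; the xor.b32 with 4194304 (64 << 16) above appears to be
+ // the 64-byte block-length word folded into a pre-rotated constant by the
+ // compiler.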
xor.b32 %r9064, %r9063, %r9058; + shf.l.wrap.b32 %r9065, %r9064, %r9064, 24; + add.s32 %r9066, %r9065, %r9059; + xor.b32 %r9067, %r9066, %r9061; + shf.l.wrap.b32 %r9068, %r9067, %r9067, 25; + add.s32 %r9069, %r9021, %r8947; + add.s32 %r9070, %r9069, %r8974; + xor.b32 %r9071, %r9070, %r9005; + shf.l.wrap.b32 %r9072, %r9071, %r9071, 16; + add.s32 %r9073, %r9072, %r8988; + xor.b32 %r9074, %r9073, %r8974; + shf.l.wrap.b32 %r9075, %r9074, %r9074, 20; + add.s32 %r9076, %r9070, %r8954; + add.s32 %r9077, %r9076, %r9075; + xor.b32 %r9078, %r9077, %r9072; + shf.l.wrap.b32 %r9079, %r9078, %r9078, 24; + add.s32 %r9080, %r9079, %r9073; + xor.b32 %r9081, %r9080, %r9075; + shf.l.wrap.b32 %r9082, %r9081, %r9081, 25; + add.s32 %r9083, %r9035, %r8863; + add.s32 %r9084, %r9083, %r9082; + xor.b32 %r9085, %r9084, %r9051; + shf.l.wrap.b32 %r9086, %r9085, %r9085, 16; + add.s32 %r9087, %r9086, %r9066; + xor.b32 %r9088, %r9087, %r9082; + shf.l.wrap.b32 %r9089, %r9088, %r9088, 20; + add.s32 %r9090, %r9084, %r8891; + add.s32 %r9091, %r9090, %r9089; + xor.b32 %r9092, %r9091, %r9086; + shf.l.wrap.b32 %r9093, %r9092, %r9092, 24; + add.s32 %r9094, %r9093, %r9087; + xor.b32 %r9095, %r9094, %r9089; + shf.l.wrap.b32 %r9096, %r9095, %r9095, 25; + add.s32 %r9097, %r9049, %r8870; + add.s32 %r9098, %r9097, %r9040; + xor.b32 %r9099, %r9098, %r9065; + shf.l.wrap.b32 %r9100, %r9099, %r9099, 16; + add.s32 %r9101, %r9100, %r9080; + xor.b32 %r9102, %r9101, %r9040; + shf.l.wrap.b32 %r9103, %r9102, %r9102, 20; + add.s32 %r9104, %r9098, %r8919; + add.s32 %r9105, %r9104, %r9103; + xor.b32 %r9106, %r9105, %r9100; + shf.l.wrap.b32 %r9107, %r9106, %r9106, 24; + add.s32 %r9108, %r9107, %r9101; + xor.b32 %r9109, %r9108, %r9103; + shf.l.wrap.b32 %r9110, %r9109, %r9109, 25; + add.s32 %r9111, %r9063, %r8898; + add.s32 %r9112, %r9111, %r9054; + xor.b32 %r9113, %r9112, %r9079; + shf.l.wrap.b32 %r9114, %r9113, %r9113, 16; + add.s32 %r9115, %r9114, %r9038; + xor.b32 %r9116, %r9115, %r9054; + shf.l.wrap.b32 %r9117, %r9116, %r9116, 20; + add.s32 %r9118, %r9112, %r8849; + add.s32 %r9119, %r9118, %r9117; + xor.b32 %r9120, %r9119, %r9114; + shf.l.wrap.b32 %r9121, %r9120, %r9120, 24; + add.s32 %r9122, %r9121, %r9115; + xor.b32 %r9123, %r9122, %r9117; + shf.l.wrap.b32 %r9124, %r9123, %r9123, 25; + add.s32 %r9125, %r9077, %r8877; + add.s32 %r9126, %r9125, %r9068; + xor.b32 %r9127, %r9126, %r9037; + shf.l.wrap.b32 %r9128, %r9127, %r9127, 16; + add.s32 %r9129, %r9128, %r9052; + xor.b32 %r9130, %r9129, %r9068; + shf.l.wrap.b32 %r9131, %r9130, %r9130, 20; + add.s32 %r9132, %r9126, %r8940; + add.s32 %r9133, %r9132, %r9131; + xor.b32 %r9134, %r9133, %r9128; + shf.l.wrap.b32 %r9135, %r9134, %r9134, 24; + add.s32 %r9136, %r9135, %r9129; + xor.b32 %r9137, %r9136, %r9131; + shf.l.wrap.b32 %r9138, %r9137, %r9137, 25; + add.s32 %r9139, %r9091, %r8856; + add.s32 %r9140, %r9139, %r9110; + xor.b32 %r9141, %r9140, %r9135; + shf.l.wrap.b32 %r9142, %r9141, %r9141, 16; + add.s32 %r9143, %r9142, %r9122; + xor.b32 %r9144, %r9143, %r9110; + shf.l.wrap.b32 %r9145, %r9144, %r9144, 20; + add.s32 %r9146, %r9140, %r8926; + add.s32 %r9147, %r9146, %r9145; + xor.b32 %r9148, %r9147, %r9142; + shf.l.wrap.b32 %r9149, %r9148, %r9148, 24; + add.s32 %r9150, %r9149, %r9143; + xor.b32 %r9151, %r9150, %r9145; + shf.l.wrap.b32 %r9152, %r9151, %r9151, 25; + add.s32 %r9153, %r9105, %r8933; + add.s32 %r9154, %r9153, %r9124; + xor.b32 %r9155, %r9154, %r9093; + shf.l.wrap.b32 %r9156, %r9155, %r9155, 16; + add.s32 %r9157, %r9156, %r9136; + xor.b32 %r9158, %r9157, %r9124; + shf.l.wrap.b32 %r9159, 
%r9158, %r9158, 20; + add.s32 %r9160, %r9154, %r8884; + add.s32 %r9161, %r9160, %r9159; + xor.b32 %r9162, %r9161, %r9156; + shf.l.wrap.b32 %r9163, %r9162, %r9162, 24; + add.s32 %r9164, %r9163, %r9157; + xor.b32 %r9165, %r9164, %r9159; + shf.l.wrap.b32 %r9166, %r9165, %r9165, 25; + add.s32 %r9167, %r9119, %r8912; + add.s32 %r9168, %r9167, %r9138; + xor.b32 %r9169, %r9168, %r9107; + shf.l.wrap.b32 %r9170, %r9169, %r9169, 16; + add.s32 %r9171, %r9170, %r9094; + xor.b32 %r9172, %r9171, %r9138; + shf.l.wrap.b32 %r9173, %r9172, %r9172, 20; + add.s32 %r9174, %r9168, %r8947; + add.s32 %r9175, %r9174, %r9173; + xor.b32 %r9176, %r9175, %r9170; + shf.l.wrap.b32 %r9177, %r9176, %r9176, 24; + add.s32 %r9178, %r9177, %r9171; + xor.b32 %r9179, %r9178, %r9173; + shf.l.wrap.b32 %r9180, %r9179, %r9179, 25; + add.s32 %r9181, %r9133, %r8954; + add.s32 %r9182, %r9181, %r9096; + xor.b32 %r9183, %r9182, %r9121; + shf.l.wrap.b32 %r9184, %r9183, %r9183, 16; + add.s32 %r9185, %r9184, %r9108; + xor.b32 %r9186, %r9185, %r9096; + shf.l.wrap.b32 %r9187, %r9186, %r9186, 20; + add.s32 %r9188, %r9182, %r8905; + add.s32 %r9189, %r9188, %r9187; + xor.b32 %r9190, %r9189, %r9184; + shf.l.wrap.b32 %r9191, %r9190, %r9190, 24; + add.s32 %r9192, %r9191, %r9185; + xor.b32 %r9193, %r9192, %r9187; + shf.l.wrap.b32 %r9194, %r9193, %r9193, 25; + add.s32 %r9195, %r9147, %r8870; + add.s32 %r9196, %r9195, %r9194; + xor.b32 %r9197, %r9196, %r9163; + shf.l.wrap.b32 %r9198, %r9197, %r9197, 16; + add.s32 %r9199, %r9198, %r9178; + xor.b32 %r9200, %r9199, %r9194; + shf.l.wrap.b32 %r9201, %r9200, %r9200, 20; + add.s32 %r9202, %r9196, %r8877; + add.s32 %r9203, %r9202, %r9201; + xor.b32 %r9204, %r9203, %r9198; + shf.l.wrap.b32 %r9205, %r9204, %r9204, 24; + add.s32 %r9206, %r9205, %r9199; + xor.b32 %r9207, %r9206, %r9201; + shf.l.wrap.b32 %r9208, %r9207, %r9207, 25; + add.s32 %r9209, %r9161, %r8919; + add.s32 %r9210, %r9209, %r9152; + xor.b32 %r9211, %r9210, %r9177; + shf.l.wrap.b32 %r9212, %r9211, %r9211, 16; + add.s32 %r9213, %r9212, %r9192; + xor.b32 %r9214, %r9213, %r9152; + shf.l.wrap.b32 %r9215, %r9214, %r9214, 20; + add.s32 %r9216, %r9210, %r8933; + add.s32 %r9217, %r9216, %r9215; + xor.b32 %r9218, %r9217, %r9212; + shf.l.wrap.b32 %r9219, %r9218, %r9218, 24; + add.s32 %r9220, %r9219, %r9213; + xor.b32 %r9221, %r9220, %r9215; + shf.l.wrap.b32 %r9222, %r9221, %r9221, 25; + add.s32 %r9223, %r9175, %r8940; + add.s32 %r9224, %r9223, %r9166; + xor.b32 %r9225, %r9224, %r9191; + shf.l.wrap.b32 %r9226, %r9225, %r9225, 16; + add.s32 %r9227, %r9226, %r9150; + xor.b32 %r9228, %r9227, %r9166; + shf.l.wrap.b32 %r9229, %r9228, %r9228, 20; + add.s32 %r9230, %r9224, %r8863; + add.s32 %r9231, %r9230, %r9229; + xor.b32 %r9232, %r9231, %r9226; + shf.l.wrap.b32 %r9233, %r9232, %r9232, 24; + add.s32 %r9234, %r9233, %r9227; + xor.b32 %r9235, %r9234, %r9229; + shf.l.wrap.b32 %r9236, %r9235, %r9235, 25; + add.s32 %r9237, %r9189, %r8898; + add.s32 %r9238, %r9237, %r9180; + xor.b32 %r9239, %r9238, %r9149; + shf.l.wrap.b32 %r9240, %r9239, %r9239, 16; + add.s32 %r9241, %r9240, %r9164; + xor.b32 %r9242, %r9241, %r9180; + shf.l.wrap.b32 %r9243, %r9242, %r9242, 20; + add.s32 %r9244, %r9238, %r8947; + add.s32 %r9245, %r9244, %r9243; + xor.b32 %r9246, %r9245, %r9240; + shf.l.wrap.b32 %r9247, %r9246, %r9246, 24; + add.s32 %r9248, %r9247, %r9241; + xor.b32 %r9249, %r9248, %r9243; + shf.l.wrap.b32 %r9250, %r9249, %r9249, 25; + add.s32 %r9251, %r9203, %r8891; + add.s32 %r9252, %r9251, %r9222; + xor.b32 %r9253, %r9252, %r9247; + shf.l.wrap.b32 %r9254, %r9253, %r9253, 16; + 
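+ // One fully unrolled round = the add/xor/rotate quarter-round applied to the
+ // four state columns and then to the four diagonals. The ever-increasing
+ // virtual registers (%r92xx, %r93xx, ...) are the compiler's SSA-style
+ // renaming, not distinct hardware registers.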
add.s32 %r9255, %r9254, %r9234; + xor.b32 %r9256, %r9255, %r9222; + shf.l.wrap.b32 %r9257, %r9256, %r9256, 20; + add.s32 %r9258, %r9252, %r8884; + add.s32 %r9259, %r9258, %r9257; + xor.b32 %r9260, %r9259, %r9254; + shf.l.wrap.b32 %r9261, %r9260, %r9260, 24; + add.s32 %r9262, %r9261, %r9255; + xor.b32 %r9263, %r9262, %r9257; + shf.l.wrap.b32 %r9264, %r9263, %r9263, 25; + add.s32 %r9265, %r9217, %r8912; + add.s32 %r9266, %r9265, %r9236; + xor.b32 %r9267, %r9266, %r9205; + shf.l.wrap.b32 %r9268, %r9267, %r9267, 16; + add.s32 %r9269, %r9268, %r9248; + xor.b32 %r9270, %r9269, %r9236; + shf.l.wrap.b32 %r9271, %r9270, %r9270, 20; + add.s32 %r9272, %r9266, %r8849; + add.s32 %r9273, %r9272, %r9271; + xor.b32 %r9274, %r9273, %r9268; + shf.l.wrap.b32 %r9275, %r9274, %r9274, 24; + add.s32 %r9276, %r9275, %r9269; + xor.b32 %r9277, %r9276, %r9271; + shf.l.wrap.b32 %r9278, %r9277, %r9277, 25; + add.s32 %r9279, %r9231, %r8926; + add.s32 %r9280, %r9279, %r9250; + xor.b32 %r9281, %r9280, %r9219; + shf.l.wrap.b32 %r9282, %r9281, %r9281, 16; + add.s32 %r9283, %r9282, %r9206; + xor.b32 %r9284, %r9283, %r9250; + shf.l.wrap.b32 %r9285, %r9284, %r9284, 20; + add.s32 %r9286, %r9280, %r8954; + add.s32 %r9287, %r9286, %r9285; + xor.b32 %r9288, %r9287, %r9282; + shf.l.wrap.b32 %r9289, %r9288, %r9288, 24; + add.s32 %r9290, %r9289, %r9283; + xor.b32 %r9291, %r9290, %r9285; + shf.l.wrap.b32 %r9292, %r9291, %r9291, 25; + add.s32 %r9293, %r9245, %r8905; + add.s32 %r9294, %r9293, %r9208; + xor.b32 %r9295, %r9294, %r9233; + shf.l.wrap.b32 %r9296, %r9295, %r9295, 16; + add.s32 %r9297, %r9296, %r9220; + xor.b32 %r9298, %r9297, %r9208; + shf.l.wrap.b32 %r9299, %r9298, %r9298, 20; + add.s32 %r9300, %r9294, %r8856; + add.s32 %r9301, %r9300, %r9299; + xor.b32 %r9302, %r9301, %r9296; + shf.l.wrap.b32 %r9303, %r9302, %r9302, 24; + add.s32 %r9304, %r9303, %r9297; + xor.b32 %r9305, %r9304, %r9299; + shf.l.wrap.b32 %r9306, %r9305, %r9305, 25; + add.s32 %r9307, %r9259, %r8919; + add.s32 %r9308, %r9307, %r9306; + xor.b32 %r9309, %r9308, %r9275; + shf.l.wrap.b32 %r9310, %r9309, %r9309, 16; + add.s32 %r9311, %r9310, %r9290; + xor.b32 %r9312, %r9311, %r9306; + shf.l.wrap.b32 %r9313, %r9312, %r9312, 20; + add.s32 %r9314, %r9308, %r8898; + add.s32 %r9315, %r9314, %r9313; + xor.b32 %r9316, %r9315, %r9310; + shf.l.wrap.b32 %r9317, %r9316, %r9316, 24; + add.s32 %r9318, %r9317, %r9311; + xor.b32 %r9319, %r9318, %r9313; + shf.l.wrap.b32 %r9320, %r9319, %r9319, 25; + add.s32 %r9321, %r9273, %r8933; + add.s32 %r9322, %r9321, %r9264; + xor.b32 %r9323, %r9322, %r9289; + shf.l.wrap.b32 %r9324, %r9323, %r9323, 16; + add.s32 %r9325, %r9324, %r9304; + xor.b32 %r9326, %r9325, %r9264; + shf.l.wrap.b32 %r9327, %r9326, %r9326, 20; + add.s32 %r9328, %r9322, %r8912; + add.s32 %r9329, %r9328, %r9327; + xor.b32 %r9330, %r9329, %r9324; + shf.l.wrap.b32 %r9331, %r9330, %r9330, 24; + add.s32 %r9332, %r9331, %r9325; + xor.b32 %r9333, %r9332, %r9327; + shf.l.wrap.b32 %r9334, %r9333, %r9333, 25; + add.s32 %r9335, %r9287, %r8947; + add.s32 %r9336, %r9335, %r9278; + xor.b32 %r9337, %r9336, %r9303; + shf.l.wrap.b32 %r9338, %r9337, %r9337, 16; + add.s32 %r9339, %r9338, %r9262; + xor.b32 %r9340, %r9339, %r9278; + shf.l.wrap.b32 %r9341, %r9340, %r9340, 20; + add.s32 %r9342, %r9336, %r8870; + add.s32 %r9343, %r9342, %r9341; + xor.b32 %r9344, %r9343, %r9338; + shf.l.wrap.b32 %r9345, %r9344, %r9344, 24; + add.s32 %r9346, %r9345, %r9339; + xor.b32 %r9347, %r9346, %r9341; + shf.l.wrap.b32 %r9348, %r9347, %r9347, 25; + add.s32 %r9349, %r9301, %r8940; + add.s32 %r9350, %r9349, 
%r9292; + xor.b32 %r9351, %r9350, %r9261; + shf.l.wrap.b32 %r9352, %r9351, %r9351, 16; + add.s32 %r9353, %r9352, %r9276; + xor.b32 %r9354, %r9353, %r9292; + shf.l.wrap.b32 %r9355, %r9354, %r9354, 20; + add.s32 %r9356, %r9350, %r8954; + add.s32 %r9357, %r9356, %r9355; + xor.b32 %r9358, %r9357, %r9352; + shf.l.wrap.b32 %r9359, %r9358, %r9358, 24; + add.s32 %r9360, %r9359, %r9353; + xor.b32 %r9361, %r9360, %r9355; + shf.l.wrap.b32 %r9362, %r9361, %r9361, 25; + add.s32 %r9363, %r9315, %r8877; + add.s32 %r9364, %r9363, %r9334; + xor.b32 %r9365, %r9364, %r9359; + shf.l.wrap.b32 %r9366, %r9365, %r9365, 16; + add.s32 %r9367, %r9366, %r9346; + xor.b32 %r9368, %r9367, %r9334; + shf.l.wrap.b32 %r9369, %r9368, %r9368, 20; + add.s32 %r9370, %r9364, %r8849; + add.s32 %r9371, %r9370, %r9369; + xor.b32 %r9372, %r9371, %r9366; + shf.l.wrap.b32 %r9373, %r9372, %r9372, 24; + add.s32 %r9374, %r9373, %r9367; + xor.b32 %r9375, %r9374, %r9369; + shf.l.wrap.b32 %r9376, %r9375, %r9375, 25; + add.s32 %r9377, %r9329, %r8926; + add.s32 %r9378, %r9377, %r9348; + xor.b32 %r9379, %r9378, %r9317; + shf.l.wrap.b32 %r9380, %r9379, %r9379, 16; + add.s32 %r9381, %r9380, %r9360; + xor.b32 %r9382, %r9381, %r9348; + shf.l.wrap.b32 %r9383, %r9382, %r9382, 20; + add.s32 %r9384, %r9378, %r8863; + add.s32 %r9385, %r9384, %r9383; + xor.b32 %r9386, %r9385, %r9380; + shf.l.wrap.b32 %r9387, %r9386, %r9386, 24; + add.s32 %r9388, %r9387, %r9381; + xor.b32 %r9389, %r9388, %r9383; + shf.l.wrap.b32 %r9390, %r9389, %r9389, 25; + add.s32 %r9391, %r9343, %r8884; + add.s32 %r9392, %r9391, %r9362; + xor.b32 %r9393, %r9392, %r9331; + shf.l.wrap.b32 %r9394, %r9393, %r9393, 16; + add.s32 %r9395, %r9394, %r9318; + xor.b32 %r9396, %r9395, %r9362; + shf.l.wrap.b32 %r9397, %r9396, %r9396, 20; + add.s32 %r9398, %r9392, %r8905; + add.s32 %r9399, %r9398, %r9397; + xor.b32 %r9400, %r9399, %r9394; + shf.l.wrap.b32 %r9401, %r9400, %r9400, 24; + add.s32 %r9402, %r9401, %r9395; + xor.b32 %r9403, %r9402, %r9397; + shf.l.wrap.b32 %r9404, %r9403, %r9403, 25; + add.s32 %r9405, %r9357, %r8856; + add.s32 %r9406, %r9405, %r9320; + xor.b32 %r9407, %r9406, %r9345; + shf.l.wrap.b32 %r9408, %r9407, %r9407, 16; + add.s32 %r9409, %r9408, %r9332; + xor.b32 %r9410, %r9409, %r9320; + shf.l.wrap.b32 %r9411, %r9410, %r9410, 20; + add.s32 %r9412, %r9406, %r8891; + add.s32 %r9413, %r9412, %r9411; + xor.b32 %r9414, %r9413, %r9408; + shf.l.wrap.b32 %r9415, %r9414, %r9414, 24; + add.s32 %r9416, %r9415, %r9409; + xor.b32 %r9417, %r9416, %r9411; + shf.l.wrap.b32 %r9418, %r9417, %r9417, 25; + add.s32 %r9419, %r9371, %r8933; + add.s32 %r9420, %r9419, %r9418; + xor.b32 %r9421, %r9420, %r9387; + shf.l.wrap.b32 %r9422, %r9421, %r9421, 16; + add.s32 %r9423, %r9422, %r9402; + xor.b32 %r9424, %r9423, %r9418; + shf.l.wrap.b32 %r9425, %r9424, %r9424, 20; + add.s32 %r9426, %r9420, %r8940; + add.s32 %r9427, %r9426, %r9425; + xor.b32 %r9428, %r9427, %r9422; + shf.l.wrap.b32 %r9429, %r9428, %r9428, 24; + add.s32 %r9430, %r9429, %r9423; + xor.b32 %r9431, %r9430, %r9425; + shf.l.wrap.b32 %r9432, %r9431, %r9431, 25; + add.s32 %r9433, %r9385, %r8912; + add.s32 %r9434, %r9433, %r9376; + xor.b32 %r9435, %r9434, %r9401; + shf.l.wrap.b32 %r9436, %r9435, %r9435, 16; + add.s32 %r9437, %r9436, %r9416; + xor.b32 %r9438, %r9437, %r9376; + shf.l.wrap.b32 %r9439, %r9438, %r9438, 20; + add.s32 %r9440, %r9434, %r8926; + add.s32 %r9441, %r9440, %r9439; + xor.b32 %r9442, %r9441, %r9436; + shf.l.wrap.b32 %r9443, %r9442, %r9442, 24; + add.s32 %r9444, %r9443, %r9437; + xor.b32 %r9445, %r9444, %r9439; + shf.l.wrap.b32 
%r9446, %r9445, %r9445, 25; + add.s32 %r9447, %r9399, %r8954; + add.s32 %r9448, %r9447, %r9390; + xor.b32 %r9449, %r9448, %r9415; + shf.l.wrap.b32 %r9450, %r9449, %r9449, 16; + add.s32 %r9451, %r9450, %r9374; + xor.b32 %r9452, %r9451, %r9390; + shf.l.wrap.b32 %r9453, %r9452, %r9452, 20; + add.s32 %r9454, %r9448, %r8919; + add.s32 %r9455, %r9454, %r9453; + xor.b32 %r9456, %r9455, %r9450; + shf.l.wrap.b32 %r9457, %r9456, %r9456, 24; + add.s32 %r9458, %r9457, %r9451; + xor.b32 %r9459, %r9458, %r9453; + shf.l.wrap.b32 %r9460, %r9459, %r9459, 25; + add.s32 %r9461, %r9413, %r8947; + add.s32 %r9462, %r9461, %r9404; + xor.b32 %r9463, %r9462, %r9373; + shf.l.wrap.b32 %r9464, %r9463, %r9463, 16; + add.s32 %r9465, %r9464, %r9388; + xor.b32 %r9466, %r9465, %r9404; + shf.l.wrap.b32 %r9467, %r9466, %r9466, 20; + add.s32 %r9468, %r9462, %r8905; + add.s32 %r9469, %r9468, %r9467; + xor.b32 %r9470, %r9469, %r9464; + shf.l.wrap.b32 %r9471, %r9470, %r9470, 24; + add.s32 %r9472, %r9471, %r9465; + xor.b32 %r9473, %r9472, %r9467; + shf.l.wrap.b32 %r9474, %r9473, %r9473, 25; + add.s32 %r9475, %r9427, %r8898; + add.s32 %r9476, %r9475, %r9446; + xor.b32 %r9477, %r9476, %r9471; + shf.l.wrap.b32 %r9478, %r9477, %r9477, 16; + add.s32 %r9479, %r9478, %r9458; + xor.b32 %r9480, %r9479, %r9446; + shf.l.wrap.b32 %r9481, %r9480, %r9480, 20; + add.s32 %r9482, %r9476, %r8863; + add.s32 %r9483, %r9482, %r9481; + xor.b32 %r9484, %r9483, %r9478; + shf.l.wrap.b32 %r9485, %r9484, %r9484, 24; + add.s32 %r9486, %r9485, %r9479; + xor.b32 %r9487, %r9486, %r9481; + shf.l.wrap.b32 %r9488, %r9487, %r9487, 25; + add.s32 %r9489, %r9441, %r8884; + add.s32 %r9490, %r9489, %r9460; + xor.b32 %r9491, %r9490, %r9429; + shf.l.wrap.b32 %r9492, %r9491, %r9491, 16; + add.s32 %r9493, %r9492, %r9472; + xor.b32 %r9494, %r9493, %r9460; + shf.l.wrap.b32 %r9495, %r9494, %r9494, 20; + add.s32 %r9496, %r9490, %r8870; + add.s32 %r9497, %r9496, %r9495; + xor.b32 %r9498, %r9497, %r9492; + shf.l.wrap.b32 %r9499, %r9498, %r9498, 24; + add.s32 %r9500, %r9499, %r9493; + xor.b32 %r9501, %r9500, %r9495; + shf.l.wrap.b32 %r9502, %r9501, %r9501, 25; + add.s32 %r9503, %r9455, %r8849; + add.s32 %r9504, %r9503, %r9474; + xor.b32 %r9505, %r9504, %r9443; + shf.l.wrap.b32 %r9506, %r9505, %r9505, 16; + add.s32 %r9507, %r9506, %r9430; + xor.b32 %r9508, %r9507, %r9474; + shf.l.wrap.b32 %r9509, %r9508, %r9508, 20; + add.s32 %r9510, %r9504, %r8856; + add.s32 %r9511, %r9510, %r9509; + xor.b32 %r9512, %r9511, %r9506; + shf.l.wrap.b32 %r9513, %r9512, %r9512, 24; + add.s32 %r9514, %r9513, %r9507; + xor.b32 %r9515, %r9514, %r9509; + shf.l.wrap.b32 %r9516, %r9515, %r9515, 25; + add.s32 %r9517, %r9469, %r8891; + add.s32 %r9518, %r9517, %r9432; + xor.b32 %r9519, %r9518, %r9457; + shf.l.wrap.b32 %r9520, %r9519, %r9519, 16; + add.s32 %r9521, %r9520, %r9444; + xor.b32 %r9522, %r9521, %r9432; + shf.l.wrap.b32 %r9523, %r9522, %r9522, 20; + add.s32 %r9524, %r9518, %r8877; + add.s32 %r9525, %r9524, %r9523; + xor.b32 %r9526, %r9525, %r9520; + shf.l.wrap.b32 %r9527, %r9526, %r9526, 24; + add.s32 %r9528, %r9527, %r9521; + xor.b32 %r9529, %r9528, %r9523; + shf.l.wrap.b32 %r9530, %r9529, %r9529, 25; + add.s32 %r9531, %r9483, %r8912; + add.s32 %r9532, %r9531, %r9530; + xor.b32 %r9533, %r9532, %r9499; + shf.l.wrap.b32 %r9534, %r9533, %r9533, 16; + add.s32 %r9535, %r9534, %r9514; + xor.b32 %r9536, %r9535, %r9530; + shf.l.wrap.b32 %r9537, %r9536, %r9536, 20; + add.s32 %r9538, %r9532, %r8947; + add.s32 %r9539, %r9538, %r9537; + xor.b32 %r9540, %r9539, %r9534; + shf.l.wrap.b32 %r9541, %r9540, %r9540, 24; 
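+ // Successive rounds re-add the same sixteen message words in a different
+ // order each time, consistent with the per-round message-word permutation of
+ // the BLAKE3 compression function (which runs seven such rounds).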
+ add.s32 %r9542, %r9541, %r9535; + xor.b32 %r9543, %r9542, %r9537; + shf.l.wrap.b32 %r9544, %r9543, %r9543, 25; + add.s32 %r9545, %r9497, %r8926; + add.s32 %r9546, %r9545, %r9488; + xor.b32 %r9547, %r9546, %r9513; + shf.l.wrap.b32 %r9548, %r9547, %r9547, 16; + add.s32 %r9549, %r9548, %r9528; + xor.b32 %r9550, %r9549, %r9488; + shf.l.wrap.b32 %r9551, %r9550, %r9550, 20; + add.s32 %r9552, %r9546, %r8884; + add.s32 %r9553, %r9552, %r9551; + xor.b32 %r9554, %r9553, %r9548; + shf.l.wrap.b32 %r9555, %r9554, %r9554, 24; + add.s32 %r9556, %r9555, %r9549; + xor.b32 %r9557, %r9556, %r9551; + shf.l.wrap.b32 %r9558, %r9557, %r9557, 25; + add.s32 %r9559, %r9511, %r8905; + add.s32 %r9560, %r9559, %r9502; + xor.b32 %r9561, %r9560, %r9527; + shf.l.wrap.b32 %r9562, %r9561, %r9561, 16; + add.s32 %r9563, %r9562, %r9486; + xor.b32 %r9564, %r9563, %r9502; + shf.l.wrap.b32 %r9565, %r9564, %r9564, 20; + add.s32 %r9566, %r9560, %r8933; + add.s32 %r9567, %r9566, %r9565; + xor.b32 %r9568, %r9567, %r9562; + shf.l.wrap.b32 %r9569, %r9568, %r9568, 24; + add.s32 %r9570, %r9569, %r9563; + xor.b32 %r9571, %r9570, %r9565; + shf.l.wrap.b32 %r9572, %r9571, %r9571, 25; + add.s32 %r9573, %r9525, %r8954; + add.s32 %r9574, %r9573, %r9516; + xor.b32 %r9575, %r9574, %r9485; + shf.l.wrap.b32 %r9576, %r9575, %r9575, 16; + add.s32 %r9577, %r9576, %r9500; + xor.b32 %r9578, %r9577, %r9516; + shf.l.wrap.b32 %r9579, %r9578, %r9578, 20; + add.s32 %r9580, %r9574, %r8856; + add.s32 %r9581, %r9580, %r9579; + xor.b32 %r9582, %r9581, %r9576; + shf.l.wrap.b32 %r9583, %r9582, %r9582, 24; + add.s32 %r9584, %r9583, %r9577; + xor.b32 %r9585, %r9584, %r9579; + shf.l.wrap.b32 %r9586, %r9585, %r9585, 25; + add.s32 %r9587, %r9539, %r8940; + add.s32 %r9588, %r9587, %r9558; + xor.b32 %r9589, %r9588, %r9583; + shf.l.wrap.b32 %r9590, %r9589, %r9589, 16; + add.s32 %r9591, %r9590, %r9570; + xor.b32 %r9592, %r9591, %r9558; + shf.l.wrap.b32 %r9593, %r9592, %r9592, 20; + add.s32 %r9594, %r9588, %r8870; + add.s32 %r9595, %r9594, %r9593; + xor.b32 %r9596, %r9595, %r9590; + shf.l.wrap.b32 %r9597, %r9596, %r9596, 24; + add.s32 %r9598, %r9597, %r9591; + xor.b32 %r9599, %r9598, %r9593; + shf.l.wrap.b32 %r9600, %r9599, %r9599, 25; + add.s32 %r9601, %r9553, %r8849; + add.s32 %r9602, %r9601, %r9572; + xor.b32 %r9603, %r9602, %r9541; + shf.l.wrap.b32 %r9604, %r9603, %r9603, 16; + add.s32 %r9605, %r9604, %r9584; + xor.b32 %r9606, %r9605, %r9572; + shf.l.wrap.b32 %r9607, %r9606, %r9606, 20; + add.s32 %r9608, %r9602, %r8919; + add.s32 %r9609, %r9608, %r9607; + xor.b32 %r9610, %r9609, %r9604; + shf.l.wrap.b32 %r9611, %r9610, %r9610, 24; + add.s32 %r9612, %r9611, %r9605; + xor.b32 %r9613, %r9612, %r9607; + shf.l.wrap.b32 %r9614, %r9613, %r9613, 25; + add.s32 %r9615, %r9567, %r8863; + add.s32 %r9616, %r9615, %r9586; + xor.b32 %r9617, %r9616, %r9555; + shf.l.wrap.b32 %r9618, %r9617, %r9617, 16; + add.s32 %r9619, %r9618, %r9542; + xor.b32 %r9620, %r9619, %r9586; + shf.l.wrap.b32 %r9621, %r9620, %r9620, 20; + add.s32 %r9622, %r9616, %r8891; + add.s32 %r9623, %r9622, %r9621; + xor.b32 %r9624, %r9623, %r9618; + shf.l.wrap.b32 %r9625, %r9624, %r9624, 24; + add.s32 %r9626, %r9625, %r9619; + xor.b32 %r9627, %r9626, %r9621; + shf.l.wrap.b32 %r9628, %r9627, %r9627, 25; + add.s32 %r9629, %r9581, %r8877; + add.s32 %r9630, %r9629, %r9544; + xor.b32 %r9631, %r9630, %r9569; + shf.l.wrap.b32 %r9632, %r9631, %r9631, 16; + add.s32 %r9633, %r9632, %r9556; + xor.b32 %r9634, %r9633, %r9544; + shf.l.wrap.b32 %r9635, %r9634, %r9634, 20; + add.s32 %r9636, %r9630, %r8898; + add.s32 %r9637, %r9636, 
%r9635; + xor.b32 %r9638, %r9637, %r9632; + shf.l.wrap.b32 %r9639, %r9638, %r9638, 24; + add.s32 %r9640, %r9639, %r9633; + xor.b32 %r9641, %r9640, %r9635; + shf.l.wrap.b32 %r9642, %r9641, %r9641, 25; + add.s32 %r9643, %r9595, %r8926; + add.s32 %r9644, %r9643, %r9642; + xor.b32 %r9645, %r9644, %r9611; + shf.l.wrap.b32 %r9646, %r9645, %r9645, 16; + add.s32 %r9647, %r9646, %r9626; + xor.b32 %r9648, %r9647, %r9642; + shf.l.wrap.b32 %r9649, %r9648, %r9648, 20; + add.s32 %r9650, %r9644, %r8954; + add.s32 %r9651, %r9650, %r9649; + xor.b32 %r9652, %r9651, %r9646; + shf.l.wrap.b32 %r9653, %r9652, %r9652, 24; + add.s32 %r9654, %r9653, %r9647; + xor.b32 %r9655, %r9654, %r9649; + shf.l.wrap.b32 %r9656, %r9655, %r9655, 25; + add.s32 %r9657, %r9609, %r8884; + add.s32 %r9658, %r9657, %r9600; + xor.b32 %r9659, %r9658, %r9625; + shf.l.wrap.b32 %r9660, %r9659, %r9659, 16; + add.s32 %r9661, %r9660, %r9640; + xor.b32 %r9662, %r9661, %r9600; + shf.l.wrap.b32 %r9663, %r9662, %r9662, 20; + add.s32 %r9664, %r9658, %r8849; + add.s32 %r9665, %r9664, %r9663; + xor.b32 %r9666, %r9665, %r9660; + shf.l.wrap.b32 %r9667, %r9666, %r9666, 24; + add.s32 %r9668, %r9667, %r9661; + xor.b32 %r9669, %r9668, %r9663; + shf.l.wrap.b32 %r9670, %r9669, %r9669, 25; + add.s32 %r9671, %r9623, %r8856; + add.s32 %r9672, %r9671, %r9614; + xor.b32 %r9673, %r9672, %r9639; + shf.l.wrap.b32 %r9674, %r9673, %r9673, 16; + add.s32 %r9675, %r9674, %r9598; + xor.b32 %r9676, %r9675, %r9614; + shf.l.wrap.b32 %r9677, %r9676, %r9676, 20; + add.s32 %r9678, %r9672, %r8912; + add.s32 %r9679, %r9678, %r9677; + xor.b32 %r9680, %r9679, %r9674; + shf.l.wrap.b32 %r9681, %r9680, %r9680, 24; + add.s32 %r9682, %r9681, %r9675; + xor.b32 %r9683, %r9682, %r9677; + shf.l.wrap.b32 %r9684, %r9683, %r9683, 25; + add.s32 %r9685, %r9637, %r8905; + add.s32 %r9686, %r9685, %r9628; + xor.b32 %r9687, %r9686, %r9597; + shf.l.wrap.b32 %r9688, %r9687, %r9687, 16; + add.s32 %r9689, %r9688, %r9612; + xor.b32 %r9690, %r9689, %r9628; + shf.l.wrap.b32 %r9691, %r9690, %r9690, 20; + add.s32 %r9692, %r9686, %r8891; + add.s32 %r9693, %r9692, %r9691; + xor.b32 %r9694, %r9693, %r9688; + shf.l.wrap.b32 %r9695, %r9694, %r9694, 24; + add.s32 %r9696, %r9695, %r9689; + xor.b32 %r9697, %r9696, %r9691; + shf.l.wrap.b32 %r9698, %r9697, %r9697, 25; + add.s32 %r9699, %r9651, %r8947; + add.s32 %r9700, %r9699, %r9670; + xor.b32 %r9701, %r9700, %r9695; + shf.l.wrap.b32 %r9702, %r9701, %r9701, 16; + add.s32 %r9703, %r9702, %r9682; + xor.b32 %r9704, %r9703, %r9670; + shf.l.wrap.b32 %r9705, %r9704, %r9704, 20; + add.s32 %r9706, %r9700, %r8919; + add.s32 %r9707, %r9706, %r9705; + xor.b32 %r9708, %r9707, %r9702; + shf.l.wrap.b32 %r9709, %r9708, %r9708, 24; + add.s32 %r9710, %r9709, %r9703; + xor.b32 %r9711, %r9710, %r9705; + shf.l.wrap.b32 %r9712, %r9711, %r9711, 25; + add.s32 %r9713, %r9665, %r8863; + add.s32 %r9714, %r9713, %r9684; + xor.b32 %r9715, %r9714, %r9653; + shf.l.wrap.b32 %r9716, %r9715, %r9715, 16; + add.s32 %r9717, %r9716, %r9696; + xor.b32 %r9718, %r9717, %r9684; + shf.l.wrap.b32 %r9719, %r9718, %r9718, 20; + add.s32 %r9720, %r9714, %r8933; + add.s32 %r9721, %r9720, %r9719; + xor.b32 %r9722, %r9721, %r9716; + shf.l.wrap.b32 %r9723, %r9722, %r9722, 24; + add.s32 %r9724, %r9723, %r9717; + xor.b32 %r9725, %r9724, %r9719; + shf.l.wrap.b32 %r9726, %r9725, %r9725, 25; + add.s32 %r9727, %r9679, %r8870; + add.s32 %r9728, %r9727, %r9698; + xor.b32 %r9729, %r9728, %r9667; + shf.l.wrap.b32 %r9730, %r9729, %r9729, 16; + add.s32 %r9731, %r9730, %r9654; + xor.b32 %r9732, %r9731, %r9698; + shf.l.wrap.b32 
%r9733, %r9732, %r9732, 20; + add.s32 %r9734, %r9728, %r8877; + add.s32 %r9735, %r9734, %r9733; + xor.b32 %r9736, %r9735, %r9730; + shf.l.wrap.b32 %r9737, %r9736, %r9736, 24; + add.s32 %r9738, %r9737, %r9731; + xor.b32 %r9739, %r9738, %r9733; + shf.l.wrap.b32 %r9740, %r9739, %r9739, 25; + add.s32 %r9741, %r9693, %r8898; + add.s32 %r9742, %r9741, %r9656; + xor.b32 %r9743, %r9742, %r9681; + shf.l.wrap.b32 %r9744, %r9743, %r9743, 16; + add.s32 %r9745, %r9744, %r9668; + xor.b32 %r9746, %r9745, %r9656; + shf.l.wrap.b32 %r9747, %r9746, %r9746, 20; + add.s32 %r9748, %r9742, %r8940; + add.s32 %r9749, %r9748, %r9747; + xor.b32 %r9750, %r9749, %r9744; + shf.l.wrap.b32 %r9751, %r9750, %r9750, 24; + add.s32 %r9752, %r9751, %r9745; + xor.b32 %r9753, %r9752, %r9747; + shf.l.wrap.b32 %r9754, %r9753, %r9753, 25; + xor.b32 %r9755, %r9738, %r9707; + st.local.u32 [%rd3+-104], %r9755; + xor.b32 %r9756, %r9752, %r9721; + st.local.u32 [%rd3+-100], %r9756; + xor.b32 %r9757, %r9710, %r9735; + st.local.u32 [%rd3+-96], %r9757; + xor.b32 %r9758, %r9724, %r9749; + st.local.u32 [%rd3+-92], %r9758; + xor.b32 %r9759, %r9754, %r9723; + st.local.u32 [%rd3+-88], %r9759; + xor.b32 %r9760, %r9712, %r9737; + st.local.u32 [%rd3+-84], %r9760; + xor.b32 %r9761, %r9726, %r9751; + st.local.u32 [%rd3+-80], %r9761; + xor.b32 %r9762, %r9740, %r9709; + st.local.u32 [%rd3+-76], %r9762; + add.s16 %rs340, %rs335, 1; + st.local.v2.u8 [%rd3], {%rs390, %rs340}; + +$L__BB1_56: + add.s64 %rd208, %rd78, %rd264; + st.local.u8 [%rd208], %rs390; + add.s64 %rd264, %rd264, 1; + setp.lt.u64 %p46, %rd264, 64; + mov.u64 %rd271, %rd77; + @%p46 bra $L__BB1_56; + +$L__BB1_57: + setp.gt.u64 %p47, %rd271, 64; + @%p47 bra $L__BB1_59; + bra.uni $L__BB1_58; + +$L__BB1_59: + ld.local.u8 %rs95, [%rd3+2]; + ld.local.u8 %rs391, [%rd3+1]; + ld.local.u32 %r11689, [%rd3+-104]; + ld.local.u32 %r11688, [%rd3+-100]; + ld.local.u32 %r11687, [%rd3+-96]; + ld.local.u32 %r11686, [%rd3+-92]; + ld.local.u32 %r11685, [%rd3+-88]; + ld.local.u32 %r11684, [%rd3+-84]; + ld.local.u32 %r11683, [%rd3+-80]; + ld.local.u32 %r11682, [%rd3+-76]; + ld.local.u64 %rd269, [%rd3+-72]; + cvt.u32.u64 %r117, %rd269; + shr.u64 %rd209, %rd269, 32; + cvt.u32.u64 %r118, %rd209; + +$L__BB1_60: + and.b16 %rs342, %rs391, 255; + setp.eq.s16 %p48, %rs342, 0; + selp.u16 %rs343, 1, 0, %p48; + or.b16 %rs344, %rs95, %rs343; + ld.local.u8 %r9763, [%rd261]; + ld.local.u8 %r9764, [%rd261+1]; + prmt.b32 %r9765, %r9764, %r9763, 30212; + ld.local.u8 %r9766, [%rd261+2]; + prmt.b32 %r9767, %r9766, %r9765, 28756; + ld.local.u8 %r9768, [%rd261+3]; + prmt.b32 %r9769, %r9768, %r9767, 1620; + ld.local.u8 %r9770, [%rd261+4]; + ld.local.u8 %r9771, [%rd261+5]; + prmt.b32 %r9772, %r9771, %r9770, 30212; + ld.local.u8 %r9773, [%rd261+6]; + prmt.b32 %r9774, %r9773, %r9772, 28756; + ld.local.u8 %r9775, [%rd261+7]; + prmt.b32 %r9776, %r9775, %r9774, 1620; + ld.local.u8 %r9777, [%rd261+8]; + ld.local.u8 %r9778, [%rd261+9]; + prmt.b32 %r9779, %r9778, %r9777, 30212; + ld.local.u8 %r9780, [%rd261+10]; + prmt.b32 %r9781, %r9780, %r9779, 28756; + ld.local.u8 %r9782, [%rd261+11]; + prmt.b32 %r9783, %r9782, %r9781, 1620; + ld.local.u8 %r9784, [%rd261+12]; + ld.local.u8 %r9785, [%rd261+13]; + prmt.b32 %r9786, %r9785, %r9784, 30212; + ld.local.u8 %r9787, [%rd261+14]; + prmt.b32 %r9788, %r9787, %r9786, 28756; + ld.local.u8 %r9789, [%rd261+15]; + prmt.b32 %r9790, %r9789, %r9788, 1620; + ld.local.u8 %r9791, [%rd261+16]; + ld.local.u8 %r9792, [%rd261+17]; + prmt.b32 %r9793, %r9792, %r9791, 30212; + ld.local.u8 %r9794, [%rd261+18]; + prmt.b32 
%r9795, %r9794, %r9793, 28756; + ld.local.u8 %r9796, [%rd261+19]; + prmt.b32 %r9797, %r9796, %r9795, 1620; + ld.local.u8 %r9798, [%rd261+20]; + ld.local.u8 %r9799, [%rd261+21]; + prmt.b32 %r9800, %r9799, %r9798, 30212; + ld.local.u8 %r9801, [%rd261+22]; + prmt.b32 %r9802, %r9801, %r9800, 28756; + ld.local.u8 %r9803, [%rd261+23]; + prmt.b32 %r9804, %r9803, %r9802, 1620; + ld.local.u8 %r9805, [%rd261+24]; + ld.local.u8 %r9806, [%rd261+25]; + prmt.b32 %r9807, %r9806, %r9805, 30212; + ld.local.u8 %r9808, [%rd261+26]; + prmt.b32 %r9809, %r9808, %r9807, 28756; + ld.local.u8 %r9810, [%rd261+27]; + prmt.b32 %r9811, %r9810, %r9809, 1620; + ld.local.u8 %r9812, [%rd261+28]; + ld.local.u8 %r9813, [%rd261+29]; + prmt.b32 %r9814, %r9813, %r9812, 30212; + ld.local.u8 %r9815, [%rd261+30]; + prmt.b32 %r9816, %r9815, %r9814, 28756; + ld.local.u8 %r9817, [%rd261+31]; + prmt.b32 %r9818, %r9817, %r9816, 1620; + ld.local.u8 %r9819, [%rd261+32]; + ld.local.u8 %r9820, [%rd261+33]; + prmt.b32 %r9821, %r9820, %r9819, 30212; + ld.local.u8 %r9822, [%rd261+34]; + prmt.b32 %r9823, %r9822, %r9821, 28756; + ld.local.u8 %r9824, [%rd261+35]; + prmt.b32 %r9825, %r9824, %r9823, 1620; + ld.local.u8 %r9826, [%rd261+36]; + ld.local.u8 %r9827, [%rd261+37]; + prmt.b32 %r9828, %r9827, %r9826, 30212; + ld.local.u8 %r9829, [%rd261+38]; + prmt.b32 %r9830, %r9829, %r9828, 28756; + ld.local.u8 %r9831, [%rd261+39]; + prmt.b32 %r9832, %r9831, %r9830, 1620; + ld.local.u8 %r9833, [%rd261+40]; + ld.local.u8 %r9834, [%rd261+41]; + prmt.b32 %r9835, %r9834, %r9833, 30212; + ld.local.u8 %r9836, [%rd261+42]; + prmt.b32 %r9837, %r9836, %r9835, 28756; + ld.local.u8 %r9838, [%rd261+43]; + prmt.b32 %r9839, %r9838, %r9837, 1620; + ld.local.u8 %r9840, [%rd261+44]; + ld.local.u8 %r9841, [%rd261+45]; + prmt.b32 %r9842, %r9841, %r9840, 30212; + ld.local.u8 %r9843, [%rd261+46]; + prmt.b32 %r9844, %r9843, %r9842, 28756; + ld.local.u8 %r9845, [%rd261+47]; + prmt.b32 %r9846, %r9845, %r9844, 1620; + ld.local.u8 %r9847, [%rd261+48]; + ld.local.u8 %r9848, [%rd261+49]; + prmt.b32 %r9849, %r9848, %r9847, 30212; + ld.local.u8 %r9850, [%rd261+50]; + prmt.b32 %r9851, %r9850, %r9849, 28756; + ld.local.u8 %r9852, [%rd261+51]; + prmt.b32 %r9853, %r9852, %r9851, 1620; + ld.local.u8 %r9854, [%rd261+52]; + ld.local.u8 %r9855, [%rd261+53]; + prmt.b32 %r9856, %r9855, %r9854, 30212; + ld.local.u8 %r9857, [%rd261+54]; + prmt.b32 %r9858, %r9857, %r9856, 28756; + ld.local.u8 %r9859, [%rd261+55]; + prmt.b32 %r9860, %r9859, %r9858, 1620; + ld.local.u8 %r9861, [%rd261+56]; + ld.local.u8 %r9862, [%rd261+57]; + prmt.b32 %r9863, %r9862, %r9861, 30212; + ld.local.u8 %r9864, [%rd261+58]; + prmt.b32 %r9865, %r9864, %r9863, 28756; + ld.local.u8 %r9866, [%rd261+59]; + prmt.b32 %r9867, %r9866, %r9865, 1620; + ld.local.u8 %r9868, [%rd261+60]; + ld.local.u8 %r9869, [%rd261+61]; + prmt.b32 %r9870, %r9869, %r9868, 30212; + ld.local.u8 %r9871, [%rd261+62]; + prmt.b32 %r9872, %r9871, %r9870, 28756; + ld.local.u8 %r9873, [%rd261+63]; + prmt.b32 %r9874, %r9873, %r9872, 1620; + cvt.u32.u16 %r9875, %rs344; + and.b32 %r9876, %r9875, 255; + add.s32 %r9877, %r11689, %r11685; + add.s32 %r9878, %r9877, %r9769; + xor.b32 %r9879, %r9878, %r117; + shf.l.wrap.b32 %r9880, %r9879, %r9879, 16; + add.s32 %r9881, %r9880, 1779033703; + xor.b32 %r9882, %r9881, %r11685; + shf.l.wrap.b32 %r9883, %r9882, %r9882, 20; + add.s32 %r9884, %r9776, %r9878; + add.s32 %r9885, %r9884, %r9883; + xor.b32 %r9886, %r9885, %r9880; + shf.l.wrap.b32 %r9887, %r9886, %r9886, 24; + add.s32 %r9888, %r9887, %r9881; + xor.b32 %r9889, 
%r9888, %r9883; + shf.l.wrap.b32 %r9890, %r9889, %r9889, 25; + add.s32 %r9891, %r11688, %r11684; + add.s32 %r9892, %r9891, %r9783; + xor.b32 %r9893, %r9892, %r118; + shf.l.wrap.b32 %r9894, %r9893, %r9893, 16; + add.s32 %r9895, %r9894, -1150833019; + xor.b32 %r9896, %r9895, %r11684; + shf.l.wrap.b32 %r9897, %r9896, %r9896, 20; + add.s32 %r9898, %r9790, %r9892; + add.s32 %r9899, %r9898, %r9897; + xor.b32 %r9900, %r9899, %r9894; + shf.l.wrap.b32 %r9901, %r9900, %r9900, 24; + add.s32 %r9902, %r9901, %r9895; + xor.b32 %r9903, %r9902, %r9897; + shf.l.wrap.b32 %r9904, %r9903, %r9903, 25; + add.s32 %r9905, %r11687, %r11683; + add.s32 %r9906, %r9905, %r9797; + shr.u32 %r9907, %r9906, 16; + shl.b32 %r9908, %r9906, 16; + xor.b32 %r9909, %r9908, 4194304; + or.b32 %r9910, %r9909, %r9907; + add.s32 %r9911, %r9910, 1013904242; + xor.b32 %r9912, %r9911, %r11683; + shf.l.wrap.b32 %r9913, %r9912, %r9912, 20; + add.s32 %r9914, %r9804, %r9906; + add.s32 %r9915, %r9914, %r9913; + xor.b32 %r9916, %r9915, %r9910; + shf.l.wrap.b32 %r9917, %r9916, %r9916, 24; + add.s32 %r9918, %r9917, %r9911; + xor.b32 %r9919, %r9918, %r9913; + shf.l.wrap.b32 %r9920, %r9919, %r9919, 25; + add.s32 %r9921, %r11686, %r11682; + add.s32 %r9922, %r9921, %r9811; + xor.b32 %r9923, %r9922, %r9876; + shr.u32 %r9924, %r9922, 16; + shl.b32 %r9925, %r9923, 16; + or.b32 %r9926, %r9925, %r9924; + add.s32 %r9927, %r9926, -1521486534; + xor.b32 %r9928, %r9927, %r11682; + shf.l.wrap.b32 %r9929, %r9928, %r9928, 20; + add.s32 %r9930, %r9818, %r9922; + add.s32 %r9931, %r9930, %r9929; + xor.b32 %r9932, %r9931, %r9926; + shf.l.wrap.b32 %r9933, %r9932, %r9932, 24; + add.s32 %r9934, %r9933, %r9927; + xor.b32 %r9935, %r9934, %r9929; + shf.l.wrap.b32 %r9936, %r9935, %r9935, 25; + add.s32 %r9937, %r9904, %r9885; + add.s32 %r9938, %r9937, %r9825; + xor.b32 %r9939, %r9933, %r9938; + shf.l.wrap.b32 %r9940, %r9939, %r9939, 16; + add.s32 %r9941, %r9940, %r9918; + xor.b32 %r9942, %r9941, %r9904; + shf.l.wrap.b32 %r9943, %r9942, %r9942, 20; + add.s32 %r9944, %r9832, %r9938; + add.s32 %r9945, %r9944, %r9943; + xor.b32 %r9946, %r9945, %r9940; + shf.l.wrap.b32 %r9947, %r9946, %r9946, 24; + add.s32 %r9948, %r9947, %r9941; + xor.b32 %r9949, %r9948, %r9943; + shf.l.wrap.b32 %r9950, %r9949, %r9949, 25; + add.s32 %r9951, %r9920, %r9899; + add.s32 %r9952, %r9951, %r9839; + xor.b32 %r9953, %r9952, %r9887; + shf.l.wrap.b32 %r9954, %r9953, %r9953, 16; + add.s32 %r9955, %r9954, %r9934; + xor.b32 %r9956, %r9955, %r9920; + shf.l.wrap.b32 %r9957, %r9956, %r9956, 20; + add.s32 %r9958, %r9846, %r9952; + add.s32 %r9959, %r9958, %r9957; + xor.b32 %r9960, %r9959, %r9954; + shf.l.wrap.b32 %r9961, %r9960, %r9960, 24; + add.s32 %r9962, %r9961, %r9955; + xor.b32 %r9963, %r9962, %r9957; + shf.l.wrap.b32 %r9964, %r9963, %r9963, 25; + add.s32 %r9965, %r9936, %r9915; + add.s32 %r9966, %r9965, %r9853; + xor.b32 %r9967, %r9966, %r9901; + shf.l.wrap.b32 %r9968, %r9967, %r9967, 16; + add.s32 %r9969, %r9968, %r9888; + xor.b32 %r9970, %r9969, %r9936; + shf.l.wrap.b32 %r9971, %r9970, %r9970, 20; + add.s32 %r9972, %r9860, %r9966; + add.s32 %r9973, %r9972, %r9971; + xor.b32 %r9974, %r9973, %r9968; + shf.l.wrap.b32 %r9975, %r9974, %r9974, 24; + add.s32 %r9976, %r9975, %r9969; + xor.b32 %r9977, %r9976, %r9971; + shf.l.wrap.b32 %r9978, %r9977, %r9977, 25; + add.s32 %r9979, %r9931, %r9890; + add.s32 %r9980, %r9979, %r9867; + xor.b32 %r9981, %r9980, %r9917; + shf.l.wrap.b32 %r9982, %r9981, %r9981, 16; + add.s32 %r9983, %r9982, %r9902; + xor.b32 %r9984, %r9983, %r9890; + shf.l.wrap.b32 %r9985, %r9984, 
%r9984, 20; + add.s32 %r9986, %r9874, %r9980; + add.s32 %r9987, %r9986, %r9985; + xor.b32 %r9988, %r9987, %r9982; + shf.l.wrap.b32 %r9989, %r9988, %r9988, 24; + add.s32 %r9990, %r9989, %r9983; + xor.b32 %r9991, %r9990, %r9985; + shf.l.wrap.b32 %r9992, %r9991, %r9991, 25; + add.s32 %r9993, %r9945, %r9783; + add.s32 %r9994, %r9993, %r9992; + xor.b32 %r9995, %r9994, %r9961; + shf.l.wrap.b32 %r9996, %r9995, %r9995, 16; + add.s32 %r9997, %r9996, %r9976; + xor.b32 %r9998, %r9997, %r9992; + shf.l.wrap.b32 %r9999, %r9998, %r9998, 20; + add.s32 %r10000, %r9994, %r9811; + add.s32 %r10001, %r10000, %r9999; + xor.b32 %r10002, %r10001, %r9996; + shf.l.wrap.b32 %r10003, %r10002, %r10002, 24; + add.s32 %r10004, %r10003, %r9997; + xor.b32 %r10005, %r10004, %r9999; + shf.l.wrap.b32 %r10006, %r10005, %r10005, 25; + add.s32 %r10007, %r9959, %r9790; + add.s32 %r10008, %r10007, %r9950; + xor.b32 %r10009, %r9975, %r10008; + shf.l.wrap.b32 %r10010, %r10009, %r10009, 16; + add.s32 %r10011, %r9990, %r10010; + xor.b32 %r10012, %r10011, %r9950; + shf.l.wrap.b32 %r10013, %r10012, %r10012, 20; + add.s32 %r10014, %r10008, %r9839; + add.s32 %r10015, %r10014, %r10013; + xor.b32 %r10016, %r10015, %r10010; + shf.l.wrap.b32 %r10017, %r10016, %r10016, 24; + add.s32 %r10018, %r10017, %r10011; + xor.b32 %r10019, %r10018, %r10013; + shf.l.wrap.b32 %r10020, %r10019, %r10019, 25; + add.s32 %r10021, %r9964, %r9818; + add.s32 %r10022, %r10021, %r9973; + xor.b32 %r10023, %r9989, %r10022; + shf.l.wrap.b32 %r10024, %r10023, %r10023, 16; + add.s32 %r10025, %r10024, %r9948; + xor.b32 %r10026, %r10025, %r9964; + shf.l.wrap.b32 %r10027, %r10026, %r10026, 20; + add.s32 %r10028, %r10022, %r9769; + add.s32 %r10029, %r10028, %r10027; + xor.b32 %r10030, %r10029, %r10024; + shf.l.wrap.b32 %r10031, %r10030, %r10030, 24; + add.s32 %r10032, %r10031, %r10025; + xor.b32 %r10033, %r10032, %r10027; + shf.l.wrap.b32 %r10034, %r10033, %r10033, 25; + add.s32 %r10035, %r9978, %r9797; + add.s32 %r10036, %r10035, %r9987; + xor.b32 %r10037, %r10036, %r9947; + shf.l.wrap.b32 %r10038, %r10037, %r10037, 16; + add.s32 %r10039, %r10038, %r9962; + xor.b32 %r10040, %r10039, %r9978; + shf.l.wrap.b32 %r10041, %r10040, %r10040, 20; + add.s32 %r10042, %r10036, %r9860; + add.s32 %r10043, %r10042, %r10041; + xor.b32 %r10044, %r10043, %r10038; + shf.l.wrap.b32 %r10045, %r10044, %r10044, 24; + add.s32 %r10046, %r10045, %r10039; + xor.b32 %r10047, %r10046, %r10041; + shf.l.wrap.b32 %r10048, %r10047, %r10047, 25; + add.s32 %r10049, %r10001, %r9776; + add.s32 %r10050, %r10049, %r10020; + xor.b32 %r10051, %r10050, %r10045; + shf.l.wrap.b32 %r10052, %r10051, %r10051, 16; + add.s32 %r10053, %r10052, %r10032; + xor.b32 %r10054, %r10053, %r10020; + shf.l.wrap.b32 %r10055, %r10054, %r10054, 20; + add.s32 %r10056, %r10050, %r9846; + add.s32 %r10057, %r10056, %r10055; + xor.b32 %r10058, %r10057, %r10052; + shf.l.wrap.b32 %r10059, %r10058, %r10058, 24; + add.s32 %r10060, %r10059, %r10053; + xor.b32 %r10061, %r10060, %r10055; + shf.l.wrap.b32 %r10062, %r10061, %r10061, 25; + add.s32 %r10063, %r10015, %r9853; + add.s32 %r10064, %r10063, %r10034; + xor.b32 %r10065, %r10064, %r10003; + shf.l.wrap.b32 %r10066, %r10065, %r10065, 16; + add.s32 %r10067, %r10066, %r10046; + xor.b32 %r10068, %r10067, %r10034; + shf.l.wrap.b32 %r10069, %r10068, %r10068, 20; + add.s32 %r10070, %r10064, %r9804; + add.s32 %r10071, %r10070, %r10069; + xor.b32 %r10072, %r10071, %r10066; + shf.l.wrap.b32 %r10073, %r10072, %r10072, 24; + add.s32 %r10074, %r10073, %r10067; + xor.b32 %r10075, %r10074, %r10069; + 
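+ // This copy of the round sequence sits under $L__BB1_60, which appears to
+ // compress full 64-byte blocks read directly from the input pointer (%rd261)
+ // rather than from the internal buffer; the flags byte (%rs344) or-s in a
+ // chunk-start bit (selp.u16 ... %p48) while the block counter is zero, and
+ // each compression ends with eight feed-forward xor.b32 stores to
+ // [%rd3+-104..-76] (as seen earlier) that form the next chaining value.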
shf.l.wrap.b32 %r10076, %r10075, %r10075, 25; + add.s32 %r10077, %r10029, %r9832; + add.s32 %r10078, %r10077, %r10048; + xor.b32 %r10079, %r10078, %r10017; + shf.l.wrap.b32 %r10080, %r10079, %r10079, 16; + add.s32 %r10081, %r10080, %r10004; + xor.b32 %r10082, %r10081, %r10048; + shf.l.wrap.b32 %r10083, %r10082, %r10082, 20; + add.s32 %r10084, %r10078, %r9867; + add.s32 %r10085, %r10084, %r10083; + xor.b32 %r10086, %r10085, %r10080; + shf.l.wrap.b32 %r10087, %r10086, %r10086, 24; + add.s32 %r10088, %r10087, %r10081; + xor.b32 %r10089, %r10088, %r10083; + shf.l.wrap.b32 %r10090, %r10089, %r10089, 25; + add.s32 %r10091, %r10043, %r9874; + add.s32 %r10092, %r10091, %r10006; + xor.b32 %r10093, %r10092, %r10031; + shf.l.wrap.b32 %r10094, %r10093, %r10093, 16; + add.s32 %r10095, %r10094, %r10018; + xor.b32 %r10096, %r10095, %r10006; + shf.l.wrap.b32 %r10097, %r10096, %r10096, 20; + add.s32 %r10098, %r10092, %r9825; + add.s32 %r10099, %r10098, %r10097; + xor.b32 %r10100, %r10099, %r10094; + shf.l.wrap.b32 %r10101, %r10100, %r10100, 24; + add.s32 %r10102, %r10101, %r10095; + xor.b32 %r10103, %r10102, %r10097; + shf.l.wrap.b32 %r10104, %r10103, %r10103, 25; + add.s32 %r10105, %r10057, %r9790; + add.s32 %r10106, %r10105, %r10104; + xor.b32 %r10107, %r10106, %r10073; + shf.l.wrap.b32 %r10108, %r10107, %r10107, 16; + add.s32 %r10109, %r10108, %r10088; + xor.b32 %r10110, %r10109, %r10104; + shf.l.wrap.b32 %r10111, %r10110, %r10110, 20; + add.s32 %r10112, %r10106, %r9797; + add.s32 %r10113, %r10112, %r10111; + xor.b32 %r10114, %r10113, %r10108; + shf.l.wrap.b32 %r10115, %r10114, %r10114, 24; + add.s32 %r10116, %r10115, %r10109; + xor.b32 %r10117, %r10116, %r10111; + shf.l.wrap.b32 %r10118, %r10117, %r10117, 25; + add.s32 %r10119, %r10071, %r9839; + add.s32 %r10120, %r10119, %r10062; + xor.b32 %r10121, %r10120, %r10087; + shf.l.wrap.b32 %r10122, %r10121, %r10121, 16; + add.s32 %r10123, %r10122, %r10102; + xor.b32 %r10124, %r10123, %r10062; + shf.l.wrap.b32 %r10125, %r10124, %r10124, 20; + add.s32 %r10126, %r10120, %r9853; + add.s32 %r10127, %r10126, %r10125; + xor.b32 %r10128, %r10127, %r10122; + shf.l.wrap.b32 %r10129, %r10128, %r10128, 24; + add.s32 %r10130, %r10129, %r10123; + xor.b32 %r10131, %r10130, %r10125; + shf.l.wrap.b32 %r10132, %r10131, %r10131, 25; + add.s32 %r10133, %r10085, %r9860; + add.s32 %r10134, %r10133, %r10076; + xor.b32 %r10135, %r10134, %r10101; + shf.l.wrap.b32 %r10136, %r10135, %r10135, 16; + add.s32 %r10137, %r10136, %r10060; + xor.b32 %r10138, %r10137, %r10076; + shf.l.wrap.b32 %r10139, %r10138, %r10138, 20; + add.s32 %r10140, %r10134, %r9783; + add.s32 %r10141, %r10140, %r10139; + xor.b32 %r10142, %r10141, %r10136; + shf.l.wrap.b32 %r10143, %r10142, %r10142, 24; + add.s32 %r10144, %r10143, %r10137; + xor.b32 %r10145, %r10144, %r10139; + shf.l.wrap.b32 %r10146, %r10145, %r10145, 25; + add.s32 %r10147, %r10099, %r9818; + add.s32 %r10148, %r10147, %r10090; + xor.b32 %r10149, %r10148, %r10059; + shf.l.wrap.b32 %r10150, %r10149, %r10149, 16; + add.s32 %r10151, %r10150, %r10074; + xor.b32 %r10152, %r10151, %r10090; + shf.l.wrap.b32 %r10153, %r10152, %r10152, 20; + add.s32 %r10154, %r10148, %r9867; + add.s32 %r10155, %r10154, %r10153; + xor.b32 %r10156, %r10155, %r10150; + shf.l.wrap.b32 %r10157, %r10156, %r10156, 24; + add.s32 %r10158, %r10157, %r10151; + xor.b32 %r10159, %r10158, %r10153; + shf.l.wrap.b32 %r10160, %r10159, %r10159, 25; + add.s32 %r10161, %r10113, %r9811; + add.s32 %r10162, %r10161, %r10132; + xor.b32 %r10163, %r10162, %r10157; + shf.l.wrap.b32 %r10164, %r10163, 
%r10163, 16; + add.s32 %r10165, %r10164, %r10144; + xor.b32 %r10166, %r10165, %r10132; + shf.l.wrap.b32 %r10167, %r10166, %r10166, 20; + add.s32 %r10168, %r10162, %r9804; + add.s32 %r10169, %r10168, %r10167; + xor.b32 %r10170, %r10169, %r10164; + shf.l.wrap.b32 %r10171, %r10170, %r10170, 24; + add.s32 %r10172, %r10171, %r10165; + xor.b32 %r10173, %r10172, %r10167; + shf.l.wrap.b32 %r10174, %r10173, %r10173, 25; + add.s32 %r10175, %r10127, %r9832; + add.s32 %r10176, %r10175, %r10146; + xor.b32 %r10177, %r10176, %r10115; + shf.l.wrap.b32 %r10178, %r10177, %r10177, 16; + add.s32 %r10179, %r10178, %r10158; + xor.b32 %r10180, %r10179, %r10146; + shf.l.wrap.b32 %r10181, %r10180, %r10180, 20; + add.s32 %r10182, %r10176, %r9769; + add.s32 %r10183, %r10182, %r10181; + xor.b32 %r10184, %r10183, %r10178; + shf.l.wrap.b32 %r10185, %r10184, %r10184, 24; + add.s32 %r10186, %r10185, %r10179; + xor.b32 %r10187, %r10186, %r10181; + shf.l.wrap.b32 %r10188, %r10187, %r10187, 25; + add.s32 %r10189, %r10141, %r9846; + add.s32 %r10190, %r10189, %r10160; + xor.b32 %r10191, %r10190, %r10129; + shf.l.wrap.b32 %r10192, %r10191, %r10191, 16; + add.s32 %r10193, %r10192, %r10116; + xor.b32 %r10194, %r10193, %r10160; + shf.l.wrap.b32 %r10195, %r10194, %r10194, 20; + add.s32 %r10196, %r10190, %r9874; + add.s32 %r10197, %r10196, %r10195; + xor.b32 %r10198, %r10197, %r10192; + shf.l.wrap.b32 %r10199, %r10198, %r10198, 24; + add.s32 %r10200, %r10199, %r10193; + xor.b32 %r10201, %r10200, %r10195; + shf.l.wrap.b32 %r10202, %r10201, %r10201, 25; + add.s32 %r10203, %r10155, %r9825; + add.s32 %r10204, %r10203, %r10118; + xor.b32 %r10205, %r10204, %r10143; + shf.l.wrap.b32 %r10206, %r10205, %r10205, 16; + add.s32 %r10207, %r10206, %r10130; + xor.b32 %r10208, %r10207, %r10118; + shf.l.wrap.b32 %r10209, %r10208, %r10208, 20; + add.s32 %r10210, %r10204, %r9776; + add.s32 %r10211, %r10210, %r10209; + xor.b32 %r10212, %r10211, %r10206; + shf.l.wrap.b32 %r10213, %r10212, %r10212, 24; + add.s32 %r10214, %r10213, %r10207; + xor.b32 %r10215, %r10214, %r10209; + shf.l.wrap.b32 %r10216, %r10215, %r10215, 25; + add.s32 %r10217, %r10169, %r9839; + add.s32 %r10218, %r10217, %r10216; + xor.b32 %r10219, %r10218, %r10185; + shf.l.wrap.b32 %r10220, %r10219, %r10219, 16; + add.s32 %r10221, %r10220, %r10200; + xor.b32 %r10222, %r10221, %r10216; + shf.l.wrap.b32 %r10223, %r10222, %r10222, 20; + add.s32 %r10224, %r10218, %r9818; + add.s32 %r10225, %r10224, %r10223; + xor.b32 %r10226, %r10225, %r10220; + shf.l.wrap.b32 %r10227, %r10226, %r10226, 24; + add.s32 %r10228, %r10227, %r10221; + xor.b32 %r10229, %r10228, %r10223; + shf.l.wrap.b32 %r10230, %r10229, %r10229, 25; + add.s32 %r10231, %r10183, %r9853; + add.s32 %r10232, %r10231, %r10174; + xor.b32 %r10233, %r10232, %r10199; + shf.l.wrap.b32 %r10234, %r10233, %r10233, 16; + add.s32 %r10235, %r10234, %r10214; + xor.b32 %r10236, %r10235, %r10174; + shf.l.wrap.b32 %r10237, %r10236, %r10236, 20; + add.s32 %r10238, %r10232, %r9832; + add.s32 %r10239, %r10238, %r10237; + xor.b32 %r10240, %r10239, %r10234; + shf.l.wrap.b32 %r10241, %r10240, %r10240, 24; + add.s32 %r10242, %r10241, %r10235; + xor.b32 %r10243, %r10242, %r10237; + shf.l.wrap.b32 %r10244, %r10243, %r10243, 25; + add.s32 %r10245, %r10197, %r9867; + add.s32 %r10246, %r10245, %r10188; + xor.b32 %r10247, %r10246, %r10213; + shf.l.wrap.b32 %r10248, %r10247, %r10247, 16; + add.s32 %r10249, %r10248, %r10172; + xor.b32 %r10250, %r10249, %r10188; + shf.l.wrap.b32 %r10251, %r10250, %r10250, 20; + add.s32 %r10252, %r10246, %r9790; + add.s32 %r10253, 
%r10252, %r10251; + xor.b32 %r10254, %r10253, %r10248; + shf.l.wrap.b32 %r10255, %r10254, %r10254, 24; + add.s32 %r10256, %r10255, %r10249; + xor.b32 %r10257, %r10256, %r10251; + shf.l.wrap.b32 %r10258, %r10257, %r10257, 25; + add.s32 %r10259, %r10211, %r9860; + add.s32 %r10260, %r10259, %r10202; + xor.b32 %r10261, %r10260, %r10171; + shf.l.wrap.b32 %r10262, %r10261, %r10261, 16; + add.s32 %r10263, %r10262, %r10186; + xor.b32 %r10264, %r10263, %r10202; + shf.l.wrap.b32 %r10265, %r10264, %r10264, 20; + add.s32 %r10266, %r10260, %r9874; + add.s32 %r10267, %r10266, %r10265; + xor.b32 %r10268, %r10267, %r10262; + shf.l.wrap.b32 %r10269, %r10268, %r10268, 24; + add.s32 %r10270, %r10269, %r10263; + xor.b32 %r10271, %r10270, %r10265; + shf.l.wrap.b32 %r10272, %r10271, %r10271, 25; + add.s32 %r10273, %r10225, %r9797; + add.s32 %r10274, %r10273, %r10244; + xor.b32 %r10275, %r10274, %r10269; + shf.l.wrap.b32 %r10276, %r10275, %r10275, 16; + add.s32 %r10277, %r10276, %r10256; + xor.b32 %r10278, %r10277, %r10244; + shf.l.wrap.b32 %r10279, %r10278, %r10278, 20; + add.s32 %r10280, %r10274, %r9769; + add.s32 %r10281, %r10280, %r10279; + xor.b32 %r10282, %r10281, %r10276; + shf.l.wrap.b32 %r10283, %r10282, %r10282, 24; + add.s32 %r10284, %r10283, %r10277; + xor.b32 %r10285, %r10284, %r10279; + shf.l.wrap.b32 %r10286, %r10285, %r10285, 25; + add.s32 %r10287, %r10239, %r9846; + add.s32 %r10288, %r10287, %r10258; + xor.b32 %r10289, %r10288, %r10227; + shf.l.wrap.b32 %r10290, %r10289, %r10289, 16; + add.s32 %r10291, %r10290, %r10270; + xor.b32 %r10292, %r10291, %r10258; + shf.l.wrap.b32 %r10293, %r10292, %r10292, 20; + add.s32 %r10294, %r10288, %r9783; + add.s32 %r10295, %r10294, %r10293; + xor.b32 %r10296, %r10295, %r10290; + shf.l.wrap.b32 %r10297, %r10296, %r10296, 24; + add.s32 %r10298, %r10297, %r10291; + xor.b32 %r10299, %r10298, %r10293; + shf.l.wrap.b32 %r10300, %r10299, %r10299, 25; + add.s32 %r10301, %r10253, %r9804; + add.s32 %r10302, %r10301, %r10272; + xor.b32 %r10303, %r10302, %r10241; + shf.l.wrap.b32 %r10304, %r10303, %r10303, 16; + add.s32 %r10305, %r10304, %r10228; + xor.b32 %r10306, %r10305, %r10272; + shf.l.wrap.b32 %r10307, %r10306, %r10306, 20; + add.s32 %r10308, %r10302, %r9825; + add.s32 %r10309, %r10308, %r10307; + xor.b32 %r10310, %r10309, %r10304; + shf.l.wrap.b32 %r10311, %r10310, %r10310, 24; + add.s32 %r10312, %r10311, %r10305; + xor.b32 %r10313, %r10312, %r10307; + shf.l.wrap.b32 %r10314, %r10313, %r10313, 25; + add.s32 %r10315, %r10267, %r9776; + add.s32 %r10316, %r10315, %r10230; + xor.b32 %r10317, %r10316, %r10255; + shf.l.wrap.b32 %r10318, %r10317, %r10317, 16; + add.s32 %r10319, %r10318, %r10242; + xor.b32 %r10320, %r10319, %r10230; + shf.l.wrap.b32 %r10321, %r10320, %r10320, 20; + add.s32 %r10322, %r10316, %r9811; + add.s32 %r10323, %r10322, %r10321; + xor.b32 %r10324, %r10323, %r10318; + shf.l.wrap.b32 %r10325, %r10324, %r10324, 24; + add.s32 %r10326, %r10325, %r10319; + xor.b32 %r10327, %r10326, %r10321; + shf.l.wrap.b32 %r10328, %r10327, %r10327, 25; + add.s32 %r10329, %r10281, %r9853; + add.s32 %r10330, %r10329, %r10328; + xor.b32 %r10331, %r10330, %r10297; + shf.l.wrap.b32 %r10332, %r10331, %r10331, 16; + add.s32 %r10333, %r10332, %r10312; + xor.b32 %r10334, %r10333, %r10328; + shf.l.wrap.b32 %r10335, %r10334, %r10334, 20; + add.s32 %r10336, %r10330, %r9860; + add.s32 %r10337, %r10336, %r10335; + xor.b32 %r10338, %r10337, %r10332; + shf.l.wrap.b32 %r10339, %r10338, %r10338, 24; + add.s32 %r10340, %r10339, %r10333; + xor.b32 %r10341, %r10340, %r10335; + shf.l.wrap.b32 
%r10342, %r10341, %r10341, 25; + add.s32 %r10343, %r10295, %r9832; + add.s32 %r10344, %r10343, %r10286; + xor.b32 %r10345, %r10344, %r10311; + shf.l.wrap.b32 %r10346, %r10345, %r10345, 16; + add.s32 %r10347, %r10346, %r10326; + xor.b32 %r10348, %r10347, %r10286; + shf.l.wrap.b32 %r10349, %r10348, %r10348, 20; + add.s32 %r10350, %r10344, %r9846; + add.s32 %r10351, %r10350, %r10349; + xor.b32 %r10352, %r10351, %r10346; + shf.l.wrap.b32 %r10353, %r10352, %r10352, 24; + add.s32 %r10354, %r10353, %r10347; + xor.b32 %r10355, %r10354, %r10349; + shf.l.wrap.b32 %r10356, %r10355, %r10355, 25; + add.s32 %r10357, %r10309, %r9874; + add.s32 %r10358, %r10357, %r10300; + xor.b32 %r10359, %r10358, %r10325; + shf.l.wrap.b32 %r10360, %r10359, %r10359, 16; + add.s32 %r10361, %r10360, %r10284; + xor.b32 %r10362, %r10361, %r10300; + shf.l.wrap.b32 %r10363, %r10362, %r10362, 20; + add.s32 %r10364, %r10358, %r9839; + add.s32 %r10365, %r10364, %r10363; + xor.b32 %r10366, %r10365, %r10360; + shf.l.wrap.b32 %r10367, %r10366, %r10366, 24; + add.s32 %r10368, %r10367, %r10361; + xor.b32 %r10369, %r10368, %r10363; + shf.l.wrap.b32 %r10370, %r10369, %r10369, 25; + add.s32 %r10371, %r10323, %r9867; + add.s32 %r10372, %r10371, %r10314; + xor.b32 %r10373, %r10372, %r10283; + shf.l.wrap.b32 %r10374, %r10373, %r10373, 16; + add.s32 %r10375, %r10374, %r10298; + xor.b32 %r10376, %r10375, %r10314; + shf.l.wrap.b32 %r10377, %r10376, %r10376, 20; + add.s32 %r10378, %r10372, %r9825; + add.s32 %r10379, %r10378, %r10377; + xor.b32 %r10380, %r10379, %r10374; + shf.l.wrap.b32 %r10381, %r10380, %r10380, 24; + add.s32 %r10382, %r10381, %r10375; + xor.b32 %r10383, %r10382, %r10377; + shf.l.wrap.b32 %r10384, %r10383, %r10383, 25; + add.s32 %r10385, %r10337, %r9818; + add.s32 %r10386, %r10385, %r10356; + xor.b32 %r10387, %r10386, %r10381; + shf.l.wrap.b32 %r10388, %r10387, %r10387, 16; + add.s32 %r10389, %r10388, %r10368; + xor.b32 %r10390, %r10389, %r10356; + shf.l.wrap.b32 %r10391, %r10390, %r10390, 20; + add.s32 %r10392, %r10386, %r9783; + add.s32 %r10393, %r10392, %r10391; + xor.b32 %r10394, %r10393, %r10388; + shf.l.wrap.b32 %r10395, %r10394, %r10394, 24; + add.s32 %r10396, %r10395, %r10389; + xor.b32 %r10397, %r10396, %r10391; + shf.l.wrap.b32 %r10398, %r10397, %r10397, 25; + add.s32 %r10399, %r10351, %r9804; + add.s32 %r10400, %r10399, %r10370; + xor.b32 %r10401, %r10400, %r10339; + shf.l.wrap.b32 %r10402, %r10401, %r10401, 16; + add.s32 %r10403, %r10402, %r10382; + xor.b32 %r10404, %r10403, %r10370; + shf.l.wrap.b32 %r10405, %r10404, %r10404, 20; + add.s32 %r10406, %r10400, %r9790; + add.s32 %r10407, %r10406, %r10405; + xor.b32 %r10408, %r10407, %r10402; + shf.l.wrap.b32 %r10409, %r10408, %r10408, 24; + add.s32 %r10410, %r10409, %r10403; + xor.b32 %r10411, %r10410, %r10405; + shf.l.wrap.b32 %r10412, %r10411, %r10411, 25; + add.s32 %r10413, %r10365, %r9769; + add.s32 %r10414, %r10413, %r10384; + xor.b32 %r10415, %r10414, %r10353; + shf.l.wrap.b32 %r10416, %r10415, %r10415, 16; + add.s32 %r10417, %r10416, %r10340; + xor.b32 %r10418, %r10417, %r10384; + shf.l.wrap.b32 %r10419, %r10418, %r10418, 20; + add.s32 %r10420, %r10414, %r9776; + add.s32 %r10421, %r10420, %r10419; + xor.b32 %r10422, %r10421, %r10416; + shf.l.wrap.b32 %r10423, %r10422, %r10422, 24; + add.s32 %r10424, %r10423, %r10417; + xor.b32 %r10425, %r10424, %r10419; + shf.l.wrap.b32 %r10426, %r10425, %r10425, 25; + add.s32 %r10427, %r10379, %r9811; + add.s32 %r10428, %r10427, %r10342; + xor.b32 %r10429, %r10428, %r10367; + shf.l.wrap.b32 %r10430, %r10429, %r10429, 16; + 
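+ // Local-memory layout visible throughout: the eight 32-bit CV words live at
+ // [%rd3+-104 .. -76], a 64-bit chunk counter at [%rd3+-72] (split into
+ // %r117/%r118 via shr.u64 ... 32), and small bookkeeping bytes at
+ // [%rd3+0..8] (buffer length, blocks-compressed count, flags, CV-stack
+ // depth) -- offsets inferred from the surrounding loads and stores.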
add.s32 %r10431, %r10430, %r10354; + xor.b32 %r10432, %r10431, %r10342; + shf.l.wrap.b32 %r10433, %r10432, %r10432, 20; + add.s32 %r10434, %r10428, %r9797; + add.s32 %r10435, %r10434, %r10433; + xor.b32 %r10436, %r10435, %r10430; + shf.l.wrap.b32 %r10437, %r10436, %r10436, 24; + add.s32 %r10438, %r10437, %r10431; + xor.b32 %r10439, %r10438, %r10433; + shf.l.wrap.b32 %r10440, %r10439, %r10439, 25; + add.s32 %r10441, %r10393, %r9832; + add.s32 %r10442, %r10441, %r10440; + xor.b32 %r10443, %r10442, %r10409; + shf.l.wrap.b32 %r10444, %r10443, %r10443, 16; + add.s32 %r10445, %r10444, %r10424; + xor.b32 %r10446, %r10445, %r10440; + shf.l.wrap.b32 %r10447, %r10446, %r10446, 20; + add.s32 %r10448, %r10442, %r9867; + add.s32 %r10449, %r10448, %r10447; + xor.b32 %r10450, %r10449, %r10444; + shf.l.wrap.b32 %r10451, %r10450, %r10450, 24; + add.s32 %r10452, %r10451, %r10445; + xor.b32 %r10453, %r10452, %r10447; + shf.l.wrap.b32 %r10454, %r10453, %r10453, 25; + add.s32 %r10455, %r10407, %r9846; + add.s32 %r10456, %r10455, %r10398; + xor.b32 %r10457, %r10456, %r10423; + shf.l.wrap.b32 %r10458, %r10457, %r10457, 16; + add.s32 %r10459, %r10458, %r10438; + xor.b32 %r10460, %r10459, %r10398; + shf.l.wrap.b32 %r10461, %r10460, %r10460, 20; + add.s32 %r10462, %r10456, %r9804; + add.s32 %r10463, %r10462, %r10461; + xor.b32 %r10464, %r10463, %r10458; + shf.l.wrap.b32 %r10465, %r10464, %r10464, 24; + add.s32 %r10466, %r10465, %r10459; + xor.b32 %r10467, %r10466, %r10461; + shf.l.wrap.b32 %r10468, %r10467, %r10467, 25; + add.s32 %r10469, %r10421, %r9825; + add.s32 %r10470, %r10469, %r10412; + xor.b32 %r10471, %r10470, %r10437; + shf.l.wrap.b32 %r10472, %r10471, %r10471, 16; + add.s32 %r10473, %r10472, %r10396; + xor.b32 %r10474, %r10473, %r10412; + shf.l.wrap.b32 %r10475, %r10474, %r10474, 20; + add.s32 %r10476, %r10470, %r9853; + add.s32 %r10477, %r10476, %r10475; + xor.b32 %r10478, %r10477, %r10472; + shf.l.wrap.b32 %r10479, %r10478, %r10478, 24; + add.s32 %r10480, %r10479, %r10473; + xor.b32 %r10481, %r10480, %r10475; + shf.l.wrap.b32 %r10482, %r10481, %r10481, 25; + add.s32 %r10483, %r10435, %r9874; + add.s32 %r10484, %r10483, %r10426; + xor.b32 %r10485, %r10484, %r10395; + shf.l.wrap.b32 %r10486, %r10485, %r10485, 16; + add.s32 %r10487, %r10486, %r10410; + xor.b32 %r10488, %r10487, %r10426; + shf.l.wrap.b32 %r10489, %r10488, %r10488, 20; + add.s32 %r10490, %r10484, %r9776; + add.s32 %r10491, %r10490, %r10489; + xor.b32 %r10492, %r10491, %r10486; + shf.l.wrap.b32 %r10493, %r10492, %r10492, 24; + add.s32 %r10494, %r10493, %r10487; + xor.b32 %r10495, %r10494, %r10489; + shf.l.wrap.b32 %r10496, %r10495, %r10495, 25; + add.s32 %r10497, %r10449, %r9860; + add.s32 %r10498, %r10497, %r10468; + xor.b32 %r10499, %r10498, %r10493; + shf.l.wrap.b32 %r10500, %r10499, %r10499, 16; + add.s32 %r10501, %r10500, %r10480; + xor.b32 %r10502, %r10501, %r10468; + shf.l.wrap.b32 %r10503, %r10502, %r10502, 20; + add.s32 %r10504, %r10498, %r9790; + add.s32 %r10505, %r10504, %r10503; + xor.b32 %r10506, %r10505, %r10500; + shf.l.wrap.b32 %r10507, %r10506, %r10506, 24; + add.s32 %r10508, %r10507, %r10501; + xor.b32 %r10509, %r10508, %r10503; + shf.l.wrap.b32 %r10510, %r10509, %r10509, 25; + add.s32 %r10511, %r10463, %r9769; + add.s32 %r10512, %r10511, %r10482; + xor.b32 %r10513, %r10512, %r10451; + shf.l.wrap.b32 %r10514, %r10513, %r10513, 16; + add.s32 %r10515, %r10514, %r10494; + xor.b32 %r10516, %r10515, %r10482; + shf.l.wrap.b32 %r10517, %r10516, %r10516, 20; + add.s32 %r10518, %r10512, %r9839; + add.s32 %r10519, %r10518, %r10517; + 
xor.b32 %r10520, %r10519, %r10514; + shf.l.wrap.b32 %r10521, %r10520, %r10520, 24; + add.s32 %r10522, %r10521, %r10515; + xor.b32 %r10523, %r10522, %r10517; + shf.l.wrap.b32 %r10524, %r10523, %r10523, 25; + add.s32 %r10525, %r10477, %r9783; + add.s32 %r10526, %r10525, %r10496; + xor.b32 %r10527, %r10526, %r10465; + shf.l.wrap.b32 %r10528, %r10527, %r10527, 16; + add.s32 %r10529, %r10528, %r10452; + xor.b32 %r10530, %r10529, %r10496; + shf.l.wrap.b32 %r10531, %r10530, %r10530, 20; + add.s32 %r10532, %r10526, %r9811; + add.s32 %r10533, %r10532, %r10531; + xor.b32 %r10534, %r10533, %r10528; + shf.l.wrap.b32 %r10535, %r10534, %r10534, 24; + add.s32 %r10536, %r10535, %r10529; + xor.b32 %r10537, %r10536, %r10531; + shf.l.wrap.b32 %r10538, %r10537, %r10537, 25; + add.s32 %r10539, %r10491, %r9797; + add.s32 %r10540, %r10539, %r10454; + xor.b32 %r10541, %r10540, %r10479; + shf.l.wrap.b32 %r10542, %r10541, %r10541, 16; + add.s32 %r10543, %r10542, %r10466; + xor.b32 %r10544, %r10543, %r10454; + shf.l.wrap.b32 %r10545, %r10544, %r10544, 20; + add.s32 %r10546, %r10540, %r9818; + add.s32 %r10547, %r10546, %r10545; + xor.b32 %r10548, %r10547, %r10542; + shf.l.wrap.b32 %r10549, %r10548, %r10548, 24; + add.s32 %r10550, %r10549, %r10543; + xor.b32 %r10551, %r10550, %r10545; + shf.l.wrap.b32 %r10552, %r10551, %r10551, 25; + add.s32 %r10553, %r10505, %r9846; + add.s32 %r10554, %r10553, %r10552; + xor.b32 %r10555, %r10554, %r10521; + shf.l.wrap.b32 %r10556, %r10555, %r10555, 16; + add.s32 %r10557, %r10556, %r10536; + xor.b32 %r10558, %r10557, %r10552; + shf.l.wrap.b32 %r10559, %r10558, %r10558, 20; + add.s32 %r10560, %r10554, %r9874; + add.s32 %r10561, %r10560, %r10559; + xor.b32 %r10562, %r10561, %r10556; + shf.l.wrap.b32 %r10563, %r10562, %r10562, 24; + add.s32 %r10564, %r10563, %r10557; + xor.b32 %r10565, %r10564, %r10559; + shf.l.wrap.b32 %r10566, %r10565, %r10565, 25; + add.s32 %r10567, %r10519, %r9804; + add.s32 %r10568, %r10567, %r10510; + xor.b32 %r10569, %r10568, %r10535; + shf.l.wrap.b32 %r10570, %r10569, %r10569, 16; + add.s32 %r10571, %r10570, %r10550; + xor.b32 %r10572, %r10571, %r10510; + shf.l.wrap.b32 %r10573, %r10572, %r10572, 20; + add.s32 %r10574, %r10568, %r9769; + add.s32 %r10575, %r10574, %r10573; + xor.b32 %r10576, %r10575, %r10570; + shf.l.wrap.b32 %r10577, %r10576, %r10576, 24; + add.s32 %r10578, %r10577, %r10571; + xor.b32 %r10579, %r10578, %r10573; + shf.l.wrap.b32 %r10580, %r10579, %r10579, 25; + add.s32 %r10581, %r10533, %r9776; + add.s32 %r10582, %r10581, %r10524; + xor.b32 %r10583, %r10582, %r10549; + shf.l.wrap.b32 %r10584, %r10583, %r10583, 16; + add.s32 %r10585, %r10584, %r10508; + xor.b32 %r10586, %r10585, %r10524; + shf.l.wrap.b32 %r10587, %r10586, %r10586, 20; + add.s32 %r10588, %r10582, %r9832; + add.s32 %r10589, %r10588, %r10587; + xor.b32 %r10590, %r10589, %r10584; + shf.l.wrap.b32 %r10591, %r10590, %r10590, 24; + add.s32 %r10592, %r10591, %r10585; + xor.b32 %r10593, %r10592, %r10587; + shf.l.wrap.b32 %r10594, %r10593, %r10593, 25; + add.s32 %r10595, %r10547, %r9825; + add.s32 %r10596, %r10595, %r10538; + xor.b32 %r10597, %r10596, %r10507; + shf.l.wrap.b32 %r10598, %r10597, %r10597, 16; + add.s32 %r10599, %r10598, %r10522; + xor.b32 %r10600, %r10599, %r10538; + shf.l.wrap.b32 %r10601, %r10600, %r10600, 20; + add.s32 %r10602, %r10596, %r9811; + add.s32 %r10603, %r10602, %r10601; + xor.b32 %r10604, %r10603, %r10598; + shf.l.wrap.b32 %r10605, %r10604, %r10604, 24; + add.s32 %r10606, %r10605, %r10599; + xor.b32 %r10607, %r10606, %r10601; + shf.l.wrap.b32 %r10608, %r10607, 
%r10607, 25;
+ add.s32 %r10609, %r10561, %r9867;
+ add.s32 %r10610, %r10609, %r10580;
+ xor.b32 %r10611, %r10610, %r10605;
+ shf.l.wrap.b32 %r10612, %r10611, %r10611, 16;
+ add.s32 %r10613, %r10612, %r10592;
+ xor.b32 %r10614, %r10613, %r10580;
+ shf.l.wrap.b32 %r10615, %r10614, %r10614, 20;
+ add.s32 %r10616, %r10610, %r9839;
+ add.s32 %r10617, %r10616, %r10615;
+ xor.b32 %r10618, %r10617, %r10612;
+ shf.l.wrap.b32 %r10619, %r10618, %r10618, 24;
+ add.s32 %r10620, %r10619, %r10613;
+ xor.b32 %r10621, %r10620, %r10615;
+ shf.l.wrap.b32 %r10622, %r10621, %r10621, 25;
+ add.s32 %r10623, %r10575, %r9783;
+ add.s32 %r10624, %r10623, %r10594;
+ xor.b32 %r10625, %r10624, %r10563;
+ shf.l.wrap.b32 %r10626, %r10625, %r10625, 16;
+ add.s32 %r10627, %r10626, %r10606;
+ xor.b32 %r10628, %r10627, %r10594;
+ shf.l.wrap.b32 %r10629, %r10628, %r10628, 20;
+ add.s32 %r10630, %r10624, %r9853;
+ add.s32 %r10631, %r10630, %r10629;
+ xor.b32 %r10632, %r10631, %r10626;
+ shf.l.wrap.b32 %r10633, %r10632, %r10632, 24;
+ add.s32 %r10634, %r10633, %r10627;
+ xor.b32 %r10635, %r10634, %r10629;
+ shf.l.wrap.b32 %r10636, %r10635, %r10635, 25;
+ add.s32 %r10637, %r10589, %r9790;
+ add.s32 %r10638, %r10637, %r10608;
+ xor.b32 %r10639, %r10638, %r10577;
+ shf.l.wrap.b32 %r10640, %r10639, %r10639, 16;
+ add.s32 %r10641, %r10640, %r10564;
+ xor.b32 %r10642, %r10641, %r10608;
+ shf.l.wrap.b32 %r10643, %r10642, %r10642, 20;
+ add.s32 %r10644, %r10638, %r9797;
+ add.s32 %r10645, %r10644, %r10643;
+ xor.b32 %r10646, %r10645, %r10640;
+ shf.l.wrap.b32 %r10647, %r10646, %r10646, 24;
+ add.s32 %r10648, %r10647, %r10641;
+ xor.b32 %r10649, %r10648, %r10643;
+ shf.l.wrap.b32 %r10650, %r10649, %r10649, 25;
+ add.s32 %r10651, %r10603, %r9818;
+ add.s32 %r10652, %r10651, %r10566;
+ xor.b32 %r10653, %r10652, %r10591;
+ shf.l.wrap.b32 %r10654, %r10653, %r10653, 16;
+ add.s32 %r10655, %r10654, %r10578;
+ xor.b32 %r10656, %r10655, %r10566;
+ shf.l.wrap.b32 %r10657, %r10656, %r10656, 20;
+ add.s32 %r10658, %r10652, %r9860;
+ add.s32 %r10659, %r10658, %r10657;
+ xor.b32 %r10660, %r10659, %r10654;
+ shf.l.wrap.b32 %r10661, %r10660, %r10660, 24;
+ add.s32 %r10662, %r10661, %r10655;
+ xor.b32 %r10663, %r10662, %r10657;
+ shf.l.wrap.b32 %r10664, %r10663, %r10663, 25;
+ xor.b32 %r11689, %r10648, %r10617;
+ st.local.u32 [%rd3+-104], %r11689;
+ xor.b32 %r11688, %r10662, %r10631;
+ st.local.u32 [%rd3+-100], %r11688;
+ xor.b32 %r11687, %r10620, %r10645;
+ st.local.u32 [%rd3+-96], %r11687;
+ xor.b32 %r11686, %r10659, %r10634;
+ st.local.u32 [%rd3+-92], %r11686;
+ xor.b32 %r11685, %r10664, %r10633;
+ st.local.u32 [%rd3+-88], %r11685;
+ xor.b32 %r11684, %r10622, %r10647;
+ st.local.u32 [%rd3+-84], %r11684;
+ xor.b32 %r11683, %r10661, %r10636;
+ st.local.u32 [%rd3+-80], %r11683;
+ xor.b32 %r11682, %r10650, %r10619;
+ st.local.u32 [%rd3+-76], %r11682;
+ add.s16 %rs391, %rs391, 1;
+ st.local.u8 [%rd3+1], %rs391;
+ add.s64 %rd261, %rd261, 64;
+ add.s64 %rd271, %rd271, -64;
+ setp.gt.u64 %p49, %rd271, 64;
+ @%p49 bra $L__BB1_60;
+ bra.uni $L__BB1_61;
+
+$L__BB1_58:
+ ld.local.u64 %rd269, [%rd3+-72];
+
+$L__BB1_61:
+ cvt.u64.u16 %rd210, %rs390;
+ and.b64 %rd92, %rd210, 255;
+ mov.u64 %rd211, 64;
+ sub.s64 %rd212, %rd211, %rd92;
+ min.u64 %rd93, %rd212, %rd271;
+ setp.eq.s64 %p50, %rd93, 0;
+ @%p50 bra $L__BB1_64;
+
+ add.s64 %rd214, %rd2, %rd92;
+ add.s64 %rd94, %rd214, 72;
+ mov.u64 %rd272, 0;
+
+$L__BB1_63:
+ add.s64 %rd215, %rd261, %rd272;
+ ld.local.u8 %rs345, [%rd215];
+ add.s64 %rd216, %rd94, %rd272;
+ st.local.u8 [%rd216], %rs345;
+ add.s64 %rd272, %rd272, 1;
+ setp.lt.u64 %p51, %rd272, %rd93;
+ @%p51 bra $L__BB1_63;
+
+$L__BB1_64:
+ cvt.u16.u64 %rs346, %rd93;
+ ld.local.u8 %rs347, [%rd3];
+ add.s16 %rs348, %rs347, %rs346;
+ st.local.u8 [%rd3], %rs348;
+ ld.local.u8 %rs392, [%rd3+8];
+ cvt.u64.u16 %rd217, %rs392;
+ and.b64 %rd218, %rd217, 255;
+ popc.b64 %r10665, %rd269;
+ cvt.u64.u32 %rd97, %r10665;
+ setp.ge.u64 %p52, %rd97, %rd218;
+ @%p52 bra $L__BB1_68;
+
+ ld.local.u8 %r10666, [%rd3+2];
+ or.b32 %r135, %r10666, 4;
+ ld.local.u8 %r10667, [%rd3+-120];
+ ld.local.u8 %r10668, [%rd3+-119];
+ prmt.b32 %r10669, %r10668, %r10667, 30212;
+ ld.local.u8 %r10670, [%rd3+-118];
+ ld.local.u8 %r10671, [%rd3+-117];
+ prmt.b32 %r10672, %r10671, %r10670, 30212;
+ prmt.b32 %r136, %r10672, %r10669, 4180;
+ ld.local.u8 %r10673, [%rd3+-136];
+ ld.local.u8 %r10674, [%rd3+-135];
+ prmt.b32 %r10675, %r10674, %r10673, 30212;
+ ld.local.u8 %r10676, [%rd3+-134];
+ ld.local.u8 %r10677, [%rd3+-133];
+ prmt.b32 %r10678, %r10677, %r10676, 30212;
+ prmt.b32 %r10679, %r10678, %r10675, 4180;
+ add.s32 %r137, %r136, %r10679;
+ ld.local.u8 %r10680, [%rd3+-116];
+ ld.local.u8 %r10681, [%rd3+-115];
+ prmt.b32 %r10682, %r10681, %r10680, 30212;
+ ld.local.u8 %r10683, [%rd3+-114];
+ ld.local.u8 %r10684, [%rd3+-113];
+ prmt.b32 %r10685, %r10684, %r10683, 30212;
+ prmt.b32 %r138, %r10685, %r10682, 4180;
+ ld.local.u8 %r10686, [%rd3+-132];
+ ld.local.u8 %r10687, [%rd3+-131];
+ prmt.b32 %r10688, %r10687, %r10686, 30212;
+ ld.local.u8 %r10689, [%rd3+-130];
+ ld.local.u8 %r10690, [%rd3+-129];
+ prmt.b32 %r10691, %r10690, %r10689, 30212;
+ prmt.b32 %r10692, %r10691, %r10688, 4180;
+ add.s32 %r139, %r138, %r10692;
+ ld.local.u8 %r10693, [%rd3+-112];
+ ld.local.u8 %r10694, [%rd3+-111];
+ prmt.b32 %r10695, %r10694, %r10693, 30212;
+ ld.local.u8 %r10696, [%rd3+-110];
+ ld.local.u8 %r10697, [%rd3+-109];
+ prmt.b32 %r10698, %r10697, %r10696, 30212;
+ prmt.b32 %r140, %r10698, %r10695, 4180;
+ ld.local.u8 %r10699, [%rd3+-128];
+ ld.local.u8 %r10700, [%rd3+-127];
+ prmt.b32 %r10701, %r10700, %r10699, 30212;
+ ld.local.u8 %r10702, [%rd3+-126];
+ ld.local.u8 %r10703, [%rd3+-125];
+ prmt.b32 %r10704, %r10703, %r10702, 30212;
+ prmt.b32 %r10705, %r10704, %r10701, 4180;
+ add.s32 %r141, %r140, %r10705;
+ ld.local.u8 %r10706, [%rd3+-108];
+ ld.local.u8 %r10707, [%rd3+-107];
+ prmt.b32 %r10708, %r10707, %r10706, 30212;
+ ld.local.u8 %r10709, [%rd3+-106];
+ ld.local.u8 %r10710, [%rd3+-105];
+ prmt.b32 %r10711, %r10710, %r10709, 30212;
+ prmt.b32 %r142, %r10711, %r10708, 4180;
+ ld.local.u8 %r10712, [%rd3+-124];
+ ld.local.u8 %r10713, [%rd3+-123];
+ prmt.b32 %r10714, %r10713, %r10712, 30212;
+ ld.local.u8 %r10715, [%rd3+-122];
+ ld.local.u8 %r10716, [%rd3+-121];
+ prmt.b32 %r10717, %r10716, %r10715, 30212;
+ prmt.b32 %r10718, %r10717, %r10714, 4180;
+ add.s32 %r143, %r142, %r10718;
+
+$L__BB1_66:
+ and.b16 %rs349, %rs392, 255;
+ mul.wide.u16 %r10719, %rs349, 32;
+ add.s32 %r10720, %r10719, -64;
+ cvt.s64.s32 %rd219, %r10720;
+ add.s64 %rd220, %rd2, %rd219;
+ ld.local.u8 %r10721, [%rd220+145];
+ ld.local.u8 %r10722, [%rd220+146];
+ prmt.b32 %r10723, %r10722, %r10721, 30212;
+ ld.local.u8 %r10724, [%rd220+147];
+ prmt.b32 %r10725, %r10724, %r10723, 28756;
+ ld.local.u8 %r10726, [%rd220+148];
+ prmt.b32 %r10727, %r10726, %r10725, 1620;
+ ld.local.u8 %r10728, [%rd220+149];
+ ld.local.u8 %r10729, [%rd220+150];
+ prmt.b32 %r10730, %r10729, %r10728, 30212;
+ ld.local.u8 %r10731, [%rd220+151];
+ prmt.b32 %r10732, %r10731, %r10730, 28756;
+ ld.local.u8 %r10733, [%rd220+152];
+ prmt.b32 %r10734,
%r10733, %r10732, 1620; + ld.local.u8 %r10735, [%rd220+153]; + ld.local.u8 %r10736, [%rd220+154]; + prmt.b32 %r10737, %r10736, %r10735, 30212; + ld.local.u8 %r10738, [%rd220+155]; + prmt.b32 %r10739, %r10738, %r10737, 28756; + ld.local.u8 %r10740, [%rd220+156]; + prmt.b32 %r10741, %r10740, %r10739, 1620; + ld.local.u8 %r10742, [%rd220+157]; + ld.local.u8 %r10743, [%rd220+158]; + prmt.b32 %r10744, %r10743, %r10742, 30212; + ld.local.u8 %r10745, [%rd220+159]; + prmt.b32 %r10746, %r10745, %r10744, 28756; + ld.local.u8 %r10747, [%rd220+160]; + prmt.b32 %r10748, %r10747, %r10746, 1620; + ld.local.u8 %r10749, [%rd220+161]; + ld.local.u8 %r10750, [%rd220+162]; + prmt.b32 %r10751, %r10750, %r10749, 30212; + ld.local.u8 %r10752, [%rd220+163]; + prmt.b32 %r10753, %r10752, %r10751, 28756; + ld.local.u8 %r10754, [%rd220+164]; + prmt.b32 %r10755, %r10754, %r10753, 1620; + ld.local.u8 %r10756, [%rd220+165]; + ld.local.u8 %r10757, [%rd220+166]; + prmt.b32 %r10758, %r10757, %r10756, 30212; + ld.local.u8 %r10759, [%rd220+167]; + prmt.b32 %r10760, %r10759, %r10758, 28756; + ld.local.u8 %r10761, [%rd220+168]; + prmt.b32 %r10762, %r10761, %r10760, 1620; + ld.local.u8 %r10763, [%rd220+169]; + ld.local.u8 %r10764, [%rd220+170]; + prmt.b32 %r10765, %r10764, %r10763, 30212; + ld.local.u8 %r10766, [%rd220+171]; + prmt.b32 %r10767, %r10766, %r10765, 28756; + ld.local.u8 %r10768, [%rd220+172]; + prmt.b32 %r10769, %r10768, %r10767, 1620; + ld.local.u8 %r10770, [%rd220+173]; + ld.local.u8 %r10771, [%rd220+174]; + prmt.b32 %r10772, %r10771, %r10770, 30212; + ld.local.u8 %r10773, [%rd220+175]; + prmt.b32 %r10774, %r10773, %r10772, 28756; + ld.local.u8 %r10775, [%rd220+176]; + prmt.b32 %r10776, %r10775, %r10774, 1620; + ld.local.u8 %r10777, [%rd220+177]; + ld.local.u8 %r10778, [%rd220+178]; + prmt.b32 %r10779, %r10778, %r10777, 30212; + ld.local.u8 %r10780, [%rd220+179]; + prmt.b32 %r10781, %r10780, %r10779, 28756; + ld.local.u8 %r10782, [%rd220+180]; + prmt.b32 %r10783, %r10782, %r10781, 1620; + ld.local.u8 %r10784, [%rd220+181]; + ld.local.u8 %r10785, [%rd220+182]; + prmt.b32 %r10786, %r10785, %r10784, 30212; + ld.local.u8 %r10787, [%rd220+183]; + prmt.b32 %r10788, %r10787, %r10786, 28756; + ld.local.u8 %r10789, [%rd220+184]; + prmt.b32 %r10790, %r10789, %r10788, 1620; + ld.local.u8 %r10791, [%rd220+185]; + ld.local.u8 %r10792, [%rd220+186]; + prmt.b32 %r10793, %r10792, %r10791, 30212; + ld.local.u8 %r10794, [%rd220+187]; + prmt.b32 %r10795, %r10794, %r10793, 28756; + ld.local.u8 %r10796, [%rd220+188]; + prmt.b32 %r10797, %r10796, %r10795, 1620; + ld.local.u8 %r10798, [%rd220+189]; + ld.local.u8 %r10799, [%rd220+190]; + prmt.b32 %r10800, %r10799, %r10798, 30212; + ld.local.u8 %r10801, [%rd220+191]; + prmt.b32 %r10802, %r10801, %r10800, 28756; + ld.local.u8 %r10803, [%rd220+192]; + prmt.b32 %r10804, %r10803, %r10802, 1620; + ld.local.u8 %r10805, [%rd220+193]; + ld.local.u8 %r10806, [%rd220+194]; + prmt.b32 %r10807, %r10806, %r10805, 30212; + ld.local.u8 %r10808, [%rd220+195]; + prmt.b32 %r10809, %r10808, %r10807, 28756; + ld.local.u8 %r10810, [%rd220+196]; + prmt.b32 %r10811, %r10810, %r10809, 1620; + ld.local.u8 %r10812, [%rd220+197]; + ld.local.u8 %r10813, [%rd220+198]; + prmt.b32 %r10814, %r10813, %r10812, 30212; + ld.local.u8 %r10815, [%rd220+199]; + prmt.b32 %r10816, %r10815, %r10814, 28756; + ld.local.u8 %r10817, [%rd220+200]; + prmt.b32 %r10818, %r10817, %r10816, 1620; + ld.local.u8 %r10819, [%rd220+201]; + ld.local.u8 %r10820, [%rd220+202]; + prmt.b32 %r10821, %r10820, %r10819, 30212; + ld.local.u8 %r10822, 
[%rd220+203]; + prmt.b32 %r10823, %r10822, %r10821, 28756; + ld.local.u8 %r10824, [%rd220+204]; + prmt.b32 %r10825, %r10824, %r10823, 1620; + ld.local.u8 %r10826, [%rd220+205]; + ld.local.u8 %r10827, [%rd220+206]; + prmt.b32 %r10828, %r10827, %r10826, 30212; + ld.local.u8 %r10829, [%rd220+207]; + prmt.b32 %r10830, %r10829, %r10828, 28756; + ld.local.u8 %r10831, [%rd220+208]; + prmt.b32 %r10832, %r10831, %r10830, 1620; + add.s32 %r10833, %r137, %r10727; + shf.l.wrap.b32 %r10834, %r10833, %r10833, 16; + add.s32 %r10835, %r10834, 1779033703; + xor.b32 %r10836, %r10835, %r136; + shf.l.wrap.b32 %r10837, %r10836, %r10836, 20; + add.s32 %r10838, %r10734, %r10833; + add.s32 %r10839, %r10838, %r10837; + xor.b32 %r10840, %r10839, %r10834; + shf.l.wrap.b32 %r10841, %r10840, %r10840, 24; + add.s32 %r10842, %r10841, %r10835; + xor.b32 %r10843, %r10842, %r10837; + shf.l.wrap.b32 %r10844, %r10843, %r10843, 25; + add.s32 %r10845, %r139, %r10741; + shf.l.wrap.b32 %r10846, %r10845, %r10845, 16; + add.s32 %r10847, %r10846, -1150833019; + xor.b32 %r10848, %r10847, %r138; + shf.l.wrap.b32 %r10849, %r10848, %r10848, 20; + add.s32 %r10850, %r10748, %r10845; + add.s32 %r10851, %r10850, %r10849; + xor.b32 %r10852, %r10851, %r10846; + shf.l.wrap.b32 %r10853, %r10852, %r10852, 24; + add.s32 %r10854, %r10853, %r10847; + xor.b32 %r10855, %r10854, %r10849; + shf.l.wrap.b32 %r10856, %r10855, %r10855, 25; + add.s32 %r10857, %r141, %r10755; + shr.u32 %r10858, %r10857, 16; + shl.b32 %r10859, %r10857, 16; + xor.b32 %r10860, %r10859, 4194304; + or.b32 %r10861, %r10860, %r10858; + add.s32 %r10862, %r10861, 1013904242; + xor.b32 %r10863, %r10862, %r140; + shf.l.wrap.b32 %r10864, %r10863, %r10863, 20; + add.s32 %r10865, %r10762, %r10857; + add.s32 %r10866, %r10865, %r10864; + xor.b32 %r10867, %r10866, %r10861; + shf.l.wrap.b32 %r10868, %r10867, %r10867, 24; + add.s32 %r10869, %r10868, %r10862; + xor.b32 %r10870, %r10869, %r10864; + shf.l.wrap.b32 %r10871, %r10870, %r10870, 25; + add.s32 %r10872, %r143, %r10769; + xor.b32 %r10873, %r10872, %r135; + shr.u32 %r10874, %r10872, 16; + shl.b32 %r10875, %r10873, 16; + or.b32 %r10876, %r10875, %r10874; + add.s32 %r10877, %r10876, -1521486534; + xor.b32 %r10878, %r10877, %r142; + shf.l.wrap.b32 %r10879, %r10878, %r10878, 20; + add.s32 %r10880, %r10776, %r10872; + add.s32 %r10881, %r10880, %r10879; + xor.b32 %r10882, %r10881, %r10876; + shf.l.wrap.b32 %r10883, %r10882, %r10882, 24; + add.s32 %r10884, %r10883, %r10877; + xor.b32 %r10885, %r10884, %r10879; + shf.l.wrap.b32 %r10886, %r10885, %r10885, 25; + add.s32 %r10887, %r10856, %r10839; + add.s32 %r10888, %r10887, %r10783; + xor.b32 %r10889, %r10883, %r10888; + shf.l.wrap.b32 %r10890, %r10889, %r10889, 16; + add.s32 %r10891, %r10890, %r10869; + xor.b32 %r10892, %r10891, %r10856; + shf.l.wrap.b32 %r10893, %r10892, %r10892, 20; + add.s32 %r10894, %r10790, %r10888; + add.s32 %r10895, %r10894, %r10893; + xor.b32 %r10896, %r10895, %r10890; + shf.l.wrap.b32 %r10897, %r10896, %r10896, 24; + add.s32 %r10898, %r10897, %r10891; + xor.b32 %r10899, %r10898, %r10893; + shf.l.wrap.b32 %r10900, %r10899, %r10899, 25; + add.s32 %r10901, %r10871, %r10851; + add.s32 %r10902, %r10901, %r10797; + xor.b32 %r10903, %r10902, %r10841; + shf.l.wrap.b32 %r10904, %r10903, %r10903, 16; + add.s32 %r10905, %r10904, %r10884; + xor.b32 %r10906, %r10905, %r10871; + shf.l.wrap.b32 %r10907, %r10906, %r10906, 20; + add.s32 %r10908, %r10804, %r10902; + add.s32 %r10909, %r10908, %r10907; + xor.b32 %r10910, %r10909, %r10904; + shf.l.wrap.b32 %r10911, %r10910, %r10910, 24; + 
add.s32 %r10912, %r10911, %r10905; + xor.b32 %r10913, %r10912, %r10907; + shf.l.wrap.b32 %r10914, %r10913, %r10913, 25; + add.s32 %r10915, %r10886, %r10866; + add.s32 %r10916, %r10915, %r10811; + xor.b32 %r10917, %r10916, %r10853; + shf.l.wrap.b32 %r10918, %r10917, %r10917, 16; + add.s32 %r10919, %r10918, %r10842; + xor.b32 %r10920, %r10919, %r10886; + shf.l.wrap.b32 %r10921, %r10920, %r10920, 20; + add.s32 %r10922, %r10818, %r10916; + add.s32 %r10923, %r10922, %r10921; + xor.b32 %r10924, %r10923, %r10918; + shf.l.wrap.b32 %r10925, %r10924, %r10924, 24; + add.s32 %r10926, %r10925, %r10919; + xor.b32 %r10927, %r10926, %r10921; + shf.l.wrap.b32 %r10928, %r10927, %r10927, 25; + add.s32 %r10929, %r10881, %r10844; + add.s32 %r10930, %r10929, %r10825; + xor.b32 %r10931, %r10930, %r10868; + shf.l.wrap.b32 %r10932, %r10931, %r10931, 16; + add.s32 %r10933, %r10932, %r10854; + xor.b32 %r10934, %r10933, %r10844; + shf.l.wrap.b32 %r10935, %r10934, %r10934, 20; + add.s32 %r10936, %r10832, %r10930; + add.s32 %r10937, %r10936, %r10935; + xor.b32 %r10938, %r10937, %r10932; + shf.l.wrap.b32 %r10939, %r10938, %r10938, 24; + add.s32 %r10940, %r10939, %r10933; + xor.b32 %r10941, %r10940, %r10935; + shf.l.wrap.b32 %r10942, %r10941, %r10941, 25; + add.s32 %r10943, %r10895, %r10741; + add.s32 %r10944, %r10943, %r10942; + xor.b32 %r10945, %r10944, %r10911; + shf.l.wrap.b32 %r10946, %r10945, %r10945, 16; + add.s32 %r10947, %r10946, %r10926; + xor.b32 %r10948, %r10947, %r10942; + shf.l.wrap.b32 %r10949, %r10948, %r10948, 20; + add.s32 %r10950, %r10944, %r10769; + add.s32 %r10951, %r10950, %r10949; + xor.b32 %r10952, %r10951, %r10946; + shf.l.wrap.b32 %r10953, %r10952, %r10952, 24; + add.s32 %r10954, %r10953, %r10947; + xor.b32 %r10955, %r10954, %r10949; + shf.l.wrap.b32 %r10956, %r10955, %r10955, 25; + add.s32 %r10957, %r10909, %r10748; + add.s32 %r10958, %r10957, %r10900; + xor.b32 %r10959, %r10925, %r10958; + shf.l.wrap.b32 %r10960, %r10959, %r10959, 16; + add.s32 %r10961, %r10940, %r10960; + xor.b32 %r10962, %r10961, %r10900; + shf.l.wrap.b32 %r10963, %r10962, %r10962, 20; + add.s32 %r10964, %r10958, %r10797; + add.s32 %r10965, %r10964, %r10963; + xor.b32 %r10966, %r10965, %r10960; + shf.l.wrap.b32 %r10967, %r10966, %r10966, 24; + add.s32 %r10968, %r10967, %r10961; + xor.b32 %r10969, %r10968, %r10963; + shf.l.wrap.b32 %r10970, %r10969, %r10969, 25; + add.s32 %r10971, %r10914, %r10776; + add.s32 %r10972, %r10971, %r10923; + xor.b32 %r10973, %r10939, %r10972; + shf.l.wrap.b32 %r10974, %r10973, %r10973, 16; + add.s32 %r10975, %r10974, %r10898; + xor.b32 %r10976, %r10975, %r10914; + shf.l.wrap.b32 %r10977, %r10976, %r10976, 20; + add.s32 %r10978, %r10972, %r10727; + add.s32 %r10979, %r10978, %r10977; + xor.b32 %r10980, %r10979, %r10974; + shf.l.wrap.b32 %r10981, %r10980, %r10980, 24; + add.s32 %r10982, %r10981, %r10975; + xor.b32 %r10983, %r10982, %r10977; + shf.l.wrap.b32 %r10984, %r10983, %r10983, 25; + add.s32 %r10985, %r10928, %r10755; + add.s32 %r10986, %r10985, %r10937; + xor.b32 %r10987, %r10986, %r10897; + shf.l.wrap.b32 %r10988, %r10987, %r10987, 16; + add.s32 %r10989, %r10988, %r10912; + xor.b32 %r10990, %r10989, %r10928; + shf.l.wrap.b32 %r10991, %r10990, %r10990, 20; + add.s32 %r10992, %r10986, %r10818; + add.s32 %r10993, %r10992, %r10991; + xor.b32 %r10994, %r10993, %r10988; + shf.l.wrap.b32 %r10995, %r10994, %r10994, 24; + add.s32 %r10996, %r10995, %r10989; + xor.b32 %r10997, %r10996, %r10991; + shf.l.wrap.b32 %r10998, %r10997, %r10997, 25; + add.s32 %r10999, %r10970, %r10734; + add.s32 %r11000, 
%r10999, %r10951; + xor.b32 %r11001, %r11000, %r10995; + shf.l.wrap.b32 %r11002, %r11001, %r11001, 16; + add.s32 %r11003, %r11002, %r10982; + xor.b32 %r11004, %r11003, %r10970; + shf.l.wrap.b32 %r11005, %r11004, %r11004, 20; + add.s32 %r11006, %r11000, %r10804; + add.s32 %r11007, %r11006, %r11005; + xor.b32 %r11008, %r11007, %r11002; + shf.l.wrap.b32 %r11009, %r11008, %r11008, 24; + add.s32 %r11010, %r11009, %r11003; + xor.b32 %r11011, %r11010, %r11005; + shf.l.wrap.b32 %r11012, %r11011, %r11011, 25; + add.s32 %r11013, %r10965, %r10811; + add.s32 %r11014, %r11013, %r10984; + xor.b32 %r11015, %r10953, %r11014; + shf.l.wrap.b32 %r11016, %r11015, %r11015, 16; + add.s32 %r11017, %r11016, %r10996; + xor.b32 %r11018, %r11017, %r10984; + shf.l.wrap.b32 %r11019, %r11018, %r11018, 20; + add.s32 %r11020, %r11014, %r10762; + add.s32 %r11021, %r11020, %r11019; + xor.b32 %r11022, %r11021, %r11016; + shf.l.wrap.b32 %r11023, %r11022, %r11022, 24; + add.s32 %r11024, %r11023, %r11017; + xor.b32 %r11025, %r11024, %r11019; + shf.l.wrap.b32 %r11026, %r11025, %r11025, 25; + add.s32 %r11027, %r10979, %r10790; + add.s32 %r11028, %r11027, %r10998; + xor.b32 %r11029, %r11028, %r10967; + shf.l.wrap.b32 %r11030, %r11029, %r11029, 16; + add.s32 %r11031, %r11030, %r10954; + xor.b32 %r11032, %r11031, %r10998; + shf.l.wrap.b32 %r11033, %r11032, %r11032, 20; + add.s32 %r11034, %r11028, %r10825; + add.s32 %r11035, %r11034, %r11033; + xor.b32 %r11036, %r11035, %r11030; + shf.l.wrap.b32 %r11037, %r11036, %r11036, 24; + add.s32 %r11038, %r11037, %r11031; + xor.b32 %r11039, %r11038, %r11033; + shf.l.wrap.b32 %r11040, %r11039, %r11039, 25; + add.s32 %r11041, %r10993, %r10832; + add.s32 %r11042, %r11041, %r10956; + xor.b32 %r11043, %r11042, %r10981; + shf.l.wrap.b32 %r11044, %r11043, %r11043, 16; + add.s32 %r11045, %r11044, %r10968; + xor.b32 %r11046, %r11045, %r10956; + shf.l.wrap.b32 %r11047, %r11046, %r11046, 20; + add.s32 %r11048, %r11042, %r10783; + add.s32 %r11049, %r11048, %r11047; + xor.b32 %r11050, %r11049, %r11044; + shf.l.wrap.b32 %r11051, %r11050, %r11050, 24; + add.s32 %r11052, %r11051, %r11045; + xor.b32 %r11053, %r11052, %r11047; + shf.l.wrap.b32 %r11054, %r11053, %r11053, 25; + add.s32 %r11055, %r11007, %r10748; + add.s32 %r11056, %r11055, %r11054; + xor.b32 %r11057, %r11056, %r11023; + shf.l.wrap.b32 %r11058, %r11057, %r11057, 16; + add.s32 %r11059, %r11058, %r11038; + xor.b32 %r11060, %r11059, %r11054; + shf.l.wrap.b32 %r11061, %r11060, %r11060, 20; + add.s32 %r11062, %r11056, %r10755; + add.s32 %r11063, %r11062, %r11061; + xor.b32 %r11064, %r11063, %r11058; + shf.l.wrap.b32 %r11065, %r11064, %r11064, 24; + add.s32 %r11066, %r11065, %r11059; + xor.b32 %r11067, %r11066, %r11061; + shf.l.wrap.b32 %r11068, %r11067, %r11067, 25; + add.s32 %r11069, %r11021, %r10797; + add.s32 %r11070, %r11069, %r11012; + xor.b32 %r11071, %r11070, %r11037; + shf.l.wrap.b32 %r11072, %r11071, %r11071, 16; + add.s32 %r11073, %r11072, %r11052; + xor.b32 %r11074, %r11073, %r11012; + shf.l.wrap.b32 %r11075, %r11074, %r11074, 20; + add.s32 %r11076, %r11070, %r10811; + add.s32 %r11077, %r11076, %r11075; + xor.b32 %r11078, %r11077, %r11072; + shf.l.wrap.b32 %r11079, %r11078, %r11078, 24; + add.s32 %r11080, %r11079, %r11073; + xor.b32 %r11081, %r11080, %r11075; + shf.l.wrap.b32 %r11082, %r11081, %r11081, 25; + add.s32 %r11083, %r11035, %r10818; + add.s32 %r11084, %r11083, %r11026; + xor.b32 %r11085, %r11051, %r11084; + shf.l.wrap.b32 %r11086, %r11085, %r11085, 16; + add.s32 %r11087, %r11086, %r11010; + xor.b32 %r11088, %r11087, %r11026; + 
shf.l.wrap.b32 %r11089, %r11088, %r11088, 20; + add.s32 %r11090, %r11084, %r10741; + add.s32 %r11091, %r11090, %r11089; + xor.b32 %r11092, %r11091, %r11086; + shf.l.wrap.b32 %r11093, %r11092, %r11092, 24; + add.s32 %r11094, %r11093, %r11087; + xor.b32 %r11095, %r11094, %r11089; + shf.l.wrap.b32 %r11096, %r11095, %r11095, 25; + add.s32 %r11097, %r11040, %r10776; + add.s32 %r11098, %r11097, %r11049; + xor.b32 %r11099, %r11098, %r11009; + shf.l.wrap.b32 %r11100, %r11099, %r11099, 16; + add.s32 %r11101, %r11100, %r11024; + xor.b32 %r11102, %r11101, %r11040; + shf.l.wrap.b32 %r11103, %r11102, %r11102, 20; + add.s32 %r11104, %r11098, %r10825; + add.s32 %r11105, %r11104, %r11103; + xor.b32 %r11106, %r11105, %r11100; + shf.l.wrap.b32 %r11107, %r11106, %r11106, 24; + add.s32 %r11108, %r11107, %r11101; + xor.b32 %r11109, %r11108, %r11103; + shf.l.wrap.b32 %r11110, %r11109, %r11109, 25; + add.s32 %r11111, %r11082, %r10769; + add.s32 %r11112, %r11111, %r11063; + xor.b32 %r11113, %r11112, %r11107; + shf.l.wrap.b32 %r11114, %r11113, %r11113, 16; + add.s32 %r11115, %r11114, %r11094; + xor.b32 %r11116, %r11115, %r11082; + shf.l.wrap.b32 %r11117, %r11116, %r11116, 20; + add.s32 %r11118, %r11112, %r10762; + add.s32 %r11119, %r11118, %r11117; + xor.b32 %r11120, %r11119, %r11114; + shf.l.wrap.b32 %r11121, %r11120, %r11120, 24; + add.s32 %r11122, %r11121, %r11115; + xor.b32 %r11123, %r11122, %r11117; + shf.l.wrap.b32 %r11124, %r11123, %r11123, 25; + add.s32 %r11125, %r11077, %r10790; + add.s32 %r11126, %r11125, %r11096; + xor.b32 %r11127, %r11065, %r11126; + shf.l.wrap.b32 %r11128, %r11127, %r11127, 16; + add.s32 %r11129, %r11128, %r11108; + xor.b32 %r11130, %r11129, %r11096; + shf.l.wrap.b32 %r11131, %r11130, %r11130, 20; + add.s32 %r11132, %r11126, %r10727; + add.s32 %r11133, %r11132, %r11131; + xor.b32 %r11134, %r11133, %r11128; + shf.l.wrap.b32 %r11135, %r11134, %r11134, 24; + add.s32 %r11136, %r11135, %r11129; + xor.b32 %r11137, %r11136, %r11131; + shf.l.wrap.b32 %r11138, %r11137, %r11137, 25; + add.s32 %r11139, %r11091, %r10804; + add.s32 %r11140, %r11139, %r11110; + xor.b32 %r11141, %r11140, %r11079; + shf.l.wrap.b32 %r11142, %r11141, %r11141, 16; + add.s32 %r11143, %r11142, %r11066; + xor.b32 %r11144, %r11143, %r11110; + shf.l.wrap.b32 %r11145, %r11144, %r11144, 20; + add.s32 %r11146, %r11140, %r10832; + add.s32 %r11147, %r11146, %r11145; + xor.b32 %r11148, %r11147, %r11142; + shf.l.wrap.b32 %r11149, %r11148, %r11148, 24; + add.s32 %r11150, %r11149, %r11143; + xor.b32 %r11151, %r11150, %r11145; + shf.l.wrap.b32 %r11152, %r11151, %r11151, 25; + add.s32 %r11153, %r11105, %r10783; + add.s32 %r11154, %r11153, %r11068; + xor.b32 %r11155, %r11154, %r11093; + shf.l.wrap.b32 %r11156, %r11155, %r11155, 16; + add.s32 %r11157, %r11156, %r11080; + xor.b32 %r11158, %r11157, %r11068; + shf.l.wrap.b32 %r11159, %r11158, %r11158, 20; + add.s32 %r11160, %r11154, %r10734; + add.s32 %r11161, %r11160, %r11159; + xor.b32 %r11162, %r11161, %r11156; + shf.l.wrap.b32 %r11163, %r11162, %r11162, 24; + add.s32 %r11164, %r11163, %r11157; + xor.b32 %r11165, %r11164, %r11159; + shf.l.wrap.b32 %r11166, %r11165, %r11165, 25; + add.s32 %r11167, %r11119, %r10797; + add.s32 %r11168, %r11167, %r11166; + xor.b32 %r11169, %r11168, %r11135; + shf.l.wrap.b32 %r11170, %r11169, %r11169, 16; + add.s32 %r11171, %r11170, %r11150; + xor.b32 %r11172, %r11171, %r11166; + shf.l.wrap.b32 %r11173, %r11172, %r11172, 20; + add.s32 %r11174, %r11168, %r10776; + add.s32 %r11175, %r11174, %r11173; + xor.b32 %r11176, %r11175, %r11170; + shf.l.wrap.b32 %r11177, 
%r11176, %r11176, 24; + add.s32 %r11178, %r11177, %r11171; + xor.b32 %r11179, %r11178, %r11173; + shf.l.wrap.b32 %r11180, %r11179, %r11179, 25; + add.s32 %r11181, %r11133, %r10811; + add.s32 %r11182, %r11181, %r11124; + xor.b32 %r11183, %r11182, %r11149; + shf.l.wrap.b32 %r11184, %r11183, %r11183, 16; + add.s32 %r11185, %r11184, %r11164; + xor.b32 %r11186, %r11185, %r11124; + shf.l.wrap.b32 %r11187, %r11186, %r11186, 20; + add.s32 %r11188, %r11182, %r10790; + add.s32 %r11189, %r11188, %r11187; + xor.b32 %r11190, %r11189, %r11184; + shf.l.wrap.b32 %r11191, %r11190, %r11190, 24; + add.s32 %r11192, %r11191, %r11185; + xor.b32 %r11193, %r11192, %r11187; + shf.l.wrap.b32 %r11194, %r11193, %r11193, 25; + add.s32 %r11195, %r11147, %r10825; + add.s32 %r11196, %r11195, %r11138; + xor.b32 %r11197, %r11163, %r11196; + shf.l.wrap.b32 %r11198, %r11197, %r11197, 16; + add.s32 %r11199, %r11198, %r11122; + xor.b32 %r11200, %r11199, %r11138; + shf.l.wrap.b32 %r11201, %r11200, %r11200, 20; + add.s32 %r11202, %r11196, %r10748; + add.s32 %r11203, %r11202, %r11201; + xor.b32 %r11204, %r11203, %r11198; + shf.l.wrap.b32 %r11205, %r11204, %r11204, 24; + add.s32 %r11206, %r11205, %r11199; + xor.b32 %r11207, %r11206, %r11201; + shf.l.wrap.b32 %r11208, %r11207, %r11207, 25; + add.s32 %r11209, %r11152, %r10818; + add.s32 %r11210, %r11209, %r11161; + xor.b32 %r11211, %r11210, %r11121; + shf.l.wrap.b32 %r11212, %r11211, %r11211, 16; + add.s32 %r11213, %r11212, %r11136; + xor.b32 %r11214, %r11213, %r11152; + shf.l.wrap.b32 %r11215, %r11214, %r11214, 20; + add.s32 %r11216, %r11210, %r10832; + add.s32 %r11217, %r11216, %r11215; + xor.b32 %r11218, %r11217, %r11212; + shf.l.wrap.b32 %r11219, %r11218, %r11218, 24; + add.s32 %r11220, %r11219, %r11213; + xor.b32 %r11221, %r11220, %r11215; + shf.l.wrap.b32 %r11222, %r11221, %r11221, 25; + add.s32 %r11223, %r11194, %r10755; + add.s32 %r11224, %r11223, %r11175; + xor.b32 %r11225, %r11224, %r11219; + shf.l.wrap.b32 %r11226, %r11225, %r11225, 16; + add.s32 %r11227, %r11226, %r11206; + xor.b32 %r11228, %r11227, %r11194; + shf.l.wrap.b32 %r11229, %r11228, %r11228, 20; + add.s32 %r11230, %r11224, %r10727; + add.s32 %r11231, %r11230, %r11229; + xor.b32 %r11232, %r11231, %r11226; + shf.l.wrap.b32 %r11233, %r11232, %r11232, 24; + add.s32 %r11234, %r11233, %r11227; + xor.b32 %r11235, %r11234, %r11229; + shf.l.wrap.b32 %r11236, %r11235, %r11235, 25; + add.s32 %r11237, %r11189, %r10804; + add.s32 %r11238, %r11237, %r11208; + xor.b32 %r11239, %r11177, %r11238; + shf.l.wrap.b32 %r11240, %r11239, %r11239, 16; + add.s32 %r11241, %r11240, %r11220; + xor.b32 %r11242, %r11241, %r11208; + shf.l.wrap.b32 %r11243, %r11242, %r11242, 20; + add.s32 %r11244, %r11238, %r10741; + add.s32 %r11245, %r11244, %r11243; + xor.b32 %r11246, %r11245, %r11240; + shf.l.wrap.b32 %r11247, %r11246, %r11246, 24; + add.s32 %r11248, %r11247, %r11241; + xor.b32 %r11249, %r11248, %r11243; + shf.l.wrap.b32 %r11250, %r11249, %r11249, 25; + add.s32 %r11251, %r11203, %r10762; + add.s32 %r11252, %r11251, %r11222; + xor.b32 %r11253, %r11252, %r11191; + shf.l.wrap.b32 %r11254, %r11253, %r11253, 16; + add.s32 %r11255, %r11254, %r11178; + xor.b32 %r11256, %r11255, %r11222; + shf.l.wrap.b32 %r11257, %r11256, %r11256, 20; + add.s32 %r11258, %r11252, %r10783; + add.s32 %r11259, %r11258, %r11257; + xor.b32 %r11260, %r11259, %r11254; + shf.l.wrap.b32 %r11261, %r11260, %r11260, 24; + add.s32 %r11262, %r11261, %r11255; + xor.b32 %r11263, %r11262, %r11257; + shf.l.wrap.b32 %r11264, %r11263, %r11263, 25; + add.s32 %r11265, %r11217, %r10734; + 
add.s32 %r11266, %r11265, %r11180; + xor.b32 %r11267, %r11266, %r11205; + shf.l.wrap.b32 %r11268, %r11267, %r11267, 16; + add.s32 %r11269, %r11268, %r11192; + xor.b32 %r11270, %r11269, %r11180; + shf.l.wrap.b32 %r11271, %r11270, %r11270, 20; + add.s32 %r11272, %r11266, %r10769; + add.s32 %r11273, %r11272, %r11271; + xor.b32 %r11274, %r11273, %r11268; + shf.l.wrap.b32 %r11275, %r11274, %r11274, 24; + add.s32 %r11276, %r11275, %r11269; + xor.b32 %r11277, %r11276, %r11271; + shf.l.wrap.b32 %r11278, %r11277, %r11277, 25; + add.s32 %r11279, %r11231, %r10811; + add.s32 %r11280, %r11279, %r11278; + xor.b32 %r11281, %r11280, %r11247; + shf.l.wrap.b32 %r11282, %r11281, %r11281, 16; + add.s32 %r11283, %r11282, %r11262; + xor.b32 %r11284, %r11283, %r11278; + shf.l.wrap.b32 %r11285, %r11284, %r11284, 20; + add.s32 %r11286, %r11280, %r10818; + add.s32 %r11287, %r11286, %r11285; + xor.b32 %r11288, %r11287, %r11282; + shf.l.wrap.b32 %r11289, %r11288, %r11288, 24; + add.s32 %r11290, %r11289, %r11283; + xor.b32 %r11291, %r11290, %r11285; + shf.l.wrap.b32 %r11292, %r11291, %r11291, 25; + add.s32 %r11293, %r11245, %r10790; + add.s32 %r11294, %r11293, %r11236; + xor.b32 %r11295, %r11294, %r11261; + shf.l.wrap.b32 %r11296, %r11295, %r11295, 16; + add.s32 %r11297, %r11296, %r11276; + xor.b32 %r11298, %r11297, %r11236; + shf.l.wrap.b32 %r11299, %r11298, %r11298, 20; + add.s32 %r11300, %r11294, %r10804; + add.s32 %r11301, %r11300, %r11299; + xor.b32 %r11302, %r11301, %r11296; + shf.l.wrap.b32 %r11303, %r11302, %r11302, 24; + add.s32 %r11304, %r11303, %r11297; + xor.b32 %r11305, %r11304, %r11299; + shf.l.wrap.b32 %r11306, %r11305, %r11305, 25; + add.s32 %r11307, %r11259, %r10832; + add.s32 %r11308, %r11307, %r11250; + xor.b32 %r11309, %r11275, %r11308; + shf.l.wrap.b32 %r11310, %r11309, %r11309, 16; + add.s32 %r11311, %r11310, %r11234; + xor.b32 %r11312, %r11311, %r11250; + shf.l.wrap.b32 %r11313, %r11312, %r11312, 20; + add.s32 %r11314, %r11308, %r10797; + add.s32 %r11315, %r11314, %r11313; + xor.b32 %r11316, %r11315, %r11310; + shf.l.wrap.b32 %r11317, %r11316, %r11316, 24; + add.s32 %r11318, %r11317, %r11311; + xor.b32 %r11319, %r11318, %r11313; + shf.l.wrap.b32 %r11320, %r11319, %r11319, 25; + add.s32 %r11321, %r11264, %r10825; + add.s32 %r11322, %r11321, %r11273; + xor.b32 %r11323, %r11322, %r11233; + shf.l.wrap.b32 %r11324, %r11323, %r11323, 16; + add.s32 %r11325, %r11324, %r11248; + xor.b32 %r11326, %r11325, %r11264; + shf.l.wrap.b32 %r11327, %r11326, %r11326, 20; + add.s32 %r11328, %r11322, %r10783; + add.s32 %r11329, %r11328, %r11327; + xor.b32 %r11330, %r11329, %r11324; + shf.l.wrap.b32 %r11331, %r11330, %r11330, 24; + add.s32 %r11332, %r11331, %r11325; + xor.b32 %r11333, %r11332, %r11327; + shf.l.wrap.b32 %r11334, %r11333, %r11333, 25; + add.s32 %r11335, %r11306, %r10776; + add.s32 %r11336, %r11335, %r11287; + xor.b32 %r11337, %r11336, %r11331; + shf.l.wrap.b32 %r11338, %r11337, %r11337, 16; + add.s32 %r11339, %r11338, %r11318; + xor.b32 %r11340, %r11339, %r11306; + shf.l.wrap.b32 %r11341, %r11340, %r11340, 20; + add.s32 %r11342, %r11336, %r10741; + add.s32 %r11343, %r11342, %r11341; + xor.b32 %r11344, %r11343, %r11338; + shf.l.wrap.b32 %r11345, %r11344, %r11344, 24; + add.s32 %r11346, %r11345, %r11339; + xor.b32 %r11347, %r11346, %r11341; + shf.l.wrap.b32 %r11348, %r11347, %r11347, 25; + add.s32 %r11349, %r11301, %r10762; + add.s32 %r11350, %r11349, %r11320; + xor.b32 %r11351, %r11289, %r11350; + shf.l.wrap.b32 %r11352, %r11351, %r11351, 16; + add.s32 %r11353, %r11352, %r11332; + xor.b32 %r11354, 
%r11353, %r11320; + shf.l.wrap.b32 %r11355, %r11354, %r11354, 20; + add.s32 %r11356, %r11350, %r10748; + add.s32 %r11357, %r11356, %r11355; + xor.b32 %r11358, %r11357, %r11352; + shf.l.wrap.b32 %r11359, %r11358, %r11358, 24; + add.s32 %r11360, %r11359, %r11353; + xor.b32 %r11361, %r11360, %r11355; + shf.l.wrap.b32 %r11362, %r11361, %r11361, 25; + add.s32 %r11363, %r11315, %r10727; + add.s32 %r11364, %r11363, %r11334; + xor.b32 %r11365, %r11364, %r11303; + shf.l.wrap.b32 %r11366, %r11365, %r11365, 16; + add.s32 %r11367, %r11366, %r11290; + xor.b32 %r11368, %r11367, %r11334; + shf.l.wrap.b32 %r11369, %r11368, %r11368, 20; + add.s32 %r11370, %r11364, %r10734; + add.s32 %r11371, %r11370, %r11369; + xor.b32 %r11372, %r11371, %r11366; + shf.l.wrap.b32 %r11373, %r11372, %r11372, 24; + add.s32 %r11374, %r11373, %r11367; + xor.b32 %r11375, %r11374, %r11369; + shf.l.wrap.b32 %r11376, %r11375, %r11375, 25; + add.s32 %r11377, %r11329, %r10769; + add.s32 %r11378, %r11377, %r11292; + xor.b32 %r11379, %r11378, %r11317; + shf.l.wrap.b32 %r11380, %r11379, %r11379, 16; + add.s32 %r11381, %r11380, %r11304; + xor.b32 %r11382, %r11381, %r11292; + shf.l.wrap.b32 %r11383, %r11382, %r11382, 20; + add.s32 %r11384, %r11378, %r10755; + add.s32 %r11385, %r11384, %r11383; + xor.b32 %r11386, %r11385, %r11380; + shf.l.wrap.b32 %r11387, %r11386, %r11386, 24; + add.s32 %r11388, %r11387, %r11381; + xor.b32 %r11389, %r11388, %r11383; + shf.l.wrap.b32 %r11390, %r11389, %r11389, 25; + add.s32 %r11391, %r11343, %r10790; + add.s32 %r11392, %r11391, %r11390; + xor.b32 %r11393, %r11392, %r11359; + shf.l.wrap.b32 %r11394, %r11393, %r11393, 16; + add.s32 %r11395, %r11394, %r11374; + xor.b32 %r11396, %r11395, %r11390; + shf.l.wrap.b32 %r11397, %r11396, %r11396, 20; + add.s32 %r11398, %r11392, %r10825; + add.s32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r11399, %r11394; + shf.l.wrap.b32 %r11401, %r11400, %r11400, 24; + add.s32 %r11402, %r11401, %r11395; + xor.b32 %r11403, %r11402, %r11397; + shf.l.wrap.b32 %r11404, %r11403, %r11403, 25; + add.s32 %r11405, %r11357, %r10804; + add.s32 %r11406, %r11405, %r11348; + xor.b32 %r11407, %r11406, %r11373; + shf.l.wrap.b32 %r11408, %r11407, %r11407, 16; + add.s32 %r11409, %r11408, %r11388; + xor.b32 %r11410, %r11409, %r11348; + shf.l.wrap.b32 %r11411, %r11410, %r11410, 20; + add.s32 %r11412, %r11406, %r10762; + add.s32 %r11413, %r11412, %r11411; + xor.b32 %r11414, %r11413, %r11408; + shf.l.wrap.b32 %r11415, %r11414, %r11414, 24; + add.s32 %r11416, %r11415, %r11409; + xor.b32 %r11417, %r11416, %r11411; + shf.l.wrap.b32 %r11418, %r11417, %r11417, 25; + add.s32 %r11419, %r11371, %r10783; + add.s32 %r11420, %r11419, %r11362; + xor.b32 %r11421, %r11387, %r11420; + shf.l.wrap.b32 %r11422, %r11421, %r11421, 16; + add.s32 %r11423, %r11422, %r11346; + xor.b32 %r11424, %r11423, %r11362; + shf.l.wrap.b32 %r11425, %r11424, %r11424, 20; + add.s32 %r11426, %r11420, %r10811; + add.s32 %r11427, %r11426, %r11425; + xor.b32 %r11428, %r11427, %r11422; + shf.l.wrap.b32 %r11429, %r11428, %r11428, 24; + add.s32 %r11430, %r11429, %r11423; + xor.b32 %r11431, %r11430, %r11425; + shf.l.wrap.b32 %r11432, %r11431, %r11431, 25; + add.s32 %r11433, %r11376, %r10832; + add.s32 %r11434, %r11433, %r11385; + xor.b32 %r11435, %r11434, %r11345; + shf.l.wrap.b32 %r11436, %r11435, %r11435, 16; + add.s32 %r11437, %r11436, %r11360; + xor.b32 %r11438, %r11437, %r11376; + shf.l.wrap.b32 %r11439, %r11438, %r11438, 20; + add.s32 %r11440, %r11434, %r10734; + add.s32 %r11441, %r11440, %r11439; + xor.b32 %r11442, %r11441, %r11436; + 
shf.l.wrap.b32 %r11443, %r11442, %r11442, 24; + add.s32 %r11444, %r11443, %r11437; + xor.b32 %r11445, %r11444, %r11439; + shf.l.wrap.b32 %r11446, %r11445, %r11445, 25; + add.s32 %r11447, %r11418, %r10818; + add.s32 %r11448, %r11447, %r11399; + xor.b32 %r11449, %r11448, %r11443; + shf.l.wrap.b32 %r11450, %r11449, %r11449, 16; + add.s32 %r11451, %r11450, %r11430; + xor.b32 %r11452, %r11451, %r11418; + shf.l.wrap.b32 %r11453, %r11452, %r11452, 20; + add.s32 %r11454, %r11448, %r10748; + add.s32 %r11455, %r11454, %r11453; + xor.b32 %r11456, %r11455, %r11450; + shf.l.wrap.b32 %r11457, %r11456, %r11456, 24; + add.s32 %r11458, %r11457, %r11451; + xor.b32 %r11459, %r11458, %r11453; + shf.l.wrap.b32 %r11460, %r11459, %r11459, 25; + add.s32 %r11461, %r11413, %r10727; + add.s32 %r11462, %r11461, %r11432; + xor.b32 %r11463, %r11401, %r11462; + shf.l.wrap.b32 %r11464, %r11463, %r11463, 16; + add.s32 %r11465, %r11464, %r11444; + xor.b32 %r11466, %r11465, %r11432; + shf.l.wrap.b32 %r11467, %r11466, %r11466, 20; + add.s32 %r11468, %r11462, %r10797; + add.s32 %r11469, %r11468, %r11467; + xor.b32 %r11470, %r11469, %r11464; + shf.l.wrap.b32 %r11471, %r11470, %r11470, 24; + add.s32 %r11472, %r11471, %r11465; + xor.b32 %r11473, %r11472, %r11467; + shf.l.wrap.b32 %r11474, %r11473, %r11473, 25; + add.s32 %r11475, %r11427, %r10741; + add.s32 %r11476, %r11475, %r11446; + xor.b32 %r11477, %r11476, %r11415; + shf.l.wrap.b32 %r11478, %r11477, %r11477, 16; + add.s32 %r11479, %r11478, %r11402; + xor.b32 %r11480, %r11479, %r11446; + shf.l.wrap.b32 %r11481, %r11480, %r11480, 20; + add.s32 %r11482, %r11476, %r10769; + add.s32 %r11483, %r11482, %r11481; + xor.b32 %r11484, %r11483, %r11478; + shf.l.wrap.b32 %r11485, %r11484, %r11484, 24; + add.s32 %r11486, %r11485, %r11479; + xor.b32 %r11487, %r11486, %r11481; + shf.l.wrap.b32 %r11488, %r11487, %r11487, 25; + add.s32 %r11489, %r11441, %r10755; + add.s32 %r11490, %r11489, %r11404; + xor.b32 %r11491, %r11490, %r11429; + shf.l.wrap.b32 %r11492, %r11491, %r11491, 16; + add.s32 %r11493, %r11492, %r11416; + xor.b32 %r11494, %r11493, %r11404; + shf.l.wrap.b32 %r11495, %r11494, %r11494, 20; + add.s32 %r11496, %r11490, %r10776; + add.s32 %r11497, %r11496, %r11495; + xor.b32 %r11498, %r11497, %r11492; + shf.l.wrap.b32 %r11499, %r11498, %r11498, 24; + add.s32 %r11500, %r11499, %r11493; + xor.b32 %r11501, %r11500, %r11495; + shf.l.wrap.b32 %r11502, %r11501, %r11501, 25; + add.s32 %r11503, %r11455, %r10804; + add.s32 %r11504, %r11503, %r11502; + xor.b32 %r11505, %r11504, %r11471; + shf.l.wrap.b32 %r11506, %r11505, %r11505, 16; + add.s32 %r11507, %r11506, %r11486; + xor.b32 %r11508, %r11507, %r11502; + shf.l.wrap.b32 %r11509, %r11508, %r11508, 20; + add.s32 %r11510, %r11504, %r10832; + add.s32 %r11511, %r11510, %r11509; + xor.b32 %r11512, %r11511, %r11506; + shf.l.wrap.b32 %r11513, %r11512, %r11512, 24; + add.s32 %r11514, %r11513, %r11507; + xor.b32 %r11515, %r11514, %r11509; + shf.l.wrap.b32 %r11516, %r11515, %r11515, 25; + add.s32 %r11517, %r11469, %r10762; + add.s32 %r11518, %r11517, %r11460; + xor.b32 %r11519, %r11518, %r11485; + shf.l.wrap.b32 %r11520, %r11519, %r11519, 16; + add.s32 %r11521, %r11520, %r11500; + xor.b32 %r11522, %r11521, %r11460; + shf.l.wrap.b32 %r11523, %r11522, %r11522, 20; + add.s32 %r11524, %r11518, %r10727; + add.s32 %r11525, %r11524, %r11523; + xor.b32 %r11526, %r11525, %r11520; + shf.l.wrap.b32 %r11527, %r11526, %r11526, 24; + add.s32 %r11528, %r11527, %r11521; + xor.b32 %r11529, %r11528, %r11523; + shf.l.wrap.b32 %r11530, %r11529, %r11529, 25; + add.s32 
%r11531, %r11483, %r10734; + add.s32 %r11532, %r11531, %r11474; + xor.b32 %r11533, %r11499, %r11532; + shf.l.wrap.b32 %r11534, %r11533, %r11533, 16; + add.s32 %r11535, %r11534, %r11458; + xor.b32 %r11536, %r11535, %r11474; + shf.l.wrap.b32 %r11537, %r11536, %r11536, 20; + add.s32 %r11538, %r11532, %r10790; + add.s32 %r11539, %r11538, %r11537; + xor.b32 %r11540, %r11539, %r11534; + shf.l.wrap.b32 %r11541, %r11540, %r11540, 24; + add.s32 %r11542, %r11541, %r11535; + xor.b32 %r11543, %r11542, %r11537; + shf.l.wrap.b32 %r11544, %r11543, %r11543, 25; + add.s32 %r11545, %r11488, %r10783; + add.s32 %r11546, %r11545, %r11497; + xor.b32 %r11547, %r11546, %r11457; + shf.l.wrap.b32 %r11548, %r11547, %r11547, 16; + add.s32 %r11549, %r11548, %r11472; + xor.b32 %r11550, %r11549, %r11488; + shf.l.wrap.b32 %r11551, %r11550, %r11550, 20; + add.s32 %r11552, %r11546, %r10769; + add.s32 %r11553, %r11552, %r11551; + xor.b32 %r11554, %r11553, %r11548; + shf.l.wrap.b32 %r11555, %r11554, %r11554, 24; + add.s32 %r11556, %r11555, %r11549; + xor.b32 %r11557, %r11556, %r11551; + shf.l.wrap.b32 %r11558, %r11557, %r11557, 25; + add.s32 %r11559, %r11530, %r10825; + add.s32 %r11560, %r11559, %r11511; + xor.b32 %r11561, %r11560, %r11555; + shf.l.wrap.b32 %r11562, %r11561, %r11561, 16; + add.s32 %r11563, %r11562, %r11542; + xor.b32 %r11564, %r11563, %r11530; + shf.l.wrap.b32 %r11565, %r11564, %r11564, 20; + add.s32 %r11566, %r11560, %r10797; + add.s32 %r11567, %r11566, %r11565; + xor.b32 %r11568, %r11567, %r11562; + shf.l.wrap.b32 %r11569, %r11568, %r11568, 24; + add.s32 %r11570, %r11569, %r11563; + xor.b32 %r11571, %r11570, %r11565; + shf.l.wrap.b32 %r11572, %r11571, %r11571, 25; + add.s32 %r11573, %r11525, %r10741; + add.s32 %r11574, %r11573, %r11544; + xor.b32 %r11575, %r11513, %r11574; + shf.l.wrap.b32 %r11576, %r11575, %r11575, 16; + add.s32 %r11577, %r11576, %r11556; + xor.b32 %r11578, %r11577, %r11544; + shf.l.wrap.b32 %r11579, %r11578, %r11578, 20; + add.s32 %r11580, %r11574, %r10811; + add.s32 %r11581, %r11580, %r11579; + xor.b32 %r11582, %r11581, %r11576; + shf.l.wrap.b32 %r11583, %r11582, %r11582, 24; + add.s32 %r11584, %r11583, %r11577; + xor.b32 %r11585, %r11584, %r11579; + shf.l.wrap.b32 %r11586, %r11585, %r11585, 25; + add.s32 %r11587, %r11539, %r10748; + add.s32 %r11588, %r11587, %r11558; + xor.b32 %r11589, %r11588, %r11527; + shf.l.wrap.b32 %r11590, %r11589, %r11589, 16; + add.s32 %r11591, %r11590, %r11514; + xor.b32 %r11592, %r11591, %r11558; + shf.l.wrap.b32 %r11593, %r11592, %r11592, 20; + add.s32 %r11594, %r11588, %r10755; + add.s32 %r11595, %r11594, %r11593; + xor.b32 %r11596, %r11595, %r11590; + shf.l.wrap.b32 %r11597, %r11596, %r11596, 24; + add.s32 %r11598, %r11597, %r11591; + xor.b32 %r11599, %r11598, %r11593; + shf.l.wrap.b32 %r11600, %r11599, %r11599, 25; + add.s32 %r11601, %r11553, %r10776; + add.s32 %r11602, %r11601, %r11516; + xor.b32 %r11603, %r11602, %r11541; + shf.l.wrap.b32 %r11604, %r11603, %r11603, 16; + add.s32 %r11605, %r11604, %r11528; + xor.b32 %r11606, %r11605, %r11516; + shf.l.wrap.b32 %r11607, %r11606, %r11606, 20; + add.s32 %r11608, %r11602, %r10818; + add.s32 %r11609, %r11608, %r11607; + xor.b32 %r11610, %r11609, %r11604; + shf.l.wrap.b32 %r11611, %r11610, %r11610, 24; + add.s32 %r11612, %r11611, %r11605; + xor.b32 %r11613, %r11612, %r11607; + shf.l.wrap.b32 %r11614, %r11613, %r11613, 25; + xor.b32 %r11615, %r11598, %r11567; + xor.b32 %r11616, %r11612, %r11581; + xor.b32 %r11617, %r11570, %r11595; + xor.b32 %r11618, %r11609, %r11584; + xor.b32 %r11619, %r11614, %r11583; + 
+ xor.b32 %r11620, %r11572, %r11597;
+ xor.b32 %r11621, %r11611, %r11586;
+ xor.b32 %r11622, %r11600, %r11569;
+ st.local.u8 [%rd220+145], %r11615;
+ shr.u32 %r11623, %r11615, 8;
+ st.local.u8 [%rd220+146], %r11623;
+ shr.u32 %r11624, %r11615, 16;
+ st.local.u8 [%rd220+147], %r11624;
+ shr.u32 %r11625, %r11615, 24;
+ st.local.u8 [%rd220+148], %r11625;
+ st.local.u8 [%rd220+149], %r11616;
+ shr.u32 %r11626, %r11616, 8;
+ st.local.u8 [%rd220+150], %r11626;
+ shr.u32 %r11627, %r11616, 16;
+ st.local.u8 [%rd220+151], %r11627;
+ shr.u32 %r11628, %r11616, 24;
+ st.local.u8 [%rd220+152], %r11628;
+ st.local.u8 [%rd220+153], %r11617;
+ shr.u32 %r11629, %r11617, 8;
+ st.local.u8 [%rd220+154], %r11629;
+ shr.u32 %r11630, %r11617, 16;
+ st.local.u8 [%rd220+155], %r11630;
+ shr.u32 %r11631, %r11617, 24;
+ st.local.u8 [%rd220+156], %r11631;
+ st.local.u8 [%rd220+157], %r11618;
+ shr.u32 %r11632, %r11618, 8;
+ st.local.u8 [%rd220+158], %r11632;
+ shr.u32 %r11633, %r11618, 16;
+ st.local.u8 [%rd220+159], %r11633;
+ shr.u32 %r11634, %r11618, 24;
+ st.local.u8 [%rd220+160], %r11634;
+ st.local.u8 [%rd220+161], %r11619;
+ shr.u32 %r11635, %r11619, 8;
+ st.local.u8 [%rd220+162], %r11635;
+ shr.u32 %r11636, %r11619, 16;
+ st.local.u8 [%rd220+163], %r11636;
+ shr.u32 %r11637, %r11619, 24;
+ st.local.u8 [%rd220+164], %r11637;
+ st.local.u8 [%rd220+165], %r11620;
+ shr.u32 %r11638, %r11620, 8;
+ st.local.u8 [%rd220+166], %r11638;
+ shr.u32 %r11639, %r11620, 16;
+ st.local.u8 [%rd220+167], %r11639;
+ shr.u32 %r11640, %r11620, 24;
+ st.local.u8 [%rd220+168], %r11640;
+ st.local.u8 [%rd220+169], %r11621;
+ shr.u32 %r11641, %r11621, 8;
+ st.local.u8 [%rd220+170], %r11641;
+ shr.u32 %r11642, %r11621, 16;
+ st.local.u8 [%rd220+171], %r11642;
+ shr.u32 %r11643, %r11621, 24;
+ st.local.u8 [%rd220+172], %r11643;
+ st.local.u8 [%rd220+173], %r11622;
+ shr.u32 %r11644, %r11622, 8;
+ st.local.u8 [%rd220+174], %r11644;
+ shr.u32 %r11645, %r11622, 16;
+ st.local.u8 [%rd220+175], %r11645;
+ shr.u32 %r11646, %r11622, 24;
+ st.local.u8 [%rd220+176], %r11646;
+ add.s16 %rs392, %rs392, -1;
+ cvt.u64.u16 %rd221, %rs392;
+ and.b64 %rd222, %rd221, 255;
+ setp.lt.u64 %p53, %rd97, %rd222;
+ @%p53 bra $L__BB1_66;
+
+ ld.param.u64 %rd233, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0];
+ cvta.to.local.u64 %rd232, %rd233;
+ add.s64 %rd231, %rd232, 136;
+ st.local.u8 [%rd231+8], %rs392;
+
+$L__BB1_68:
+ ret;
+
+}
+ // .globl heavy_hash
+.visible .entry heavy_hash(
+ .param .u64 heavy_hash_param_0,
+ .param .u64 heavy_hash_param_1,
+ .param .u64 heavy_hash_param_2,
+ .param .u8 heavy_hash_param_3,
+ .param .u64 heavy_hash_param_4,
+ .param .u64 heavy_hash_param_5,
+ .param .u64 heavy_hash_param_6,
+ .param .u64 heavy_hash_param_7
+)
+{
+ .local .align 16 .b8 __local_depot2[2080];
+ .reg .b64 %SP;
+ .reg .b64 %SPL;
+ .reg .pred %p<60>;
+ .reg .b16 %rs<864>;
+ .reg .b32 %r<31266>;
+ .reg .b64 %rd<1373>;
+
+
+ mov.u64 %SPL, __local_depot2;
+ cvta.local.u64 %SP, %SPL;
+ ld.param.u8 %rs409, [heavy_hash_param_3];
+ ld.param.u64 %rd357, [heavy_hash_param_0];
+ ld.param.u64 %rd358, [heavy_hash_param_1];
+ ld.param.u64 %rd362, [heavy_hash_param_2];
+ ld.param.u64 %rd363, [heavy_hash_param_4];
+ ld.param.u64 %rd359, [heavy_hash_param_5];
+ ld.param.u64 %rd360, [heavy_hash_param_6];
+ ld.param.u64 %rd361, [heavy_hash_param_7];
+ cvta.to.global.u64 %rd1, %rd363;
+ add.u64 %rd2, %SPL, 0;
+ add.u64 %rd3, %SPL, 2000;
+ mov.u32 %r5040, %ntid.x;
+ mov.u32 %r5041, %ctaid.x;
+ mov.u32 %r5042, %tid.x;
+ mad.lo.s32 %r5043, %r5041, %r5040, %r5042;
+ cvt.s64.s32 %rd4, %r5043;
+ setp.ge.u64 %p6, %rd4, %rd362;
+ @%p6 bra $L__BB2_105;
+
+ cvt.u32.u64 %r5044, %rd4;
+ setp.ne.s32 %p7, %r5044, 0;
+ @%p7 bra $L__BB2_3;
+
+ cvta.to.global.u64 %rd366, %rd359;
+ mov.u64 %rd367, 0;
+ st.global.u64 [%rd366], %rd367;
+
+$L__BB2_3:
+ setp.eq.s16 %p8, %rs409, 0;
+ @%p8 bra $L__BB2_5;
+
+ shl.b64 %rd368, %rd4, 5;
+ add.s64 %rd369, %rd1, %rd368;
+ ld.global.v2.u64 {%rd370, %rd371}, [%rd369];
+ mul.lo.s64 %rd374, %rd371, 5;
+ {
+ .reg .b64 %lhs;
+ .reg .b64 %rhs;
+ shl.b64 %lhs, %rd374, 7;
+ shr.b64 %rhs, %rd374, 57;
+ add.u64 %rd375, %lhs, %rhs;
+ }
+ mul.lo.s64 %rd1299, %rd375, 9;
+ shl.b64 %rd376, %rd371, 17;
+ ld.global.v2.u64 {%rd377, %rd378}, [%rd369+16];
+ xor.b64 %rd381, %rd377, %rd370;
+ xor.b64 %rd382, %rd378, %rd371;
+ xor.b64 %rd383, %rd371, %rd381;
+ xor.b64 %rd384, %rd370, %rd382;
+ st.global.v2.u64 [%rd369], {%rd384, %rd383};
+ {
+ .reg .b32 %dummy;
+ mov.b64 {%r5045,%dummy}, %rd382;
+ }
+ {
+ .reg .b32 %dummy;
+ mov.b64 {%dummy,%r5046}, %rd382;
+ }
+ shf.r.wrap.b32 %r5047, %r5046, %r5045, 19;
+ shf.r.wrap.b32 %r5048, %r5045, %r5046, 19;
+ mov.b64 %rd385, {%r5048, %r5047};
+ xor.b64 %rd386, %rd381, %rd376;
+ st.global.v2.u64 [%rd369+16], {%rd386, %rd385};
+ bra.uni $L__BB2_6;
+
+$L__BB2_5:
+ ld.global.u64 %rd387, [%rd1];
+ xor.b64 %rd1299, %rd387, %rd4;
+
+$L__BB2_6:
+ and.b64 %rd389, %rd1299, %rd357;
+ or.b64 %rd8, %rd389, %rd358;
+ mov.u64 %rd1300, 0;
+ mov.u32 %r29818, 0;
+ mov.u64 %rd390, hash_header;
+
+$L__BB2_7:
+ add.s64 %rd391, %rd390, %rd1300;
+ ld.const.u8 %rs410, [%rd391];
+ add.s64 %rd392, %rd3, %rd1300;
+ st.local.u8 [%rd392], %rs410;
+ add.s64 %rd1300, %rd1300, 1;
+ add.s32 %r29818, %r29818, 1;
+ setp.lt.u32 %p9, %r29818, 72;
+ @%p9 bra $L__BB2_7;
+
+ ld.local.v4.u32 {%r5050, %r5051, %r5052, %r5053}, [%rd3];
+ mov.u64 %rd393, 0;
+ ld.local.v4.u32 {%r5054, %r5055, %r5056, %r5057}, [%rd3+16];
+ ld.local.v4.u32 {%r5058, %r5059, %r5060, %r5061}, [%rd3+32];
+ ld.local.v4.u32 {%r5062, %r5063, %r5064, %r5065}, [%rd3+48];
+ st.local.u64 [%rd3+72], %rd8;
+ mov.u32 %r5066, -1150833019;
+ mov.u32 %r5067, 1779033703;
+ st.local.v2.u32 [%rd2], {%r5067, %r5066};
+ mov.u32 %r5068, -1521486534;
+ mov.u32 %r5069, 1013904242;
+ st.local.v2.u32 [%rd2+8], {%r5069, %r5068};
+ mov.u32 %r5070, -1694144372;
+ mov.u32 %r5071, 1359893119;
+ st.local.v2.u32 [%rd2+16], {%r5071, %r5070};
+ mov.u32 %r5072, 1541459225;
+ mov.u32 %r5073, 528734635;
+ st.local.v2.u32 [%rd2+24], {%r5073, %r5072};
+ st.local.u64 [%rd2+64], %rd393;
+ mov.u32 %r5074, 0;
+ st.local.v2.u32 [%rd2+72], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+80], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+88], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+96], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+104], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+112], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+120], {%r5074, %r5074};
+ st.local.v2.u32 [%rd2+128], {%r5074, %r5074};
+ mov.u16 %rs411, 0;
+ st.local.v2.u8 [%rd2+136], {%rs411, %rs411};
+ st.local.u8 [%rd2+138], %rs411;
+ st.local.v2.u32 [%rd2+32], {%r5067, %r5066};
+ st.local.v2.u32 [%rd2+40], {%r5069, %r5068};
+ st.local.v2.u32 [%rd2+48], {%r5071, %r5070};
+ st.local.v2.u32 [%rd2+56], {%r5073, %r5072};
+ st.local.u8 [%rd2+144], %rs411;
+ ld.local.v4.u8 {%rs412, %rs413, %rs414, %rs415}, [%rd2+136];
+ setp.eq.s16 %p10, %rs413, 0;
+ selp.u16 %rs419, 1, 0, %p10;
+ or.b16 %rs420, %rs414, %rs419;
+ mov.b32 {%rs421, %rs422}, %r5050;
+ shr.u16 %rs423, %rs421, 8;
+ shr.u16 %rs424, %rs422, 8;
+ mov.b32 {%rs425, %rs426}, %r5051;
+ shr.u16 %rs427, %rs425, 8;
+
shr.u16 %rs428, %rs426, 8; + mov.b32 {%rs429, %rs430}, %r5052; + shr.u16 %rs431, %rs429, 8; + shr.u16 %rs432, %rs430, 8; + mov.b32 {%rs433, %rs434}, %r5053; + shr.u16 %rs435, %rs433, 8; + shr.u16 %rs436, %rs434, 8; + cvt.u32.u16 %r5079, %rs421; + and.b32 %r5080, %r5079, 255; + cvt.u32.u16 %r5081, %rs423; + prmt.b32 %r5082, %r5081, %r5080, 30212; + cvt.u32.u16 %r5083, %rs422; + prmt.b32 %r5084, %r5083, %r5082, 28756; + cvt.u32.u16 %r5085, %rs424; + prmt.b32 %r5086, %r5085, %r5084, 1620; + cvt.u32.u16 %r5087, %rs425; + and.b32 %r5088, %r5087, 255; + cvt.u32.u16 %r5089, %rs427; + prmt.b32 %r5090, %r5089, %r5088, 30212; + cvt.u32.u16 %r5091, %rs426; + prmt.b32 %r5092, %r5091, %r5090, 28756; + cvt.u32.u16 %r5093, %rs428; + prmt.b32 %r5094, %r5093, %r5092, 1620; + cvt.u32.u16 %r5095, %rs429; + and.b32 %r5096, %r5095, 255; + cvt.u32.u16 %r5097, %rs431; + prmt.b32 %r5098, %r5097, %r5096, 30212; + cvt.u32.u16 %r5099, %rs430; + prmt.b32 %r5100, %r5099, %r5098, 28756; + cvt.u32.u16 %r5101, %rs432; + prmt.b32 %r5102, %r5101, %r5100, 1620; + cvt.u32.u16 %r5103, %rs433; + and.b32 %r5104, %r5103, 255; + cvt.u32.u16 %r5105, %rs435; + prmt.b32 %r5106, %r5105, %r5104, 30212; + cvt.u32.u16 %r5107, %rs434; + prmt.b32 %r5108, %r5107, %r5106, 28756; + cvt.u32.u16 %r5109, %rs436; + prmt.b32 %r5110, %r5109, %r5108, 1620; + mov.b32 {%rs437, %rs438}, %r5054; + shr.u16 %rs439, %rs437, 8; + shr.u16 %rs440, %rs438, 8; + mov.b32 {%rs441, %rs442}, %r5055; + shr.u16 %rs443, %rs441, 8; + shr.u16 %rs444, %rs442, 8; + mov.b32 {%rs445, %rs446}, %r5056; + shr.u16 %rs447, %rs445, 8; + shr.u16 %rs448, %rs446, 8; + mov.b32 {%rs449, %rs450}, %r5057; + shr.u16 %rs451, %rs449, 8; + shr.u16 %rs452, %rs450, 8; + cvt.u32.u16 %r5115, %rs437; + and.b32 %r5116, %r5115, 255; + cvt.u32.u16 %r5117, %rs439; + prmt.b32 %r5118, %r5117, %r5116, 30212; + cvt.u32.u16 %r5119, %rs438; + prmt.b32 %r5120, %r5119, %r5118, 28756; + cvt.u32.u16 %r5121, %rs440; + prmt.b32 %r5122, %r5121, %r5120, 1620; + cvt.u32.u16 %r5123, %rs441; + and.b32 %r5124, %r5123, 255; + cvt.u32.u16 %r5125, %rs443; + prmt.b32 %r5126, %r5125, %r5124, 30212; + cvt.u32.u16 %r5127, %rs442; + prmt.b32 %r5128, %r5127, %r5126, 28756; + cvt.u32.u16 %r5129, %rs444; + prmt.b32 %r5130, %r5129, %r5128, 1620; + cvt.u32.u16 %r5131, %rs445; + and.b32 %r5132, %r5131, 255; + cvt.u32.u16 %r5133, %rs447; + prmt.b32 %r5134, %r5133, %r5132, 30212; + cvt.u32.u16 %r5135, %rs446; + prmt.b32 %r5136, %r5135, %r5134, 28756; + cvt.u32.u16 %r5137, %rs448; + prmt.b32 %r5138, %r5137, %r5136, 1620; + cvt.u32.u16 %r5139, %rs449; + and.b32 %r5140, %r5139, 255; + cvt.u32.u16 %r5141, %rs451; + prmt.b32 %r5142, %r5141, %r5140, 30212; + cvt.u32.u16 %r5143, %rs450; + prmt.b32 %r5144, %r5143, %r5142, 28756; + cvt.u32.u16 %r5145, %rs452; + prmt.b32 %r5146, %r5145, %r5144, 1620; + mov.b32 {%rs453, %rs454}, %r5058; + shr.u16 %rs455, %rs453, 8; + shr.u16 %rs456, %rs454, 8; + mov.b32 {%rs457, %rs458}, %r5059; + shr.u16 %rs459, %rs457, 8; + shr.u16 %rs460, %rs458, 8; + mov.b32 {%rs461, %rs462}, %r5060; + shr.u16 %rs463, %rs461, 8; + shr.u16 %rs464, %rs462, 8; + mov.b32 {%rs465, %rs466}, %r5061; + shr.u16 %rs467, %rs465, 8; + shr.u16 %rs468, %rs466, 8; + cvt.u32.u16 %r5151, %rs453; + and.b32 %r5152, %r5151, 255; + cvt.u32.u16 %r5153, %rs455; + prmt.b32 %r5154, %r5153, %r5152, 30212; + cvt.u32.u16 %r5155, %rs454; + prmt.b32 %r5156, %r5155, %r5154, 28756; + cvt.u32.u16 %r5157, %rs456; + prmt.b32 %r5158, %r5157, %r5156, 1620; + cvt.u32.u16 %r5159, %rs457; + and.b32 %r5160, %r5159, 255; + cvt.u32.u16 %r5161, %rs459; + prmt.b32 
%r5162, %r5161, %r5160, 30212; + cvt.u32.u16 %r5163, %rs458; + prmt.b32 %r5164, %r5163, %r5162, 28756; + cvt.u32.u16 %r5165, %rs460; + prmt.b32 %r5166, %r5165, %r5164, 1620; + cvt.u32.u16 %r5167, %rs461; + and.b32 %r5168, %r5167, 255; + cvt.u32.u16 %r5169, %rs463; + prmt.b32 %r5170, %r5169, %r5168, 30212; + cvt.u32.u16 %r5171, %rs462; + prmt.b32 %r5172, %r5171, %r5170, 28756; + cvt.u32.u16 %r5173, %rs464; + prmt.b32 %r5174, %r5173, %r5172, 1620; + cvt.u32.u16 %r5175, %rs465; + and.b32 %r5176, %r5175, 255; + cvt.u32.u16 %r5177, %rs467; + prmt.b32 %r5178, %r5177, %r5176, 30212; + cvt.u32.u16 %r5179, %rs466; + prmt.b32 %r5180, %r5179, %r5178, 28756; + cvt.u32.u16 %r5181, %rs468; + prmt.b32 %r5182, %r5181, %r5180, 1620; + mov.b32 {%rs469, %rs470}, %r5062; + shr.u16 %rs471, %rs469, 8; + shr.u16 %rs472, %rs470, 8; + mov.b32 {%rs473, %rs474}, %r5063; + shr.u16 %rs475, %rs473, 8; + shr.u16 %rs476, %rs474, 8; + mov.b32 {%rs477, %rs478}, %r5064; + shr.u16 %rs479, %rs477, 8; + shr.u16 %rs480, %rs478, 8; + mov.b32 {%rs481, %rs482}, %r5065; + shr.u16 %rs483, %rs481, 8; + shr.u16 %rs484, %rs482, 8; + cvt.u32.u16 %r5187, %rs469; + and.b32 %r5188, %r5187, 255; + cvt.u32.u16 %r5189, %rs471; + prmt.b32 %r5190, %r5189, %r5188, 30212; + cvt.u32.u16 %r5191, %rs470; + prmt.b32 %r5192, %r5191, %r5190, 28756; + cvt.u32.u16 %r5193, %rs472; + prmt.b32 %r5194, %r5193, %r5192, 1620; + cvt.u32.u16 %r5195, %rs473; + and.b32 %r5196, %r5195, 255; + cvt.u32.u16 %r5197, %rs475; + prmt.b32 %r5198, %r5197, %r5196, 30212; + cvt.u32.u16 %r5199, %rs474; + prmt.b32 %r5200, %r5199, %r5198, 28756; + cvt.u32.u16 %r5201, %rs476; + prmt.b32 %r5202, %r5201, %r5200, 1620; + cvt.u32.u16 %r5203, %rs477; + and.b32 %r5204, %r5203, 255; + cvt.u32.u16 %r5205, %rs479; + prmt.b32 %r5206, %r5205, %r5204, 30212; + cvt.u32.u16 %r5207, %rs478; + prmt.b32 %r5208, %r5207, %r5206, 28756; + cvt.u32.u16 %r5209, %rs480; + prmt.b32 %r5210, %r5209, %r5208, 1620; + cvt.u32.u16 %r5211, %rs481; + and.b32 %r5212, %r5211, 255; + cvt.u32.u16 %r5213, %rs483; + prmt.b32 %r5214, %r5213, %r5212, 30212; + cvt.u32.u16 %r5215, %rs482; + prmt.b32 %r5216, %r5215, %r5214, 28756; + cvt.u32.u16 %r5217, %rs484; + prmt.b32 %r5218, %r5217, %r5216, 1620; + cvt.u32.u16 %r5219, %rs420; + and.b32 %r5220, %r5219, 255; + add.s32 %r5221, %r5086, -1156040474; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 16; + add.s32 %r5223, %r5222, 1779033703; + xor.b32 %r5224, %r5223, 1359893119; + shf.l.wrap.b32 %r5225, %r5224, %r5224, 20; + add.s32 %r5226, %r5094, %r5221; + add.s32 %r5227, %r5226, %r5225; + xor.b32 %r5228, %r5227, %r5222; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 24; + add.s32 %r5230, %r5229, %r5223; + xor.b32 %r5231, %r5230, %r5225; + shf.l.wrap.b32 %r5232, %r5231, %r5231, 25; + add.s32 %r5233, %r5102, 1449989905; + shf.l.wrap.b32 %r5234, %r5233, %r5233, 16; + add.s32 %r5235, %r5234, -1150833019; + xor.b32 %r5236, %r5235, -1694144372; + shf.l.wrap.b32 %r5237, %r5236, %r5236, 20; + add.s32 %r5238, %r5110, %r5233; + add.s32 %r5239, %r5238, %r5237; + xor.b32 %r5240, %r5239, %r5234; + shf.l.wrap.b32 %r5241, %r5240, %r5240, 24; + add.s32 %r5242, %r5241, %r5235; + xor.b32 %r5243, %r5242, %r5237; + shf.l.wrap.b32 %r5244, %r5243, %r5243, 25; + add.s32 %r5245, %r5122, 1542638877; + shr.u32 %r5246, %r5245, 16; + shl.b32 %r5247, %r5245, 16; + xor.b32 %r5248, %r5247, 4194304; + or.b32 %r5249, %r5248, %r5246; + add.s32 %r5250, %r5249, 1013904242; + xor.b32 %r5251, %r5250, 528734635; + shf.l.wrap.b32 %r5252, %r5251, %r5251, 20; + add.s32 %r5253, %r5130, %r5245; + add.s32 %r5254, %r5253, %r5252; + 
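+ // [reviewer annotation -- assumption] The add/xor/shf.l.wrap chains that
+ // begin here are BLAKE3's G function, fully inlined; left-rotates by
+ // 16/20/24/25 are right-rotates by 16/12/8/7. One G call in pseudocode:
+ //   a += b + mx;  d = rotr32(d ^ a, 16);
+ //   c += d;       b = rotr32(b ^ c, 12);
+ //   a += b + my;  d = rotr32(d ^ a,  8);
+ //   c += d;       b = rotr32(b ^ c,  7);
+ // First-round addends such as -1156040474 appear to be IV words already
+ // constant-folded into the initial additions.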
xor.b32 %r5255, %r5254, %r5249; + shf.l.wrap.b32 %r5256, %r5255, %r5255, 24; + add.s32 %r5257, %r5256, %r5250; + xor.b32 %r5258, %r5257, %r5252; + shf.l.wrap.b32 %r5259, %r5258, %r5258, 25; + add.s32 %r5260, %r5138, 19972691; + xor.b32 %r5261, %r5260, %r5220; + shr.u32 %r5262, %r5260, 16; + shl.b32 %r5263, %r5261, 16; + or.b32 %r5264, %r5263, %r5262; + add.s32 %r5265, %r5264, -1521486534; + xor.b32 %r5266, %r5265, 1541459225; + shf.l.wrap.b32 %r5267, %r5266, %r5266, 20; + add.s32 %r5268, %r5146, %r5260; + add.s32 %r5269, %r5268, %r5267; + xor.b32 %r5270, %r5269, %r5264; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 24; + add.s32 %r5272, %r5271, %r5265; + xor.b32 %r5273, %r5272, %r5267; + shf.l.wrap.b32 %r5274, %r5273, %r5273, 25; + add.s32 %r5275, %r5244, %r5227; + add.s32 %r5276, %r5275, %r5158; + xor.b32 %r5277, %r5271, %r5276; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 16; + add.s32 %r5279, %r5278, %r5257; + xor.b32 %r5280, %r5279, %r5244; + shf.l.wrap.b32 %r5281, %r5280, %r5280, 20; + add.s32 %r5282, %r5166, %r5276; + add.s32 %r5283, %r5282, %r5281; + xor.b32 %r5284, %r5283, %r5278; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 24; + add.s32 %r5286, %r5285, %r5279; + xor.b32 %r5287, %r5286, %r5281; + shf.l.wrap.b32 %r5288, %r5287, %r5287, 25; + add.s32 %r5289, %r5259, %r5239; + add.s32 %r5290, %r5289, %r5174; + xor.b32 %r5291, %r5290, %r5229; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 16; + add.s32 %r5293, %r5292, %r5272; + xor.b32 %r5294, %r5293, %r5259; + shf.l.wrap.b32 %r5295, %r5294, %r5294, 20; + add.s32 %r5296, %r5182, %r5290; + add.s32 %r5297, %r5296, %r5295; + xor.b32 %r5298, %r5297, %r5292; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 24; + add.s32 %r5300, %r5299, %r5293; + xor.b32 %r5301, %r5300, %r5295; + shf.l.wrap.b32 %r5302, %r5301, %r5301, 25; + add.s32 %r5303, %r5274, %r5254; + add.s32 %r5304, %r5303, %r5194; + xor.b32 %r5305, %r5304, %r5241; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 16; + add.s32 %r5307, %r5306, %r5230; + xor.b32 %r5308, %r5307, %r5274; + shf.l.wrap.b32 %r5309, %r5308, %r5308, 20; + add.s32 %r5310, %r5202, %r5304; + add.s32 %r5311, %r5310, %r5309; + xor.b32 %r5312, %r5311, %r5306; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 24; + add.s32 %r5314, %r5313, %r5307; + xor.b32 %r5315, %r5314, %r5309; + shf.l.wrap.b32 %r5316, %r5315, %r5315, 25; + add.s32 %r5317, %r5269, %r5232; + add.s32 %r5318, %r5317, %r5210; + xor.b32 %r5319, %r5318, %r5256; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 16; + add.s32 %r5321, %r5320, %r5242; + xor.b32 %r5322, %r5321, %r5232; + shf.l.wrap.b32 %r5323, %r5322, %r5322, 20; + add.s32 %r5324, %r5218, %r5318; + add.s32 %r5325, %r5324, %r5323; + xor.b32 %r5326, %r5325, %r5320; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 24; + add.s32 %r5328, %r5327, %r5321; + xor.b32 %r5329, %r5328, %r5323; + shf.l.wrap.b32 %r5330, %r5329, %r5329, 25; + add.s32 %r5331, %r5283, %r5102; + add.s32 %r5332, %r5331, %r5330; + xor.b32 %r5333, %r5332, %r5299; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 16; + add.s32 %r5335, %r5334, %r5314; + xor.b32 %r5336, %r5335, %r5330; + shf.l.wrap.b32 %r5337, %r5336, %r5336, 20; + add.s32 %r5338, %r5332, %r5138; + add.s32 %r5339, %r5338, %r5337; + xor.b32 %r5340, %r5339, %r5334; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 24; + add.s32 %r5342, %r5341, %r5335; + xor.b32 %r5343, %r5342, %r5337; + shf.l.wrap.b32 %r5344, %r5343, %r5343, 25; + add.s32 %r5345, %r5297, %r5110; + add.s32 %r5346, %r5345, %r5288; + xor.b32 %r5347, %r5313, %r5346; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 16; + add.s32 %r5349, %r5328, %r5348; + xor.b32 %r5350, %r5349, %r5288; + 
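+ // [reviewer annotation -- assumption] Each run of roughly 112
+ // instructions is one compression round: four G calls down the columns
+ // of the 4x4 state, then four across the diagonals. The same 16 message
+ // registers (%r5086..%r5218) recur in a different order each round,
+ // matching BLAKE3's fixed message permutation over its 7 rounds.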
shf.l.wrap.b32 %r5351, %r5350, %r5350, 20; + add.s32 %r5352, %r5346, %r5174; + add.s32 %r5353, %r5352, %r5351; + xor.b32 %r5354, %r5353, %r5348; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 24; + add.s32 %r5356, %r5355, %r5349; + xor.b32 %r5357, %r5356, %r5351; + shf.l.wrap.b32 %r5358, %r5357, %r5357, 25; + add.s32 %r5359, %r5302, %r5146; + add.s32 %r5360, %r5359, %r5311; + xor.b32 %r5361, %r5327, %r5360; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 16; + add.s32 %r5363, %r5362, %r5286; + xor.b32 %r5364, %r5363, %r5302; + shf.l.wrap.b32 %r5365, %r5364, %r5364, 20; + add.s32 %r5366, %r5360, %r5086; + add.s32 %r5367, %r5366, %r5365; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 24; + add.s32 %r5370, %r5369, %r5363; + xor.b32 %r5371, %r5370, %r5365; + shf.l.wrap.b32 %r5372, %r5371, %r5371, 25; + add.s32 %r5373, %r5316, %r5122; + add.s32 %r5374, %r5373, %r5325; + xor.b32 %r5375, %r5374, %r5285; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 16; + add.s32 %r5377, %r5376, %r5300; + xor.b32 %r5378, %r5377, %r5316; + shf.l.wrap.b32 %r5379, %r5378, %r5378, 20; + add.s32 %r5380, %r5374, %r5202; + add.s32 %r5381, %r5380, %r5379; + xor.b32 %r5382, %r5381, %r5376; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 24; + add.s32 %r5384, %r5383, %r5377; + xor.b32 %r5385, %r5384, %r5379; + shf.l.wrap.b32 %r5386, %r5385, %r5385, 25; + add.s32 %r5387, %r5339, %r5094; + add.s32 %r5388, %r5387, %r5358; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 16; + add.s32 %r5391, %r5390, %r5370; + xor.b32 %r5392, %r5391, %r5358; + shf.l.wrap.b32 %r5393, %r5392, %r5392, 20; + add.s32 %r5394, %r5388, %r5182; + add.s32 %r5395, %r5394, %r5393; + xor.b32 %r5396, %r5395, %r5390; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 24; + add.s32 %r5398, %r5397, %r5391; + xor.b32 %r5399, %r5398, %r5393; + shf.l.wrap.b32 %r5400, %r5399, %r5399, 25; + add.s32 %r5401, %r5353, %r5194; + add.s32 %r5402, %r5401, %r5372; + xor.b32 %r5403, %r5402, %r5341; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 16; + add.s32 %r5405, %r5404, %r5384; + xor.b32 %r5406, %r5405, %r5372; + shf.l.wrap.b32 %r5407, %r5406, %r5406, 20; + add.s32 %r5408, %r5402, %r5130; + add.s32 %r5409, %r5408, %r5407; + xor.b32 %r5410, %r5409, %r5404; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 24; + add.s32 %r5412, %r5411, %r5405; + xor.b32 %r5413, %r5412, %r5407; + shf.l.wrap.b32 %r5414, %r5413, %r5413, 25; + add.s32 %r5415, %r5367, %r5166; + add.s32 %r5416, %r5415, %r5386; + xor.b32 %r5417, %r5416, %r5355; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 16; + add.s32 %r5419, %r5418, %r5342; + xor.b32 %r5420, %r5419, %r5386; + shf.l.wrap.b32 %r5421, %r5420, %r5420, 20; + add.s32 %r5422, %r5416, %r5210; + add.s32 %r5423, %r5422, %r5421; + xor.b32 %r5424, %r5423, %r5418; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 24; + add.s32 %r5426, %r5425, %r5419; + xor.b32 %r5427, %r5426, %r5421; + shf.l.wrap.b32 %r5428, %r5427, %r5427, 25; + add.s32 %r5429, %r5381, %r5218; + add.s32 %r5430, %r5429, %r5344; + xor.b32 %r5431, %r5430, %r5369; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 16; + add.s32 %r5433, %r5432, %r5356; + xor.b32 %r5434, %r5433, %r5344; + shf.l.wrap.b32 %r5435, %r5434, %r5434, 20; + add.s32 %r5436, %r5430, %r5158; + add.s32 %r5437, %r5436, %r5435; + xor.b32 %r5438, %r5437, %r5432; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 24; + add.s32 %r5440, %r5439, %r5433; + xor.b32 %r5441, %r5440, %r5435; + shf.l.wrap.b32 %r5442, %r5441, %r5441, 25; + add.s32 %r5443, %r5395, %r5110; + add.s32 %r5444, %r5443, %r5442; + xor.b32 %r5445, %r5444, %r5411; + shf.l.wrap.b32 %r5446, 
%r5445, %r5445, 16; + add.s32 %r5447, %r5446, %r5426; + xor.b32 %r5448, %r5447, %r5442; + shf.l.wrap.b32 %r5449, %r5448, %r5448, 20; + add.s32 %r5450, %r5444, %r5122; + add.s32 %r5451, %r5450, %r5449; + xor.b32 %r5452, %r5451, %r5446; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 24; + add.s32 %r5454, %r5453, %r5447; + xor.b32 %r5455, %r5454, %r5449; + shf.l.wrap.b32 %r5456, %r5455, %r5455, 25; + add.s32 %r5457, %r5409, %r5174; + add.s32 %r5458, %r5457, %r5400; + xor.b32 %r5459, %r5458, %r5425; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 16; + add.s32 %r5461, %r5460, %r5440; + xor.b32 %r5462, %r5461, %r5400; + shf.l.wrap.b32 %r5463, %r5462, %r5462, 20; + add.s32 %r5464, %r5458, %r5194; + add.s32 %r5465, %r5464, %r5463; + xor.b32 %r5466, %r5465, %r5460; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 24; + add.s32 %r5468, %r5467, %r5461; + xor.b32 %r5469, %r5468, %r5463; + shf.l.wrap.b32 %r5470, %r5469, %r5469, 25; + add.s32 %r5471, %r5423, %r5202; + add.s32 %r5472, %r5471, %r5414; + xor.b32 %r5473, %r5472, %r5439; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 16; + add.s32 %r5475, %r5474, %r5398; + xor.b32 %r5476, %r5475, %r5414; + shf.l.wrap.b32 %r5477, %r5476, %r5476, 20; + add.s32 %r5478, %r5472, %r5102; + add.s32 %r5479, %r5478, %r5477; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 24; + add.s32 %r5482, %r5481, %r5475; + xor.b32 %r5483, %r5482, %r5477; + shf.l.wrap.b32 %r5484, %r5483, %r5483, 25; + add.s32 %r5485, %r5437, %r5146; + add.s32 %r5486, %r5485, %r5428; + xor.b32 %r5487, %r5486, %r5397; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 16; + add.s32 %r5489, %r5488, %r5412; + xor.b32 %r5490, %r5489, %r5428; + shf.l.wrap.b32 %r5491, %r5490, %r5490, 20; + add.s32 %r5492, %r5486, %r5210; + add.s32 %r5493, %r5492, %r5491; + xor.b32 %r5494, %r5493, %r5488; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 24; + add.s32 %r5496, %r5495, %r5489; + xor.b32 %r5497, %r5496, %r5491; + shf.l.wrap.b32 %r5498, %r5497, %r5497, 25; + add.s32 %r5499, %r5451, %r5138; + add.s32 %r5500, %r5499, %r5470; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 16; + add.s32 %r5503, %r5502, %r5482; + xor.b32 %r5504, %r5503, %r5470; + shf.l.wrap.b32 %r5505, %r5504, %r5504, 20; + add.s32 %r5506, %r5500, %r5130; + add.s32 %r5507, %r5506, %r5505; + xor.b32 %r5508, %r5507, %r5502; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 24; + add.s32 %r5510, %r5509, %r5503; + xor.b32 %r5511, %r5510, %r5505; + shf.l.wrap.b32 %r5512, %r5511, %r5511, 25; + add.s32 %r5513, %r5465, %r5166; + add.s32 %r5514, %r5513, %r5484; + xor.b32 %r5515, %r5514, %r5453; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 16; + add.s32 %r5517, %r5516, %r5496; + xor.b32 %r5518, %r5517, %r5484; + shf.l.wrap.b32 %r5519, %r5518, %r5518, 20; + add.s32 %r5520, %r5514, %r5086; + add.s32 %r5521, %r5520, %r5519; + xor.b32 %r5522, %r5521, %r5516; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 24; + add.s32 %r5524, %r5523, %r5517; + xor.b32 %r5525, %r5524, %r5519; + shf.l.wrap.b32 %r5526, %r5525, %r5525, 25; + add.s32 %r5527, %r5479, %r5182; + add.s32 %r5528, %r5527, %r5498; + xor.b32 %r5529, %r5528, %r5467; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 16; + add.s32 %r5531, %r5530, %r5454; + xor.b32 %r5532, %r5531, %r5498; + shf.l.wrap.b32 %r5533, %r5532, %r5532, 20; + add.s32 %r5534, %r5528, %r5218; + add.s32 %r5535, %r5534, %r5533; + xor.b32 %r5536, %r5535, %r5530; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 24; + add.s32 %r5538, %r5537, %r5531; + xor.b32 %r5539, %r5538, %r5533; + shf.l.wrap.b32 %r5540, %r5539, %r5539, 25; + add.s32 %r5541, %r5493, %r5158; + 
add.s32 %r5542, %r5541, %r5456; + xor.b32 %r5543, %r5542, %r5481; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 16; + add.s32 %r5545, %r5544, %r5468; + xor.b32 %r5546, %r5545, %r5456; + shf.l.wrap.b32 %r5547, %r5546, %r5546, 20; + add.s32 %r5548, %r5542, %r5094; + add.s32 %r5549, %r5548, %r5547; + xor.b32 %r5550, %r5549, %r5544; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 24; + add.s32 %r5552, %r5551, %r5545; + xor.b32 %r5553, %r5552, %r5547; + shf.l.wrap.b32 %r5554, %r5553, %r5553, 25; + add.s32 %r5555, %r5507, %r5174; + add.s32 %r5556, %r5555, %r5554; + xor.b32 %r5557, %r5556, %r5523; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 16; + add.s32 %r5559, %r5558, %r5538; + xor.b32 %r5560, %r5559, %r5554; + shf.l.wrap.b32 %r5561, %r5560, %r5560, 20; + add.s32 %r5562, %r5556, %r5146; + add.s32 %r5563, %r5562, %r5561; + xor.b32 %r5564, %r5563, %r5558; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 24; + add.s32 %r5566, %r5565, %r5559; + xor.b32 %r5567, %r5566, %r5561; + shf.l.wrap.b32 %r5568, %r5567, %r5567, 25; + add.s32 %r5569, %r5521, %r5194; + add.s32 %r5570, %r5569, %r5512; + xor.b32 %r5571, %r5570, %r5537; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 16; + add.s32 %r5573, %r5572, %r5552; + xor.b32 %r5574, %r5573, %r5512; + shf.l.wrap.b32 %r5575, %r5574, %r5574, 20; + add.s32 %r5576, %r5570, %r5166; + add.s32 %r5577, %r5576, %r5575; + xor.b32 %r5578, %r5577, %r5572; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 24; + add.s32 %r5580, %r5579, %r5573; + xor.b32 %r5581, %r5580, %r5575; + shf.l.wrap.b32 %r5582, %r5581, %r5581, 25; + add.s32 %r5583, %r5535, %r5210; + add.s32 %r5584, %r5583, %r5526; + xor.b32 %r5585, %r5584, %r5551; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 16; + add.s32 %r5587, %r5586, %r5510; + xor.b32 %r5588, %r5587, %r5526; + shf.l.wrap.b32 %r5589, %r5588, %r5588, 20; + add.s32 %r5590, %r5584, %r5110; + add.s32 %r5591, %r5590, %r5589; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 24; + add.s32 %r5594, %r5593, %r5587; + xor.b32 %r5595, %r5594, %r5589; + shf.l.wrap.b32 %r5596, %r5595, %r5595, 25; + add.s32 %r5597, %r5549, %r5202; + add.s32 %r5598, %r5597, %r5540; + xor.b32 %r5599, %r5598, %r5509; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 16; + add.s32 %r5601, %r5600, %r5524; + xor.b32 %r5602, %r5601, %r5540; + shf.l.wrap.b32 %r5603, %r5602, %r5602, 20; + add.s32 %r5604, %r5598, %r5218; + add.s32 %r5605, %r5604, %r5603; + xor.b32 %r5606, %r5605, %r5600; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 24; + add.s32 %r5608, %r5607, %r5601; + xor.b32 %r5609, %r5608, %r5603; + shf.l.wrap.b32 %r5610, %r5609, %r5609, 25; + add.s32 %r5611, %r5563, %r5122; + add.s32 %r5612, %r5611, %r5582; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 16; + add.s32 %r5615, %r5614, %r5594; + xor.b32 %r5616, %r5615, %r5582; + shf.l.wrap.b32 %r5617, %r5616, %r5616, 20; + add.s32 %r5618, %r5612, %r5086; + add.s32 %r5619, %r5618, %r5617; + xor.b32 %r5620, %r5619, %r5614; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 24; + add.s32 %r5622, %r5621, %r5615; + xor.b32 %r5623, %r5622, %r5617; + shf.l.wrap.b32 %r5624, %r5623, %r5623, 25; + add.s32 %r5625, %r5577, %r5182; + add.s32 %r5626, %r5625, %r5596; + xor.b32 %r5627, %r5626, %r5565; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 16; + add.s32 %r5629, %r5628, %r5608; + xor.b32 %r5630, %r5629, %r5596; + shf.l.wrap.b32 %r5631, %r5630, %r5630, 20; + add.s32 %r5632, %r5626, %r5102; + add.s32 %r5633, %r5632, %r5631; + xor.b32 %r5634, %r5633, %r5628; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 24; + add.s32 %r5636, %r5635, %r5629; + xor.b32 %r5637, %r5636, 
%r5631; + shf.l.wrap.b32 %r5638, %r5637, %r5637, 25; + add.s32 %r5639, %r5591, %r5130; + add.s32 %r5640, %r5639, %r5610; + xor.b32 %r5641, %r5640, %r5579; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 16; + add.s32 %r5643, %r5642, %r5566; + xor.b32 %r5644, %r5643, %r5610; + shf.l.wrap.b32 %r5645, %r5644, %r5644, 20; + add.s32 %r5646, %r5640, %r5158; + add.s32 %r5647, %r5646, %r5645; + xor.b32 %r5648, %r5647, %r5642; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 24; + add.s32 %r5650, %r5649, %r5643; + xor.b32 %r5651, %r5650, %r5645; + shf.l.wrap.b32 %r5652, %r5651, %r5651, 25; + add.s32 %r5653, %r5605, %r5094; + add.s32 %r5654, %r5653, %r5568; + xor.b32 %r5655, %r5654, %r5593; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 16; + add.s32 %r5657, %r5656, %r5580; + xor.b32 %r5658, %r5657, %r5568; + shf.l.wrap.b32 %r5659, %r5658, %r5658, 20; + add.s32 %r5660, %r5654, %r5138; + add.s32 %r5661, %r5660, %r5659; + xor.b32 %r5662, %r5661, %r5656; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 24; + add.s32 %r5664, %r5663, %r5657; + xor.b32 %r5665, %r5664, %r5659; + shf.l.wrap.b32 %r5666, %r5665, %r5665, 25; + add.s32 %r5667, %r5619, %r5194; + add.s32 %r5668, %r5667, %r5666; + xor.b32 %r5669, %r5668, %r5635; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 16; + add.s32 %r5671, %r5670, %r5650; + xor.b32 %r5672, %r5671, %r5666; + shf.l.wrap.b32 %r5673, %r5672, %r5672, 20; + add.s32 %r5674, %r5668, %r5202; + add.s32 %r5675, %r5674, %r5673; + xor.b32 %r5676, %r5675, %r5670; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 24; + add.s32 %r5678, %r5677, %r5671; + xor.b32 %r5679, %r5678, %r5673; + shf.l.wrap.b32 %r5680, %r5679, %r5679, 25; + add.s32 %r5681, %r5633, %r5166; + add.s32 %r5682, %r5681, %r5624; + xor.b32 %r5683, %r5682, %r5649; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 16; + add.s32 %r5685, %r5684, %r5664; + xor.b32 %r5686, %r5685, %r5624; + shf.l.wrap.b32 %r5687, %r5686, %r5686, 20; + add.s32 %r5688, %r5682, %r5182; + add.s32 %r5689, %r5688, %r5687; + xor.b32 %r5690, %r5689, %r5684; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 24; + add.s32 %r5692, %r5691, %r5685; + xor.b32 %r5693, %r5692, %r5687; + shf.l.wrap.b32 %r5694, %r5693, %r5693, 25; + add.s32 %r5695, %r5647, %r5218; + add.s32 %r5696, %r5695, %r5638; + xor.b32 %r5697, %r5696, %r5663; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 16; + add.s32 %r5699, %r5698, %r5622; + xor.b32 %r5700, %r5699, %r5638; + shf.l.wrap.b32 %r5701, %r5700, %r5700, 20; + add.s32 %r5702, %r5696, %r5174; + add.s32 %r5703, %r5702, %r5701; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 24; + add.s32 %r5706, %r5705, %r5699; + xor.b32 %r5707, %r5706, %r5701; + shf.l.wrap.b32 %r5708, %r5707, %r5707, 25; + add.s32 %r5709, %r5661, %r5210; + add.s32 %r5710, %r5709, %r5652; + xor.b32 %r5711, %r5710, %r5621; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 16; + add.s32 %r5713, %r5712, %r5636; + xor.b32 %r5714, %r5713, %r5652; + shf.l.wrap.b32 %r5715, %r5714, %r5714, 20; + add.s32 %r5716, %r5710, %r5158; + add.s32 %r5717, %r5716, %r5715; + xor.b32 %r5718, %r5717, %r5712; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 24; + add.s32 %r5720, %r5719, %r5713; + xor.b32 %r5721, %r5720, %r5715; + shf.l.wrap.b32 %r5722, %r5721, %r5721, 25; + add.s32 %r5723, %r5675, %r5146; + add.s32 %r5724, %r5723, %r5694; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 16; + add.s32 %r5727, %r5726, %r5706; + xor.b32 %r5728, %r5727, %r5694; + shf.l.wrap.b32 %r5729, %r5728, %r5728, 20; + add.s32 %r5730, %r5724, %r5102; + add.s32 %r5731, %r5730, %r5729; + xor.b32 %r5732, %r5731, %r5726; + shf.l.wrap.b32 
%r5733, %r5732, %r5732, 24; + add.s32 %r5734, %r5733, %r5727; + xor.b32 %r5735, %r5734, %r5729; + shf.l.wrap.b32 %r5736, %r5735, %r5735, 25; + add.s32 %r5737, %r5689, %r5130; + add.s32 %r5738, %r5737, %r5708; + xor.b32 %r5739, %r5738, %r5677; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 16; + add.s32 %r5741, %r5740, %r5720; + xor.b32 %r5742, %r5741, %r5708; + shf.l.wrap.b32 %r5743, %r5742, %r5742, 20; + add.s32 %r5744, %r5738, %r5110; + add.s32 %r5745, %r5744, %r5743; + xor.b32 %r5746, %r5745, %r5740; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 24; + add.s32 %r5748, %r5747, %r5741; + xor.b32 %r5749, %r5748, %r5743; + shf.l.wrap.b32 %r5750, %r5749, %r5749, 25; + add.s32 %r5751, %r5703, %r5086; + add.s32 %r5752, %r5751, %r5722; + xor.b32 %r5753, %r5752, %r5691; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 16; + add.s32 %r5755, %r5754, %r5678; + xor.b32 %r5756, %r5755, %r5722; + shf.l.wrap.b32 %r5757, %r5756, %r5756, 20; + add.s32 %r5758, %r5752, %r5094; + add.s32 %r5759, %r5758, %r5757; + xor.b32 %r5760, %r5759, %r5754; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 24; + add.s32 %r5762, %r5761, %r5755; + xor.b32 %r5763, %r5762, %r5757; + shf.l.wrap.b32 %r5764, %r5763, %r5763, 25; + add.s32 %r5765, %r5717, %r5138; + add.s32 %r5766, %r5765, %r5680; + xor.b32 %r5767, %r5766, %r5705; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 16; + add.s32 %r5769, %r5768, %r5692; + xor.b32 %r5770, %r5769, %r5680; + shf.l.wrap.b32 %r5771, %r5770, %r5770, 20; + add.s32 %r5772, %r5766, %r5122; + add.s32 %r5773, %r5772, %r5771; + xor.b32 %r5774, %r5773, %r5768; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 24; + add.s32 %r5776, %r5775, %r5769; + xor.b32 %r5777, %r5776, %r5771; + shf.l.wrap.b32 %r5778, %r5777, %r5777, 25; + add.s32 %r5779, %r5731, %r5166; + add.s32 %r5780, %r5779, %r5778; + xor.b32 %r5781, %r5780, %r5747; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 16; + add.s32 %r5783, %r5782, %r5762; + xor.b32 %r5784, %r5783, %r5778; + shf.l.wrap.b32 %r5785, %r5784, %r5784, 20; + add.s32 %r5786, %r5780, %r5210; + add.s32 %r5787, %r5786, %r5785; + xor.b32 %r5788, %r5787, %r5782; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 24; + add.s32 %r5790, %r5789, %r5783; + xor.b32 %r5791, %r5790, %r5785; + shf.l.wrap.b32 %r5792, %r5791, %r5791, 25; + add.s32 %r5793, %r5745, %r5182; + add.s32 %r5794, %r5793, %r5736; + xor.b32 %r5795, %r5794, %r5761; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 16; + add.s32 %r5797, %r5796, %r5776; + xor.b32 %r5798, %r5797, %r5736; + shf.l.wrap.b32 %r5799, %r5798, %r5798, 20; + add.s32 %r5800, %r5794, %r5130; + add.s32 %r5801, %r5800, %r5799; + xor.b32 %r5802, %r5801, %r5796; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 24; + add.s32 %r5804, %r5803, %r5797; + xor.b32 %r5805, %r5804, %r5799; + shf.l.wrap.b32 %r5806, %r5805, %r5805, 25; + add.s32 %r5807, %r5759, %r5158; + add.s32 %r5808, %r5807, %r5750; + xor.b32 %r5809, %r5808, %r5775; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 16; + add.s32 %r5811, %r5810, %r5734; + xor.b32 %r5812, %r5811, %r5750; + shf.l.wrap.b32 %r5813, %r5812, %r5812, 20; + add.s32 %r5814, %r5808, %r5194; + add.s32 %r5815, %r5814, %r5813; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 24; + add.s32 %r5818, %r5817, %r5811; + xor.b32 %r5819, %r5818, %r5813; + shf.l.wrap.b32 %r5820, %r5819, %r5819, 25; + add.s32 %r5821, %r5773, %r5218; + add.s32 %r5822, %r5821, %r5764; + xor.b32 %r5823, %r5822, %r5733; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 16; + add.s32 %r5825, %r5824, %r5748; + xor.b32 %r5826, %r5825, %r5764; + shf.l.wrap.b32 %r5827, %r5826, %r5826, 20; + add.s32 %r5828, %r5822, %r5094; 
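+ // [reviewer annotation] No loads or stores appear inside the rounds: the
+ // compiler keeps the state and message block entirely in registers and
+ // appears to have unrolled all 7 rounds, which is why this hunk is so
+ // large.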
+ add.s32 %r5829, %r5828, %r5827; + xor.b32 %r5830, %r5829, %r5824; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 24; + add.s32 %r5832, %r5831, %r5825; + xor.b32 %r5833, %r5832, %r5827; + shf.l.wrap.b32 %r5834, %r5833, %r5833, 25; + add.s32 %r5835, %r5787, %r5202; + add.s32 %r5836, %r5835, %r5806; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 16; + add.s32 %r5839, %r5838, %r5818; + xor.b32 %r5840, %r5839, %r5806; + shf.l.wrap.b32 %r5841, %r5840, %r5840, 20; + add.s32 %r5842, %r5836, %r5110; + add.s32 %r5843, %r5842, %r5841; + xor.b32 %r5844, %r5843, %r5838; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 24; + add.s32 %r5846, %r5845, %r5839; + xor.b32 %r5847, %r5846, %r5841; + shf.l.wrap.b32 %r5848, %r5847, %r5847, 25; + add.s32 %r5849, %r5801, %r5086; + add.s32 %r5850, %r5849, %r5820; + xor.b32 %r5851, %r5850, %r5789; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 16; + add.s32 %r5853, %r5852, %r5832; + xor.b32 %r5854, %r5853, %r5820; + shf.l.wrap.b32 %r5855, %r5854, %r5854, 20; + add.s32 %r5856, %r5850, %r5174; + add.s32 %r5857, %r5856, %r5855; + xor.b32 %r5858, %r5857, %r5852; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 24; + add.s32 %r5860, %r5859, %r5853; + xor.b32 %r5861, %r5860, %r5855; + shf.l.wrap.b32 %r5862, %r5861, %r5861, 25; + add.s32 %r5863, %r5815, %r5102; + add.s32 %r5864, %r5863, %r5834; + xor.b32 %r5865, %r5864, %r5803; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 16; + add.s32 %r5867, %r5866, %r5790; + xor.b32 %r5868, %r5867, %r5834; + shf.l.wrap.b32 %r5869, %r5868, %r5868, 20; + add.s32 %r5870, %r5864, %r5138; + add.s32 %r5871, %r5870, %r5869; + xor.b32 %r5872, %r5871, %r5866; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 24; + add.s32 %r5874, %r5873, %r5867; + xor.b32 %r5875, %r5874, %r5869; + shf.l.wrap.b32 %r5876, %r5875, %r5875, 25; + add.s32 %r5877, %r5829, %r5122; + add.s32 %r5878, %r5877, %r5792; + xor.b32 %r5879, %r5878, %r5817; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 16; + add.s32 %r5881, %r5880, %r5804; + xor.b32 %r5882, %r5881, %r5792; + shf.l.wrap.b32 %r5883, %r5882, %r5882, 20; + add.s32 %r5884, %r5878, %r5146; + add.s32 %r5885, %r5884, %r5883; + xor.b32 %r5886, %r5885, %r5880; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 24; + add.s32 %r5888, %r5887, %r5881; + xor.b32 %r5889, %r5888, %r5883; + shf.l.wrap.b32 %r5890, %r5889, %r5889, 25; + add.s32 %r5891, %r5843, %r5182; + add.s32 %r5892, %r5891, %r5890; + xor.b32 %r5893, %r5892, %r5859; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 16; + add.s32 %r5895, %r5894, %r5874; + xor.b32 %r5896, %r5895, %r5890; + shf.l.wrap.b32 %r5897, %r5896, %r5896, 20; + add.s32 %r5898, %r5892, %r5218; + add.s32 %r5899, %r5898, %r5897; + xor.b32 %r5900, %r5899, %r5894; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 24; + add.s32 %r5902, %r5901, %r5895; + xor.b32 %r5903, %r5902, %r5897; + shf.l.wrap.b32 %r5904, %r5903, %r5903, 25; + add.s32 %r5905, %r5857, %r5130; + add.s32 %r5906, %r5905, %r5848; + xor.b32 %r5907, %r5906, %r5873; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 16; + add.s32 %r5909, %r5908, %r5888; + xor.b32 %r5910, %r5909, %r5848; + shf.l.wrap.b32 %r5911, %r5910, %r5910, 20; + add.s32 %r5912, %r5906, %r5086; + add.s32 %r5913, %r5912, %r5911; + xor.b32 %r5914, %r5913, %r5908; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 24; + add.s32 %r5916, %r5915, %r5909; + xor.b32 %r5917, %r5916, %r5911; + shf.l.wrap.b32 %r5918, %r5917, %r5917, 25; + add.s32 %r5919, %r5871, %r5094; + add.s32 %r5920, %r5919, %r5862; + xor.b32 %r5921, %r5920, %r5887; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 16; + add.s32 %r5923, %r5922, %r5846; + xor.b32 %r5924, %r5923, 
%r5862; + shf.l.wrap.b32 %r5925, %r5924, %r5924, 20; + add.s32 %r5926, %r5920, %r5166; + add.s32 %r5927, %r5926, %r5925; + xor.b32 %r5928, %r5927, %r5922; + shf.l.wrap.b32 %r5929, %r5928, %r5928, 24; + add.s32 %r5930, %r5929, %r5923; + xor.b32 %r5931, %r5930, %r5925; + shf.l.wrap.b32 %r5932, %r5931, %r5931, 25; + add.s32 %r5933, %r5885, %r5158; + add.s32 %r5934, %r5933, %r5876; + xor.b32 %r5935, %r5934, %r5845; + shf.l.wrap.b32 %r5936, %r5935, %r5935, 16; + add.s32 %r5937, %r5936, %r5860; + xor.b32 %r5938, %r5937, %r5876; + shf.l.wrap.b32 %r5939, %r5938, %r5938, 20; + add.s32 %r5940, %r5934, %r5138; + add.s32 %r5941, %r5940, %r5939; + xor.b32 %r5942, %r5941, %r5936; + shf.l.wrap.b32 %r5943, %r5942, %r5942, 24; + add.s32 %r5944, %r5943, %r5937; + xor.b32 %r5945, %r5944, %r5939; + shf.l.wrap.b32 %r5946, %r5945, %r5945, 25; + add.s32 %r5947, %r5899, %r5210; + add.s32 %r5948, %r5947, %r5918; + xor.b32 %r5949, %r5948, %r5943; + shf.l.wrap.b32 %r5950, %r5949, %r5949, 16; + add.s32 %r5951, %r5950, %r5930; + xor.b32 %r5952, %r5951, %r5918; + shf.l.wrap.b32 %r5953, %r5952, %r5952, 20; + add.s32 %r5954, %r5948, %r5174; + add.s32 %r5955, %r5954, %r5953; + xor.b32 %r5956, %r5955, %r5950; + shf.l.wrap.b32 %r5957, %r5956, %r5956, 24; + add.s32 %r5958, %r5957, %r5951; + xor.b32 %r5959, %r5958, %r5953; + shf.l.wrap.b32 %r5960, %r5959, %r5959, 25; + add.s32 %r5961, %r5913, %r5102; + add.s32 %r5962, %r5961, %r5932; + xor.b32 %r5963, %r5962, %r5901; + shf.l.wrap.b32 %r5964, %r5963, %r5963, 16; + add.s32 %r5965, %r5964, %r5944; + xor.b32 %r5966, %r5965, %r5932; + shf.l.wrap.b32 %r5967, %r5966, %r5966, 20; + add.s32 %r5968, %r5962, %r5194; + add.s32 %r5969, %r5968, %r5967; + xor.b32 %r5970, %r5969, %r5964; + shf.l.wrap.b32 %r5971, %r5970, %r5970, 24; + add.s32 %r5972, %r5971, %r5965; + xor.b32 %r5973, %r5972, %r5967; + shf.l.wrap.b32 %r5974, %r5973, %r5973, 25; + add.s32 %r5975, %r5927, %r5110; + add.s32 %r5976, %r5975, %r5946; + xor.b32 %r5977, %r5976, %r5915; + shf.l.wrap.b32 %r5978, %r5977, %r5977, 16; + add.s32 %r5979, %r5978, %r5902; + xor.b32 %r5980, %r5979, %r5946; + shf.l.wrap.b32 %r5981, %r5980, %r5980, 20; + add.s32 %r5982, %r5976, %r5122; + add.s32 %r5983, %r5982, %r5981; + xor.b32 %r5984, %r5983, %r5978; + shf.l.wrap.b32 %r5985, %r5984, %r5984, 24; + add.s32 %r5986, %r5985, %r5979; + xor.b32 %r5987, %r5986, %r5981; + shf.l.wrap.b32 %r5988, %r5987, %r5987, 25; + add.s32 %r5989, %r5941, %r5146; + add.s32 %r5990, %r5989, %r5904; + xor.b32 %r5991, %r5990, %r5929; + shf.l.wrap.b32 %r5992, %r5991, %r5991, 16; + add.s32 %r5993, %r5992, %r5916; + xor.b32 %r5994, %r5993, %r5904; + shf.l.wrap.b32 %r5995, %r5994, %r5994, 20; + add.s32 %r5996, %r5990, %r5202; + add.s32 %r5997, %r5996, %r5995; + xor.b32 %r5998, %r5997, %r5992; + shf.l.wrap.b32 %r5999, %r5998, %r5998, 24; + add.s32 %r6000, %r5999, %r5993; + xor.b32 %r6001, %r6000, %r5995; + shf.l.wrap.b32 %r6002, %r6001, %r6001, 25; + xor.b32 %r3, %r5986, %r5955; + xor.b32 %r4, %r6000, %r5969; + st.local.v2.u32 [%rd2+32], {%r3, %r4}; + xor.b32 %r5, %r5958, %r5983; + xor.b32 %r6, %r5997, %r5972; + st.local.v2.u32 [%rd2+40], {%r5, %r6}; + xor.b32 %r7, %r6002, %r5971; + xor.b32 %r8, %r5960, %r5985; + st.local.v2.u32 [%rd2+48], {%r7, %r8}; + xor.b32 %r9, %r5999, %r5974; + xor.b32 %r10, %r5988, %r5957; + st.local.v2.u32 [%rd2+56], {%r9, %r10}; + ld.local.v4.u32 {%r6003, %r6004, %r6005, %r6006}, [%rd3+64]; + st.local.v2.u32 [%rd2+72], {%r6003, %r6004}; + st.local.v2.u32 [%rd2+80], {%r6005, %r6006}; + add.s16 %rs1, %rs412, 16; + and.b16 %rs485, %rs1, 255; + add.s16 
%rs486, %rs413, 1; + st.local.v2.u8 [%rd2+136], {%rs1, %rs486}; + cvt.u32.u16 %r6011, %rs486; + cvt.u32.u16 %r6012, %rs485; + prmt.b32 %r6013, %r6011, %r6012, 30212; + cvt.u16.u32 %rs487, %r6013; + shr.u16 %rs2, %rs487, 8; + mov.b32 {%rs5, %rs6}, %r6004; + mov.b32 {%rs3, %rs4}, %r6003; + mov.b32 {%rs9, %rs10}, %r6006; + mov.b32 {%rs7, %rs8}, %r6005; + setp.eq.s16 %p11, %rs2, 0; + selp.u16 %rs488, 1, 0, %p11; + shr.u16 %rs489, %rs3, 8; + shr.u16 %rs490, %rs4, 8; + shr.u16 %rs491, %rs5, 8; + shr.u16 %rs492, %rs6, 8; + shr.u16 %rs493, %rs7, 8; + shr.u16 %rs494, %rs8, 8; + shr.u16 %rs495, %rs9, 8; + shr.u16 %rs496, %rs10, 8; + or.b16 %rs497, %rs488, 10; + cvt.u32.u16 %r6014, %rs3; + and.b32 %r6015, %r6014, 255; + cvt.u32.u16 %r6016, %rs489; + prmt.b32 %r6017, %r6016, %r6015, 30212; + cvt.u32.u16 %r6018, %rs4; + prmt.b32 %r6019, %r6018, %r6017, 28756; + cvt.u32.u16 %r6020, %rs490; + prmt.b32 %r6021, %r6020, %r6019, 1620; + cvt.u32.u16 %r6022, %rs5; + and.b32 %r6023, %r6022, 255; + cvt.u32.u16 %r6024, %rs491; + prmt.b32 %r6025, %r6024, %r6023, 30212; + cvt.u32.u16 %r6026, %rs6; + prmt.b32 %r6027, %r6026, %r6025, 28756; + cvt.u32.u16 %r6028, %rs492; + prmt.b32 %r6029, %r6028, %r6027, 1620; + cvt.u32.u16 %r6030, %rs7; + and.b32 %r6031, %r6030, 255; + cvt.u32.u16 %r6032, %rs493; + prmt.b32 %r6033, %r6032, %r6031, 30212; + cvt.u32.u16 %r6034, %rs8; + prmt.b32 %r6035, %r6034, %r6033, 28756; + cvt.u32.u16 %r6036, %rs494; + prmt.b32 %r6037, %r6036, %r6035, 1620; + cvt.u32.u16 %r6038, %rs9; + and.b32 %r6039, %r6038, 255; + cvt.u32.u16 %r6040, %rs495; + prmt.b32 %r6041, %r6040, %r6039, 30212; + cvt.u32.u16 %r6042, %rs10; + prmt.b32 %r6043, %r6042, %r6041, 28756; + cvt.u32.u16 %r6044, %rs496; + prmt.b32 %r6045, %r6044, %r6043, 1620; + cvt.u32.u16 %r6046, %rs497; + add.s32 %r6047, %r7, %r3; + add.s32 %r6048, %r6047, %r6021; + add.s32 %r6049, %r6029, %r6048; + add.s32 %r6050, %r8, %r4; + add.s32 %r6051, %r6050, %r6037; + add.s32 %r6052, %r6045, %r6051; + add.s32 %r6053, %r9, %r5; + cvt.u32.u16 %r6054, %rs1; + and.b32 %r6055, %r6054, 255; + xor.b32 %r6056, %r6053, %r6055; + shr.u32 %r6057, %r6053, 16; + shl.b32 %r6058, %r6056, 16; + or.b32 %r6059, %r6058, %r6057; + add.s32 %r6060, %r6059, 1013904242; + xor.b32 %r6061, %r6060, %r9; + shf.l.wrap.b32 %r6062, %r6061, %r6061, 20; + add.s32 %r6063, %r6053, %r6062; + xor.b32 %r6064, %r6063, %r6059; + shf.l.wrap.b32 %r6065, %r6064, %r6064, 24; + add.s32 %r6066, %r6065, %r6060; + xor.b32 %r6067, %r6066, %r6062; + shf.l.wrap.b32 %r6068, %r6067, %r6067, 25; + add.s32 %r6069, %r10, %r6; + xor.b32 %r6070, %r6069, %r6046; + shr.u32 %r6071, %r6069, 16; + shl.b32 %r6072, %r6070, 16; + or.b32 %r6073, %r6072, %r6071; + add.s32 %r6074, %r6073, -1521486534; + xor.b32 %r6075, %r6074, %r10; + shf.l.wrap.b32 %r6076, %r6075, %r6075, 20; + add.s32 %r6077, %r6069, %r6076; + xor.b32 %r6078, %r6077, %r6073; + shf.l.wrap.b32 %r6079, %r6078, %r6078, 24; + add.s32 %r6080, %r6079, %r6074; + xor.b32 %r6081, %r6080, %r6076; + shf.l.wrap.b32 %r6082, %r6081, %r6081, 25; + add.s32 %r6083, %r6082, %r6063; + shf.l.wrap.b32 %r6084, %r6048, %r6048, 16; + add.s32 %r6085, %r6084, 1779033703; + xor.b32 %r6086, %r6085, %r7; + shf.l.wrap.b32 %r6087, %r6086, %r6086, 20; + add.s32 %r6088, %r6049, %r6087; + xor.b32 %r6089, %r6088, %r6084; + shf.l.wrap.b32 %r6090, %r6089, %r6089, 24; + add.s32 %r6091, %r6090, %r6085; + xor.b32 %r6092, %r6091, %r6087; + shf.l.wrap.b32 %r6093, %r6092, %r6092, 25; + shf.l.wrap.b32 %r6094, %r6051, %r6051, 16; + add.s32 %r6095, %r6094, -1150833019; + xor.b32 %r6096, %r6095, %r8; 
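+ // [reviewer annotation -- assumption] The eight xor.b32 pairs written to
+ // [%rd2+32..56] above are the compression feed-forward (v[i] ^ v[i+8]),
+ // i.e. the new chaining value; the byte adds on [%rd2+136] bump the
+ // hasher's buffer-length / blocks-compressed counters. `or.b16 %rs497,
+ // %rs488, 10` sets block flags CHUNK_END | ROOT (2 | 8), so the second
+ // compression starting here finalizes the root chunk from the new
+ // chaining value in %r3..%r10.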
+ shf.l.wrap.b32 %r6097, %r6096, %r6096, 20; + add.s32 %r6098, %r6052, %r6097; + xor.b32 %r6099, %r6098, %r6094; + shf.l.wrap.b32 %r6100, %r6099, %r6099, 24; + add.s32 %r6101, %r6100, %r6095; + xor.b32 %r6102, %r6101, %r6097; + shf.l.wrap.b32 %r6103, %r6102, %r6102, 25; + add.s32 %r6104, %r6088, %r6103; + xor.b32 %r6105, %r6104, %r6079; + shf.l.wrap.b32 %r6106, %r6105, %r6105, 16; + add.s32 %r6107, %r6106, %r6066; + xor.b32 %r6108, %r6107, %r6103; + shf.l.wrap.b32 %r6109, %r6108, %r6108, 20; + add.s32 %r6110, %r6104, %r6109; + xor.b32 %r6111, %r6110, %r6106; + shf.l.wrap.b32 %r6112, %r6111, %r6111, 24; + add.s32 %r6113, %r6112, %r6107; + xor.b32 %r6114, %r6113, %r6109; + shf.l.wrap.b32 %r6115, %r6114, %r6114, 25; + add.s32 %r6116, %r6068, %r6098; + xor.b32 %r6117, %r6090, %r6116; + shf.l.wrap.b32 %r6118, %r6117, %r6117, 16; + add.s32 %r6119, %r6118, %r6080; + xor.b32 %r6120, %r6119, %r6068; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 20; + add.s32 %r6122, %r6116, %r6121; + xor.b32 %r6123, %r6122, %r6118; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 24; + add.s32 %r6125, %r6124, %r6119; + xor.b32 %r6126, %r6125, %r6121; + shf.l.wrap.b32 %r6127, %r6126, %r6126, 25; + xor.b32 %r6128, %r6100, %r6083; + shf.l.wrap.b32 %r6129, %r6128, %r6128, 16; + add.s32 %r6130, %r6129, %r6091; + xor.b32 %r6131, %r6130, %r6082; + shf.l.wrap.b32 %r6132, %r6131, %r6131, 20; + add.s32 %r6133, %r6083, %r6132; + xor.b32 %r6134, %r6133, %r6129; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 24; + add.s32 %r6136, %r6135, %r6130; + xor.b32 %r6137, %r6136, %r6132; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 25; + add.s32 %r6139, %r6077, %r6093; + xor.b32 %r6140, %r6139, %r6065; + shf.l.wrap.b32 %r6141, %r6140, %r6140, 16; + add.s32 %r6142, %r6141, %r6101; + xor.b32 %r6143, %r6142, %r6093; + shf.l.wrap.b32 %r6144, %r6143, %r6143, 20; + add.s32 %r6145, %r6139, %r6144; + xor.b32 %r6146, %r6145, %r6141; + shf.l.wrap.b32 %r6147, %r6146, %r6146, 24; + add.s32 %r6148, %r6147, %r6142; + xor.b32 %r6149, %r6148, %r6144; + shf.l.wrap.b32 %r6150, %r6149, %r6149, 25; + add.s32 %r6151, %r6110, %r6037; + add.s32 %r6152, %r6151, %r6150; + xor.b32 %r6153, %r6152, %r6124; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 16; + add.s32 %r6155, %r6154, %r6136; + xor.b32 %r6156, %r6155, %r6150; + shf.l.wrap.b32 %r6157, %r6156, %r6156, 20; + add.s32 %r6158, %r6152, %r6157; + xor.b32 %r6159, %r6158, %r6154; + shf.l.wrap.b32 %r6160, %r6159, %r6159, 24; + add.s32 %r6161, %r6160, %r6155; + xor.b32 %r6162, %r6161, %r6157; + shf.l.wrap.b32 %r6163, %r6162, %r6162, 25; + add.s32 %r6164, %r6122, %r6045; + add.s32 %r6165, %r6164, %r6115; + xor.b32 %r6166, %r6165, %r6135; + shf.l.wrap.b32 %r6167, %r6166, %r6166, 16; + add.s32 %r6168, %r6167, %r6148; + xor.b32 %r6169, %r6168, %r6115; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6165, %r6170; + xor.b32 %r6172, %r6171, %r6167; + shf.l.wrap.b32 %r6173, %r6172, %r6172, 24; + add.s32 %r6174, %r6173, %r6168; + xor.b32 %r6175, %r6174, %r6170; + shf.l.wrap.b32 %r6176, %r6175, %r6175, 25; + add.s32 %r6177, %r6133, %r6127; + xor.b32 %r6178, %r6147, %r6177; + shf.l.wrap.b32 %r6179, %r6178, %r6178, 16; + add.s32 %r6180, %r6179, %r6113; + xor.b32 %r6181, %r6180, %r6127; + shf.l.wrap.b32 %r6182, %r6181, %r6181, 20; + add.s32 %r6183, %r6177, %r6021; + add.s32 %r6184, %r6183, %r6182; + xor.b32 %r6185, %r6184, %r6179; + shf.l.wrap.b32 %r6186, %r6185, %r6185, 24; + add.s32 %r6187, %r6186, %r6180; + xor.b32 %r6188, %r6187, %r6182; + shf.l.wrap.b32 %r6189, %r6188, %r6188, 25; + add.s32 %r6190, %r6145, %r6138; + xor.b32 %r6191, 
%r6112, %r6190; + shf.l.wrap.b32 %r6192, %r6191, %r6191, 16; + add.s32 %r6193, %r6192, %r6125; + xor.b32 %r6194, %r6193, %r6138; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 20; + add.s32 %r6196, %r6190, %r6195; + xor.b32 %r6197, %r6196, %r6192; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 24; + add.s32 %r6199, %r6198, %r6193; + xor.b32 %r6200, %r6199, %r6195; + shf.l.wrap.b32 %r6201, %r6200, %r6200, 25; + add.s32 %r6202, %r6158, %r6029; + add.s32 %r6203, %r6202, %r6176; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 16; + add.s32 %r6206, %r6205, %r6187; + xor.b32 %r6207, %r6206, %r6176; + shf.l.wrap.b32 %r6208, %r6207, %r6207, 20; + add.s32 %r6209, %r6203, %r6208; + xor.b32 %r6210, %r6209, %r6205; + shf.l.wrap.b32 %r6211, %r6210, %r6210, 24; + add.s32 %r6212, %r6211, %r6206; + xor.b32 %r6213, %r6212, %r6208; + shf.l.wrap.b32 %r6214, %r6213, %r6213, 25; + add.s32 %r6215, %r6189, %r6171; + xor.b32 %r6216, %r6160, %r6215; + shf.l.wrap.b32 %r6217, %r6216, %r6216, 16; + add.s32 %r6218, %r6217, %r6199; + xor.b32 %r6219, %r6218, %r6189; + shf.l.wrap.b32 %r6220, %r6219, %r6219, 20; + add.s32 %r6221, %r6215, %r6220; + xor.b32 %r6222, %r6221, %r6217; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 24; + add.s32 %r6224, %r6223, %r6218; + xor.b32 %r6225, %r6224, %r6220; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 25; + add.s32 %r6227, %r6184, %r6201; + xor.b32 %r6228, %r6173, %r6227; + shf.l.wrap.b32 %r6229, %r6228, %r6228, 16; + add.s32 %r6230, %r6229, %r6161; + xor.b32 %r6231, %r6230, %r6201; + shf.l.wrap.b32 %r6232, %r6231, %r6231, 20; + add.s32 %r6233, %r6227, %r6232; + xor.b32 %r6234, %r6233, %r6229; + shf.l.wrap.b32 %r6235, %r6234, %r6234, 24; + add.s32 %r6236, %r6235, %r6230; + xor.b32 %r6237, %r6236, %r6232; + shf.l.wrap.b32 %r6238, %r6237, %r6237, 25; + add.s32 %r6239, %r6196, %r6163; + xor.b32 %r6240, %r6239, %r6186; + shf.l.wrap.b32 %r6241, %r6240, %r6240, 16; + add.s32 %r6242, %r6241, %r6174; + xor.b32 %r6243, %r6242, %r6163; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 20; + add.s32 %r6245, %r6239, %r6244; + xor.b32 %r6246, %r6245, %r6241; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 24; + add.s32 %r6248, %r6247, %r6242; + xor.b32 %r6249, %r6248, %r6244; + shf.l.wrap.b32 %r6250, %r6249, %r6249, 25; + add.s32 %r6251, %r6209, %r6045; + add.s32 %r6252, %r6251, %r6250; + xor.b32 %r6253, %r6252, %r6223; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 16; + add.s32 %r6255, %r6254, %r6236; + xor.b32 %r6256, %r6255, %r6250; + shf.l.wrap.b32 %r6257, %r6256, %r6256, 20; + add.s32 %r6258, %r6252, %r6257; + xor.b32 %r6259, %r6258, %r6254; + shf.l.wrap.b32 %r6260, %r6259, %r6259, 24; + add.s32 %r6261, %r6260, %r6255; + xor.b32 %r6262, %r6261, %r6257; + shf.l.wrap.b32 %r6263, %r6262, %r6262, 25; + add.s32 %r6264, %r6221, %r6214; + xor.b32 %r6265, %r6264, %r6235; + shf.l.wrap.b32 %r6266, %r6265, %r6265, 16; + add.s32 %r6267, %r6266, %r6248; + xor.b32 %r6268, %r6267, %r6214; + shf.l.wrap.b32 %r6269, %r6268, %r6268, 20; + add.s32 %r6270, %r6264, %r6269; + xor.b32 %r6271, %r6270, %r6266; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6267; + xor.b32 %r6274, %r6273, %r6269; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6233, %r6226; + xor.b32 %r6277, %r6247, %r6276; + shf.l.wrap.b32 %r6278, %r6277, %r6277, 16; + add.s32 %r6279, %r6278, %r6212; + xor.b32 %r6280, %r6279, %r6226; + shf.l.wrap.b32 %r6281, %r6280, %r6280, 20; + add.s32 %r6282, %r6276, %r6037; + add.s32 %r6283, %r6282, %r6281; + xor.b32 %r6284, %r6283, %r6278; + shf.l.wrap.b32 %r6285, %r6284, %r6284, 24; + 
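+ // [reviewer annotation -- assumption] The second compression repeats the
+ // same column/diagonal round structure but with far fewer message-word
+ // additions: only the last 8 header bytes plus the 8-byte nonce occupy
+ // the final 16-word block, so the remaining zero words have been folded
+ // away by the compiler.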
add.s32 %r6286, %r6285, %r6279; + xor.b32 %r6287, %r6286, %r6281; + shf.l.wrap.b32 %r6288, %r6287, %r6287, 25; + add.s32 %r6289, %r6245, %r6238; + xor.b32 %r6290, %r6211, %r6289; + shf.l.wrap.b32 %r6291, %r6290, %r6290, 16; + add.s32 %r6292, %r6291, %r6224; + xor.b32 %r6293, %r6292, %r6238; + shf.l.wrap.b32 %r6294, %r6293, %r6293, 20; + add.s32 %r6295, %r6289, %r6294; + xor.b32 %r6296, %r6295, %r6291; + shf.l.wrap.b32 %r6297, %r6296, %r6296, 24; + add.s32 %r6298, %r6297, %r6292; + xor.b32 %r6299, %r6298, %r6294; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 25; + add.s32 %r6301, %r6258, %r6275; + xor.b32 %r6302, %r6301, %r6297; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 16; + add.s32 %r6304, %r6303, %r6286; + xor.b32 %r6305, %r6304, %r6275; + shf.l.wrap.b32 %r6306, %r6305, %r6305, 20; + add.s32 %r6307, %r6301, %r6306; + xor.b32 %r6308, %r6307, %r6303; + shf.l.wrap.b32 %r6309, %r6308, %r6308, 24; + add.s32 %r6310, %r6309, %r6304; + xor.b32 %r6311, %r6310, %r6306; + shf.l.wrap.b32 %r6312, %r6311, %r6311, 25; + add.s32 %r6313, %r6288, %r6270; + xor.b32 %r6314, %r6260, %r6313; + shf.l.wrap.b32 %r6315, %r6314, %r6314, 16; + add.s32 %r6316, %r6315, %r6298; + xor.b32 %r6317, %r6316, %r6288; + shf.l.wrap.b32 %r6318, %r6317, %r6317, 20; + add.s32 %r6319, %r6313, %r6021; + add.s32 %r6320, %r6319, %r6318; + xor.b32 %r6321, %r6320, %r6315; + shf.l.wrap.b32 %r6322, %r6321, %r6321, 24; + add.s32 %r6323, %r6322, %r6316; + xor.b32 %r6324, %r6323, %r6318; + shf.l.wrap.b32 %r6325, %r6324, %r6324, 25; + add.s32 %r6326, %r6283, %r6300; + xor.b32 %r6327, %r6272, %r6326; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 16; + add.s32 %r6329, %r6328, %r6261; + xor.b32 %r6330, %r6329, %r6300; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 20; + add.s32 %r6332, %r6326, %r6331; + xor.b32 %r6333, %r6332, %r6328; + shf.l.wrap.b32 %r6334, %r6333, %r6333, 24; + add.s32 %r6335, %r6334, %r6329; + xor.b32 %r6336, %r6335, %r6331; + shf.l.wrap.b32 %r6337, %r6336, %r6336, 25; + add.s32 %r6338, %r6295, %r6263; + xor.b32 %r6339, %r6338, %r6285; + shf.l.wrap.b32 %r6340, %r6339, %r6339, 16; + add.s32 %r6341, %r6340, %r6273; + xor.b32 %r6342, %r6341, %r6263; + shf.l.wrap.b32 %r6343, %r6342, %r6342, 20; + add.s32 %r6344, %r6338, %r6029; + add.s32 %r6345, %r6344, %r6343; + xor.b32 %r6346, %r6345, %r6340; + shf.l.wrap.b32 %r6347, %r6346, %r6346, 24; + add.s32 %r6348, %r6347, %r6341; + xor.b32 %r6349, %r6348, %r6343; + shf.l.wrap.b32 %r6350, %r6349, %r6349, 25; + add.s32 %r6351, %r6307, %r6350; + xor.b32 %r6352, %r6351, %r6322; + shf.l.wrap.b32 %r6353, %r6352, %r6352, 16; + add.s32 %r6354, %r6353, %r6335; + xor.b32 %r6355, %r6354, %r6350; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 20; + add.s32 %r6357, %r6351, %r6356; + xor.b32 %r6358, %r6357, %r6353; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 24; + add.s32 %r6360, %r6359, %r6354; + xor.b32 %r6361, %r6360, %r6356; + shf.l.wrap.b32 %r6362, %r6361, %r6361, 25; + add.s32 %r6363, %r6320, %r6312; + xor.b32 %r6364, %r6363, %r6334; + shf.l.wrap.b32 %r6365, %r6364, %r6364, 16; + add.s32 %r6366, %r6365, %r6348; + xor.b32 %r6367, %r6366, %r6312; + shf.l.wrap.b32 %r6368, %r6367, %r6367, 20; + add.s32 %r6369, %r6363, %r6368; + xor.b32 %r6370, %r6369, %r6365; + shf.l.wrap.b32 %r6371, %r6370, %r6370, 24; + add.s32 %r6372, %r6371, %r6366; + xor.b32 %r6373, %r6372, %r6368; + shf.l.wrap.b32 %r6374, %r6373, %r6373, 25; + add.s32 %r6375, %r6332, %r6325; + xor.b32 %r6376, %r6347, %r6375; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6310; + xor.b32 %r6379, %r6378, %r6325; + shf.l.wrap.b32 %r6380, 
%r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6045; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6345, %r6337; + xor.b32 %r6389, %r6309, %r6388; + shf.l.wrap.b32 %r6390, %r6389, %r6389, 16; + add.s32 %r6391, %r6390, %r6323; + xor.b32 %r6392, %r6391, %r6337; + shf.l.wrap.b32 %r6393, %r6392, %r6392, 20; + add.s32 %r6394, %r6388, %r6393; + xor.b32 %r6395, %r6394, %r6390; + shf.l.wrap.b32 %r6396, %r6395, %r6395, 24; + add.s32 %r6397, %r6396, %r6391; + xor.b32 %r6398, %r6397, %r6393; + shf.l.wrap.b32 %r6399, %r6398, %r6398, 25; + add.s32 %r6400, %r6357, %r6374; + xor.b32 %r6401, %r6400, %r6396; + shf.l.wrap.b32 %r6402, %r6401, %r6401, 16; + add.s32 %r6403, %r6402, %r6385; + xor.b32 %r6404, %r6403, %r6374; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 20; + add.s32 %r6406, %r6400, %r6021; + add.s32 %r6407, %r6406, %r6405; + xor.b32 %r6408, %r6407, %r6402; + shf.l.wrap.b32 %r6409, %r6408, %r6408, 24; + add.s32 %r6410, %r6409, %r6403; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 25; + add.s32 %r6413, %r6387, %r6369; + xor.b32 %r6414, %r6359, %r6413; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 16; + add.s32 %r6416, %r6415, %r6397; + xor.b32 %r6417, %r6416, %r6387; + shf.l.wrap.b32 %r6418, %r6417, %r6417, 20; + add.s32 %r6419, %r6413, %r6037; + add.s32 %r6420, %r6419, %r6418; + xor.b32 %r6421, %r6420, %r6415; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 24; + add.s32 %r6423, %r6422, %r6416; + xor.b32 %r6424, %r6423, %r6418; + shf.l.wrap.b32 %r6425, %r6424, %r6424, 25; + add.s32 %r6426, %r6382, %r6399; + xor.b32 %r6427, %r6371, %r6426; + shf.l.wrap.b32 %r6428, %r6427, %r6427, 16; + add.s32 %r6429, %r6428, %r6360; + xor.b32 %r6430, %r6429, %r6399; + shf.l.wrap.b32 %r6431, %r6430, %r6430, 20; + add.s32 %r6432, %r6426, %r6431; + xor.b32 %r6433, %r6432, %r6428; + shf.l.wrap.b32 %r6434, %r6433, %r6433, 24; + add.s32 %r6435, %r6434, %r6429; + xor.b32 %r6436, %r6435, %r6431; + shf.l.wrap.b32 %r6437, %r6436, %r6436, 25; + add.s32 %r6438, %r6394, %r6029; + add.s32 %r6439, %r6438, %r6362; + xor.b32 %r6440, %r6439, %r6384; + shf.l.wrap.b32 %r6441, %r6440, %r6440, 16; + add.s32 %r6442, %r6441, %r6372; + xor.b32 %r6443, %r6442, %r6362; + shf.l.wrap.b32 %r6444, %r6443, %r6443, 20; + add.s32 %r6445, %r6439, %r6444; + xor.b32 %r6446, %r6445, %r6441; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 24; + add.s32 %r6448, %r6447, %r6442; + xor.b32 %r6449, %r6448, %r6444; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 25; + add.s32 %r6451, %r6407, %r6450; + xor.b32 %r6452, %r6451, %r6422; + shf.l.wrap.b32 %r6453, %r6452, %r6452, 16; + add.s32 %r6454, %r6453, %r6435; + xor.b32 %r6455, %r6454, %r6450; + shf.l.wrap.b32 %r6456, %r6455, %r6455, 20; + add.s32 %r6457, %r6451, %r6456; + xor.b32 %r6458, %r6457, %r6453; + shf.l.wrap.b32 %r6459, %r6458, %r6458, 24; + add.s32 %r6460, %r6459, %r6454; + xor.b32 %r6461, %r6460, %r6456; + shf.l.wrap.b32 %r6462, %r6461, %r6461, 25; + add.s32 %r6463, %r6420, %r6412; + xor.b32 %r6464, %r6463, %r6434; + shf.l.wrap.b32 %r6465, %r6464, %r6464, 16; + add.s32 %r6466, %r6465, %r6448; + xor.b32 %r6467, %r6466, %r6412; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 20; + add.s32 %r6469, %r6463, %r6468; + xor.b32 %r6470, %r6469, %r6465; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 24; + add.s32 %r6472, %r6471, %r6466; + xor.b32 %r6473, %r6472, %r6468; + shf.l.wrap.b32 %r6474, %r6473, %r6473, 25; + 
add.s32 %r6475, %r6432, %r6425; + xor.b32 %r6476, %r6447, %r6475; + shf.l.wrap.b32 %r6477, %r6476, %r6476, 16; + add.s32 %r6478, %r6477, %r6410; + xor.b32 %r6479, %r6478, %r6425; + shf.l.wrap.b32 %r6480, %r6479, %r6479, 20; + add.s32 %r6481, %r6475, %r6480; + xor.b32 %r6482, %r6481, %r6477; + shf.l.wrap.b32 %r6483, %r6482, %r6482, 24; + add.s32 %r6484, %r6483, %r6478; + xor.b32 %r6485, %r6484, %r6480; + shf.l.wrap.b32 %r6486, %r6485, %r6485, 25; + add.s32 %r6487, %r6445, %r6437; + xor.b32 %r6488, %r6409, %r6487; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6423; + xor.b32 %r6491, %r6490, %r6437; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6492; + xor.b32 %r6494, %r6493, %r6489; + shf.l.wrap.b32 %r6495, %r6494, %r6494, 24; + add.s32 %r6496, %r6495, %r6490; + xor.b32 %r6497, %r6496, %r6492; + shf.l.wrap.b32 %r6498, %r6497, %r6497, 25; + add.s32 %r6499, %r6457, %r6474; + xor.b32 %r6500, %r6499, %r6495; + shf.l.wrap.b32 %r6501, %r6500, %r6500, 16; + add.s32 %r6502, %r6501, %r6484; + xor.b32 %r6503, %r6502, %r6474; + shf.l.wrap.b32 %r6504, %r6503, %r6503, 20; + add.s32 %r6505, %r6499, %r6037; + add.s32 %r6506, %r6505, %r6504; + xor.b32 %r6507, %r6506, %r6501; + shf.l.wrap.b32 %r6508, %r6507, %r6507, 24; + add.s32 %r6509, %r6508, %r6502; + xor.b32 %r6510, %r6509, %r6504; + shf.l.wrap.b32 %r6511, %r6510, %r6510, 25; + add.s32 %r6512, %r6486, %r6469; + xor.b32 %r6513, %r6459, %r6512; + shf.l.wrap.b32 %r6514, %r6513, %r6513, 16; + add.s32 %r6515, %r6514, %r6496; + xor.b32 %r6516, %r6515, %r6486; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 20; + add.s32 %r6518, %r6512, %r6045; + add.s32 %r6519, %r6518, %r6517; + xor.b32 %r6520, %r6519, %r6514; + shf.l.wrap.b32 %r6521, %r6520, %r6520, 24; + add.s32 %r6522, %r6521, %r6515; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 25; + add.s32 %r6525, %r6481, %r6021; + add.s32 %r6526, %r6525, %r6498; + xor.b32 %r6527, %r6471, %r6526; + shf.l.wrap.b32 %r6528, %r6527, %r6527, 16; + add.s32 %r6529, %r6528, %r6460; + xor.b32 %r6530, %r6529, %r6498; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 20; + add.s32 %r6532, %r6526, %r6029; + add.s32 %r6533, %r6532, %r6531; + xor.b32 %r6534, %r6533, %r6528; + shf.l.wrap.b32 %r6535, %r6534, %r6534, 24; + add.s32 %r6536, %r6535, %r6529; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 25; + add.s32 %r6539, %r6493, %r6462; + xor.b32 %r6540, %r6539, %r6483; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 16; + add.s32 %r6542, %r6541, %r6472; + xor.b32 %r6543, %r6542, %r6462; + shf.l.wrap.b32 %r6544, %r6543, %r6543, 20; + add.s32 %r6545, %r6539, %r6544; + xor.b32 %r6546, %r6545, %r6541; + shf.l.wrap.b32 %r6547, %r6546, %r6546, 24; + add.s32 %r6548, %r6547, %r6542; + xor.b32 %r6549, %r6548, %r6544; + shf.l.wrap.b32 %r6550, %r6549, %r6549, 25; + add.s32 %r6551, %r6506, %r6550; + xor.b32 %r6552, %r6551, %r6521; + shf.l.wrap.b32 %r6553, %r6552, %r6552, 16; + add.s32 %r6554, %r6553, %r6536; + xor.b32 %r6555, %r6554, %r6550; + shf.l.wrap.b32 %r6556, %r6555, %r6555, 20; + add.s32 %r6557, %r6551, %r6556; + xor.b32 %r6558, %r6557, %r6553; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 24; + add.s32 %r6560, %r6559, %r6554; + xor.b32 %r6561, %r6560, %r6556; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 25; + add.s32 %r6563, %r6519, %r6511; + xor.b32 %r6564, %r6563, %r6535; + shf.l.wrap.b32 %r6565, %r6564, %r6564, 16; + add.s32 %r6566, %r6565, %r6548; + xor.b32 %r6567, %r6566, %r6511; + shf.l.wrap.b32 %r6568, %r6567, %r6567, 20; + add.s32 %r6569, %r6563, 
%r6568; + xor.b32 %r6570, %r6569, %r6565; + shf.l.wrap.b32 %r6571, %r6570, %r6570, 24; + add.s32 %r6572, %r6571, %r6566; + xor.b32 %r6573, %r6572, %r6568; + shf.l.wrap.b32 %r6574, %r6573, %r6573, 25; + add.s32 %r6575, %r6533, %r6524; + xor.b32 %r6576, %r6547, %r6575; + shf.l.wrap.b32 %r6577, %r6576, %r6576, 16; + add.s32 %r6578, %r6577, %r6509; + xor.b32 %r6579, %r6578, %r6524; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 20; + add.s32 %r6581, %r6575, %r6580; + xor.b32 %r6582, %r6581, %r6577; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 24; + add.s32 %r6584, %r6583, %r6578; + xor.b32 %r6585, %r6584, %r6580; + shf.l.wrap.b32 %r6586, %r6585, %r6585, 25; + add.s32 %r6587, %r6545, %r6538; + xor.b32 %r6588, %r6508, %r6587; + shf.l.wrap.b32 %r6589, %r6588, %r6588, 16; + add.s32 %r6590, %r6589, %r6522; + xor.b32 %r6591, %r6590, %r6538; + shf.l.wrap.b32 %r6592, %r6591, %r6591, 20; + add.s32 %r6593, %r6587, %r6029; + add.s32 %r6594, %r6593, %r6592; + xor.b32 %r6595, %r6594, %r6589; + shf.l.wrap.b32 %r6596, %r6595, %r6595, 24; + add.s32 %r6597, %r6596, %r6590; + xor.b32 %r6598, %r6597, %r6592; + shf.l.wrap.b32 %r6599, %r6598, %r6598, 25; + add.s32 %r6600, %r6557, %r6574; + xor.b32 %r6601, %r6600, %r6596; + shf.l.wrap.b32 %r6602, %r6601, %r6601, 16; + add.s32 %r6603, %r6602, %r6584; + xor.b32 %r6604, %r6603, %r6574; + shf.l.wrap.b32 %r6605, %r6604, %r6604, 20; + add.s32 %r6606, %r6600, %r6045; + add.s32 %r6607, %r6606, %r6605; + xor.b32 %r6608, %r6607, %r6602; + shf.l.wrap.b32 %r6609, %r6608, %r6608, 24; + add.s32 %r6610, %r6609, %r6603; + xor.b32 %r6611, %r6610, %r6605; + shf.l.wrap.b32 %r6612, %r6611, %r6611, 25; + add.s32 %r6613, %r6586, %r6021; + add.s32 %r6614, %r6613, %r6569; + xor.b32 %r6615, %r6559, %r6614; + shf.l.wrap.b32 %r6616, %r6615, %r6615, 16; + add.s32 %r6617, %r6616, %r6597; + xor.b32 %r6618, %r6617, %r6586; + shf.l.wrap.b32 %r6619, %r6618, %r6618, 20; + add.s32 %r6620, %r6614, %r6619; + xor.b32 %r6621, %r6620, %r6616; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6617; + xor.b32 %r6624, %r6623, %r6619; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6581, %r6037; + add.s32 %r6627, %r6626, %r6599; + xor.b32 %r6628, %r6571, %r6627; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6560; + xor.b32 %r6631, %r6630, %r6599; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6632; + xor.b32 %r6634, %r6633, %r6629; + shf.l.wrap.b32 %r6635, %r6634, %r6634, 24; + add.s32 %r6636, %r6635, %r6630; + xor.b32 %r6637, %r6636, %r6632; + shf.l.wrap.b32 %r6638, %r6637, %r6637, 25; + add.s32 %r6639, %r6594, %r6562; + xor.b32 %r6640, %r6639, %r6583; + shf.l.wrap.b32 %r6641, %r6640, %r6640, 16; + add.s32 %r6642, %r6641, %r6572; + xor.b32 %r6643, %r6642, %r6562; + shf.l.wrap.b32 %r6644, %r6643, %r6643, 20; + add.s32 %r6645, %r6639, %r6644; + xor.b32 %r6646, %r6645, %r6641; + shf.l.wrap.b32 %r6647, %r6646, %r6646, 24; + add.s32 %r6648, %r6647, %r6642; + xor.b32 %r6649, %r6648, %r6644; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 25; + add.s32 %r6651, %r6607, %r6650; + xor.b32 %r6652, %r6651, %r6622; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 16; + add.s32 %r6654, %r6653, %r6636; + xor.b32 %r6655, %r6654, %r6650; + shf.l.wrap.b32 %r6656, %r6655, %r6655, 20; + add.s32 %r6657, %r6651, %r6656; + xor.b32 %r6658, %r6657, %r6653; + shf.l.wrap.b32 %r6659, %r6658, %r6658, 24; + add.s32 %r6660, %r6659, %r6654; + xor.b32 %r6661, %r6660, %r6656; + shf.l.wrap.b32 %r6662, %r6661, %r6661, 25; + add.s32 %r6663, %r6620, %r6612; + xor.b32 %r6664, 
%r6663, %r6635; + shf.l.wrap.b32 %r6665, %r6664, %r6664, 16; + add.s32 %r6666, %r6665, %r6648; + xor.b32 %r6667, %r6666, %r6612; + shf.l.wrap.b32 %r6668, %r6667, %r6667, 20; + add.s32 %r6669, %r6663, %r6021; + add.s32 %r6670, %r6669, %r6668; + xor.b32 %r6671, %r6670, %r6665; + shf.l.wrap.b32 %r6672, %r6671, %r6671, 24; + add.s32 %r6673, %r6672, %r6666; + xor.b32 %r6674, %r6673, %r6668; + shf.l.wrap.b32 %r6675, %r6674, %r6674, 25; + add.s32 %r6676, %r6633, %r6029; + add.s32 %r6677, %r6676, %r6625; + xor.b32 %r6678, %r6647, %r6677; + shf.l.wrap.b32 %r6679, %r6678, %r6678, 16; + add.s32 %r6680, %r6679, %r6610; + xor.b32 %r6681, %r6680, %r6625; + shf.l.wrap.b32 %r6682, %r6681, %r6681, 20; + add.s32 %r6683, %r6677, %r6682; + xor.b32 %r6684, %r6683, %r6679; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 24; + add.s32 %r6686, %r6685, %r6680; + xor.b32 %r6687, %r6686, %r6682; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 25; + add.s32 %r6689, %r6645, %r6638; + xor.b32 %r6690, %r6609, %r6689; + shf.l.wrap.b32 %r6691, %r6690, %r6690, 16; + add.s32 %r6692, %r6691, %r6623; + xor.b32 %r6693, %r6692, %r6638; + shf.l.wrap.b32 %r6694, %r6693, %r6693, 20; + add.s32 %r6695, %r6689, %r6694; + xor.b32 %r6696, %r6695, %r6691; + shf.l.wrap.b32 %r6697, %r6696, %r6696, 24; + add.s32 %r6698, %r6697, %r6692; + xor.b32 %r6699, %r6698, %r6694; + shf.l.wrap.b32 %r6700, %r6699, %r6699, 25; + add.s32 %r6701, %r6657, %r6675; + xor.b32 %r6702, %r6701, %r6697; + shf.l.wrap.b32 %r6703, %r6702, %r6702, 16; + add.s32 %r6704, %r6703, %r6686; + xor.b32 %r6705, %r6704, %r6675; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 20; + add.s32 %r6707, %r6701, %r6706; + xor.b32 %r6708, %r6707, %r6703; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 24; + add.s32 %r6710, %r6709, %r6704; + xor.b32 %r6711, %r6710, %r6706; + shf.l.wrap.b32 %r6712, %r6711, %r6711, 25; + add.s32 %r6713, %r6688, %r6037; + add.s32 %r6714, %r6713, %r6670; + xor.b32 %r6715, %r6659, %r6714; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 16; + add.s32 %r6717, %r6716, %r6698; + xor.b32 %r6718, %r6717, %r6688; + shf.l.wrap.b32 %r6719, %r6718, %r6718, 20; + add.s32 %r6720, %r6714, %r6719; + xor.b32 %r6721, %r6720, %r6716; + shf.l.wrap.b32 %r6722, %r6721, %r6721, 24; + add.s32 %r6723, %r6722, %r6717; + xor.b32 %r6724, %r6723, %r6719; + shf.l.wrap.b32 %r6725, %r6724, %r6724, 25; + add.s32 %r6726, %r6683, %r6045; + add.s32 %r6727, %r6726, %r6700; + xor.b32 %r6728, %r6672, %r6727; + shf.l.wrap.b32 %r6729, %r6728, %r6728, 16; + add.s32 %r6730, %r6729, %r6660; + xor.b32 %r6731, %r6730, %r6700; + shf.l.wrap.b32 %r6732, %r6731, %r6731, 20; + add.s32 %r6733, %r6727, %r6732; + xor.b32 %r6734, %r6733, %r6729; + shf.l.wrap.b32 %r6735, %r6734, %r6734, 24; + add.s32 %r6736, %r6735, %r6730; + xor.b32 %r6737, %r6736, %r6732; + shf.l.wrap.b32 %r6738, %r6737, %r6737, 25; + add.s32 %r6739, %r6695, %r6662; + xor.b32 %r6740, %r6739, %r6685; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6673; + xor.b32 %r6743, %r6742, %r6662; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6744; + xor.b32 %r6746, %r6745, %r6741; + shf.l.wrap.b32 %r6747, %r6746, %r6746, 24; + add.s32 %r6748, %r6747, %r6742; + xor.b32 %r6749, %r6748, %r6744; + shf.l.wrap.b32 %r6750, %r6749, %r6749, 25; + xor.b32 %r6751, %r6707, %r6736; + cvt.u64.u32 %rd394, %r6751; + xor.b32 %r6752, %r6748, %r6720; + and.b32 %r6753, %r6752, 255; + cvt.u64.u32 %rd395, %r6753; + bfi.b64 %rd396, %rd395, %rd394, 32, 32; + cvt.u64.u32 %rd397, %r6752; + shl.b64 %rd398, %rd397, 32; + and.b64 %rd399, %rd398, 280375465082880; + 
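+ // [reviewer annotation -- assumption] The xor/bfi/and/or ladders starting
+ // above pack the 32-byte BLAKE3 output into four byte-swapped u64 lanes
+ // %rd1305..%rd1308 (the masks 280375465082880 = 0xFF<<40 and
+ // 71776119061217280 = 0xFF<<48 each pick out a single byte).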
or.b64 %rd400, %rd396, %rd399; + and.b64 %rd401, %rd398, 71776119061217280; + shr.u32 %r6754, %r6752, 24; + cvt.u64.u32 %rd402, %r6754; + shl.b64 %rd403, %rd402, 56; + or.b64 %rd404, %rd400, %rd401; + or.b64 %rd1308, %rd404, %rd403; + xor.b32 %r6755, %r6710, %r6733; + cvt.u64.u32 %rd405, %r6755; + xor.b32 %r6756, %r6745, %r6723; + and.b32 %r6757, %r6756, 255; + cvt.u64.u32 %rd406, %r6757; + bfi.b64 %rd407, %rd406, %rd405, 32, 32; + cvt.u64.u32 %rd408, %r6756; + shl.b64 %rd409, %rd408, 32; + and.b64 %rd410, %rd409, 280375465082880; + or.b64 %rd411, %rd407, %rd410; + and.b64 %rd412, %rd409, 71776119061217280; + shr.u32 %r6758, %r6756, 24; + cvt.u64.u32 %rd413, %r6758; + shl.b64 %rd414, %rd413, 56; + or.b64 %rd415, %rd411, %rd412; + or.b64 %rd1307, %rd415, %rd414; + xor.b32 %r6759, %r6750, %r6722; + cvt.u64.u32 %rd416, %r6759; + xor.b32 %r6760, %r6712, %r6735; + and.b32 %r6761, %r6760, 255; + cvt.u64.u32 %rd417, %r6761; + bfi.b64 %rd418, %rd417, %rd416, 32, 32; + cvt.u64.u32 %rd419, %r6760; + shl.b64 %rd420, %rd419, 32; + and.b64 %rd421, %rd420, 280375465082880; + or.b64 %rd422, %rd418, %rd421; + and.b64 %rd423, %rd420, 71776119061217280; + shr.u32 %r6762, %r6760, 24; + cvt.u64.u32 %rd424, %r6762; + shl.b64 %rd425, %rd424, 56; + or.b64 %rd426, %rd422, %rd423; + or.b64 %rd1306, %rd426, %rd425; + xor.b32 %r6763, %r6747, %r6725; + cvt.u64.u32 %rd427, %r6763; + xor.b32 %r6764, %r6709, %r6738; + and.b32 %r6765, %r6764, 255; + cvt.u64.u32 %rd428, %r6765; + bfi.b64 %rd429, %rd428, %rd427, 32, 32; + cvt.u64.u32 %rd430, %r6764; + shl.b64 %rd431, %rd430, 32; + and.b64 %rd432, %rd431, 280375465082880; + or.b64 %rd433, %rd429, %rd432; + and.b64 %rd434, %rd431, 71776119061217280; + shr.u32 %r6766, %r6764, 24; + cvt.u64.u32 %rd435, %r6766; + shl.b64 %rd436, %rd435, 56; + or.b64 %rd437, %rd433, %rd434; + or.b64 %rd1305, %rd437, %rd436; + add.u64 %rd1297, %SPL, 2000; + mov.u64 %rd1301, 0; + mov.u32 %r29819, 0; + st.local.v4.u32 [%rd1297+32], {%r29819, %r29819, %r29819, %r29819}; + st.local.v4.u32 [%rd1297+48], {%r29819, %r29819, %r29819, %r29819}; + st.local.v4.u32 [%rd1297+64], {%r29819, %r29819, %r29819, %r29819}; + st.local.v2.u64 [%rd1297], {%rd1308, %rd1307}; + st.local.v2.u64 [%rd1297+16], {%rd1306, %rd1305}; + mov.u64 %rd1302, %rd1301; + mov.u64 %rd1303, %rd1301; + mov.u64 %rd1304, %rd1301; + mov.u64 %rd1309, %rd1301; + mov.u64 %rd1310, %rd1301; + mov.u64 %rd1311, %rd1301; + mov.u64 %rd1312, %rd1301; + mov.u64 %rd1313, %rd1305; + mov.u64 %rd1314, %rd1306; + mov.u64 %rd1315, %rd1307; + mov.u64 %rd1316, %rd1308; + +$L__BB2_11: + mov.b64 {%r12, %r13}, %rd1316; + xor.b32 %r6768, %r13, %r12; + mov.b64 {%r14, %r15}, %rd1315; + xor.b32 %r6769, %r6768, %r14; + xor.b32 %r6770, %r6769, %r15; + mov.b64 {%r16, %r17}, %rd1314; + xor.b32 %r6771, %r17, %r16; + mov.b64 {%r18, %r19}, %rd1313; + xor.b32 %r6772, %r6771, %r18; + xor.b32 %r6773, %r6772, %r19; + mov.b64 {%r20, %r21}, %rd1312; + xor.b32 %r6774, %r21, %r20; + mov.b64 {%r22, %r23}, %rd1311; + xor.b32 %r6775, %r6774, %r22; + xor.b32 %r6776, %r6775, %r23; + mov.b64 {%r24, %r25}, %rd1310; + xor.b32 %r6777, %r25, %r24; + mov.b64 {%r26, %r27}, %rd1309; + xor.b32 %r6778, %r6777, %r26; + xor.b32 %r6779, %r6778, %r27; + mov.b64 {%r28, %r29}, %rd1308; + xor.b32 %r6780, %r29, %r28; + mov.b64 {%r30, %r31}, %rd1307; + xor.b32 %r6781, %r6780, %r30; + xor.b32 %r6782, %r6781, %r31; + mov.b64 {%r32, %r33}, %rd1306; + xor.b32 %r6783, %r33, %r32; + mov.b64 {%r34, %r35}, %rd1305; + xor.b32 %r6784, %r6783, %r34; + xor.b32 %r6785, %r6784, %r35; + mov.b64 {%r36, %r37}, %rd1304; + 
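+ // This block ($L__BB2_11) xor-folds each 64-bit state lane down to 32 bits
+ // (low half xor high half), combines the lanes, and then reduces the
+ // results modulo 37748717 via the multiply-high trick (magic constant
+ // 1908875315, shift by 56).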
xor.b32 %r6786, %r37, %r36; + mov.b64 {%r38, %r39}, %rd1303; + xor.b32 %r6787, %r6786, %r38; + xor.b32 %r6788, %r6787, %r39; + mov.b64 {%r40, %r41}, %rd1302; + xor.b32 %r6789, %r41, %r40; + mov.b64 {%r42, %r43}, %rd1301; + xor.b32 %r6790, %r6789, %r42; + xor.b32 %r6791, %r6790, %r43; + xor.b32 %r6792, %r6779, %r6770; + xor.b32 %r6793, %r6792, %r6788; + mul.wide.u32 %rd446, %r6793, 1908875315; + shr.u64 %rd447, %rd446, 56; + cvt.u32.u64 %r6794, %rd447; + mul.lo.s32 %r6795, %r6794, 37748717; + sub.s32 %r44, %r6793, %r6795; + xor.b32 %r6796, %r6782, %r6773; + xor.b32 %r6797, %r6796, %r6791; + mul.wide.u32 %rd448, %r6797, 1908875315; + shr.u64 %rd449, %rd448, 56; + cvt.u32.u64 %r6798, %rd449; + mul.lo.s32 %r6799, %r6798, 37748717; + sub.s32 %r45, %r6797, %r6799; + xor.b32 %r6800, %r6776, %r29819; + xor.b32 %r6801, %r6800, %r6785; + mul.wide.u32 %rd450, %r6801, 1908875315; + shr.u64 %rd451, %rd450, 56; + cvt.u32.u64 %r6802, %rd451; + mul.lo.s32 %r6803, %r6802, 37748717; + sub.s32 %r46, %r6801, %r6803; + shl.b32 %r47, %r44, 1; + mul.wide.u32 %rd452, %r47, -954391867; + shr.u64 %rd453, %rd452, 32; + cvt.u32.u64 %r6804, %rd453; + sub.s32 %r6805, %r47, %r6804; + shr.u32 %r6806, %r6805, 1; + add.s32 %r6807, %r6806, %r6804; + shr.u32 %r6808, %r6807, 20; + mul.lo.s32 %r6809, %r6808, 1179641; + sub.s32 %r6810, %r47, %r6809; + cvta.to.global.u64 %rd454, %rd361; + mul.wide.u32 %rd455, %r6810, 64; + add.s64 %rd32, %rd454, %rd455; + or.b32 %r48, %r47, 1; + mul.wide.u32 %rd456, %r48, -954391867; + shr.u64 %rd457, %rd456, 32; + cvt.u32.u64 %r6811, %rd457; + sub.s32 %r6812, %r48, %r6811; + shr.u32 %r6813, %r6812, 1; + add.s32 %r6814, %r6813, %r6811; + shr.u32 %r6815, %r6814, 20; + mul.lo.s32 %r6816, %r6815, 1179641; + sub.s32 %r6817, %r48, %r6816; + mul.wide.u32 %rd458, %r6817, 64; + add.s64 %rd33, %rd454, %rd458; + setp.eq.s64 %p12, %rd360, 0; + @%p12 bra $L__BB2_25; + + cvta.to.global.u64 %rd459, %rd360; + mul.wide.u32 %rd460, %r44, 128; + add.s64 %rd34, %rd459, %rd460; + ld.global.u64 %rd1317, [%rd34]; + setp.eq.s64 %p13, %rd1317, 0; + @%p13 bra $L__BB2_14; + + ld.global.u64 %rd1332, [%rd34+120]; + ld.global.u64 %rd1331, [%rd34+112]; + ld.global.u64 %rd1330, [%rd34+104]; + ld.global.u64 %rd1329, [%rd34+96]; + ld.global.u64 %rd1328, [%rd34+88]; + ld.global.u64 %rd1327, [%rd34+80]; + ld.global.u64 %rd1326, [%rd34+72]; + ld.global.u64 %rd1325, [%rd34+64]; + ld.global.u64 %rd1324, [%rd34+56]; + ld.global.u64 %rd1323, [%rd34+48]; + ld.global.u64 %rd1322, [%rd34+40]; + ld.global.u64 %rd1321, [%rd34+32]; + ld.global.u64 %rd1320, [%rd34+24]; + ld.global.u64 %rd1319, [%rd34+16]; + ld.global.u64 %rd1318, [%rd34+8]; + bra.uni $L__BB2_36; + +$L__BB2_25: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd562, 1179641; + st.local.u64 [%rd2+8], %rd562; + st.local.u32 [%rd2+16], %r47; + ld.global.u64 %rd563, [%rd32]; + ld.global.u64 %rd564, [%rd32+8]; + ld.global.u64 %rd565, [%rd32+16]; + ld.global.u64 %rd566, [%rd32+24]; + ld.global.u64 %rd567, [%rd32+32]; + ld.global.u64 %rd568, [%rd32+40]; + ld.global.u64 %rd569, [%rd32+48]; + ld.global.u64 %rd570, [%rd32+56]; + st.local.u64 [%rd2+24], %rd563; + st.local.u64 [%rd2+32], %rd564; + st.local.u64 [%rd2+40], %rd565; + st.local.u64 [%rd2+48], %rd566; + st.local.u64 [%rd2+56], %rd567; + st.local.u64 [%rd2+64], %rd568; + st.local.u64 [%rd2+72], %rd569; + st.local.u64 [%rd2+80], %rd570; + cvt.u32.u64 %r10143, %rd563; + xor.b32 %r10144, %r47, %r10143; + st.local.u32 [%rd2+24], %r10144; + mov.u32 %r30057, 0; + st.local.v2.u32 [%rd2+96], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+104], 
{%r30057, %r30057}; + st.local.v2.u32 [%rd2+112], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+120], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+128], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+136], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+144], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+152], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+160], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+168], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+176], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+184], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+192], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+200], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+208], {%r30057, %r30057}; + st.local.v2.u32 [%rd2+216], {%r30057, %r30057}; + mov.u32 %r30072, -2147483648; + mov.u32 %r10116, 1; + st.local.v2.u32 [%rd2+88], {%r10116, %r30072}; + ld.local.v2.u32 {%r30093, %r30094}, [%rd2+24]; + mov.b64 {%r30091, %r30092}, %rd568; + shr.u64 %rd571, %rd564, 32; + cvt.u32.u64 %r30105, %rd564; + cvt.u32.u64 %r30106, %rd571; + shr.u64 %rd572, %rd569, 32; + cvt.u32.u64 %r30103, %rd569; + cvt.u32.u64 %r30104, %rd572; + shr.u64 %rd573, %rd565, 32; + cvt.u32.u64 %r30101, %rd565; + cvt.u32.u64 %r30102, %rd573; + shr.u64 %rd574, %rd570, 32; + cvt.u32.u64 %r30099, %rd570; + cvt.u32.u64 %r30100, %rd574; + shr.u64 %rd575, %rd566, 32; + cvt.u32.u64 %r30097, %rd566; + cvt.u32.u64 %r30098, %rd575; + shr.u64 %rd576, %rd567, 32; + cvt.u32.u64 %r30095, %rd567; + cvt.u32.u64 %r30096, %rd576; + mov.u32 %r30058, %r30057; + mov.u32 %r30059, %r30057; + mov.u32 %r30060, %r30057; + mov.u32 %r30061, %r30057; + mov.u32 %r30062, %r30057; + mov.u32 %r30063, %r30057; + mov.u32 %r30064, %r30057; + mov.u32 %r30065, %r30057; + mov.u32 %r30066, %r30057; + mov.u32 %r30067, %r30057; + mov.u32 %r30068, %r30057; + mov.u32 %r30069, %r30057; + mov.u32 %r30070, %r30057; + mov.u32 %r30071, %r10116; + mov.u32 %r30073, %r30057; + mov.u32 %r30074, %r30057; + mov.u32 %r30075, %r30057; + mov.u32 %r30076, %r30057; + mov.u32 %r30077, %r30057; + mov.u32 %r30078, %r30057; + mov.u32 %r30079, %r30057; + mov.u32 %r30080, %r30057; + mov.u32 %r30081, %r30057; + mov.u32 %r30082, %r30057; + mov.u32 %r30083, %r30057; + mov.u32 %r30084, %r30057; + mov.u32 %r30085, %r30057; + mov.u32 %r30086, %r30057; + mov.u32 %r30087, %r30057; + mov.u32 %r30088, %r30057; + mov.u32 %r30089, %r30057; + mov.u32 %r30090, %r30057; + mov.u32 %r30107, %r30057; + +$L__BB2_26: + // begin inline asm + // xor5 + lop3.b32 %r10147, %r30093, %r30091, %r30089, 0x96; + lop3.b32 %r10147, %r10147, %r30087, %r30085, 0x96; + lop3.b32 %r10148, %r30094, %r30092, %r30090, 0x96; + lop3.b32 %r10148, %r10148, %r30088, %r30086, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10159, %r30105, %r30103, %r30083, 0x96; + lop3.b32 %r10159, %r10159, %r30081, %r30079, 0x96; + lop3.b32 %r10160, %r30106, %r30104, %r30084, 0x96; + lop3.b32 %r10160, %r10160, %r30082, %r30080, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10171, %r30101, %r30099, %r30077, 0x96; + lop3.b32 %r10171, %r10171, %r30075, %r30073, 0x96; + lop3.b32 %r10172, %r30102, %r30100, %r30078, 0x96; + lop3.b32 %r10172, %r10172, %r30076, %r30074, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10183, %r30097, %r30071, %r30069, 0x96; + lop3.b32 %r10183, %r10183, %r30067, %r30065, 0x96; + lop3.b32 %r10184, %r30098, %r30072, %r30070, 0x96; + lop3.b32 %r10184, %r10184, %r30068, %r30066, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10195, %r30095, %r30063, %r30061, 0x96; + lop3.b32 
%r10195, %r10195, %r30059, %r30057, 0x96; + lop3.b32 %r10196, %r30096, %r30064, %r30062, 0x96; + lop3.b32 %r10196, %r10196, %r30060, %r30058, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10207, %r10160, %r10159, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10211, %r10159, %r10160, %r10116; + // end inline asm + xor.b32 %r10641, %r10207, %r10195; + xor.b32 %r10642, %r10211, %r10196; + xor.b32 %r10474, %r30093, %r10641; + xor.b32 %r10477, %r30094, %r10642; + xor.b32 %r10381, %r30091, %r10641; + xor.b32 %r10380, %r30092, %r10642; + xor.b32 %r10428, %r30089, %r10641; + xor.b32 %r10429, %r30090, %r10642; + xor.b32 %r10333, %r30087, %r10641; + xor.b32 %r10332, %r30088, %r10642; + xor.b32 %r10284, %r30085, %r10641; + xor.b32 %r10285, %r30086, %r10642; + // begin inline asm + shf.l.wrap.b32 %r10215, %r10172, %r10171, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10219, %r10171, %r10172, %r10116; + // end inline asm + xor.b32 %r10643, %r10215, %r10147; + xor.b32 %r10644, %r10219, %r10148; + xor.b32 %r10436, %r30105, %r10643; + xor.b32 %r10437, %r30106, %r10644; + xor.b32 %r10253, %r30103, %r10643; + xor.b32 %r10252, %r30104, %r10644; + xor.b32 %r10412, %r30083, %r10643; + xor.b32 %r10413, %r30084, %r10644; + xor.b32 %r10373, %r30081, %r10643; + xor.b32 %r10372, %r30082, %r10644; + xor.b32 %r10356, %r30079, %r10643; + xor.b32 %r10357, %r30080, %r10644; + // begin inline asm + shf.l.wrap.b32 %r10223, %r10184, %r10183, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10227, %r10183, %r10184, %r10116; + // end inline asm + xor.b32 %r10645, %r10223, %r10159; + xor.b32 %r10646, %r10227, %r10160; + xor.b32 %r10293, %r30101, %r10645; + xor.b32 %r10292, %r30102, %r10646; + xor.b32 %r10420, %r30099, %r10645; + xor.b32 %r10421, %r30100, %r10646; + xor.b32 %r10301, %r30077, %r10645; + xor.b32 %r10300, %r30078, %r10646; + xor.b32 %r10404, %r30075, %r10645; + xor.b32 %r10405, %r30076, %r10646; + xor.b32 %r10269, %r30073, %r10645; + xor.b32 %r10268, %r30074, %r10646; + // begin inline asm + shf.l.wrap.b32 %r10231, %r10196, %r10195, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10235, %r10195, %r10196, %r10116; + // end inline asm + xor.b32 %r10647, %r10231, %r10171; + xor.b32 %r10648, %r10235, %r10172; + xor.b32 %r10388, %r30097, %r10647; + xor.b32 %r10389, %r30098, %r10648; + xor.b32 %r10365, %r30071, %r10647; + xor.b32 %r10364, %r30072, %r10648; + xor.b32 %r10308, %r30069, %r10647; + xor.b32 %r10309, %r30070, %r10648; + xor.b32 %r10396, %r30067, %r10647; + xor.b32 %r10397, %r30068, %r10648; + xor.b32 %r10325, %r30065, %r10647; + xor.b32 %r10324, %r30066, %r10648; + // begin inline asm + shf.l.wrap.b32 %r10239, %r10148, %r10147, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10243, %r10147, %r10148, %r10116; + // end inline asm + xor.b32 %r10649, %r10239, %r10183; + xor.b32 %r10650, %r10243, %r10184; + xor.b32 %r10340, %r30095, %r10649; + xor.b32 %r10341, %r30096, %r10650; + xor.b32 %r10260, %r30063, %r10649; + xor.b32 %r10261, %r30064, %r10650; + xor.b32 %r10277, %r30061, %r10649; + xor.b32 %r10276, %r30062, %r10650; + xor.b32 %r10316, %r30059, %r10649; + xor.b32 %r10317, %r30060, %r10650; + xor.b32 %r10348, %r30057, %r10649; + xor.b32 %r10349, %r30058, %r10650; + mov.u32 %r10254, 44; + // begin inline asm + shf.l.wrap.b32 %r10247, %r10253, %r10252, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10251, %r10252, %r10253, %r10254; + // end inline asm + 
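+ // The per-lane rotation amounts used here (44, 20, 61, 39, 18, 62, 43, 25,
+ // 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3) are the Keccak
+ // rho offsets; together with the xor5 (theta) and chi lop3 sequences and
+ // the keccak_round_constants table, $L__BB2_26 is one Keccak-f[1600]
+ // round, looped 23 times with a final partially unrolled 24th round after.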
mov.u32 %r10262, 20; + // begin inline asm + shf.l.wrap.b32 %r10255, %r10261, %r10260, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10259, %r10260, %r10261, %r10262; + // end inline asm + mov.u32 %r10270, 61; + // begin inline asm + shf.l.wrap.b32 %r10263, %r10269, %r10268, %r10270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10267, %r10268, %r10269, %r10270; + // end inline asm + mov.u32 %r10278, 39; + // begin inline asm + shf.l.wrap.b32 %r10271, %r10277, %r10276, %r10278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10275, %r10276, %r10277, %r10278; + // end inline asm + mov.u32 %r10286, 18; + // begin inline asm + shf.l.wrap.b32 %r10279, %r10285, %r10284, %r10286; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10283, %r10284, %r10285, %r10286; + // end inline asm + mov.u32 %r10294, 62; + // begin inline asm + shf.l.wrap.b32 %r10287, %r10293, %r10292, %r10294; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10291, %r10292, %r10293, %r10294; + // end inline asm + mov.u32 %r10302, 43; + // begin inline asm + shf.l.wrap.b32 %r10295, %r10301, %r10300, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10299, %r10300, %r10301, %r10302; + // end inline asm + mov.u32 %r10310, 25; + // begin inline asm + shf.l.wrap.b32 %r10303, %r10309, %r10308, %r10310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10307, %r10308, %r10309, %r10310; + // end inline asm + mov.u32 %r10318, 8; + // begin inline asm + shf.l.wrap.b32 %r10311, %r10317, %r10316, %r10318; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10315, %r10316, %r10317, %r10318; + // end inline asm + mov.u32 %r10326, 56; + // begin inline asm + shf.l.wrap.b32 %r10319, %r10325, %r10324, %r10326; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10323, %r10324, %r10325, %r10326; + // end inline asm + mov.u32 %r10334, 41; + // begin inline asm + shf.l.wrap.b32 %r10327, %r10333, %r10332, %r10334; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10331, %r10332, %r10333, %r10334; + // end inline asm + mov.u32 %r10342, 27; + // begin inline asm + shf.l.wrap.b32 %r10335, %r10341, %r10340, %r10342; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10339, %r10340, %r10341, %r10342; + // end inline asm + mov.u32 %r10350, 14; + // begin inline asm + shf.l.wrap.b32 %r10343, %r10349, %r10348, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10347, %r10348, %r10349, %r10350; + // end inline asm + mov.u32 %r10358, 2; + // begin inline asm + shf.l.wrap.b32 %r10351, %r10357, %r10356, %r10358; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10355, %r10356, %r10357, %r10358; + // end inline asm + mov.u32 %r10366, 55; + // begin inline asm + shf.l.wrap.b32 %r10359, %r10365, %r10364, %r10366; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10363, %r10364, %r10365, %r10366; + // end inline asm + mov.u32 %r10374, 45; + // begin inline asm + shf.l.wrap.b32 %r10367, %r10373, %r10372, %r10374; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10371, %r10372, %r10373, %r10374; + // end inline asm + mov.u32 %r10382, 36; + // begin inline asm + shf.l.wrap.b32 %r10375, %r10381, %r10380, %r10382; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10379, %r10380, %r10381, %r10382; + // end inline asm + mov.u32 %r10390, 28; + // begin inline asm + shf.l.wrap.b32 %r10383, %r10389, %r10388, %r10390; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r10387, %r10388, %r10389, %r10390; + // end inline asm + mov.u32 %r10398, 21; + // begin inline asm + shf.l.wrap.b32 %r10391, %r10397, %r10396, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10395, %r10396, %r10397, %r10398; + // end inline asm + mov.u32 %r10406, 15; + // begin inline asm + shf.l.wrap.b32 %r10399, %r10405, %r10404, %r10406; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10403, %r10404, %r10405, %r10406; + // end inline asm + mov.u32 %r10414, 10; + // begin inline asm + shf.l.wrap.b32 %r10407, %r10413, %r10412, %r10414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10411, %r10412, %r10413, %r10414; + // end inline asm + mov.u32 %r10422, 6; + // begin inline asm + shf.l.wrap.b32 %r10415, %r10421, %r10420, %r10422; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10419, %r10420, %r10421, %r10422; + // end inline asm + mov.u32 %r10430, 3; + // begin inline asm + shf.l.wrap.b32 %r10423, %r10429, %r10428, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10427, %r10428, %r10429, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10431, %r10437, %r10436, %r10116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10435, %r10436, %r10437, %r10116; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10439, %r10474, %r10247, %r10295, 0xD2; + lop3.b32 %r10440, %r10477, %r10251, %r10299, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30105, %r10247, %r10295, %r10391, 0xD2; + lop3.b32 %r30106, %r10251, %r10299, %r10395, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30101, %r10295, %r10391, %r10343, 0xD2; + lop3.b32 %r30102, %r10299, %r10395, %r10347, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30097, %r10391, %r10343, %r10474, 0xD2; + lop3.b32 %r30098, %r10395, %r10347, %r10477, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30095, %r10343, %r10474, %r10247, 0xD2; + lop3.b32 %r30096, %r10347, %r10477, %r10251, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30091, %r10383, %r10255, %r10423, 0xD2; + lop3.b32 %r30092, %r10387, %r10259, %r10427, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30103, %r10255, %r10423, %r10367, 0xD2; + lop3.b32 %r30104, %r10259, %r10427, %r10371, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30099, %r10423, %r10367, %r10263, 0xD2; + lop3.b32 %r30100, %r10427, %r10371, %r10267, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30071, %r10367, %r10263, %r10383, 0xD2; + lop3.b32 %r30072, %r10371, %r10267, %r10387, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30071, %r30072}; + // begin inline asm + // chi + lop3.b32 %r30063, %r10263, %r10383, %r10255, 0xD2; + lop3.b32 %r30064, %r10267, %r10387, %r10259, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30063, %r30064}; + // begin inline asm + // chi + lop3.b32 %r30089, %r10431, %r10415, %r10303, 0xD2; + lop3.b32 %r30090, %r10435, %r10419, %r10307, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30089, %r30090}; + // begin inline asm + // chi + lop3.b32 %r30083, %r10415, %r10303, %r10311, 0xD2; + lop3.b32 %r30084, %r10419, %r10307, %r10315, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30083, %r30084}; + // begin inline asm + // chi + lop3.b32 %r30077, %r10303, %r10311, %r10279, 0xD2; + lop3.b32 %r30078, %r10307, %r10315, %r10283, 0xD2; + // end 
inline asm + st.local.v2.u32 [%rd2+120], {%r30077, %r30078}; + // begin inline asm + // chi + lop3.b32 %r30069, %r10311, %r10279, %r10431, 0xD2; + lop3.b32 %r30070, %r10315, %r10283, %r10435, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30069, %r30070}; + // begin inline asm + // chi + lop3.b32 %r30061, %r10279, %r10431, %r10415, 0xD2; + lop3.b32 %r30062, %r10283, %r10435, %r10419, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30061, %r30062}; + // begin inline asm + // chi + lop3.b32 %r30087, %r10335, %r10375, %r10407, 0xD2; + lop3.b32 %r30088, %r10339, %r10379, %r10411, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30087, %r30088}; + // begin inline asm + // chi + lop3.b32 %r30081, %r10375, %r10407, %r10399, 0xD2; + lop3.b32 %r30082, %r10379, %r10411, %r10403, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30081, %r30082}; + // begin inline asm + // chi + lop3.b32 %r30075, %r10407, %r10399, %r10319, 0xD2; + lop3.b32 %r30076, %r10411, %r10403, %r10323, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30075, %r30076}; + // begin inline asm + // chi + lop3.b32 %r30067, %r10399, %r10319, %r10335, 0xD2; + lop3.b32 %r30068, %r10403, %r10323, %r10339, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30067, %r30068}; + // begin inline asm + // chi + lop3.b32 %r30059, %r10319, %r10335, %r10375, 0xD2; + lop3.b32 %r30060, %r10323, %r10339, %r10379, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30059, %r30060}; + // begin inline asm + // chi + lop3.b32 %r30085, %r10287, %r10359, %r10271, 0xD2; + lop3.b32 %r30086, %r10291, %r10363, %r10275, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30085, %r30086}; + // begin inline asm + // chi + lop3.b32 %r30079, %r10359, %r10271, %r10327, 0xD2; + lop3.b32 %r30080, %r10363, %r10275, %r10331, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30079, %r30080}; + // begin inline asm + // chi + lop3.b32 %r30073, %r10271, %r10327, %r10351, 0xD2; + lop3.b32 %r30074, %r10275, %r10331, %r10355, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30073, %r30074}; + // begin inline asm + // chi + lop3.b32 %r30065, %r10327, %r10351, %r10287, 0xD2; + lop3.b32 %r30066, %r10331, %r10355, %r10291, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30065, %r30066}; + // begin inline asm + // chi + lop3.b32 %r30057, %r10351, %r10287, %r10359, 0xD2; + lop3.b32 %r30058, %r10355, %r10291, %r10363, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30057, %r30058}; + mul.wide.s32 %rd578, %r30107, 8; + mov.u64 %rd579, keccak_round_constants; + cvta.const.u64 %rd580, %rd579; + add.s64 %rd577, %rd580, %rd578; + // begin inline asm + ld.global.nc.v2.u32 {%r10639,%r10640}, [%rd577]; + // end inline asm + xor.b32 %r30093, %r10439, %r10639; + xor.b32 %r30094, %r10440, %r10640; + add.s32 %r30107, %r30107, 1; + setp.lt.u32 %p19, %r30107, 23; + @%p19 bra $L__BB2_26; + + add.u64 %rd82, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30105, %r30106}; + st.local.v2.u32 [%rd2+72], {%r30103, %r30104}; + st.local.v2.u32 [%rd2+40], {%r30101, %r30102}; + st.local.v2.u32 [%rd2+80], {%r30099, %r30100}; + st.local.v2.u32 [%rd2+48], {%r30097, %r30098}; + st.local.v2.u32 [%rd2+56], {%r30095, %r30096}; + st.local.v2.u32 [%rd2+24], {%r30093, %r30094}; + // begin inline asm + // xor5 + lop3.b32 %r10651, %r30093, %r30091, %r30089, 0x96; + lop3.b32 %r10651, %r10651, %r30087, %r30085, 0x96; + lop3.b32 %r10652, %r30094, %r30092, %r30090, 0x96; + lop3.b32 %r10652, %r10652, %r30088, 
%r30086, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10663, %r30105, %r30103, %r30083, 0x96; + lop3.b32 %r10663, %r10663, %r30081, %r30079, 0x96; + lop3.b32 %r10664, %r30106, %r30104, %r30084, 0x96; + lop3.b32 %r10664, %r10664, %r30082, %r30080, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10675, %r30101, %r30099, %r30077, 0x96; + lop3.b32 %r10675, %r10675, %r30075, %r30073, 0x96; + lop3.b32 %r10676, %r30102, %r30100, %r30078, 0x96; + lop3.b32 %r10676, %r10676, %r30076, %r30074, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10687, %r30097, %r30071, %r30069, 0x96; + lop3.b32 %r10687, %r10687, %r30067, %r30065, 0x96; + lop3.b32 %r10688, %r30098, %r30072, %r30070, 0x96; + lop3.b32 %r10688, %r10688, %r30068, %r30066, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10699, %r30095, %r30063, %r30061, 0x96; + lop3.b32 %r10699, %r10699, %r30059, %r30057, 0x96; + lop3.b32 %r10700, %r30096, %r30064, %r30062, 0x96; + lop3.b32 %r10700, %r10700, %r30060, %r30058, 0x96; + // end inline asm + mov.u32 %r10903, 1; + // begin inline asm + shf.l.wrap.b32 %r10711, %r10664, %r10663, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10715, %r10663, %r10664, %r10903; + // end inline asm + xor.b32 %r10930, %r10711, %r10699; + xor.b32 %r10931, %r10715, %r10700; + xor.b32 %r10858, %r30093, %r10930; + xor.b32 %r10861, %r30094, %r10931; + xor.b32 %r10821, %r30090, %r10931; + xor.b32 %r10820, %r30089, %r10930; + st.local.v2.u32 [%rd2+104], {%r10820, %r10821}; + // begin inline asm + shf.l.wrap.b32 %r10719, %r10676, %r10675, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10723, %r10675, %r10676, %r10903; + // end inline asm + xor.b32 %r10932, %r10719, %r10651; + xor.b32 %r10933, %r10723, %r10652; + xor.b32 %r10757, %r30103, %r10932; + xor.b32 %r10756, %r30104, %r10933; + xor.b32 %r10796, %r30082, %r10933; + xor.b32 %r10797, %r30081, %r10932; + st.local.v2.u32 [%rd2+152], {%r10797, %r10796}; + // begin inline asm + shf.l.wrap.b32 %r10727, %r10688, %r10687, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10731, %r10687, %r10688, %r10903; + // end inline asm + xor.b32 %r10934, %r10727, %r10663; + xor.b32 %r10935, %r10731, %r10664; + xor.b32 %r10780, %r30078, %r10935; + xor.b32 %r10781, %r30077, %r10934; + st.local.v2.u32 [%rd2+120], {%r10781, %r10780}; + xor.b32 %r10772, %r30074, %r10935; + xor.b32 %r10773, %r30073, %r10934; + st.local.v2.u32 [%rd2+200], {%r10773, %r10772}; + // begin inline asm + shf.l.wrap.b32 %r10735, %r10700, %r10699, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10739, %r10699, %r10700, %r10903; + // end inline asm + xor.b32 %r10936, %r10735, %r10675; + xor.b32 %r10937, %r10739, %r10676; + xor.b32 %r10804, %r30097, %r10936; + xor.b32 %r10805, %r30098, %r10937; + xor.b32 %r10813, %r30068, %r10937; + xor.b32 %r10812, %r30067, %r10936; + st.local.v2.u32 [%rd2+168], {%r10812, %r10813}; + // begin inline asm + shf.l.wrap.b32 %r10743, %r10652, %r10651, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10747, %r10651, %r10652, %r10903; + // end inline asm + xor.b32 %r10938, %r10743, %r10687; + xor.b32 %r10939, %r10747, %r10688; + xor.b32 %r10764, %r30063, %r10938; + xor.b32 %r10765, %r30064, %r10939; + xor.b32 %r10789, %r30058, %r10939; + xor.b32 %r10788, %r30057, %r10938; + st.local.v2.u32 [%rd2+216], {%r10788, %r10789}; + // begin inline asm + shf.l.wrap.b32 %r10751, %r10757, %r10756, %r10254; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r10755, %r10756, %r10757, %r10254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10759, %r10765, %r10764, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10763, %r10764, %r10765, %r10262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10771, %r10772, %r10773, %r10270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10767, %r10773, %r10772, %r10270; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r10767, %r10771}; + // begin inline asm + shf.l.wrap.b32 %r10775, %r10781, %r10780, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10779, %r10780, %r10781, %r10302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10783, %r10789, %r10788, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10787, %r10788, %r10789, %r10350; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10795, %r10796, %r10797, %r10374; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10791, %r10797, %r10796, %r10374; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r10791, %r10795}; + // begin inline asm + shf.l.wrap.b32 %r10799, %r10805, %r10804, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10803, %r10804, %r10805, %r10390; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10807, %r10813, %r10812, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10811, %r10812, %r10813, %r10398; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10815, %r10821, %r10820, %r10430; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10819, %r10820, %r10821, %r10430; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10823, %r10858, %r10751, %r10775, 0xD2; + lop3.b32 %r10824, %r10861, %r10755, %r10779, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30240, %r10751, %r10775, %r10807, 0xD2; + lop3.b32 %r30241, %r10755, %r10779, %r10811, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30240, %r30241}; + // begin inline asm + // chi + lop3.b32 %r30236, %r10775, %r10807, %r10783, 0xD2; + lop3.b32 %r30237, %r10779, %r10811, %r10787, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30236, %r30237}; + // begin inline asm + // chi + lop3.b32 %r30232, %r10807, %r10783, %r10858, 0xD2; + lop3.b32 %r30233, %r10811, %r10787, %r10861, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30232, %r30233}; + // begin inline asm + // chi + lop3.b32 %r30230, %r10783, %r10858, %r10751, 0xD2; + lop3.b32 %r30231, %r10787, %r10861, %r10755, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30230, %r30231}; + // begin inline asm + // chi + lop3.b32 %r30226, %r10799, %r10759, %r10815, 0xD2; + lop3.b32 %r30227, %r10803, %r10763, %r10819, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30226, %r30227}; + // begin inline asm + // chi + lop3.b32 %r30238, %r10759, %r10815, %r10791, 0xD2; + lop3.b32 %r30239, %r10763, %r10819, %r10795, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30238, %r30239}; + // begin inline asm + // chi + lop3.b32 %r30234, %r10815, %r10791, %r10767, 0xD2; + lop3.b32 %r30235, %r10819, %r10795, %r10771, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30234, %r30235}; + add.s64 %rd581, %rd580, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r10887,%r10888}, [%rd581]; + // end inline asm + xor.b32 %r30228, %r10823, %r10887; + xor.b32 %r30229, %r10824, %r10888; + st.local.v2.u32 [%rd2+24], 
{%r30228, %r30229}; + st.local.u64 [%rd82], %rd361; + mov.u64 %rd585, 1179641; + st.local.u64 [%rd82+8], %rd585; + st.local.u32 [%rd82+16], %r48; + ld.global.u64 %rd586, [%rd33]; + ld.global.u64 %rd587, [%rd33+8]; + ld.global.u64 %rd588, [%rd33+16]; + ld.global.u64 %rd589, [%rd33+24]; + ld.global.u64 %rd590, [%rd33+32]; + ld.global.u64 %rd591, [%rd33+40]; + ld.global.u64 %rd592, [%rd33+48]; + ld.global.u64 %rd593, [%rd33+56]; + st.local.u64 [%rd82+32], %rd587; + st.local.u64 [%rd82+40], %rd588; + st.local.u64 [%rd82+48], %rd589; + st.local.u64 [%rd82+56], %rd590; + st.local.u64 [%rd82+64], %rd591; + st.local.u64 [%rd82+72], %rd592; + st.local.u64 [%rd82+80], %rd593; + cvt.u32.u64 %r10940, %rd586; + xor.b32 %r10941, %r48, %r10940; + st.local.u64 [%rd82+24], %rd586; + st.local.u32 [%rd82+24], %r10941; + mov.u32 %r30108, 0; + st.local.v2.u32 [%rd82+96], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+104], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+112], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+120], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+128], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+136], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+144], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+152], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+160], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+168], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+176], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+184], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+192], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+200], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+208], {%r30108, %r30108}; + st.local.v2.u32 [%rd82+216], {%r30108, %r30108}; + mov.u32 %r30123, -2147483648; + st.local.v2.u32 [%rd82+88], {%r10903, %r30123}; + ld.local.v2.u32 {%r30144, %r30145}, [%rd82+24]; + mov.b64 {%r30142, %r30143}, %rd591; + shr.u64 %rd594, %rd587, 32; + cvt.u32.u64 %r30156, %rd587; + cvt.u32.u64 %r30157, %rd594; + shr.u64 %rd595, %rd592, 32; + cvt.u32.u64 %r30154, %rd592; + cvt.u32.u64 %r30155, %rd595; + shr.u64 %rd596, %rd588, 32; + cvt.u32.u64 %r30152, %rd588; + cvt.u32.u64 %r30153, %rd596; + shr.u64 %rd597, %rd593, 32; + cvt.u32.u64 %r30150, %rd593; + cvt.u32.u64 %r30151, %rd597; + shr.u64 %rd598, %rd589, 32; + cvt.u32.u64 %r30148, %rd589; + cvt.u32.u64 %r30149, %rd598; + shr.u64 %rd599, %rd590, 32; + cvt.u32.u64 %r30146, %rd590; + cvt.u32.u64 %r30147, %rd599; + mov.u32 %r30109, %r30108; + mov.u32 %r30110, %r30108; + mov.u32 %r30111, %r30108; + mov.u32 %r30112, %r30108; + mov.u32 %r30113, %r30108; + mov.u32 %r30114, %r30108; + mov.u32 %r30115, %r30108; + mov.u32 %r30116, %r30108; + mov.u32 %r30117, %r30108; + mov.u32 %r30118, %r30108; + mov.u32 %r30119, %r30108; + mov.u32 %r30120, %r30108; + mov.u32 %r30121, %r30108; + mov.u32 %r30122, %r10903; + mov.u32 %r30124, %r30108; + mov.u32 %r30125, %r30108; + mov.u32 %r30126, %r30108; + mov.u32 %r30127, %r30108; + mov.u32 %r30128, %r30108; + mov.u32 %r30129, %r30108; + mov.u32 %r30130, %r30108; + mov.u32 %r30131, %r30108; + mov.u32 %r30132, %r30108; + mov.u32 %r30133, %r30108; + mov.u32 %r30134, %r30108; + mov.u32 %r30135, %r30108; + mov.u32 %r30136, %r30108; + mov.u32 %r30137, %r30108; + mov.u32 %r30138, %r30108; + mov.u32 %r30139, %r30108; + mov.u32 %r30140, %r30108; + mov.u32 %r30141, %r30108; + mov.u32 %r30158, %r30108; + +$L__BB2_28: + // begin inline asm + // xor5 + lop3.b32 %r10944, %r30144, %r30142, %r30140, 0x96; + lop3.b32 %r10944, %r10944, %r30138, %r30136, 0x96; + lop3.b32 %r10945, %r30145, %r30143, %r30141, 0x96; + lop3.b32 %r10945, %r10945, %r30139, %r30137, 0x96; + 
// end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10956, %r30156, %r30154, %r30134, 0x96; + lop3.b32 %r10956, %r10956, %r30132, %r30130, 0x96; + lop3.b32 %r10957, %r30157, %r30155, %r30135, 0x96; + lop3.b32 %r10957, %r10957, %r30133, %r30131, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10968, %r30152, %r30150, %r30128, 0x96; + lop3.b32 %r10968, %r10968, %r30126, %r30124, 0x96; + lop3.b32 %r10969, %r30153, %r30151, %r30129, 0x96; + lop3.b32 %r10969, %r10969, %r30127, %r30125, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10980, %r30148, %r30122, %r30120, 0x96; + lop3.b32 %r10980, %r10980, %r30118, %r30116, 0x96; + lop3.b32 %r10981, %r30149, %r30123, %r30121, 0x96; + lop3.b32 %r10981, %r10981, %r30119, %r30117, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10992, %r30146, %r30114, %r30112, 0x96; + lop3.b32 %r10992, %r10992, %r30110, %r30108, 0x96; + lop3.b32 %r10993, %r30147, %r30115, %r30113, 0x96; + lop3.b32 %r10993, %r10993, %r30111, %r30109, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11004, %r10957, %r10956, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11008, %r10956, %r10957, %r10903; + // end inline asm + xor.b32 %r11438, %r11004, %r10992; + xor.b32 %r11439, %r11008, %r10993; + xor.b32 %r11271, %r30144, %r11438; + xor.b32 %r11274, %r30145, %r11439; + xor.b32 %r11178, %r30142, %r11438; + xor.b32 %r11177, %r30143, %r11439; + xor.b32 %r11225, %r30140, %r11438; + xor.b32 %r11226, %r30141, %r11439; + xor.b32 %r11130, %r30138, %r11438; + xor.b32 %r11129, %r30139, %r11439; + xor.b32 %r11081, %r30136, %r11438; + xor.b32 %r11082, %r30137, %r11439; + // begin inline asm + shf.l.wrap.b32 %r11012, %r10969, %r10968, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11016, %r10968, %r10969, %r10903; + // end inline asm + xor.b32 %r11440, %r11012, %r10944; + xor.b32 %r11441, %r11016, %r10945; + xor.b32 %r11233, %r30156, %r11440; + xor.b32 %r11234, %r30157, %r11441; + xor.b32 %r11050, %r30154, %r11440; + xor.b32 %r11049, %r30155, %r11441; + xor.b32 %r11209, %r30134, %r11440; + xor.b32 %r11210, %r30135, %r11441; + xor.b32 %r11170, %r30132, %r11440; + xor.b32 %r11169, %r30133, %r11441; + xor.b32 %r11153, %r30130, %r11440; + xor.b32 %r11154, %r30131, %r11441; + // begin inline asm + shf.l.wrap.b32 %r11020, %r10981, %r10980, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11024, %r10980, %r10981, %r10903; + // end inline asm + xor.b32 %r11442, %r11020, %r10956; + xor.b32 %r11443, %r11024, %r10957; + xor.b32 %r11090, %r30152, %r11442; + xor.b32 %r11089, %r30153, %r11443; + xor.b32 %r11217, %r30150, %r11442; + xor.b32 %r11218, %r30151, %r11443; + xor.b32 %r11098, %r30128, %r11442; + xor.b32 %r11097, %r30129, %r11443; + xor.b32 %r11201, %r30126, %r11442; + xor.b32 %r11202, %r30127, %r11443; + xor.b32 %r11066, %r30124, %r11442; + xor.b32 %r11065, %r30125, %r11443; + // begin inline asm + shf.l.wrap.b32 %r11028, %r10993, %r10992, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11032, %r10992, %r10993, %r10903; + // end inline asm + xor.b32 %r11444, %r11028, %r10968; + xor.b32 %r11445, %r11032, %r10969; + xor.b32 %r11185, %r30148, %r11444; + xor.b32 %r11186, %r30149, %r11445; + xor.b32 %r11162, %r30122, %r11444; + xor.b32 %r11161, %r30123, %r11445; + xor.b32 %r11105, %r30120, %r11444; + xor.b32 %r11106, %r30121, %r11445; + xor.b32 %r11193, %r30118, %r11444; + xor.b32 %r11194, %r30119, %r11445; + xor.b32 %r11122, 
%r30116, %r11444; + xor.b32 %r11121, %r30117, %r11445; + // begin inline asm + shf.l.wrap.b32 %r11036, %r10945, %r10944, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11040, %r10944, %r10945, %r10903; + // end inline asm + xor.b32 %r11446, %r11036, %r10980; + xor.b32 %r11447, %r11040, %r10981; + xor.b32 %r11137, %r30146, %r11446; + xor.b32 %r11138, %r30147, %r11447; + xor.b32 %r11057, %r30114, %r11446; + xor.b32 %r11058, %r30115, %r11447; + xor.b32 %r11074, %r30112, %r11446; + xor.b32 %r11073, %r30113, %r11447; + xor.b32 %r11113, %r30110, %r11446; + xor.b32 %r11114, %r30111, %r11447; + xor.b32 %r11145, %r30108, %r11446; + xor.b32 %r11146, %r30109, %r11447; + mov.u32 %r11051, 44; + // begin inline asm + shf.l.wrap.b32 %r11044, %r11050, %r11049, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11048, %r11049, %r11050, %r11051; + // end inline asm + mov.u32 %r11059, 20; + // begin inline asm + shf.l.wrap.b32 %r11052, %r11058, %r11057, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11056, %r11057, %r11058, %r11059; + // end inline asm + mov.u32 %r11067, 61; + // begin inline asm + shf.l.wrap.b32 %r11060, %r11066, %r11065, %r11067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11064, %r11065, %r11066, %r11067; + // end inline asm + mov.u32 %r11075, 39; + // begin inline asm + shf.l.wrap.b32 %r11068, %r11074, %r11073, %r11075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11072, %r11073, %r11074, %r11075; + // end inline asm + mov.u32 %r11083, 18; + // begin inline asm + shf.l.wrap.b32 %r11076, %r11082, %r11081, %r11083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11080, %r11081, %r11082, %r11083; + // end inline asm + mov.u32 %r11091, 62; + // begin inline asm + shf.l.wrap.b32 %r11084, %r11090, %r11089, %r11091; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11088, %r11089, %r11090, %r11091; + // end inline asm + mov.u32 %r11099, 43; + // begin inline asm + shf.l.wrap.b32 %r11092, %r11098, %r11097, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11096, %r11097, %r11098, %r11099; + // end inline asm + mov.u32 %r11107, 25; + // begin inline asm + shf.l.wrap.b32 %r11100, %r11106, %r11105, %r11107; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11104, %r11105, %r11106, %r11107; + // end inline asm + mov.u32 %r11115, 8; + // begin inline asm + shf.l.wrap.b32 %r11108, %r11114, %r11113, %r11115; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11112, %r11113, %r11114, %r11115; + // end inline asm + mov.u32 %r11123, 56; + // begin inline asm + shf.l.wrap.b32 %r11116, %r11122, %r11121, %r11123; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11120, %r11121, %r11122, %r11123; + // end inline asm + mov.u32 %r11131, 41; + // begin inline asm + shf.l.wrap.b32 %r11124, %r11130, %r11129, %r11131; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11128, %r11129, %r11130, %r11131; + // end inline asm + mov.u32 %r11139, 27; + // begin inline asm + shf.l.wrap.b32 %r11132, %r11138, %r11137, %r11139; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11136, %r11137, %r11138, %r11139; + // end inline asm + mov.u32 %r11147, 14; + // begin inline asm + shf.l.wrap.b32 %r11140, %r11146, %r11145, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11144, %r11145, %r11146, %r11147; + // end inline asm + mov.u32 %r11155, 2; + // begin inline asm + shf.l.wrap.b32 %r11148, %r11154, %r11153, %r11155; 
+ // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11152, %r11153, %r11154, %r11155; + // end inline asm + mov.u32 %r11163, 55; + // begin inline asm + shf.l.wrap.b32 %r11156, %r11162, %r11161, %r11163; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11160, %r11161, %r11162, %r11163; + // end inline asm + mov.u32 %r11171, 45; + // begin inline asm + shf.l.wrap.b32 %r11164, %r11170, %r11169, %r11171; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11168, %r11169, %r11170, %r11171; + // end inline asm + mov.u32 %r11179, 36; + // begin inline asm + shf.l.wrap.b32 %r11172, %r11178, %r11177, %r11179; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11176, %r11177, %r11178, %r11179; + // end inline asm + mov.u32 %r11187, 28; + // begin inline asm + shf.l.wrap.b32 %r11180, %r11186, %r11185, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11184, %r11185, %r11186, %r11187; + // end inline asm + mov.u32 %r11195, 21; + // begin inline asm + shf.l.wrap.b32 %r11188, %r11194, %r11193, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11192, %r11193, %r11194, %r11195; + // end inline asm + mov.u32 %r11203, 15; + // begin inline asm + shf.l.wrap.b32 %r11196, %r11202, %r11201, %r11203; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11200, %r11201, %r11202, %r11203; + // end inline asm + mov.u32 %r11211, 10; + // begin inline asm + shf.l.wrap.b32 %r11204, %r11210, %r11209, %r11211; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11208, %r11209, %r11210, %r11211; + // end inline asm + mov.u32 %r11219, 6; + // begin inline asm + shf.l.wrap.b32 %r11212, %r11218, %r11217, %r11219; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11216, %r11217, %r11218, %r11219; + // end inline asm + mov.u32 %r11227, 3; + // begin inline asm + shf.l.wrap.b32 %r11220, %r11226, %r11225, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11224, %r11225, %r11226, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11228, %r11234, %r11233, %r10903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11232, %r11233, %r11234, %r10903; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r11236, %r11271, %r11044, %r11092, 0xD2; + lop3.b32 %r11237, %r11274, %r11048, %r11096, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30156, %r11044, %r11092, %r11188, 0xD2; + lop3.b32 %r30157, %r11048, %r11096, %r11192, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30152, %r11092, %r11188, %r11140, 0xD2; + lop3.b32 %r30153, %r11096, %r11192, %r11144, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30148, %r11188, %r11140, %r11271, 0xD2; + lop3.b32 %r30149, %r11192, %r11144, %r11274, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30146, %r11140, %r11271, %r11044, 0xD2; + lop3.b32 %r30147, %r11144, %r11274, %r11048, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30142, %r11180, %r11052, %r11220, 0xD2; + lop3.b32 %r30143, %r11184, %r11056, %r11224, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30154, %r11052, %r11220, %r11164, 0xD2; + lop3.b32 %r30155, %r11056, %r11224, %r11168, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30150, %r11220, %r11164, %r11060, 0xD2; + lop3.b32 %r30151, %r11224, %r11168, %r11064, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30122, %r11164, %r11060, %r11180, 
0xD2; + lop3.b32 %r30123, %r11168, %r11064, %r11184, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r30122, %r30123}; + // begin inline asm + // chi + lop3.b32 %r30114, %r11060, %r11180, %r11052, 0xD2; + lop3.b32 %r30115, %r11064, %r11184, %r11056, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r30114, %r30115}; + // begin inline asm + // chi + lop3.b32 %r30140, %r11228, %r11212, %r11100, 0xD2; + lop3.b32 %r30141, %r11232, %r11216, %r11104, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+104], {%r30140, %r30141}; + // begin inline asm + // chi + lop3.b32 %r30134, %r11212, %r11100, %r11108, 0xD2; + lop3.b32 %r30135, %r11216, %r11104, %r11112, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+112], {%r30134, %r30135}; + // begin inline asm + // chi + lop3.b32 %r30128, %r11100, %r11108, %r11076, 0xD2; + lop3.b32 %r30129, %r11104, %r11112, %r11080, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+120], {%r30128, %r30129}; + // begin inline asm + // chi + lop3.b32 %r30120, %r11108, %r11076, %r11228, 0xD2; + lop3.b32 %r30121, %r11112, %r11080, %r11232, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+128], {%r30120, %r30121}; + // begin inline asm + // chi + lop3.b32 %r30112, %r11076, %r11228, %r11212, 0xD2; + lop3.b32 %r30113, %r11080, %r11232, %r11216, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+136], {%r30112, %r30113}; + // begin inline asm + // chi + lop3.b32 %r30138, %r11132, %r11172, %r11204, 0xD2; + lop3.b32 %r30139, %r11136, %r11176, %r11208, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+144], {%r30138, %r30139}; + // begin inline asm + // chi + lop3.b32 %r30132, %r11172, %r11204, %r11196, 0xD2; + lop3.b32 %r30133, %r11176, %r11208, %r11200, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+152], {%r30132, %r30133}; + // begin inline asm + // chi + lop3.b32 %r30126, %r11204, %r11196, %r11116, 0xD2; + lop3.b32 %r30127, %r11208, %r11200, %r11120, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+160], {%r30126, %r30127}; + // begin inline asm + // chi + lop3.b32 %r30118, %r11196, %r11116, %r11132, 0xD2; + lop3.b32 %r30119, %r11200, %r11120, %r11136, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+168], {%r30118, %r30119}; + // begin inline asm + // chi + lop3.b32 %r30110, %r11116, %r11132, %r11172, 0xD2; + lop3.b32 %r30111, %r11120, %r11136, %r11176, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+176], {%r30110, %r30111}; + // begin inline asm + // chi + lop3.b32 %r30136, %r11084, %r11156, %r11068, 0xD2; + lop3.b32 %r30137, %r11088, %r11160, %r11072, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+184], {%r30136, %r30137}; + // begin inline asm + // chi + lop3.b32 %r30130, %r11156, %r11068, %r11124, 0xD2; + lop3.b32 %r30131, %r11160, %r11072, %r11128, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+192], {%r30130, %r30131}; + // begin inline asm + // chi + lop3.b32 %r30124, %r11068, %r11124, %r11148, 0xD2; + lop3.b32 %r30125, %r11072, %r11128, %r11152, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+200], {%r30124, %r30125}; + // begin inline asm + // chi + lop3.b32 %r30116, %r11124, %r11148, %r11084, 0xD2; + lop3.b32 %r30117, %r11128, %r11152, %r11088, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+208], {%r30116, %r30117}; + // begin inline asm + // chi + lop3.b32 %r30108, %r11148, %r11084, %r11156, 0xD2; + lop3.b32 %r30109, %r11152, %r11088, %r11160, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+216], {%r30108, %r30109}; + mul.wide.s32 %rd601, %r30158, 8; + add.s64 %rd600, %rd580, %rd601; + // begin inline asm + 
ld.global.nc.v2.u32 {%r11436,%r11437}, [%rd600]; + // end inline asm + xor.b32 %r30144, %r11236, %r11436; + xor.b32 %r30145, %r11237, %r11437; + add.s32 %r30158, %r30158, 1; + setp.lt.u32 %p20, %r30158, 23; + @%p20 bra $L__BB2_28; + + mov.u32 %r30191, 0; + mov.u32 %r11547, 1; + st.local.v2.u32 [%rd82+32], {%r30156, %r30157}; + st.local.v2.u32 [%rd82+72], {%r30154, %r30155}; + st.local.v2.u32 [%rd82+40], {%r30152, %r30153}; + st.local.v2.u32 [%rd82+80], {%r30150, %r30151}; + st.local.v2.u32 [%rd82+48], {%r30148, %r30149}; + st.local.v2.u32 [%rd82+56], {%r30146, %r30147}; + st.local.v2.u32 [%rd82+24], {%r30144, %r30145}; + // begin inline asm + // xor5 + lop3.b32 %r11448, %r30144, %r30142, %r30140, 0x96; + lop3.b32 %r11448, %r11448, %r30138, %r30136, 0x96; + lop3.b32 %r11449, %r30145, %r30143, %r30141, 0x96; + lop3.b32 %r11449, %r11449, %r30139, %r30137, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11460, %r30156, %r30154, %r30134, 0x96; + lop3.b32 %r11460, %r11460, %r30132, %r30130, 0x96; + lop3.b32 %r11461, %r30157, %r30155, %r30135, 0x96; + lop3.b32 %r11461, %r11461, %r30133, %r30131, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11472, %r30152, %r30150, %r30128, 0x96; + lop3.b32 %r11472, %r11472, %r30126, %r30124, 0x96; + lop3.b32 %r11473, %r30153, %r30151, %r30129, 0x96; + lop3.b32 %r11473, %r11473, %r30127, %r30125, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11484, %r30148, %r30122, %r30120, 0x96; + lop3.b32 %r11484, %r11484, %r30118, %r30116, 0x96; + lop3.b32 %r11485, %r30149, %r30123, %r30121, 0x96; + lop3.b32 %r11485, %r11485, %r30119, %r30117, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11496, %r30146, %r30114, %r30112, 0x96; + lop3.b32 %r11496, %r11496, %r30110, %r30108, 0x96; + lop3.b32 %r11497, %r30147, %r30115, %r30113, 0x96; + lop3.b32 %r11497, %r11497, %r30111, %r30109, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11508, %r11461, %r11460, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11512, %r11460, %r11461, %r11547; + // end inline asm + xor.b32 %r11687, %r11508, %r11496; + xor.b32 %r11688, %r11512, %r11497; + xor.b32 %r11655, %r30144, %r11687; + xor.b32 %r11658, %r30145, %r11688; + xor.b32 %r11618, %r30141, %r11688; + xor.b32 %r11617, %r30140, %r11687; + st.local.v2.u32 [%rd82+104], {%r11617, %r11618}; + // begin inline asm + shf.l.wrap.b32 %r11516, %r11473, %r11472, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11520, %r11472, %r11473, %r11547; + // end inline asm + xor.b32 %r11689, %r11516, %r11448; + xor.b32 %r11690, %r11520, %r11449; + xor.b32 %r11554, %r30154, %r11689; + xor.b32 %r11553, %r30155, %r11690; + xor.b32 %r11593, %r30133, %r11690; + xor.b32 %r11594, %r30132, %r11689; + st.local.v2.u32 [%rd82+152], {%r11594, %r11593}; + // begin inline asm + shf.l.wrap.b32 %r11524, %r11485, %r11484, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11528, %r11484, %r11485, %r11547; + // end inline asm + xor.b32 %r11691, %r11524, %r11460; + xor.b32 %r11692, %r11528, %r11461; + xor.b32 %r11577, %r30129, %r11692; + xor.b32 %r11578, %r30128, %r11691; + st.local.v2.u32 [%rd82+120], {%r11578, %r11577}; + xor.b32 %r11569, %r30125, %r11692; + xor.b32 %r11570, %r30124, %r11691; + st.local.v2.u32 [%rd82+200], {%r11570, %r11569}; + // begin inline asm + shf.l.wrap.b32 %r11532, %r11497, %r11496, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11536, %r11496, %r11497, 
%r11547; + // end inline asm + xor.b32 %r11693, %r11532, %r11472; + xor.b32 %r11694, %r11536, %r11473; + xor.b32 %r11601, %r30148, %r11693; + xor.b32 %r11602, %r30149, %r11694; + xor.b32 %r11610, %r30119, %r11694; + xor.b32 %r11609, %r30118, %r11693; + st.local.v2.u32 [%rd82+168], {%r11609, %r11610}; + // begin inline asm + shf.l.wrap.b32 %r11540, %r11449, %r11448, %r11547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11544, %r11448, %r11449, %r11547; + // end inline asm + xor.b32 %r11695, %r11540, %r11484; + xor.b32 %r11696, %r11544, %r11485; + xor.b32 %r11561, %r30114, %r11695; + xor.b32 %r11562, %r30115, %r11696; + xor.b32 %r11586, %r30109, %r11696; + xor.b32 %r11585, %r30108, %r11695; + st.local.v2.u32 [%rd82+216], {%r11585, %r11586}; + // begin inline asm + shf.l.wrap.b32 %r11548, %r11554, %r11553, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11552, %r11553, %r11554, %r11051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11556, %r11562, %r11561, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11560, %r11561, %r11562, %r11059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11568, %r11569, %r11570, %r11067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11564, %r11570, %r11569, %r11067; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r11564, %r11568}; + // begin inline asm + shf.l.wrap.b32 %r11572, %r11578, %r11577, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11576, %r11577, %r11578, %r11099; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11580, %r11586, %r11585, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11584, %r11585, %r11586, %r11147; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11592, %r11593, %r11594, %r11171; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11588, %r11594, %r11593, %r11171; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r11588, %r11592}; + // begin inline asm + shf.l.wrap.b32 %r11596, %r11602, %r11601, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11600, %r11601, %r11602, %r11187; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11604, %r11610, %r11609, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11608, %r11609, %r11610, %r11195; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11612, %r11618, %r11617, %r11227; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11616, %r11617, %r11618, %r11227; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r11620, %r11655, %r11548, %r11572, 0xD2; + lop3.b32 %r11621, %r11658, %r11552, %r11576, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30291, %r11548, %r11572, %r11604, 0xD2; + lop3.b32 %r30292, %r11552, %r11576, %r11608, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+32], {%r30291, %r30292}; + // begin inline asm + // chi + lop3.b32 %r30287, %r11572, %r11604, %r11580, 0xD2; + lop3.b32 %r30288, %r11576, %r11608, %r11584, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+40], {%r30287, %r30288}; + // begin inline asm + // chi + lop3.b32 %r30283, %r11604, %r11580, %r11655, 0xD2; + lop3.b32 %r30284, %r11608, %r11584, %r11658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+48], {%r30283, %r30284}; + // begin inline asm + // chi + lop3.b32 %r30281, %r11580, %r11655, %r11548, 0xD2; + lop3.b32 %r30282, %r11584, %r11658, %r11552, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+56], {%r30281, 
%r30282}; + // begin inline asm + // chi + lop3.b32 %r30277, %r11596, %r11556, %r11612, 0xD2; + lop3.b32 %r30278, %r11600, %r11560, %r11616, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+64], {%r30277, %r30278}; + // begin inline asm + // chi + lop3.b32 %r30289, %r11556, %r11612, %r11588, 0xD2; + lop3.b32 %r30290, %r11560, %r11616, %r11592, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+72], {%r30289, %r30290}; + // begin inline asm + // chi + lop3.b32 %r30285, %r11612, %r11588, %r11564, 0xD2; + lop3.b32 %r30286, %r11616, %r11592, %r11568, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+80], {%r30285, %r30286}; + // begin inline asm + ld.global.nc.v2.u32 {%r11684,%r11685}, [%rd581]; + // end inline asm + xor.b32 %r30279, %r11620, %r11684; + xor.b32 %r30280, %r11621, %r11685; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + add.s64 %rd84, %rd82, 24; + add.s64 %rd85, %rd2, 24; + +$L__BB2_30: + cvta.to.global.u64 %rd1271, %rd361; + shl.b32 %r11697, %r30191, 2; + cvt.u64.u32 %rd611, %r11697; + and.b64 %rd612, %rd611, 60; + add.s64 %rd613, %rd85, %rd612; + xor.b32 %r11698, %r47, %r30191; + mul.lo.s32 %r11699, %r11698, 16777619; + ld.local.u32 %r11700, [%rd613]; + xor.b32 %r11701, %r11699, %r11700; + mul.wide.u32 %rd614, %r11701, -954391867; + shr.u64 %rd615, %rd614, 32; + cvt.u32.u64 %r11702, %rd615; + sub.s32 %r11703, %r11701, %r11702; + shr.u32 %r11704, %r11703, 1; + add.s32 %r11705, %r11704, %r11702; + shr.u32 %r11706, %r11705, 20; + mul.lo.s32 %r11707, %r11706, 1179641; + sub.s32 %r11708, %r11701, %r11707; + mul.wide.u32 %rd616, %r11708, 64; + add.s64 %rd617, %rd1271, %rd616; + mul.lo.s32 %r11709, %r30228, 16777619; + ld.global.u32 %r11710, [%rd617]; + xor.b32 %r30228, %r11709, %r11710; + mul.lo.s32 %r11711, %r30229, 16777619; + ld.global.u32 %r11712, [%rd617+4]; + xor.b32 %r30229, %r11711, %r11712; + mul.lo.s32 %r11713, %r30240, 16777619; + ld.global.u32 %r11714, [%rd617+8]; + mul.lo.s32 %r11715, %r30241, 16777619; + ld.global.u32 %r11716, [%rd617+12]; + xor.b32 %r11717, %r11715, %r11716; + xor.b32 %r30240, %r11713, %r11714; + mov.b64 %rd618, {%r30240, %r11717}; + mul.lo.s32 %r11718, %r30236, 16777619; + ld.global.u32 %r11719, [%rd617+16]; + mul.lo.s32 %r11720, %r30237, 16777619; + ld.global.u32 %r11721, [%rd617+20]; + xor.b32 %r11722, %r11720, %r11721; + xor.b32 %r30236, %r11718, %r11719; + mov.b64 %rd619, {%r30236, %r11722}; + mul.lo.s32 %r11723, %r30232, 16777619; + ld.global.u32 %r11724, [%rd617+24]; + mul.lo.s32 %r11725, %r30233, 16777619; + ld.global.u32 %r11726, [%rd617+28]; + xor.b32 %r11727, %r11725, %r11726; + xor.b32 %r30232, %r11723, %r11724; + mov.b64 %rd620, {%r30232, %r11727}; + mul.lo.s32 %r11728, %r30230, 16777619; + ld.global.u32 %r11729, [%rd617+32]; + mul.lo.s32 %r11730, %r30231, 16777619; + ld.global.u32 %r11731, [%rd617+36]; + xor.b32 %r11732, %r11730, %r11731; + xor.b32 %r30230, %r11728, %r11729; + mov.b64 %rd621, {%r30230, %r11732}; + mul.lo.s32 %r11733, %r30226, 16777619; + ld.global.u32 %r11734, [%rd617+40]; + xor.b32 %r30226, %r11733, %r11734; + mul.lo.s32 %r11735, %r30227, 16777619; + ld.global.u32 %r11736, [%rd617+44]; + xor.b32 %r30227, %r11735, %r11736; + mul.lo.s32 %r11737, %r30238, 16777619; + ld.global.u32 %r11738, [%rd617+48]; + mul.lo.s32 %r11739, %r30239, 16777619; + ld.global.u32 %r11740, [%rd617+52]; + xor.b32 %r11741, %r11739, %r11740; + xor.b32 %r30238, %r11737, %r11738; + mov.b64 %rd622, {%r30238, %r11741}; + mul.lo.s32 %r11742, %r30234, 16777619; + ld.global.u32 %r11743, [%rd617+56]; + mul.lo.s32 %r11744, %r30235, 16777619; + 
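+ // This loop ($L__BB2_30) mixes the state FNV-1a style: each 32-bit word is
+ // multiplied by the FNV prime 16777619 and xored with a word fetched from
+ // the global table, whose slot index comes from a multiply-high reduction
+ // mod 1179641 (magic constant -954391867).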
ld.global.u32 %r11745, [%rd617+60]; + xor.b32 %r11746, %r11744, %r11745; + xor.b32 %r30234, %r11742, %r11743; + mov.b64 %rd623, {%r30234, %r11746}; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + st.local.v2.u32 [%rd2+32], {%r30240, %r11717}; + st.local.v2.u32 [%rd2+40], {%r30236, %r11722}; + st.local.v2.u32 [%rd2+48], {%r30232, %r11727}; + st.local.v2.u32 [%rd2+56], {%r30230, %r11732}; + st.local.v2.u32 [%rd2+64], {%r30226, %r30227}; + st.local.v2.u32 [%rd2+72], {%r30238, %r11741}; + st.local.v2.u32 [%rd2+80], {%r30234, %r11746}; + add.s64 %rd624, %rd84, %rd612; + xor.b32 %r11747, %r48, %r30191; + mul.lo.s32 %r11748, %r11747, 16777619; + ld.local.u32 %r11749, [%rd624]; + xor.b32 %r11750, %r11748, %r11749; + mul.wide.u32 %rd625, %r11750, -954391867; + shr.u64 %rd626, %rd625, 32; + cvt.u32.u64 %r11751, %rd626; + sub.s32 %r11752, %r11750, %r11751; + shr.u32 %r11753, %r11752, 1; + add.s32 %r11754, %r11753, %r11751; + shr.u32 %r11755, %r11754, 20; + mul.lo.s32 %r11756, %r11755, 1179641; + sub.s32 %r11757, %r11750, %r11756; + mul.wide.u32 %rd627, %r11757, 64; + add.s64 %rd628, %rd1271, %rd627; + mul.lo.s32 %r11758, %r30279, 16777619; + ld.global.u32 %r11759, [%rd628]; + xor.b32 %r30279, %r11758, %r11759; + mul.lo.s32 %r11760, %r30280, 16777619; + ld.global.u32 %r11761, [%rd628+4]; + xor.b32 %r30280, %r11760, %r11761; + mul.lo.s32 %r11762, %r30291, 16777619; + ld.global.u32 %r11763, [%rd628+8]; + mul.lo.s32 %r11764, %r30292, 16777619; + ld.global.u32 %r11765, [%rd628+12]; + xor.b32 %r11766, %r11764, %r11765; + xor.b32 %r30291, %r11762, %r11763; + mov.b64 %rd629, {%r30291, %r11766}; + mul.lo.s32 %r11767, %r30287, 16777619; + ld.global.u32 %r11768, [%rd628+16]; + mul.lo.s32 %r11769, %r30288, 16777619; + ld.global.u32 %r11770, [%rd628+20]; + xor.b32 %r11771, %r11769, %r11770; + xor.b32 %r30287, %r11767, %r11768; + mov.b64 %rd630, {%r30287, %r11771}; + mul.lo.s32 %r11772, %r30283, 16777619; + ld.global.u32 %r11773, [%rd628+24]; + mul.lo.s32 %r11774, %r30284, 16777619; + ld.global.u32 %r11775, [%rd628+28]; + xor.b32 %r11776, %r11774, %r11775; + xor.b32 %r30283, %r11772, %r11773; + mov.b64 %rd631, {%r30283, %r11776}; + mul.lo.s32 %r11777, %r30281, 16777619; + ld.global.u32 %r11778, [%rd628+32]; + mul.lo.s32 %r11779, %r30282, 16777619; + ld.global.u32 %r11780, [%rd628+36]; + xor.b32 %r11781, %r11779, %r11780; + xor.b32 %r30281, %r11777, %r11778; + mov.b64 %rd632, {%r30281, %r11781}; + mul.lo.s32 %r11782, %r30277, 16777619; + ld.global.u32 %r11783, [%rd628+40]; + xor.b32 %r30277, %r11782, %r11783; + mul.lo.s32 %r11784, %r30278, 16777619; + ld.global.u32 %r11785, [%rd628+44]; + xor.b32 %r30278, %r11784, %r11785; + mul.lo.s32 %r11786, %r30289, 16777619; + ld.global.u32 %r11787, [%rd628+48]; + mul.lo.s32 %r11788, %r30290, 16777619; + ld.global.u32 %r11789, [%rd628+52]; + xor.b32 %r11790, %r11788, %r11789; + xor.b32 %r30289, %r11786, %r11787; + mov.b64 %rd633, {%r30289, %r11790}; + mul.lo.s32 %r11791, %r30285, 16777619; + ld.global.u32 %r11792, [%rd628+56]; + mul.lo.s32 %r11793, %r30286, 16777619; + ld.global.u32 %r11794, [%rd628+60]; + xor.b32 %r11795, %r11793, %r11794; + xor.b32 %r30285, %r11791, %r11792; + mov.b64 %rd634, {%r30285, %r11795}; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + st.local.v2.u32 [%rd82+32], {%r30291, %r11766}; + st.local.v2.u32 [%rd82+40], {%r30287, %r11771}; + st.local.v2.u32 [%rd82+48], {%r30283, %r11776}; + st.local.v2.u32 [%rd82+56], {%r30281, %r11781}; + st.local.v2.u32 [%rd82+64], {%r30277, %r30278}; + st.local.v2.u32 [%rd82+72], {%r30289, %r11790}; + 
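+ // [annotation, not compiler output] The stores above appear to spill a second hash lane (seeded from %r48 rather than %r47) into the scratch buffer at %rd82, mirroring the first lane's layout at %rd2; both lanes step through the same table each iteration until the 512-iteration bound below.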
st.local.v2.u32 [%rd82+80], {%r30285, %r11795}; + add.s32 %r30191, %r30191, 1; + setp.lt.u32 %p21, %r30191, 512; + shr.u64 %rd635, %rd618, 32; + cvt.u32.u64 %r30241, %rd635; + shr.u64 %rd636, %rd619, 32; + cvt.u32.u64 %r30237, %rd636; + shr.u64 %rd637, %rd620, 32; + cvt.u32.u64 %r30233, %rd637; + shr.u64 %rd638, %rd621, 32; + cvt.u32.u64 %r30231, %rd638; + shr.u64 %rd639, %rd622, 32; + cvt.u32.u64 %r30239, %rd639; + shr.u64 %rd640, %rd623, 32; + cvt.u32.u64 %r30235, %rd640; + shr.u64 %rd641, %rd629, 32; + cvt.u32.u64 %r30292, %rd641; + shr.u64 %rd642, %rd630, 32; + cvt.u32.u64 %r30288, %rd642; + shr.u64 %rd643, %rd631, 32; + cvt.u32.u64 %r30284, %rd643; + shr.u64 %rd644, %rd632, 32; + cvt.u32.u64 %r30282, %rd644; + shr.u64 %rd645, %rd633, 32; + cvt.u32.u64 %r30290, %rd645; + shr.u64 %rd646, %rd634, 32; + cvt.u32.u64 %r30286, %rd646; + @%p21 bra $L__BB2_30; + + mov.u32 %r30192, 0; + st.local.v2.u32 [%rd2+96], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+104], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+112], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+120], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+128], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+136], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+144], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+152], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+160], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+168], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+176], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+184], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+192], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+200], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+208], {%r30192, %r30192}; + st.local.v2.u32 [%rd2+216], {%r30192, %r30192}; + mov.u32 %r30207, -2147483648; + mov.u32 %r11810, 1; + st.local.v2.u32 [%rd2+88], {%r11810, %r30207}; + mov.u32 %r30193, %r30192; + mov.u32 %r30194, %r30192; + mov.u32 %r30195, %r30192; + mov.u32 %r30196, %r30192; + mov.u32 %r30197, %r30192; + mov.u32 %r30198, %r30192; + mov.u32 %r30199, %r30192; + mov.u32 %r30200, %r30192; + mov.u32 %r30201, %r30192; + mov.u32 %r30202, %r30192; + mov.u32 %r30203, %r30192; + mov.u32 %r30204, %r30192; + mov.u32 %r30205, %r30192; + mov.u32 %r30206, %r11810; + mov.u32 %r30208, %r30192; + mov.u32 %r30209, %r30192; + mov.u32 %r30210, %r30192; + mov.u32 %r30211, %r30192; + mov.u32 %r30212, %r30192; + mov.u32 %r30213, %r30192; + mov.u32 %r30214, %r30192; + mov.u32 %r30215, %r30192; + mov.u32 %r30216, %r30192; + mov.u32 %r30217, %r30192; + mov.u32 %r30218, %r30192; + mov.u32 %r30219, %r30192; + mov.u32 %r30220, %r30192; + mov.u32 %r30221, %r30192; + mov.u32 %r30222, %r30192; + mov.u32 %r30223, %r30192; + mov.u32 %r30224, %r30192; + mov.u32 %r30225, %r30192; + mov.u32 %r30242, %r30192; + +$L__BB2_32: + // begin inline asm + // xor5 + lop3.b32 %r11837, %r30228, %r30226, %r30224, 0x96; + lop3.b32 %r11837, %r11837, %r30222, %r30220, 0x96; + lop3.b32 %r11838, %r30229, %r30227, %r30225, 0x96; + lop3.b32 %r11838, %r11838, %r30223, %r30221, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11849, %r30240, %r30238, %r30218, 0x96; + lop3.b32 %r11849, %r11849, %r30216, %r30214, 0x96; + lop3.b32 %r11850, %r30241, %r30239, %r30219, 0x96; + lop3.b32 %r11850, %r11850, %r30217, %r30215, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11861, %r30236, %r30234, %r30212, 0x96; + lop3.b32 %r11861, %r11861, %r30210, %r30208, 0x96; + lop3.b32 %r11862, %r30237, %r30235, %r30213, 0x96; + lop3.b32 %r11862, %r11862, %r30211, %r30209, 0x96; + // end inline asm + // begin inline asm + // 
xor5 + lop3.b32 %r11873, %r30232, %r30206, %r30204, 0x96; + lop3.b32 %r11873, %r11873, %r30202, %r30200, 0x96; + lop3.b32 %r11874, %r30233, %r30207, %r30205, 0x96; + lop3.b32 %r11874, %r11874, %r30203, %r30201, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11885, %r30230, %r30198, %r30196, 0x96; + lop3.b32 %r11885, %r11885, %r30194, %r30192, 0x96; + lop3.b32 %r11886, %r30231, %r30199, %r30197, 0x96; + lop3.b32 %r11886, %r11886, %r30195, %r30193, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11897, %r11850, %r11849, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11901, %r11849, %r11850, %r11810; + // end inline asm + xor.b32 %r12331, %r11897, %r11885; + xor.b32 %r12332, %r11901, %r11886; + xor.b32 %r12164, %r30228, %r12331; + xor.b32 %r12167, %r30229, %r12332; + xor.b32 %r12071, %r30226, %r12331; + xor.b32 %r12070, %r30227, %r12332; + xor.b32 %r12118, %r30224, %r12331; + xor.b32 %r12119, %r30225, %r12332; + xor.b32 %r12023, %r30222, %r12331; + xor.b32 %r12022, %r30223, %r12332; + xor.b32 %r11974, %r30220, %r12331; + xor.b32 %r11975, %r30221, %r12332; + // begin inline asm + shf.l.wrap.b32 %r11905, %r11862, %r11861, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11909, %r11861, %r11862, %r11810; + // end inline asm + xor.b32 %r12333, %r11905, %r11837; + xor.b32 %r12334, %r11909, %r11838; + xor.b32 %r12126, %r30240, %r12333; + xor.b32 %r12127, %r30241, %r12334; + xor.b32 %r11943, %r30238, %r12333; + xor.b32 %r11942, %r30239, %r12334; + xor.b32 %r12102, %r30218, %r12333; + xor.b32 %r12103, %r30219, %r12334; + xor.b32 %r12063, %r30216, %r12333; + xor.b32 %r12062, %r30217, %r12334; + xor.b32 %r12046, %r30214, %r12333; + xor.b32 %r12047, %r30215, %r12334; + // begin inline asm + shf.l.wrap.b32 %r11913, %r11874, %r11873, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11917, %r11873, %r11874, %r11810; + // end inline asm + xor.b32 %r12335, %r11913, %r11849; + xor.b32 %r12336, %r11917, %r11850; + xor.b32 %r11983, %r30236, %r12335; + xor.b32 %r11982, %r30237, %r12336; + xor.b32 %r12110, %r30234, %r12335; + xor.b32 %r12111, %r30235, %r12336; + xor.b32 %r11991, %r30212, %r12335; + xor.b32 %r11990, %r30213, %r12336; + xor.b32 %r12094, %r30210, %r12335; + xor.b32 %r12095, %r30211, %r12336; + xor.b32 %r11959, %r30208, %r12335; + xor.b32 %r11958, %r30209, %r12336; + // begin inline asm + shf.l.wrap.b32 %r11921, %r11886, %r11885, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11925, %r11885, %r11886, %r11810; + // end inline asm + xor.b32 %r12337, %r11921, %r11861; + xor.b32 %r12338, %r11925, %r11862; + xor.b32 %r12078, %r30232, %r12337; + xor.b32 %r12079, %r30233, %r12338; + xor.b32 %r12055, %r30206, %r12337; + xor.b32 %r12054, %r30207, %r12338; + xor.b32 %r11998, %r30204, %r12337; + xor.b32 %r11999, %r30205, %r12338; + xor.b32 %r12086, %r30202, %r12337; + xor.b32 %r12087, %r30203, %r12338; + xor.b32 %r12015, %r30200, %r12337; + xor.b32 %r12014, %r30201, %r12338; + // begin inline asm + shf.l.wrap.b32 %r11929, %r11838, %r11837, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11933, %r11837, %r11838, %r11810; + // end inline asm + xor.b32 %r12339, %r11929, %r11873; + xor.b32 %r12340, %r11933, %r11874; + xor.b32 %r12030, %r30230, %r12339; + xor.b32 %r12031, %r30231, %r12340; + xor.b32 %r11950, %r30198, %r12339; + xor.b32 %r11951, %r30199, %r12340; + xor.b32 %r11967, %r30196, %r12339; + xor.b32 %r11966, %r30197, %r12340; + xor.b32 %r12006, %r30194, 
%r12339; + xor.b32 %r12007, %r30195, %r12340; + xor.b32 %r12038, %r30192, %r12339; + xor.b32 %r12039, %r30193, %r12340; + mov.u32 %r11944, 44; + // begin inline asm + shf.l.wrap.b32 %r11937, %r11943, %r11942, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11941, %r11942, %r11943, %r11944; + // end inline asm + mov.u32 %r11952, 20; + // begin inline asm + shf.l.wrap.b32 %r11945, %r11951, %r11950, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11949, %r11950, %r11951, %r11952; + // end inline asm + mov.u32 %r11960, 61; + // begin inline asm + shf.l.wrap.b32 %r11953, %r11959, %r11958, %r11960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11957, %r11958, %r11959, %r11960; + // end inline asm + mov.u32 %r11968, 39; + // begin inline asm + shf.l.wrap.b32 %r11961, %r11967, %r11966, %r11968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11965, %r11966, %r11967, %r11968; + // end inline asm + mov.u32 %r11976, 18; + // begin inline asm + shf.l.wrap.b32 %r11969, %r11975, %r11974, %r11976; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11973, %r11974, %r11975, %r11976; + // end inline asm + mov.u32 %r11984, 62; + // begin inline asm + shf.l.wrap.b32 %r11977, %r11983, %r11982, %r11984; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11981, %r11982, %r11983, %r11984; + // end inline asm + mov.u32 %r11992, 43; + // begin inline asm + shf.l.wrap.b32 %r11985, %r11991, %r11990, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11989, %r11990, %r11991, %r11992; + // end inline asm + mov.u32 %r12000, 25; + // begin inline asm + shf.l.wrap.b32 %r11993, %r11999, %r11998, %r12000; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r11997, %r11998, %r11999, %r12000; + // end inline asm + mov.u32 %r12008, 8; + // begin inline asm + shf.l.wrap.b32 %r12001, %r12007, %r12006, %r12008; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12005, %r12006, %r12007, %r12008; + // end inline asm + mov.u32 %r12016, 56; + // begin inline asm + shf.l.wrap.b32 %r12009, %r12015, %r12014, %r12016; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12013, %r12014, %r12015, %r12016; + // end inline asm + mov.u32 %r12024, 41; + // begin inline asm + shf.l.wrap.b32 %r12017, %r12023, %r12022, %r12024; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12021, %r12022, %r12023, %r12024; + // end inline asm + mov.u32 %r12032, 27; + // begin inline asm + shf.l.wrap.b32 %r12025, %r12031, %r12030, %r12032; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12029, %r12030, %r12031, %r12032; + // end inline asm + mov.u32 %r12040, 14; + // begin inline asm + shf.l.wrap.b32 %r12033, %r12039, %r12038, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12037, %r12038, %r12039, %r12040; + // end inline asm + mov.u32 %r12048, 2; + // begin inline asm + shf.l.wrap.b32 %r12041, %r12047, %r12046, %r12048; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12045, %r12046, %r12047, %r12048; + // end inline asm + mov.u32 %r12056, 55; + // begin inline asm + shf.l.wrap.b32 %r12049, %r12055, %r12054, %r12056; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12053, %r12054, %r12055, %r12056; + // end inline asm + mov.u32 %r12064, 45; + // begin inline asm + shf.l.wrap.b32 %r12057, %r12063, %r12062, %r12064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12061, %r12062, %r12063, %r12064; + // end inline asm + mov.u32 %r12072, 36; + // 
begin inline asm + shf.l.wrap.b32 %r12065, %r12071, %r12070, %r12072; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12069, %r12070, %r12071, %r12072; + // end inline asm + mov.u32 %r12080, 28; + // begin inline asm + shf.l.wrap.b32 %r12073, %r12079, %r12078, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12077, %r12078, %r12079, %r12080; + // end inline asm + mov.u32 %r12088, 21; + // begin inline asm + shf.l.wrap.b32 %r12081, %r12087, %r12086, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12085, %r12086, %r12087, %r12088; + // end inline asm + mov.u32 %r12096, 15; + // begin inline asm + shf.l.wrap.b32 %r12089, %r12095, %r12094, %r12096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12093, %r12094, %r12095, %r12096; + // end inline asm + mov.u32 %r12104, 10; + // begin inline asm + shf.l.wrap.b32 %r12097, %r12103, %r12102, %r12104; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12101, %r12102, %r12103, %r12104; + // end inline asm + mov.u32 %r12112, 6; + // begin inline asm + shf.l.wrap.b32 %r12105, %r12111, %r12110, %r12112; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12109, %r12110, %r12111, %r12112; + // end inline asm + mov.u32 %r12120, 3; + // begin inline asm + shf.l.wrap.b32 %r12113, %r12119, %r12118, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12117, %r12118, %r12119, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12121, %r12127, %r12126, %r11810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12125, %r12126, %r12127, %r11810; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12129, %r12164, %r11937, %r11985, 0xD2; + lop3.b32 %r12130, %r12167, %r11941, %r11989, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30240, %r11937, %r11985, %r12081, 0xD2; + lop3.b32 %r30241, %r11941, %r11989, %r12085, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30236, %r11985, %r12081, %r12033, 0xD2; + lop3.b32 %r30237, %r11989, %r12085, %r12037, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30232, %r12081, %r12033, %r12164, 0xD2; + lop3.b32 %r30233, %r12085, %r12037, %r12167, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30230, %r12033, %r12164, %r11937, 0xD2; + lop3.b32 %r30231, %r12037, %r12167, %r11941, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30226, %r12073, %r11945, %r12113, 0xD2; + lop3.b32 %r30227, %r12077, %r11949, %r12117, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30238, %r11945, %r12113, %r12057, 0xD2; + lop3.b32 %r30239, %r11949, %r12117, %r12061, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30234, %r12113, %r12057, %r11953, 0xD2; + lop3.b32 %r30235, %r12117, %r12061, %r11957, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30206, %r12057, %r11953, %r12073, 0xD2; + lop3.b32 %r30207, %r12061, %r11957, %r12077, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30206, %r30207}; + // begin inline asm + // chi + lop3.b32 %r30198, %r11953, %r12073, %r11945, 0xD2; + lop3.b32 %r30199, %r11957, %r12077, %r11949, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30198, %r30199}; + // begin inline asm + // chi + lop3.b32 %r30224, %r12121, %r12105, %r11993, 0xD2; + lop3.b32 %r30225, %r12125, %r12109, %r11997, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30224, %r30225}; + // begin inline asm + // 
chi + lop3.b32 %r30218, %r12105, %r11993, %r12001, 0xD2; + lop3.b32 %r30219, %r12109, %r11997, %r12005, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30218, %r30219}; + // begin inline asm + // chi + lop3.b32 %r30212, %r11993, %r12001, %r11969, 0xD2; + lop3.b32 %r30213, %r11997, %r12005, %r11973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30212, %r30213}; + // begin inline asm + // chi + lop3.b32 %r30204, %r12001, %r11969, %r12121, 0xD2; + lop3.b32 %r30205, %r12005, %r11973, %r12125, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30204, %r30205}; + // begin inline asm + // chi + lop3.b32 %r30196, %r11969, %r12121, %r12105, 0xD2; + lop3.b32 %r30197, %r11973, %r12125, %r12109, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30196, %r30197}; + // begin inline asm + // chi + lop3.b32 %r30222, %r12025, %r12065, %r12097, 0xD2; + lop3.b32 %r30223, %r12029, %r12069, %r12101, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30222, %r30223}; + // begin inline asm + // chi + lop3.b32 %r30216, %r12065, %r12097, %r12089, 0xD2; + lop3.b32 %r30217, %r12069, %r12101, %r12093, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30216, %r30217}; + // begin inline asm + // chi + lop3.b32 %r30210, %r12097, %r12089, %r12009, 0xD2; + lop3.b32 %r30211, %r12101, %r12093, %r12013, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30210, %r30211}; + // begin inline asm + // chi + lop3.b32 %r30202, %r12089, %r12009, %r12025, 0xD2; + lop3.b32 %r30203, %r12093, %r12013, %r12029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30202, %r30203}; + // begin inline asm + // chi + lop3.b32 %r30194, %r12009, %r12025, %r12065, 0xD2; + lop3.b32 %r30195, %r12013, %r12029, %r12069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30194, %r30195}; + // begin inline asm + // chi + lop3.b32 %r30220, %r11977, %r12049, %r11961, 0xD2; + lop3.b32 %r30221, %r11981, %r12053, %r11965, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30220, %r30221}; + // begin inline asm + // chi + lop3.b32 %r30214, %r12049, %r11961, %r12017, 0xD2; + lop3.b32 %r30215, %r12053, %r11965, %r12021, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30214, %r30215}; + // begin inline asm + // chi + lop3.b32 %r30208, %r11961, %r12017, %r12041, 0xD2; + lop3.b32 %r30209, %r11965, %r12021, %r12045, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30208, %r30209}; + // begin inline asm + // chi + lop3.b32 %r30200, %r12017, %r12041, %r11977, 0xD2; + lop3.b32 %r30201, %r12021, %r12045, %r11981, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30200, %r30201}; + // begin inline asm + // chi + lop3.b32 %r30192, %r12041, %r11977, %r12049, 0xD2; + lop3.b32 %r30193, %r12045, %r11981, %r12053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30192, %r30193}; + mul.wide.s32 %rd648, %r30242, 8; + add.s64 %rd647, %rd580, %rd648; + // begin inline asm + ld.global.nc.v2.u32 {%r12329,%r12330}, [%rd647]; + // end inline asm + xor.b32 %r30228, %r12129, %r12329; + xor.b32 %r30229, %r12130, %r12330; + add.s32 %r30242, %r30242, 1; + setp.lt.u32 %p22, %r30242, 23; + @%p22 bra $L__BB2_32; + + st.local.v2.u32 [%rd2+32], {%r30240, %r30241}; + st.local.v2.u32 [%rd2+72], {%r30238, %r30239}; + st.local.v2.u32 [%rd2+40], {%r30236, %r30237}; + st.local.v2.u32 [%rd2+80], {%r30234, %r30235}; + st.local.v2.u32 [%rd2+48], {%r30232, %r30233}; + st.local.v2.u32 [%rd2+56], {%r30230, %r30231}; + st.local.v2.u32 [%rd2+24], {%r30228, %r30229}; + // begin 
inline asm + // xor5 + lop3.b32 %r12341, %r30228, %r30226, %r30224, 0x96; + lop3.b32 %r12341, %r12341, %r30222, %r30220, 0x96; + lop3.b32 %r12342, %r30229, %r30227, %r30225, 0x96; + lop3.b32 %r12342, %r12342, %r30223, %r30221, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12353, %r30240, %r30238, %r30218, 0x96; + lop3.b32 %r12353, %r12353, %r30216, %r30214, 0x96; + lop3.b32 %r12354, %r30241, %r30239, %r30219, 0x96; + lop3.b32 %r12354, %r12354, %r30217, %r30215, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12365, %r30236, %r30234, %r30212, 0x96; + lop3.b32 %r12365, %r12365, %r30210, %r30208, 0x96; + lop3.b32 %r12366, %r30237, %r30235, %r30213, 0x96; + lop3.b32 %r12366, %r12366, %r30211, %r30209, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12377, %r30232, %r30206, %r30204, 0x96; + lop3.b32 %r12377, %r12377, %r30202, %r30200, 0x96; + lop3.b32 %r12378, %r30233, %r30207, %r30205, 0x96; + lop3.b32 %r12378, %r12378, %r30203, %r30201, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12389, %r30230, %r30198, %r30196, 0x96; + lop3.b32 %r12389, %r12389, %r30194, %r30192, 0x96; + lop3.b32 %r12390, %r30231, %r30199, %r30197, 0x96; + lop3.b32 %r12390, %r12390, %r30195, %r30193, 0x96; + // end inline asm + mov.u32 %r12593, 1; + // begin inline asm + shf.l.wrap.b32 %r12401, %r12354, %r12353, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12405, %r12353, %r12354, %r12593; + // end inline asm + xor.b32 %r12620, %r12401, %r12389; + xor.b32 %r12621, %r12405, %r12390; + xor.b32 %r12548, %r30228, %r12620; + xor.b32 %r12551, %r30229, %r12621; + xor.b32 %r12511, %r30225, %r12621; + xor.b32 %r12510, %r30224, %r12620; + st.local.v2.u32 [%rd2+104], {%r12510, %r12511}; + // begin inline asm + shf.l.wrap.b32 %r12409, %r12366, %r12365, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12413, %r12365, %r12366, %r12593; + // end inline asm + xor.b32 %r12622, %r12409, %r12341; + xor.b32 %r12623, %r12413, %r12342; + xor.b32 %r12447, %r30238, %r12622; + xor.b32 %r12446, %r30239, %r12623; + xor.b32 %r12486, %r30217, %r12623; + xor.b32 %r12487, %r30216, %r12622; + st.local.v2.u32 [%rd2+152], {%r12487, %r12486}; + // begin inline asm + shf.l.wrap.b32 %r12417, %r12378, %r12377, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12421, %r12377, %r12378, %r12593; + // end inline asm + xor.b32 %r12624, %r12417, %r12353; + xor.b32 %r12625, %r12421, %r12354; + xor.b32 %r12470, %r30213, %r12625; + xor.b32 %r12471, %r30212, %r12624; + st.local.v2.u32 [%rd2+120], {%r12471, %r12470}; + xor.b32 %r12462, %r30209, %r12625; + xor.b32 %r12463, %r30208, %r12624; + st.local.v2.u32 [%rd2+200], {%r12463, %r12462}; + // begin inline asm + shf.l.wrap.b32 %r12425, %r12390, %r12389, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12429, %r12389, %r12390, %r12593; + // end inline asm + xor.b32 %r12626, %r12425, %r12365; + xor.b32 %r12627, %r12429, %r12366; + xor.b32 %r12494, %r30232, %r12626; + xor.b32 %r12495, %r30233, %r12627; + xor.b32 %r12503, %r30203, %r12627; + xor.b32 %r12502, %r30202, %r12626; + st.local.v2.u32 [%rd2+168], {%r12502, %r12503}; + // begin inline asm + shf.l.wrap.b32 %r12433, %r12342, %r12341, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12437, %r12341, %r12342, %r12593; + // end inline asm + xor.b32 %r12628, %r12433, %r12377; + xor.b32 %r12629, %r12437, %r12378; + xor.b32 %r12454, %r30198, %r12628; + xor.b32 %r12455, 
%r30199, %r12629; + xor.b32 %r12479, %r30193, %r12629; + xor.b32 %r12478, %r30192, %r12628; + st.local.v2.u32 [%rd2+216], {%r12478, %r12479}; + // begin inline asm + shf.l.wrap.b32 %r12441, %r12447, %r12446, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12445, %r12446, %r12447, %r11944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12449, %r12455, %r12454, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12453, %r12454, %r12455, %r11952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12461, %r12462, %r12463, %r11960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12457, %r12463, %r12462, %r11960; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r12457, %r12461}; + // begin inline asm + shf.l.wrap.b32 %r12465, %r12471, %r12470, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12469, %r12470, %r12471, %r11992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12473, %r12479, %r12478, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12477, %r12478, %r12479, %r12040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12485, %r12486, %r12487, %r12064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12481, %r12487, %r12486, %r12064; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r12481, %r12485}; + // begin inline asm + shf.l.wrap.b32 %r12489, %r12495, %r12494, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12493, %r12494, %r12495, %r12080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12497, %r12503, %r12502, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12501, %r12502, %r12503, %r12088; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12505, %r12511, %r12510, %r12120; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12509, %r12510, %r12511, %r12120; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12513, %r12548, %r12441, %r12465, 0xD2; + lop3.b32 %r12514, %r12551, %r12445, %r12469, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12521, %r12441, %r12465, %r12497, 0xD2; + lop3.b32 %r12522, %r12445, %r12469, %r12501, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r12521, %r12522}; + // begin inline asm + // chi + lop3.b32 %r12529, %r12465, %r12497, %r12473, 0xD2; + lop3.b32 %r12530, %r12469, %r12501, %r12477, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r12529, %r12530}; + // begin inline asm + // chi + lop3.b32 %r12537, %r12497, %r12473, %r12548, 0xD2; + lop3.b32 %r12538, %r12501, %r12477, %r12551, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r12537, %r12538}; + // begin inline asm + // chi + lop3.b32 %r12545, %r12473, %r12548, %r12441, 0xD2; + lop3.b32 %r12546, %r12477, %r12551, %r12445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r12545, %r12546}; + // begin inline asm + // chi + lop3.b32 %r12553, %r12489, %r12449, %r12505, 0xD2; + lop3.b32 %r12554, %r12493, %r12453, %r12509, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r12553, %r12554}; + // begin inline asm + // chi + lop3.b32 %r12561, %r12449, %r12505, %r12481, 0xD2; + lop3.b32 %r12562, %r12453, %r12509, %r12485, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r12561, %r12562}; + // begin inline asm + // chi + lop3.b32 %r12569, %r12505, %r12481, %r12457, 0xD2; + lop3.b32 %r12570, %r12509, %r12485, %r12461, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r12569, %r12570}; + // 
begin inline asm + ld.global.nc.v2.u32 {%r12577,%r12578}, [%rd581]; + // end inline asm + xor.b32 %r12630, %r12514, %r12578; + xor.b32 %r12631, %r12513, %r12577; + mov.b64 %rd1317, {%r12631, %r12630}; + mov.b64 %rd1318, {%r12521, %r12522}; + mov.b64 %rd1319, {%r12529, %r12530}; + mov.b64 %rd1320, {%r12537, %r12538}; + mov.b64 %rd1321, {%r12545, %r12546}; + mov.b64 %rd1322, {%r12553, %r12554}; + mov.b64 %rd1323, {%r12561, %r12562}; + mov.b64 %rd1324, {%r12569, %r12570}; + mov.u32 %r30243, 0; + st.local.v2.u32 [%rd2+24], {%r12631, %r12630}; + st.local.v2.u32 [%rd82+96], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+104], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+112], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+120], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+128], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+136], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+144], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+152], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+160], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+168], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+176], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+184], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+192], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+200], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+208], {%r30243, %r30243}; + st.local.v2.u32 [%rd82+216], {%r30243, %r30243}; + mov.u32 %r30258, -2147483648; + st.local.v2.u32 [%rd82+88], {%r12593, %r30258}; + mov.u32 %r30244, %r30243; + mov.u32 %r30245, %r30243; + mov.u32 %r30246, %r30243; + mov.u32 %r30247, %r30243; + mov.u32 %r30248, %r30243; + mov.u32 %r30249, %r30243; + mov.u32 %r30250, %r30243; + mov.u32 %r30251, %r30243; + mov.u32 %r30252, %r30243; + mov.u32 %r30253, %r30243; + mov.u32 %r30254, %r30243; + mov.u32 %r30255, %r30243; + mov.u32 %r30256, %r30243; + mov.u32 %r30257, %r12593; + mov.u32 %r30259, %r30243; + mov.u32 %r30260, %r30243; + mov.u32 %r30261, %r30243; + mov.u32 %r30262, %r30243; + mov.u32 %r30263, %r30243; + mov.u32 %r30264, %r30243; + mov.u32 %r30265, %r30243; + mov.u32 %r30266, %r30243; + mov.u32 %r30267, %r30243; + mov.u32 %r30268, %r30243; + mov.u32 %r30269, %r30243; + mov.u32 %r30270, %r30243; + mov.u32 %r30271, %r30243; + mov.u32 %r30272, %r30243; + mov.u32 %r30273, %r30243; + mov.u32 %r30274, %r30243; + mov.u32 %r30275, %r30243; + mov.u32 %r30276, %r30243; + mov.u32 %r30293, %r30243; + +$L__BB2_34: + // begin inline asm + // xor5 + lop3.b32 %r12632, %r30279, %r30277, %r30275, 0x96; + lop3.b32 %r12632, %r12632, %r30273, %r30271, 0x96; + lop3.b32 %r12633, %r30280, %r30278, %r30276, 0x96; + lop3.b32 %r12633, %r12633, %r30274, %r30272, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12644, %r30291, %r30289, %r30269, 0x96; + lop3.b32 %r12644, %r12644, %r30267, %r30265, 0x96; + lop3.b32 %r12645, %r30292, %r30290, %r30270, 0x96; + lop3.b32 %r12645, %r12645, %r30268, %r30266, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12656, %r30287, %r30285, %r30263, 0x96; + lop3.b32 %r12656, %r12656, %r30261, %r30259, 0x96; + lop3.b32 %r12657, %r30288, %r30286, %r30264, 0x96; + lop3.b32 %r12657, %r12657, %r30262, %r30260, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12668, %r30283, %r30257, %r30255, 0x96; + lop3.b32 %r12668, %r12668, %r30253, %r30251, 0x96; + lop3.b32 %r12669, %r30284, %r30258, %r30256, 0x96; + lop3.b32 %r12669, %r12669, %r30254, %r30252, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12680, %r30281, %r30249, %r30247, 0x96; + lop3.b32 %r12680, %r12680, 
%r30245, %r30243, 0x96; + lop3.b32 %r12681, %r30282, %r30250, %r30248, 0x96; + lop3.b32 %r12681, %r12681, %r30246, %r30244, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12692, %r12645, %r12644, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12696, %r12644, %r12645, %r12593; + // end inline asm + xor.b32 %r13126, %r12692, %r12680; + xor.b32 %r13127, %r12696, %r12681; + xor.b32 %r12959, %r30279, %r13126; + xor.b32 %r12962, %r30280, %r13127; + xor.b32 %r12866, %r30277, %r13126; + xor.b32 %r12865, %r30278, %r13127; + xor.b32 %r12913, %r30275, %r13126; + xor.b32 %r12914, %r30276, %r13127; + xor.b32 %r12818, %r30273, %r13126; + xor.b32 %r12817, %r30274, %r13127; + xor.b32 %r12769, %r30271, %r13126; + xor.b32 %r12770, %r30272, %r13127; + // begin inline asm + shf.l.wrap.b32 %r12700, %r12657, %r12656, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12704, %r12656, %r12657, %r12593; + // end inline asm + xor.b32 %r13128, %r12700, %r12632; + xor.b32 %r13129, %r12704, %r12633; + xor.b32 %r12921, %r30291, %r13128; + xor.b32 %r12922, %r30292, %r13129; + xor.b32 %r12738, %r30289, %r13128; + xor.b32 %r12737, %r30290, %r13129; + xor.b32 %r12897, %r30269, %r13128; + xor.b32 %r12898, %r30270, %r13129; + xor.b32 %r12858, %r30267, %r13128; + xor.b32 %r12857, %r30268, %r13129; + xor.b32 %r12841, %r30265, %r13128; + xor.b32 %r12842, %r30266, %r13129; + // begin inline asm + shf.l.wrap.b32 %r12708, %r12669, %r12668, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12712, %r12668, %r12669, %r12593; + // end inline asm + xor.b32 %r13130, %r12708, %r12644; + xor.b32 %r13131, %r12712, %r12645; + xor.b32 %r12778, %r30287, %r13130; + xor.b32 %r12777, %r30288, %r13131; + xor.b32 %r12905, %r30285, %r13130; + xor.b32 %r12906, %r30286, %r13131; + xor.b32 %r12786, %r30263, %r13130; + xor.b32 %r12785, %r30264, %r13131; + xor.b32 %r12889, %r30261, %r13130; + xor.b32 %r12890, %r30262, %r13131; + xor.b32 %r12754, %r30259, %r13130; + xor.b32 %r12753, %r30260, %r13131; + // begin inline asm + shf.l.wrap.b32 %r12716, %r12681, %r12680, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12720, %r12680, %r12681, %r12593; + // end inline asm + xor.b32 %r13132, %r12716, %r12656; + xor.b32 %r13133, %r12720, %r12657; + xor.b32 %r12873, %r30283, %r13132; + xor.b32 %r12874, %r30284, %r13133; + xor.b32 %r12850, %r30257, %r13132; + xor.b32 %r12849, %r30258, %r13133; + xor.b32 %r12793, %r30255, %r13132; + xor.b32 %r12794, %r30256, %r13133; + xor.b32 %r12881, %r30253, %r13132; + xor.b32 %r12882, %r30254, %r13133; + xor.b32 %r12810, %r30251, %r13132; + xor.b32 %r12809, %r30252, %r13133; + // begin inline asm + shf.l.wrap.b32 %r12724, %r12633, %r12632, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12728, %r12632, %r12633, %r12593; + // end inline asm + xor.b32 %r13134, %r12724, %r12668; + xor.b32 %r13135, %r12728, %r12669; + xor.b32 %r12825, %r30281, %r13134; + xor.b32 %r12826, %r30282, %r13135; + xor.b32 %r12745, %r30249, %r13134; + xor.b32 %r12746, %r30250, %r13135; + xor.b32 %r12762, %r30247, %r13134; + xor.b32 %r12761, %r30248, %r13135; + xor.b32 %r12801, %r30245, %r13134; + xor.b32 %r12802, %r30246, %r13135; + xor.b32 %r12833, %r30243, %r13134; + xor.b32 %r12834, %r30244, %r13135; + mov.u32 %r12739, 44; + // begin inline asm + shf.l.wrap.b32 %r12732, %r12738, %r12737, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12736, %r12737, %r12738, %r12739; + // end inline asm + mov.u32 %r12747, 
20; + // begin inline asm + shf.l.wrap.b32 %r12740, %r12746, %r12745, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12744, %r12745, %r12746, %r12747; + // end inline asm + mov.u32 %r12755, 61; + // begin inline asm + shf.l.wrap.b32 %r12748, %r12754, %r12753, %r12755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12752, %r12753, %r12754, %r12755; + // end inline asm + mov.u32 %r12763, 39; + // begin inline asm + shf.l.wrap.b32 %r12756, %r12762, %r12761, %r12763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12760, %r12761, %r12762, %r12763; + // end inline asm + mov.u32 %r12771, 18; + // begin inline asm + shf.l.wrap.b32 %r12764, %r12770, %r12769, %r12771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12768, %r12769, %r12770, %r12771; + // end inline asm + mov.u32 %r12779, 62; + // begin inline asm + shf.l.wrap.b32 %r12772, %r12778, %r12777, %r12779; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12776, %r12777, %r12778, %r12779; + // end inline asm + mov.u32 %r12787, 43; + // begin inline asm + shf.l.wrap.b32 %r12780, %r12786, %r12785, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12784, %r12785, %r12786, %r12787; + // end inline asm + mov.u32 %r12795, 25; + // begin inline asm + shf.l.wrap.b32 %r12788, %r12794, %r12793, %r12795; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12792, %r12793, %r12794, %r12795; + // end inline asm + mov.u32 %r12803, 8; + // begin inline asm + shf.l.wrap.b32 %r12796, %r12802, %r12801, %r12803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12800, %r12801, %r12802, %r12803; + // end inline asm + mov.u32 %r12811, 56; + // begin inline asm + shf.l.wrap.b32 %r12804, %r12810, %r12809, %r12811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12808, %r12809, %r12810, %r12811; + // end inline asm + mov.u32 %r12819, 41; + // begin inline asm + shf.l.wrap.b32 %r12812, %r12818, %r12817, %r12819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12816, %r12817, %r12818, %r12819; + // end inline asm + mov.u32 %r12827, 27; + // begin inline asm + shf.l.wrap.b32 %r12820, %r12826, %r12825, %r12827; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12824, %r12825, %r12826, %r12827; + // end inline asm + mov.u32 %r12835, 14; + // begin inline asm + shf.l.wrap.b32 %r12828, %r12834, %r12833, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12832, %r12833, %r12834, %r12835; + // end inline asm + mov.u32 %r12843, 2; + // begin inline asm + shf.l.wrap.b32 %r12836, %r12842, %r12841, %r12843; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12840, %r12841, %r12842, %r12843; + // end inline asm + mov.u32 %r12851, 55; + // begin inline asm + shf.l.wrap.b32 %r12844, %r12850, %r12849, %r12851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12848, %r12849, %r12850, %r12851; + // end inline asm + mov.u32 %r12859, 45; + // begin inline asm + shf.l.wrap.b32 %r12852, %r12858, %r12857, %r12859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12856, %r12857, %r12858, %r12859; + // end inline asm + mov.u32 %r12867, 36; + // begin inline asm + shf.l.wrap.b32 %r12860, %r12866, %r12865, %r12867; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12864, %r12865, %r12866, %r12867; + // end inline asm + mov.u32 %r12875, 28; + // begin inline asm + shf.l.wrap.b32 %r12868, %r12874, %r12873, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12872, 
%r12873, %r12874, %r12875; + // end inline asm + mov.u32 %r12883, 21; + // begin inline asm + shf.l.wrap.b32 %r12876, %r12882, %r12881, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12880, %r12881, %r12882, %r12883; + // end inline asm + mov.u32 %r12891, 15; + // begin inline asm + shf.l.wrap.b32 %r12884, %r12890, %r12889, %r12891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12888, %r12889, %r12890, %r12891; + // end inline asm + mov.u32 %r12899, 10; + // begin inline asm + shf.l.wrap.b32 %r12892, %r12898, %r12897, %r12899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12896, %r12897, %r12898, %r12899; + // end inline asm + mov.u32 %r12907, 6; + // begin inline asm + shf.l.wrap.b32 %r12900, %r12906, %r12905, %r12907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12904, %r12905, %r12906, %r12907; + // end inline asm + mov.u32 %r12915, 3; + // begin inline asm + shf.l.wrap.b32 %r12908, %r12914, %r12913, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12912, %r12913, %r12914, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12916, %r12922, %r12921, %r12593; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r12920, %r12921, %r12922, %r12593; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r12924, %r12959, %r12732, %r12780, 0xD2; + lop3.b32 %r12925, %r12962, %r12736, %r12784, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30291, %r12732, %r12780, %r12876, 0xD2; + lop3.b32 %r30292, %r12736, %r12784, %r12880, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30287, %r12780, %r12876, %r12828, 0xD2; + lop3.b32 %r30288, %r12784, %r12880, %r12832, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30283, %r12876, %r12828, %r12959, 0xD2; + lop3.b32 %r30284, %r12880, %r12832, %r12962, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30281, %r12828, %r12959, %r12732, 0xD2; + lop3.b32 %r30282, %r12832, %r12962, %r12736, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30277, %r12868, %r12740, %r12908, 0xD2; + lop3.b32 %r30278, %r12872, %r12744, %r12912, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30289, %r12740, %r12908, %r12852, 0xD2; + lop3.b32 %r30290, %r12744, %r12912, %r12856, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30285, %r12908, %r12852, %r12748, 0xD2; + lop3.b32 %r30286, %r12912, %r12856, %r12752, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30257, %r12852, %r12748, %r12868, 0xD2; + lop3.b32 %r30258, %r12856, %r12752, %r12872, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r30257, %r30258}; + // begin inline asm + // chi + lop3.b32 %r30249, %r12748, %r12868, %r12740, 0xD2; + lop3.b32 %r30250, %r12752, %r12872, %r12744, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r30249, %r30250}; + // begin inline asm + // chi + lop3.b32 %r30275, %r12916, %r12900, %r12788, 0xD2; + lop3.b32 %r30276, %r12920, %r12904, %r12792, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+104], {%r30275, %r30276}; + // begin inline asm + // chi + lop3.b32 %r30269, %r12900, %r12788, %r12796, 0xD2; + lop3.b32 %r30270, %r12904, %r12792, %r12800, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+112], {%r30269, %r30270}; + // begin inline asm + // chi + lop3.b32 %r30263, %r12788, %r12796, %r12764, 0xD2; + lop3.b32 %r30264, %r12792, %r12800, %r12768, 0xD2; + // end inline asm + 
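+ // [annotation, not compiler output] The lop3 immediate 0xD2 evaluates a ^ (~b & c), i.e. the Keccak chi step, and the 0x96 immediate in the xor5 blocks is a three-input XOR (theta column parity); the paired shf.l.wrap instructions realize the 64-bit rho rotations (1, 44, 20, 61, ..., 6, 3) on 32-bit register halves, and the 23-iteration loop plus the unrolled round gives the 24 rounds of Keccak-f[1600].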
st.local.v2.u32 [%rd82+120], {%r30263, %r30264}; + // begin inline asm + // chi + lop3.b32 %r30255, %r12796, %r12764, %r12916, 0xD2; + lop3.b32 %r30256, %r12800, %r12768, %r12920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+128], {%r30255, %r30256}; + // begin inline asm + // chi + lop3.b32 %r30247, %r12764, %r12916, %r12900, 0xD2; + lop3.b32 %r30248, %r12768, %r12920, %r12904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+136], {%r30247, %r30248}; + // begin inline asm + // chi + lop3.b32 %r30273, %r12820, %r12860, %r12892, 0xD2; + lop3.b32 %r30274, %r12824, %r12864, %r12896, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+144], {%r30273, %r30274}; + // begin inline asm + // chi + lop3.b32 %r30267, %r12860, %r12892, %r12884, 0xD2; + lop3.b32 %r30268, %r12864, %r12896, %r12888, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+152], {%r30267, %r30268}; + // begin inline asm + // chi + lop3.b32 %r30261, %r12892, %r12884, %r12804, 0xD2; + lop3.b32 %r30262, %r12896, %r12888, %r12808, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+160], {%r30261, %r30262}; + // begin inline asm + // chi + lop3.b32 %r30253, %r12884, %r12804, %r12820, 0xD2; + lop3.b32 %r30254, %r12888, %r12808, %r12824, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+168], {%r30253, %r30254}; + // begin inline asm + // chi + lop3.b32 %r30245, %r12804, %r12820, %r12860, 0xD2; + lop3.b32 %r30246, %r12808, %r12824, %r12864, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+176], {%r30245, %r30246}; + // begin inline asm + // chi + lop3.b32 %r30271, %r12772, %r12844, %r12756, 0xD2; + lop3.b32 %r30272, %r12776, %r12848, %r12760, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+184], {%r30271, %r30272}; + // begin inline asm + // chi + lop3.b32 %r30265, %r12844, %r12756, %r12812, 0xD2; + lop3.b32 %r30266, %r12848, %r12760, %r12816, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+192], {%r30265, %r30266}; + // begin inline asm + // chi + lop3.b32 %r30259, %r12756, %r12812, %r12836, 0xD2; + lop3.b32 %r30260, %r12760, %r12816, %r12840, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+200], {%r30259, %r30260}; + // begin inline asm + // chi + lop3.b32 %r30251, %r12812, %r12836, %r12772, 0xD2; + lop3.b32 %r30252, %r12816, %r12840, %r12776, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+208], {%r30251, %r30252}; + // begin inline asm + // chi + lop3.b32 %r30243, %r12836, %r12772, %r12844, 0xD2; + lop3.b32 %r30244, %r12840, %r12776, %r12848, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+216], {%r30243, %r30244}; + mul.wide.s32 %rd655, %r30293, 8; + add.s64 %rd654, %rd580, %rd655; + // begin inline asm + ld.global.nc.v2.u32 {%r13124,%r13125}, [%rd654]; + // end inline asm + xor.b32 %r30279, %r12924, %r13124; + xor.b32 %r30280, %r12925, %r13125; + add.s32 %r30293, %r30293, 1; + setp.lt.u32 %p23, %r30293, 23; + @%p23 bra $L__BB2_34; + + mov.u32 %r13235, 1; + st.local.v2.u32 [%rd82+32], {%r30291, %r30292}; + st.local.v2.u32 [%rd82+72], {%r30289, %r30290}; + st.local.v2.u32 [%rd82+40], {%r30287, %r30288}; + st.local.v2.u32 [%rd82+80], {%r30285, %r30286}; + st.local.v2.u32 [%rd82+48], {%r30283, %r30284}; + st.local.v2.u32 [%rd82+56], {%r30281, %r30282}; + st.local.v2.u32 [%rd82+24], {%r30279, %r30280}; + // begin inline asm + // xor5 + lop3.b32 %r13136, %r30279, %r30277, %r30275, 0x96; + lop3.b32 %r13136, %r13136, %r30273, %r30271, 0x96; + lop3.b32 %r13137, %r30280, %r30278, %r30276, 0x96; + lop3.b32 %r13137, %r13137, %r30274, %r30272, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 
%r13148, %r30291, %r30289, %r30269, 0x96; + lop3.b32 %r13148, %r13148, %r30267, %r30265, 0x96; + lop3.b32 %r13149, %r30292, %r30290, %r30270, 0x96; + lop3.b32 %r13149, %r13149, %r30268, %r30266, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13160, %r30287, %r30285, %r30263, 0x96; + lop3.b32 %r13160, %r13160, %r30261, %r30259, 0x96; + lop3.b32 %r13161, %r30288, %r30286, %r30264, 0x96; + lop3.b32 %r13161, %r13161, %r30262, %r30260, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13172, %r30283, %r30257, %r30255, 0x96; + lop3.b32 %r13172, %r13172, %r30253, %r30251, 0x96; + lop3.b32 %r13173, %r30284, %r30258, %r30256, 0x96; + lop3.b32 %r13173, %r13173, %r30254, %r30252, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13184, %r30281, %r30249, %r30247, 0x96; + lop3.b32 %r13184, %r13184, %r30245, %r30243, 0x96; + lop3.b32 %r13185, %r30282, %r30250, %r30248, 0x96; + lop3.b32 %r13185, %r13185, %r30246, %r30244, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13196, %r13149, %r13148, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13200, %r13148, %r13149, %r13235; + // end inline asm + xor.b32 %r13374, %r13196, %r13184; + xor.b32 %r13375, %r13200, %r13185; + xor.b32 %r13343, %r30279, %r13374; + xor.b32 %r13346, %r30280, %r13375; + xor.b32 %r13306, %r30276, %r13375; + xor.b32 %r13305, %r30275, %r13374; + st.local.v2.u32 [%rd82+104], {%r13305, %r13306}; + // begin inline asm + shf.l.wrap.b32 %r13204, %r13161, %r13160, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13208, %r13160, %r13161, %r13235; + // end inline asm + xor.b32 %r13376, %r13204, %r13136; + xor.b32 %r13377, %r13208, %r13137; + xor.b32 %r13242, %r30289, %r13376; + xor.b32 %r13241, %r30290, %r13377; + xor.b32 %r13281, %r30268, %r13377; + xor.b32 %r13282, %r30267, %r13376; + st.local.v2.u32 [%rd82+152], {%r13282, %r13281}; + // begin inline asm + shf.l.wrap.b32 %r13212, %r13173, %r13172, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13216, %r13172, %r13173, %r13235; + // end inline asm + xor.b32 %r13378, %r13212, %r13148; + xor.b32 %r13379, %r13216, %r13149; + xor.b32 %r13265, %r30264, %r13379; + xor.b32 %r13266, %r30263, %r13378; + st.local.v2.u32 [%rd82+120], {%r13266, %r13265}; + xor.b32 %r13257, %r30260, %r13379; + xor.b32 %r13258, %r30259, %r13378; + st.local.v2.u32 [%rd82+200], {%r13258, %r13257}; + // begin inline asm + shf.l.wrap.b32 %r13220, %r13185, %r13184, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13224, %r13184, %r13185, %r13235; + // end inline asm + xor.b32 %r13380, %r13220, %r13160; + xor.b32 %r13381, %r13224, %r13161; + xor.b32 %r13289, %r30283, %r13380; + xor.b32 %r13290, %r30284, %r13381; + xor.b32 %r13298, %r30254, %r13381; + xor.b32 %r13297, %r30253, %r13380; + st.local.v2.u32 [%rd82+168], {%r13297, %r13298}; + // begin inline asm + shf.l.wrap.b32 %r13228, %r13137, %r13136, %r13235; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13232, %r13136, %r13137, %r13235; + // end inline asm + xor.b32 %r13382, %r13228, %r13172; + xor.b32 %r13383, %r13232, %r13173; + xor.b32 %r13249, %r30249, %r13382; + xor.b32 %r13250, %r30250, %r13383; + xor.b32 %r13274, %r30244, %r13383; + xor.b32 %r13273, %r30243, %r13382; + st.local.v2.u32 [%rd82+216], {%r13273, %r13274}; + // begin inline asm + shf.l.wrap.b32 %r13236, %r13242, %r13241, %r12739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13240, %r13241, %r13242, %r12739; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r13244, %r13250, %r13249, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13248, %r13249, %r13250, %r12747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13256, %r13257, %r13258, %r12755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13252, %r13258, %r13257, %r12755; + // end inline asm + st.local.v2.u32 [%rd82+96], {%r13252, %r13256}; + // begin inline asm + shf.l.wrap.b32 %r13260, %r13266, %r13265, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13264, %r13265, %r13266, %r12787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13268, %r13274, %r13273, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13272, %r13273, %r13274, %r12835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13280, %r13281, %r13282, %r12859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13276, %r13282, %r13281, %r12859; + // end inline asm + st.local.v2.u32 [%rd82+88], {%r13276, %r13280}; + // begin inline asm + shf.l.wrap.b32 %r13284, %r13290, %r13289, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13288, %r13289, %r13290, %r12875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13292, %r13298, %r13297, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13296, %r13297, %r13298, %r12883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13300, %r13306, %r13305, %r12915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13304, %r13305, %r13306, %r12915; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13308, %r13343, %r13236, %r13260, 0xD2; + lop3.b32 %r13309, %r13346, %r13240, %r13264, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13316, %r13236, %r13260, %r13292, 0xD2; + lop3.b32 %r13317, %r13240, %r13264, %r13296, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+32], {%r13316, %r13317}; + // begin inline asm + // chi + lop3.b32 %r13324, %r13260, %r13292, %r13268, 0xD2; + lop3.b32 %r13325, %r13264, %r13296, %r13272, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+40], {%r13324, %r13325}; + // begin inline asm + // chi + lop3.b32 %r13332, %r13292, %r13268, %r13343, 0xD2; + lop3.b32 %r13333, %r13296, %r13272, %r13346, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+48], {%r13332, %r13333}; + // begin inline asm + // chi + lop3.b32 %r13340, %r13268, %r13343, %r13236, 0xD2; + lop3.b32 %r13341, %r13272, %r13346, %r13240, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+56], {%r13340, %r13341}; + // begin inline asm + // chi + lop3.b32 %r13348, %r13284, %r13244, %r13300, 0xD2; + lop3.b32 %r13349, %r13288, %r13248, %r13304, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+64], {%r13348, %r13349}; + // begin inline asm + // chi + lop3.b32 %r13356, %r13244, %r13300, %r13276, 0xD2; + lop3.b32 %r13357, %r13248, %r13304, %r13280, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+72], {%r13356, %r13357}; + // begin inline asm + // chi + lop3.b32 %r13364, %r13300, %r13276, %r13252, 0xD2; + lop3.b32 %r13365, %r13304, %r13280, %r13256, 0xD2; + // end inline asm + st.local.v2.u32 [%rd82+80], {%r13364, %r13365}; + // begin inline asm + ld.global.nc.v2.u32 {%r13372,%r13373}, [%rd581]; + // end inline asm + xor.b32 %r13384, %r13309, %r13373; + xor.b32 %r13385, %r13308, %r13372; + st.local.v2.u32 [%rd82+24], {%r13385, %r13384}; + mov.b64 %rd1326, {%r13316, %r13317}; + mov.b64 %rd1327, {%r13324, %r13325}; + mov.b64 %rd1330, 
{%r13348, %r13349}; + mov.b64 %rd1331, {%r13356, %r13357}; + mov.b64 %rd1332, {%r13364, %r13365}; + mov.b64 %rd1325, {%r13385, %r13384}; + mov.b64 %rd1328, {%r13332, %r13333}; + mov.b64 %rd1329, {%r13340, %r13341}; + bra.uni $L__BB2_36; + +$L__BB2_14: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd462, 1179641; + st.local.u64 [%rd2+8], %rd462; + st.local.u32 [%rd2+16], %r47; + ld.global.u64 %rd463, [%rd32]; + ld.global.u64 %rd464, [%rd32+8]; + ld.global.u64 %rd465, [%rd32+16]; + ld.global.u64 %rd466, [%rd32+24]; + ld.global.u64 %rd467, [%rd32+32]; + ld.global.u64 %rd468, [%rd32+40]; + ld.global.u64 %rd469, [%rd32+48]; + ld.global.u64 %rd470, [%rd32+56]; + st.local.u64 [%rd2+24], %rd463; + st.local.u64 [%rd2+32], %rd464; + st.local.u64 [%rd2+40], %rd465; + st.local.u64 [%rd2+48], %rd466; + st.local.u64 [%rd2+56], %rd467; + st.local.u64 [%rd2+64], %rd468; + st.local.u64 [%rd2+72], %rd469; + st.local.u64 [%rd2+80], %rd470; + cvt.u32.u64 %r6859, %rd463; + xor.b32 %r6860, %r47, %r6859; + st.local.u32 [%rd2+24], %r6860; + mov.u32 %r29820, 0; + st.local.v2.u32 [%rd2+96], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+104], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+112], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+120], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+128], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+136], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+144], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+152], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+160], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+168], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+176], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+184], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+192], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+200], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+208], {%r29820, %r29820}; + st.local.v2.u32 [%rd2+216], {%r29820, %r29820}; + mov.u32 %r29835, -2147483648; + mov.u32 %r6832, 1; + st.local.v2.u32 [%rd2+88], {%r6832, %r29835}; + ld.local.v2.u32 {%r29856, %r29857}, [%rd2+24]; + mov.b64 {%r29854, %r29855}, %rd468; + shr.u64 %rd471, %rd464, 32; + cvt.u32.u64 %r29868, %rd464; + cvt.u32.u64 %r29869, %rd471; + shr.u64 %rd472, %rd469, 32; + cvt.u32.u64 %r29866, %rd469; + cvt.u32.u64 %r29867, %rd472; + shr.u64 %rd473, %rd465, 32; + cvt.u32.u64 %r29864, %rd465; + cvt.u32.u64 %r29865, %rd473; + shr.u64 %rd474, %rd470, 32; + cvt.u32.u64 %r29862, %rd470; + cvt.u32.u64 %r29863, %rd474; + shr.u64 %rd475, %rd466, 32; + cvt.u32.u64 %r29860, %rd466; + cvt.u32.u64 %r29861, %rd475; + shr.u64 %rd476, %rd467, 32; + cvt.u32.u64 %r29858, %rd467; + cvt.u32.u64 %r29859, %rd476; + mov.u32 %r29821, %r29820; + mov.u32 %r29822, %r29820; + mov.u32 %r29823, %r29820; + mov.u32 %r29824, %r29820; + mov.u32 %r29825, %r29820; + mov.u32 %r29826, %r29820; + mov.u32 %r29827, %r29820; + mov.u32 %r29828, %r29820; + mov.u32 %r29829, %r29820; + mov.u32 %r29830, %r29820; + mov.u32 %r29831, %r29820; + mov.u32 %r29832, %r29820; + mov.u32 %r29833, %r29820; + mov.u32 %r29834, %r6832; + mov.u32 %r29836, %r29820; + mov.u32 %r29837, %r29820; + mov.u32 %r29838, %r29820; + mov.u32 %r29839, %r29820; + mov.u32 %r29840, %r29820; + mov.u32 %r29841, %r29820; + mov.u32 %r29842, %r29820; + mov.u32 %r29843, %r29820; + mov.u32 %r29844, %r29820; + mov.u32 %r29845, %r29820; + mov.u32 %r29846, %r29820; + mov.u32 %r29847, %r29820; + mov.u32 %r29848, %r29820; + mov.u32 %r29849, %r29820; + mov.u32 %r29850, %r29820; + mov.u32 %r29851, %r29820; + mov.u32 %r29852, %r29820; + mov.u32 %r29853, %r29820; + mov.u32 %r29870, %r29820; + +$L__BB2_15: + // begin inline asm + // 
xor5 + lop3.b32 %r6863, %r29856, %r29854, %r29852, 0x96; + lop3.b32 %r6863, %r6863, %r29850, %r29848, 0x96; + lop3.b32 %r6864, %r29857, %r29855, %r29853, 0x96; + lop3.b32 %r6864, %r6864, %r29851, %r29849, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6875, %r29868, %r29866, %r29846, 0x96; + lop3.b32 %r6875, %r6875, %r29844, %r29842, 0x96; + lop3.b32 %r6876, %r29869, %r29867, %r29847, 0x96; + lop3.b32 %r6876, %r6876, %r29845, %r29843, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6887, %r29864, %r29862, %r29840, 0x96; + lop3.b32 %r6887, %r6887, %r29838, %r29836, 0x96; + lop3.b32 %r6888, %r29865, %r29863, %r29841, 0x96; + lop3.b32 %r6888, %r6888, %r29839, %r29837, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6899, %r29860, %r29834, %r29832, 0x96; + lop3.b32 %r6899, %r6899, %r29830, %r29828, 0x96; + lop3.b32 %r6900, %r29861, %r29835, %r29833, 0x96; + lop3.b32 %r6900, %r6900, %r29831, %r29829, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6911, %r29858, %r29826, %r29824, 0x96; + lop3.b32 %r6911, %r6911, %r29822, %r29820, 0x96; + lop3.b32 %r6912, %r29859, %r29827, %r29825, 0x96; + lop3.b32 %r6912, %r6912, %r29823, %r29821, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6923, %r6876, %r6875, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6927, %r6875, %r6876, %r6832; + // end inline asm + xor.b32 %r7357, %r6923, %r6911; + xor.b32 %r7358, %r6927, %r6912; + xor.b32 %r7190, %r29856, %r7357; + xor.b32 %r7193, %r29857, %r7358; + xor.b32 %r7097, %r29854, %r7357; + xor.b32 %r7096, %r29855, %r7358; + xor.b32 %r7144, %r29852, %r7357; + xor.b32 %r7145, %r29853, %r7358; + xor.b32 %r7049, %r29850, %r7357; + xor.b32 %r7048, %r29851, %r7358; + xor.b32 %r7000, %r29848, %r7357; + xor.b32 %r7001, %r29849, %r7358; + // begin inline asm + shf.l.wrap.b32 %r6931, %r6888, %r6887, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6935, %r6887, %r6888, %r6832; + // end inline asm + xor.b32 %r7359, %r6931, %r6863; + xor.b32 %r7360, %r6935, %r6864; + xor.b32 %r7152, %r29868, %r7359; + xor.b32 %r7153, %r29869, %r7360; + xor.b32 %r6969, %r29866, %r7359; + xor.b32 %r6968, %r29867, %r7360; + xor.b32 %r7128, %r29846, %r7359; + xor.b32 %r7129, %r29847, %r7360; + xor.b32 %r7089, %r29844, %r7359; + xor.b32 %r7088, %r29845, %r7360; + xor.b32 %r7072, %r29842, %r7359; + xor.b32 %r7073, %r29843, %r7360; + // begin inline asm + shf.l.wrap.b32 %r6939, %r6900, %r6899, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6943, %r6899, %r6900, %r6832; + // end inline asm + xor.b32 %r7361, %r6939, %r6875; + xor.b32 %r7362, %r6943, %r6876; + xor.b32 %r7009, %r29864, %r7361; + xor.b32 %r7008, %r29865, %r7362; + xor.b32 %r7136, %r29862, %r7361; + xor.b32 %r7137, %r29863, %r7362; + xor.b32 %r7017, %r29840, %r7361; + xor.b32 %r7016, %r29841, %r7362; + xor.b32 %r7120, %r29838, %r7361; + xor.b32 %r7121, %r29839, %r7362; + xor.b32 %r6985, %r29836, %r7361; + xor.b32 %r6984, %r29837, %r7362; + // begin inline asm + shf.l.wrap.b32 %r6947, %r6912, %r6911, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6951, %r6911, %r6912, %r6832; + // end inline asm + xor.b32 %r7363, %r6947, %r6887; + xor.b32 %r7364, %r6951, %r6888; + xor.b32 %r7104, %r29860, %r7363; + xor.b32 %r7105, %r29861, %r7364; + xor.b32 %r7081, %r29834, %r7363; + xor.b32 %r7080, %r29835, %r7364; + xor.b32 %r7024, %r29832, %r7363; + xor.b32 %r7025, %r29833, %r7364; + xor.b32 %r7112, %r29830, 
%r7363; + xor.b32 %r7113, %r29831, %r7364; + xor.b32 %r7041, %r29828, %r7363; + xor.b32 %r7040, %r29829, %r7364; + // begin inline asm + shf.l.wrap.b32 %r6955, %r6864, %r6863, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6959, %r6863, %r6864, %r6832; + // end inline asm + xor.b32 %r7365, %r6955, %r6899; + xor.b32 %r7366, %r6959, %r6900; + xor.b32 %r7056, %r29858, %r7365; + xor.b32 %r7057, %r29859, %r7366; + xor.b32 %r6976, %r29826, %r7365; + xor.b32 %r6977, %r29827, %r7366; + xor.b32 %r6993, %r29824, %r7365; + xor.b32 %r6992, %r29825, %r7366; + xor.b32 %r7032, %r29822, %r7365; + xor.b32 %r7033, %r29823, %r7366; + xor.b32 %r7064, %r29820, %r7365; + xor.b32 %r7065, %r29821, %r7366; + mov.u32 %r6970, 44; + // begin inline asm + shf.l.wrap.b32 %r6963, %r6969, %r6968, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6967, %r6968, %r6969, %r6970; + // end inline asm + mov.u32 %r6978, 20; + // begin inline asm + shf.l.wrap.b32 %r6971, %r6977, %r6976, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6975, %r6976, %r6977, %r6978; + // end inline asm + mov.u32 %r6986, 61; + // begin inline asm + shf.l.wrap.b32 %r6979, %r6985, %r6984, %r6986; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6983, %r6984, %r6985, %r6986; + // end inline asm + mov.u32 %r6994, 39; + // begin inline asm + shf.l.wrap.b32 %r6987, %r6993, %r6992, %r6994; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6991, %r6992, %r6993, %r6994; + // end inline asm + mov.u32 %r7002, 18; + // begin inline asm + shf.l.wrap.b32 %r6995, %r7001, %r7000, %r7002; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r6999, %r7000, %r7001, %r7002; + // end inline asm + mov.u32 %r7010, 62; + // begin inline asm + shf.l.wrap.b32 %r7003, %r7009, %r7008, %r7010; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7007, %r7008, %r7009, %r7010; + // end inline asm + mov.u32 %r7018, 43; + // begin inline asm + shf.l.wrap.b32 %r7011, %r7017, %r7016, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7015, %r7016, %r7017, %r7018; + // end inline asm + mov.u32 %r7026, 25; + // begin inline asm + shf.l.wrap.b32 %r7019, %r7025, %r7024, %r7026; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7023, %r7024, %r7025, %r7026; + // end inline asm + mov.u32 %r7034, 8; + // begin inline asm + shf.l.wrap.b32 %r7027, %r7033, %r7032, %r7034; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7031, %r7032, %r7033, %r7034; + // end inline asm + mov.u32 %r7042, 56; + // begin inline asm + shf.l.wrap.b32 %r7035, %r7041, %r7040, %r7042; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7039, %r7040, %r7041, %r7042; + // end inline asm + mov.u32 %r7050, 41; + // begin inline asm + shf.l.wrap.b32 %r7043, %r7049, %r7048, %r7050; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7047, %r7048, %r7049, %r7050; + // end inline asm + mov.u32 %r7058, 27; + // begin inline asm + shf.l.wrap.b32 %r7051, %r7057, %r7056, %r7058; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7055, %r7056, %r7057, %r7058; + // end inline asm + mov.u32 %r7066, 14; + // begin inline asm + shf.l.wrap.b32 %r7059, %r7065, %r7064, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7063, %r7064, %r7065, %r7066; + // end inline asm + mov.u32 %r7074, 2; + // begin inline asm + shf.l.wrap.b32 %r7067, %r7073, %r7072, %r7074; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7071, %r7072, %r7073, %r7074; + // 
end inline asm + mov.u32 %r7082, 55; + // begin inline asm + shf.l.wrap.b32 %r7075, %r7081, %r7080, %r7082; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7079, %r7080, %r7081, %r7082; + // end inline asm + mov.u32 %r7090, 45; + // begin inline asm + shf.l.wrap.b32 %r7083, %r7089, %r7088, %r7090; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7087, %r7088, %r7089, %r7090; + // end inline asm + mov.u32 %r7098, 36; + // begin inline asm + shf.l.wrap.b32 %r7091, %r7097, %r7096, %r7098; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7095, %r7096, %r7097, %r7098; + // end inline asm + mov.u32 %r7106, 28; + // begin inline asm + shf.l.wrap.b32 %r7099, %r7105, %r7104, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7103, %r7104, %r7105, %r7106; + // end inline asm + mov.u32 %r7114, 21; + // begin inline asm + shf.l.wrap.b32 %r7107, %r7113, %r7112, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7111, %r7112, %r7113, %r7114; + // end inline asm + mov.u32 %r7122, 15; + // begin inline asm + shf.l.wrap.b32 %r7115, %r7121, %r7120, %r7122; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7119, %r7120, %r7121, %r7122; + // end inline asm + mov.u32 %r7130, 10; + // begin inline asm + shf.l.wrap.b32 %r7123, %r7129, %r7128, %r7130; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7127, %r7128, %r7129, %r7130; + // end inline asm + mov.u32 %r7138, 6; + // begin inline asm + shf.l.wrap.b32 %r7131, %r7137, %r7136, %r7138; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7135, %r7136, %r7137, %r7138; + // end inline asm + mov.u32 %r7146, 3; + // begin inline asm + shf.l.wrap.b32 %r7139, %r7145, %r7144, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7143, %r7144, %r7145, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7147, %r7153, %r7152, %r6832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7151, %r7152, %r7153, %r6832; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7155, %r7190, %r6963, %r7011, 0xD2; + lop3.b32 %r7156, %r7193, %r6967, %r7015, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29868, %r6963, %r7011, %r7107, 0xD2; + lop3.b32 %r29869, %r6967, %r7015, %r7111, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29864, %r7011, %r7107, %r7059, 0xD2; + lop3.b32 %r29865, %r7015, %r7111, %r7063, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29860, %r7107, %r7059, %r7190, 0xD2; + lop3.b32 %r29861, %r7111, %r7063, %r7193, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29858, %r7059, %r7190, %r6963, 0xD2; + lop3.b32 %r29859, %r7063, %r7193, %r6967, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29854, %r7099, %r6971, %r7139, 0xD2; + lop3.b32 %r29855, %r7103, %r6975, %r7143, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29866, %r6971, %r7139, %r7083, 0xD2; + lop3.b32 %r29867, %r6975, %r7143, %r7087, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29862, %r7139, %r7083, %r6979, 0xD2; + lop3.b32 %r29863, %r7143, %r7087, %r6983, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29834, %r7083, %r6979, %r7099, 0xD2; + lop3.b32 %r29835, %r7087, %r6983, %r7103, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r29834, %r29835}; + // begin inline asm + // chi + lop3.b32 %r29826, %r6979, %r7099, %r6971, 0xD2; + lop3.b32 %r29827, %r6983, %r7103, 
%r6975, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r29826, %r29827}; + // begin inline asm + // chi + lop3.b32 %r29852, %r7147, %r7131, %r7019, 0xD2; + lop3.b32 %r29853, %r7151, %r7135, %r7023, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r29852, %r29853}; + // begin inline asm + // chi + lop3.b32 %r29846, %r7131, %r7019, %r7027, 0xD2; + lop3.b32 %r29847, %r7135, %r7023, %r7031, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r29846, %r29847}; + // begin inline asm + // chi + lop3.b32 %r29840, %r7019, %r7027, %r6995, 0xD2; + lop3.b32 %r29841, %r7023, %r7031, %r6999, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r29840, %r29841}; + // begin inline asm + // chi + lop3.b32 %r29832, %r7027, %r6995, %r7147, 0xD2; + lop3.b32 %r29833, %r7031, %r6999, %r7151, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r29832, %r29833}; + // begin inline asm + // chi + lop3.b32 %r29824, %r6995, %r7147, %r7131, 0xD2; + lop3.b32 %r29825, %r6999, %r7151, %r7135, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r29824, %r29825}; + // begin inline asm + // chi + lop3.b32 %r29850, %r7051, %r7091, %r7123, 0xD2; + lop3.b32 %r29851, %r7055, %r7095, %r7127, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r29850, %r29851}; + // begin inline asm + // chi + lop3.b32 %r29844, %r7091, %r7123, %r7115, 0xD2; + lop3.b32 %r29845, %r7095, %r7127, %r7119, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r29844, %r29845}; + // begin inline asm + // chi + lop3.b32 %r29838, %r7123, %r7115, %r7035, 0xD2; + lop3.b32 %r29839, %r7127, %r7119, %r7039, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r29838, %r29839}; + // begin inline asm + // chi + lop3.b32 %r29830, %r7115, %r7035, %r7051, 0xD2; + lop3.b32 %r29831, %r7119, %r7039, %r7055, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r29830, %r29831}; + // begin inline asm + // chi + lop3.b32 %r29822, %r7035, %r7051, %r7091, 0xD2; + lop3.b32 %r29823, %r7039, %r7055, %r7095, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r29822, %r29823}; + // begin inline asm + // chi + lop3.b32 %r29848, %r7003, %r7075, %r6987, 0xD2; + lop3.b32 %r29849, %r7007, %r7079, %r6991, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r29848, %r29849}; + // begin inline asm + // chi + lop3.b32 %r29842, %r7075, %r6987, %r7043, 0xD2; + lop3.b32 %r29843, %r7079, %r6991, %r7047, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r29842, %r29843}; + // begin inline asm + // chi + lop3.b32 %r29836, %r6987, %r7043, %r7067, 0xD2; + lop3.b32 %r29837, %r6991, %r7047, %r7071, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r29836, %r29837}; + // begin inline asm + // chi + lop3.b32 %r29828, %r7043, %r7067, %r7003, 0xD2; + lop3.b32 %r29829, %r7047, %r7071, %r7007, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r29828, %r29829}; + // begin inline asm + // chi + lop3.b32 %r29820, %r7067, %r7003, %r7075, 0xD2; + lop3.b32 %r29821, %r7071, %r7007, %r7079, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r29820, %r29821}; + mul.wide.s32 %rd478, %r29870, 8; + mov.u64 %rd479, keccak_round_constants; + cvta.const.u64 %rd480, %rd479; + add.s64 %rd477, %rd480, %rd478; + // begin inline asm + ld.global.nc.v2.u32 {%r7355,%r7356}, [%rd477]; + // end inline asm + xor.b32 %r29856, %r7155, %r7355; + xor.b32 %r29857, %r7156, %r7356; + add.s32 %r29870, %r29870, 1; + setp.lt.u32 %p14, %r29870, 23; + @%p14 bra $L__BB2_15; + + add.u64 %rd53, %SPL, 1912; + st.local.v2.u32 [%rd2+32], 
{%r29868, %r29869}; + st.local.v2.u32 [%rd2+72], {%r29866, %r29867}; + st.local.v2.u32 [%rd2+40], {%r29864, %r29865}; + st.local.v2.u32 [%rd2+80], {%r29862, %r29863}; + st.local.v2.u32 [%rd2+48], {%r29860, %r29861}; + st.local.v2.u32 [%rd2+56], {%r29858, %r29859}; + st.local.v2.u32 [%rd2+24], {%r29856, %r29857}; + // begin inline asm + // xor5 + lop3.b32 %r7367, %r29856, %r29854, %r29852, 0x96; + lop3.b32 %r7367, %r7367, %r29850, %r29848, 0x96; + lop3.b32 %r7368, %r29857, %r29855, %r29853, 0x96; + lop3.b32 %r7368, %r7368, %r29851, %r29849, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7379, %r29868, %r29866, %r29846, 0x96; + lop3.b32 %r7379, %r7379, %r29844, %r29842, 0x96; + lop3.b32 %r7380, %r29869, %r29867, %r29847, 0x96; + lop3.b32 %r7380, %r7380, %r29845, %r29843, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7391, %r29864, %r29862, %r29840, 0x96; + lop3.b32 %r7391, %r7391, %r29838, %r29836, 0x96; + lop3.b32 %r7392, %r29865, %r29863, %r29841, 0x96; + lop3.b32 %r7392, %r7392, %r29839, %r29837, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7403, %r29860, %r29834, %r29832, 0x96; + lop3.b32 %r7403, %r7403, %r29830, %r29828, 0x96; + lop3.b32 %r7404, %r29861, %r29835, %r29833, 0x96; + lop3.b32 %r7404, %r7404, %r29831, %r29829, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7415, %r29858, %r29826, %r29824, 0x96; + lop3.b32 %r7415, %r7415, %r29822, %r29820, 0x96; + lop3.b32 %r7416, %r29859, %r29827, %r29825, 0x96; + lop3.b32 %r7416, %r7416, %r29823, %r29821, 0x96; + // end inline asm + mov.u32 %r7619, 1; + // begin inline asm + shf.l.wrap.b32 %r7427, %r7380, %r7379, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7431, %r7379, %r7380, %r7619; + // end inline asm + xor.b32 %r7646, %r7427, %r7415; + xor.b32 %r7647, %r7431, %r7416; + xor.b32 %r7574, %r29856, %r7646; + xor.b32 %r7577, %r29857, %r7647; + xor.b32 %r7537, %r29853, %r7647; + xor.b32 %r7536, %r29852, %r7646; + st.local.v2.u32 [%rd2+104], {%r7536, %r7537}; + // begin inline asm + shf.l.wrap.b32 %r7435, %r7392, %r7391, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7439, %r7391, %r7392, %r7619; + // end inline asm + xor.b32 %r7648, %r7435, %r7367; + xor.b32 %r7649, %r7439, %r7368; + xor.b32 %r7473, %r29866, %r7648; + xor.b32 %r7472, %r29867, %r7649; + xor.b32 %r7512, %r29845, %r7649; + xor.b32 %r7513, %r29844, %r7648; + st.local.v2.u32 [%rd2+152], {%r7513, %r7512}; + // begin inline asm + shf.l.wrap.b32 %r7443, %r7404, %r7403, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7447, %r7403, %r7404, %r7619; + // end inline asm + xor.b32 %r7650, %r7443, %r7379; + xor.b32 %r7651, %r7447, %r7380; + xor.b32 %r7496, %r29841, %r7651; + xor.b32 %r7497, %r29840, %r7650; + st.local.v2.u32 [%rd2+120], {%r7497, %r7496}; + xor.b32 %r7488, %r29837, %r7651; + xor.b32 %r7489, %r29836, %r7650; + st.local.v2.u32 [%rd2+200], {%r7489, %r7488}; + // begin inline asm + shf.l.wrap.b32 %r7451, %r7416, %r7415, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7455, %r7415, %r7416, %r7619; + // end inline asm + xor.b32 %r7652, %r7451, %r7391; + xor.b32 %r7653, %r7455, %r7392; + xor.b32 %r7520, %r29860, %r7652; + xor.b32 %r7521, %r29861, %r7653; + xor.b32 %r7529, %r29831, %r7653; + xor.b32 %r7528, %r29830, %r7652; + st.local.v2.u32 [%rd2+168], {%r7528, %r7529}; + // begin inline asm + shf.l.wrap.b32 %r7459, %r7368, %r7367, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r7463, %r7367, %r7368, %r7619; + // end inline asm + xor.b32 %r7654, %r7459, %r7403; + xor.b32 %r7655, %r7463, %r7404; + xor.b32 %r7480, %r29826, %r7654; + xor.b32 %r7481, %r29827, %r7655; + xor.b32 %r7505, %r29821, %r7655; + xor.b32 %r7504, %r29820, %r7654; + st.local.v2.u32 [%rd2+216], {%r7504, %r7505}; + // begin inline asm + shf.l.wrap.b32 %r7467, %r7473, %r7472, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7471, %r7472, %r7473, %r6970; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7475, %r7481, %r7480, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7479, %r7480, %r7481, %r6978; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7487, %r7488, %r7489, %r6986; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7483, %r7489, %r7488, %r6986; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r7483, %r7487}; + // begin inline asm + shf.l.wrap.b32 %r7491, %r7497, %r7496, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7495, %r7496, %r7497, %r7018; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7499, %r7505, %r7504, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7503, %r7504, %r7505, %r7066; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7511, %r7512, %r7513, %r7090; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7507, %r7513, %r7512, %r7090; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r7507, %r7511}; + // begin inline asm + shf.l.wrap.b32 %r7515, %r7521, %r7520, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7519, %r7520, %r7521, %r7106; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7523, %r7529, %r7528, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7527, %r7528, %r7529, %r7114; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7531, %r7537, %r7536, %r7146; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7535, %r7536, %r7537, %r7146; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7539, %r7574, %r7467, %r7491, 0xD2; + lop3.b32 %r7540, %r7577, %r7471, %r7495, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30003, %r7467, %r7491, %r7523, 0xD2; + lop3.b32 %r30004, %r7471, %r7495, %r7527, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30003, %r30004}; + // begin inline asm + // chi + lop3.b32 %r29999, %r7491, %r7523, %r7499, 0xD2; + lop3.b32 %r30000, %r7495, %r7527, %r7503, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r29999, %r30000}; + // begin inline asm + // chi + lop3.b32 %r29995, %r7523, %r7499, %r7574, 0xD2; + lop3.b32 %r29996, %r7527, %r7503, %r7577, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r29995, %r29996}; + // begin inline asm + // chi + lop3.b32 %r29993, %r7499, %r7574, %r7467, 0xD2; + lop3.b32 %r29994, %r7503, %r7577, %r7471, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r29993, %r29994}; + // begin inline asm + // chi + lop3.b32 %r29989, %r7515, %r7475, %r7531, 0xD2; + lop3.b32 %r29990, %r7519, %r7479, %r7535, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r29989, %r29990}; + // begin inline asm + // chi + lop3.b32 %r30001, %r7475, %r7531, %r7507, 0xD2; + lop3.b32 %r30002, %r7479, %r7535, %r7511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30001, %r30002}; + // begin inline asm + // chi + lop3.b32 %r29997, %r7531, %r7507, %r7483, 0xD2; + lop3.b32 %r29998, %r7535, %r7511, %r7487, 0xD2; + // end inline asm + 
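+ // Rounds 0-22 of Keccak-f[1600] ran in the $L__BB2_15 loop above; the block
+ // below finishes round 23, whose iota loads the last entry of
+ // keccak_round_constants (byte offset 184 = 23 * 8). A second sponge state
+ // is then initialized at [%rd53] from [%rd33], with counter %r243 = %r47 + 1.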
st.local.v2.u32 [%rd2+80], {%r29997, %r29998}; + add.s64 %rd481, %rd480, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r7603,%r7604}, [%rd481]; + // end inline asm + xor.b32 %r29991, %r7539, %r7603; + xor.b32 %r29992, %r7540, %r7604; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + st.local.u64 [%rd53], %rd361; + mov.u64 %rd485, 1179641; + st.local.u64 [%rd53+8], %rd485; + add.s32 %r243, %r47, 1; + st.local.u32 [%rd53+16], %r243; + ld.global.u64 %rd486, [%rd33]; + ld.global.u64 %rd487, [%rd33+8]; + ld.global.u64 %rd488, [%rd33+16]; + ld.global.u64 %rd489, [%rd33+24]; + ld.global.u64 %rd490, [%rd33+32]; + ld.global.u64 %rd491, [%rd33+40]; + ld.global.u64 %rd492, [%rd33+48]; + ld.global.u64 %rd493, [%rd33+56]; + st.local.u64 [%rd53+32], %rd487; + st.local.u64 [%rd53+40], %rd488; + st.local.u64 [%rd53+48], %rd489; + st.local.u64 [%rd53+56], %rd490; + st.local.u64 [%rd53+64], %rd491; + st.local.u64 [%rd53+72], %rd492; + st.local.u64 [%rd53+80], %rd493; + cvt.u32.u64 %r7656, %rd486; + xor.b32 %r7657, %r243, %r7656; + st.local.u64 [%rd53+24], %rd486; + st.local.u32 [%rd53+24], %r7657; + mov.u32 %r29871, 0; + st.local.v2.u32 [%rd53+96], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+104], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+112], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+120], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+128], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+136], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+144], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+152], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+160], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+168], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+176], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+184], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+192], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+200], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+208], {%r29871, %r29871}; + st.local.v2.u32 [%rd53+216], {%r29871, %r29871}; + mov.u32 %r29886, -2147483648; + st.local.v2.u32 [%rd53+88], {%r7619, %r29886}; + ld.local.v2.u32 {%r29907, %r29908}, [%rd53+24]; + mov.b64 {%r29905, %r29906}, %rd491; + shr.u64 %rd494, %rd487, 32; + cvt.u32.u64 %r29919, %rd487; + cvt.u32.u64 %r29920, %rd494; + shr.u64 %rd495, %rd492, 32; + cvt.u32.u64 %r29917, %rd492; + cvt.u32.u64 %r29918, %rd495; + shr.u64 %rd496, %rd488, 32; + cvt.u32.u64 %r29915, %rd488; + cvt.u32.u64 %r29916, %rd496; + shr.u64 %rd497, %rd493, 32; + cvt.u32.u64 %r29913, %rd493; + cvt.u32.u64 %r29914, %rd497; + shr.u64 %rd498, %rd489, 32; + cvt.u32.u64 %r29911, %rd489; + cvt.u32.u64 %r29912, %rd498; + shr.u64 %rd499, %rd490, 32; + cvt.u32.u64 %r29909, %rd490; + cvt.u32.u64 %r29910, %rd499; + mov.u32 %r29872, %r29871; + mov.u32 %r29873, %r29871; + mov.u32 %r29874, %r29871; + mov.u32 %r29875, %r29871; + mov.u32 %r29876, %r29871; + mov.u32 %r29877, %r29871; + mov.u32 %r29878, %r29871; + mov.u32 %r29879, %r29871; + mov.u32 %r29880, %r29871; + mov.u32 %r29881, %r29871; + mov.u32 %r29882, %r29871; + mov.u32 %r29883, %r29871; + mov.u32 %r29884, %r29871; + mov.u32 %r29885, %r7619; + mov.u32 %r29887, %r29871; + mov.u32 %r29888, %r29871; + mov.u32 %r29889, %r29871; + mov.u32 %r29890, %r29871; + mov.u32 %r29891, %r29871; + mov.u32 %r29892, %r29871; + mov.u32 %r29893, %r29871; + mov.u32 %r29894, %r29871; + mov.u32 %r29895, %r29871; + mov.u32 %r29896, %r29871; + mov.u32 %r29897, %r29871; + mov.u32 %r29898, %r29871; + mov.u32 %r29899, %r29871; + mov.u32 %r29900, %r29871; + mov.u32 %r29901, %r29871; + mov.u32 %r29902, %r29871; + mov.u32 %r29903, %r29871; + mov.u32 %r29904, 
%r29871; + mov.u32 %r29921, %r29871; + +$L__BB2_17: + // begin inline asm + // xor5 + lop3.b32 %r7660, %r29907, %r29905, %r29903, 0x96; + lop3.b32 %r7660, %r7660, %r29901, %r29899, 0x96; + lop3.b32 %r7661, %r29908, %r29906, %r29904, 0x96; + lop3.b32 %r7661, %r7661, %r29902, %r29900, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7672, %r29919, %r29917, %r29897, 0x96; + lop3.b32 %r7672, %r7672, %r29895, %r29893, 0x96; + lop3.b32 %r7673, %r29920, %r29918, %r29898, 0x96; + lop3.b32 %r7673, %r7673, %r29896, %r29894, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7684, %r29915, %r29913, %r29891, 0x96; + lop3.b32 %r7684, %r7684, %r29889, %r29887, 0x96; + lop3.b32 %r7685, %r29916, %r29914, %r29892, 0x96; + lop3.b32 %r7685, %r7685, %r29890, %r29888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7696, %r29911, %r29885, %r29883, 0x96; + lop3.b32 %r7696, %r7696, %r29881, %r29879, 0x96; + lop3.b32 %r7697, %r29912, %r29886, %r29884, 0x96; + lop3.b32 %r7697, %r7697, %r29882, %r29880, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7708, %r29909, %r29877, %r29875, 0x96; + lop3.b32 %r7708, %r7708, %r29873, %r29871, 0x96; + lop3.b32 %r7709, %r29910, %r29878, %r29876, 0x96; + lop3.b32 %r7709, %r7709, %r29874, %r29872, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7720, %r7673, %r7672, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7724, %r7672, %r7673, %r7619; + // end inline asm + xor.b32 %r8154, %r7720, %r7708; + xor.b32 %r8155, %r7724, %r7709; + xor.b32 %r7987, %r29907, %r8154; + xor.b32 %r7990, %r29908, %r8155; + xor.b32 %r7894, %r29905, %r8154; + xor.b32 %r7893, %r29906, %r8155; + xor.b32 %r7941, %r29903, %r8154; + xor.b32 %r7942, %r29904, %r8155; + xor.b32 %r7846, %r29901, %r8154; + xor.b32 %r7845, %r29902, %r8155; + xor.b32 %r7797, %r29899, %r8154; + xor.b32 %r7798, %r29900, %r8155; + // begin inline asm + shf.l.wrap.b32 %r7728, %r7685, %r7684, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7732, %r7684, %r7685, %r7619; + // end inline asm + xor.b32 %r8156, %r7728, %r7660; + xor.b32 %r8157, %r7732, %r7661; + xor.b32 %r7949, %r29919, %r8156; + xor.b32 %r7950, %r29920, %r8157; + xor.b32 %r7766, %r29917, %r8156; + xor.b32 %r7765, %r29918, %r8157; + xor.b32 %r7925, %r29897, %r8156; + xor.b32 %r7926, %r29898, %r8157; + xor.b32 %r7886, %r29895, %r8156; + xor.b32 %r7885, %r29896, %r8157; + xor.b32 %r7869, %r29893, %r8156; + xor.b32 %r7870, %r29894, %r8157; + // begin inline asm + shf.l.wrap.b32 %r7736, %r7697, %r7696, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7740, %r7696, %r7697, %r7619; + // end inline asm + xor.b32 %r8158, %r7736, %r7672; + xor.b32 %r8159, %r7740, %r7673; + xor.b32 %r7806, %r29915, %r8158; + xor.b32 %r7805, %r29916, %r8159; + xor.b32 %r7933, %r29913, %r8158; + xor.b32 %r7934, %r29914, %r8159; + xor.b32 %r7814, %r29891, %r8158; + xor.b32 %r7813, %r29892, %r8159; + xor.b32 %r7917, %r29889, %r8158; + xor.b32 %r7918, %r29890, %r8159; + xor.b32 %r7782, %r29887, %r8158; + xor.b32 %r7781, %r29888, %r8159; + // begin inline asm + shf.l.wrap.b32 %r7744, %r7709, %r7708, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7748, %r7708, %r7709, %r7619; + // end inline asm + xor.b32 %r8160, %r7744, %r7684; + xor.b32 %r8161, %r7748, %r7685; + xor.b32 %r7901, %r29911, %r8160; + xor.b32 %r7902, %r29912, %r8161; + xor.b32 %r7878, %r29885, %r8160; + xor.b32 %r7877, %r29886, %r8161; + xor.b32 %r7821, 
%r29883, %r8160; + xor.b32 %r7822, %r29884, %r8161; + xor.b32 %r7909, %r29881, %r8160; + xor.b32 %r7910, %r29882, %r8161; + xor.b32 %r7838, %r29879, %r8160; + xor.b32 %r7837, %r29880, %r8161; + // begin inline asm + shf.l.wrap.b32 %r7752, %r7661, %r7660, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7756, %r7660, %r7661, %r7619; + // end inline asm + xor.b32 %r8162, %r7752, %r7696; + xor.b32 %r8163, %r7756, %r7697; + xor.b32 %r7853, %r29909, %r8162; + xor.b32 %r7854, %r29910, %r8163; + xor.b32 %r7773, %r29877, %r8162; + xor.b32 %r7774, %r29878, %r8163; + xor.b32 %r7790, %r29875, %r8162; + xor.b32 %r7789, %r29876, %r8163; + xor.b32 %r7829, %r29873, %r8162; + xor.b32 %r7830, %r29874, %r8163; + xor.b32 %r7861, %r29871, %r8162; + xor.b32 %r7862, %r29872, %r8163; + mov.u32 %r7767, 44; + // begin inline asm + shf.l.wrap.b32 %r7760, %r7766, %r7765, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7764, %r7765, %r7766, %r7767; + // end inline asm + mov.u32 %r7775, 20; + // begin inline asm + shf.l.wrap.b32 %r7768, %r7774, %r7773, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7772, %r7773, %r7774, %r7775; + // end inline asm + mov.u32 %r7783, 61; + // begin inline asm + shf.l.wrap.b32 %r7776, %r7782, %r7781, %r7783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7780, %r7781, %r7782, %r7783; + // end inline asm + mov.u32 %r7791, 39; + // begin inline asm + shf.l.wrap.b32 %r7784, %r7790, %r7789, %r7791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7788, %r7789, %r7790, %r7791; + // end inline asm + mov.u32 %r7799, 18; + // begin inline asm + shf.l.wrap.b32 %r7792, %r7798, %r7797, %r7799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7796, %r7797, %r7798, %r7799; + // end inline asm + mov.u32 %r7807, 62; + // begin inline asm + shf.l.wrap.b32 %r7800, %r7806, %r7805, %r7807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7804, %r7805, %r7806, %r7807; + // end inline asm + mov.u32 %r7815, 43; + // begin inline asm + shf.l.wrap.b32 %r7808, %r7814, %r7813, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7812, %r7813, %r7814, %r7815; + // end inline asm + mov.u32 %r7823, 25; + // begin inline asm + shf.l.wrap.b32 %r7816, %r7822, %r7821, %r7823; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7820, %r7821, %r7822, %r7823; + // end inline asm + mov.u32 %r7831, 8; + // begin inline asm + shf.l.wrap.b32 %r7824, %r7830, %r7829, %r7831; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7828, %r7829, %r7830, %r7831; + // end inline asm + mov.u32 %r7839, 56; + // begin inline asm + shf.l.wrap.b32 %r7832, %r7838, %r7837, %r7839; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7836, %r7837, %r7838, %r7839; + // end inline asm + mov.u32 %r7847, 41; + // begin inline asm + shf.l.wrap.b32 %r7840, %r7846, %r7845, %r7847; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7844, %r7845, %r7846, %r7847; + // end inline asm + mov.u32 %r7855, 27; + // begin inline asm + shf.l.wrap.b32 %r7848, %r7854, %r7853, %r7855; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7852, %r7853, %r7854, %r7855; + // end inline asm + mov.u32 %r7863, 14; + // begin inline asm + shf.l.wrap.b32 %r7856, %r7862, %r7861, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7860, %r7861, %r7862, %r7863; + // end inline asm + mov.u32 %r7871, 2; + // begin inline asm + shf.l.wrap.b32 %r7864, %r7870, %r7869, %r7871; + // end inline asm 
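+ // rho: the matching shf.l.wrap.b32 with its two source halves swapped
+ // produces the other 32-bit word of each rotated 64-bit lane.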
+ // begin inline asm + shf.l.wrap.b32 %r7868, %r7869, %r7870, %r7871; + // end inline asm + mov.u32 %r7879, 55; + // begin inline asm + shf.l.wrap.b32 %r7872, %r7878, %r7877, %r7879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7876, %r7877, %r7878, %r7879; + // end inline asm + mov.u32 %r7887, 45; + // begin inline asm + shf.l.wrap.b32 %r7880, %r7886, %r7885, %r7887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7884, %r7885, %r7886, %r7887; + // end inline asm + mov.u32 %r7895, 36; + // begin inline asm + shf.l.wrap.b32 %r7888, %r7894, %r7893, %r7895; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7892, %r7893, %r7894, %r7895; + // end inline asm + mov.u32 %r7903, 28; + // begin inline asm + shf.l.wrap.b32 %r7896, %r7902, %r7901, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7900, %r7901, %r7902, %r7903; + // end inline asm + mov.u32 %r7911, 21; + // begin inline asm + shf.l.wrap.b32 %r7904, %r7910, %r7909, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7908, %r7909, %r7910, %r7911; + // end inline asm + mov.u32 %r7919, 15; + // begin inline asm + shf.l.wrap.b32 %r7912, %r7918, %r7917, %r7919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7916, %r7917, %r7918, %r7919; + // end inline asm + mov.u32 %r7927, 10; + // begin inline asm + shf.l.wrap.b32 %r7920, %r7926, %r7925, %r7927; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7924, %r7925, %r7926, %r7927; + // end inline asm + mov.u32 %r7935, 6; + // begin inline asm + shf.l.wrap.b32 %r7928, %r7934, %r7933, %r7935; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7932, %r7933, %r7934, %r7935; + // end inline asm + mov.u32 %r7943, 3; + // begin inline asm + shf.l.wrap.b32 %r7936, %r7942, %r7941, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7940, %r7941, %r7942, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7944, %r7950, %r7949, %r7619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r7948, %r7949, %r7950, %r7619; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r7952, %r7987, %r7760, %r7808, 0xD2; + lop3.b32 %r7953, %r7990, %r7764, %r7812, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29919, %r7760, %r7808, %r7904, 0xD2; + lop3.b32 %r29920, %r7764, %r7812, %r7908, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29915, %r7808, %r7904, %r7856, 0xD2; + lop3.b32 %r29916, %r7812, %r7908, %r7860, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29911, %r7904, %r7856, %r7987, 0xD2; + lop3.b32 %r29912, %r7908, %r7860, %r7990, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29909, %r7856, %r7987, %r7760, 0xD2; + lop3.b32 %r29910, %r7860, %r7990, %r7764, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29905, %r7896, %r7768, %r7936, 0xD2; + lop3.b32 %r29906, %r7900, %r7772, %r7940, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29917, %r7768, %r7936, %r7880, 0xD2; + lop3.b32 %r29918, %r7772, %r7940, %r7884, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29913, %r7936, %r7880, %r7776, 0xD2; + lop3.b32 %r29914, %r7940, %r7884, %r7780, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29885, %r7880, %r7776, %r7896, 0xD2; + lop3.b32 %r29886, %r7884, %r7780, %r7900, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r29885, %r29886}; + // begin inline asm + // chi + 
lop3.b32 %r29877, %r7776, %r7896, %r7768, 0xD2; + lop3.b32 %r29878, %r7780, %r7900, %r7772, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r29877, %r29878}; + // begin inline asm + // chi + lop3.b32 %r29903, %r7944, %r7928, %r7816, 0xD2; + lop3.b32 %r29904, %r7948, %r7932, %r7820, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+104], {%r29903, %r29904}; + // begin inline asm + // chi + lop3.b32 %r29897, %r7928, %r7816, %r7824, 0xD2; + lop3.b32 %r29898, %r7932, %r7820, %r7828, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+112], {%r29897, %r29898}; + // begin inline asm + // chi + lop3.b32 %r29891, %r7816, %r7824, %r7792, 0xD2; + lop3.b32 %r29892, %r7820, %r7828, %r7796, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+120], {%r29891, %r29892}; + // begin inline asm + // chi + lop3.b32 %r29883, %r7824, %r7792, %r7944, 0xD2; + lop3.b32 %r29884, %r7828, %r7796, %r7948, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+128], {%r29883, %r29884}; + // begin inline asm + // chi + lop3.b32 %r29875, %r7792, %r7944, %r7928, 0xD2; + lop3.b32 %r29876, %r7796, %r7948, %r7932, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+136], {%r29875, %r29876}; + // begin inline asm + // chi + lop3.b32 %r29901, %r7848, %r7888, %r7920, 0xD2; + lop3.b32 %r29902, %r7852, %r7892, %r7924, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+144], {%r29901, %r29902}; + // begin inline asm + // chi + lop3.b32 %r29895, %r7888, %r7920, %r7912, 0xD2; + lop3.b32 %r29896, %r7892, %r7924, %r7916, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+152], {%r29895, %r29896}; + // begin inline asm + // chi + lop3.b32 %r29889, %r7920, %r7912, %r7832, 0xD2; + lop3.b32 %r29890, %r7924, %r7916, %r7836, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+160], {%r29889, %r29890}; + // begin inline asm + // chi + lop3.b32 %r29881, %r7912, %r7832, %r7848, 0xD2; + lop3.b32 %r29882, %r7916, %r7836, %r7852, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+168], {%r29881, %r29882}; + // begin inline asm + // chi + lop3.b32 %r29873, %r7832, %r7848, %r7888, 0xD2; + lop3.b32 %r29874, %r7836, %r7852, %r7892, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+176], {%r29873, %r29874}; + // begin inline asm + // chi + lop3.b32 %r29899, %r7800, %r7872, %r7784, 0xD2; + lop3.b32 %r29900, %r7804, %r7876, %r7788, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+184], {%r29899, %r29900}; + // begin inline asm + // chi + lop3.b32 %r29893, %r7872, %r7784, %r7840, 0xD2; + lop3.b32 %r29894, %r7876, %r7788, %r7844, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+192], {%r29893, %r29894}; + // begin inline asm + // chi + lop3.b32 %r29887, %r7784, %r7840, %r7864, 0xD2; + lop3.b32 %r29888, %r7788, %r7844, %r7868, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+200], {%r29887, %r29888}; + // begin inline asm + // chi + lop3.b32 %r29879, %r7840, %r7864, %r7800, 0xD2; + lop3.b32 %r29880, %r7844, %r7868, %r7804, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+208], {%r29879, %r29880}; + // begin inline asm + // chi + lop3.b32 %r29871, %r7864, %r7800, %r7872, 0xD2; + lop3.b32 %r29872, %r7868, %r7804, %r7876, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+216], {%r29871, %r29872}; + mul.wide.s32 %rd501, %r29921, 8; + add.s64 %rd500, %rd480, %rd501; + // begin inline asm + ld.global.nc.v2.u32 {%r8152,%r8153}, [%rd500]; + // end inline asm + xor.b32 %r29907, %r7952, %r8152; + xor.b32 %r29908, %r7953, %r8153; + add.s32 %r29921, %r29921, 1; + setp.lt.u32 %p15, %r29921, 23; + @%p15 bra $L__BB2_17; + + mov.u32 %r29954, 0; + mov.u32 
%r8263, 1; + st.local.v2.u32 [%rd53+32], {%r29919, %r29920}; + st.local.v2.u32 [%rd53+72], {%r29917, %r29918}; + st.local.v2.u32 [%rd53+40], {%r29915, %r29916}; + st.local.v2.u32 [%rd53+80], {%r29913, %r29914}; + st.local.v2.u32 [%rd53+48], {%r29911, %r29912}; + st.local.v2.u32 [%rd53+56], {%r29909, %r29910}; + st.local.v2.u32 [%rd53+24], {%r29907, %r29908}; + // begin inline asm + // xor5 + lop3.b32 %r8164, %r29907, %r29905, %r29903, 0x96; + lop3.b32 %r8164, %r8164, %r29901, %r29899, 0x96; + lop3.b32 %r8165, %r29908, %r29906, %r29904, 0x96; + lop3.b32 %r8165, %r8165, %r29902, %r29900, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8176, %r29919, %r29917, %r29897, 0x96; + lop3.b32 %r8176, %r8176, %r29895, %r29893, 0x96; + lop3.b32 %r8177, %r29920, %r29918, %r29898, 0x96; + lop3.b32 %r8177, %r8177, %r29896, %r29894, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8188, %r29915, %r29913, %r29891, 0x96; + lop3.b32 %r8188, %r8188, %r29889, %r29887, 0x96; + lop3.b32 %r8189, %r29916, %r29914, %r29892, 0x96; + lop3.b32 %r8189, %r8189, %r29890, %r29888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8200, %r29911, %r29885, %r29883, 0x96; + lop3.b32 %r8200, %r8200, %r29881, %r29879, 0x96; + lop3.b32 %r8201, %r29912, %r29886, %r29884, 0x96; + lop3.b32 %r8201, %r8201, %r29882, %r29880, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8212, %r29909, %r29877, %r29875, 0x96; + lop3.b32 %r8212, %r8212, %r29873, %r29871, 0x96; + lop3.b32 %r8213, %r29910, %r29878, %r29876, 0x96; + lop3.b32 %r8213, %r8213, %r29874, %r29872, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8224, %r8177, %r8176, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8228, %r8176, %r8177, %r8263; + // end inline asm + xor.b32 %r8403, %r8224, %r8212; + xor.b32 %r8404, %r8228, %r8213; + xor.b32 %r8371, %r29907, %r8403; + xor.b32 %r8374, %r29908, %r8404; + xor.b32 %r8334, %r29904, %r8404; + xor.b32 %r8333, %r29903, %r8403; + st.local.v2.u32 [%rd53+104], {%r8333, %r8334}; + // begin inline asm + shf.l.wrap.b32 %r8232, %r8189, %r8188, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8236, %r8188, %r8189, %r8263; + // end inline asm + xor.b32 %r8405, %r8232, %r8164; + xor.b32 %r8406, %r8236, %r8165; + xor.b32 %r8270, %r29917, %r8405; + xor.b32 %r8269, %r29918, %r8406; + xor.b32 %r8309, %r29896, %r8406; + xor.b32 %r8310, %r29895, %r8405; + st.local.v2.u32 [%rd53+152], {%r8310, %r8309}; + // begin inline asm + shf.l.wrap.b32 %r8240, %r8201, %r8200, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8244, %r8200, %r8201, %r8263; + // end inline asm + xor.b32 %r8407, %r8240, %r8176; + xor.b32 %r8408, %r8244, %r8177; + xor.b32 %r8293, %r29892, %r8408; + xor.b32 %r8294, %r29891, %r8407; + st.local.v2.u32 [%rd53+120], {%r8294, %r8293}; + xor.b32 %r8285, %r29888, %r8408; + xor.b32 %r8286, %r29887, %r8407; + st.local.v2.u32 [%rd53+200], {%r8286, %r8285}; + // begin inline asm + shf.l.wrap.b32 %r8248, %r8213, %r8212, %r8263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8252, %r8212, %r8213, %r8263; + // end inline asm + xor.b32 %r8409, %r8248, %r8188; + xor.b32 %r8410, %r8252, %r8189; + xor.b32 %r8317, %r29911, %r8409; + xor.b32 %r8318, %r29912, %r8410; + xor.b32 %r8326, %r29882, %r8410; + xor.b32 %r8325, %r29881, %r8409; + st.local.v2.u32 [%rd53+168], {%r8325, %r8326}; + // begin inline asm + shf.l.wrap.b32 %r8256, %r8165, %r8164, %r8263; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r8260, %r8164, %r8165, %r8263; + // end inline asm + xor.b32 %r8411, %r8256, %r8200; + xor.b32 %r8412, %r8260, %r8201; + xor.b32 %r8277, %r29877, %r8411; + xor.b32 %r8278, %r29878, %r8412; + xor.b32 %r8302, %r29872, %r8412; + xor.b32 %r8301, %r29871, %r8411; + st.local.v2.u32 [%rd53+216], {%r8301, %r8302}; + // begin inline asm + shf.l.wrap.b32 %r8264, %r8270, %r8269, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8268, %r8269, %r8270, %r7767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8272, %r8278, %r8277, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8276, %r8277, %r8278, %r7775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8284, %r8285, %r8286, %r7783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8280, %r8286, %r8285, %r7783; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r8280, %r8284}; + // begin inline asm + shf.l.wrap.b32 %r8288, %r8294, %r8293, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8292, %r8293, %r8294, %r7815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8296, %r8302, %r8301, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8300, %r8301, %r8302, %r7863; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8308, %r8309, %r8310, %r7887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8304, %r8310, %r8309, %r7887; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r8304, %r8308}; + // begin inline asm + shf.l.wrap.b32 %r8312, %r8318, %r8317, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8316, %r8317, %r8318, %r7903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8320, %r8326, %r8325, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8324, %r8325, %r8326, %r7911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8328, %r8334, %r8333, %r7943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8332, %r8333, %r8334, %r7943; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r8336, %r8371, %r8264, %r8288, 0xD2; + lop3.b32 %r8337, %r8374, %r8268, %r8292, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30054, %r8264, %r8288, %r8320, 0xD2; + lop3.b32 %r30055, %r8268, %r8292, %r8324, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+32], {%r30054, %r30055}; + // begin inline asm + // chi + lop3.b32 %r30050, %r8288, %r8320, %r8296, 0xD2; + lop3.b32 %r30051, %r8292, %r8324, %r8300, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+40], {%r30050, %r30051}; + // begin inline asm + // chi + lop3.b32 %r30046, %r8320, %r8296, %r8371, 0xD2; + lop3.b32 %r30047, %r8324, %r8300, %r8374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+48], {%r30046, %r30047}; + // begin inline asm + // chi + lop3.b32 %r30044, %r8296, %r8371, %r8264, 0xD2; + lop3.b32 %r30045, %r8300, %r8374, %r8268, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+56], {%r30044, %r30045}; + // begin inline asm + // chi + lop3.b32 %r30040, %r8312, %r8272, %r8328, 0xD2; + lop3.b32 %r30041, %r8316, %r8276, %r8332, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+64], {%r30040, %r30041}; + // begin inline asm + // chi + lop3.b32 %r30052, %r8272, %r8328, %r8304, 0xD2; + lop3.b32 %r30053, %r8276, %r8332, %r8308, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+72], {%r30052, %r30053}; + // begin inline asm + // chi + lop3.b32 %r30048, %r8328, %r8304, %r8280, 0xD2; + lop3.b32 %r30049, %r8332, %r8308, 
%r8284, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+80], {%r30048, %r30049}; + // begin inline asm + ld.global.nc.v2.u32 {%r8400,%r8401}, [%rd481]; + // end inline asm + xor.b32 %r30042, %r8336, %r8400; + xor.b32 %r30043, %r8337, %r8401; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + add.s64 %rd55, %rd53, 24; + add.s64 %rd56, %rd2, 24; + +$L__BB2_19: + cvta.to.global.u64 %rd1270, %rd361; + shl.b32 %r8413, %r29954, 2; + cvt.u64.u32 %rd511, %r8413; + and.b64 %rd512, %rd511, 60; + add.s64 %rd513, %rd56, %rd512; + xor.b32 %r8414, %r47, %r29954; + mul.lo.s32 %r8415, %r8414, 16777619; + ld.local.u32 %r8416, [%rd513]; + xor.b32 %r8417, %r8415, %r8416; + mul.wide.u32 %rd514, %r8417, -954391867; + shr.u64 %rd515, %rd514, 32; + cvt.u32.u64 %r8418, %rd515; + sub.s32 %r8419, %r8417, %r8418; + shr.u32 %r8420, %r8419, 1; + add.s32 %r8421, %r8420, %r8418; + shr.u32 %r8422, %r8421, 20; + mul.lo.s32 %r8423, %r8422, 1179641; + sub.s32 %r8424, %r8417, %r8423; + mul.wide.u32 %rd516, %r8424, 64; + add.s64 %rd517, %rd1270, %rd516; + mul.lo.s32 %r8425, %r29991, 16777619; + ld.global.u32 %r8426, [%rd517]; + xor.b32 %r29991, %r8425, %r8426; + mul.lo.s32 %r8427, %r29992, 16777619; + ld.global.u32 %r8428, [%rd517+4]; + xor.b32 %r29992, %r8427, %r8428; + mul.lo.s32 %r8429, %r30003, 16777619; + ld.global.u32 %r8430, [%rd517+8]; + mul.lo.s32 %r8431, %r30004, 16777619; + ld.global.u32 %r8432, [%rd517+12]; + xor.b32 %r8433, %r8431, %r8432; + xor.b32 %r30003, %r8429, %r8430; + mov.b64 %rd518, {%r30003, %r8433}; + mul.lo.s32 %r8434, %r29999, 16777619; + ld.global.u32 %r8435, [%rd517+16]; + mul.lo.s32 %r8436, %r30000, 16777619; + ld.global.u32 %r8437, [%rd517+20]; + xor.b32 %r8438, %r8436, %r8437; + xor.b32 %r29999, %r8434, %r8435; + mov.b64 %rd519, {%r29999, %r8438}; + mul.lo.s32 %r8439, %r29995, 16777619; + ld.global.u32 %r8440, [%rd517+24]; + mul.lo.s32 %r8441, %r29996, 16777619; + ld.global.u32 %r8442, [%rd517+28]; + xor.b32 %r8443, %r8441, %r8442; + xor.b32 %r29995, %r8439, %r8440; + mov.b64 %rd520, {%r29995, %r8443}; + mul.lo.s32 %r8444, %r29993, 16777619; + ld.global.u32 %r8445, [%rd517+32]; + mul.lo.s32 %r8446, %r29994, 16777619; + ld.global.u32 %r8447, [%rd517+36]; + xor.b32 %r8448, %r8446, %r8447; + xor.b32 %r29993, %r8444, %r8445; + mov.b64 %rd521, {%r29993, %r8448}; + mul.lo.s32 %r8449, %r29989, 16777619; + ld.global.u32 %r8450, [%rd517+40]; + xor.b32 %r29989, %r8449, %r8450; + mul.lo.s32 %r8451, %r29990, 16777619; + ld.global.u32 %r8452, [%rd517+44]; + xor.b32 %r29990, %r8451, %r8452; + mul.lo.s32 %r8453, %r30001, 16777619; + ld.global.u32 %r8454, [%rd517+48]; + mul.lo.s32 %r8455, %r30002, 16777619; + ld.global.u32 %r8456, [%rd517+52]; + xor.b32 %r8457, %r8455, %r8456; + xor.b32 %r30001, %r8453, %r8454; + mov.b64 %rd522, {%r30001, %r8457}; + mul.lo.s32 %r8458, %r29997, 16777619; + ld.global.u32 %r8459, [%rd517+56]; + mul.lo.s32 %r8460, %r29998, 16777619; + ld.global.u32 %r8461, [%rd517+60]; + xor.b32 %r8462, %r8460, %r8461; + xor.b32 %r29997, %r8458, %r8459; + mov.b64 %rd523, {%r29997, %r8462}; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + st.local.v2.u32 [%rd2+32], {%r30003, %r8433}; + st.local.v2.u32 [%rd2+40], {%r29999, %r8438}; + st.local.v2.u32 [%rd2+48], {%r29995, %r8443}; + st.local.v2.u32 [%rd2+56], {%r29993, %r8448}; + st.local.v2.u32 [%rd2+64], {%r29989, %r29990}; + st.local.v2.u32 [%rd2+72], {%r30001, %r8457}; + st.local.v2.u32 [%rd2+80], {%r29997, %r8462}; + add.s64 %rd524, %rd55, %rd512; + xor.b32 %r8463, %r243, %r29954; + mul.lo.s32 %r8464, %r8463, 16777619; + ld.local.u32 %r8465, 
[%rd524]; + xor.b32 %r8466, %r8464, %r8465; + mul.wide.u32 %rd525, %r8466, -954391867; + shr.u64 %rd526, %rd525, 32; + cvt.u32.u64 %r8467, %rd526; + sub.s32 %r8468, %r8466, %r8467; + shr.u32 %r8469, %r8468, 1; + add.s32 %r8470, %r8469, %r8467; + shr.u32 %r8471, %r8470, 20; + mul.lo.s32 %r8472, %r8471, 1179641; + sub.s32 %r8473, %r8466, %r8472; + mul.wide.u32 %rd527, %r8473, 64; + add.s64 %rd528, %rd1270, %rd527; + mul.lo.s32 %r8474, %r30042, 16777619; + ld.global.u32 %r8475, [%rd528]; + xor.b32 %r30042, %r8474, %r8475; + mul.lo.s32 %r8476, %r30043, 16777619; + ld.global.u32 %r8477, [%rd528+4]; + xor.b32 %r30043, %r8476, %r8477; + mul.lo.s32 %r8478, %r30054, 16777619; + ld.global.u32 %r8479, [%rd528+8]; + mul.lo.s32 %r8480, %r30055, 16777619; + ld.global.u32 %r8481, [%rd528+12]; + xor.b32 %r8482, %r8480, %r8481; + xor.b32 %r30054, %r8478, %r8479; + mov.b64 %rd529, {%r30054, %r8482}; + mul.lo.s32 %r8483, %r30050, 16777619; + ld.global.u32 %r8484, [%rd528+16]; + mul.lo.s32 %r8485, %r30051, 16777619; + ld.global.u32 %r8486, [%rd528+20]; + xor.b32 %r8487, %r8485, %r8486; + xor.b32 %r30050, %r8483, %r8484; + mov.b64 %rd530, {%r30050, %r8487}; + mul.lo.s32 %r8488, %r30046, 16777619; + ld.global.u32 %r8489, [%rd528+24]; + mul.lo.s32 %r8490, %r30047, 16777619; + ld.global.u32 %r8491, [%rd528+28]; + xor.b32 %r8492, %r8490, %r8491; + xor.b32 %r30046, %r8488, %r8489; + mov.b64 %rd531, {%r30046, %r8492}; + mul.lo.s32 %r8493, %r30044, 16777619; + ld.global.u32 %r8494, [%rd528+32]; + mul.lo.s32 %r8495, %r30045, 16777619; + ld.global.u32 %r8496, [%rd528+36]; + xor.b32 %r8497, %r8495, %r8496; + xor.b32 %r30044, %r8493, %r8494; + mov.b64 %rd532, {%r30044, %r8497}; + mul.lo.s32 %r8498, %r30040, 16777619; + ld.global.u32 %r8499, [%rd528+40]; + xor.b32 %r30040, %r8498, %r8499; + mul.lo.s32 %r8500, %r30041, 16777619; + ld.global.u32 %r8501, [%rd528+44]; + xor.b32 %r30041, %r8500, %r8501; + mul.lo.s32 %r8502, %r30052, 16777619; + ld.global.u32 %r8503, [%rd528+48]; + mul.lo.s32 %r8504, %r30053, 16777619; + ld.global.u32 %r8505, [%rd528+52]; + xor.b32 %r8506, %r8504, %r8505; + xor.b32 %r30052, %r8502, %r8503; + mov.b64 %rd533, {%r30052, %r8506}; + mul.lo.s32 %r8507, %r30048, 16777619; + ld.global.u32 %r8508, [%rd528+56]; + mul.lo.s32 %r8509, %r30049, 16777619; + ld.global.u32 %r8510, [%rd528+60]; + xor.b32 %r8511, %r8509, %r8510; + xor.b32 %r30048, %r8507, %r8508; + mov.b64 %rd534, {%r30048, %r8511}; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + st.local.v2.u32 [%rd53+32], {%r30054, %r8482}; + st.local.v2.u32 [%rd53+40], {%r30050, %r8487}; + st.local.v2.u32 [%rd53+48], {%r30046, %r8492}; + st.local.v2.u32 [%rd53+56], {%r30044, %r8497}; + st.local.v2.u32 [%rd53+64], {%r30040, %r30041}; + st.local.v2.u32 [%rd53+72], {%r30052, %r8506}; + st.local.v2.u32 [%rd53+80], {%r30048, %r8511}; + add.s32 %r29954, %r29954, 1; + setp.lt.u32 %p16, %r29954, 512; + shr.u64 %rd535, %rd518, 32; + cvt.u32.u64 %r30004, %rd535; + shr.u64 %rd536, %rd519, 32; + cvt.u32.u64 %r30000, %rd536; + shr.u64 %rd537, %rd520, 32; + cvt.u32.u64 %r29996, %rd537; + shr.u64 %rd538, %rd521, 32; + cvt.u32.u64 %r29994, %rd538; + shr.u64 %rd539, %rd522, 32; + cvt.u32.u64 %r30002, %rd539; + shr.u64 %rd540, %rd523, 32; + cvt.u32.u64 %r29998, %rd540; + shr.u64 %rd541, %rd529, 32; + cvt.u32.u64 %r30055, %rd541; + shr.u64 %rd542, %rd530, 32; + cvt.u32.u64 %r30051, %rd542; + shr.u64 %rd543, %rd531, 32; + cvt.u32.u64 %r30047, %rd543; + shr.u64 %rd544, %rd532, 32; + cvt.u32.u64 %r30045, %rd544; + shr.u64 %rd545, %rd533, 32; + cvt.u32.u64 %r30053, %rd545; + 
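+ // $L__BB2_19 mixes both sponge states against the dataset for 512
+ // iterations: each lane word is multiplied by the 32-bit FNV prime 16777619
+ // and xor-ed with a word of the 64-byte item whose index was reduced
+ // mod 1179641 above. The mov.b64 packs exist only so the loop-carried high
+ // halves can be re-extracted here before the back-edge.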
shr.u64 %rd546, %rd534, 32; + cvt.u32.u64 %r30049, %rd546; + @%p16 bra $L__BB2_19; + + mov.u32 %r29955, 0; + st.local.v2.u32 [%rd2+96], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+104], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+112], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+120], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+128], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+136], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+144], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+152], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+160], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+168], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+176], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+184], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+192], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+200], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+208], {%r29955, %r29955}; + st.local.v2.u32 [%rd2+216], {%r29955, %r29955}; + mov.u32 %r29970, -2147483648; + mov.u32 %r8526, 1; + st.local.v2.u32 [%rd2+88], {%r8526, %r29970}; + mov.u32 %r29956, %r29955; + mov.u32 %r29957, %r29955; + mov.u32 %r29958, %r29955; + mov.u32 %r29959, %r29955; + mov.u32 %r29960, %r29955; + mov.u32 %r29961, %r29955; + mov.u32 %r29962, %r29955; + mov.u32 %r29963, %r29955; + mov.u32 %r29964, %r29955; + mov.u32 %r29965, %r29955; + mov.u32 %r29966, %r29955; + mov.u32 %r29967, %r29955; + mov.u32 %r29968, %r29955; + mov.u32 %r29969, %r8526; + mov.u32 %r29971, %r29955; + mov.u32 %r29972, %r29955; + mov.u32 %r29973, %r29955; + mov.u32 %r29974, %r29955; + mov.u32 %r29975, %r29955; + mov.u32 %r29976, %r29955; + mov.u32 %r29977, %r29955; + mov.u32 %r29978, %r29955; + mov.u32 %r29979, %r29955; + mov.u32 %r29980, %r29955; + mov.u32 %r29981, %r29955; + mov.u32 %r29982, %r29955; + mov.u32 %r29983, %r29955; + mov.u32 %r29984, %r29955; + mov.u32 %r29985, %r29955; + mov.u32 %r29986, %r29955; + mov.u32 %r29987, %r29955; + mov.u32 %r29988, %r29955; + mov.u32 %r30005, %r29955; + +$L__BB2_21: + // begin inline asm + // xor5 + lop3.b32 %r8553, %r29991, %r29989, %r29987, 0x96; + lop3.b32 %r8553, %r8553, %r29985, %r29983, 0x96; + lop3.b32 %r8554, %r29992, %r29990, %r29988, 0x96; + lop3.b32 %r8554, %r8554, %r29986, %r29984, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8565, %r30003, %r30001, %r29981, 0x96; + lop3.b32 %r8565, %r8565, %r29979, %r29977, 0x96; + lop3.b32 %r8566, %r30004, %r30002, %r29982, 0x96; + lop3.b32 %r8566, %r8566, %r29980, %r29978, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8577, %r29999, %r29997, %r29975, 0x96; + lop3.b32 %r8577, %r8577, %r29973, %r29971, 0x96; + lop3.b32 %r8578, %r30000, %r29998, %r29976, 0x96; + lop3.b32 %r8578, %r8578, %r29974, %r29972, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8589, %r29995, %r29969, %r29967, 0x96; + lop3.b32 %r8589, %r8589, %r29965, %r29963, 0x96; + lop3.b32 %r8590, %r29996, %r29970, %r29968, 0x96; + lop3.b32 %r8590, %r8590, %r29966, %r29964, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8601, %r29993, %r29961, %r29959, 0x96; + lop3.b32 %r8601, %r8601, %r29957, %r29955, 0x96; + lop3.b32 %r8602, %r29994, %r29962, %r29960, 0x96; + lop3.b32 %r8602, %r8602, %r29958, %r29956, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8613, %r8566, %r8565, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8617, %r8565, %r8566, %r8526; + // end inline asm + xor.b32 %r9047, %r8613, %r8601; + xor.b32 %r9048, %r8617, %r8602; + xor.b32 %r8880, %r29991, %r9047; + xor.b32 %r8883, %r29992, %r9048; 
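+ // theta: xor the rotated column parity (%r9047/%r9048) into all five lanes
+ // of the column, one 32-bit pair per 64-bit lane.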
+ xor.b32 %r8787, %r29989, %r9047; + xor.b32 %r8786, %r29990, %r9048; + xor.b32 %r8834, %r29987, %r9047; + xor.b32 %r8835, %r29988, %r9048; + xor.b32 %r8739, %r29985, %r9047; + xor.b32 %r8738, %r29986, %r9048; + xor.b32 %r8690, %r29983, %r9047; + xor.b32 %r8691, %r29984, %r9048; + // begin inline asm + shf.l.wrap.b32 %r8621, %r8578, %r8577, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8625, %r8577, %r8578, %r8526; + // end inline asm + xor.b32 %r9049, %r8621, %r8553; + xor.b32 %r9050, %r8625, %r8554; + xor.b32 %r8842, %r30003, %r9049; + xor.b32 %r8843, %r30004, %r9050; + xor.b32 %r8659, %r30001, %r9049; + xor.b32 %r8658, %r30002, %r9050; + xor.b32 %r8818, %r29981, %r9049; + xor.b32 %r8819, %r29982, %r9050; + xor.b32 %r8779, %r29979, %r9049; + xor.b32 %r8778, %r29980, %r9050; + xor.b32 %r8762, %r29977, %r9049; + xor.b32 %r8763, %r29978, %r9050; + // begin inline asm + shf.l.wrap.b32 %r8629, %r8590, %r8589, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8633, %r8589, %r8590, %r8526; + // end inline asm + xor.b32 %r9051, %r8629, %r8565; + xor.b32 %r9052, %r8633, %r8566; + xor.b32 %r8699, %r29999, %r9051; + xor.b32 %r8698, %r30000, %r9052; + xor.b32 %r8826, %r29997, %r9051; + xor.b32 %r8827, %r29998, %r9052; + xor.b32 %r8707, %r29975, %r9051; + xor.b32 %r8706, %r29976, %r9052; + xor.b32 %r8810, %r29973, %r9051; + xor.b32 %r8811, %r29974, %r9052; + xor.b32 %r8675, %r29971, %r9051; + xor.b32 %r8674, %r29972, %r9052; + // begin inline asm + shf.l.wrap.b32 %r8637, %r8602, %r8601, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8641, %r8601, %r8602, %r8526; + // end inline asm + xor.b32 %r9053, %r8637, %r8577; + xor.b32 %r9054, %r8641, %r8578; + xor.b32 %r8794, %r29995, %r9053; + xor.b32 %r8795, %r29996, %r9054; + xor.b32 %r8771, %r29969, %r9053; + xor.b32 %r8770, %r29970, %r9054; + xor.b32 %r8714, %r29967, %r9053; + xor.b32 %r8715, %r29968, %r9054; + xor.b32 %r8802, %r29965, %r9053; + xor.b32 %r8803, %r29966, %r9054; + xor.b32 %r8731, %r29963, %r9053; + xor.b32 %r8730, %r29964, %r9054; + // begin inline asm + shf.l.wrap.b32 %r8645, %r8554, %r8553, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8649, %r8553, %r8554, %r8526; + // end inline asm + xor.b32 %r9055, %r8645, %r8589; + xor.b32 %r9056, %r8649, %r8590; + xor.b32 %r8746, %r29993, %r9055; + xor.b32 %r8747, %r29994, %r9056; + xor.b32 %r8666, %r29961, %r9055; + xor.b32 %r8667, %r29962, %r9056; + xor.b32 %r8683, %r29959, %r9055; + xor.b32 %r8682, %r29960, %r9056; + xor.b32 %r8722, %r29957, %r9055; + xor.b32 %r8723, %r29958, %r9056; + xor.b32 %r8754, %r29955, %r9055; + xor.b32 %r8755, %r29956, %r9056; + mov.u32 %r8660, 44; + // begin inline asm + shf.l.wrap.b32 %r8653, %r8659, %r8658, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8657, %r8658, %r8659, %r8660; + // end inline asm + mov.u32 %r8668, 20; + // begin inline asm + shf.l.wrap.b32 %r8661, %r8667, %r8666, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8665, %r8666, %r8667, %r8668; + // end inline asm + mov.u32 %r8676, 61; + // begin inline asm + shf.l.wrap.b32 %r8669, %r8675, %r8674, %r8676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8673, %r8674, %r8675, %r8676; + // end inline asm + mov.u32 %r8684, 39; + // begin inline asm + shf.l.wrap.b32 %r8677, %r8683, %r8682, %r8684; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8681, %r8682, %r8683, %r8684; + // end inline asm + mov.u32 %r8692, 18; + // begin inline asm + 
shf.l.wrap.b32 %r8685, %r8691, %r8690, %r8692; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8689, %r8690, %r8691, %r8692; + // end inline asm + mov.u32 %r8700, 62; + // begin inline asm + shf.l.wrap.b32 %r8693, %r8699, %r8698, %r8700; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8697, %r8698, %r8699, %r8700; + // end inline asm + mov.u32 %r8708, 43; + // begin inline asm + shf.l.wrap.b32 %r8701, %r8707, %r8706, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8705, %r8706, %r8707, %r8708; + // end inline asm + mov.u32 %r8716, 25; + // begin inline asm + shf.l.wrap.b32 %r8709, %r8715, %r8714, %r8716; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8713, %r8714, %r8715, %r8716; + // end inline asm + mov.u32 %r8724, 8; + // begin inline asm + shf.l.wrap.b32 %r8717, %r8723, %r8722, %r8724; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8721, %r8722, %r8723, %r8724; + // end inline asm + mov.u32 %r8732, 56; + // begin inline asm + shf.l.wrap.b32 %r8725, %r8731, %r8730, %r8732; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8729, %r8730, %r8731, %r8732; + // end inline asm + mov.u32 %r8740, 41; + // begin inline asm + shf.l.wrap.b32 %r8733, %r8739, %r8738, %r8740; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8737, %r8738, %r8739, %r8740; + // end inline asm + mov.u32 %r8748, 27; + // begin inline asm + shf.l.wrap.b32 %r8741, %r8747, %r8746, %r8748; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8745, %r8746, %r8747, %r8748; + // end inline asm + mov.u32 %r8756, 14; + // begin inline asm + shf.l.wrap.b32 %r8749, %r8755, %r8754, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8753, %r8754, %r8755, %r8756; + // end inline asm + mov.u32 %r8764, 2; + // begin inline asm + shf.l.wrap.b32 %r8757, %r8763, %r8762, %r8764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8761, %r8762, %r8763, %r8764; + // end inline asm + mov.u32 %r8772, 55; + // begin inline asm + shf.l.wrap.b32 %r8765, %r8771, %r8770, %r8772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8769, %r8770, %r8771, %r8772; + // end inline asm + mov.u32 %r8780, 45; + // begin inline asm + shf.l.wrap.b32 %r8773, %r8779, %r8778, %r8780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8777, %r8778, %r8779, %r8780; + // end inline asm + mov.u32 %r8788, 36; + // begin inline asm + shf.l.wrap.b32 %r8781, %r8787, %r8786, %r8788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8785, %r8786, %r8787, %r8788; + // end inline asm + mov.u32 %r8796, 28; + // begin inline asm + shf.l.wrap.b32 %r8789, %r8795, %r8794, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8793, %r8794, %r8795, %r8796; + // end inline asm + mov.u32 %r8804, 21; + // begin inline asm + shf.l.wrap.b32 %r8797, %r8803, %r8802, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8801, %r8802, %r8803, %r8804; + // end inline asm + mov.u32 %r8812, 15; + // begin inline asm + shf.l.wrap.b32 %r8805, %r8811, %r8810, %r8812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8809, %r8810, %r8811, %r8812; + // end inline asm + mov.u32 %r8820, 10; + // begin inline asm + shf.l.wrap.b32 %r8813, %r8819, %r8818, %r8820; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8817, %r8818, %r8819, %r8820; + // end inline asm + mov.u32 %r8828, 6; + // begin inline asm + shf.l.wrap.b32 %r8821, %r8827, %r8826, %r8828; + // end inline asm + // begin inline asm 
+ shf.l.wrap.b32 %r8825, %r8826, %r8827, %r8828; + // end inline asm + mov.u32 %r8836, 3; + // begin inline asm + shf.l.wrap.b32 %r8829, %r8835, %r8834, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8833, %r8834, %r8835, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8837, %r8843, %r8842, %r8526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r8841, %r8842, %r8843, %r8526; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r8845, %r8880, %r8653, %r8701, 0xD2; + lop3.b32 %r8846, %r8883, %r8657, %r8705, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30003, %r8653, %r8701, %r8797, 0xD2; + lop3.b32 %r30004, %r8657, %r8705, %r8801, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29999, %r8701, %r8797, %r8749, 0xD2; + lop3.b32 %r30000, %r8705, %r8801, %r8753, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29995, %r8797, %r8749, %r8880, 0xD2; + lop3.b32 %r29996, %r8801, %r8753, %r8883, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29993, %r8749, %r8880, %r8653, 0xD2; + lop3.b32 %r29994, %r8753, %r8883, %r8657, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29989, %r8789, %r8661, %r8829, 0xD2; + lop3.b32 %r29990, %r8793, %r8665, %r8833, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30001, %r8661, %r8829, %r8773, 0xD2; + lop3.b32 %r30002, %r8665, %r8833, %r8777, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29997, %r8829, %r8773, %r8669, 0xD2; + lop3.b32 %r29998, %r8833, %r8777, %r8673, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r29969, %r8773, %r8669, %r8789, 0xD2; + lop3.b32 %r29970, %r8777, %r8673, %r8793, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r29969, %r29970}; + // begin inline asm + // chi + lop3.b32 %r29961, %r8669, %r8789, %r8661, 0xD2; + lop3.b32 %r29962, %r8673, %r8793, %r8665, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r29961, %r29962}; + // begin inline asm + // chi + lop3.b32 %r29987, %r8837, %r8821, %r8709, 0xD2; + lop3.b32 %r29988, %r8841, %r8825, %r8713, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r29987, %r29988}; + // begin inline asm + // chi + lop3.b32 %r29981, %r8821, %r8709, %r8717, 0xD2; + lop3.b32 %r29982, %r8825, %r8713, %r8721, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r29981, %r29982}; + // begin inline asm + // chi + lop3.b32 %r29975, %r8709, %r8717, %r8685, 0xD2; + lop3.b32 %r29976, %r8713, %r8721, %r8689, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r29975, %r29976}; + // begin inline asm + // chi + lop3.b32 %r29967, %r8717, %r8685, %r8837, 0xD2; + lop3.b32 %r29968, %r8721, %r8689, %r8841, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r29967, %r29968}; + // begin inline asm + // chi + lop3.b32 %r29959, %r8685, %r8837, %r8821, 0xD2; + lop3.b32 %r29960, %r8689, %r8841, %r8825, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r29959, %r29960}; + // begin inline asm + // chi + lop3.b32 %r29985, %r8741, %r8781, %r8813, 0xD2; + lop3.b32 %r29986, %r8745, %r8785, %r8817, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r29985, %r29986}; + // begin inline asm + // chi + lop3.b32 %r29979, %r8781, %r8813, %r8805, 0xD2; + lop3.b32 %r29980, %r8785, %r8817, %r8809, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r29979, %r29980}; + // begin inline asm + // chi + lop3.b32 %r29973, %r8813, %r8805, %r8725, 0xD2; + 
lop3.b32 %r29974, %r8817, %r8809, %r8729, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r29973, %r29974}; + // begin inline asm + // chi + lop3.b32 %r29965, %r8805, %r8725, %r8741, 0xD2; + lop3.b32 %r29966, %r8809, %r8729, %r8745, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r29965, %r29966}; + // begin inline asm + // chi + lop3.b32 %r29957, %r8725, %r8741, %r8781, 0xD2; + lop3.b32 %r29958, %r8729, %r8745, %r8785, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r29957, %r29958}; + // begin inline asm + // chi + lop3.b32 %r29983, %r8693, %r8765, %r8677, 0xD2; + lop3.b32 %r29984, %r8697, %r8769, %r8681, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r29983, %r29984}; + // begin inline asm + // chi + lop3.b32 %r29977, %r8765, %r8677, %r8733, 0xD2; + lop3.b32 %r29978, %r8769, %r8681, %r8737, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r29977, %r29978}; + // begin inline asm + // chi + lop3.b32 %r29971, %r8677, %r8733, %r8757, 0xD2; + lop3.b32 %r29972, %r8681, %r8737, %r8761, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r29971, %r29972}; + // begin inline asm + // chi + lop3.b32 %r29963, %r8733, %r8757, %r8693, 0xD2; + lop3.b32 %r29964, %r8737, %r8761, %r8697, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r29963, %r29964}; + // begin inline asm + // chi + lop3.b32 %r29955, %r8757, %r8693, %r8765, 0xD2; + lop3.b32 %r29956, %r8761, %r8697, %r8769, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r29955, %r29956}; + mul.wide.s32 %rd548, %r30005, 8; + add.s64 %rd547, %rd480, %rd548; + // begin inline asm + ld.global.nc.v2.u32 {%r9045,%r9046}, [%rd547]; + // end inline asm + xor.b32 %r29991, %r8845, %r9045; + xor.b32 %r29992, %r8846, %r9046; + add.s32 %r30005, %r30005, 1; + setp.lt.u32 %p17, %r30005, 23; + @%p17 bra $L__BB2_21; + + st.local.v2.u32 [%rd2+32], {%r30003, %r30004}; + st.local.v2.u32 [%rd2+72], {%r30001, %r30002}; + st.local.v2.u32 [%rd2+40], {%r29999, %r30000}; + st.local.v2.u32 [%rd2+80], {%r29997, %r29998}; + st.local.v2.u32 [%rd2+48], {%r29995, %r29996}; + st.local.v2.u32 [%rd2+56], {%r29993, %r29994}; + st.local.v2.u32 [%rd2+24], {%r29991, %r29992}; + // begin inline asm + // xor5 + lop3.b32 %r9057, %r29991, %r29989, %r29987, 0x96; + lop3.b32 %r9057, %r9057, %r29985, %r29983, 0x96; + lop3.b32 %r9058, %r29992, %r29990, %r29988, 0x96; + lop3.b32 %r9058, %r9058, %r29986, %r29984, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9069, %r30003, %r30001, %r29981, 0x96; + lop3.b32 %r9069, %r9069, %r29979, %r29977, 0x96; + lop3.b32 %r9070, %r30004, %r30002, %r29982, 0x96; + lop3.b32 %r9070, %r9070, %r29980, %r29978, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9081, %r29999, %r29997, %r29975, 0x96; + lop3.b32 %r9081, %r9081, %r29973, %r29971, 0x96; + lop3.b32 %r9082, %r30000, %r29998, %r29976, 0x96; + lop3.b32 %r9082, %r9082, %r29974, %r29972, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9093, %r29995, %r29969, %r29967, 0x96; + lop3.b32 %r9093, %r9093, %r29965, %r29963, 0x96; + lop3.b32 %r9094, %r29996, %r29970, %r29968, 0x96; + lop3.b32 %r9094, %r9094, %r29966, %r29964, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9105, %r29993, %r29961, %r29959, 0x96; + lop3.b32 %r9105, %r9105, %r29957, %r29955, 0x96; + lop3.b32 %r9106, %r29994, %r29962, %r29960, 0x96; + lop3.b32 %r9106, %r9106, %r29958, %r29956, 0x96; + // end inline asm + mov.u32 %r9309, 1; + // begin inline asm + shf.l.wrap.b32 %r9117, %r9070, 
%r9069, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9121, %r9069, %r9070, %r9309; + // end inline asm + xor.b32 %r9336, %r9117, %r9105; + xor.b32 %r9337, %r9121, %r9106; + xor.b32 %r9264, %r29991, %r9336; + xor.b32 %r9267, %r29992, %r9337; + xor.b32 %r9227, %r29988, %r9337; + xor.b32 %r9226, %r29987, %r9336; + st.local.v2.u32 [%rd2+104], {%r9226, %r9227}; + // begin inline asm + shf.l.wrap.b32 %r9125, %r9082, %r9081, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9129, %r9081, %r9082, %r9309; + // end inline asm + xor.b32 %r9338, %r9125, %r9057; + xor.b32 %r9339, %r9129, %r9058; + xor.b32 %r9163, %r30001, %r9338; + xor.b32 %r9162, %r30002, %r9339; + xor.b32 %r9202, %r29980, %r9339; + xor.b32 %r9203, %r29979, %r9338; + st.local.v2.u32 [%rd2+152], {%r9203, %r9202}; + // begin inline asm + shf.l.wrap.b32 %r9133, %r9094, %r9093, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9137, %r9093, %r9094, %r9309; + // end inline asm + xor.b32 %r9340, %r9133, %r9069; + xor.b32 %r9341, %r9137, %r9070; + xor.b32 %r9186, %r29976, %r9341; + xor.b32 %r9187, %r29975, %r9340; + st.local.v2.u32 [%rd2+120], {%r9187, %r9186}; + xor.b32 %r9178, %r29972, %r9341; + xor.b32 %r9179, %r29971, %r9340; + st.local.v2.u32 [%rd2+200], {%r9179, %r9178}; + // begin inline asm + shf.l.wrap.b32 %r9141, %r9106, %r9105, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9145, %r9105, %r9106, %r9309; + // end inline asm + xor.b32 %r9342, %r9141, %r9081; + xor.b32 %r9343, %r9145, %r9082; + xor.b32 %r9210, %r29995, %r9342; + xor.b32 %r9211, %r29996, %r9343; + xor.b32 %r9219, %r29966, %r9343; + xor.b32 %r9218, %r29965, %r9342; + st.local.v2.u32 [%rd2+168], {%r9218, %r9219}; + // begin inline asm + shf.l.wrap.b32 %r9149, %r9058, %r9057, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9153, %r9057, %r9058, %r9309; + // end inline asm + xor.b32 %r9344, %r9149, %r9093; + xor.b32 %r9345, %r9153, %r9094; + xor.b32 %r9170, %r29961, %r9344; + xor.b32 %r9171, %r29962, %r9345; + xor.b32 %r9195, %r29956, %r9345; + xor.b32 %r9194, %r29955, %r9344; + st.local.v2.u32 [%rd2+216], {%r9194, %r9195}; + // begin inline asm + shf.l.wrap.b32 %r9157, %r9163, %r9162, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9161, %r9162, %r9163, %r8660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9165, %r9171, %r9170, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9169, %r9170, %r9171, %r8668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9177, %r9178, %r9179, %r8676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9173, %r9179, %r9178, %r8676; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r9173, %r9177}; + // begin inline asm + shf.l.wrap.b32 %r9181, %r9187, %r9186, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9185, %r9186, %r9187, %r8708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9189, %r9195, %r9194, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9193, %r9194, %r9195, %r8756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9201, %r9202, %r9203, %r8780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9197, %r9203, %r9202, %r8780; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r9197, %r9201}; + // begin inline asm + shf.l.wrap.b32 %r9205, %r9211, %r9210, %r8796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9209, %r9210, %r9211, %r8796; + // end inline asm + 
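+ // In the inline-asm blocks above and below, lop3.b32 with immLut 0x96 is a
+ // three-input XOR (chained over five lanes in the "xor5" theta column
+ // parities), and immLut 0xD2 computes a ^ (~b & c), the Keccak chi step.
+ // Each 64-bit lane is held as a {lo, hi} pair of 32-bit registers, so a rho
+ // rotation by k is emitted as two funnel shifts, roughly:
+ //   shf.l.wrap.b32 hi', lo, hi, k;   // hi' = (hi << k) | (lo >> (32 - k))
+ //   shf.l.wrap.b32 lo', hi, lo, k;   // lo' = (lo << k) | (hi >> (32 - k))
+ // with the two halves swapped when k >= 32 (wrap mode reduces k mod 32).
+ // The rotation amounts used throughout match the Keccak-f[1600] rho offsets.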
// begin inline asm + shf.l.wrap.b32 %r9213, %r9219, %r9218, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9217, %r9218, %r9219, %r8804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9221, %r9227, %r9226, %r8836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9225, %r9226, %r9227, %r8836; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9229, %r9264, %r9157, %r9181, 0xD2; + lop3.b32 %r9230, %r9267, %r9161, %r9185, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9237, %r9157, %r9181, %r9213, 0xD2; + lop3.b32 %r9238, %r9161, %r9185, %r9217, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r9237, %r9238}; + // begin inline asm + // chi + lop3.b32 %r9245, %r9181, %r9213, %r9189, 0xD2; + lop3.b32 %r9246, %r9185, %r9217, %r9193, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r9245, %r9246}; + // begin inline asm + // chi + lop3.b32 %r9253, %r9213, %r9189, %r9264, 0xD2; + lop3.b32 %r9254, %r9217, %r9193, %r9267, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r9253, %r9254}; + // begin inline asm + // chi + lop3.b32 %r9261, %r9189, %r9264, %r9157, 0xD2; + lop3.b32 %r9262, %r9193, %r9267, %r9161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r9261, %r9262}; + // begin inline asm + // chi + lop3.b32 %r9269, %r9205, %r9165, %r9221, 0xD2; + lop3.b32 %r9270, %r9209, %r9169, %r9225, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r9269, %r9270}; + // begin inline asm + // chi + lop3.b32 %r9277, %r9165, %r9221, %r9197, 0xD2; + lop3.b32 %r9278, %r9169, %r9225, %r9201, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r9277, %r9278}; + // begin inline asm + // chi + lop3.b32 %r9285, %r9221, %r9197, %r9173, 0xD2; + lop3.b32 %r9286, %r9225, %r9201, %r9177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r9285, %r9286}; + // begin inline asm + ld.global.nc.v2.u32 {%r9293,%r9294}, [%rd481]; + // end inline asm + xor.b32 %r9346, %r9230, %r9294; + xor.b32 %r9347, %r9229, %r9293; + mov.b64 %rd1317, {%r9347, %r9346}; + mov.b64 %rd1318, {%r9237, %r9238}; + mov.b64 %rd1319, {%r9245, %r9246}; + mov.b64 %rd1320, {%r9253, %r9254}; + mov.b64 %rd1321, {%r9261, %r9262}; + mov.b64 %rd1322, {%r9269, %r9270}; + mov.b64 %rd1323, {%r9277, %r9278}; + mov.b64 %rd1324, {%r9285, %r9286}; + mov.u32 %r30006, 0; + st.local.v2.u32 [%rd2+24], {%r9347, %r9346}; + st.local.v2.u32 [%rd53+96], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+104], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+112], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+120], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+128], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+136], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+144], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+152], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+160], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+168], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+176], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+184], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+192], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+200], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+208], {%r30006, %r30006}; + st.local.v2.u32 [%rd53+216], {%r30006, %r30006}; + mov.u32 %r30021, -2147483648; + st.local.v2.u32 [%rd53+88], {%r9309, %r30021}; + mov.u32 %r30007, %r30006; + mov.u32 %r30008, %r30006; + mov.u32 %r30009, %r30006; + mov.u32 %r30010, %r30006; + mov.u32 %r30011, %r30006; + mov.u32 %r30012, %r30006; + mov.u32 %r30013, %r30006; + mov.u32 %r30014, %r30006; + mov.u32 %r30015, %r30006; 
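+ // The 25-lane sponge state for the next permutation is reinitialized here:
+ // the surrounding mov runs broadcast zero through the state registers, and
+ // the lane at [%rd53+88] was seeded above with {1, 0x80000000}, i.e. the
+ // 64-bit value 0x8000000000000001, consistent with Keccak's pad10*1 padding
+ // when both padding bits land in the same lane. The $L__BB2_23 loop below
+ // then performs 23 rounds (counter %r30056 runs 0..22), XOR-ing the
+ // per-round iota constant loaded from the table at %rd480 into lane 0
+ // (stored at [%rd53+24]); the 24th round is unrolled after the backward
+ // branch.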
+ mov.u32 %r30016, %r30006; + mov.u32 %r30017, %r30006; + mov.u32 %r30018, %r30006; + mov.u32 %r30019, %r30006; + mov.u32 %r30020, %r9309; + mov.u32 %r30022, %r30006; + mov.u32 %r30023, %r30006; + mov.u32 %r30024, %r30006; + mov.u32 %r30025, %r30006; + mov.u32 %r30026, %r30006; + mov.u32 %r30027, %r30006; + mov.u32 %r30028, %r30006; + mov.u32 %r30029, %r30006; + mov.u32 %r30030, %r30006; + mov.u32 %r30031, %r30006; + mov.u32 %r30032, %r30006; + mov.u32 %r30033, %r30006; + mov.u32 %r30034, %r30006; + mov.u32 %r30035, %r30006; + mov.u32 %r30036, %r30006; + mov.u32 %r30037, %r30006; + mov.u32 %r30038, %r30006; + mov.u32 %r30039, %r30006; + mov.u32 %r30056, %r30006; + +$L__BB2_23: + // begin inline asm + // xor5 + lop3.b32 %r9348, %r30042, %r30040, %r30038, 0x96; + lop3.b32 %r9348, %r9348, %r30036, %r30034, 0x96; + lop3.b32 %r9349, %r30043, %r30041, %r30039, 0x96; + lop3.b32 %r9349, %r9349, %r30037, %r30035, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9360, %r30054, %r30052, %r30032, 0x96; + lop3.b32 %r9360, %r9360, %r30030, %r30028, 0x96; + lop3.b32 %r9361, %r30055, %r30053, %r30033, 0x96; + lop3.b32 %r9361, %r9361, %r30031, %r30029, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9372, %r30050, %r30048, %r30026, 0x96; + lop3.b32 %r9372, %r9372, %r30024, %r30022, 0x96; + lop3.b32 %r9373, %r30051, %r30049, %r30027, 0x96; + lop3.b32 %r9373, %r9373, %r30025, %r30023, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9384, %r30046, %r30020, %r30018, 0x96; + lop3.b32 %r9384, %r9384, %r30016, %r30014, 0x96; + lop3.b32 %r9385, %r30047, %r30021, %r30019, 0x96; + lop3.b32 %r9385, %r9385, %r30017, %r30015, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9396, %r30044, %r30012, %r30010, 0x96; + lop3.b32 %r9396, %r9396, %r30008, %r30006, 0x96; + lop3.b32 %r9397, %r30045, %r30013, %r30011, 0x96; + lop3.b32 %r9397, %r9397, %r30009, %r30007, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9408, %r9361, %r9360, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9412, %r9360, %r9361, %r9309; + // end inline asm + xor.b32 %r9842, %r9408, %r9396; + xor.b32 %r9843, %r9412, %r9397; + xor.b32 %r9675, %r30042, %r9842; + xor.b32 %r9678, %r30043, %r9843; + xor.b32 %r9582, %r30040, %r9842; + xor.b32 %r9581, %r30041, %r9843; + xor.b32 %r9629, %r30038, %r9842; + xor.b32 %r9630, %r30039, %r9843; + xor.b32 %r9534, %r30036, %r9842; + xor.b32 %r9533, %r30037, %r9843; + xor.b32 %r9485, %r30034, %r9842; + xor.b32 %r9486, %r30035, %r9843; + // begin inline asm + shf.l.wrap.b32 %r9416, %r9373, %r9372, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9420, %r9372, %r9373, %r9309; + // end inline asm + xor.b32 %r9844, %r9416, %r9348; + xor.b32 %r9845, %r9420, %r9349; + xor.b32 %r9637, %r30054, %r9844; + xor.b32 %r9638, %r30055, %r9845; + xor.b32 %r9454, %r30052, %r9844; + xor.b32 %r9453, %r30053, %r9845; + xor.b32 %r9613, %r30032, %r9844; + xor.b32 %r9614, %r30033, %r9845; + xor.b32 %r9574, %r30030, %r9844; + xor.b32 %r9573, %r30031, %r9845; + xor.b32 %r9557, %r30028, %r9844; + xor.b32 %r9558, %r30029, %r9845; + // begin inline asm + shf.l.wrap.b32 %r9424, %r9385, %r9384, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9428, %r9384, %r9385, %r9309; + // end inline asm + xor.b32 %r9846, %r9424, %r9360; + xor.b32 %r9847, %r9428, %r9361; + xor.b32 %r9494, %r30050, %r9846; + xor.b32 %r9493, %r30051, %r9847; + xor.b32 %r9621, %r30048, %r9846; + xor.b32 %r9622, 
%r30049, %r9847; + xor.b32 %r9502, %r30026, %r9846; + xor.b32 %r9501, %r30027, %r9847; + xor.b32 %r9605, %r30024, %r9846; + xor.b32 %r9606, %r30025, %r9847; + xor.b32 %r9470, %r30022, %r9846; + xor.b32 %r9469, %r30023, %r9847; + // begin inline asm + shf.l.wrap.b32 %r9432, %r9397, %r9396, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9436, %r9396, %r9397, %r9309; + // end inline asm + xor.b32 %r9848, %r9432, %r9372; + xor.b32 %r9849, %r9436, %r9373; + xor.b32 %r9589, %r30046, %r9848; + xor.b32 %r9590, %r30047, %r9849; + xor.b32 %r9566, %r30020, %r9848; + xor.b32 %r9565, %r30021, %r9849; + xor.b32 %r9509, %r30018, %r9848; + xor.b32 %r9510, %r30019, %r9849; + xor.b32 %r9597, %r30016, %r9848; + xor.b32 %r9598, %r30017, %r9849; + xor.b32 %r9526, %r30014, %r9848; + xor.b32 %r9525, %r30015, %r9849; + // begin inline asm + shf.l.wrap.b32 %r9440, %r9349, %r9348, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9444, %r9348, %r9349, %r9309; + // end inline asm + xor.b32 %r9850, %r9440, %r9384; + xor.b32 %r9851, %r9444, %r9385; + xor.b32 %r9541, %r30044, %r9850; + xor.b32 %r9542, %r30045, %r9851; + xor.b32 %r9461, %r30012, %r9850; + xor.b32 %r9462, %r30013, %r9851; + xor.b32 %r9478, %r30010, %r9850; + xor.b32 %r9477, %r30011, %r9851; + xor.b32 %r9517, %r30008, %r9850; + xor.b32 %r9518, %r30009, %r9851; + xor.b32 %r9549, %r30006, %r9850; + xor.b32 %r9550, %r30007, %r9851; + mov.u32 %r9455, 44; + // begin inline asm + shf.l.wrap.b32 %r9448, %r9454, %r9453, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9452, %r9453, %r9454, %r9455; + // end inline asm + mov.u32 %r9463, 20; + // begin inline asm + shf.l.wrap.b32 %r9456, %r9462, %r9461, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9460, %r9461, %r9462, %r9463; + // end inline asm + mov.u32 %r9471, 61; + // begin inline asm + shf.l.wrap.b32 %r9464, %r9470, %r9469, %r9471; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9468, %r9469, %r9470, %r9471; + // end inline asm + mov.u32 %r9479, 39; + // begin inline asm + shf.l.wrap.b32 %r9472, %r9478, %r9477, %r9479; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9476, %r9477, %r9478, %r9479; + // end inline asm + mov.u32 %r9487, 18; + // begin inline asm + shf.l.wrap.b32 %r9480, %r9486, %r9485, %r9487; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9484, %r9485, %r9486, %r9487; + // end inline asm + mov.u32 %r9495, 62; + // begin inline asm + shf.l.wrap.b32 %r9488, %r9494, %r9493, %r9495; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9492, %r9493, %r9494, %r9495; + // end inline asm + mov.u32 %r9503, 43; + // begin inline asm + shf.l.wrap.b32 %r9496, %r9502, %r9501, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9500, %r9501, %r9502, %r9503; + // end inline asm + mov.u32 %r9511, 25; + // begin inline asm + shf.l.wrap.b32 %r9504, %r9510, %r9509, %r9511; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9508, %r9509, %r9510, %r9511; + // end inline asm + mov.u32 %r9519, 8; + // begin inline asm + shf.l.wrap.b32 %r9512, %r9518, %r9517, %r9519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9516, %r9517, %r9518, %r9519; + // end inline asm + mov.u32 %r9527, 56; + // begin inline asm + shf.l.wrap.b32 %r9520, %r9526, %r9525, %r9527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9524, %r9525, %r9526, %r9527; + // end inline asm + mov.u32 %r9535, 41; + // begin inline asm + shf.l.wrap.b32 %r9528, %r9534, %r9533, 
%r9535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9532, %r9533, %r9534, %r9535; + // end inline asm + mov.u32 %r9543, 27; + // begin inline asm + shf.l.wrap.b32 %r9536, %r9542, %r9541, %r9543; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9540, %r9541, %r9542, %r9543; + // end inline asm + mov.u32 %r9551, 14; + // begin inline asm + shf.l.wrap.b32 %r9544, %r9550, %r9549, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9548, %r9549, %r9550, %r9551; + // end inline asm + mov.u32 %r9559, 2; + // begin inline asm + shf.l.wrap.b32 %r9552, %r9558, %r9557, %r9559; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9556, %r9557, %r9558, %r9559; + // end inline asm + mov.u32 %r9567, 55; + // begin inline asm + shf.l.wrap.b32 %r9560, %r9566, %r9565, %r9567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9564, %r9565, %r9566, %r9567; + // end inline asm + mov.u32 %r9575, 45; + // begin inline asm + shf.l.wrap.b32 %r9568, %r9574, %r9573, %r9575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9572, %r9573, %r9574, %r9575; + // end inline asm + mov.u32 %r9583, 36; + // begin inline asm + shf.l.wrap.b32 %r9576, %r9582, %r9581, %r9583; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9580, %r9581, %r9582, %r9583; + // end inline asm + mov.u32 %r9591, 28; + // begin inline asm + shf.l.wrap.b32 %r9584, %r9590, %r9589, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9588, %r9589, %r9590, %r9591; + // end inline asm + mov.u32 %r9599, 21; + // begin inline asm + shf.l.wrap.b32 %r9592, %r9598, %r9597, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9596, %r9597, %r9598, %r9599; + // end inline asm + mov.u32 %r9607, 15; + // begin inline asm + shf.l.wrap.b32 %r9600, %r9606, %r9605, %r9607; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9604, %r9605, %r9606, %r9607; + // end inline asm + mov.u32 %r9615, 10; + // begin inline asm + shf.l.wrap.b32 %r9608, %r9614, %r9613, %r9615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9612, %r9613, %r9614, %r9615; + // end inline asm + mov.u32 %r9623, 6; + // begin inline asm + shf.l.wrap.b32 %r9616, %r9622, %r9621, %r9623; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9620, %r9621, %r9622, %r9623; + // end inline asm + mov.u32 %r9631, 3; + // begin inline asm + shf.l.wrap.b32 %r9624, %r9630, %r9629, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9628, %r9629, %r9630, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9632, %r9638, %r9637, %r9309; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9636, %r9637, %r9638, %r9309; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r9640, %r9675, %r9448, %r9496, 0xD2; + lop3.b32 %r9641, %r9678, %r9452, %r9500, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30054, %r9448, %r9496, %r9592, 0xD2; + lop3.b32 %r30055, %r9452, %r9500, %r9596, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30050, %r9496, %r9592, %r9544, 0xD2; + lop3.b32 %r30051, %r9500, %r9596, %r9548, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30046, %r9592, %r9544, %r9675, 0xD2; + lop3.b32 %r30047, %r9596, %r9548, %r9678, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30044, %r9544, %r9675, %r9448, 0xD2; + lop3.b32 %r30045, %r9548, %r9678, %r9452, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30040, 
%r9584, %r9456, %r9624, 0xD2; + lop3.b32 %r30041, %r9588, %r9460, %r9628, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30052, %r9456, %r9624, %r9568, 0xD2; + lop3.b32 %r30053, %r9460, %r9628, %r9572, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30048, %r9624, %r9568, %r9464, 0xD2; + lop3.b32 %r30049, %r9628, %r9572, %r9468, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30020, %r9568, %r9464, %r9584, 0xD2; + lop3.b32 %r30021, %r9572, %r9468, %r9588, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r30020, %r30021}; + // begin inline asm + // chi + lop3.b32 %r30012, %r9464, %r9584, %r9456, 0xD2; + lop3.b32 %r30013, %r9468, %r9588, %r9460, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r30012, %r30013}; + // begin inline asm + // chi + lop3.b32 %r30038, %r9632, %r9616, %r9504, 0xD2; + lop3.b32 %r30039, %r9636, %r9620, %r9508, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+104], {%r30038, %r30039}; + // begin inline asm + // chi + lop3.b32 %r30032, %r9616, %r9504, %r9512, 0xD2; + lop3.b32 %r30033, %r9620, %r9508, %r9516, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+112], {%r30032, %r30033}; + // begin inline asm + // chi + lop3.b32 %r30026, %r9504, %r9512, %r9480, 0xD2; + lop3.b32 %r30027, %r9508, %r9516, %r9484, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+120], {%r30026, %r30027}; + // begin inline asm + // chi + lop3.b32 %r30018, %r9512, %r9480, %r9632, 0xD2; + lop3.b32 %r30019, %r9516, %r9484, %r9636, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+128], {%r30018, %r30019}; + // begin inline asm + // chi + lop3.b32 %r30010, %r9480, %r9632, %r9616, 0xD2; + lop3.b32 %r30011, %r9484, %r9636, %r9620, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+136], {%r30010, %r30011}; + // begin inline asm + // chi + lop3.b32 %r30036, %r9536, %r9576, %r9608, 0xD2; + lop3.b32 %r30037, %r9540, %r9580, %r9612, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+144], {%r30036, %r30037}; + // begin inline asm + // chi + lop3.b32 %r30030, %r9576, %r9608, %r9600, 0xD2; + lop3.b32 %r30031, %r9580, %r9612, %r9604, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+152], {%r30030, %r30031}; + // begin inline asm + // chi + lop3.b32 %r30024, %r9608, %r9600, %r9520, 0xD2; + lop3.b32 %r30025, %r9612, %r9604, %r9524, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+160], {%r30024, %r30025}; + // begin inline asm + // chi + lop3.b32 %r30016, %r9600, %r9520, %r9536, 0xD2; + lop3.b32 %r30017, %r9604, %r9524, %r9540, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+168], {%r30016, %r30017}; + // begin inline asm + // chi + lop3.b32 %r30008, %r9520, %r9536, %r9576, 0xD2; + lop3.b32 %r30009, %r9524, %r9540, %r9580, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+176], {%r30008, %r30009}; + // begin inline asm + // chi + lop3.b32 %r30034, %r9488, %r9560, %r9472, 0xD2; + lop3.b32 %r30035, %r9492, %r9564, %r9476, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+184], {%r30034, %r30035}; + // begin inline asm + // chi + lop3.b32 %r30028, %r9560, %r9472, %r9528, 0xD2; + lop3.b32 %r30029, %r9564, %r9476, %r9532, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+192], {%r30028, %r30029}; + // begin inline asm + // chi + lop3.b32 %r30022, %r9472, %r9528, %r9552, 0xD2; + lop3.b32 %r30023, %r9476, %r9532, %r9556, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+200], {%r30022, %r30023}; + // begin inline asm + // chi + lop3.b32 %r30014, %r9528, %r9552, %r9488, 0xD2; + lop3.b32 %r30015, %r9532, %r9556, %r9492, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd53+208], {%r30014, %r30015}; + // begin inline asm + // chi + lop3.b32 %r30006, %r9552, %r9488, %r9560, 0xD2; + lop3.b32 %r30007, %r9556, %r9492, %r9564, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+216], {%r30006, %r30007}; + mul.wide.s32 %rd555, %r30056, 8; + add.s64 %rd554, %rd480, %rd555; + // begin inline asm + ld.global.nc.v2.u32 {%r9840,%r9841}, [%rd554]; + // end inline asm + xor.b32 %r30042, %r9640, %r9840; + xor.b32 %r30043, %r9641, %r9841; + add.s32 %r30056, %r30056, 1; + setp.lt.u32 %p18, %r30056, 23; + @%p18 bra $L__BB2_23; + + mov.u32 %r9951, 1; + st.local.v2.u32 [%rd53+32], {%r30054, %r30055}; + st.local.v2.u32 [%rd53+72], {%r30052, %r30053}; + st.local.v2.u32 [%rd53+40], {%r30050, %r30051}; + st.local.v2.u32 [%rd53+80], {%r30048, %r30049}; + st.local.v2.u32 [%rd53+48], {%r30046, %r30047}; + st.local.v2.u32 [%rd53+56], {%r30044, %r30045}; + st.local.v2.u32 [%rd53+24], {%r30042, %r30043}; + // begin inline asm + // xor5 + lop3.b32 %r9852, %r30042, %r30040, %r30038, 0x96; + lop3.b32 %r9852, %r9852, %r30036, %r30034, 0x96; + lop3.b32 %r9853, %r30043, %r30041, %r30039, 0x96; + lop3.b32 %r9853, %r9853, %r30037, %r30035, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9864, %r30054, %r30052, %r30032, 0x96; + lop3.b32 %r9864, %r9864, %r30030, %r30028, 0x96; + lop3.b32 %r9865, %r30055, %r30053, %r30033, 0x96; + lop3.b32 %r9865, %r9865, %r30031, %r30029, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9876, %r30050, %r30048, %r30026, 0x96; + lop3.b32 %r9876, %r9876, %r30024, %r30022, 0x96; + lop3.b32 %r9877, %r30051, %r30049, %r30027, 0x96; + lop3.b32 %r9877, %r9877, %r30025, %r30023, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9888, %r30046, %r30020, %r30018, 0x96; + lop3.b32 %r9888, %r9888, %r30016, %r30014, 0x96; + lop3.b32 %r9889, %r30047, %r30021, %r30019, 0x96; + lop3.b32 %r9889, %r9889, %r30017, %r30015, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9900, %r30044, %r30012, %r30010, 0x96; + lop3.b32 %r9900, %r9900, %r30008, %r30006, 0x96; + lop3.b32 %r9901, %r30045, %r30013, %r30011, 0x96; + lop3.b32 %r9901, %r9901, %r30009, %r30007, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9912, %r9865, %r9864, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9916, %r9864, %r9865, %r9951; + // end inline asm + xor.b32 %r10090, %r9912, %r9900; + xor.b32 %r10091, %r9916, %r9901; + xor.b32 %r10059, %r30042, %r10090; + xor.b32 %r10062, %r30043, %r10091; + xor.b32 %r10022, %r30039, %r10091; + xor.b32 %r10021, %r30038, %r10090; + st.local.v2.u32 [%rd53+104], {%r10021, %r10022}; + // begin inline asm + shf.l.wrap.b32 %r9920, %r9877, %r9876, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9924, %r9876, %r9877, %r9951; + // end inline asm + xor.b32 %r10092, %r9920, %r9852; + xor.b32 %r10093, %r9924, %r9853; + xor.b32 %r9958, %r30052, %r10092; + xor.b32 %r9957, %r30053, %r10093; + xor.b32 %r9997, %r30031, %r10093; + xor.b32 %r9998, %r30030, %r10092; + st.local.v2.u32 [%rd53+152], {%r9998, %r9997}; + // begin inline asm + shf.l.wrap.b32 %r9928, %r9889, %r9888, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9932, %r9888, %r9889, %r9951; + // end inline asm + xor.b32 %r10094, %r9928, %r9864; + xor.b32 %r10095, %r9932, %r9865; + xor.b32 %r9981, %r30027, %r10095; + xor.b32 %r9982, %r30026, %r10094; + st.local.v2.u32 [%rd53+120], {%r9982, %r9981}; + xor.b32 %r9973, 
%r30023, %r10095; + xor.b32 %r9974, %r30022, %r10094; + st.local.v2.u32 [%rd53+200], {%r9974, %r9973}; + // begin inline asm + shf.l.wrap.b32 %r9936, %r9901, %r9900, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9940, %r9900, %r9901, %r9951; + // end inline asm + xor.b32 %r10096, %r9936, %r9876; + xor.b32 %r10097, %r9940, %r9877; + xor.b32 %r10005, %r30046, %r10096; + xor.b32 %r10006, %r30047, %r10097; + xor.b32 %r10014, %r30017, %r10097; + xor.b32 %r10013, %r30016, %r10096; + st.local.v2.u32 [%rd53+168], {%r10013, %r10014}; + // begin inline asm + shf.l.wrap.b32 %r9944, %r9853, %r9852, %r9951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9948, %r9852, %r9853, %r9951; + // end inline asm + xor.b32 %r10098, %r9944, %r9888; + xor.b32 %r10099, %r9948, %r9889; + xor.b32 %r9965, %r30012, %r10098; + xor.b32 %r9966, %r30013, %r10099; + xor.b32 %r9990, %r30007, %r10099; + xor.b32 %r9989, %r30006, %r10098; + st.local.v2.u32 [%rd53+216], {%r9989, %r9990}; + // begin inline asm + shf.l.wrap.b32 %r9952, %r9958, %r9957, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9956, %r9957, %r9958, %r9455; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9960, %r9966, %r9965, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9964, %r9965, %r9966, %r9463; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9972, %r9973, %r9974, %r9471; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9968, %r9974, %r9973, %r9471; + // end inline asm + st.local.v2.u32 [%rd53+96], {%r9968, %r9972}; + // begin inline asm + shf.l.wrap.b32 %r9976, %r9982, %r9981, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9980, %r9981, %r9982, %r9503; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9984, %r9990, %r9989, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9988, %r9989, %r9990, %r9551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9996, %r9997, %r9998, %r9575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r9992, %r9998, %r9997, %r9575; + // end inline asm + st.local.v2.u32 [%rd53+88], {%r9992, %r9996}; + // begin inline asm + shf.l.wrap.b32 %r10000, %r10006, %r10005, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10004, %r10005, %r10006, %r9591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10008, %r10014, %r10013, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10012, %r10013, %r10014, %r9599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10016, %r10022, %r10021, %r9631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r10020, %r10021, %r10022, %r9631; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10024, %r10059, %r9952, %r9976, 0xD2; + lop3.b32 %r10025, %r10062, %r9956, %r9980, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r10032, %r9952, %r9976, %r10008, 0xD2; + lop3.b32 %r10033, %r9956, %r9980, %r10012, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+32], {%r10032, %r10033}; + // begin inline asm + // chi + lop3.b32 %r10040, %r9976, %r10008, %r9984, 0xD2; + lop3.b32 %r10041, %r9980, %r10012, %r9988, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+40], {%r10040, %r10041}; + // begin inline asm + // chi + lop3.b32 %r10048, %r10008, %r9984, %r10059, 0xD2; + lop3.b32 %r10049, %r10012, %r9988, %r10062, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+48], {%r10048, %r10049}; + // begin inline asm + // chi + 
lop3.b32 %r10056, %r9984, %r10059, %r9952, 0xD2; + lop3.b32 %r10057, %r9988, %r10062, %r9956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+56], {%r10056, %r10057}; + // begin inline asm + // chi + lop3.b32 %r10064, %r10000, %r9960, %r10016, 0xD2; + lop3.b32 %r10065, %r10004, %r9964, %r10020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+64], {%r10064, %r10065}; + // begin inline asm + // chi + lop3.b32 %r10072, %r9960, %r10016, %r9992, 0xD2; + lop3.b32 %r10073, %r9964, %r10020, %r9996, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+72], {%r10072, %r10073}; + // begin inline asm + // chi + lop3.b32 %r10080, %r10016, %r9992, %r9968, 0xD2; + lop3.b32 %r10081, %r10020, %r9996, %r9972, 0xD2; + // end inline asm + st.local.v2.u32 [%rd53+80], {%r10080, %r10081}; + // begin inline asm + ld.global.nc.v2.u32 {%r10088,%r10089}, [%rd481]; + // end inline asm + xor.b32 %r10100, %r10025, %r10089; + xor.b32 %r10101, %r10024, %r10088; + st.local.v2.u32 [%rd53+24], {%r10101, %r10100}; + mov.b64 %rd1326, {%r10032, %r10033}; + mov.b64 %rd1327, {%r10040, %r10041}; + mov.b64 %rd1330, {%r10064, %r10065}; + mov.b64 %rd1331, {%r10072, %r10073}; + mov.b64 %rd1332, {%r10080, %r10081}; + mov.b64 %rd1325, {%r10101, %r10100}; + mov.b64 %rd1328, {%r10048, %r10049}; + mov.b64 %rd1329, {%r10056, %r10057}; + st.global.u64 [%rd34], %rd1317; + st.global.u64 [%rd34+8], %rd1318; + st.global.u64 [%rd34+16], %rd1319; + st.global.u64 [%rd34+24], %rd1320; + st.global.u64 [%rd34+32], %rd1321; + st.global.u64 [%rd34+40], %rd1322; + st.global.u64 [%rd34+48], %rd1323; + st.global.u64 [%rd34+56], %rd1324; + st.global.v2.u32 [%rd34+64], {%r10101, %r10100}; + st.global.v2.u32 [%rd34+72], {%r10032, %r10033}; + st.global.v2.u32 [%rd34+80], {%r10040, %r10041}; + st.global.v2.u32 [%rd34+88], {%r10048, %r10049}; + st.global.v2.u32 [%rd34+96], {%r10056, %r10057}; + st.global.v2.u32 [%rd34+104], {%r10064, %r10065}; + st.global.v2.u32 [%rd34+112], {%r10072, %r10073}; + st.global.v2.u32 [%rd34+120], {%r10080, %r10081}; + +$L__BB2_36: + cvta.to.global.u64 %rd1265, %rd361; + shl.b32 %r1695, %r45, 1; + mul.wide.u32 %rd661, %r1695, -954391867; + shr.u64 %rd662, %rd661, 32; + cvt.u32.u64 %r13386, %rd662; + sub.s32 %r13387, %r1695, %r13386; + shr.u32 %r13388, %r13387, 1; + add.s32 %r13389, %r13388, %r13386; + shr.u32 %r13390, %r13389, 20; + mul.lo.s32 %r13391, %r13390, 1179641; + sub.s32 %r13392, %r1695, %r13391; + mul.wide.u32 %rd664, %r13392, 64; + add.s64 %rd126, %rd1265, %rd664; + or.b32 %r1696, %r1695, 1; + mul.wide.u32 %rd665, %r1696, -954391867; + shr.u64 %rd666, %rd665, 32; + cvt.u32.u64 %r13393, %rd666; + sub.s32 %r13394, %r1696, %r13393; + shr.u32 %r13395, %r13394, 1; + add.s32 %r13396, %r13395, %r13393; + shr.u32 %r13397, %r13396, 20; + mul.lo.s32 %r13398, %r13397, 1179641; + sub.s32 %r13399, %r1696, %r13398; + mul.wide.u32 %rd667, %r13399, 64; + add.s64 %rd127, %rd1265, %rd667; + @%p12 bra $L__BB2_50; + + cvta.to.global.u64 %rd668, %rd360; + mul.wide.u32 %rd669, %r45, 128; + add.s64 %rd128, %rd668, %rd669; + ld.global.u64 %rd1333, [%rd128]; + setp.eq.s64 %p25, %rd1333, 0; + @%p25 bra $L__BB2_39; + + ld.global.u64 %rd1348, [%rd128+120]; + ld.global.u64 %rd1347, [%rd128+112]; + ld.global.u64 %rd1346, [%rd128+104]; + ld.global.u64 %rd1345, [%rd128+96]; + ld.global.u64 %rd1344, [%rd128+88]; + ld.global.u64 %rd1343, [%rd128+80]; + ld.global.u64 %rd1342, [%rd128+72]; + ld.global.u64 %rd1341, [%rd128+64]; + ld.global.u64 %rd1340, [%rd128+56]; + ld.global.u64 %rd1339, [%rd128+48]; + ld.global.u64 %rd1338, [%rd128+40]; + 
ld.global.u64 %rd1337, [%rd128+32]; + ld.global.u64 %rd1336, [%rd128+24]; + ld.global.u64 %rd1335, [%rd128+16]; + ld.global.u64 %rd1334, [%rd128+8]; + bra.uni $L__BB2_61; + +$L__BB2_50: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd771, 1179641; + st.local.u64 [%rd2+8], %rd771; + st.local.u32 [%rd2+16], %r1695; + ld.global.u64 %rd772, [%rd126]; + ld.global.u64 %rd773, [%rd126+8]; + ld.global.u64 %rd774, [%rd126+16]; + ld.global.u64 %rd775, [%rd126+24]; + ld.global.u64 %rd776, [%rd126+32]; + ld.global.u64 %rd777, [%rd126+40]; + ld.global.u64 %rd778, [%rd126+48]; + ld.global.u64 %rd779, [%rd126+56]; + st.local.u64 [%rd2+24], %rd772; + st.local.u64 [%rd2+32], %rd773; + st.local.u64 [%rd2+40], %rd774; + st.local.u64 [%rd2+48], %rd775; + st.local.u64 [%rd2+56], %rd776; + st.local.u64 [%rd2+64], %rd777; + st.local.u64 [%rd2+72], %rd778; + st.local.u64 [%rd2+80], %rd779; + cvt.u32.u64 %r16725, %rd772; + xor.b32 %r16726, %r1695, %r16725; + st.local.u32 [%rd2+24], %r16726; + mov.u32 %r30531, 0; + st.local.v2.u32 [%rd2+96], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+104], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+112], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+120], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+128], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+136], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+144], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+152], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+160], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+168], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+176], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+184], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+192], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+200], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+208], {%r30531, %r30531}; + st.local.v2.u32 [%rd2+216], {%r30531, %r30531}; + mov.u32 %r30546, -2147483648; + mov.u32 %r16698, 1; + st.local.v2.u32 [%rd2+88], {%r16698, %r30546}; + ld.local.v2.u32 {%r30567, %r30568}, [%rd2+24]; + mov.b64 {%r30565, %r30566}, %rd777; + shr.u64 %rd780, %rd773, 32; + cvt.u32.u64 %r30579, %rd773; + cvt.u32.u64 %r30580, %rd780; + shr.u64 %rd781, %rd778, 32; + cvt.u32.u64 %r30577, %rd778; + cvt.u32.u64 %r30578, %rd781; + shr.u64 %rd782, %rd774, 32; + cvt.u32.u64 %r30575, %rd774; + cvt.u32.u64 %r30576, %rd782; + shr.u64 %rd783, %rd779, 32; + cvt.u32.u64 %r30573, %rd779; + cvt.u32.u64 %r30574, %rd783; + shr.u64 %rd784, %rd775, 32; + cvt.u32.u64 %r30571, %rd775; + cvt.u32.u64 %r30572, %rd784; + shr.u64 %rd785, %rd776, 32; + cvt.u32.u64 %r30569, %rd776; + cvt.u32.u64 %r30570, %rd785; + mov.u32 %r30532, %r30531; + mov.u32 %r30533, %r30531; + mov.u32 %r30534, %r30531; + mov.u32 %r30535, %r30531; + mov.u32 %r30536, %r30531; + mov.u32 %r30537, %r30531; + mov.u32 %r30538, %r30531; + mov.u32 %r30539, %r30531; + mov.u32 %r30540, %r30531; + mov.u32 %r30541, %r30531; + mov.u32 %r30542, %r30531; + mov.u32 %r30543, %r30531; + mov.u32 %r30544, %r30531; + mov.u32 %r30545, %r16698; + mov.u32 %r30547, %r30531; + mov.u32 %r30548, %r30531; + mov.u32 %r30549, %r30531; + mov.u32 %r30550, %r30531; + mov.u32 %r30551, %r30531; + mov.u32 %r30552, %r30531; + mov.u32 %r30553, %r30531; + mov.u32 %r30554, %r30531; + mov.u32 %r30555, %r30531; + mov.u32 %r30556, %r30531; + mov.u32 %r30557, %r30531; + mov.u32 %r30558, %r30531; + mov.u32 %r30559, %r30531; + mov.u32 %r30560, %r30531; + mov.u32 %r30561, %r30531; + mov.u32 %r30562, %r30531; + mov.u32 %r30563, %r30531; + mov.u32 %r30564, %r30531; + mov.u32 %r30581, %r30531; + +$L__BB2_51: + // begin inline asm + // xor5 + lop3.b32 %r16729, %r30567, %r30565, 
%r30563, 0x96; + lop3.b32 %r16729, %r16729, %r30561, %r30559, 0x96; + lop3.b32 %r16730, %r30568, %r30566, %r30564, 0x96; + lop3.b32 %r16730, %r16730, %r30562, %r30560, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16741, %r30579, %r30577, %r30557, 0x96; + lop3.b32 %r16741, %r16741, %r30555, %r30553, 0x96; + lop3.b32 %r16742, %r30580, %r30578, %r30558, 0x96; + lop3.b32 %r16742, %r16742, %r30556, %r30554, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16753, %r30575, %r30573, %r30551, 0x96; + lop3.b32 %r16753, %r16753, %r30549, %r30547, 0x96; + lop3.b32 %r16754, %r30576, %r30574, %r30552, 0x96; + lop3.b32 %r16754, %r16754, %r30550, %r30548, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16765, %r30571, %r30545, %r30543, 0x96; + lop3.b32 %r16765, %r16765, %r30541, %r30539, 0x96; + lop3.b32 %r16766, %r30572, %r30546, %r30544, 0x96; + lop3.b32 %r16766, %r16766, %r30542, %r30540, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16777, %r30569, %r30537, %r30535, 0x96; + lop3.b32 %r16777, %r16777, %r30533, %r30531, 0x96; + lop3.b32 %r16778, %r30570, %r30538, %r30536, 0x96; + lop3.b32 %r16778, %r16778, %r30534, %r30532, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16789, %r16742, %r16741, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16793, %r16741, %r16742, %r16698; + // end inline asm + xor.b32 %r17223, %r16789, %r16777; + xor.b32 %r17224, %r16793, %r16778; + xor.b32 %r17056, %r30567, %r17223; + xor.b32 %r17059, %r30568, %r17224; + xor.b32 %r16963, %r30565, %r17223; + xor.b32 %r16962, %r30566, %r17224; + xor.b32 %r17010, %r30563, %r17223; + xor.b32 %r17011, %r30564, %r17224; + xor.b32 %r16915, %r30561, %r17223; + xor.b32 %r16914, %r30562, %r17224; + xor.b32 %r16866, %r30559, %r17223; + xor.b32 %r16867, %r30560, %r17224; + // begin inline asm + shf.l.wrap.b32 %r16797, %r16754, %r16753, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16801, %r16753, %r16754, %r16698; + // end inline asm + xor.b32 %r17225, %r16797, %r16729; + xor.b32 %r17226, %r16801, %r16730; + xor.b32 %r17018, %r30579, %r17225; + xor.b32 %r17019, %r30580, %r17226; + xor.b32 %r16835, %r30577, %r17225; + xor.b32 %r16834, %r30578, %r17226; + xor.b32 %r16994, %r30557, %r17225; + xor.b32 %r16995, %r30558, %r17226; + xor.b32 %r16955, %r30555, %r17225; + xor.b32 %r16954, %r30556, %r17226; + xor.b32 %r16938, %r30553, %r17225; + xor.b32 %r16939, %r30554, %r17226; + // begin inline asm + shf.l.wrap.b32 %r16805, %r16766, %r16765, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16809, %r16765, %r16766, %r16698; + // end inline asm + xor.b32 %r17227, %r16805, %r16741; + xor.b32 %r17228, %r16809, %r16742; + xor.b32 %r16875, %r30575, %r17227; + xor.b32 %r16874, %r30576, %r17228; + xor.b32 %r17002, %r30573, %r17227; + xor.b32 %r17003, %r30574, %r17228; + xor.b32 %r16883, %r30551, %r17227; + xor.b32 %r16882, %r30552, %r17228; + xor.b32 %r16986, %r30549, %r17227; + xor.b32 %r16987, %r30550, %r17228; + xor.b32 %r16851, %r30547, %r17227; + xor.b32 %r16850, %r30548, %r17228; + // begin inline asm + shf.l.wrap.b32 %r16813, %r16778, %r16777, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16817, %r16777, %r16778, %r16698; + // end inline asm + xor.b32 %r17229, %r16813, %r16753; + xor.b32 %r17230, %r16817, %r16754; + xor.b32 %r16970, %r30571, %r17229; + xor.b32 %r16971, %r30572, %r17230; + xor.b32 %r16947, %r30545, %r17229; + xor.b32 %r16946, %r30546, 
%r17230; + xor.b32 %r16890, %r30543, %r17229; + xor.b32 %r16891, %r30544, %r17230; + xor.b32 %r16978, %r30541, %r17229; + xor.b32 %r16979, %r30542, %r17230; + xor.b32 %r16907, %r30539, %r17229; + xor.b32 %r16906, %r30540, %r17230; + // begin inline asm + shf.l.wrap.b32 %r16821, %r16730, %r16729, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16825, %r16729, %r16730, %r16698; + // end inline asm + xor.b32 %r17231, %r16821, %r16765; + xor.b32 %r17232, %r16825, %r16766; + xor.b32 %r16922, %r30569, %r17231; + xor.b32 %r16923, %r30570, %r17232; + xor.b32 %r16842, %r30537, %r17231; + xor.b32 %r16843, %r30538, %r17232; + xor.b32 %r16859, %r30535, %r17231; + xor.b32 %r16858, %r30536, %r17232; + xor.b32 %r16898, %r30533, %r17231; + xor.b32 %r16899, %r30534, %r17232; + xor.b32 %r16930, %r30531, %r17231; + xor.b32 %r16931, %r30532, %r17232; + mov.u32 %r16836, 44; + // begin inline asm + shf.l.wrap.b32 %r16829, %r16835, %r16834, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16833, %r16834, %r16835, %r16836; + // end inline asm + mov.u32 %r16844, 20; + // begin inline asm + shf.l.wrap.b32 %r16837, %r16843, %r16842, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16841, %r16842, %r16843, %r16844; + // end inline asm + mov.u32 %r16852, 61; + // begin inline asm + shf.l.wrap.b32 %r16845, %r16851, %r16850, %r16852; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16849, %r16850, %r16851, %r16852; + // end inline asm + mov.u32 %r16860, 39; + // begin inline asm + shf.l.wrap.b32 %r16853, %r16859, %r16858, %r16860; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16857, %r16858, %r16859, %r16860; + // end inline asm + mov.u32 %r16868, 18; + // begin inline asm + shf.l.wrap.b32 %r16861, %r16867, %r16866, %r16868; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16865, %r16866, %r16867, %r16868; + // end inline asm + mov.u32 %r16876, 62; + // begin inline asm + shf.l.wrap.b32 %r16869, %r16875, %r16874, %r16876; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16873, %r16874, %r16875, %r16876; + // end inline asm + mov.u32 %r16884, 43; + // begin inline asm + shf.l.wrap.b32 %r16877, %r16883, %r16882, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16881, %r16882, %r16883, %r16884; + // end inline asm + mov.u32 %r16892, 25; + // begin inline asm + shf.l.wrap.b32 %r16885, %r16891, %r16890, %r16892; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16889, %r16890, %r16891, %r16892; + // end inline asm + mov.u32 %r16900, 8; + // begin inline asm + shf.l.wrap.b32 %r16893, %r16899, %r16898, %r16900; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16897, %r16898, %r16899, %r16900; + // end inline asm + mov.u32 %r16908, 56; + // begin inline asm + shf.l.wrap.b32 %r16901, %r16907, %r16906, %r16908; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16905, %r16906, %r16907, %r16908; + // end inline asm + mov.u32 %r16916, 41; + // begin inline asm + shf.l.wrap.b32 %r16909, %r16915, %r16914, %r16916; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16913, %r16914, %r16915, %r16916; + // end inline asm + mov.u32 %r16924, 27; + // begin inline asm + shf.l.wrap.b32 %r16917, %r16923, %r16922, %r16924; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16921, %r16922, %r16923, %r16924; + // end inline asm + mov.u32 %r16932, 14; + // begin inline asm + shf.l.wrap.b32 %r16925, %r16931, %r16930, %r16932; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r16929, %r16930, %r16931, %r16932; + // end inline asm + mov.u32 %r16940, 2; + // begin inline asm + shf.l.wrap.b32 %r16933, %r16939, %r16938, %r16940; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16937, %r16938, %r16939, %r16940; + // end inline asm + mov.u32 %r16948, 55; + // begin inline asm + shf.l.wrap.b32 %r16941, %r16947, %r16946, %r16948; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16945, %r16946, %r16947, %r16948; + // end inline asm + mov.u32 %r16956, 45; + // begin inline asm + shf.l.wrap.b32 %r16949, %r16955, %r16954, %r16956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16953, %r16954, %r16955, %r16956; + // end inline asm + mov.u32 %r16964, 36; + // begin inline asm + shf.l.wrap.b32 %r16957, %r16963, %r16962, %r16964; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16961, %r16962, %r16963, %r16964; + // end inline asm + mov.u32 %r16972, 28; + // begin inline asm + shf.l.wrap.b32 %r16965, %r16971, %r16970, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16969, %r16970, %r16971, %r16972; + // end inline asm + mov.u32 %r16980, 21; + // begin inline asm + shf.l.wrap.b32 %r16973, %r16979, %r16978, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16977, %r16978, %r16979, %r16980; + // end inline asm + mov.u32 %r16988, 15; + // begin inline asm + shf.l.wrap.b32 %r16981, %r16987, %r16986, %r16988; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16985, %r16986, %r16987, %r16988; + // end inline asm + mov.u32 %r16996, 10; + // begin inline asm + shf.l.wrap.b32 %r16989, %r16995, %r16994, %r16996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16993, %r16994, %r16995, %r16996; + // end inline asm + mov.u32 %r17004, 6; + // begin inline asm + shf.l.wrap.b32 %r16997, %r17003, %r17002, %r17004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17001, %r17002, %r17003, %r17004; + // end inline asm + mov.u32 %r17012, 3; + // begin inline asm + shf.l.wrap.b32 %r17005, %r17011, %r17010, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17009, %r17010, %r17011, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17013, %r17019, %r17018, %r16698; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17017, %r17018, %r17019, %r16698; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17021, %r17056, %r16829, %r16877, 0xD2; + lop3.b32 %r17022, %r17059, %r16833, %r16881, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30579, %r16829, %r16877, %r16973, 0xD2; + lop3.b32 %r30580, %r16833, %r16881, %r16977, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30575, %r16877, %r16973, %r16925, 0xD2; + lop3.b32 %r30576, %r16881, %r16977, %r16929, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30571, %r16973, %r16925, %r17056, 0xD2; + lop3.b32 %r30572, %r16977, %r16929, %r17059, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30569, %r16925, %r17056, %r16829, 0xD2; + lop3.b32 %r30570, %r16929, %r17059, %r16833, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30565, %r16965, %r16837, %r17005, 0xD2; + lop3.b32 %r30566, %r16969, %r16841, %r17009, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30577, %r16837, %r17005, %r16949, 0xD2; + lop3.b32 %r30578, %r16841, %r17009, %r16953, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30573, 
%r17005, %r16949, %r16845, 0xD2; + lop3.b32 %r30574, %r17009, %r16953, %r16849, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30545, %r16949, %r16845, %r16965, 0xD2; + lop3.b32 %r30546, %r16953, %r16849, %r16969, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30545, %r30546}; + // begin inline asm + // chi + lop3.b32 %r30537, %r16845, %r16965, %r16837, 0xD2; + lop3.b32 %r30538, %r16849, %r16969, %r16841, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30537, %r30538}; + // begin inline asm + // chi + lop3.b32 %r30563, %r17013, %r16997, %r16885, 0xD2; + lop3.b32 %r30564, %r17017, %r17001, %r16889, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30563, %r30564}; + // begin inline asm + // chi + lop3.b32 %r30557, %r16997, %r16885, %r16893, 0xD2; + lop3.b32 %r30558, %r17001, %r16889, %r16897, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30557, %r30558}; + // begin inline asm + // chi + lop3.b32 %r30551, %r16885, %r16893, %r16861, 0xD2; + lop3.b32 %r30552, %r16889, %r16897, %r16865, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30551, %r30552}; + // begin inline asm + // chi + lop3.b32 %r30543, %r16893, %r16861, %r17013, 0xD2; + lop3.b32 %r30544, %r16897, %r16865, %r17017, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30543, %r30544}; + // begin inline asm + // chi + lop3.b32 %r30535, %r16861, %r17013, %r16997, 0xD2; + lop3.b32 %r30536, %r16865, %r17017, %r17001, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30535, %r30536}; + // begin inline asm + // chi + lop3.b32 %r30561, %r16917, %r16957, %r16989, 0xD2; + lop3.b32 %r30562, %r16921, %r16961, %r16993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30561, %r30562}; + // begin inline asm + // chi + lop3.b32 %r30555, %r16957, %r16989, %r16981, 0xD2; + lop3.b32 %r30556, %r16961, %r16993, %r16985, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30555, %r30556}; + // begin inline asm + // chi + lop3.b32 %r30549, %r16989, %r16981, %r16901, 0xD2; + lop3.b32 %r30550, %r16993, %r16985, %r16905, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30549, %r30550}; + // begin inline asm + // chi + lop3.b32 %r30541, %r16981, %r16901, %r16917, 0xD2; + lop3.b32 %r30542, %r16985, %r16905, %r16921, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30541, %r30542}; + // begin inline asm + // chi + lop3.b32 %r30533, %r16901, %r16917, %r16957, 0xD2; + lop3.b32 %r30534, %r16905, %r16921, %r16961, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30533, %r30534}; + // begin inline asm + // chi + lop3.b32 %r30559, %r16869, %r16941, %r16853, 0xD2; + lop3.b32 %r30560, %r16873, %r16945, %r16857, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30559, %r30560}; + // begin inline asm + // chi + lop3.b32 %r30553, %r16941, %r16853, %r16909, 0xD2; + lop3.b32 %r30554, %r16945, %r16857, %r16913, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30553, %r30554}; + // begin inline asm + // chi + lop3.b32 %r30547, %r16853, %r16909, %r16933, 0xD2; + lop3.b32 %r30548, %r16857, %r16913, %r16937, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30547, %r30548}; + // begin inline asm + // chi + lop3.b32 %r30539, %r16909, %r16933, %r16869, 0xD2; + lop3.b32 %r30540, %r16913, %r16937, %r16873, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30539, %r30540}; + // begin inline asm + // chi + lop3.b32 %r30531, %r16933, %r16869, %r16941, 0xD2; + lop3.b32 %r30532, %r16937, %r16873, %r16945, 0xD2; + 
// end inline asm + st.local.v2.u32 [%rd2+216], {%r30531, %r30532}; + mul.wide.s32 %rd787, %r30581, 8; + mov.u64 %rd788, keccak_round_constants; + cvta.const.u64 %rd789, %rd788; + add.s64 %rd786, %rd789, %rd787; + // begin inline asm + ld.global.nc.v2.u32 {%r17221,%r17222}, [%rd786]; + // end inline asm + xor.b32 %r30567, %r17021, %r17221; + xor.b32 %r30568, %r17022, %r17222; + add.s32 %r30581, %r30581, 1; + setp.lt.u32 %p31, %r30581, 23; + @%p31 bra $L__BB2_51; + + add.u64 %rd176, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30579, %r30580}; + st.local.v2.u32 [%rd2+72], {%r30577, %r30578}; + st.local.v2.u32 [%rd2+40], {%r30575, %r30576}; + st.local.v2.u32 [%rd2+80], {%r30573, %r30574}; + st.local.v2.u32 [%rd2+48], {%r30571, %r30572}; + st.local.v2.u32 [%rd2+56], {%r30569, %r30570}; + st.local.v2.u32 [%rd2+24], {%r30567, %r30568}; + // begin inline asm + // xor5 + lop3.b32 %r17233, %r30567, %r30565, %r30563, 0x96; + lop3.b32 %r17233, %r17233, %r30561, %r30559, 0x96; + lop3.b32 %r17234, %r30568, %r30566, %r30564, 0x96; + lop3.b32 %r17234, %r17234, %r30562, %r30560, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17245, %r30579, %r30577, %r30557, 0x96; + lop3.b32 %r17245, %r17245, %r30555, %r30553, 0x96; + lop3.b32 %r17246, %r30580, %r30578, %r30558, 0x96; + lop3.b32 %r17246, %r17246, %r30556, %r30554, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17257, %r30575, %r30573, %r30551, 0x96; + lop3.b32 %r17257, %r17257, %r30549, %r30547, 0x96; + lop3.b32 %r17258, %r30576, %r30574, %r30552, 0x96; + lop3.b32 %r17258, %r17258, %r30550, %r30548, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17269, %r30571, %r30545, %r30543, 0x96; + lop3.b32 %r17269, %r17269, %r30541, %r30539, 0x96; + lop3.b32 %r17270, %r30572, %r30546, %r30544, 0x96; + lop3.b32 %r17270, %r17270, %r30542, %r30540, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17281, %r30569, %r30537, %r30535, 0x96; + lop3.b32 %r17281, %r17281, %r30533, %r30531, 0x96; + lop3.b32 %r17282, %r30570, %r30538, %r30536, 0x96; + lop3.b32 %r17282, %r17282, %r30534, %r30532, 0x96; + // end inline asm + mov.u32 %r17485, 1; + // begin inline asm + shf.l.wrap.b32 %r17293, %r17246, %r17245, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17297, %r17245, %r17246, %r17485; + // end inline asm + xor.b32 %r17512, %r17293, %r17281; + xor.b32 %r17513, %r17297, %r17282; + xor.b32 %r17440, %r30567, %r17512; + xor.b32 %r17443, %r30568, %r17513; + xor.b32 %r17403, %r30564, %r17513; + xor.b32 %r17402, %r30563, %r17512; + st.local.v2.u32 [%rd2+104], {%r17402, %r17403}; + // begin inline asm + shf.l.wrap.b32 %r17301, %r17258, %r17257, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17305, %r17257, %r17258, %r17485; + // end inline asm + xor.b32 %r17514, %r17301, %r17233; + xor.b32 %r17515, %r17305, %r17234; + xor.b32 %r17339, %r30577, %r17514; + xor.b32 %r17338, %r30578, %r17515; + xor.b32 %r17378, %r30556, %r17515; + xor.b32 %r17379, %r30555, %r17514; + st.local.v2.u32 [%rd2+152], {%r17379, %r17378}; + // begin inline asm + shf.l.wrap.b32 %r17309, %r17270, %r17269, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17313, %r17269, %r17270, %r17485; + // end inline asm + xor.b32 %r17516, %r17309, %r17245; + xor.b32 %r17517, %r17313, %r17246; + xor.b32 %r17362, %r30552, %r17517; + xor.b32 %r17363, %r30551, %r17516; + st.local.v2.u32 [%rd2+120], {%r17363, %r17362}; + xor.b32 %r17354, %r30548, %r17517; + xor.b32 %r17355, 
%r30547, %r17516; + st.local.v2.u32 [%rd2+200], {%r17355, %r17354}; + // begin inline asm + shf.l.wrap.b32 %r17317, %r17282, %r17281, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17321, %r17281, %r17282, %r17485; + // end inline asm + xor.b32 %r17518, %r17317, %r17257; + xor.b32 %r17519, %r17321, %r17258; + xor.b32 %r17386, %r30571, %r17518; + xor.b32 %r17387, %r30572, %r17519; + xor.b32 %r17395, %r30542, %r17519; + xor.b32 %r17394, %r30541, %r17518; + st.local.v2.u32 [%rd2+168], {%r17394, %r17395}; + // begin inline asm + shf.l.wrap.b32 %r17325, %r17234, %r17233, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17329, %r17233, %r17234, %r17485; + // end inline asm + xor.b32 %r17520, %r17325, %r17269; + xor.b32 %r17521, %r17329, %r17270; + xor.b32 %r17346, %r30537, %r17520; + xor.b32 %r17347, %r30538, %r17521; + xor.b32 %r17371, %r30532, %r17521; + xor.b32 %r17370, %r30531, %r17520; + st.local.v2.u32 [%rd2+216], {%r17370, %r17371}; + // begin inline asm + shf.l.wrap.b32 %r17333, %r17339, %r17338, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17337, %r17338, %r17339, %r16836; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17341, %r17347, %r17346, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17345, %r17346, %r17347, %r16844; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17353, %r17354, %r17355, %r16852; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17349, %r17355, %r17354, %r16852; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r17349, %r17353}; + // begin inline asm + shf.l.wrap.b32 %r17357, %r17363, %r17362, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17361, %r17362, %r17363, %r16884; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17365, %r17371, %r17370, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17369, %r17370, %r17371, %r16932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17377, %r17378, %r17379, %r16956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17373, %r17379, %r17378, %r16956; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r17373, %r17377}; + // begin inline asm + shf.l.wrap.b32 %r17381, %r17387, %r17386, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17385, %r17386, %r17387, %r16972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17389, %r17395, %r17394, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17393, %r17394, %r17395, %r16980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17397, %r17403, %r17402, %r17012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17401, %r17402, %r17403, %r17012; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17405, %r17440, %r17333, %r17357, 0xD2; + lop3.b32 %r17406, %r17443, %r17337, %r17361, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30714, %r17333, %r17357, %r17389, 0xD2; + lop3.b32 %r30715, %r17337, %r17361, %r17393, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30714, %r30715}; + // begin inline asm + // chi + lop3.b32 %r30710, %r17357, %r17389, %r17365, 0xD2; + lop3.b32 %r30711, %r17361, %r17393, %r17369, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30710, %r30711}; + // begin inline asm + // chi + lop3.b32 %r30706, %r17389, %r17365, %r17440, 0xD2; + lop3.b32 %r30707, %r17393, %r17369, %r17443, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], 
{%r30706, %r30707}; + // begin inline asm + // chi + lop3.b32 %r30704, %r17365, %r17440, %r17333, 0xD2; + lop3.b32 %r30705, %r17369, %r17443, %r17337, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30704, %r30705}; + // begin inline asm + // chi + lop3.b32 %r30700, %r17381, %r17341, %r17397, 0xD2; + lop3.b32 %r30701, %r17385, %r17345, %r17401, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30700, %r30701}; + // begin inline asm + // chi + lop3.b32 %r30712, %r17341, %r17397, %r17373, 0xD2; + lop3.b32 %r30713, %r17345, %r17401, %r17377, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30712, %r30713}; + // begin inline asm + // chi + lop3.b32 %r30708, %r17397, %r17373, %r17349, 0xD2; + lop3.b32 %r30709, %r17401, %r17377, %r17353, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30708, %r30709}; + add.s64 %rd790, %rd789, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r17469,%r17470}, [%rd790]; + // end inline asm + xor.b32 %r30702, %r17405, %r17469; + xor.b32 %r30703, %r17406, %r17470; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + st.local.u64 [%rd176], %rd361; + mov.u64 %rd794, 1179641; + st.local.u64 [%rd176+8], %rd794; + st.local.u32 [%rd176+16], %r1696; + ld.global.u64 %rd795, [%rd127]; + ld.global.u64 %rd796, [%rd127+8]; + ld.global.u64 %rd797, [%rd127+16]; + ld.global.u64 %rd798, [%rd127+24]; + ld.global.u64 %rd799, [%rd127+32]; + ld.global.u64 %rd800, [%rd127+40]; + ld.global.u64 %rd801, [%rd127+48]; + ld.global.u64 %rd802, [%rd127+56]; + st.local.u64 [%rd176+32], %rd796; + st.local.u64 [%rd176+40], %rd797; + st.local.u64 [%rd176+48], %rd798; + st.local.u64 [%rd176+56], %rd799; + st.local.u64 [%rd176+64], %rd800; + st.local.u64 [%rd176+72], %rd801; + st.local.u64 [%rd176+80], %rd802; + cvt.u32.u64 %r17522, %rd795; + xor.b32 %r17523, %r1696, %r17522; + st.local.u64 [%rd176+24], %rd795; + st.local.u32 [%rd176+24], %r17523; + mov.u32 %r30582, 0; + st.local.v2.u32 [%rd176+96], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+104], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+112], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+120], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+128], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+136], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+144], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+152], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+160], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+168], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+176], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+184], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+192], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+200], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+208], {%r30582, %r30582}; + st.local.v2.u32 [%rd176+216], {%r30582, %r30582}; + mov.u32 %r30597, -2147483648; + st.local.v2.u32 [%rd176+88], {%r17485, %r30597}; + ld.local.v2.u32 {%r30618, %r30619}, [%rd176+24]; + mov.b64 {%r30616, %r30617}, %rd800; + shr.u64 %rd803, %rd796, 32; + cvt.u32.u64 %r30630, %rd796; + cvt.u32.u64 %r30631, %rd803; + shr.u64 %rd804, %rd801, 32; + cvt.u32.u64 %r30628, %rd801; + cvt.u32.u64 %r30629, %rd804; + shr.u64 %rd805, %rd797, 32; + cvt.u32.u64 %r30626, %rd797; + cvt.u32.u64 %r30627, %rd805; + shr.u64 %rd806, %rd802, 32; + cvt.u32.u64 %r30624, %rd802; + cvt.u32.u64 %r30625, %rd806; + shr.u64 %rd807, %rd798, 32; + cvt.u32.u64 %r30622, %rd798; + cvt.u32.u64 %r30623, %rd807; + shr.u64 %rd808, %rd799, 32; + cvt.u32.u64 %r30620, %rd799; + cvt.u32.u64 %r30621, %rd808; + mov.u32 %r30583, %r30582; + mov.u32 %r30584, %r30582; + mov.u32 
%r30585, %r30582; + mov.u32 %r30586, %r30582; + mov.u32 %r30587, %r30582; + mov.u32 %r30588, %r30582; + mov.u32 %r30589, %r30582; + mov.u32 %r30590, %r30582; + mov.u32 %r30591, %r30582; + mov.u32 %r30592, %r30582; + mov.u32 %r30593, %r30582; + mov.u32 %r30594, %r30582; + mov.u32 %r30595, %r30582; + mov.u32 %r30596, %r17485; + mov.u32 %r30598, %r30582; + mov.u32 %r30599, %r30582; + mov.u32 %r30600, %r30582; + mov.u32 %r30601, %r30582; + mov.u32 %r30602, %r30582; + mov.u32 %r30603, %r30582; + mov.u32 %r30604, %r30582; + mov.u32 %r30605, %r30582; + mov.u32 %r30606, %r30582; + mov.u32 %r30607, %r30582; + mov.u32 %r30608, %r30582; + mov.u32 %r30609, %r30582; + mov.u32 %r30610, %r30582; + mov.u32 %r30611, %r30582; + mov.u32 %r30612, %r30582; + mov.u32 %r30613, %r30582; + mov.u32 %r30614, %r30582; + mov.u32 %r30615, %r30582; + mov.u32 %r30632, %r30582; + +$L__BB2_53: + // begin inline asm + // xor5 + lop3.b32 %r17526, %r30618, %r30616, %r30614, 0x96; + lop3.b32 %r17526, %r17526, %r30612, %r30610, 0x96; + lop3.b32 %r17527, %r30619, %r30617, %r30615, 0x96; + lop3.b32 %r17527, %r17527, %r30613, %r30611, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17538, %r30630, %r30628, %r30608, 0x96; + lop3.b32 %r17538, %r17538, %r30606, %r30604, 0x96; + lop3.b32 %r17539, %r30631, %r30629, %r30609, 0x96; + lop3.b32 %r17539, %r17539, %r30607, %r30605, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17550, %r30626, %r30624, %r30602, 0x96; + lop3.b32 %r17550, %r17550, %r30600, %r30598, 0x96; + lop3.b32 %r17551, %r30627, %r30625, %r30603, 0x96; + lop3.b32 %r17551, %r17551, %r30601, %r30599, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17562, %r30622, %r30596, %r30594, 0x96; + lop3.b32 %r17562, %r17562, %r30592, %r30590, 0x96; + lop3.b32 %r17563, %r30623, %r30597, %r30595, 0x96; + lop3.b32 %r17563, %r17563, %r30593, %r30591, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17574, %r30620, %r30588, %r30586, 0x96; + lop3.b32 %r17574, %r17574, %r30584, %r30582, 0x96; + lop3.b32 %r17575, %r30621, %r30589, %r30587, 0x96; + lop3.b32 %r17575, %r17575, %r30585, %r30583, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17586, %r17539, %r17538, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17590, %r17538, %r17539, %r17485; + // end inline asm + xor.b32 %r18020, %r17586, %r17574; + xor.b32 %r18021, %r17590, %r17575; + xor.b32 %r17853, %r30618, %r18020; + xor.b32 %r17856, %r30619, %r18021; + xor.b32 %r17760, %r30616, %r18020; + xor.b32 %r17759, %r30617, %r18021; + xor.b32 %r17807, %r30614, %r18020; + xor.b32 %r17808, %r30615, %r18021; + xor.b32 %r17712, %r30612, %r18020; + xor.b32 %r17711, %r30613, %r18021; + xor.b32 %r17663, %r30610, %r18020; + xor.b32 %r17664, %r30611, %r18021; + // begin inline asm + shf.l.wrap.b32 %r17594, %r17551, %r17550, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17598, %r17550, %r17551, %r17485; + // end inline asm + xor.b32 %r18022, %r17594, %r17526; + xor.b32 %r18023, %r17598, %r17527; + xor.b32 %r17815, %r30630, %r18022; + xor.b32 %r17816, %r30631, %r18023; + xor.b32 %r17632, %r30628, %r18022; + xor.b32 %r17631, %r30629, %r18023; + xor.b32 %r17791, %r30608, %r18022; + xor.b32 %r17792, %r30609, %r18023; + xor.b32 %r17752, %r30606, %r18022; + xor.b32 %r17751, %r30607, %r18023; + xor.b32 %r17735, %r30604, %r18022; + xor.b32 %r17736, %r30605, %r18023; + // begin inline asm + shf.l.wrap.b32 %r17602, %r17563, %r17562, %r17485; + // end inline 
asm + // begin inline asm + shf.l.wrap.b32 %r17606, %r17562, %r17563, %r17485; + // end inline asm + xor.b32 %r18024, %r17602, %r17538; + xor.b32 %r18025, %r17606, %r17539; + xor.b32 %r17672, %r30626, %r18024; + xor.b32 %r17671, %r30627, %r18025; + xor.b32 %r17799, %r30624, %r18024; + xor.b32 %r17800, %r30625, %r18025; + xor.b32 %r17680, %r30602, %r18024; + xor.b32 %r17679, %r30603, %r18025; + xor.b32 %r17783, %r30600, %r18024; + xor.b32 %r17784, %r30601, %r18025; + xor.b32 %r17648, %r30598, %r18024; + xor.b32 %r17647, %r30599, %r18025; + // begin inline asm + shf.l.wrap.b32 %r17610, %r17575, %r17574, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17614, %r17574, %r17575, %r17485; + // end inline asm + xor.b32 %r18026, %r17610, %r17550; + xor.b32 %r18027, %r17614, %r17551; + xor.b32 %r17767, %r30622, %r18026; + xor.b32 %r17768, %r30623, %r18027; + xor.b32 %r17744, %r30596, %r18026; + xor.b32 %r17743, %r30597, %r18027; + xor.b32 %r17687, %r30594, %r18026; + xor.b32 %r17688, %r30595, %r18027; + xor.b32 %r17775, %r30592, %r18026; + xor.b32 %r17776, %r30593, %r18027; + xor.b32 %r17704, %r30590, %r18026; + xor.b32 %r17703, %r30591, %r18027; + // begin inline asm + shf.l.wrap.b32 %r17618, %r17527, %r17526, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17622, %r17526, %r17527, %r17485; + // end inline asm + xor.b32 %r18028, %r17618, %r17562; + xor.b32 %r18029, %r17622, %r17563; + xor.b32 %r17719, %r30620, %r18028; + xor.b32 %r17720, %r30621, %r18029; + xor.b32 %r17639, %r30588, %r18028; + xor.b32 %r17640, %r30589, %r18029; + xor.b32 %r17656, %r30586, %r18028; + xor.b32 %r17655, %r30587, %r18029; + xor.b32 %r17695, %r30584, %r18028; + xor.b32 %r17696, %r30585, %r18029; + xor.b32 %r17727, %r30582, %r18028; + xor.b32 %r17728, %r30583, %r18029; + mov.u32 %r17633, 44; + // begin inline asm + shf.l.wrap.b32 %r17626, %r17632, %r17631, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17630, %r17631, %r17632, %r17633; + // end inline asm + mov.u32 %r17641, 20; + // begin inline asm + shf.l.wrap.b32 %r17634, %r17640, %r17639, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17638, %r17639, %r17640, %r17641; + // end inline asm + mov.u32 %r17649, 61; + // begin inline asm + shf.l.wrap.b32 %r17642, %r17648, %r17647, %r17649; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17646, %r17647, %r17648, %r17649; + // end inline asm + mov.u32 %r17657, 39; + // begin inline asm + shf.l.wrap.b32 %r17650, %r17656, %r17655, %r17657; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17654, %r17655, %r17656, %r17657; + // end inline asm + mov.u32 %r17665, 18; + // begin inline asm + shf.l.wrap.b32 %r17658, %r17664, %r17663, %r17665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17662, %r17663, %r17664, %r17665; + // end inline asm + mov.u32 %r17673, 62; + // begin inline asm + shf.l.wrap.b32 %r17666, %r17672, %r17671, %r17673; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17670, %r17671, %r17672, %r17673; + // end inline asm + mov.u32 %r17681, 43; + // begin inline asm + shf.l.wrap.b32 %r17674, %r17680, %r17679, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17678, %r17679, %r17680, %r17681; + // end inline asm + mov.u32 %r17689, 25; + // begin inline asm + shf.l.wrap.b32 %r17682, %r17688, %r17687, %r17689; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17686, %r17687, %r17688, %r17689; + // end inline asm + mov.u32 %r17697, 8; + // begin inline 
asm + shf.l.wrap.b32 %r17690, %r17696, %r17695, %r17697; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17694, %r17695, %r17696, %r17697; + // end inline asm + mov.u32 %r17705, 56; + // begin inline asm + shf.l.wrap.b32 %r17698, %r17704, %r17703, %r17705; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17702, %r17703, %r17704, %r17705; + // end inline asm + mov.u32 %r17713, 41; + // begin inline asm + shf.l.wrap.b32 %r17706, %r17712, %r17711, %r17713; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17710, %r17711, %r17712, %r17713; + // end inline asm + mov.u32 %r17721, 27; + // begin inline asm + shf.l.wrap.b32 %r17714, %r17720, %r17719, %r17721; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17718, %r17719, %r17720, %r17721; + // end inline asm + mov.u32 %r17729, 14; + // begin inline asm + shf.l.wrap.b32 %r17722, %r17728, %r17727, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17726, %r17727, %r17728, %r17729; + // end inline asm + mov.u32 %r17737, 2; + // begin inline asm + shf.l.wrap.b32 %r17730, %r17736, %r17735, %r17737; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17734, %r17735, %r17736, %r17737; + // end inline asm + mov.u32 %r17745, 55; + // begin inline asm + shf.l.wrap.b32 %r17738, %r17744, %r17743, %r17745; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17742, %r17743, %r17744, %r17745; + // end inline asm + mov.u32 %r17753, 45; + // begin inline asm + shf.l.wrap.b32 %r17746, %r17752, %r17751, %r17753; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17750, %r17751, %r17752, %r17753; + // end inline asm + mov.u32 %r17761, 36; + // begin inline asm + shf.l.wrap.b32 %r17754, %r17760, %r17759, %r17761; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17758, %r17759, %r17760, %r17761; + // end inline asm + mov.u32 %r17769, 28; + // begin inline asm + shf.l.wrap.b32 %r17762, %r17768, %r17767, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17766, %r17767, %r17768, %r17769; + // end inline asm + mov.u32 %r17777, 21; + // begin inline asm + shf.l.wrap.b32 %r17770, %r17776, %r17775, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17774, %r17775, %r17776, %r17777; + // end inline asm + mov.u32 %r17785, 15; + // begin inline asm + shf.l.wrap.b32 %r17778, %r17784, %r17783, %r17785; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17782, %r17783, %r17784, %r17785; + // end inline asm + mov.u32 %r17793, 10; + // begin inline asm + shf.l.wrap.b32 %r17786, %r17792, %r17791, %r17793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17790, %r17791, %r17792, %r17793; + // end inline asm + mov.u32 %r17801, 6; + // begin inline asm + shf.l.wrap.b32 %r17794, %r17800, %r17799, %r17801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17798, %r17799, %r17800, %r17801; + // end inline asm + mov.u32 %r17809, 3; + // begin inline asm + shf.l.wrap.b32 %r17802, %r17808, %r17807, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17806, %r17807, %r17808, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17810, %r17816, %r17815, %r17485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17814, %r17815, %r17816, %r17485; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17818, %r17853, %r17626, %r17674, 0xD2; + lop3.b32 %r17819, %r17856, %r17630, %r17678, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30630, 
%r17626, %r17674, %r17770, 0xD2; + lop3.b32 %r30631, %r17630, %r17678, %r17774, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30626, %r17674, %r17770, %r17722, 0xD2; + lop3.b32 %r30627, %r17678, %r17774, %r17726, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30622, %r17770, %r17722, %r17853, 0xD2; + lop3.b32 %r30623, %r17774, %r17726, %r17856, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30620, %r17722, %r17853, %r17626, 0xD2; + lop3.b32 %r30621, %r17726, %r17856, %r17630, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30616, %r17762, %r17634, %r17802, 0xD2; + lop3.b32 %r30617, %r17766, %r17638, %r17806, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30628, %r17634, %r17802, %r17746, 0xD2; + lop3.b32 %r30629, %r17638, %r17806, %r17750, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30624, %r17802, %r17746, %r17642, 0xD2; + lop3.b32 %r30625, %r17806, %r17750, %r17646, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30596, %r17746, %r17642, %r17762, 0xD2; + lop3.b32 %r30597, %r17750, %r17646, %r17766, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r30596, %r30597}; + // begin inline asm + // chi + lop3.b32 %r30588, %r17642, %r17762, %r17634, 0xD2; + lop3.b32 %r30589, %r17646, %r17766, %r17638, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r30588, %r30589}; + // begin inline asm + // chi + lop3.b32 %r30614, %r17810, %r17794, %r17682, 0xD2; + lop3.b32 %r30615, %r17814, %r17798, %r17686, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+104], {%r30614, %r30615}; + // begin inline asm + // chi + lop3.b32 %r30608, %r17794, %r17682, %r17690, 0xD2; + lop3.b32 %r30609, %r17798, %r17686, %r17694, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+112], {%r30608, %r30609}; + // begin inline asm + // chi + lop3.b32 %r30602, %r17682, %r17690, %r17658, 0xD2; + lop3.b32 %r30603, %r17686, %r17694, %r17662, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+120], {%r30602, %r30603}; + // begin inline asm + // chi + lop3.b32 %r30594, %r17690, %r17658, %r17810, 0xD2; + lop3.b32 %r30595, %r17694, %r17662, %r17814, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+128], {%r30594, %r30595}; + // begin inline asm + // chi + lop3.b32 %r30586, %r17658, %r17810, %r17794, 0xD2; + lop3.b32 %r30587, %r17662, %r17814, %r17798, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+136], {%r30586, %r30587}; + // begin inline asm + // chi + lop3.b32 %r30612, %r17714, %r17754, %r17786, 0xD2; + lop3.b32 %r30613, %r17718, %r17758, %r17790, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+144], {%r30612, %r30613}; + // begin inline asm + // chi + lop3.b32 %r30606, %r17754, %r17786, %r17778, 0xD2; + lop3.b32 %r30607, %r17758, %r17790, %r17782, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+152], {%r30606, %r30607}; + // begin inline asm + // chi + lop3.b32 %r30600, %r17786, %r17778, %r17698, 0xD2; + lop3.b32 %r30601, %r17790, %r17782, %r17702, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+160], {%r30600, %r30601}; + // begin inline asm + // chi + lop3.b32 %r30592, %r17778, %r17698, %r17714, 0xD2; + lop3.b32 %r30593, %r17782, %r17702, %r17718, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+168], {%r30592, %r30593}; + // begin inline asm + // chi + lop3.b32 %r30584, %r17698, %r17714, %r17754, 0xD2; + lop3.b32 %r30585, %r17702, %r17718, %r17758, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+176], {%r30584, %r30585}; 
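+	// annotation: chi continues over the remaining state lanes ([%rd176+184..216]);
+	// iota then XORs keccak_round_constants[%r30632] into lane 0 and the loop at
+	// $L__BB2_53 repeats while %r30632 < 23; the round-23 constant (byte offset
+	// 184 in the table) is applied once after the loop exits, as in $L__BB2_51.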
+ // begin inline asm + // chi + lop3.b32 %r30610, %r17666, %r17738, %r17650, 0xD2; + lop3.b32 %r30611, %r17670, %r17742, %r17654, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+184], {%r30610, %r30611}; + // begin inline asm + // chi + lop3.b32 %r30604, %r17738, %r17650, %r17706, 0xD2; + lop3.b32 %r30605, %r17742, %r17654, %r17710, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+192], {%r30604, %r30605}; + // begin inline asm + // chi + lop3.b32 %r30598, %r17650, %r17706, %r17730, 0xD2; + lop3.b32 %r30599, %r17654, %r17710, %r17734, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+200], {%r30598, %r30599}; + // begin inline asm + // chi + lop3.b32 %r30590, %r17706, %r17730, %r17666, 0xD2; + lop3.b32 %r30591, %r17710, %r17734, %r17670, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+208], {%r30590, %r30591}; + // begin inline asm + // chi + lop3.b32 %r30582, %r17730, %r17666, %r17738, 0xD2; + lop3.b32 %r30583, %r17734, %r17670, %r17742, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+216], {%r30582, %r30583}; + mul.wide.s32 %rd810, %r30632, 8; + add.s64 %rd809, %rd789, %rd810; + // begin inline asm + ld.global.nc.v2.u32 {%r18018,%r18019}, [%rd809]; + // end inline asm + xor.b32 %r30618, %r17818, %r18018; + xor.b32 %r30619, %r17819, %r18019; + add.s32 %r30632, %r30632, 1; + setp.lt.u32 %p32, %r30632, 23; + @%p32 bra $L__BB2_53; + + mov.u32 %r30665, 0; + mov.u32 %r18129, 1; + st.local.v2.u32 [%rd176+32], {%r30630, %r30631}; + st.local.v2.u32 [%rd176+72], {%r30628, %r30629}; + st.local.v2.u32 [%rd176+40], {%r30626, %r30627}; + st.local.v2.u32 [%rd176+80], {%r30624, %r30625}; + st.local.v2.u32 [%rd176+48], {%r30622, %r30623}; + st.local.v2.u32 [%rd176+56], {%r30620, %r30621}; + st.local.v2.u32 [%rd176+24], {%r30618, %r30619}; + // begin inline asm + // xor5 + lop3.b32 %r18030, %r30618, %r30616, %r30614, 0x96; + lop3.b32 %r18030, %r18030, %r30612, %r30610, 0x96; + lop3.b32 %r18031, %r30619, %r30617, %r30615, 0x96; + lop3.b32 %r18031, %r18031, %r30613, %r30611, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18042, %r30630, %r30628, %r30608, 0x96; + lop3.b32 %r18042, %r18042, %r30606, %r30604, 0x96; + lop3.b32 %r18043, %r30631, %r30629, %r30609, 0x96; + lop3.b32 %r18043, %r18043, %r30607, %r30605, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18054, %r30626, %r30624, %r30602, 0x96; + lop3.b32 %r18054, %r18054, %r30600, %r30598, 0x96; + lop3.b32 %r18055, %r30627, %r30625, %r30603, 0x96; + lop3.b32 %r18055, %r18055, %r30601, %r30599, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18066, %r30622, %r30596, %r30594, 0x96; + lop3.b32 %r18066, %r18066, %r30592, %r30590, 0x96; + lop3.b32 %r18067, %r30623, %r30597, %r30595, 0x96; + lop3.b32 %r18067, %r18067, %r30593, %r30591, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18078, %r30620, %r30588, %r30586, 0x96; + lop3.b32 %r18078, %r18078, %r30584, %r30582, 0x96; + lop3.b32 %r18079, %r30621, %r30589, %r30587, 0x96; + lop3.b32 %r18079, %r18079, %r30585, %r30583, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18090, %r18043, %r18042, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18094, %r18042, %r18043, %r18129; + // end inline asm + xor.b32 %r18269, %r18090, %r18078; + xor.b32 %r18270, %r18094, %r18079; + xor.b32 %r18237, %r30618, %r18269; + xor.b32 %r18240, %r30619, %r18270; + xor.b32 %r18200, %r30615, %r18270; + xor.b32 %r18199, %r30614, %r18269; + st.local.v2.u32 [%rd176+104], {%r18199, 
%r18200}; + // begin inline asm + shf.l.wrap.b32 %r18098, %r18055, %r18054, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18102, %r18054, %r18055, %r18129; + // end inline asm + xor.b32 %r18271, %r18098, %r18030; + xor.b32 %r18272, %r18102, %r18031; + xor.b32 %r18136, %r30628, %r18271; + xor.b32 %r18135, %r30629, %r18272; + xor.b32 %r18175, %r30607, %r18272; + xor.b32 %r18176, %r30606, %r18271; + st.local.v2.u32 [%rd176+152], {%r18176, %r18175}; + // begin inline asm + shf.l.wrap.b32 %r18106, %r18067, %r18066, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18110, %r18066, %r18067, %r18129; + // end inline asm + xor.b32 %r18273, %r18106, %r18042; + xor.b32 %r18274, %r18110, %r18043; + xor.b32 %r18159, %r30603, %r18274; + xor.b32 %r18160, %r30602, %r18273; + st.local.v2.u32 [%rd176+120], {%r18160, %r18159}; + xor.b32 %r18151, %r30599, %r18274; + xor.b32 %r18152, %r30598, %r18273; + st.local.v2.u32 [%rd176+200], {%r18152, %r18151}; + // begin inline asm + shf.l.wrap.b32 %r18114, %r18079, %r18078, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18118, %r18078, %r18079, %r18129; + // end inline asm + xor.b32 %r18275, %r18114, %r18054; + xor.b32 %r18276, %r18118, %r18055; + xor.b32 %r18183, %r30622, %r18275; + xor.b32 %r18184, %r30623, %r18276; + xor.b32 %r18192, %r30593, %r18276; + xor.b32 %r18191, %r30592, %r18275; + st.local.v2.u32 [%rd176+168], {%r18191, %r18192}; + // begin inline asm + shf.l.wrap.b32 %r18122, %r18031, %r18030, %r18129; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18126, %r18030, %r18031, %r18129; + // end inline asm + xor.b32 %r18277, %r18122, %r18066; + xor.b32 %r18278, %r18126, %r18067; + xor.b32 %r18143, %r30588, %r18277; + xor.b32 %r18144, %r30589, %r18278; + xor.b32 %r18168, %r30583, %r18278; + xor.b32 %r18167, %r30582, %r18277; + st.local.v2.u32 [%rd176+216], {%r18167, %r18168}; + // begin inline asm + shf.l.wrap.b32 %r18130, %r18136, %r18135, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18134, %r18135, %r18136, %r17633; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18138, %r18144, %r18143, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18142, %r18143, %r18144, %r17641; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18150, %r18151, %r18152, %r17649; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18146, %r18152, %r18151, %r17649; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r18146, %r18150}; + // begin inline asm + shf.l.wrap.b32 %r18154, %r18160, %r18159, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18158, %r18159, %r18160, %r17681; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18162, %r18168, %r18167, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18166, %r18167, %r18168, %r17729; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18174, %r18175, %r18176, %r17753; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18170, %r18176, %r18175, %r17753; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r18170, %r18174}; + // begin inline asm + shf.l.wrap.b32 %r18178, %r18184, %r18183, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18182, %r18183, %r18184, %r17769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18186, %r18192, %r18191, %r17777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18190, %r18191, %r18192, %r17777; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r18194, %r18200, %r18199, %r17809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18198, %r18199, %r18200, %r17809; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18202, %r18237, %r18130, %r18154, 0xD2; + lop3.b32 %r18203, %r18240, %r18134, %r18158, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30765, %r18130, %r18154, %r18186, 0xD2; + lop3.b32 %r30766, %r18134, %r18158, %r18190, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+32], {%r30765, %r30766}; + // begin inline asm + // chi + lop3.b32 %r30761, %r18154, %r18186, %r18162, 0xD2; + lop3.b32 %r30762, %r18158, %r18190, %r18166, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+40], {%r30761, %r30762}; + // begin inline asm + // chi + lop3.b32 %r30757, %r18186, %r18162, %r18237, 0xD2; + lop3.b32 %r30758, %r18190, %r18166, %r18240, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+48], {%r30757, %r30758}; + // begin inline asm + // chi + lop3.b32 %r30755, %r18162, %r18237, %r18130, 0xD2; + lop3.b32 %r30756, %r18166, %r18240, %r18134, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+56], {%r30755, %r30756}; + // begin inline asm + // chi + lop3.b32 %r30751, %r18178, %r18138, %r18194, 0xD2; + lop3.b32 %r30752, %r18182, %r18142, %r18198, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+64], {%r30751, %r30752}; + // begin inline asm + // chi + lop3.b32 %r30763, %r18138, %r18194, %r18170, 0xD2; + lop3.b32 %r30764, %r18142, %r18198, %r18174, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+72], {%r30763, %r30764}; + // begin inline asm + // chi + lop3.b32 %r30759, %r18194, %r18170, %r18146, 0xD2; + lop3.b32 %r30760, %r18198, %r18174, %r18150, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+80], {%r30759, %r30760}; + // begin inline asm + ld.global.nc.v2.u32 {%r18266,%r18267}, [%rd790]; + // end inline asm + xor.b32 %r30753, %r18202, %r18266; + xor.b32 %r30754, %r18203, %r18267; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + add.s64 %rd178, %rd2, 24; + add.s64 %rd179, %rd176, 24; + +$L__BB2_55: + cvta.to.global.u64 %rd1269, %rd361; + shl.b32 %r18279, %r30665, 2; + cvt.u64.u32 %rd820, %r18279; + and.b64 %rd821, %rd820, 60; + add.s64 %rd822, %rd178, %rd821; + xor.b32 %r18280, %r1695, %r30665; + mul.lo.s32 %r18281, %r18280, 16777619; + ld.local.u32 %r18282, [%rd822]; + xor.b32 %r18283, %r18281, %r18282; + mul.wide.u32 %rd823, %r18283, -954391867; + shr.u64 %rd824, %rd823, 32; + cvt.u32.u64 %r18284, %rd824; + sub.s32 %r18285, %r18283, %r18284; + shr.u32 %r18286, %r18285, 1; + add.s32 %r18287, %r18286, %r18284; + shr.u32 %r18288, %r18287, 20; + mul.lo.s32 %r18289, %r18288, 1179641; + sub.s32 %r18290, %r18283, %r18289; + mul.wide.u32 %rd825, %r18290, 64; + add.s64 %rd826, %rd1269, %rd825; + mul.lo.s32 %r18291, %r30702, 16777619; + ld.global.u32 %r18292, [%rd826]; + xor.b32 %r30702, %r18291, %r18292; + mul.lo.s32 %r18293, %r30703, 16777619; + ld.global.u32 %r18294, [%rd826+4]; + xor.b32 %r30703, %r18293, %r18294; + mul.lo.s32 %r18295, %r30714, 16777619; + ld.global.u32 %r18296, [%rd826+8]; + mul.lo.s32 %r18297, %r30715, 16777619; + ld.global.u32 %r18298, [%rd826+12]; + xor.b32 %r18299, %r18297, %r18298; + xor.b32 %r30714, %r18295, %r18296; + mov.b64 %rd827, {%r30714, %r18299}; + mul.lo.s32 %r18300, %r30710, 16777619; + ld.global.u32 %r18301, [%rd826+16]; + mul.lo.s32 %r18302, %r30711, 16777619; + ld.global.u32 %r18303, [%rd826+20]; + xor.b32 %r18304, %r18302, %r18303; + xor.b32 %r30710, %r18300, %r18301; + mov.b64 %rd828, {%r30710, %r18304}; + 
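+	// annotation: the $L__BB2_55 body above mixes both sponge states with rows of
+	// the in-memory table for 512 iterations: the fetch index is an FNV-style hash
+	// (32-bit FNV prime 16777619) reduced mod 1179641 via a multiply-high
+	// reciprocal (mul.wide.u32 by -954391867, shift/add, then x - q*1179641);
+	// each selected 64-byte row is folded in word-by-word as
+	// state = (state * 16777619) ^ row[i], and the mov.b64 pairs repack the
+	// mixed 32-bit halves back into 64-bit lanes.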
mul.lo.s32 %r18305, %r30706, 16777619; + ld.global.u32 %r18306, [%rd826+24]; + mul.lo.s32 %r18307, %r30707, 16777619; + ld.global.u32 %r18308, [%rd826+28]; + xor.b32 %r18309, %r18307, %r18308; + xor.b32 %r30706, %r18305, %r18306; + mov.b64 %rd829, {%r30706, %r18309}; + mul.lo.s32 %r18310, %r30704, 16777619; + ld.global.u32 %r18311, [%rd826+32]; + mul.lo.s32 %r18312, %r30705, 16777619; + ld.global.u32 %r18313, [%rd826+36]; + xor.b32 %r18314, %r18312, %r18313; + xor.b32 %r30704, %r18310, %r18311; + mov.b64 %rd830, {%r30704, %r18314}; + mul.lo.s32 %r18315, %r30700, 16777619; + ld.global.u32 %r18316, [%rd826+40]; + xor.b32 %r30700, %r18315, %r18316; + mul.lo.s32 %r18317, %r30701, 16777619; + ld.global.u32 %r18318, [%rd826+44]; + xor.b32 %r30701, %r18317, %r18318; + mul.lo.s32 %r18319, %r30712, 16777619; + ld.global.u32 %r18320, [%rd826+48]; + mul.lo.s32 %r18321, %r30713, 16777619; + ld.global.u32 %r18322, [%rd826+52]; + xor.b32 %r18323, %r18321, %r18322; + xor.b32 %r30712, %r18319, %r18320; + mov.b64 %rd831, {%r30712, %r18323}; + mul.lo.s32 %r18324, %r30708, 16777619; + ld.global.u32 %r18325, [%rd826+56]; + mul.lo.s32 %r18326, %r30709, 16777619; + ld.global.u32 %r18327, [%rd826+60]; + xor.b32 %r18328, %r18326, %r18327; + xor.b32 %r30708, %r18324, %r18325; + mov.b64 %rd832, {%r30708, %r18328}; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + st.local.v2.u32 [%rd2+32], {%r30714, %r18299}; + st.local.v2.u32 [%rd2+40], {%r30710, %r18304}; + st.local.v2.u32 [%rd2+48], {%r30706, %r18309}; + st.local.v2.u32 [%rd2+56], {%r30704, %r18314}; + st.local.v2.u32 [%rd2+64], {%r30700, %r30701}; + st.local.v2.u32 [%rd2+72], {%r30712, %r18323}; + st.local.v2.u32 [%rd2+80], {%r30708, %r18328}; + add.s64 %rd833, %rd179, %rd821; + xor.b32 %r18329, %r1696, %r30665; + mul.lo.s32 %r18330, %r18329, 16777619; + ld.local.u32 %r18331, [%rd833]; + xor.b32 %r18332, %r18330, %r18331; + mul.wide.u32 %rd834, %r18332, -954391867; + shr.u64 %rd835, %rd834, 32; + cvt.u32.u64 %r18333, %rd835; + sub.s32 %r18334, %r18332, %r18333; + shr.u32 %r18335, %r18334, 1; + add.s32 %r18336, %r18335, %r18333; + shr.u32 %r18337, %r18336, 20; + mul.lo.s32 %r18338, %r18337, 1179641; + sub.s32 %r18339, %r18332, %r18338; + mul.wide.u32 %rd836, %r18339, 64; + add.s64 %rd837, %rd1269, %rd836; + mul.lo.s32 %r18340, %r30753, 16777619; + ld.global.u32 %r18341, [%rd837]; + xor.b32 %r30753, %r18340, %r18341; + mul.lo.s32 %r18342, %r30754, 16777619; + ld.global.u32 %r18343, [%rd837+4]; + xor.b32 %r30754, %r18342, %r18343; + mul.lo.s32 %r18344, %r30765, 16777619; + ld.global.u32 %r18345, [%rd837+8]; + mul.lo.s32 %r18346, %r30766, 16777619; + ld.global.u32 %r18347, [%rd837+12]; + xor.b32 %r18348, %r18346, %r18347; + xor.b32 %r30765, %r18344, %r18345; + mov.b64 %rd838, {%r30765, %r18348}; + mul.lo.s32 %r18349, %r30761, 16777619; + ld.global.u32 %r18350, [%rd837+16]; + mul.lo.s32 %r18351, %r30762, 16777619; + ld.global.u32 %r18352, [%rd837+20]; + xor.b32 %r18353, %r18351, %r18352; + xor.b32 %r30761, %r18349, %r18350; + mov.b64 %rd839, {%r30761, %r18353}; + mul.lo.s32 %r18354, %r30757, 16777619; + ld.global.u32 %r18355, [%rd837+24]; + mul.lo.s32 %r18356, %r30758, 16777619; + ld.global.u32 %r18357, [%rd837+28]; + xor.b32 %r18358, %r18356, %r18357; + xor.b32 %r30757, %r18354, %r18355; + mov.b64 %rd840, {%r30757, %r18358}; + mul.lo.s32 %r18359, %r30755, 16777619; + ld.global.u32 %r18360, [%rd837+32]; + mul.lo.s32 %r18361, %r30756, 16777619; + ld.global.u32 %r18362, [%rd837+36]; + xor.b32 %r18363, %r18361, %r18362; + xor.b32 %r30755, %r18359, %r18360; + mov.b64 
%rd841, {%r30755, %r18363}; + mul.lo.s32 %r18364, %r30751, 16777619; + ld.global.u32 %r18365, [%rd837+40]; + xor.b32 %r30751, %r18364, %r18365; + mul.lo.s32 %r18366, %r30752, 16777619; + ld.global.u32 %r18367, [%rd837+44]; + xor.b32 %r30752, %r18366, %r18367; + mul.lo.s32 %r18368, %r30763, 16777619; + ld.global.u32 %r18369, [%rd837+48]; + mul.lo.s32 %r18370, %r30764, 16777619; + ld.global.u32 %r18371, [%rd837+52]; + xor.b32 %r18372, %r18370, %r18371; + xor.b32 %r30763, %r18368, %r18369; + mov.b64 %rd842, {%r30763, %r18372}; + mul.lo.s32 %r18373, %r30759, 16777619; + ld.global.u32 %r18374, [%rd837+56]; + mul.lo.s32 %r18375, %r30760, 16777619; + ld.global.u32 %r18376, [%rd837+60]; + xor.b32 %r18377, %r18375, %r18376; + xor.b32 %r30759, %r18373, %r18374; + mov.b64 %rd843, {%r30759, %r18377}; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + st.local.v2.u32 [%rd176+32], {%r30765, %r18348}; + st.local.v2.u32 [%rd176+40], {%r30761, %r18353}; + st.local.v2.u32 [%rd176+48], {%r30757, %r18358}; + st.local.v2.u32 [%rd176+56], {%r30755, %r18363}; + st.local.v2.u32 [%rd176+64], {%r30751, %r30752}; + st.local.v2.u32 [%rd176+72], {%r30763, %r18372}; + st.local.v2.u32 [%rd176+80], {%r30759, %r18377}; + add.s32 %r30665, %r30665, 1; + setp.lt.u32 %p33, %r30665, 512; + shr.u64 %rd844, %rd827, 32; + cvt.u32.u64 %r30715, %rd844; + shr.u64 %rd845, %rd828, 32; + cvt.u32.u64 %r30711, %rd845; + shr.u64 %rd846, %rd829, 32; + cvt.u32.u64 %r30707, %rd846; + shr.u64 %rd847, %rd830, 32; + cvt.u32.u64 %r30705, %rd847; + shr.u64 %rd848, %rd831, 32; + cvt.u32.u64 %r30713, %rd848; + shr.u64 %rd849, %rd832, 32; + cvt.u32.u64 %r30709, %rd849; + shr.u64 %rd850, %rd838, 32; + cvt.u32.u64 %r30766, %rd850; + shr.u64 %rd851, %rd839, 32; + cvt.u32.u64 %r30762, %rd851; + shr.u64 %rd852, %rd840, 32; + cvt.u32.u64 %r30758, %rd852; + shr.u64 %rd853, %rd841, 32; + cvt.u32.u64 %r30756, %rd853; + shr.u64 %rd854, %rd842, 32; + cvt.u32.u64 %r30764, %rd854; + shr.u64 %rd855, %rd843, 32; + cvt.u32.u64 %r30760, %rd855; + @%p33 bra $L__BB2_55; + + mov.u32 %r30666, 0; + st.local.v2.u32 [%rd2+96], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+104], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+112], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+120], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+128], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+136], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+144], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+152], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+160], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+168], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+176], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+184], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+192], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+200], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+208], {%r30666, %r30666}; + st.local.v2.u32 [%rd2+216], {%r30666, %r30666}; + mov.u32 %r30681, -2147483648; + mov.u32 %r18392, 1; + st.local.v2.u32 [%rd2+88], {%r18392, %r30681}; + mov.u32 %r30667, %r30666; + mov.u32 %r30668, %r30666; + mov.u32 %r30669, %r30666; + mov.u32 %r30670, %r30666; + mov.u32 %r30671, %r30666; + mov.u32 %r30672, %r30666; + mov.u32 %r30673, %r30666; + mov.u32 %r30674, %r30666; + mov.u32 %r30675, %r30666; + mov.u32 %r30676, %r30666; + mov.u32 %r30677, %r30666; + mov.u32 %r30678, %r30666; + mov.u32 %r30679, %r30666; + mov.u32 %r30680, %r18392; + mov.u32 %r30682, %r30666; + mov.u32 %r30683, %r30666; + mov.u32 %r30684, %r30666; + mov.u32 %r30685, %r30666; + mov.u32 %r30686, %r30666; + mov.u32 %r30687, %r30666; + mov.u32 %r30688, %r30666; + mov.u32 
%r30689, %r30666; + mov.u32 %r30690, %r30666; + mov.u32 %r30691, %r30666; + mov.u32 %r30692, %r30666; + mov.u32 %r30693, %r30666; + mov.u32 %r30694, %r30666; + mov.u32 %r30695, %r30666; + mov.u32 %r30696, %r30666; + mov.u32 %r30697, %r30666; + mov.u32 %r30698, %r30666; + mov.u32 %r30699, %r30666; + mov.u32 %r30716, %r30666; + +$L__BB2_57: + // begin inline asm + // xor5 + lop3.b32 %r18419, %r30702, %r30700, %r30698, 0x96; + lop3.b32 %r18419, %r18419, %r30696, %r30694, 0x96; + lop3.b32 %r18420, %r30703, %r30701, %r30699, 0x96; + lop3.b32 %r18420, %r18420, %r30697, %r30695, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18431, %r30714, %r30712, %r30692, 0x96; + lop3.b32 %r18431, %r18431, %r30690, %r30688, 0x96; + lop3.b32 %r18432, %r30715, %r30713, %r30693, 0x96; + lop3.b32 %r18432, %r18432, %r30691, %r30689, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18443, %r30710, %r30708, %r30686, 0x96; + lop3.b32 %r18443, %r18443, %r30684, %r30682, 0x96; + lop3.b32 %r18444, %r30711, %r30709, %r30687, 0x96; + lop3.b32 %r18444, %r18444, %r30685, %r30683, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18455, %r30706, %r30680, %r30678, 0x96; + lop3.b32 %r18455, %r18455, %r30676, %r30674, 0x96; + lop3.b32 %r18456, %r30707, %r30681, %r30679, 0x96; + lop3.b32 %r18456, %r18456, %r30677, %r30675, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18467, %r30704, %r30672, %r30670, 0x96; + lop3.b32 %r18467, %r18467, %r30668, %r30666, 0x96; + lop3.b32 %r18468, %r30705, %r30673, %r30671, 0x96; + lop3.b32 %r18468, %r18468, %r30669, %r30667, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18479, %r18432, %r18431, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18483, %r18431, %r18432, %r18392; + // end inline asm + xor.b32 %r18913, %r18479, %r18467; + xor.b32 %r18914, %r18483, %r18468; + xor.b32 %r18746, %r30702, %r18913; + xor.b32 %r18749, %r30703, %r18914; + xor.b32 %r18653, %r30700, %r18913; + xor.b32 %r18652, %r30701, %r18914; + xor.b32 %r18700, %r30698, %r18913; + xor.b32 %r18701, %r30699, %r18914; + xor.b32 %r18605, %r30696, %r18913; + xor.b32 %r18604, %r30697, %r18914; + xor.b32 %r18556, %r30694, %r18913; + xor.b32 %r18557, %r30695, %r18914; + // begin inline asm + shf.l.wrap.b32 %r18487, %r18444, %r18443, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18491, %r18443, %r18444, %r18392; + // end inline asm + xor.b32 %r18915, %r18487, %r18419; + xor.b32 %r18916, %r18491, %r18420; + xor.b32 %r18708, %r30714, %r18915; + xor.b32 %r18709, %r30715, %r18916; + xor.b32 %r18525, %r30712, %r18915; + xor.b32 %r18524, %r30713, %r18916; + xor.b32 %r18684, %r30692, %r18915; + xor.b32 %r18685, %r30693, %r18916; + xor.b32 %r18645, %r30690, %r18915; + xor.b32 %r18644, %r30691, %r18916; + xor.b32 %r18628, %r30688, %r18915; + xor.b32 %r18629, %r30689, %r18916; + // begin inline asm + shf.l.wrap.b32 %r18495, %r18456, %r18455, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18499, %r18455, %r18456, %r18392; + // end inline asm + xor.b32 %r18917, %r18495, %r18431; + xor.b32 %r18918, %r18499, %r18432; + xor.b32 %r18565, %r30710, %r18917; + xor.b32 %r18564, %r30711, %r18918; + xor.b32 %r18692, %r30708, %r18917; + xor.b32 %r18693, %r30709, %r18918; + xor.b32 %r18573, %r30686, %r18917; + xor.b32 %r18572, %r30687, %r18918; + xor.b32 %r18676, %r30684, %r18917; + xor.b32 %r18677, %r30685, %r18918; + xor.b32 %r18541, %r30682, %r18917; + xor.b32 %r18540, %r30683, 
%r18918; + // begin inline asm + shf.l.wrap.b32 %r18503, %r18468, %r18467, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18507, %r18467, %r18468, %r18392; + // end inline asm + xor.b32 %r18919, %r18503, %r18443; + xor.b32 %r18920, %r18507, %r18444; + xor.b32 %r18660, %r30706, %r18919; + xor.b32 %r18661, %r30707, %r18920; + xor.b32 %r18637, %r30680, %r18919; + xor.b32 %r18636, %r30681, %r18920; + xor.b32 %r18580, %r30678, %r18919; + xor.b32 %r18581, %r30679, %r18920; + xor.b32 %r18668, %r30676, %r18919; + xor.b32 %r18669, %r30677, %r18920; + xor.b32 %r18597, %r30674, %r18919; + xor.b32 %r18596, %r30675, %r18920; + // begin inline asm + shf.l.wrap.b32 %r18511, %r18420, %r18419, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18515, %r18419, %r18420, %r18392; + // end inline asm + xor.b32 %r18921, %r18511, %r18455; + xor.b32 %r18922, %r18515, %r18456; + xor.b32 %r18612, %r30704, %r18921; + xor.b32 %r18613, %r30705, %r18922; + xor.b32 %r18532, %r30672, %r18921; + xor.b32 %r18533, %r30673, %r18922; + xor.b32 %r18549, %r30670, %r18921; + xor.b32 %r18548, %r30671, %r18922; + xor.b32 %r18588, %r30668, %r18921; + xor.b32 %r18589, %r30669, %r18922; + xor.b32 %r18620, %r30666, %r18921; + xor.b32 %r18621, %r30667, %r18922; + mov.u32 %r18526, 44; + // begin inline asm + shf.l.wrap.b32 %r18519, %r18525, %r18524, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18523, %r18524, %r18525, %r18526; + // end inline asm + mov.u32 %r18534, 20; + // begin inline asm + shf.l.wrap.b32 %r18527, %r18533, %r18532, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18531, %r18532, %r18533, %r18534; + // end inline asm + mov.u32 %r18542, 61; + // begin inline asm + shf.l.wrap.b32 %r18535, %r18541, %r18540, %r18542; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18539, %r18540, %r18541, %r18542; + // end inline asm + mov.u32 %r18550, 39; + // begin inline asm + shf.l.wrap.b32 %r18543, %r18549, %r18548, %r18550; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18547, %r18548, %r18549, %r18550; + // end inline asm + mov.u32 %r18558, 18; + // begin inline asm + shf.l.wrap.b32 %r18551, %r18557, %r18556, %r18558; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18555, %r18556, %r18557, %r18558; + // end inline asm + mov.u32 %r18566, 62; + // begin inline asm + shf.l.wrap.b32 %r18559, %r18565, %r18564, %r18566; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18563, %r18564, %r18565, %r18566; + // end inline asm + mov.u32 %r18574, 43; + // begin inline asm + shf.l.wrap.b32 %r18567, %r18573, %r18572, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18571, %r18572, %r18573, %r18574; + // end inline asm + mov.u32 %r18582, 25; + // begin inline asm + shf.l.wrap.b32 %r18575, %r18581, %r18580, %r18582; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18579, %r18580, %r18581, %r18582; + // end inline asm + mov.u32 %r18590, 8; + // begin inline asm + shf.l.wrap.b32 %r18583, %r18589, %r18588, %r18590; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18587, %r18588, %r18589, %r18590; + // end inline asm + mov.u32 %r18598, 56; + // begin inline asm + shf.l.wrap.b32 %r18591, %r18597, %r18596, %r18598; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18595, %r18596, %r18597, %r18598; + // end inline asm + mov.u32 %r18606, 41; + // begin inline asm + shf.l.wrap.b32 %r18599, %r18605, %r18604, %r18606; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r18603, %r18604, %r18605, %r18606; + // end inline asm + mov.u32 %r18614, 27; + // begin inline asm + shf.l.wrap.b32 %r18607, %r18613, %r18612, %r18614; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18611, %r18612, %r18613, %r18614; + // end inline asm + mov.u32 %r18622, 14; + // begin inline asm + shf.l.wrap.b32 %r18615, %r18621, %r18620, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18619, %r18620, %r18621, %r18622; + // end inline asm + mov.u32 %r18630, 2; + // begin inline asm + shf.l.wrap.b32 %r18623, %r18629, %r18628, %r18630; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18627, %r18628, %r18629, %r18630; + // end inline asm + mov.u32 %r18638, 55; + // begin inline asm + shf.l.wrap.b32 %r18631, %r18637, %r18636, %r18638; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18635, %r18636, %r18637, %r18638; + // end inline asm + mov.u32 %r18646, 45; + // begin inline asm + shf.l.wrap.b32 %r18639, %r18645, %r18644, %r18646; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18643, %r18644, %r18645, %r18646; + // end inline asm + mov.u32 %r18654, 36; + // begin inline asm + shf.l.wrap.b32 %r18647, %r18653, %r18652, %r18654; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18651, %r18652, %r18653, %r18654; + // end inline asm + mov.u32 %r18662, 28; + // begin inline asm + shf.l.wrap.b32 %r18655, %r18661, %r18660, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18659, %r18660, %r18661, %r18662; + // end inline asm + mov.u32 %r18670, 21; + // begin inline asm + shf.l.wrap.b32 %r18663, %r18669, %r18668, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18667, %r18668, %r18669, %r18670; + // end inline asm + mov.u32 %r18678, 15; + // begin inline asm + shf.l.wrap.b32 %r18671, %r18677, %r18676, %r18678; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18675, %r18676, %r18677, %r18678; + // end inline asm + mov.u32 %r18686, 10; + // begin inline asm + shf.l.wrap.b32 %r18679, %r18685, %r18684, %r18686; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18683, %r18684, %r18685, %r18686; + // end inline asm + mov.u32 %r18694, 6; + // begin inline asm + shf.l.wrap.b32 %r18687, %r18693, %r18692, %r18694; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18691, %r18692, %r18693, %r18694; + // end inline asm + mov.u32 %r18702, 3; + // begin inline asm + shf.l.wrap.b32 %r18695, %r18701, %r18700, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18699, %r18700, %r18701, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18703, %r18709, %r18708, %r18392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18707, %r18708, %r18709, %r18392; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18711, %r18746, %r18519, %r18567, 0xD2; + lop3.b32 %r18712, %r18749, %r18523, %r18571, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30714, %r18519, %r18567, %r18663, 0xD2; + lop3.b32 %r30715, %r18523, %r18571, %r18667, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30710, %r18567, %r18663, %r18615, 0xD2; + lop3.b32 %r30711, %r18571, %r18667, %r18619, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30706, %r18663, %r18615, %r18746, 0xD2; + lop3.b32 %r30707, %r18667, %r18619, %r18749, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30704, %r18615, %r18746, %r18519, 0xD2; + lop3.b32 %r30705, %r18619, 
%r18749, %r18523, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30700, %r18655, %r18527, %r18695, 0xD2; + lop3.b32 %r30701, %r18659, %r18531, %r18699, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30712, %r18527, %r18695, %r18639, 0xD2; + lop3.b32 %r30713, %r18531, %r18699, %r18643, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30708, %r18695, %r18639, %r18535, 0xD2; + lop3.b32 %r30709, %r18699, %r18643, %r18539, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30680, %r18639, %r18535, %r18655, 0xD2; + lop3.b32 %r30681, %r18643, %r18539, %r18659, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30680, %r30681}; + // begin inline asm + // chi + lop3.b32 %r30672, %r18535, %r18655, %r18527, 0xD2; + lop3.b32 %r30673, %r18539, %r18659, %r18531, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30672, %r30673}; + // begin inline asm + // chi + lop3.b32 %r30698, %r18703, %r18687, %r18575, 0xD2; + lop3.b32 %r30699, %r18707, %r18691, %r18579, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30698, %r30699}; + // begin inline asm + // chi + lop3.b32 %r30692, %r18687, %r18575, %r18583, 0xD2; + lop3.b32 %r30693, %r18691, %r18579, %r18587, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30692, %r30693}; + // begin inline asm + // chi + lop3.b32 %r30686, %r18575, %r18583, %r18551, 0xD2; + lop3.b32 %r30687, %r18579, %r18587, %r18555, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30686, %r30687}; + // begin inline asm + // chi + lop3.b32 %r30678, %r18583, %r18551, %r18703, 0xD2; + lop3.b32 %r30679, %r18587, %r18555, %r18707, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30678, %r30679}; + // begin inline asm + // chi + lop3.b32 %r30670, %r18551, %r18703, %r18687, 0xD2; + lop3.b32 %r30671, %r18555, %r18707, %r18691, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30670, %r30671}; + // begin inline asm + // chi + lop3.b32 %r30696, %r18607, %r18647, %r18679, 0xD2; + lop3.b32 %r30697, %r18611, %r18651, %r18683, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30696, %r30697}; + // begin inline asm + // chi + lop3.b32 %r30690, %r18647, %r18679, %r18671, 0xD2; + lop3.b32 %r30691, %r18651, %r18683, %r18675, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30690, %r30691}; + // begin inline asm + // chi + lop3.b32 %r30684, %r18679, %r18671, %r18591, 0xD2; + lop3.b32 %r30685, %r18683, %r18675, %r18595, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30684, %r30685}; + // begin inline asm + // chi + lop3.b32 %r30676, %r18671, %r18591, %r18607, 0xD2; + lop3.b32 %r30677, %r18675, %r18595, %r18611, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30676, %r30677}; + // begin inline asm + // chi + lop3.b32 %r30668, %r18591, %r18607, %r18647, 0xD2; + lop3.b32 %r30669, %r18595, %r18611, %r18651, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30668, %r30669}; + // begin inline asm + // chi + lop3.b32 %r30694, %r18559, %r18631, %r18543, 0xD2; + lop3.b32 %r30695, %r18563, %r18635, %r18547, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30694, %r30695}; + // begin inline asm + // chi + lop3.b32 %r30688, %r18631, %r18543, %r18599, 0xD2; + lop3.b32 %r30689, %r18635, %r18547, %r18603, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30688, %r30689}; + // begin inline asm + // chi + lop3.b32 %r30682, %r18543, %r18599, %r18623, 0xD2; + lop3.b32 %r30683, %r18547, %r18603, %r18627, 0xD2; + // 
end inline asm + st.local.v2.u32 [%rd2+200], {%r30682, %r30683}; + // begin inline asm + // chi + lop3.b32 %r30674, %r18599, %r18623, %r18559, 0xD2; + lop3.b32 %r30675, %r18603, %r18627, %r18563, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30674, %r30675}; + // begin inline asm + // chi + lop3.b32 %r30666, %r18623, %r18559, %r18631, 0xD2; + lop3.b32 %r30667, %r18627, %r18563, %r18635, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30666, %r30667}; + mul.wide.s32 %rd857, %r30716, 8; + add.s64 %rd856, %rd789, %rd857; + // begin inline asm + ld.global.nc.v2.u32 {%r18911,%r18912}, [%rd856]; + // end inline asm + xor.b32 %r30702, %r18711, %r18911; + xor.b32 %r30703, %r18712, %r18912; + add.s32 %r30716, %r30716, 1; + setp.lt.u32 %p34, %r30716, 23; + @%p34 bra $L__BB2_57; + + st.local.v2.u32 [%rd2+32], {%r30714, %r30715}; + st.local.v2.u32 [%rd2+72], {%r30712, %r30713}; + st.local.v2.u32 [%rd2+40], {%r30710, %r30711}; + st.local.v2.u32 [%rd2+80], {%r30708, %r30709}; + st.local.v2.u32 [%rd2+48], {%r30706, %r30707}; + st.local.v2.u32 [%rd2+56], {%r30704, %r30705}; + st.local.v2.u32 [%rd2+24], {%r30702, %r30703}; + // begin inline asm + // xor5 + lop3.b32 %r18923, %r30702, %r30700, %r30698, 0x96; + lop3.b32 %r18923, %r18923, %r30696, %r30694, 0x96; + lop3.b32 %r18924, %r30703, %r30701, %r30699, 0x96; + lop3.b32 %r18924, %r18924, %r30697, %r30695, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18935, %r30714, %r30712, %r30692, 0x96; + lop3.b32 %r18935, %r18935, %r30690, %r30688, 0x96; + lop3.b32 %r18936, %r30715, %r30713, %r30693, 0x96; + lop3.b32 %r18936, %r18936, %r30691, %r30689, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18947, %r30710, %r30708, %r30686, 0x96; + lop3.b32 %r18947, %r18947, %r30684, %r30682, 0x96; + lop3.b32 %r18948, %r30711, %r30709, %r30687, 0x96; + lop3.b32 %r18948, %r18948, %r30685, %r30683, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18959, %r30706, %r30680, %r30678, 0x96; + lop3.b32 %r18959, %r18959, %r30676, %r30674, 0x96; + lop3.b32 %r18960, %r30707, %r30681, %r30679, 0x96; + lop3.b32 %r18960, %r18960, %r30677, %r30675, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18971, %r30704, %r30672, %r30670, 0x96; + lop3.b32 %r18971, %r18971, %r30668, %r30666, 0x96; + lop3.b32 %r18972, %r30705, %r30673, %r30671, 0x96; + lop3.b32 %r18972, %r18972, %r30669, %r30667, 0x96; + // end inline asm + mov.u32 %r19175, 1; + // begin inline asm + shf.l.wrap.b32 %r18983, %r18936, %r18935, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18987, %r18935, %r18936, %r19175; + // end inline asm + xor.b32 %r19202, %r18983, %r18971; + xor.b32 %r19203, %r18987, %r18972; + xor.b32 %r19130, %r30702, %r19202; + xor.b32 %r19133, %r30703, %r19203; + xor.b32 %r19093, %r30699, %r19203; + xor.b32 %r19092, %r30698, %r19202; + st.local.v2.u32 [%rd2+104], {%r19092, %r19093}; + // begin inline asm + shf.l.wrap.b32 %r18991, %r18948, %r18947, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18995, %r18947, %r18948, %r19175; + // end inline asm + xor.b32 %r19204, %r18991, %r18923; + xor.b32 %r19205, %r18995, %r18924; + xor.b32 %r19029, %r30712, %r19204; + xor.b32 %r19028, %r30713, %r19205; + xor.b32 %r19068, %r30691, %r19205; + xor.b32 %r19069, %r30690, %r19204; + st.local.v2.u32 [%rd2+152], {%r19069, %r19068}; + // begin inline asm + shf.l.wrap.b32 %r18999, %r18960, %r18959, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r19003, %r18959, %r18960, %r19175; + // end inline asm + xor.b32 %r19206, %r18999, %r18935; + xor.b32 %r19207, %r19003, %r18936; + xor.b32 %r19052, %r30687, %r19207; + xor.b32 %r19053, %r30686, %r19206; + st.local.v2.u32 [%rd2+120], {%r19053, %r19052}; + xor.b32 %r19044, %r30683, %r19207; + xor.b32 %r19045, %r30682, %r19206; + st.local.v2.u32 [%rd2+200], {%r19045, %r19044}; + // begin inline asm + shf.l.wrap.b32 %r19007, %r18972, %r18971, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19011, %r18971, %r18972, %r19175; + // end inline asm + xor.b32 %r19208, %r19007, %r18947; + xor.b32 %r19209, %r19011, %r18948; + xor.b32 %r19076, %r30706, %r19208; + xor.b32 %r19077, %r30707, %r19209; + xor.b32 %r19085, %r30677, %r19209; + xor.b32 %r19084, %r30676, %r19208; + st.local.v2.u32 [%rd2+168], {%r19084, %r19085}; + // begin inline asm + shf.l.wrap.b32 %r19015, %r18924, %r18923, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19019, %r18923, %r18924, %r19175; + // end inline asm + xor.b32 %r19210, %r19015, %r18959; + xor.b32 %r19211, %r19019, %r18960; + xor.b32 %r19036, %r30672, %r19210; + xor.b32 %r19037, %r30673, %r19211; + xor.b32 %r19061, %r30667, %r19211; + xor.b32 %r19060, %r30666, %r19210; + st.local.v2.u32 [%rd2+216], {%r19060, %r19061}; + // begin inline asm + shf.l.wrap.b32 %r19023, %r19029, %r19028, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19027, %r19028, %r19029, %r18526; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19031, %r19037, %r19036, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19035, %r19036, %r19037, %r18534; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19043, %r19044, %r19045, %r18542; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19039, %r19045, %r19044, %r18542; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r19039, %r19043}; + // begin inline asm + shf.l.wrap.b32 %r19047, %r19053, %r19052, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19051, %r19052, %r19053, %r18574; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19055, %r19061, %r19060, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19059, %r19060, %r19061, %r18622; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19067, %r19068, %r19069, %r18646; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19063, %r19069, %r19068, %r18646; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r19063, %r19067}; + // begin inline asm + shf.l.wrap.b32 %r19071, %r19077, %r19076, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19075, %r19076, %r19077, %r18662; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19079, %r19085, %r19084, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19083, %r19084, %r19085, %r18670; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19087, %r19093, %r19092, %r18702; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19091, %r19092, %r19093, %r18702; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19095, %r19130, %r19023, %r19047, 0xD2; + lop3.b32 %r19096, %r19133, %r19027, %r19051, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19103, %r19023, %r19047, %r19079, 0xD2; + lop3.b32 %r19104, %r19027, %r19051, %r19083, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r19103, %r19104}; + // begin inline asm + // chi + lop3.b32 %r19111, %r19047, %r19079, %r19055, 
0xD2; + lop3.b32 %r19112, %r19051, %r19083, %r19059, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r19111, %r19112}; + // begin inline asm + // chi + lop3.b32 %r19119, %r19079, %r19055, %r19130, 0xD2; + lop3.b32 %r19120, %r19083, %r19059, %r19133, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r19119, %r19120}; + // begin inline asm + // chi + lop3.b32 %r19127, %r19055, %r19130, %r19023, 0xD2; + lop3.b32 %r19128, %r19059, %r19133, %r19027, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r19127, %r19128}; + // begin inline asm + // chi + lop3.b32 %r19135, %r19071, %r19031, %r19087, 0xD2; + lop3.b32 %r19136, %r19075, %r19035, %r19091, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r19135, %r19136}; + // begin inline asm + // chi + lop3.b32 %r19143, %r19031, %r19087, %r19063, 0xD2; + lop3.b32 %r19144, %r19035, %r19091, %r19067, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r19143, %r19144}; + // begin inline asm + // chi + lop3.b32 %r19151, %r19087, %r19063, %r19039, 0xD2; + lop3.b32 %r19152, %r19091, %r19067, %r19043, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r19151, %r19152}; + // begin inline asm + ld.global.nc.v2.u32 {%r19159,%r19160}, [%rd790]; + // end inline asm + xor.b32 %r19212, %r19096, %r19160; + xor.b32 %r19213, %r19095, %r19159; + mov.b64 %rd1333, {%r19213, %r19212}; + mov.b64 %rd1334, {%r19103, %r19104}; + mov.b64 %rd1335, {%r19111, %r19112}; + mov.b64 %rd1336, {%r19119, %r19120}; + mov.b64 %rd1337, {%r19127, %r19128}; + mov.b64 %rd1338, {%r19135, %r19136}; + mov.b64 %rd1339, {%r19143, %r19144}; + mov.b64 %rd1340, {%r19151, %r19152}; + mov.u32 %r30717, 0; + st.local.v2.u32 [%rd2+24], {%r19213, %r19212}; + st.local.v2.u32 [%rd176+96], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+104], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+112], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+120], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+128], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+136], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+144], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+152], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+160], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+168], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+176], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+184], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+192], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+200], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+208], {%r30717, %r30717}; + st.local.v2.u32 [%rd176+216], {%r30717, %r30717}; + mov.u32 %r30732, -2147483648; + st.local.v2.u32 [%rd176+88], {%r19175, %r30732}; + mov.u32 %r30718, %r30717; + mov.u32 %r30719, %r30717; + mov.u32 %r30720, %r30717; + mov.u32 %r30721, %r30717; + mov.u32 %r30722, %r30717; + mov.u32 %r30723, %r30717; + mov.u32 %r30724, %r30717; + mov.u32 %r30725, %r30717; + mov.u32 %r30726, %r30717; + mov.u32 %r30727, %r30717; + mov.u32 %r30728, %r30717; + mov.u32 %r30729, %r30717; + mov.u32 %r30730, %r30717; + mov.u32 %r30731, %r19175; + mov.u32 %r30733, %r30717; + mov.u32 %r30734, %r30717; + mov.u32 %r30735, %r30717; + mov.u32 %r30736, %r30717; + mov.u32 %r30737, %r30717; + mov.u32 %r30738, %r30717; + mov.u32 %r30739, %r30717; + mov.u32 %r30740, %r30717; + mov.u32 %r30741, %r30717; + mov.u32 %r30742, %r30717; + mov.u32 %r30743, %r30717; + mov.u32 %r30744, %r30717; + mov.u32 %r30745, %r30717; + mov.u32 %r30746, %r30717; + mov.u32 %r30747, %r30717; + mov.u32 %r30748, %r30717; + mov.u32 %r30749, %r30717; + mov.u32 %r30750, %r30717; + mov.u32 %r30767, %r30717; + 
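+ // Keccak-f[1600] round loop over the state at [%rd176]: %r30767 counts
+ // rounds 0..22, and the 24th round is unrolled after the loop exit (its
+ // iota constant is read from [%rd790], presumably the last entry of the
+ // keccak_round_constants table named in $L__BB2_40). The lop3.b32
+ // immediates encode the step functions: 0x96 is a^b^c (the "xor5" theta
+ // parity) and 0xD2 is a^(~b&c) (chi); paired shf.l.wrap.b32 funnel shifts
+ // rotate each 64-bit lane held as two 32-bit registers.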
+$L__BB2_59: + // begin inline asm + // xor5 + lop3.b32 %r19214, %r30753, %r30751, %r30749, 0x96; + lop3.b32 %r19214, %r19214, %r30747, %r30745, 0x96; + lop3.b32 %r19215, %r30754, %r30752, %r30750, 0x96; + lop3.b32 %r19215, %r19215, %r30748, %r30746, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19226, %r30765, %r30763, %r30743, 0x96; + lop3.b32 %r19226, %r19226, %r30741, %r30739, 0x96; + lop3.b32 %r19227, %r30766, %r30764, %r30744, 0x96; + lop3.b32 %r19227, %r19227, %r30742, %r30740, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19238, %r30761, %r30759, %r30737, 0x96; + lop3.b32 %r19238, %r19238, %r30735, %r30733, 0x96; + lop3.b32 %r19239, %r30762, %r30760, %r30738, 0x96; + lop3.b32 %r19239, %r19239, %r30736, %r30734, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19250, %r30757, %r30731, %r30729, 0x96; + lop3.b32 %r19250, %r19250, %r30727, %r30725, 0x96; + lop3.b32 %r19251, %r30758, %r30732, %r30730, 0x96; + lop3.b32 %r19251, %r19251, %r30728, %r30726, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19262, %r30755, %r30723, %r30721, 0x96; + lop3.b32 %r19262, %r19262, %r30719, %r30717, 0x96; + lop3.b32 %r19263, %r30756, %r30724, %r30722, 0x96; + lop3.b32 %r19263, %r19263, %r30720, %r30718, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19274, %r19227, %r19226, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19278, %r19226, %r19227, %r19175; + // end inline asm + xor.b32 %r19708, %r19274, %r19262; + xor.b32 %r19709, %r19278, %r19263; + xor.b32 %r19541, %r30753, %r19708; + xor.b32 %r19544, %r30754, %r19709; + xor.b32 %r19448, %r30751, %r19708; + xor.b32 %r19447, %r30752, %r19709; + xor.b32 %r19495, %r30749, %r19708; + xor.b32 %r19496, %r30750, %r19709; + xor.b32 %r19400, %r30747, %r19708; + xor.b32 %r19399, %r30748, %r19709; + xor.b32 %r19351, %r30745, %r19708; + xor.b32 %r19352, %r30746, %r19709; + // begin inline asm + shf.l.wrap.b32 %r19282, %r19239, %r19238, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19286, %r19238, %r19239, %r19175; + // end inline asm + xor.b32 %r19710, %r19282, %r19214; + xor.b32 %r19711, %r19286, %r19215; + xor.b32 %r19503, %r30765, %r19710; + xor.b32 %r19504, %r30766, %r19711; + xor.b32 %r19320, %r30763, %r19710; + xor.b32 %r19319, %r30764, %r19711; + xor.b32 %r19479, %r30743, %r19710; + xor.b32 %r19480, %r30744, %r19711; + xor.b32 %r19440, %r30741, %r19710; + xor.b32 %r19439, %r30742, %r19711; + xor.b32 %r19423, %r30739, %r19710; + xor.b32 %r19424, %r30740, %r19711; + // begin inline asm + shf.l.wrap.b32 %r19290, %r19251, %r19250, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19294, %r19250, %r19251, %r19175; + // end inline asm + xor.b32 %r19712, %r19290, %r19226; + xor.b32 %r19713, %r19294, %r19227; + xor.b32 %r19360, %r30761, %r19712; + xor.b32 %r19359, %r30762, %r19713; + xor.b32 %r19487, %r30759, %r19712; + xor.b32 %r19488, %r30760, %r19713; + xor.b32 %r19368, %r30737, %r19712; + xor.b32 %r19367, %r30738, %r19713; + xor.b32 %r19471, %r30735, %r19712; + xor.b32 %r19472, %r30736, %r19713; + xor.b32 %r19336, %r30733, %r19712; + xor.b32 %r19335, %r30734, %r19713; + // begin inline asm + shf.l.wrap.b32 %r19298, %r19263, %r19262, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19302, %r19262, %r19263, %r19175; + // end inline asm + xor.b32 %r19714, %r19298, %r19238; + xor.b32 %r19715, %r19302, %r19239; + xor.b32 %r19455, %r30757, %r19714; + xor.b32 %r19456, 
%r30758, %r19715; + xor.b32 %r19432, %r30731, %r19714; + xor.b32 %r19431, %r30732, %r19715; + xor.b32 %r19375, %r30729, %r19714; + xor.b32 %r19376, %r30730, %r19715; + xor.b32 %r19463, %r30727, %r19714; + xor.b32 %r19464, %r30728, %r19715; + xor.b32 %r19392, %r30725, %r19714; + xor.b32 %r19391, %r30726, %r19715; + // begin inline asm + shf.l.wrap.b32 %r19306, %r19215, %r19214, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19310, %r19214, %r19215, %r19175; + // end inline asm + xor.b32 %r19716, %r19306, %r19250; + xor.b32 %r19717, %r19310, %r19251; + xor.b32 %r19407, %r30755, %r19716; + xor.b32 %r19408, %r30756, %r19717; + xor.b32 %r19327, %r30723, %r19716; + xor.b32 %r19328, %r30724, %r19717; + xor.b32 %r19344, %r30721, %r19716; + xor.b32 %r19343, %r30722, %r19717; + xor.b32 %r19383, %r30719, %r19716; + xor.b32 %r19384, %r30720, %r19717; + xor.b32 %r19415, %r30717, %r19716; + xor.b32 %r19416, %r30718, %r19717; + mov.u32 %r19321, 44; + // begin inline asm + shf.l.wrap.b32 %r19314, %r19320, %r19319, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19318, %r19319, %r19320, %r19321; + // end inline asm + mov.u32 %r19329, 20; + // begin inline asm + shf.l.wrap.b32 %r19322, %r19328, %r19327, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19326, %r19327, %r19328, %r19329; + // end inline asm + mov.u32 %r19337, 61; + // begin inline asm + shf.l.wrap.b32 %r19330, %r19336, %r19335, %r19337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19334, %r19335, %r19336, %r19337; + // end inline asm + mov.u32 %r19345, 39; + // begin inline asm + shf.l.wrap.b32 %r19338, %r19344, %r19343, %r19345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19342, %r19343, %r19344, %r19345; + // end inline asm + mov.u32 %r19353, 18; + // begin inline asm + shf.l.wrap.b32 %r19346, %r19352, %r19351, %r19353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19350, %r19351, %r19352, %r19353; + // end inline asm + mov.u32 %r19361, 62; + // begin inline asm + shf.l.wrap.b32 %r19354, %r19360, %r19359, %r19361; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19358, %r19359, %r19360, %r19361; + // end inline asm + mov.u32 %r19369, 43; + // begin inline asm + shf.l.wrap.b32 %r19362, %r19368, %r19367, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19366, %r19367, %r19368, %r19369; + // end inline asm + mov.u32 %r19377, 25; + // begin inline asm + shf.l.wrap.b32 %r19370, %r19376, %r19375, %r19377; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19374, %r19375, %r19376, %r19377; + // end inline asm + mov.u32 %r19385, 8; + // begin inline asm + shf.l.wrap.b32 %r19378, %r19384, %r19383, %r19385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19382, %r19383, %r19384, %r19385; + // end inline asm + mov.u32 %r19393, 56; + // begin inline asm + shf.l.wrap.b32 %r19386, %r19392, %r19391, %r19393; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19390, %r19391, %r19392, %r19393; + // end inline asm + mov.u32 %r19401, 41; + // begin inline asm + shf.l.wrap.b32 %r19394, %r19400, %r19399, %r19401; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19398, %r19399, %r19400, %r19401; + // end inline asm + mov.u32 %r19409, 27; + // begin inline asm + shf.l.wrap.b32 %r19402, %r19408, %r19407, %r19409; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19406, %r19407, %r19408, %r19409; + // end inline asm + mov.u32 %r19417, 14; + // begin inline asm + 
shf.l.wrap.b32 %r19410, %r19416, %r19415, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19414, %r19415, %r19416, %r19417; + // end inline asm + mov.u32 %r19425, 2; + // begin inline asm + shf.l.wrap.b32 %r19418, %r19424, %r19423, %r19425; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19422, %r19423, %r19424, %r19425; + // end inline asm + mov.u32 %r19433, 55; + // begin inline asm + shf.l.wrap.b32 %r19426, %r19432, %r19431, %r19433; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19430, %r19431, %r19432, %r19433; + // end inline asm + mov.u32 %r19441, 45; + // begin inline asm + shf.l.wrap.b32 %r19434, %r19440, %r19439, %r19441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19438, %r19439, %r19440, %r19441; + // end inline asm + mov.u32 %r19449, 36; + // begin inline asm + shf.l.wrap.b32 %r19442, %r19448, %r19447, %r19449; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19446, %r19447, %r19448, %r19449; + // end inline asm + mov.u32 %r19457, 28; + // begin inline asm + shf.l.wrap.b32 %r19450, %r19456, %r19455, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19454, %r19455, %r19456, %r19457; + // end inline asm + mov.u32 %r19465, 21; + // begin inline asm + shf.l.wrap.b32 %r19458, %r19464, %r19463, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19462, %r19463, %r19464, %r19465; + // end inline asm + mov.u32 %r19473, 15; + // begin inline asm + shf.l.wrap.b32 %r19466, %r19472, %r19471, %r19473; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19470, %r19471, %r19472, %r19473; + // end inline asm + mov.u32 %r19481, 10; + // begin inline asm + shf.l.wrap.b32 %r19474, %r19480, %r19479, %r19481; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19478, %r19479, %r19480, %r19481; + // end inline asm + mov.u32 %r19489, 6; + // begin inline asm + shf.l.wrap.b32 %r19482, %r19488, %r19487, %r19489; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19486, %r19487, %r19488, %r19489; + // end inline asm + mov.u32 %r19497, 3; + // begin inline asm + shf.l.wrap.b32 %r19490, %r19496, %r19495, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19494, %r19495, %r19496, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19498, %r19504, %r19503, %r19175; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19502, %r19503, %r19504, %r19175; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19506, %r19541, %r19314, %r19362, 0xD2; + lop3.b32 %r19507, %r19544, %r19318, %r19366, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30765, %r19314, %r19362, %r19458, 0xD2; + lop3.b32 %r30766, %r19318, %r19366, %r19462, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30761, %r19362, %r19458, %r19410, 0xD2; + lop3.b32 %r30762, %r19366, %r19462, %r19414, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30757, %r19458, %r19410, %r19541, 0xD2; + lop3.b32 %r30758, %r19462, %r19414, %r19544, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30755, %r19410, %r19541, %r19314, 0xD2; + lop3.b32 %r30756, %r19414, %r19544, %r19318, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30751, %r19450, %r19322, %r19490, 0xD2; + lop3.b32 %r30752, %r19454, %r19326, %r19494, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30763, %r19322, %r19490, %r19434, 0xD2; + lop3.b32 %r30764, %r19326, %r19494, 
%r19438, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30759, %r19490, %r19434, %r19330, 0xD2; + lop3.b32 %r30760, %r19494, %r19438, %r19334, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30731, %r19434, %r19330, %r19450, 0xD2; + lop3.b32 %r30732, %r19438, %r19334, %r19454, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r30731, %r30732}; + // begin inline asm + // chi + lop3.b32 %r30723, %r19330, %r19450, %r19322, 0xD2; + lop3.b32 %r30724, %r19334, %r19454, %r19326, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r30723, %r30724}; + // begin inline asm + // chi + lop3.b32 %r30749, %r19498, %r19482, %r19370, 0xD2; + lop3.b32 %r30750, %r19502, %r19486, %r19374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+104], {%r30749, %r30750}; + // begin inline asm + // chi + lop3.b32 %r30743, %r19482, %r19370, %r19378, 0xD2; + lop3.b32 %r30744, %r19486, %r19374, %r19382, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+112], {%r30743, %r30744}; + // begin inline asm + // chi + lop3.b32 %r30737, %r19370, %r19378, %r19346, 0xD2; + lop3.b32 %r30738, %r19374, %r19382, %r19350, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+120], {%r30737, %r30738}; + // begin inline asm + // chi + lop3.b32 %r30729, %r19378, %r19346, %r19498, 0xD2; + lop3.b32 %r30730, %r19382, %r19350, %r19502, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+128], {%r30729, %r30730}; + // begin inline asm + // chi + lop3.b32 %r30721, %r19346, %r19498, %r19482, 0xD2; + lop3.b32 %r30722, %r19350, %r19502, %r19486, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+136], {%r30721, %r30722}; + // begin inline asm + // chi + lop3.b32 %r30747, %r19402, %r19442, %r19474, 0xD2; + lop3.b32 %r30748, %r19406, %r19446, %r19478, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+144], {%r30747, %r30748}; + // begin inline asm + // chi + lop3.b32 %r30741, %r19442, %r19474, %r19466, 0xD2; + lop3.b32 %r30742, %r19446, %r19478, %r19470, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+152], {%r30741, %r30742}; + // begin inline asm + // chi + lop3.b32 %r30735, %r19474, %r19466, %r19386, 0xD2; + lop3.b32 %r30736, %r19478, %r19470, %r19390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+160], {%r30735, %r30736}; + // begin inline asm + // chi + lop3.b32 %r30727, %r19466, %r19386, %r19402, 0xD2; + lop3.b32 %r30728, %r19470, %r19390, %r19406, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+168], {%r30727, %r30728}; + // begin inline asm + // chi + lop3.b32 %r30719, %r19386, %r19402, %r19442, 0xD2; + lop3.b32 %r30720, %r19390, %r19406, %r19446, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+176], {%r30719, %r30720}; + // begin inline asm + // chi + lop3.b32 %r30745, %r19354, %r19426, %r19338, 0xD2; + lop3.b32 %r30746, %r19358, %r19430, %r19342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+184], {%r30745, %r30746}; + // begin inline asm + // chi + lop3.b32 %r30739, %r19426, %r19338, %r19394, 0xD2; + lop3.b32 %r30740, %r19430, %r19342, %r19398, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+192], {%r30739, %r30740}; + // begin inline asm + // chi + lop3.b32 %r30733, %r19338, %r19394, %r19418, 0xD2; + lop3.b32 %r30734, %r19342, %r19398, %r19422, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+200], {%r30733, %r30734}; + // begin inline asm + // chi + lop3.b32 %r30725, %r19394, %r19418, %r19354, 0xD2; + lop3.b32 %r30726, %r19398, %r19422, %r19358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+208], {%r30725, %r30726}; + // begin inline 
asm + // chi + lop3.b32 %r30717, %r19418, %r19354, %r19426, 0xD2; + lop3.b32 %r30718, %r19422, %r19358, %r19430, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+216], {%r30717, %r30718}; + mul.wide.s32 %rd864, %r30767, 8; + add.s64 %rd863, %rd789, %rd864; + // begin inline asm + ld.global.nc.v2.u32 {%r19706,%r19707}, [%rd863]; + // end inline asm + xor.b32 %r30753, %r19506, %r19706; + xor.b32 %r30754, %r19507, %r19707; + add.s32 %r30767, %r30767, 1; + setp.lt.u32 %p35, %r30767, 23; + @%p35 bra $L__BB2_59; + + mov.u32 %r19817, 1; + st.local.v2.u32 [%rd176+32], {%r30765, %r30766}; + st.local.v2.u32 [%rd176+72], {%r30763, %r30764}; + st.local.v2.u32 [%rd176+40], {%r30761, %r30762}; + st.local.v2.u32 [%rd176+80], {%r30759, %r30760}; + st.local.v2.u32 [%rd176+48], {%r30757, %r30758}; + st.local.v2.u32 [%rd176+56], {%r30755, %r30756}; + st.local.v2.u32 [%rd176+24], {%r30753, %r30754}; + // begin inline asm + // xor5 + lop3.b32 %r19718, %r30753, %r30751, %r30749, 0x96; + lop3.b32 %r19718, %r19718, %r30747, %r30745, 0x96; + lop3.b32 %r19719, %r30754, %r30752, %r30750, 0x96; + lop3.b32 %r19719, %r19719, %r30748, %r30746, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19730, %r30765, %r30763, %r30743, 0x96; + lop3.b32 %r19730, %r19730, %r30741, %r30739, 0x96; + lop3.b32 %r19731, %r30766, %r30764, %r30744, 0x96; + lop3.b32 %r19731, %r19731, %r30742, %r30740, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19742, %r30761, %r30759, %r30737, 0x96; + lop3.b32 %r19742, %r19742, %r30735, %r30733, 0x96; + lop3.b32 %r19743, %r30762, %r30760, %r30738, 0x96; + lop3.b32 %r19743, %r19743, %r30736, %r30734, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19754, %r30757, %r30731, %r30729, 0x96; + lop3.b32 %r19754, %r19754, %r30727, %r30725, 0x96; + lop3.b32 %r19755, %r30758, %r30732, %r30730, 0x96; + lop3.b32 %r19755, %r19755, %r30728, %r30726, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19766, %r30755, %r30723, %r30721, 0x96; + lop3.b32 %r19766, %r19766, %r30719, %r30717, 0x96; + lop3.b32 %r19767, %r30756, %r30724, %r30722, 0x96; + lop3.b32 %r19767, %r19767, %r30720, %r30718, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19778, %r19731, %r19730, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19782, %r19730, %r19731, %r19817; + // end inline asm + xor.b32 %r19956, %r19778, %r19766; + xor.b32 %r19957, %r19782, %r19767; + xor.b32 %r19925, %r30753, %r19956; + xor.b32 %r19928, %r30754, %r19957; + xor.b32 %r19888, %r30750, %r19957; + xor.b32 %r19887, %r30749, %r19956; + st.local.v2.u32 [%rd176+104], {%r19887, %r19888}; + // begin inline asm + shf.l.wrap.b32 %r19786, %r19743, %r19742, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19790, %r19742, %r19743, %r19817; + // end inline asm + xor.b32 %r19958, %r19786, %r19718; + xor.b32 %r19959, %r19790, %r19719; + xor.b32 %r19824, %r30763, %r19958; + xor.b32 %r19823, %r30764, %r19959; + xor.b32 %r19863, %r30742, %r19959; + xor.b32 %r19864, %r30741, %r19958; + st.local.v2.u32 [%rd176+152], {%r19864, %r19863}; + // begin inline asm + shf.l.wrap.b32 %r19794, %r19755, %r19754, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19798, %r19754, %r19755, %r19817; + // end inline asm + xor.b32 %r19960, %r19794, %r19730; + xor.b32 %r19961, %r19798, %r19731; + xor.b32 %r19847, %r30738, %r19961; + xor.b32 %r19848, %r30737, %r19960; + st.local.v2.u32 [%rd176+120], {%r19848, %r19847}; + xor.b32 %r19839, 
%r30734, %r19961; + xor.b32 %r19840, %r30733, %r19960; + st.local.v2.u32 [%rd176+200], {%r19840, %r19839}; + // begin inline asm + shf.l.wrap.b32 %r19802, %r19767, %r19766, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19806, %r19766, %r19767, %r19817; + // end inline asm + xor.b32 %r19962, %r19802, %r19742; + xor.b32 %r19963, %r19806, %r19743; + xor.b32 %r19871, %r30757, %r19962; + xor.b32 %r19872, %r30758, %r19963; + xor.b32 %r19880, %r30728, %r19963; + xor.b32 %r19879, %r30727, %r19962; + st.local.v2.u32 [%rd176+168], {%r19879, %r19880}; + // begin inline asm + shf.l.wrap.b32 %r19810, %r19719, %r19718, %r19817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19814, %r19718, %r19719, %r19817; + // end inline asm + xor.b32 %r19964, %r19810, %r19754; + xor.b32 %r19965, %r19814, %r19755; + xor.b32 %r19831, %r30723, %r19964; + xor.b32 %r19832, %r30724, %r19965; + xor.b32 %r19856, %r30718, %r19965; + xor.b32 %r19855, %r30717, %r19964; + st.local.v2.u32 [%rd176+216], {%r19855, %r19856}; + // begin inline asm + shf.l.wrap.b32 %r19818, %r19824, %r19823, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19822, %r19823, %r19824, %r19321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19826, %r19832, %r19831, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19830, %r19831, %r19832, %r19329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19838, %r19839, %r19840, %r19337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19834, %r19840, %r19839, %r19337; + // end inline asm + st.local.v2.u32 [%rd176+96], {%r19834, %r19838}; + // begin inline asm + shf.l.wrap.b32 %r19842, %r19848, %r19847, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19846, %r19847, %r19848, %r19369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19850, %r19856, %r19855, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19854, %r19855, %r19856, %r19417; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19862, %r19863, %r19864, %r19441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19858, %r19864, %r19863, %r19441; + // end inline asm + st.local.v2.u32 [%rd176+88], {%r19858, %r19862}; + // begin inline asm + shf.l.wrap.b32 %r19866, %r19872, %r19871, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19870, %r19871, %r19872, %r19457; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19874, %r19880, %r19879, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19878, %r19879, %r19880, %r19465; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19882, %r19888, %r19887, %r19497; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19886, %r19887, %r19888, %r19497; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19890, %r19925, %r19818, %r19842, 0xD2; + lop3.b32 %r19891, %r19928, %r19822, %r19846, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19898, %r19818, %r19842, %r19874, 0xD2; + lop3.b32 %r19899, %r19822, %r19846, %r19878, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+32], {%r19898, %r19899}; + // begin inline asm + // chi + lop3.b32 %r19906, %r19842, %r19874, %r19850, 0xD2; + lop3.b32 %r19907, %r19846, %r19878, %r19854, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+40], {%r19906, %r19907}; + // begin inline asm + // chi + lop3.b32 %r19914, %r19874, %r19850, %r19925, 0xD2; + lop3.b32 %r19915, %r19878, %r19854, %r19928, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd176+48], {%r19914, %r19915}; + // begin inline asm + // chi + lop3.b32 %r19922, %r19850, %r19925, %r19818, 0xD2; + lop3.b32 %r19923, %r19854, %r19928, %r19822, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+56], {%r19922, %r19923}; + // begin inline asm + // chi + lop3.b32 %r19930, %r19866, %r19826, %r19882, 0xD2; + lop3.b32 %r19931, %r19870, %r19830, %r19886, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+64], {%r19930, %r19931}; + // begin inline asm + // chi + lop3.b32 %r19938, %r19826, %r19882, %r19858, 0xD2; + lop3.b32 %r19939, %r19830, %r19886, %r19862, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+72], {%r19938, %r19939}; + // begin inline asm + // chi + lop3.b32 %r19946, %r19882, %r19858, %r19834, 0xD2; + lop3.b32 %r19947, %r19886, %r19862, %r19838, 0xD2; + // end inline asm + st.local.v2.u32 [%rd176+80], {%r19946, %r19947}; + // begin inline asm + ld.global.nc.v2.u32 {%r19954,%r19955}, [%rd790]; + // end inline asm + xor.b32 %r19966, %r19891, %r19955; + xor.b32 %r19967, %r19890, %r19954; + st.local.v2.u32 [%rd176+24], {%r19967, %r19966}; + mov.b64 %rd1342, {%r19898, %r19899}; + mov.b64 %rd1343, {%r19906, %r19907}; + mov.b64 %rd1346, {%r19930, %r19931}; + mov.b64 %rd1347, {%r19938, %r19939}; + mov.b64 %rd1348, {%r19946, %r19947}; + mov.b64 %rd1341, {%r19967, %r19966}; + mov.b64 %rd1344, {%r19914, %r19915}; + mov.b64 %rd1345, {%r19922, %r19923}; + bra.uni $L__BB2_61; + +$L__BB2_39: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd671, 1179641; + st.local.u64 [%rd2+8], %rd671; + st.local.u32 [%rd2+16], %r1695; + ld.global.u64 %rd672, [%rd126]; + ld.global.u64 %rd673, [%rd126+8]; + ld.global.u64 %rd674, [%rd126+16]; + ld.global.u64 %rd675, [%rd126+24]; + ld.global.u64 %rd676, [%rd126+32]; + ld.global.u64 %rd677, [%rd126+40]; + ld.global.u64 %rd678, [%rd126+48]; + ld.global.u64 %rd679, [%rd126+56]; + st.local.u64 [%rd2+24], %rd672; + st.local.u64 [%rd2+32], %rd673; + st.local.u64 [%rd2+40], %rd674; + st.local.u64 [%rd2+48], %rd675; + st.local.u64 [%rd2+56], %rd676; + st.local.u64 [%rd2+64], %rd677; + st.local.u64 [%rd2+72], %rd678; + st.local.u64 [%rd2+80], %rd679; + cvt.u32.u64 %r13441, %rd672; + xor.b32 %r13442, %r1695, %r13441; + st.local.u32 [%rd2+24], %r13442; + mov.u32 %r30294, 0; + st.local.v2.u32 [%rd2+96], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+104], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+112], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+120], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+128], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+136], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+144], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+152], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+160], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+168], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+176], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+184], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+192], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+200], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+208], {%r30294, %r30294}; + st.local.v2.u32 [%rd2+216], {%r30294, %r30294}; + mov.u32 %r30309, -2147483648; + mov.u32 %r13414, 1; + st.local.v2.u32 [%rd2+88], {%r13414, %r30309}; + ld.local.v2.u32 {%r30330, %r30331}, [%rd2+24]; + mov.b64 {%r30328, %r30329}, %rd677; + shr.u64 %rd680, %rd673, 32; + cvt.u32.u64 %r30342, %rd673; + cvt.u32.u64 %r30343, %rd680; + shr.u64 %rd681, %rd678, 32; + cvt.u32.u64 %r30340, %rd678; + cvt.u32.u64 %r30341, %rd681; + shr.u64 %rd682, %rd674, 32; + cvt.u32.u64 %r30338, %rd674; + cvt.u32.u64 %r30339, %rd682; 
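+ // Each 64-bit input lane is split into a lo/hi 32-bit register pair here;
+ // the rho rotations in the round loop below are then expressed as pairs of
+ // shf.l.wrap.b32 funnel shifts over the two halves.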
+ shr.u64 %rd683, %rd679, 32; + cvt.u32.u64 %r30336, %rd679; + cvt.u32.u64 %r30337, %rd683; + shr.u64 %rd684, %rd675, 32; + cvt.u32.u64 %r30334, %rd675; + cvt.u32.u64 %r30335, %rd684; + shr.u64 %rd685, %rd676, 32; + cvt.u32.u64 %r30332, %rd676; + cvt.u32.u64 %r30333, %rd685; + mov.u32 %r30295, %r30294; + mov.u32 %r30296, %r30294; + mov.u32 %r30297, %r30294; + mov.u32 %r30298, %r30294; + mov.u32 %r30299, %r30294; + mov.u32 %r30300, %r30294; + mov.u32 %r30301, %r30294; + mov.u32 %r30302, %r30294; + mov.u32 %r30303, %r30294; + mov.u32 %r30304, %r30294; + mov.u32 %r30305, %r30294; + mov.u32 %r30306, %r30294; + mov.u32 %r30307, %r30294; + mov.u32 %r30308, %r13414; + mov.u32 %r30310, %r30294; + mov.u32 %r30311, %r30294; + mov.u32 %r30312, %r30294; + mov.u32 %r30313, %r30294; + mov.u32 %r30314, %r30294; + mov.u32 %r30315, %r30294; + mov.u32 %r30316, %r30294; + mov.u32 %r30317, %r30294; + mov.u32 %r30318, %r30294; + mov.u32 %r30319, %r30294; + mov.u32 %r30320, %r30294; + mov.u32 %r30321, %r30294; + mov.u32 %r30322, %r30294; + mov.u32 %r30323, %r30294; + mov.u32 %r30324, %r30294; + mov.u32 %r30325, %r30294; + mov.u32 %r30326, %r30294; + mov.u32 %r30327, %r30294; + mov.u32 %r30344, %r30294; + +$L__BB2_40: + // begin inline asm + // xor5 + lop3.b32 %r13445, %r30330, %r30328, %r30326, 0x96; + lop3.b32 %r13445, %r13445, %r30324, %r30322, 0x96; + lop3.b32 %r13446, %r30331, %r30329, %r30327, 0x96; + lop3.b32 %r13446, %r13446, %r30325, %r30323, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13457, %r30342, %r30340, %r30320, 0x96; + lop3.b32 %r13457, %r13457, %r30318, %r30316, 0x96; + lop3.b32 %r13458, %r30343, %r30341, %r30321, 0x96; + lop3.b32 %r13458, %r13458, %r30319, %r30317, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13469, %r30338, %r30336, %r30314, 0x96; + lop3.b32 %r13469, %r13469, %r30312, %r30310, 0x96; + lop3.b32 %r13470, %r30339, %r30337, %r30315, 0x96; + lop3.b32 %r13470, %r13470, %r30313, %r30311, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13481, %r30334, %r30308, %r30306, 0x96; + lop3.b32 %r13481, %r13481, %r30304, %r30302, 0x96; + lop3.b32 %r13482, %r30335, %r30309, %r30307, 0x96; + lop3.b32 %r13482, %r13482, %r30305, %r30303, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13493, %r30332, %r30300, %r30298, 0x96; + lop3.b32 %r13493, %r13493, %r30296, %r30294, 0x96; + lop3.b32 %r13494, %r30333, %r30301, %r30299, 0x96; + lop3.b32 %r13494, %r13494, %r30297, %r30295, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13505, %r13458, %r13457, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13509, %r13457, %r13458, %r13414; + // end inline asm + xor.b32 %r13939, %r13505, %r13493; + xor.b32 %r13940, %r13509, %r13494; + xor.b32 %r13772, %r30330, %r13939; + xor.b32 %r13775, %r30331, %r13940; + xor.b32 %r13679, %r30328, %r13939; + xor.b32 %r13678, %r30329, %r13940; + xor.b32 %r13726, %r30326, %r13939; + xor.b32 %r13727, %r30327, %r13940; + xor.b32 %r13631, %r30324, %r13939; + xor.b32 %r13630, %r30325, %r13940; + xor.b32 %r13582, %r30322, %r13939; + xor.b32 %r13583, %r30323, %r13940; + // begin inline asm + shf.l.wrap.b32 %r13513, %r13470, %r13469, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13517, %r13469, %r13470, %r13414; + // end inline asm + xor.b32 %r13941, %r13513, %r13445; + xor.b32 %r13942, %r13517, %r13446; + xor.b32 %r13734, %r30342, %r13941; + xor.b32 %r13735, %r30343, %r13942; + xor.b32 %r13551, %r30340, %r13941; + xor.b32 
%r13550, %r30341, %r13942; + xor.b32 %r13710, %r30320, %r13941; + xor.b32 %r13711, %r30321, %r13942; + xor.b32 %r13671, %r30318, %r13941; + xor.b32 %r13670, %r30319, %r13942; + xor.b32 %r13654, %r30316, %r13941; + xor.b32 %r13655, %r30317, %r13942; + // begin inline asm + shf.l.wrap.b32 %r13521, %r13482, %r13481, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13525, %r13481, %r13482, %r13414; + // end inline asm + xor.b32 %r13943, %r13521, %r13457; + xor.b32 %r13944, %r13525, %r13458; + xor.b32 %r13591, %r30338, %r13943; + xor.b32 %r13590, %r30339, %r13944; + xor.b32 %r13718, %r30336, %r13943; + xor.b32 %r13719, %r30337, %r13944; + xor.b32 %r13599, %r30314, %r13943; + xor.b32 %r13598, %r30315, %r13944; + xor.b32 %r13702, %r30312, %r13943; + xor.b32 %r13703, %r30313, %r13944; + xor.b32 %r13567, %r30310, %r13943; + xor.b32 %r13566, %r30311, %r13944; + // begin inline asm + shf.l.wrap.b32 %r13529, %r13494, %r13493, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13533, %r13493, %r13494, %r13414; + // end inline asm + xor.b32 %r13945, %r13529, %r13469; + xor.b32 %r13946, %r13533, %r13470; + xor.b32 %r13686, %r30334, %r13945; + xor.b32 %r13687, %r30335, %r13946; + xor.b32 %r13663, %r30308, %r13945; + xor.b32 %r13662, %r30309, %r13946; + xor.b32 %r13606, %r30306, %r13945; + xor.b32 %r13607, %r30307, %r13946; + xor.b32 %r13694, %r30304, %r13945; + xor.b32 %r13695, %r30305, %r13946; + xor.b32 %r13623, %r30302, %r13945; + xor.b32 %r13622, %r30303, %r13946; + // begin inline asm + shf.l.wrap.b32 %r13537, %r13446, %r13445, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13541, %r13445, %r13446, %r13414; + // end inline asm + xor.b32 %r13947, %r13537, %r13481; + xor.b32 %r13948, %r13541, %r13482; + xor.b32 %r13638, %r30332, %r13947; + xor.b32 %r13639, %r30333, %r13948; + xor.b32 %r13558, %r30300, %r13947; + xor.b32 %r13559, %r30301, %r13948; + xor.b32 %r13575, %r30298, %r13947; + xor.b32 %r13574, %r30299, %r13948; + xor.b32 %r13614, %r30296, %r13947; + xor.b32 %r13615, %r30297, %r13948; + xor.b32 %r13646, %r30294, %r13947; + xor.b32 %r13647, %r30295, %r13948; + mov.u32 %r13552, 44; + // begin inline asm + shf.l.wrap.b32 %r13545, %r13551, %r13550, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13549, %r13550, %r13551, %r13552; + // end inline asm + mov.u32 %r13560, 20; + // begin inline asm + shf.l.wrap.b32 %r13553, %r13559, %r13558, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13557, %r13558, %r13559, %r13560; + // end inline asm + mov.u32 %r13568, 61; + // begin inline asm + shf.l.wrap.b32 %r13561, %r13567, %r13566, %r13568; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13565, %r13566, %r13567, %r13568; + // end inline asm + mov.u32 %r13576, 39; + // begin inline asm + shf.l.wrap.b32 %r13569, %r13575, %r13574, %r13576; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13573, %r13574, %r13575, %r13576; + // end inline asm + mov.u32 %r13584, 18; + // begin inline asm + shf.l.wrap.b32 %r13577, %r13583, %r13582, %r13584; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13581, %r13582, %r13583, %r13584; + // end inline asm + mov.u32 %r13592, 62; + // begin inline asm + shf.l.wrap.b32 %r13585, %r13591, %r13590, %r13592; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13589, %r13590, %r13591, %r13592; + // end inline asm + mov.u32 %r13600, 43; + // begin inline asm + shf.l.wrap.b32 %r13593, %r13599, %r13598, %r13600; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r13597, %r13598, %r13599, %r13600; + // end inline asm + mov.u32 %r13608, 25; + // begin inline asm + shf.l.wrap.b32 %r13601, %r13607, %r13606, %r13608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13605, %r13606, %r13607, %r13608; + // end inline asm + mov.u32 %r13616, 8; + // begin inline asm + shf.l.wrap.b32 %r13609, %r13615, %r13614, %r13616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13613, %r13614, %r13615, %r13616; + // end inline asm + mov.u32 %r13624, 56; + // begin inline asm + shf.l.wrap.b32 %r13617, %r13623, %r13622, %r13624; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13621, %r13622, %r13623, %r13624; + // end inline asm + mov.u32 %r13632, 41; + // begin inline asm + shf.l.wrap.b32 %r13625, %r13631, %r13630, %r13632; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13629, %r13630, %r13631, %r13632; + // end inline asm + mov.u32 %r13640, 27; + // begin inline asm + shf.l.wrap.b32 %r13633, %r13639, %r13638, %r13640; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13637, %r13638, %r13639, %r13640; + // end inline asm + mov.u32 %r13648, 14; + // begin inline asm + shf.l.wrap.b32 %r13641, %r13647, %r13646, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13645, %r13646, %r13647, %r13648; + // end inline asm + mov.u32 %r13656, 2; + // begin inline asm + shf.l.wrap.b32 %r13649, %r13655, %r13654, %r13656; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13653, %r13654, %r13655, %r13656; + // end inline asm + mov.u32 %r13664, 55; + // begin inline asm + shf.l.wrap.b32 %r13657, %r13663, %r13662, %r13664; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13661, %r13662, %r13663, %r13664; + // end inline asm + mov.u32 %r13672, 45; + // begin inline asm + shf.l.wrap.b32 %r13665, %r13671, %r13670, %r13672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13669, %r13670, %r13671, %r13672; + // end inline asm + mov.u32 %r13680, 36; + // begin inline asm + shf.l.wrap.b32 %r13673, %r13679, %r13678, %r13680; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13677, %r13678, %r13679, %r13680; + // end inline asm + mov.u32 %r13688, 28; + // begin inline asm + shf.l.wrap.b32 %r13681, %r13687, %r13686, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13685, %r13686, %r13687, %r13688; + // end inline asm + mov.u32 %r13696, 21; + // begin inline asm + shf.l.wrap.b32 %r13689, %r13695, %r13694, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13693, %r13694, %r13695, %r13696; + // end inline asm + mov.u32 %r13704, 15; + // begin inline asm + shf.l.wrap.b32 %r13697, %r13703, %r13702, %r13704; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13701, %r13702, %r13703, %r13704; + // end inline asm + mov.u32 %r13712, 10; + // begin inline asm + shf.l.wrap.b32 %r13705, %r13711, %r13710, %r13712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13709, %r13710, %r13711, %r13712; + // end inline asm + mov.u32 %r13720, 6; + // begin inline asm + shf.l.wrap.b32 %r13713, %r13719, %r13718, %r13720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13717, %r13718, %r13719, %r13720; + // end inline asm + mov.u32 %r13728, 3; + // begin inline asm + shf.l.wrap.b32 %r13721, %r13727, %r13726, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13725, %r13726, %r13727, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13729, %r13735, 
%r13734, %r13414; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13733, %r13734, %r13735, %r13414; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13737, %r13772, %r13545, %r13593, 0xD2; + lop3.b32 %r13738, %r13775, %r13549, %r13597, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30342, %r13545, %r13593, %r13689, 0xD2; + lop3.b32 %r30343, %r13549, %r13597, %r13693, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30338, %r13593, %r13689, %r13641, 0xD2; + lop3.b32 %r30339, %r13597, %r13693, %r13645, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30334, %r13689, %r13641, %r13772, 0xD2; + lop3.b32 %r30335, %r13693, %r13645, %r13775, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30332, %r13641, %r13772, %r13545, 0xD2; + lop3.b32 %r30333, %r13645, %r13775, %r13549, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30328, %r13681, %r13553, %r13721, 0xD2; + lop3.b32 %r30329, %r13685, %r13557, %r13725, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30340, %r13553, %r13721, %r13665, 0xD2; + lop3.b32 %r30341, %r13557, %r13725, %r13669, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30336, %r13721, %r13665, %r13561, 0xD2; + lop3.b32 %r30337, %r13725, %r13669, %r13565, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30308, %r13665, %r13561, %r13681, 0xD2; + lop3.b32 %r30309, %r13669, %r13565, %r13685, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30308, %r30309}; + // begin inline asm + // chi + lop3.b32 %r30300, %r13561, %r13681, %r13553, 0xD2; + lop3.b32 %r30301, %r13565, %r13685, %r13557, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30300, %r30301}; + // begin inline asm + // chi + lop3.b32 %r30326, %r13729, %r13713, %r13601, 0xD2; + lop3.b32 %r30327, %r13733, %r13717, %r13605, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30326, %r30327}; + // begin inline asm + // chi + lop3.b32 %r30320, %r13713, %r13601, %r13609, 0xD2; + lop3.b32 %r30321, %r13717, %r13605, %r13613, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30320, %r30321}; + // begin inline asm + // chi + lop3.b32 %r30314, %r13601, %r13609, %r13577, 0xD2; + lop3.b32 %r30315, %r13605, %r13613, %r13581, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30314, %r30315}; + // begin inline asm + // chi + lop3.b32 %r30306, %r13609, %r13577, %r13729, 0xD2; + lop3.b32 %r30307, %r13613, %r13581, %r13733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30306, %r30307}; + // begin inline asm + // chi + lop3.b32 %r30298, %r13577, %r13729, %r13713, 0xD2; + lop3.b32 %r30299, %r13581, %r13733, %r13717, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30298, %r30299}; + // begin inline asm + // chi + lop3.b32 %r30324, %r13633, %r13673, %r13705, 0xD2; + lop3.b32 %r30325, %r13637, %r13677, %r13709, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30324, %r30325}; + // begin inline asm + // chi + lop3.b32 %r30318, %r13673, %r13705, %r13697, 0xD2; + lop3.b32 %r30319, %r13677, %r13709, %r13701, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30318, %r30319}; + // begin inline asm + // chi + lop3.b32 %r30312, %r13705, %r13697, %r13617, 0xD2; + lop3.b32 %r30313, %r13709, %r13701, %r13621, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30312, %r30313}; + // begin inline asm + // chi + lop3.b32 %r30304, %r13697, %r13617, %r13633, 0xD2; + lop3.b32 
%r30305, %r13701, %r13621, %r13637, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30304, %r30305}; + // begin inline asm + // chi + lop3.b32 %r30296, %r13617, %r13633, %r13673, 0xD2; + lop3.b32 %r30297, %r13621, %r13637, %r13677, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30296, %r30297}; + // begin inline asm + // chi + lop3.b32 %r30322, %r13585, %r13657, %r13569, 0xD2; + lop3.b32 %r30323, %r13589, %r13661, %r13573, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30322, %r30323}; + // begin inline asm + // chi + lop3.b32 %r30316, %r13657, %r13569, %r13625, 0xD2; + lop3.b32 %r30317, %r13661, %r13573, %r13629, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30316, %r30317}; + // begin inline asm + // chi + lop3.b32 %r30310, %r13569, %r13625, %r13649, 0xD2; + lop3.b32 %r30311, %r13573, %r13629, %r13653, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30310, %r30311}; + // begin inline asm + // chi + lop3.b32 %r30302, %r13625, %r13649, %r13585, 0xD2; + lop3.b32 %r30303, %r13629, %r13653, %r13589, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30302, %r30303}; + // begin inline asm + // chi + lop3.b32 %r30294, %r13649, %r13585, %r13657, 0xD2; + lop3.b32 %r30295, %r13653, %r13589, %r13661, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30294, %r30295}; + mul.wide.s32 %rd687, %r30344, 8; + mov.u64 %rd688, keccak_round_constants; + cvta.const.u64 %rd689, %rd688; + add.s64 %rd686, %rd689, %rd687; + // begin inline asm + ld.global.nc.v2.u32 {%r13937,%r13938}, [%rd686]; + // end inline asm + xor.b32 %r30330, %r13737, %r13937; + xor.b32 %r30331, %r13738, %r13938; + add.s32 %r30344, %r30344, 1; + setp.lt.u32 %p26, %r30344, 23; + @%p26 bra $L__BB2_40; + + add.u64 %rd147, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r30342, %r30343}; + st.local.v2.u32 [%rd2+72], {%r30340, %r30341}; + st.local.v2.u32 [%rd2+40], {%r30338, %r30339}; + st.local.v2.u32 [%rd2+80], {%r30336, %r30337}; + st.local.v2.u32 [%rd2+48], {%r30334, %r30335}; + st.local.v2.u32 [%rd2+56], {%r30332, %r30333}; + st.local.v2.u32 [%rd2+24], {%r30330, %r30331}; + // begin inline asm + // xor5 + lop3.b32 %r13949, %r30330, %r30328, %r30326, 0x96; + lop3.b32 %r13949, %r13949, %r30324, %r30322, 0x96; + lop3.b32 %r13950, %r30331, %r30329, %r30327, 0x96; + lop3.b32 %r13950, %r13950, %r30325, %r30323, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13961, %r30342, %r30340, %r30320, 0x96; + lop3.b32 %r13961, %r13961, %r30318, %r30316, 0x96; + lop3.b32 %r13962, %r30343, %r30341, %r30321, 0x96; + lop3.b32 %r13962, %r13962, %r30319, %r30317, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13973, %r30338, %r30336, %r30314, 0x96; + lop3.b32 %r13973, %r13973, %r30312, %r30310, 0x96; + lop3.b32 %r13974, %r30339, %r30337, %r30315, 0x96; + lop3.b32 %r13974, %r13974, %r30313, %r30311, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13985, %r30334, %r30308, %r30306, 0x96; + lop3.b32 %r13985, %r13985, %r30304, %r30302, 0x96; + lop3.b32 %r13986, %r30335, %r30309, %r30307, 0x96; + lop3.b32 %r13986, %r13986, %r30305, %r30303, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13997, %r30332, %r30300, %r30298, 0x96; + lop3.b32 %r13997, %r13997, %r30296, %r30294, 0x96; + lop3.b32 %r13998, %r30333, %r30301, %r30299, 0x96; + lop3.b32 %r13998, %r13998, %r30297, %r30295, 0x96; + // end inline asm + mov.u32 %r14201, 1; + // begin inline asm + shf.l.wrap.b32 %r14009, %r13962, %r13961, %r14201; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r14013, %r13961, %r13962, %r14201; + // end inline asm + xor.b32 %r14228, %r14009, %r13997; + xor.b32 %r14229, %r14013, %r13998; + xor.b32 %r14156, %r30330, %r14228; + xor.b32 %r14159, %r30331, %r14229; + xor.b32 %r14119, %r30327, %r14229; + xor.b32 %r14118, %r30326, %r14228; + st.local.v2.u32 [%rd2+104], {%r14118, %r14119}; + // begin inline asm + shf.l.wrap.b32 %r14017, %r13974, %r13973, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14021, %r13973, %r13974, %r14201; + // end inline asm + xor.b32 %r14230, %r14017, %r13949; + xor.b32 %r14231, %r14021, %r13950; + xor.b32 %r14055, %r30340, %r14230; + xor.b32 %r14054, %r30341, %r14231; + xor.b32 %r14094, %r30319, %r14231; + xor.b32 %r14095, %r30318, %r14230; + st.local.v2.u32 [%rd2+152], {%r14095, %r14094}; + // begin inline asm + shf.l.wrap.b32 %r14025, %r13986, %r13985, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14029, %r13985, %r13986, %r14201; + // end inline asm + xor.b32 %r14232, %r14025, %r13961; + xor.b32 %r14233, %r14029, %r13962; + xor.b32 %r14078, %r30315, %r14233; + xor.b32 %r14079, %r30314, %r14232; + st.local.v2.u32 [%rd2+120], {%r14079, %r14078}; + xor.b32 %r14070, %r30311, %r14233; + xor.b32 %r14071, %r30310, %r14232; + st.local.v2.u32 [%rd2+200], {%r14071, %r14070}; + // begin inline asm + shf.l.wrap.b32 %r14033, %r13998, %r13997, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14037, %r13997, %r13998, %r14201; + // end inline asm + xor.b32 %r14234, %r14033, %r13973; + xor.b32 %r14235, %r14037, %r13974; + xor.b32 %r14102, %r30334, %r14234; + xor.b32 %r14103, %r30335, %r14235; + xor.b32 %r14111, %r30305, %r14235; + xor.b32 %r14110, %r30304, %r14234; + st.local.v2.u32 [%rd2+168], {%r14110, %r14111}; + // begin inline asm + shf.l.wrap.b32 %r14041, %r13950, %r13949, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14045, %r13949, %r13950, %r14201; + // end inline asm + xor.b32 %r14236, %r14041, %r13985; + xor.b32 %r14237, %r14045, %r13986; + xor.b32 %r14062, %r30300, %r14236; + xor.b32 %r14063, %r30301, %r14237; + xor.b32 %r14087, %r30295, %r14237; + xor.b32 %r14086, %r30294, %r14236; + st.local.v2.u32 [%rd2+216], {%r14086, %r14087}; + // begin inline asm + shf.l.wrap.b32 %r14049, %r14055, %r14054, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14053, %r14054, %r14055, %r13552; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14057, %r14063, %r14062, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14061, %r14062, %r14063, %r13560; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14069, %r14070, %r14071, %r13568; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14065, %r14071, %r14070, %r13568; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r14065, %r14069}; + // begin inline asm + shf.l.wrap.b32 %r14073, %r14079, %r14078, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14077, %r14078, %r14079, %r13600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14081, %r14087, %r14086, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14085, %r14086, %r14087, %r13648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14093, %r14094, %r14095, %r13672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14089, %r14095, %r14094, %r13672; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r14089, %r14093}; + // begin inline asm + shf.l.wrap.b32 
%r14097, %r14103, %r14102, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14101, %r14102, %r14103, %r13688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14105, %r14111, %r14110, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14109, %r14110, %r14111, %r13696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14113, %r14119, %r14118, %r13728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14117, %r14118, %r14119, %r13728; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14121, %r14156, %r14049, %r14073, 0xD2; + lop3.b32 %r14122, %r14159, %r14053, %r14077, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30477, %r14049, %r14073, %r14105, 0xD2; + lop3.b32 %r30478, %r14053, %r14077, %r14109, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30477, %r30478}; + // begin inline asm + // chi + lop3.b32 %r30473, %r14073, %r14105, %r14081, 0xD2; + lop3.b32 %r30474, %r14077, %r14109, %r14085, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30473, %r30474}; + // begin inline asm + // chi + lop3.b32 %r30469, %r14105, %r14081, %r14156, 0xD2; + lop3.b32 %r30470, %r14109, %r14085, %r14159, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30469, %r30470}; + // begin inline asm + // chi + lop3.b32 %r30467, %r14081, %r14156, %r14049, 0xD2; + lop3.b32 %r30468, %r14085, %r14159, %r14053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30467, %r30468}; + // begin inline asm + // chi + lop3.b32 %r30463, %r14097, %r14057, %r14113, 0xD2; + lop3.b32 %r30464, %r14101, %r14061, %r14117, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30463, %r30464}; + // begin inline asm + // chi + lop3.b32 %r30475, %r14057, %r14113, %r14089, 0xD2; + lop3.b32 %r30476, %r14061, %r14117, %r14093, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30475, %r30476}; + // begin inline asm + // chi + lop3.b32 %r30471, %r14113, %r14089, %r14065, 0xD2; + lop3.b32 %r30472, %r14117, %r14093, %r14069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30471, %r30472}; + add.s64 %rd690, %rd689, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r14185,%r14186}, [%rd690]; + // end inline asm + xor.b32 %r30465, %r14121, %r14185; + xor.b32 %r30466, %r14122, %r14186; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + st.local.u64 [%rd147], %rd361; + mov.u64 %rd694, 1179641; + st.local.u64 [%rd147+8], %rd694; + add.s32 %r1891, %r1695, 1; + st.local.u32 [%rd147+16], %r1891; + ld.global.u64 %rd695, [%rd127]; + ld.global.u64 %rd696, [%rd127+8]; + ld.global.u64 %rd697, [%rd127+16]; + ld.global.u64 %rd698, [%rd127+24]; + ld.global.u64 %rd699, [%rd127+32]; + ld.global.u64 %rd700, [%rd127+40]; + ld.global.u64 %rd701, [%rd127+48]; + ld.global.u64 %rd702, [%rd127+56]; + st.local.u64 [%rd147+32], %rd696; + st.local.u64 [%rd147+40], %rd697; + st.local.u64 [%rd147+48], %rd698; + st.local.u64 [%rd147+56], %rd699; + st.local.u64 [%rd147+64], %rd700; + st.local.u64 [%rd147+72], %rd701; + st.local.u64 [%rd147+80], %rd702; + cvt.u32.u64 %r14238, %rd695; + xor.b32 %r14239, %r1891, %r14238; + st.local.u64 [%rd147+24], %rd695; + st.local.u32 [%rd147+24], %r14239; + mov.u32 %r30345, 0; + st.local.v2.u32 [%rd147+96], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+104], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+112], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+120], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+128], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+136], {%r30345, %r30345}; 
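+ // Second absorb: this repeats the $L__BB2_39 buffer setup at [%rd147] with
+ // the 32-bit counter bumped to %r1695+1 and xored into lane 0's low word;
+ // the zero-fill continues below and [%rd147+88] receives {1, 0x80000000},
+ // i.e. the lane value 0x8000000000000001, consistent with Keccak's pad10*1
+ // padding, before the round loop at $L__BB2_42.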
+ st.local.v2.u32 [%rd147+144], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+152], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+160], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+168], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+176], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+184], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+192], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+200], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+208], {%r30345, %r30345}; + st.local.v2.u32 [%rd147+216], {%r30345, %r30345}; + mov.u32 %r30360, -2147483648; + st.local.v2.u32 [%rd147+88], {%r14201, %r30360}; + ld.local.v2.u32 {%r30381, %r30382}, [%rd147+24]; + mov.b64 {%r30379, %r30380}, %rd700; + shr.u64 %rd703, %rd696, 32; + cvt.u32.u64 %r30393, %rd696; + cvt.u32.u64 %r30394, %rd703; + shr.u64 %rd704, %rd701, 32; + cvt.u32.u64 %r30391, %rd701; + cvt.u32.u64 %r30392, %rd704; + shr.u64 %rd705, %rd697, 32; + cvt.u32.u64 %r30389, %rd697; + cvt.u32.u64 %r30390, %rd705; + shr.u64 %rd706, %rd702, 32; + cvt.u32.u64 %r30387, %rd702; + cvt.u32.u64 %r30388, %rd706; + shr.u64 %rd707, %rd698, 32; + cvt.u32.u64 %r30385, %rd698; + cvt.u32.u64 %r30386, %rd707; + shr.u64 %rd708, %rd699, 32; + cvt.u32.u64 %r30383, %rd699; + cvt.u32.u64 %r30384, %rd708; + mov.u32 %r30346, %r30345; + mov.u32 %r30347, %r30345; + mov.u32 %r30348, %r30345; + mov.u32 %r30349, %r30345; + mov.u32 %r30350, %r30345; + mov.u32 %r30351, %r30345; + mov.u32 %r30352, %r30345; + mov.u32 %r30353, %r30345; + mov.u32 %r30354, %r30345; + mov.u32 %r30355, %r30345; + mov.u32 %r30356, %r30345; + mov.u32 %r30357, %r30345; + mov.u32 %r30358, %r30345; + mov.u32 %r30359, %r14201; + mov.u32 %r30361, %r30345; + mov.u32 %r30362, %r30345; + mov.u32 %r30363, %r30345; + mov.u32 %r30364, %r30345; + mov.u32 %r30365, %r30345; + mov.u32 %r30366, %r30345; + mov.u32 %r30367, %r30345; + mov.u32 %r30368, %r30345; + mov.u32 %r30369, %r30345; + mov.u32 %r30370, %r30345; + mov.u32 %r30371, %r30345; + mov.u32 %r30372, %r30345; + mov.u32 %r30373, %r30345; + mov.u32 %r30374, %r30345; + mov.u32 %r30375, %r30345; + mov.u32 %r30376, %r30345; + mov.u32 %r30377, %r30345; + mov.u32 %r30378, %r30345; + mov.u32 %r30395, %r30345; + +$L__BB2_42: + // begin inline asm + // xor5 + lop3.b32 %r14242, %r30381, %r30379, %r30377, 0x96; + lop3.b32 %r14242, %r14242, %r30375, %r30373, 0x96; + lop3.b32 %r14243, %r30382, %r30380, %r30378, 0x96; + lop3.b32 %r14243, %r14243, %r30376, %r30374, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14254, %r30393, %r30391, %r30371, 0x96; + lop3.b32 %r14254, %r14254, %r30369, %r30367, 0x96; + lop3.b32 %r14255, %r30394, %r30392, %r30372, 0x96; + lop3.b32 %r14255, %r14255, %r30370, %r30368, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14266, %r30389, %r30387, %r30365, 0x96; + lop3.b32 %r14266, %r14266, %r30363, %r30361, 0x96; + lop3.b32 %r14267, %r30390, %r30388, %r30366, 0x96; + lop3.b32 %r14267, %r14267, %r30364, %r30362, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14278, %r30385, %r30359, %r30357, 0x96; + lop3.b32 %r14278, %r14278, %r30355, %r30353, 0x96; + lop3.b32 %r14279, %r30386, %r30360, %r30358, 0x96; + lop3.b32 %r14279, %r14279, %r30356, %r30354, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14290, %r30383, %r30351, %r30349, 0x96; + lop3.b32 %r14290, %r14290, %r30347, %r30345, 0x96; + lop3.b32 %r14291, %r30384, %r30352, %r30350, 0x96; + lop3.b32 %r14291, %r14291, %r30348, %r30346, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14302, 
%r14255, %r14254, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14306, %r14254, %r14255, %r14201; + // end inline asm + xor.b32 %r14736, %r14302, %r14290; + xor.b32 %r14737, %r14306, %r14291; + xor.b32 %r14569, %r30381, %r14736; + xor.b32 %r14572, %r30382, %r14737; + xor.b32 %r14476, %r30379, %r14736; + xor.b32 %r14475, %r30380, %r14737; + xor.b32 %r14523, %r30377, %r14736; + xor.b32 %r14524, %r30378, %r14737; + xor.b32 %r14428, %r30375, %r14736; + xor.b32 %r14427, %r30376, %r14737; + xor.b32 %r14379, %r30373, %r14736; + xor.b32 %r14380, %r30374, %r14737; + // begin inline asm + shf.l.wrap.b32 %r14310, %r14267, %r14266, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14314, %r14266, %r14267, %r14201; + // end inline asm + xor.b32 %r14738, %r14310, %r14242; + xor.b32 %r14739, %r14314, %r14243; + xor.b32 %r14531, %r30393, %r14738; + xor.b32 %r14532, %r30394, %r14739; + xor.b32 %r14348, %r30391, %r14738; + xor.b32 %r14347, %r30392, %r14739; + xor.b32 %r14507, %r30371, %r14738; + xor.b32 %r14508, %r30372, %r14739; + xor.b32 %r14468, %r30369, %r14738; + xor.b32 %r14467, %r30370, %r14739; + xor.b32 %r14451, %r30367, %r14738; + xor.b32 %r14452, %r30368, %r14739; + // begin inline asm + shf.l.wrap.b32 %r14318, %r14279, %r14278, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14322, %r14278, %r14279, %r14201; + // end inline asm + xor.b32 %r14740, %r14318, %r14254; + xor.b32 %r14741, %r14322, %r14255; + xor.b32 %r14388, %r30389, %r14740; + xor.b32 %r14387, %r30390, %r14741; + xor.b32 %r14515, %r30387, %r14740; + xor.b32 %r14516, %r30388, %r14741; + xor.b32 %r14396, %r30365, %r14740; + xor.b32 %r14395, %r30366, %r14741; + xor.b32 %r14499, %r30363, %r14740; + xor.b32 %r14500, %r30364, %r14741; + xor.b32 %r14364, %r30361, %r14740; + xor.b32 %r14363, %r30362, %r14741; + // begin inline asm + shf.l.wrap.b32 %r14326, %r14291, %r14290, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14330, %r14290, %r14291, %r14201; + // end inline asm + xor.b32 %r14742, %r14326, %r14266; + xor.b32 %r14743, %r14330, %r14267; + xor.b32 %r14483, %r30385, %r14742; + xor.b32 %r14484, %r30386, %r14743; + xor.b32 %r14460, %r30359, %r14742; + xor.b32 %r14459, %r30360, %r14743; + xor.b32 %r14403, %r30357, %r14742; + xor.b32 %r14404, %r30358, %r14743; + xor.b32 %r14491, %r30355, %r14742; + xor.b32 %r14492, %r30356, %r14743; + xor.b32 %r14420, %r30353, %r14742; + xor.b32 %r14419, %r30354, %r14743; + // begin inline asm + shf.l.wrap.b32 %r14334, %r14243, %r14242, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14338, %r14242, %r14243, %r14201; + // end inline asm + xor.b32 %r14744, %r14334, %r14278; + xor.b32 %r14745, %r14338, %r14279; + xor.b32 %r14435, %r30383, %r14744; + xor.b32 %r14436, %r30384, %r14745; + xor.b32 %r14355, %r30351, %r14744; + xor.b32 %r14356, %r30352, %r14745; + xor.b32 %r14372, %r30349, %r14744; + xor.b32 %r14371, %r30350, %r14745; + xor.b32 %r14411, %r30347, %r14744; + xor.b32 %r14412, %r30348, %r14745; + xor.b32 %r14443, %r30345, %r14744; + xor.b32 %r14444, %r30346, %r14745; + mov.u32 %r14349, 44; + // begin inline asm + shf.l.wrap.b32 %r14342, %r14348, %r14347, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14346, %r14347, %r14348, %r14349; + // end inline asm + mov.u32 %r14357, 20; + // begin inline asm + shf.l.wrap.b32 %r14350, %r14356, %r14355, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14354, %r14355, %r14356, %r14357; + // end inline asm + 
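+ // The rotation immediates loaded for these funnel-shift pairs (44, 20, 61,
+ // 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6,
+ // 3, plus the rotate-by-1 via %r14201) are the Keccak rho offsets of the
+ // 24 rotated lanes.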
mov.u32 %r14365, 61; + // begin inline asm + shf.l.wrap.b32 %r14358, %r14364, %r14363, %r14365; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14362, %r14363, %r14364, %r14365; + // end inline asm + mov.u32 %r14373, 39; + // begin inline asm + shf.l.wrap.b32 %r14366, %r14372, %r14371, %r14373; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14370, %r14371, %r14372, %r14373; + // end inline asm + mov.u32 %r14381, 18; + // begin inline asm + shf.l.wrap.b32 %r14374, %r14380, %r14379, %r14381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14378, %r14379, %r14380, %r14381; + // end inline asm + mov.u32 %r14389, 62; + // begin inline asm + shf.l.wrap.b32 %r14382, %r14388, %r14387, %r14389; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14386, %r14387, %r14388, %r14389; + // end inline asm + mov.u32 %r14397, 43; + // begin inline asm + shf.l.wrap.b32 %r14390, %r14396, %r14395, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14394, %r14395, %r14396, %r14397; + // end inline asm + mov.u32 %r14405, 25; + // begin inline asm + shf.l.wrap.b32 %r14398, %r14404, %r14403, %r14405; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14402, %r14403, %r14404, %r14405; + // end inline asm + mov.u32 %r14413, 8; + // begin inline asm + shf.l.wrap.b32 %r14406, %r14412, %r14411, %r14413; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14410, %r14411, %r14412, %r14413; + // end inline asm + mov.u32 %r14421, 56; + // begin inline asm + shf.l.wrap.b32 %r14414, %r14420, %r14419, %r14421; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14418, %r14419, %r14420, %r14421; + // end inline asm + mov.u32 %r14429, 41; + // begin inline asm + shf.l.wrap.b32 %r14422, %r14428, %r14427, %r14429; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14426, %r14427, %r14428, %r14429; + // end inline asm + mov.u32 %r14437, 27; + // begin inline asm + shf.l.wrap.b32 %r14430, %r14436, %r14435, %r14437; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14434, %r14435, %r14436, %r14437; + // end inline asm + mov.u32 %r14445, 14; + // begin inline asm + shf.l.wrap.b32 %r14438, %r14444, %r14443, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14442, %r14443, %r14444, %r14445; + // end inline asm + mov.u32 %r14453, 2; + // begin inline asm + shf.l.wrap.b32 %r14446, %r14452, %r14451, %r14453; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14450, %r14451, %r14452, %r14453; + // end inline asm + mov.u32 %r14461, 55; + // begin inline asm + shf.l.wrap.b32 %r14454, %r14460, %r14459, %r14461; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14458, %r14459, %r14460, %r14461; + // end inline asm + mov.u32 %r14469, 45; + // begin inline asm + shf.l.wrap.b32 %r14462, %r14468, %r14467, %r14469; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14466, %r14467, %r14468, %r14469; + // end inline asm + mov.u32 %r14477, 36; + // begin inline asm + shf.l.wrap.b32 %r14470, %r14476, %r14475, %r14477; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14474, %r14475, %r14476, %r14477; + // end inline asm + mov.u32 %r14485, 28; + // begin inline asm + shf.l.wrap.b32 %r14478, %r14484, %r14483, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14482, %r14483, %r14484, %r14485; + // end inline asm + mov.u32 %r14493, 21; + // begin inline asm + shf.l.wrap.b32 %r14486, %r14492, %r14491, %r14493; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14490, %r14491, %r14492, %r14493; + // end inline asm + mov.u32 %r14501, 15; + // begin inline asm + shf.l.wrap.b32 %r14494, %r14500, %r14499, %r14501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14498, %r14499, %r14500, %r14501; + // end inline asm + mov.u32 %r14509, 10; + // begin inline asm + shf.l.wrap.b32 %r14502, %r14508, %r14507, %r14509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14506, %r14507, %r14508, %r14509; + // end inline asm + mov.u32 %r14517, 6; + // begin inline asm + shf.l.wrap.b32 %r14510, %r14516, %r14515, %r14517; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14514, %r14515, %r14516, %r14517; + // end inline asm + mov.u32 %r14525, 3; + // begin inline asm + shf.l.wrap.b32 %r14518, %r14524, %r14523, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14522, %r14523, %r14524, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14526, %r14532, %r14531, %r14201; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14530, %r14531, %r14532, %r14201; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14534, %r14569, %r14342, %r14390, 0xD2; + lop3.b32 %r14535, %r14572, %r14346, %r14394, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30393, %r14342, %r14390, %r14486, 0xD2; + lop3.b32 %r30394, %r14346, %r14394, %r14490, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30389, %r14390, %r14486, %r14438, 0xD2; + lop3.b32 %r30390, %r14394, %r14490, %r14442, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30385, %r14486, %r14438, %r14569, 0xD2; + lop3.b32 %r30386, %r14490, %r14442, %r14572, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30383, %r14438, %r14569, %r14342, 0xD2; + lop3.b32 %r30384, %r14442, %r14572, %r14346, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30379, %r14478, %r14350, %r14518, 0xD2; + lop3.b32 %r30380, %r14482, %r14354, %r14522, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30391, %r14350, %r14518, %r14462, 0xD2; + lop3.b32 %r30392, %r14354, %r14522, %r14466, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30387, %r14518, %r14462, %r14358, 0xD2; + lop3.b32 %r30388, %r14522, %r14466, %r14362, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30359, %r14462, %r14358, %r14478, 0xD2; + lop3.b32 %r30360, %r14466, %r14362, %r14482, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r30359, %r30360}; + // begin inline asm + // chi + lop3.b32 %r30351, %r14358, %r14478, %r14350, 0xD2; + lop3.b32 %r30352, %r14362, %r14482, %r14354, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r30351, %r30352}; + // begin inline asm + // chi + lop3.b32 %r30377, %r14526, %r14510, %r14398, 0xD2; + lop3.b32 %r30378, %r14530, %r14514, %r14402, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+104], {%r30377, %r30378}; + // begin inline asm + // chi + lop3.b32 %r30371, %r14510, %r14398, %r14406, 0xD2; + lop3.b32 %r30372, %r14514, %r14402, %r14410, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+112], {%r30371, %r30372}; + // begin inline asm + // chi + lop3.b32 %r30365, %r14398, %r14406, %r14374, 0xD2; + lop3.b32 %r30366, %r14402, %r14410, %r14378, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+120], {%r30365, %r30366}; + // begin inline asm + // chi + lop3.b32 %r30357, %r14406, %r14374, %r14526, 0xD2; + lop3.b32 %r30358, %r14410, %r14378, %r14530, 0xD2; + // 
end inline asm + st.local.v2.u32 [%rd147+128], {%r30357, %r30358}; + // begin inline asm + // chi + lop3.b32 %r30349, %r14374, %r14526, %r14510, 0xD2; + lop3.b32 %r30350, %r14378, %r14530, %r14514, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+136], {%r30349, %r30350}; + // begin inline asm + // chi + lop3.b32 %r30375, %r14430, %r14470, %r14502, 0xD2; + lop3.b32 %r30376, %r14434, %r14474, %r14506, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+144], {%r30375, %r30376}; + // begin inline asm + // chi + lop3.b32 %r30369, %r14470, %r14502, %r14494, 0xD2; + lop3.b32 %r30370, %r14474, %r14506, %r14498, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+152], {%r30369, %r30370}; + // begin inline asm + // chi + lop3.b32 %r30363, %r14502, %r14494, %r14414, 0xD2; + lop3.b32 %r30364, %r14506, %r14498, %r14418, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+160], {%r30363, %r30364}; + // begin inline asm + // chi + lop3.b32 %r30355, %r14494, %r14414, %r14430, 0xD2; + lop3.b32 %r30356, %r14498, %r14418, %r14434, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+168], {%r30355, %r30356}; + // begin inline asm + // chi + lop3.b32 %r30347, %r14414, %r14430, %r14470, 0xD2; + lop3.b32 %r30348, %r14418, %r14434, %r14474, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+176], {%r30347, %r30348}; + // begin inline asm + // chi + lop3.b32 %r30373, %r14382, %r14454, %r14366, 0xD2; + lop3.b32 %r30374, %r14386, %r14458, %r14370, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+184], {%r30373, %r30374}; + // begin inline asm + // chi + lop3.b32 %r30367, %r14454, %r14366, %r14422, 0xD2; + lop3.b32 %r30368, %r14458, %r14370, %r14426, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+192], {%r30367, %r30368}; + // begin inline asm + // chi + lop3.b32 %r30361, %r14366, %r14422, %r14446, 0xD2; + lop3.b32 %r30362, %r14370, %r14426, %r14450, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+200], {%r30361, %r30362}; + // begin inline asm + // chi + lop3.b32 %r30353, %r14422, %r14446, %r14382, 0xD2; + lop3.b32 %r30354, %r14426, %r14450, %r14386, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+208], {%r30353, %r30354}; + // begin inline asm + // chi + lop3.b32 %r30345, %r14446, %r14382, %r14454, 0xD2; + lop3.b32 %r30346, %r14450, %r14386, %r14458, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+216], {%r30345, %r30346}; + mul.wide.s32 %rd710, %r30395, 8; + add.s64 %rd709, %rd689, %rd710; + // begin inline asm + ld.global.nc.v2.u32 {%r14734,%r14735}, [%rd709]; + // end inline asm + xor.b32 %r30381, %r14534, %r14734; + xor.b32 %r30382, %r14535, %r14735; + add.s32 %r30395, %r30395, 1; + setp.lt.u32 %p27, %r30395, 23; + @%p27 bra $L__BB2_42; + + mov.u32 %r30428, 0; + mov.u32 %r14845, 1; + st.local.v2.u32 [%rd147+32], {%r30393, %r30394}; + st.local.v2.u32 [%rd147+72], {%r30391, %r30392}; + st.local.v2.u32 [%rd147+40], {%r30389, %r30390}; + st.local.v2.u32 [%rd147+80], {%r30387, %r30388}; + st.local.v2.u32 [%rd147+48], {%r30385, %r30386}; + st.local.v2.u32 [%rd147+56], {%r30383, %r30384}; + st.local.v2.u32 [%rd147+24], {%r30381, %r30382}; + // begin inline asm + // xor5 + lop3.b32 %r14746, %r30381, %r30379, %r30377, 0x96; + lop3.b32 %r14746, %r14746, %r30375, %r30373, 0x96; + lop3.b32 %r14747, %r30382, %r30380, %r30378, 0x96; + lop3.b32 %r14747, %r14747, %r30376, %r30374, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14758, %r30393, %r30391, %r30371, 0x96; + lop3.b32 %r14758, %r14758, %r30369, %r30367, 0x96; + lop3.b32 %r14759, %r30394, %r30392, %r30372, 0x96; + 
lop3.b32 %r14759, %r14759, %r30370, %r30368, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14770, %r30389, %r30387, %r30365, 0x96; + lop3.b32 %r14770, %r14770, %r30363, %r30361, 0x96; + lop3.b32 %r14771, %r30390, %r30388, %r30366, 0x96; + lop3.b32 %r14771, %r14771, %r30364, %r30362, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14782, %r30385, %r30359, %r30357, 0x96; + lop3.b32 %r14782, %r14782, %r30355, %r30353, 0x96; + lop3.b32 %r14783, %r30386, %r30360, %r30358, 0x96; + lop3.b32 %r14783, %r14783, %r30356, %r30354, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14794, %r30383, %r30351, %r30349, 0x96; + lop3.b32 %r14794, %r14794, %r30347, %r30345, 0x96; + lop3.b32 %r14795, %r30384, %r30352, %r30350, 0x96; + lop3.b32 %r14795, %r14795, %r30348, %r30346, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14806, %r14759, %r14758, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14810, %r14758, %r14759, %r14845; + // end inline asm + xor.b32 %r14985, %r14806, %r14794; + xor.b32 %r14986, %r14810, %r14795; + xor.b32 %r14953, %r30381, %r14985; + xor.b32 %r14956, %r30382, %r14986; + xor.b32 %r14916, %r30378, %r14986; + xor.b32 %r14915, %r30377, %r14985; + st.local.v2.u32 [%rd147+104], {%r14915, %r14916}; + // begin inline asm + shf.l.wrap.b32 %r14814, %r14771, %r14770, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14818, %r14770, %r14771, %r14845; + // end inline asm + xor.b32 %r14987, %r14814, %r14746; + xor.b32 %r14988, %r14818, %r14747; + xor.b32 %r14852, %r30391, %r14987; + xor.b32 %r14851, %r30392, %r14988; + xor.b32 %r14891, %r30370, %r14988; + xor.b32 %r14892, %r30369, %r14987; + st.local.v2.u32 [%rd147+152], {%r14892, %r14891}; + // begin inline asm + shf.l.wrap.b32 %r14822, %r14783, %r14782, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14826, %r14782, %r14783, %r14845; + // end inline asm + xor.b32 %r14989, %r14822, %r14758; + xor.b32 %r14990, %r14826, %r14759; + xor.b32 %r14875, %r30366, %r14990; + xor.b32 %r14876, %r30365, %r14989; + st.local.v2.u32 [%rd147+120], {%r14876, %r14875}; + xor.b32 %r14867, %r30362, %r14990; + xor.b32 %r14868, %r30361, %r14989; + st.local.v2.u32 [%rd147+200], {%r14868, %r14867}; + // begin inline asm + shf.l.wrap.b32 %r14830, %r14795, %r14794, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14834, %r14794, %r14795, %r14845; + // end inline asm + xor.b32 %r14991, %r14830, %r14770; + xor.b32 %r14992, %r14834, %r14771; + xor.b32 %r14899, %r30385, %r14991; + xor.b32 %r14900, %r30386, %r14992; + xor.b32 %r14908, %r30356, %r14992; + xor.b32 %r14907, %r30355, %r14991; + st.local.v2.u32 [%rd147+168], {%r14907, %r14908}; + // begin inline asm + shf.l.wrap.b32 %r14838, %r14747, %r14746, %r14845; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14842, %r14746, %r14747, %r14845; + // end inline asm + xor.b32 %r14993, %r14838, %r14782; + xor.b32 %r14994, %r14842, %r14783; + xor.b32 %r14859, %r30351, %r14993; + xor.b32 %r14860, %r30352, %r14994; + xor.b32 %r14884, %r30346, %r14994; + xor.b32 %r14883, %r30345, %r14993; + st.local.v2.u32 [%rd147+216], {%r14883, %r14884}; + // begin inline asm + shf.l.wrap.b32 %r14846, %r14852, %r14851, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14850, %r14851, %r14852, %r14349; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14854, %r14860, %r14859, %r14357; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14858, %r14859, %r14860, %r14357; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14866, %r14867, %r14868, %r14365; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14862, %r14868, %r14867, %r14365; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r14862, %r14866}; + // begin inline asm + shf.l.wrap.b32 %r14870, %r14876, %r14875, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14874, %r14875, %r14876, %r14397; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14878, %r14884, %r14883, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14882, %r14883, %r14884, %r14445; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14890, %r14891, %r14892, %r14469; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14886, %r14892, %r14891, %r14469; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r14886, %r14890}; + // begin inline asm + shf.l.wrap.b32 %r14894, %r14900, %r14899, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14898, %r14899, %r14900, %r14485; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14902, %r14908, %r14907, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14906, %r14907, %r14908, %r14493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14910, %r14916, %r14915, %r14525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14914, %r14915, %r14916, %r14525; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14918, %r14953, %r14846, %r14870, 0xD2; + lop3.b32 %r14919, %r14956, %r14850, %r14874, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30528, %r14846, %r14870, %r14902, 0xD2; + lop3.b32 %r30529, %r14850, %r14874, %r14906, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+32], {%r30528, %r30529}; + // begin inline asm + // chi + lop3.b32 %r30524, %r14870, %r14902, %r14878, 0xD2; + lop3.b32 %r30525, %r14874, %r14906, %r14882, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+40], {%r30524, %r30525}; + // begin inline asm + // chi + lop3.b32 %r30520, %r14902, %r14878, %r14953, 0xD2; + lop3.b32 %r30521, %r14906, %r14882, %r14956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+48], {%r30520, %r30521}; + // begin inline asm + // chi + lop3.b32 %r30518, %r14878, %r14953, %r14846, 0xD2; + lop3.b32 %r30519, %r14882, %r14956, %r14850, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+56], {%r30518, %r30519}; + // begin inline asm + // chi + lop3.b32 %r30514, %r14894, %r14854, %r14910, 0xD2; + lop3.b32 %r30515, %r14898, %r14858, %r14914, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+64], {%r30514, %r30515}; + // begin inline asm + // chi + lop3.b32 %r30526, %r14854, %r14910, %r14886, 0xD2; + lop3.b32 %r30527, %r14858, %r14914, %r14890, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+72], {%r30526, %r30527}; + // begin inline asm + // chi + lop3.b32 %r30522, %r14910, %r14886, %r14862, 0xD2; + lop3.b32 %r30523, %r14914, %r14890, %r14866, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+80], {%r30522, %r30523}; + // begin inline asm + ld.global.nc.v2.u32 {%r14982,%r14983}, [%rd690]; + // end inline asm + xor.b32 %r30516, %r14918, %r14982; + xor.b32 %r30517, %r14919, %r14983; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + add.s64 %rd149, %rd2, 24; + add.s64 %rd150, %rd147, 24; + +$L__BB2_44: + cvta.to.global.u64 %rd1268, %rd361; + shl.b32 %r14995, %r30428, 2; + cvt.u64.u32 %rd720, %r14995; + and.b64 %rd721, %rd720, 60; + add.s64 %rd722, 
%rd149, %rd721; + xor.b32 %r14996, %r1695, %r30428; + mul.lo.s32 %r14997, %r14996, 16777619; + ld.local.u32 %r14998, [%rd722]; + xor.b32 %r14999, %r14997, %r14998; + mul.wide.u32 %rd723, %r14999, -954391867; + shr.u64 %rd724, %rd723, 32; + cvt.u32.u64 %r15000, %rd724; + sub.s32 %r15001, %r14999, %r15000; + shr.u32 %r15002, %r15001, 1; + add.s32 %r15003, %r15002, %r15000; + shr.u32 %r15004, %r15003, 20; + mul.lo.s32 %r15005, %r15004, 1179641; + sub.s32 %r15006, %r14999, %r15005; + mul.wide.u32 %rd725, %r15006, 64; + add.s64 %rd726, %rd1268, %rd725; + mul.lo.s32 %r15007, %r30465, 16777619; + ld.global.u32 %r15008, [%rd726]; + xor.b32 %r30465, %r15007, %r15008; + mul.lo.s32 %r15009, %r30466, 16777619; + ld.global.u32 %r15010, [%rd726+4]; + xor.b32 %r30466, %r15009, %r15010; + mul.lo.s32 %r15011, %r30477, 16777619; + ld.global.u32 %r15012, [%rd726+8]; + mul.lo.s32 %r15013, %r30478, 16777619; + ld.global.u32 %r15014, [%rd726+12]; + xor.b32 %r15015, %r15013, %r15014; + xor.b32 %r30477, %r15011, %r15012; + mov.b64 %rd727, {%r30477, %r15015}; + mul.lo.s32 %r15016, %r30473, 16777619; + ld.global.u32 %r15017, [%rd726+16]; + mul.lo.s32 %r15018, %r30474, 16777619; + ld.global.u32 %r15019, [%rd726+20]; + xor.b32 %r15020, %r15018, %r15019; + xor.b32 %r30473, %r15016, %r15017; + mov.b64 %rd728, {%r30473, %r15020}; + mul.lo.s32 %r15021, %r30469, 16777619; + ld.global.u32 %r15022, [%rd726+24]; + mul.lo.s32 %r15023, %r30470, 16777619; + ld.global.u32 %r15024, [%rd726+28]; + xor.b32 %r15025, %r15023, %r15024; + xor.b32 %r30469, %r15021, %r15022; + mov.b64 %rd729, {%r30469, %r15025}; + mul.lo.s32 %r15026, %r30467, 16777619; + ld.global.u32 %r15027, [%rd726+32]; + mul.lo.s32 %r15028, %r30468, 16777619; + ld.global.u32 %r15029, [%rd726+36]; + xor.b32 %r15030, %r15028, %r15029; + xor.b32 %r30467, %r15026, %r15027; + mov.b64 %rd730, {%r30467, %r15030}; + mul.lo.s32 %r15031, %r30463, 16777619; + ld.global.u32 %r15032, [%rd726+40]; + xor.b32 %r30463, %r15031, %r15032; + mul.lo.s32 %r15033, %r30464, 16777619; + ld.global.u32 %r15034, [%rd726+44]; + xor.b32 %r30464, %r15033, %r15034; + mul.lo.s32 %r15035, %r30475, 16777619; + ld.global.u32 %r15036, [%rd726+48]; + mul.lo.s32 %r15037, %r30476, 16777619; + ld.global.u32 %r15038, [%rd726+52]; + xor.b32 %r15039, %r15037, %r15038; + xor.b32 %r30475, %r15035, %r15036; + mov.b64 %rd731, {%r30475, %r15039}; + mul.lo.s32 %r15040, %r30471, 16777619; + ld.global.u32 %r15041, [%rd726+56]; + mul.lo.s32 %r15042, %r30472, 16777619; + ld.global.u32 %r15043, [%rd726+60]; + xor.b32 %r15044, %r15042, %r15043; + xor.b32 %r30471, %r15040, %r15041; + mov.b64 %rd732, {%r30471, %r15044}; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + st.local.v2.u32 [%rd2+32], {%r30477, %r15015}; + st.local.v2.u32 [%rd2+40], {%r30473, %r15020}; + st.local.v2.u32 [%rd2+48], {%r30469, %r15025}; + st.local.v2.u32 [%rd2+56], {%r30467, %r15030}; + st.local.v2.u32 [%rd2+64], {%r30463, %r30464}; + st.local.v2.u32 [%rd2+72], {%r30475, %r15039}; + st.local.v2.u32 [%rd2+80], {%r30471, %r15044}; + add.s64 %rd733, %rd150, %rd721; + xor.b32 %r15045, %r1891, %r30428; + mul.lo.s32 %r15046, %r15045, 16777619; + ld.local.u32 %r15047, [%rd733]; + xor.b32 %r15048, %r15046, %r15047; + mul.wide.u32 %rd734, %r15048, -954391867; + shr.u64 %rd735, %rd734, 32; + cvt.u32.u64 %r15049, %rd735; + sub.s32 %r15050, %r15048, %r15049; + shr.u32 %r15051, %r15050, 1; + add.s32 %r15052, %r15051, %r15049; + shr.u32 %r15053, %r15052, 20; + mul.lo.s32 %r15054, %r15053, 1179641; + sub.s32 %r15055, %r15048, %r15054; + mul.wide.u32 %rd736, 
%r15055, 64; + add.s64 %rd737, %rd1268, %rd736; + mul.lo.s32 %r15056, %r30516, 16777619; + ld.global.u32 %r15057, [%rd737]; + xor.b32 %r30516, %r15056, %r15057; + mul.lo.s32 %r15058, %r30517, 16777619; + ld.global.u32 %r15059, [%rd737+4]; + xor.b32 %r30517, %r15058, %r15059; + mul.lo.s32 %r15060, %r30528, 16777619; + ld.global.u32 %r15061, [%rd737+8]; + mul.lo.s32 %r15062, %r30529, 16777619; + ld.global.u32 %r15063, [%rd737+12]; + xor.b32 %r15064, %r15062, %r15063; + xor.b32 %r30528, %r15060, %r15061; + mov.b64 %rd738, {%r30528, %r15064}; + mul.lo.s32 %r15065, %r30524, 16777619; + ld.global.u32 %r15066, [%rd737+16]; + mul.lo.s32 %r15067, %r30525, 16777619; + ld.global.u32 %r15068, [%rd737+20]; + xor.b32 %r15069, %r15067, %r15068; + xor.b32 %r30524, %r15065, %r15066; + mov.b64 %rd739, {%r30524, %r15069}; + mul.lo.s32 %r15070, %r30520, 16777619; + ld.global.u32 %r15071, [%rd737+24]; + mul.lo.s32 %r15072, %r30521, 16777619; + ld.global.u32 %r15073, [%rd737+28]; + xor.b32 %r15074, %r15072, %r15073; + xor.b32 %r30520, %r15070, %r15071; + mov.b64 %rd740, {%r30520, %r15074}; + mul.lo.s32 %r15075, %r30518, 16777619; + ld.global.u32 %r15076, [%rd737+32]; + mul.lo.s32 %r15077, %r30519, 16777619; + ld.global.u32 %r15078, [%rd737+36]; + xor.b32 %r15079, %r15077, %r15078; + xor.b32 %r30518, %r15075, %r15076; + mov.b64 %rd741, {%r30518, %r15079}; + mul.lo.s32 %r15080, %r30514, 16777619; + ld.global.u32 %r15081, [%rd737+40]; + xor.b32 %r30514, %r15080, %r15081; + mul.lo.s32 %r15082, %r30515, 16777619; + ld.global.u32 %r15083, [%rd737+44]; + xor.b32 %r30515, %r15082, %r15083; + mul.lo.s32 %r15084, %r30526, 16777619; + ld.global.u32 %r15085, [%rd737+48]; + mul.lo.s32 %r15086, %r30527, 16777619; + ld.global.u32 %r15087, [%rd737+52]; + xor.b32 %r15088, %r15086, %r15087; + xor.b32 %r30526, %r15084, %r15085; + mov.b64 %rd742, {%r30526, %r15088}; + mul.lo.s32 %r15089, %r30522, 16777619; + ld.global.u32 %r15090, [%rd737+56]; + mul.lo.s32 %r15091, %r30523, 16777619; + ld.global.u32 %r15092, [%rd737+60]; + xor.b32 %r15093, %r15091, %r15092; + xor.b32 %r30522, %r15089, %r15090; + mov.b64 %rd743, {%r30522, %r15093}; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + st.local.v2.u32 [%rd147+32], {%r30528, %r15064}; + st.local.v2.u32 [%rd147+40], {%r30524, %r15069}; + st.local.v2.u32 [%rd147+48], {%r30520, %r15074}; + st.local.v2.u32 [%rd147+56], {%r30518, %r15079}; + st.local.v2.u32 [%rd147+64], {%r30514, %r30515}; + st.local.v2.u32 [%rd147+72], {%r30526, %r15088}; + st.local.v2.u32 [%rd147+80], {%r30522, %r15093}; + add.s32 %r30428, %r30428, 1; + setp.lt.u32 %p28, %r30428, 512; + shr.u64 %rd744, %rd727, 32; + cvt.u32.u64 %r30478, %rd744; + shr.u64 %rd745, %rd728, 32; + cvt.u32.u64 %r30474, %rd745; + shr.u64 %rd746, %rd729, 32; + cvt.u32.u64 %r30470, %rd746; + shr.u64 %rd747, %rd730, 32; + cvt.u32.u64 %r30468, %rd747; + shr.u64 %rd748, %rd731, 32; + cvt.u32.u64 %r30476, %rd748; + shr.u64 %rd749, %rd732, 32; + cvt.u32.u64 %r30472, %rd749; + shr.u64 %rd750, %rd738, 32; + cvt.u32.u64 %r30529, %rd750; + shr.u64 %rd751, %rd739, 32; + cvt.u32.u64 %r30525, %rd751; + shr.u64 %rd752, %rd740, 32; + cvt.u32.u64 %r30521, %rd752; + shr.u64 %rd753, %rd741, 32; + cvt.u32.u64 %r30519, %rd753; + shr.u64 %rd754, %rd742, 32; + cvt.u32.u64 %r30527, %rd754; + shr.u64 %rd755, %rd743, 32; + cvt.u32.u64 %r30523, %rd755; + @%p28 bra $L__BB2_44; + + mov.u32 %r30429, 0; + st.local.v2.u32 [%rd2+96], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+104], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+112], {%r30429, %r30429}; + st.local.v2.u32 
[%rd2+120], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+128], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+136], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+144], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+152], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+160], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+168], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+176], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+184], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+192], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+200], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+208], {%r30429, %r30429}; + st.local.v2.u32 [%rd2+216], {%r30429, %r30429}; + mov.u32 %r30444, -2147483648; + mov.u32 %r15108, 1; + st.local.v2.u32 [%rd2+88], {%r15108, %r30444}; + mov.u32 %r30430, %r30429; + mov.u32 %r30431, %r30429; + mov.u32 %r30432, %r30429; + mov.u32 %r30433, %r30429; + mov.u32 %r30434, %r30429; + mov.u32 %r30435, %r30429; + mov.u32 %r30436, %r30429; + mov.u32 %r30437, %r30429; + mov.u32 %r30438, %r30429; + mov.u32 %r30439, %r30429; + mov.u32 %r30440, %r30429; + mov.u32 %r30441, %r30429; + mov.u32 %r30442, %r30429; + mov.u32 %r30443, %r15108; + mov.u32 %r30445, %r30429; + mov.u32 %r30446, %r30429; + mov.u32 %r30447, %r30429; + mov.u32 %r30448, %r30429; + mov.u32 %r30449, %r30429; + mov.u32 %r30450, %r30429; + mov.u32 %r30451, %r30429; + mov.u32 %r30452, %r30429; + mov.u32 %r30453, %r30429; + mov.u32 %r30454, %r30429; + mov.u32 %r30455, %r30429; + mov.u32 %r30456, %r30429; + mov.u32 %r30457, %r30429; + mov.u32 %r30458, %r30429; + mov.u32 %r30459, %r30429; + mov.u32 %r30460, %r30429; + mov.u32 %r30461, %r30429; + mov.u32 %r30462, %r30429; + mov.u32 %r30479, %r30429; + +$L__BB2_46: + // begin inline asm + // xor5 + lop3.b32 %r15135, %r30465, %r30463, %r30461, 0x96; + lop3.b32 %r15135, %r15135, %r30459, %r30457, 0x96; + lop3.b32 %r15136, %r30466, %r30464, %r30462, 0x96; + lop3.b32 %r15136, %r15136, %r30460, %r30458, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15147, %r30477, %r30475, %r30455, 0x96; + lop3.b32 %r15147, %r15147, %r30453, %r30451, 0x96; + lop3.b32 %r15148, %r30478, %r30476, %r30456, 0x96; + lop3.b32 %r15148, %r15148, %r30454, %r30452, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15159, %r30473, %r30471, %r30449, 0x96; + lop3.b32 %r15159, %r15159, %r30447, %r30445, 0x96; + lop3.b32 %r15160, %r30474, %r30472, %r30450, 0x96; + lop3.b32 %r15160, %r15160, %r30448, %r30446, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15171, %r30469, %r30443, %r30441, 0x96; + lop3.b32 %r15171, %r15171, %r30439, %r30437, 0x96; + lop3.b32 %r15172, %r30470, %r30444, %r30442, 0x96; + lop3.b32 %r15172, %r15172, %r30440, %r30438, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15183, %r30467, %r30435, %r30433, 0x96; + lop3.b32 %r15183, %r15183, %r30431, %r30429, 0x96; + lop3.b32 %r15184, %r30468, %r30436, %r30434, 0x96; + lop3.b32 %r15184, %r15184, %r30432, %r30430, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15195, %r15148, %r15147, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15199, %r15147, %r15148, %r15108; + // end inline asm + xor.b32 %r15629, %r15195, %r15183; + xor.b32 %r15630, %r15199, %r15184; + xor.b32 %r15462, %r30465, %r15629; + xor.b32 %r15465, %r30466, %r15630; + xor.b32 %r15369, %r30463, %r15629; + xor.b32 %r15368, %r30464, %r15630; + xor.b32 %r15416, %r30461, %r15629; + xor.b32 %r15417, %r30462, %r15630; + xor.b32 %r15321, %r30459, %r15629; + xor.b32 %r15320, %r30460, %r15630; 
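+ // note on $L__BB2_44 above: each of its 512 iterations appears to do an
+ // FNV-style mix, x = (x * 16777619) ^ word (16777619 is the 32-bit FNV
+ // prime), and the mul.wide.u32/shr/sub/add run is a reciprocal-multiply
+ // division whose remainder reduces the index modulo 1179641 to address
+ // 64-byte records -- i.e. a table lookup feeding both sponge states.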
+ xor.b32 %r15272, %r30457, %r15629; + xor.b32 %r15273, %r30458, %r15630; + // begin inline asm + shf.l.wrap.b32 %r15203, %r15160, %r15159, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15207, %r15159, %r15160, %r15108; + // end inline asm + xor.b32 %r15631, %r15203, %r15135; + xor.b32 %r15632, %r15207, %r15136; + xor.b32 %r15424, %r30477, %r15631; + xor.b32 %r15425, %r30478, %r15632; + xor.b32 %r15241, %r30475, %r15631; + xor.b32 %r15240, %r30476, %r15632; + xor.b32 %r15400, %r30455, %r15631; + xor.b32 %r15401, %r30456, %r15632; + xor.b32 %r15361, %r30453, %r15631; + xor.b32 %r15360, %r30454, %r15632; + xor.b32 %r15344, %r30451, %r15631; + xor.b32 %r15345, %r30452, %r15632; + // begin inline asm + shf.l.wrap.b32 %r15211, %r15172, %r15171, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15215, %r15171, %r15172, %r15108; + // end inline asm + xor.b32 %r15633, %r15211, %r15147; + xor.b32 %r15634, %r15215, %r15148; + xor.b32 %r15281, %r30473, %r15633; + xor.b32 %r15280, %r30474, %r15634; + xor.b32 %r15408, %r30471, %r15633; + xor.b32 %r15409, %r30472, %r15634; + xor.b32 %r15289, %r30449, %r15633; + xor.b32 %r15288, %r30450, %r15634; + xor.b32 %r15392, %r30447, %r15633; + xor.b32 %r15393, %r30448, %r15634; + xor.b32 %r15257, %r30445, %r15633; + xor.b32 %r15256, %r30446, %r15634; + // begin inline asm + shf.l.wrap.b32 %r15219, %r15184, %r15183, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15223, %r15183, %r15184, %r15108; + // end inline asm + xor.b32 %r15635, %r15219, %r15159; + xor.b32 %r15636, %r15223, %r15160; + xor.b32 %r15376, %r30469, %r15635; + xor.b32 %r15377, %r30470, %r15636; + xor.b32 %r15353, %r30443, %r15635; + xor.b32 %r15352, %r30444, %r15636; + xor.b32 %r15296, %r30441, %r15635; + xor.b32 %r15297, %r30442, %r15636; + xor.b32 %r15384, %r30439, %r15635; + xor.b32 %r15385, %r30440, %r15636; + xor.b32 %r15313, %r30437, %r15635; + xor.b32 %r15312, %r30438, %r15636; + // begin inline asm + shf.l.wrap.b32 %r15227, %r15136, %r15135, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15231, %r15135, %r15136, %r15108; + // end inline asm + xor.b32 %r15637, %r15227, %r15171; + xor.b32 %r15638, %r15231, %r15172; + xor.b32 %r15328, %r30467, %r15637; + xor.b32 %r15329, %r30468, %r15638; + xor.b32 %r15248, %r30435, %r15637; + xor.b32 %r15249, %r30436, %r15638; + xor.b32 %r15265, %r30433, %r15637; + xor.b32 %r15264, %r30434, %r15638; + xor.b32 %r15304, %r30431, %r15637; + xor.b32 %r15305, %r30432, %r15638; + xor.b32 %r15336, %r30429, %r15637; + xor.b32 %r15337, %r30430, %r15638; + mov.u32 %r15242, 44; + // begin inline asm + shf.l.wrap.b32 %r15235, %r15241, %r15240, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15239, %r15240, %r15241, %r15242; + // end inline asm + mov.u32 %r15250, 20; + // begin inline asm + shf.l.wrap.b32 %r15243, %r15249, %r15248, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15247, %r15248, %r15249, %r15250; + // end inline asm + mov.u32 %r15258, 61; + // begin inline asm + shf.l.wrap.b32 %r15251, %r15257, %r15256, %r15258; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15255, %r15256, %r15257, %r15258; + // end inline asm + mov.u32 %r15266, 39; + // begin inline asm + shf.l.wrap.b32 %r15259, %r15265, %r15264, %r15266; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15263, %r15264, %r15265, %r15266; + // end inline asm + mov.u32 %r15274, 18; + // begin inline asm + shf.l.wrap.b32 %r15267, %r15273, %r15272, 
%r15274; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15271, %r15272, %r15273, %r15274; + // end inline asm + mov.u32 %r15282, 62; + // begin inline asm + shf.l.wrap.b32 %r15275, %r15281, %r15280, %r15282; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15279, %r15280, %r15281, %r15282; + // end inline asm + mov.u32 %r15290, 43; + // begin inline asm + shf.l.wrap.b32 %r15283, %r15289, %r15288, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15287, %r15288, %r15289, %r15290; + // end inline asm + mov.u32 %r15298, 25; + // begin inline asm + shf.l.wrap.b32 %r15291, %r15297, %r15296, %r15298; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15295, %r15296, %r15297, %r15298; + // end inline asm + mov.u32 %r15306, 8; + // begin inline asm + shf.l.wrap.b32 %r15299, %r15305, %r15304, %r15306; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15303, %r15304, %r15305, %r15306; + // end inline asm + mov.u32 %r15314, 56; + // begin inline asm + shf.l.wrap.b32 %r15307, %r15313, %r15312, %r15314; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15311, %r15312, %r15313, %r15314; + // end inline asm + mov.u32 %r15322, 41; + // begin inline asm + shf.l.wrap.b32 %r15315, %r15321, %r15320, %r15322; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15319, %r15320, %r15321, %r15322; + // end inline asm + mov.u32 %r15330, 27; + // begin inline asm + shf.l.wrap.b32 %r15323, %r15329, %r15328, %r15330; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15327, %r15328, %r15329, %r15330; + // end inline asm + mov.u32 %r15338, 14; + // begin inline asm + shf.l.wrap.b32 %r15331, %r15337, %r15336, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15335, %r15336, %r15337, %r15338; + // end inline asm + mov.u32 %r15346, 2; + // begin inline asm + shf.l.wrap.b32 %r15339, %r15345, %r15344, %r15346; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15343, %r15344, %r15345, %r15346; + // end inline asm + mov.u32 %r15354, 55; + // begin inline asm + shf.l.wrap.b32 %r15347, %r15353, %r15352, %r15354; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15351, %r15352, %r15353, %r15354; + // end inline asm + mov.u32 %r15362, 45; + // begin inline asm + shf.l.wrap.b32 %r15355, %r15361, %r15360, %r15362; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15359, %r15360, %r15361, %r15362; + // end inline asm + mov.u32 %r15370, 36; + // begin inline asm + shf.l.wrap.b32 %r15363, %r15369, %r15368, %r15370; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15367, %r15368, %r15369, %r15370; + // end inline asm + mov.u32 %r15378, 28; + // begin inline asm + shf.l.wrap.b32 %r15371, %r15377, %r15376, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15375, %r15376, %r15377, %r15378; + // end inline asm + mov.u32 %r15386, 21; + // begin inline asm + shf.l.wrap.b32 %r15379, %r15385, %r15384, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15383, %r15384, %r15385, %r15386; + // end inline asm + mov.u32 %r15394, 15; + // begin inline asm + shf.l.wrap.b32 %r15387, %r15393, %r15392, %r15394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15391, %r15392, %r15393, %r15394; + // end inline asm + mov.u32 %r15402, 10; + // begin inline asm + shf.l.wrap.b32 %r15395, %r15401, %r15400, %r15402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15399, %r15400, %r15401, %r15402; + // end inline asm + mov.u32 %r15410, 6; + 
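+ // the lop3 ... 0xD2 ops that follow the rotations are the chi step:
+ // out = a ^ (~b & c); with a=0xF0, b=0xCC, c=0xAA the truth table is
+ // 0xF0 ^ (~0xCC & 0xAA) = 0xF0 ^ (0x33 & 0xAA) = 0xF0 ^ 0x22 = 0xD2.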
// begin inline asm + shf.l.wrap.b32 %r15403, %r15409, %r15408, %r15410; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15407, %r15408, %r15409, %r15410; + // end inline asm + mov.u32 %r15418, 3; + // begin inline asm + shf.l.wrap.b32 %r15411, %r15417, %r15416, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15415, %r15416, %r15417, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15419, %r15425, %r15424, %r15108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15423, %r15424, %r15425, %r15108; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15427, %r15462, %r15235, %r15283, 0xD2; + lop3.b32 %r15428, %r15465, %r15239, %r15287, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30477, %r15235, %r15283, %r15379, 0xD2; + lop3.b32 %r30478, %r15239, %r15287, %r15383, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30473, %r15283, %r15379, %r15331, 0xD2; + lop3.b32 %r30474, %r15287, %r15383, %r15335, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30469, %r15379, %r15331, %r15462, 0xD2; + lop3.b32 %r30470, %r15383, %r15335, %r15465, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30467, %r15331, %r15462, %r15235, 0xD2; + lop3.b32 %r30468, %r15335, %r15465, %r15239, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30463, %r15371, %r15243, %r15411, 0xD2; + lop3.b32 %r30464, %r15375, %r15247, %r15415, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30475, %r15243, %r15411, %r15355, 0xD2; + lop3.b32 %r30476, %r15247, %r15415, %r15359, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30471, %r15411, %r15355, %r15251, 0xD2; + lop3.b32 %r30472, %r15415, %r15359, %r15255, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30443, %r15355, %r15251, %r15371, 0xD2; + lop3.b32 %r30444, %r15359, %r15255, %r15375, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30443, %r30444}; + // begin inline asm + // chi + lop3.b32 %r30435, %r15251, %r15371, %r15243, 0xD2; + lop3.b32 %r30436, %r15255, %r15375, %r15247, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30435, %r30436}; + // begin inline asm + // chi + lop3.b32 %r30461, %r15419, %r15403, %r15291, 0xD2; + lop3.b32 %r30462, %r15423, %r15407, %r15295, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30461, %r30462}; + // begin inline asm + // chi + lop3.b32 %r30455, %r15403, %r15291, %r15299, 0xD2; + lop3.b32 %r30456, %r15407, %r15295, %r15303, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30455, %r30456}; + // begin inline asm + // chi + lop3.b32 %r30449, %r15291, %r15299, %r15267, 0xD2; + lop3.b32 %r30450, %r15295, %r15303, %r15271, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30449, %r30450}; + // begin inline asm + // chi + lop3.b32 %r30441, %r15299, %r15267, %r15419, 0xD2; + lop3.b32 %r30442, %r15303, %r15271, %r15423, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30441, %r30442}; + // begin inline asm + // chi + lop3.b32 %r30433, %r15267, %r15419, %r15403, 0xD2; + lop3.b32 %r30434, %r15271, %r15423, %r15407, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30433, %r30434}; + // begin inline asm + // chi + lop3.b32 %r30459, %r15323, %r15363, %r15395, 0xD2; + lop3.b32 %r30460, %r15327, %r15367, %r15399, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30459, %r30460}; + // begin inline asm + // chi + lop3.b32 %r30453, 
%r15363, %r15395, %r15387, 0xD2; + lop3.b32 %r30454, %r15367, %r15399, %r15391, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30453, %r30454}; + // begin inline asm + // chi + lop3.b32 %r30447, %r15395, %r15387, %r15307, 0xD2; + lop3.b32 %r30448, %r15399, %r15391, %r15311, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30447, %r30448}; + // begin inline asm + // chi + lop3.b32 %r30439, %r15387, %r15307, %r15323, 0xD2; + lop3.b32 %r30440, %r15391, %r15311, %r15327, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30439, %r30440}; + // begin inline asm + // chi + lop3.b32 %r30431, %r15307, %r15323, %r15363, 0xD2; + lop3.b32 %r30432, %r15311, %r15327, %r15367, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30431, %r30432}; + // begin inline asm + // chi + lop3.b32 %r30457, %r15275, %r15347, %r15259, 0xD2; + lop3.b32 %r30458, %r15279, %r15351, %r15263, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30457, %r30458}; + // begin inline asm + // chi + lop3.b32 %r30451, %r15347, %r15259, %r15315, 0xD2; + lop3.b32 %r30452, %r15351, %r15263, %r15319, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30451, %r30452}; + // begin inline asm + // chi + lop3.b32 %r30445, %r15259, %r15315, %r15339, 0xD2; + lop3.b32 %r30446, %r15263, %r15319, %r15343, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30445, %r30446}; + // begin inline asm + // chi + lop3.b32 %r30437, %r15315, %r15339, %r15275, 0xD2; + lop3.b32 %r30438, %r15319, %r15343, %r15279, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30437, %r30438}; + // begin inline asm + // chi + lop3.b32 %r30429, %r15339, %r15275, %r15347, 0xD2; + lop3.b32 %r30430, %r15343, %r15279, %r15351, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30429, %r30430}; + mul.wide.s32 %rd757, %r30479, 8; + add.s64 %rd756, %rd689, %rd757; + // begin inline asm + ld.global.nc.v2.u32 {%r15627,%r15628}, [%rd756]; + // end inline asm + xor.b32 %r30465, %r15427, %r15627; + xor.b32 %r30466, %r15428, %r15628; + add.s32 %r30479, %r30479, 1; + setp.lt.u32 %p29, %r30479, 23; + @%p29 bra $L__BB2_46; + + st.local.v2.u32 [%rd2+32], {%r30477, %r30478}; + st.local.v2.u32 [%rd2+72], {%r30475, %r30476}; + st.local.v2.u32 [%rd2+40], {%r30473, %r30474}; + st.local.v2.u32 [%rd2+80], {%r30471, %r30472}; + st.local.v2.u32 [%rd2+48], {%r30469, %r30470}; + st.local.v2.u32 [%rd2+56], {%r30467, %r30468}; + st.local.v2.u32 [%rd2+24], {%r30465, %r30466}; + // begin inline asm + // xor5 + lop3.b32 %r15639, %r30465, %r30463, %r30461, 0x96; + lop3.b32 %r15639, %r15639, %r30459, %r30457, 0x96; + lop3.b32 %r15640, %r30466, %r30464, %r30462, 0x96; + lop3.b32 %r15640, %r15640, %r30460, %r30458, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15651, %r30477, %r30475, %r30455, 0x96; + lop3.b32 %r15651, %r15651, %r30453, %r30451, 0x96; + lop3.b32 %r15652, %r30478, %r30476, %r30456, 0x96; + lop3.b32 %r15652, %r15652, %r30454, %r30452, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15663, %r30473, %r30471, %r30449, 0x96; + lop3.b32 %r15663, %r15663, %r30447, %r30445, 0x96; + lop3.b32 %r15664, %r30474, %r30472, %r30450, 0x96; + lop3.b32 %r15664, %r15664, %r30448, %r30446, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15675, %r30469, %r30443, %r30441, 0x96; + lop3.b32 %r15675, %r15675, %r30439, %r30437, 0x96; + lop3.b32 %r15676, %r30470, %r30444, %r30442, 0x96; + lop3.b32 %r15676, %r15676, %r30440, %r30438, 0x96; + // end inline asm + // begin inline 
asm + // xor5 + lop3.b32 %r15687, %r30467, %r30435, %r30433, 0x96; + lop3.b32 %r15687, %r15687, %r30431, %r30429, 0x96; + lop3.b32 %r15688, %r30468, %r30436, %r30434, 0x96; + lop3.b32 %r15688, %r15688, %r30432, %r30430, 0x96; + // end inline asm + mov.u32 %r15891, 1; + // begin inline asm + shf.l.wrap.b32 %r15699, %r15652, %r15651, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15703, %r15651, %r15652, %r15891; + // end inline asm + xor.b32 %r15918, %r15699, %r15687; + xor.b32 %r15919, %r15703, %r15688; + xor.b32 %r15846, %r30465, %r15918; + xor.b32 %r15849, %r30466, %r15919; + xor.b32 %r15809, %r30462, %r15919; + xor.b32 %r15808, %r30461, %r15918; + st.local.v2.u32 [%rd2+104], {%r15808, %r15809}; + // begin inline asm + shf.l.wrap.b32 %r15707, %r15664, %r15663, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15711, %r15663, %r15664, %r15891; + // end inline asm + xor.b32 %r15920, %r15707, %r15639; + xor.b32 %r15921, %r15711, %r15640; + xor.b32 %r15745, %r30475, %r15920; + xor.b32 %r15744, %r30476, %r15921; + xor.b32 %r15784, %r30454, %r15921; + xor.b32 %r15785, %r30453, %r15920; + st.local.v2.u32 [%rd2+152], {%r15785, %r15784}; + // begin inline asm + shf.l.wrap.b32 %r15715, %r15676, %r15675, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15719, %r15675, %r15676, %r15891; + // end inline asm + xor.b32 %r15922, %r15715, %r15651; + xor.b32 %r15923, %r15719, %r15652; + xor.b32 %r15768, %r30450, %r15923; + xor.b32 %r15769, %r30449, %r15922; + st.local.v2.u32 [%rd2+120], {%r15769, %r15768}; + xor.b32 %r15760, %r30446, %r15923; + xor.b32 %r15761, %r30445, %r15922; + st.local.v2.u32 [%rd2+200], {%r15761, %r15760}; + // begin inline asm + shf.l.wrap.b32 %r15723, %r15688, %r15687, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15727, %r15687, %r15688, %r15891; + // end inline asm + xor.b32 %r15924, %r15723, %r15663; + xor.b32 %r15925, %r15727, %r15664; + xor.b32 %r15792, %r30469, %r15924; + xor.b32 %r15793, %r30470, %r15925; + xor.b32 %r15801, %r30440, %r15925; + xor.b32 %r15800, %r30439, %r15924; + st.local.v2.u32 [%rd2+168], {%r15800, %r15801}; + // begin inline asm + shf.l.wrap.b32 %r15731, %r15640, %r15639, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15735, %r15639, %r15640, %r15891; + // end inline asm + xor.b32 %r15926, %r15731, %r15675; + xor.b32 %r15927, %r15735, %r15676; + xor.b32 %r15752, %r30435, %r15926; + xor.b32 %r15753, %r30436, %r15927; + xor.b32 %r15777, %r30430, %r15927; + xor.b32 %r15776, %r30429, %r15926; + st.local.v2.u32 [%rd2+216], {%r15776, %r15777}; + // begin inline asm + shf.l.wrap.b32 %r15739, %r15745, %r15744, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15743, %r15744, %r15745, %r15242; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15747, %r15753, %r15752, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15751, %r15752, %r15753, %r15250; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15759, %r15760, %r15761, %r15258; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15755, %r15761, %r15760, %r15258; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r15755, %r15759}; + // begin inline asm + shf.l.wrap.b32 %r15763, %r15769, %r15768, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15767, %r15768, %r15769, %r15290; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15771, %r15777, %r15776, %r15338; + // end inline asm + // begin inline asm 
+ shf.l.wrap.b32 %r15775, %r15776, %r15777, %r15338; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15783, %r15784, %r15785, %r15362; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15779, %r15785, %r15784, %r15362; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r15779, %r15783}; + // begin inline asm + shf.l.wrap.b32 %r15787, %r15793, %r15792, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15791, %r15792, %r15793, %r15378; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15795, %r15801, %r15800, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15799, %r15800, %r15801, %r15386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15803, %r15809, %r15808, %r15418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15807, %r15808, %r15809, %r15418; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15811, %r15846, %r15739, %r15763, 0xD2; + lop3.b32 %r15812, %r15849, %r15743, %r15767, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15819, %r15739, %r15763, %r15795, 0xD2; + lop3.b32 %r15820, %r15743, %r15767, %r15799, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r15819, %r15820}; + // begin inline asm + // chi + lop3.b32 %r15827, %r15763, %r15795, %r15771, 0xD2; + lop3.b32 %r15828, %r15767, %r15799, %r15775, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r15827, %r15828}; + // begin inline asm + // chi + lop3.b32 %r15835, %r15795, %r15771, %r15846, 0xD2; + lop3.b32 %r15836, %r15799, %r15775, %r15849, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r15835, %r15836}; + // begin inline asm + // chi + lop3.b32 %r15843, %r15771, %r15846, %r15739, 0xD2; + lop3.b32 %r15844, %r15775, %r15849, %r15743, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r15843, %r15844}; + // begin inline asm + // chi + lop3.b32 %r15851, %r15787, %r15747, %r15803, 0xD2; + lop3.b32 %r15852, %r15791, %r15751, %r15807, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r15851, %r15852}; + // begin inline asm + // chi + lop3.b32 %r15859, %r15747, %r15803, %r15779, 0xD2; + lop3.b32 %r15860, %r15751, %r15807, %r15783, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r15859, %r15860}; + // begin inline asm + // chi + lop3.b32 %r15867, %r15803, %r15779, %r15755, 0xD2; + lop3.b32 %r15868, %r15807, %r15783, %r15759, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r15867, %r15868}; + // begin inline asm + ld.global.nc.v2.u32 {%r15875,%r15876}, [%rd690]; + // end inline asm + xor.b32 %r15928, %r15812, %r15876; + xor.b32 %r15929, %r15811, %r15875; + mov.b64 %rd1333, {%r15929, %r15928}; + mov.b64 %rd1334, {%r15819, %r15820}; + mov.b64 %rd1335, {%r15827, %r15828}; + mov.b64 %rd1336, {%r15835, %r15836}; + mov.b64 %rd1337, {%r15843, %r15844}; + mov.b64 %rd1338, {%r15851, %r15852}; + mov.b64 %rd1339, {%r15859, %r15860}; + mov.b64 %rd1340, {%r15867, %r15868}; + mov.u32 %r30480, 0; + st.local.v2.u32 [%rd2+24], {%r15929, %r15928}; + st.local.v2.u32 [%rd147+96], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+104], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+112], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+120], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+128], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+136], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+144], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+152], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+160], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+168], {%r30480, %r30480}; + 
st.local.v2.u32 [%rd147+176], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+184], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+192], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+200], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+208], {%r30480, %r30480}; + st.local.v2.u32 [%rd147+216], {%r30480, %r30480}; + mov.u32 %r30495, -2147483648; + st.local.v2.u32 [%rd147+88], {%r15891, %r30495}; + mov.u32 %r30481, %r30480; + mov.u32 %r30482, %r30480; + mov.u32 %r30483, %r30480; + mov.u32 %r30484, %r30480; + mov.u32 %r30485, %r30480; + mov.u32 %r30486, %r30480; + mov.u32 %r30487, %r30480; + mov.u32 %r30488, %r30480; + mov.u32 %r30489, %r30480; + mov.u32 %r30490, %r30480; + mov.u32 %r30491, %r30480; + mov.u32 %r30492, %r30480; + mov.u32 %r30493, %r30480; + mov.u32 %r30494, %r15891; + mov.u32 %r30496, %r30480; + mov.u32 %r30497, %r30480; + mov.u32 %r30498, %r30480; + mov.u32 %r30499, %r30480; + mov.u32 %r30500, %r30480; + mov.u32 %r30501, %r30480; + mov.u32 %r30502, %r30480; + mov.u32 %r30503, %r30480; + mov.u32 %r30504, %r30480; + mov.u32 %r30505, %r30480; + mov.u32 %r30506, %r30480; + mov.u32 %r30507, %r30480; + mov.u32 %r30508, %r30480; + mov.u32 %r30509, %r30480; + mov.u32 %r30510, %r30480; + mov.u32 %r30511, %r30480; + mov.u32 %r30512, %r30480; + mov.u32 %r30513, %r30480; + mov.u32 %r30530, %r30480; + +$L__BB2_48: + // begin inline asm + // xor5 + lop3.b32 %r15930, %r30516, %r30514, %r30512, 0x96; + lop3.b32 %r15930, %r15930, %r30510, %r30508, 0x96; + lop3.b32 %r15931, %r30517, %r30515, %r30513, 0x96; + lop3.b32 %r15931, %r15931, %r30511, %r30509, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15942, %r30528, %r30526, %r30506, 0x96; + lop3.b32 %r15942, %r15942, %r30504, %r30502, 0x96; + lop3.b32 %r15943, %r30529, %r30527, %r30507, 0x96; + lop3.b32 %r15943, %r15943, %r30505, %r30503, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15954, %r30524, %r30522, %r30500, 0x96; + lop3.b32 %r15954, %r15954, %r30498, %r30496, 0x96; + lop3.b32 %r15955, %r30525, %r30523, %r30501, 0x96; + lop3.b32 %r15955, %r15955, %r30499, %r30497, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15966, %r30520, %r30494, %r30492, 0x96; + lop3.b32 %r15966, %r15966, %r30490, %r30488, 0x96; + lop3.b32 %r15967, %r30521, %r30495, %r30493, 0x96; + lop3.b32 %r15967, %r15967, %r30491, %r30489, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15978, %r30518, %r30486, %r30484, 0x96; + lop3.b32 %r15978, %r15978, %r30482, %r30480, 0x96; + lop3.b32 %r15979, %r30519, %r30487, %r30485, 0x96; + lop3.b32 %r15979, %r15979, %r30483, %r30481, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15990, %r15943, %r15942, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15994, %r15942, %r15943, %r15891; + // end inline asm + xor.b32 %r16424, %r15990, %r15978; + xor.b32 %r16425, %r15994, %r15979; + xor.b32 %r16257, %r30516, %r16424; + xor.b32 %r16260, %r30517, %r16425; + xor.b32 %r16164, %r30514, %r16424; + xor.b32 %r16163, %r30515, %r16425; + xor.b32 %r16211, %r30512, %r16424; + xor.b32 %r16212, %r30513, %r16425; + xor.b32 %r16116, %r30510, %r16424; + xor.b32 %r16115, %r30511, %r16425; + xor.b32 %r16067, %r30508, %r16424; + xor.b32 %r16068, %r30509, %r16425; + // begin inline asm + shf.l.wrap.b32 %r15998, %r15955, %r15954, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16002, %r15954, %r15955, %r15891; + // end inline asm + xor.b32 %r16426, %r15998, %r15930; + xor.b32 %r16427, %r16002, %r15931; + 
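+ // $L__BB2_48 repeats the same round body for the state at %rd147, which was
+ // re-zeroed just above with lane 8 (offset +88) set to 0x8000000000000001 --
+ // this looks like the Keccak multi-rate padding word, consistent with a
+ // 72-byte rate. As in $L__BB2_42 and $L__BB2_46 (setp.lt.u32 ..., 23 plus
+ // one round unrolled after the branch), this gives 24 rounds per permutation.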
xor.b32 %r16219, %r30528, %r16426; + xor.b32 %r16220, %r30529, %r16427; + xor.b32 %r16036, %r30526, %r16426; + xor.b32 %r16035, %r30527, %r16427; + xor.b32 %r16195, %r30506, %r16426; + xor.b32 %r16196, %r30507, %r16427; + xor.b32 %r16156, %r30504, %r16426; + xor.b32 %r16155, %r30505, %r16427; + xor.b32 %r16139, %r30502, %r16426; + xor.b32 %r16140, %r30503, %r16427; + // begin inline asm + shf.l.wrap.b32 %r16006, %r15967, %r15966, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16010, %r15966, %r15967, %r15891; + // end inline asm + xor.b32 %r16428, %r16006, %r15942; + xor.b32 %r16429, %r16010, %r15943; + xor.b32 %r16076, %r30524, %r16428; + xor.b32 %r16075, %r30525, %r16429; + xor.b32 %r16203, %r30522, %r16428; + xor.b32 %r16204, %r30523, %r16429; + xor.b32 %r16084, %r30500, %r16428; + xor.b32 %r16083, %r30501, %r16429; + xor.b32 %r16187, %r30498, %r16428; + xor.b32 %r16188, %r30499, %r16429; + xor.b32 %r16052, %r30496, %r16428; + xor.b32 %r16051, %r30497, %r16429; + // begin inline asm + shf.l.wrap.b32 %r16014, %r15979, %r15978, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16018, %r15978, %r15979, %r15891; + // end inline asm + xor.b32 %r16430, %r16014, %r15954; + xor.b32 %r16431, %r16018, %r15955; + xor.b32 %r16171, %r30520, %r16430; + xor.b32 %r16172, %r30521, %r16431; + xor.b32 %r16148, %r30494, %r16430; + xor.b32 %r16147, %r30495, %r16431; + xor.b32 %r16091, %r30492, %r16430; + xor.b32 %r16092, %r30493, %r16431; + xor.b32 %r16179, %r30490, %r16430; + xor.b32 %r16180, %r30491, %r16431; + xor.b32 %r16108, %r30488, %r16430; + xor.b32 %r16107, %r30489, %r16431; + // begin inline asm + shf.l.wrap.b32 %r16022, %r15931, %r15930, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16026, %r15930, %r15931, %r15891; + // end inline asm + xor.b32 %r16432, %r16022, %r15966; + xor.b32 %r16433, %r16026, %r15967; + xor.b32 %r16123, %r30518, %r16432; + xor.b32 %r16124, %r30519, %r16433; + xor.b32 %r16043, %r30486, %r16432; + xor.b32 %r16044, %r30487, %r16433; + xor.b32 %r16060, %r30484, %r16432; + xor.b32 %r16059, %r30485, %r16433; + xor.b32 %r16099, %r30482, %r16432; + xor.b32 %r16100, %r30483, %r16433; + xor.b32 %r16131, %r30480, %r16432; + xor.b32 %r16132, %r30481, %r16433; + mov.u32 %r16037, 44; + // begin inline asm + shf.l.wrap.b32 %r16030, %r16036, %r16035, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16034, %r16035, %r16036, %r16037; + // end inline asm + mov.u32 %r16045, 20; + // begin inline asm + shf.l.wrap.b32 %r16038, %r16044, %r16043, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16042, %r16043, %r16044, %r16045; + // end inline asm + mov.u32 %r16053, 61; + // begin inline asm + shf.l.wrap.b32 %r16046, %r16052, %r16051, %r16053; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16050, %r16051, %r16052, %r16053; + // end inline asm + mov.u32 %r16061, 39; + // begin inline asm + shf.l.wrap.b32 %r16054, %r16060, %r16059, %r16061; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16058, %r16059, %r16060, %r16061; + // end inline asm + mov.u32 %r16069, 18; + // begin inline asm + shf.l.wrap.b32 %r16062, %r16068, %r16067, %r16069; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16066, %r16067, %r16068, %r16069; + // end inline asm + mov.u32 %r16077, 62; + // begin inline asm + shf.l.wrap.b32 %r16070, %r16076, %r16075, %r16077; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16074, %r16075, %r16076, %r16077; + // end inline asm + 
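+ // two 200-byte sponge states live in local memory (%rd2 and %rd147) and are
+ // permuted alternately; the rotate amounts set below are the same rho
+ // offsets used in the earlier copies of this unrolled round body.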
mov.u32 %r16085, 43; + // begin inline asm + shf.l.wrap.b32 %r16078, %r16084, %r16083, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16082, %r16083, %r16084, %r16085; + // end inline asm + mov.u32 %r16093, 25; + // begin inline asm + shf.l.wrap.b32 %r16086, %r16092, %r16091, %r16093; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16090, %r16091, %r16092, %r16093; + // end inline asm + mov.u32 %r16101, 8; + // begin inline asm + shf.l.wrap.b32 %r16094, %r16100, %r16099, %r16101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16098, %r16099, %r16100, %r16101; + // end inline asm + mov.u32 %r16109, 56; + // begin inline asm + shf.l.wrap.b32 %r16102, %r16108, %r16107, %r16109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16106, %r16107, %r16108, %r16109; + // end inline asm + mov.u32 %r16117, 41; + // begin inline asm + shf.l.wrap.b32 %r16110, %r16116, %r16115, %r16117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16114, %r16115, %r16116, %r16117; + // end inline asm + mov.u32 %r16125, 27; + // begin inline asm + shf.l.wrap.b32 %r16118, %r16124, %r16123, %r16125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16122, %r16123, %r16124, %r16125; + // end inline asm + mov.u32 %r16133, 14; + // begin inline asm + shf.l.wrap.b32 %r16126, %r16132, %r16131, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16130, %r16131, %r16132, %r16133; + // end inline asm + mov.u32 %r16141, 2; + // begin inline asm + shf.l.wrap.b32 %r16134, %r16140, %r16139, %r16141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16138, %r16139, %r16140, %r16141; + // end inline asm + mov.u32 %r16149, 55; + // begin inline asm + shf.l.wrap.b32 %r16142, %r16148, %r16147, %r16149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16146, %r16147, %r16148, %r16149; + // end inline asm + mov.u32 %r16157, 45; + // begin inline asm + shf.l.wrap.b32 %r16150, %r16156, %r16155, %r16157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16154, %r16155, %r16156, %r16157; + // end inline asm + mov.u32 %r16165, 36; + // begin inline asm + shf.l.wrap.b32 %r16158, %r16164, %r16163, %r16165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16162, %r16163, %r16164, %r16165; + // end inline asm + mov.u32 %r16173, 28; + // begin inline asm + shf.l.wrap.b32 %r16166, %r16172, %r16171, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16170, %r16171, %r16172, %r16173; + // end inline asm + mov.u32 %r16181, 21; + // begin inline asm + shf.l.wrap.b32 %r16174, %r16180, %r16179, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16178, %r16179, %r16180, %r16181; + // end inline asm + mov.u32 %r16189, 15; + // begin inline asm + shf.l.wrap.b32 %r16182, %r16188, %r16187, %r16189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16186, %r16187, %r16188, %r16189; + // end inline asm + mov.u32 %r16197, 10; + // begin inline asm + shf.l.wrap.b32 %r16190, %r16196, %r16195, %r16197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16194, %r16195, %r16196, %r16197; + // end inline asm + mov.u32 %r16205, 6; + // begin inline asm + shf.l.wrap.b32 %r16198, %r16204, %r16203, %r16205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16202, %r16203, %r16204, %r16205; + // end inline asm + mov.u32 %r16213, 3; + // begin inline asm + shf.l.wrap.b32 %r16206, %r16212, %r16211, %r16213; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r16210, %r16211, %r16212, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16214, %r16220, %r16219, %r15891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16218, %r16219, %r16220, %r15891; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16222, %r16257, %r16030, %r16078, 0xD2; + lop3.b32 %r16223, %r16260, %r16034, %r16082, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30528, %r16030, %r16078, %r16174, 0xD2; + lop3.b32 %r30529, %r16034, %r16082, %r16178, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30524, %r16078, %r16174, %r16126, 0xD2; + lop3.b32 %r30525, %r16082, %r16178, %r16130, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30520, %r16174, %r16126, %r16257, 0xD2; + lop3.b32 %r30521, %r16178, %r16130, %r16260, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30518, %r16126, %r16257, %r16030, 0xD2; + lop3.b32 %r30519, %r16130, %r16260, %r16034, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30514, %r16166, %r16038, %r16206, 0xD2; + lop3.b32 %r30515, %r16170, %r16042, %r16210, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30526, %r16038, %r16206, %r16150, 0xD2; + lop3.b32 %r30527, %r16042, %r16210, %r16154, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30522, %r16206, %r16150, %r16046, 0xD2; + lop3.b32 %r30523, %r16210, %r16154, %r16050, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30494, %r16150, %r16046, %r16166, 0xD2; + lop3.b32 %r30495, %r16154, %r16050, %r16170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+88], {%r30494, %r30495}; + // begin inline asm + // chi + lop3.b32 %r30486, %r16046, %r16166, %r16038, 0xD2; + lop3.b32 %r30487, %r16050, %r16170, %r16042, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r30486, %r30487}; + // begin inline asm + // chi + lop3.b32 %r30512, %r16214, %r16198, %r16086, 0xD2; + lop3.b32 %r30513, %r16218, %r16202, %r16090, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+104], {%r30512, %r30513}; + // begin inline asm + // chi + lop3.b32 %r30506, %r16198, %r16086, %r16094, 0xD2; + lop3.b32 %r30507, %r16202, %r16090, %r16098, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+112], {%r30506, %r30507}; + // begin inline asm + // chi + lop3.b32 %r30500, %r16086, %r16094, %r16062, 0xD2; + lop3.b32 %r30501, %r16090, %r16098, %r16066, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+120], {%r30500, %r30501}; + // begin inline asm + // chi + lop3.b32 %r30492, %r16094, %r16062, %r16214, 0xD2; + lop3.b32 %r30493, %r16098, %r16066, %r16218, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+128], {%r30492, %r30493}; + // begin inline asm + // chi + lop3.b32 %r30484, %r16062, %r16214, %r16198, 0xD2; + lop3.b32 %r30485, %r16066, %r16218, %r16202, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+136], {%r30484, %r30485}; + // begin inline asm + // chi + lop3.b32 %r30510, %r16118, %r16158, %r16190, 0xD2; + lop3.b32 %r30511, %r16122, %r16162, %r16194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+144], {%r30510, %r30511}; + // begin inline asm + // chi + lop3.b32 %r30504, %r16158, %r16190, %r16182, 0xD2; + lop3.b32 %r30505, %r16162, %r16194, %r16186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+152], {%r30504, %r30505}; + // begin inline asm + // chi + lop3.b32 %r30498, %r16190, %r16182, %r16102, 0xD2; + lop3.b32 %r30499, %r16194, %r16186, %r16106, 0xD2; + // end inline asm + 
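+ // chi: lop3.b32 with immLut 0xD2 evaluates a ^ (~b & c) in a single
+ // instruction, one lop3 per 32-bit half of each output lane.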
st.local.v2.u32 [%rd147+160], {%r30498, %r30499}; + // begin inline asm + // chi + lop3.b32 %r30490, %r16182, %r16102, %r16118, 0xD2; + lop3.b32 %r30491, %r16186, %r16106, %r16122, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+168], {%r30490, %r30491}; + // begin inline asm + // chi + lop3.b32 %r30482, %r16102, %r16118, %r16158, 0xD2; + lop3.b32 %r30483, %r16106, %r16122, %r16162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+176], {%r30482, %r30483}; + // begin inline asm + // chi + lop3.b32 %r30508, %r16070, %r16142, %r16054, 0xD2; + lop3.b32 %r30509, %r16074, %r16146, %r16058, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+184], {%r30508, %r30509}; + // begin inline asm + // chi + lop3.b32 %r30502, %r16142, %r16054, %r16110, 0xD2; + lop3.b32 %r30503, %r16146, %r16058, %r16114, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+192], {%r30502, %r30503}; + // begin inline asm + // chi + lop3.b32 %r30496, %r16054, %r16110, %r16134, 0xD2; + lop3.b32 %r30497, %r16058, %r16114, %r16138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+200], {%r30496, %r30497}; + // begin inline asm + // chi + lop3.b32 %r30488, %r16110, %r16134, %r16070, 0xD2; + lop3.b32 %r30489, %r16114, %r16138, %r16074, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+208], {%r30488, %r30489}; + // begin inline asm + // chi + lop3.b32 %r30480, %r16134, %r16070, %r16142, 0xD2; + lop3.b32 %r30481, %r16138, %r16074, %r16146, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+216], {%r30480, %r30481}; + mul.wide.s32 %rd764, %r30530, 8; + add.s64 %rd763, %rd689, %rd764; + // begin inline asm + ld.global.nc.v2.u32 {%r16422,%r16423}, [%rd763]; + // end inline asm + xor.b32 %r30516, %r16222, %r16422; + xor.b32 %r30517, %r16223, %r16423; + add.s32 %r30530, %r30530, 1; + setp.lt.u32 %p30, %r30530, 23; + @%p30 bra $L__BB2_48; + + mov.u32 %r16533, 1; + st.local.v2.u32 [%rd147+32], {%r30528, %r30529}; + st.local.v2.u32 [%rd147+72], {%r30526, %r30527}; + st.local.v2.u32 [%rd147+40], {%r30524, %r30525}; + st.local.v2.u32 [%rd147+80], {%r30522, %r30523}; + st.local.v2.u32 [%rd147+48], {%r30520, %r30521}; + st.local.v2.u32 [%rd147+56], {%r30518, %r30519}; + st.local.v2.u32 [%rd147+24], {%r30516, %r30517}; + // begin inline asm + // xor5 + lop3.b32 %r16434, %r30516, %r30514, %r30512, 0x96; + lop3.b32 %r16434, %r16434, %r30510, %r30508, 0x96; + lop3.b32 %r16435, %r30517, %r30515, %r30513, 0x96; + lop3.b32 %r16435, %r16435, %r30511, %r30509, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16446, %r30528, %r30526, %r30506, 0x96; + lop3.b32 %r16446, %r16446, %r30504, %r30502, 0x96; + lop3.b32 %r16447, %r30529, %r30527, %r30507, 0x96; + lop3.b32 %r16447, %r16447, %r30505, %r30503, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16458, %r30524, %r30522, %r30500, 0x96; + lop3.b32 %r16458, %r16458, %r30498, %r30496, 0x96; + lop3.b32 %r16459, %r30525, %r30523, %r30501, 0x96; + lop3.b32 %r16459, %r16459, %r30499, %r30497, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16470, %r30520, %r30494, %r30492, 0x96; + lop3.b32 %r16470, %r16470, %r30490, %r30488, 0x96; + lop3.b32 %r16471, %r30521, %r30495, %r30493, 0x96; + lop3.b32 %r16471, %r16471, %r30491, %r30489, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16482, %r30518, %r30486, %r30484, 0x96; + lop3.b32 %r16482, %r16482, %r30482, %r30480, 0x96; + lop3.b32 %r16483, %r30519, %r30487, %r30485, 0x96; + lop3.b32 %r16483, %r16483, %r30483, %r30481, 0x96; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r16494, %r16447, %r16446, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16498, %r16446, %r16447, %r16533; + // end inline asm + xor.b32 %r16672, %r16494, %r16482; + xor.b32 %r16673, %r16498, %r16483; + xor.b32 %r16641, %r30516, %r16672; + xor.b32 %r16644, %r30517, %r16673; + xor.b32 %r16604, %r30513, %r16673; + xor.b32 %r16603, %r30512, %r16672; + st.local.v2.u32 [%rd147+104], {%r16603, %r16604}; + // begin inline asm + shf.l.wrap.b32 %r16502, %r16459, %r16458, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16506, %r16458, %r16459, %r16533; + // end inline asm + xor.b32 %r16674, %r16502, %r16434; + xor.b32 %r16675, %r16506, %r16435; + xor.b32 %r16540, %r30526, %r16674; + xor.b32 %r16539, %r30527, %r16675; + xor.b32 %r16579, %r30505, %r16675; + xor.b32 %r16580, %r30504, %r16674; + st.local.v2.u32 [%rd147+152], {%r16580, %r16579}; + // begin inline asm + shf.l.wrap.b32 %r16510, %r16471, %r16470, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16514, %r16470, %r16471, %r16533; + // end inline asm + xor.b32 %r16676, %r16510, %r16446; + xor.b32 %r16677, %r16514, %r16447; + xor.b32 %r16563, %r30501, %r16677; + xor.b32 %r16564, %r30500, %r16676; + st.local.v2.u32 [%rd147+120], {%r16564, %r16563}; + xor.b32 %r16555, %r30497, %r16677; + xor.b32 %r16556, %r30496, %r16676; + st.local.v2.u32 [%rd147+200], {%r16556, %r16555}; + // begin inline asm + shf.l.wrap.b32 %r16518, %r16483, %r16482, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16522, %r16482, %r16483, %r16533; + // end inline asm + xor.b32 %r16678, %r16518, %r16458; + xor.b32 %r16679, %r16522, %r16459; + xor.b32 %r16587, %r30520, %r16678; + xor.b32 %r16588, %r30521, %r16679; + xor.b32 %r16596, %r30491, %r16679; + xor.b32 %r16595, %r30490, %r16678; + st.local.v2.u32 [%rd147+168], {%r16595, %r16596}; + // begin inline asm + shf.l.wrap.b32 %r16526, %r16435, %r16434, %r16533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16530, %r16434, %r16435, %r16533; + // end inline asm + xor.b32 %r16680, %r16526, %r16470; + xor.b32 %r16681, %r16530, %r16471; + xor.b32 %r16547, %r30486, %r16680; + xor.b32 %r16548, %r30487, %r16681; + xor.b32 %r16572, %r30481, %r16681; + xor.b32 %r16571, %r30480, %r16680; + st.local.v2.u32 [%rd147+216], {%r16571, %r16572}; + // begin inline asm + shf.l.wrap.b32 %r16534, %r16540, %r16539, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16538, %r16539, %r16540, %r16037; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16542, %r16548, %r16547, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16546, %r16547, %r16548, %r16045; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16554, %r16555, %r16556, %r16053; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16550, %r16556, %r16555, %r16053; + // end inline asm + st.local.v2.u32 [%rd147+96], {%r16550, %r16554}; + // begin inline asm + shf.l.wrap.b32 %r16558, %r16564, %r16563, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16562, %r16563, %r16564, %r16085; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16566, %r16572, %r16571, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16570, %r16571, %r16572, %r16133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16578, %r16579, %r16580, %r16157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16574, %r16580, %r16579, %r16157; + // end inline asm + 
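+ // (peeled final round, continued: these pi rotations feed the chi/iota step
+ // below, whose outputs are stored directly instead of looping back)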
st.local.v2.u32 [%rd147+88], {%r16574, %r16578}; + // begin inline asm + shf.l.wrap.b32 %r16582, %r16588, %r16587, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16586, %r16587, %r16588, %r16173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16590, %r16596, %r16595, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16594, %r16595, %r16596, %r16181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16598, %r16604, %r16603, %r16213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16602, %r16603, %r16604, %r16213; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16606, %r16641, %r16534, %r16558, 0xD2; + lop3.b32 %r16607, %r16644, %r16538, %r16562, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16614, %r16534, %r16558, %r16590, 0xD2; + lop3.b32 %r16615, %r16538, %r16562, %r16594, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+32], {%r16614, %r16615}; + // begin inline asm + // chi + lop3.b32 %r16622, %r16558, %r16590, %r16566, 0xD2; + lop3.b32 %r16623, %r16562, %r16594, %r16570, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+40], {%r16622, %r16623}; + // begin inline asm + // chi + lop3.b32 %r16630, %r16590, %r16566, %r16641, 0xD2; + lop3.b32 %r16631, %r16594, %r16570, %r16644, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+48], {%r16630, %r16631}; + // begin inline asm + // chi + lop3.b32 %r16638, %r16566, %r16641, %r16534, 0xD2; + lop3.b32 %r16639, %r16570, %r16644, %r16538, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+56], {%r16638, %r16639}; + // begin inline asm + // chi + lop3.b32 %r16646, %r16582, %r16542, %r16598, 0xD2; + lop3.b32 %r16647, %r16586, %r16546, %r16602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+64], {%r16646, %r16647}; + // begin inline asm + // chi + lop3.b32 %r16654, %r16542, %r16598, %r16574, 0xD2; + lop3.b32 %r16655, %r16546, %r16602, %r16578, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+72], {%r16654, %r16655}; + // begin inline asm + // chi + lop3.b32 %r16662, %r16598, %r16574, %r16550, 0xD2; + lop3.b32 %r16663, %r16602, %r16578, %r16554, 0xD2; + // end inline asm + st.local.v2.u32 [%rd147+80], {%r16662, %r16663}; + // begin inline asm + ld.global.nc.v2.u32 {%r16670,%r16671}, [%rd690]; + // end inline asm + xor.b32 %r16682, %r16607, %r16671; + xor.b32 %r16683, %r16606, %r16670; + st.local.v2.u32 [%rd147+24], {%r16683, %r16682}; + mov.b64 %rd1342, {%r16614, %r16615}; + mov.b64 %rd1343, {%r16622, %r16623}; + mov.b64 %rd1346, {%r16646, %r16647}; + mov.b64 %rd1347, {%r16654, %r16655}; + mov.b64 %rd1348, {%r16662, %r16663}; + mov.b64 %rd1341, {%r16683, %r16682}; + mov.b64 %rd1344, {%r16630, %r16631}; + mov.b64 %rd1345, {%r16638, %r16639}; + st.global.u64 [%rd128], %rd1333; + st.global.u64 [%rd128+8], %rd1334; + st.global.u64 [%rd128+16], %rd1335; + st.global.u64 [%rd128+24], %rd1336; + st.global.u64 [%rd128+32], %rd1337; + st.global.u64 [%rd128+40], %rd1338; + st.global.u64 [%rd128+48], %rd1339; + st.global.u64 [%rd128+56], %rd1340; + st.global.v2.u32 [%rd128+64], {%r16683, %r16682}; + st.global.v2.u32 [%rd128+72], {%r16614, %r16615}; + st.global.v2.u32 [%rd128+80], {%r16622, %r16623}; + st.global.v2.u32 [%rd128+88], {%r16630, %r16631}; + st.global.v2.u32 [%rd128+96], {%r16638, %r16639}; + st.global.v2.u32 [%rd128+104], {%r16646, %r16647}; + st.global.v2.u32 [%rd128+112], {%r16654, %r16655}; + st.global.v2.u32 [%rd128+120], {%r16662, %r16663}; + +$L__BB2_61: + cvta.to.global.u64 %rd1266, %rd361; + shl.b32 %r3343, 
%r46, 1; + mul.wide.u32 %rd870, %r3343, -954391867; + shr.u64 %rd871, %rd870, 32; + cvt.u32.u64 %r19968, %rd871; + sub.s32 %r19969, %r3343, %r19968; + shr.u32 %r19970, %r19969, 1; + add.s32 %r19971, %r19970, %r19968; + shr.u32 %r19972, %r19971, 20; + mul.lo.s32 %r19973, %r19972, 1179641; + sub.s32 %r19974, %r3343, %r19973; + mul.wide.u32 %rd873, %r19974, 64; + add.s64 %rd220, %rd1266, %rd873; + or.b32 %r3344, %r3343, 1; + mul.wide.u32 %rd874, %r3344, -954391867; + shr.u64 %rd875, %rd874, 32; + cvt.u32.u64 %r19975, %rd875; + sub.s32 %r19976, %r3344, %r19975; + shr.u32 %r19977, %r19976, 1; + add.s32 %r19978, %r19977, %r19975; + shr.u32 %r19979, %r19978, 20; + mul.lo.s32 %r19980, %r19979, 1179641; + sub.s32 %r19981, %r3344, %r19980; + mul.wide.u32 %rd876, %r19981, 64; + add.s64 %rd221, %rd1266, %rd876; + @%p12 bra $L__BB2_75; + + cvta.to.global.u64 %rd877, %rd360; + mul.wide.u32 %rd878, %r46, 128; + add.s64 %rd222, %rd877, %rd878; + ld.global.u64 %rd1349, [%rd222]; + setp.eq.s64 %p37, %rd1349, 0; + @%p37 bra $L__BB2_64; + + ld.global.u64 %rd1364, [%rd222+120]; + ld.global.u64 %rd1363, [%rd222+112]; + ld.global.u64 %rd1362, [%rd222+104]; + ld.global.u64 %rd1361, [%rd222+96]; + ld.global.u64 %rd1360, [%rd222+88]; + ld.global.u64 %rd1359, [%rd222+80]; + ld.global.u64 %rd1358, [%rd222+72]; + ld.global.u64 %rd1357, [%rd222+64]; + ld.global.u64 %rd1356, [%rd222+56]; + ld.global.u64 %rd1355, [%rd222+48]; + ld.global.u64 %rd1354, [%rd222+40]; + ld.global.u64 %rd1353, [%rd222+32]; + ld.global.u64 %rd1352, [%rd222+24]; + ld.global.u64 %rd1351, [%rd222+16]; + ld.global.u64 %rd1350, [%rd222+8]; + bra.uni $L__BB2_86; + +$L__BB2_75: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd994, 1179641; + st.local.u64 [%rd2+8], %rd994; + st.local.u32 [%rd2+16], %r3343; + ld.global.u64 %rd995, [%rd220]; + ld.global.u64 %rd996, [%rd220+8]; + ld.global.u64 %rd997, [%rd220+16]; + ld.global.u64 %rd998, [%rd220+24]; + ld.global.u64 %rd999, [%rd220+32]; + ld.global.u64 %rd1000, [%rd220+40]; + ld.global.u64 %rd1001, [%rd220+48]; + ld.global.u64 %rd1002, [%rd220+56]; + st.local.u64 [%rd2+24], %rd995; + st.local.u64 [%rd2+32], %rd996; + st.local.u64 [%rd2+40], %rd997; + st.local.u64 [%rd2+48], %rd998; + st.local.u64 [%rd2+56], %rd999; + st.local.u64 [%rd2+64], %rd1000; + st.local.u64 [%rd2+72], %rd1001; + st.local.u64 [%rd2+80], %rd1002; + cvt.u32.u64 %r23308, %rd995; + xor.b32 %r23309, %r3343, %r23308; + st.local.u32 [%rd2+24], %r23309; + mov.u32 %r31005, 0; + st.local.v2.u32 [%rd2+96], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+104], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+112], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+120], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+128], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+136], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+144], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+152], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+160], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+168], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+176], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+184], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+192], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+200], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+208], {%r31005, %r31005}; + st.local.v2.u32 [%rd2+216], {%r31005, %r31005}; + mov.u32 %r31020, -2147483648; + mov.u32 %r23281, 1; + st.local.v2.u32 [%rd2+88], {%r23281, %r31020}; + ld.local.v2.u32 {%r31041, %r31042}, [%rd2+24]; + mov.b64 {%r31039, %r31040}, %rd1000; + shr.u64 %rd1003, %rd996, 32; + cvt.u32.u64 %r31053, %rd996; + cvt.u32.u64 %r31054, %rd1003; + 
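+ // (the multiply-high sequence above is a strength-reduced division: it takes
+ // the index modulo 1179641 -- the same constant stored to the scratch block --
+ // to select a 64-byte record)
+ // absorb setup: each 64-bit input word is split into lo/hi 32-bit halves
+ // (shr.u64 + cvt.u32.u64), since the round arithmetic runs on 32-bit registers.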
shr.u64 %rd1004, %rd1001, 32; + cvt.u32.u64 %r31051, %rd1001; + cvt.u32.u64 %r31052, %rd1004; + shr.u64 %rd1005, %rd997, 32; + cvt.u32.u64 %r31049, %rd997; + cvt.u32.u64 %r31050, %rd1005; + shr.u64 %rd1006, %rd1002, 32; + cvt.u32.u64 %r31047, %rd1002; + cvt.u32.u64 %r31048, %rd1006; + shr.u64 %rd1007, %rd998, 32; + cvt.u32.u64 %r31045, %rd998; + cvt.u32.u64 %r31046, %rd1007; + shr.u64 %rd1008, %rd999, 32; + cvt.u32.u64 %r31043, %rd999; + cvt.u32.u64 %r31044, %rd1008; + mov.u32 %r31006, %r31005; + mov.u32 %r31007, %r31005; + mov.u32 %r31008, %r31005; + mov.u32 %r31009, %r31005; + mov.u32 %r31010, %r31005; + mov.u32 %r31011, %r31005; + mov.u32 %r31012, %r31005; + mov.u32 %r31013, %r31005; + mov.u32 %r31014, %r31005; + mov.u32 %r31015, %r31005; + mov.u32 %r31016, %r31005; + mov.u32 %r31017, %r31005; + mov.u32 %r31018, %r31005; + mov.u32 %r31019, %r23281; + mov.u32 %r31021, %r31005; + mov.u32 %r31022, %r31005; + mov.u32 %r31023, %r31005; + mov.u32 %r31024, %r31005; + mov.u32 %r31025, %r31005; + mov.u32 %r31026, %r31005; + mov.u32 %r31027, %r31005; + mov.u32 %r31028, %r31005; + mov.u32 %r31029, %r31005; + mov.u32 %r31030, %r31005; + mov.u32 %r31031, %r31005; + mov.u32 %r31032, %r31005; + mov.u32 %r31033, %r31005; + mov.u32 %r31034, %r31005; + mov.u32 %r31035, %r31005; + mov.u32 %r31036, %r31005; + mov.u32 %r31037, %r31005; + mov.u32 %r31038, %r31005; + mov.u32 %r31055, %r31005; + +$L__BB2_76: + // begin inline asm + // xor5 + lop3.b32 %r23312, %r31041, %r31039, %r31037, 0x96; + lop3.b32 %r23312, %r23312, %r31035, %r31033, 0x96; + lop3.b32 %r23313, %r31042, %r31040, %r31038, 0x96; + lop3.b32 %r23313, %r23313, %r31036, %r31034, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23324, %r31053, %r31051, %r31031, 0x96; + lop3.b32 %r23324, %r23324, %r31029, %r31027, 0x96; + lop3.b32 %r23325, %r31054, %r31052, %r31032, 0x96; + lop3.b32 %r23325, %r23325, %r31030, %r31028, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23336, %r31049, %r31047, %r31025, 0x96; + lop3.b32 %r23336, %r23336, %r31023, %r31021, 0x96; + lop3.b32 %r23337, %r31050, %r31048, %r31026, 0x96; + lop3.b32 %r23337, %r23337, %r31024, %r31022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23348, %r31045, %r31019, %r31017, 0x96; + lop3.b32 %r23348, %r23348, %r31015, %r31013, 0x96; + lop3.b32 %r23349, %r31046, %r31020, %r31018, 0x96; + lop3.b32 %r23349, %r23349, %r31016, %r31014, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23360, %r31043, %r31011, %r31009, 0x96; + lop3.b32 %r23360, %r23360, %r31007, %r31005, 0x96; + lop3.b32 %r23361, %r31044, %r31012, %r31010, 0x96; + lop3.b32 %r23361, %r23361, %r31008, %r31006, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23372, %r23325, %r23324, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23376, %r23324, %r23325, %r23281; + // end inline asm + xor.b32 %r23806, %r23372, %r23360; + xor.b32 %r23807, %r23376, %r23361; + xor.b32 %r23639, %r31041, %r23806; + xor.b32 %r23642, %r31042, %r23807; + xor.b32 %r23546, %r31039, %r23806; + xor.b32 %r23545, %r31040, %r23807; + xor.b32 %r23593, %r31037, %r23806; + xor.b32 %r23594, %r31038, %r23807; + xor.b32 %r23498, %r31035, %r23806; + xor.b32 %r23497, %r31036, %r23807; + xor.b32 %r23449, %r31033, %r23806; + xor.b32 %r23450, %r31034, %r23807; + // begin inline asm + shf.l.wrap.b32 %r23380, %r23337, %r23336, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23384, %r23336, %r23337, %r23281; + // end inline asm 
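+ // theta: the five-way column parities are built from paired lop3.b32 ops with
+ // immLut 0x96 (three-input XOR); each parity is rotated left by 1 and XORed
+ // across the neighbouring column.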
+ xor.b32 %r23808, %r23380, %r23312; + xor.b32 %r23809, %r23384, %r23313; + xor.b32 %r23601, %r31053, %r23808; + xor.b32 %r23602, %r31054, %r23809; + xor.b32 %r23418, %r31051, %r23808; + xor.b32 %r23417, %r31052, %r23809; + xor.b32 %r23577, %r31031, %r23808; + xor.b32 %r23578, %r31032, %r23809; + xor.b32 %r23538, %r31029, %r23808; + xor.b32 %r23537, %r31030, %r23809; + xor.b32 %r23521, %r31027, %r23808; + xor.b32 %r23522, %r31028, %r23809; + // begin inline asm + shf.l.wrap.b32 %r23388, %r23349, %r23348, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23392, %r23348, %r23349, %r23281; + // end inline asm + xor.b32 %r23810, %r23388, %r23324; + xor.b32 %r23811, %r23392, %r23325; + xor.b32 %r23458, %r31049, %r23810; + xor.b32 %r23457, %r31050, %r23811; + xor.b32 %r23585, %r31047, %r23810; + xor.b32 %r23586, %r31048, %r23811; + xor.b32 %r23466, %r31025, %r23810; + xor.b32 %r23465, %r31026, %r23811; + xor.b32 %r23569, %r31023, %r23810; + xor.b32 %r23570, %r31024, %r23811; + xor.b32 %r23434, %r31021, %r23810; + xor.b32 %r23433, %r31022, %r23811; + // begin inline asm + shf.l.wrap.b32 %r23396, %r23361, %r23360, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23400, %r23360, %r23361, %r23281; + // end inline asm + xor.b32 %r23812, %r23396, %r23336; + xor.b32 %r23813, %r23400, %r23337; + xor.b32 %r23553, %r31045, %r23812; + xor.b32 %r23554, %r31046, %r23813; + xor.b32 %r23530, %r31019, %r23812; + xor.b32 %r23529, %r31020, %r23813; + xor.b32 %r23473, %r31017, %r23812; + xor.b32 %r23474, %r31018, %r23813; + xor.b32 %r23561, %r31015, %r23812; + xor.b32 %r23562, %r31016, %r23813; + xor.b32 %r23490, %r31013, %r23812; + xor.b32 %r23489, %r31014, %r23813; + // begin inline asm + shf.l.wrap.b32 %r23404, %r23313, %r23312, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23408, %r23312, %r23313, %r23281; + // end inline asm + xor.b32 %r23814, %r23404, %r23348; + xor.b32 %r23815, %r23408, %r23349; + xor.b32 %r23505, %r31043, %r23814; + xor.b32 %r23506, %r31044, %r23815; + xor.b32 %r23425, %r31011, %r23814; + xor.b32 %r23426, %r31012, %r23815; + xor.b32 %r23442, %r31009, %r23814; + xor.b32 %r23441, %r31010, %r23815; + xor.b32 %r23481, %r31007, %r23814; + xor.b32 %r23482, %r31008, %r23815; + xor.b32 %r23513, %r31005, %r23814; + xor.b32 %r23514, %r31006, %r23815; + mov.u32 %r23419, 44; + // begin inline asm + shf.l.wrap.b32 %r23412, %r23418, %r23417, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23416, %r23417, %r23418, %r23419; + // end inline asm + mov.u32 %r23427, 20; + // begin inline asm + shf.l.wrap.b32 %r23420, %r23426, %r23425, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23424, %r23425, %r23426, %r23427; + // end inline asm + mov.u32 %r23435, 61; + // begin inline asm + shf.l.wrap.b32 %r23428, %r23434, %r23433, %r23435; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23432, %r23433, %r23434, %r23435; + // end inline asm + mov.u32 %r23443, 39; + // begin inline asm + shf.l.wrap.b32 %r23436, %r23442, %r23441, %r23443; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23440, %r23441, %r23442, %r23443; + // end inline asm + mov.u32 %r23451, 18; + // begin inline asm + shf.l.wrap.b32 %r23444, %r23450, %r23449, %r23451; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23448, %r23449, %r23450, %r23451; + // end inline asm + mov.u32 %r23459, 62; + // begin inline asm + shf.l.wrap.b32 %r23452, %r23458, %r23457, %r23459; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r23456, %r23457, %r23458, %r23459; + // end inline asm + mov.u32 %r23467, 43; + // begin inline asm + shf.l.wrap.b32 %r23460, %r23466, %r23465, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23464, %r23465, %r23466, %r23467; + // end inline asm + mov.u32 %r23475, 25; + // begin inline asm + shf.l.wrap.b32 %r23468, %r23474, %r23473, %r23475; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23472, %r23473, %r23474, %r23475; + // end inline asm + mov.u32 %r23483, 8; + // begin inline asm + shf.l.wrap.b32 %r23476, %r23482, %r23481, %r23483; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23480, %r23481, %r23482, %r23483; + // end inline asm + mov.u32 %r23491, 56; + // begin inline asm + shf.l.wrap.b32 %r23484, %r23490, %r23489, %r23491; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23488, %r23489, %r23490, %r23491; + // end inline asm + mov.u32 %r23499, 41; + // begin inline asm + shf.l.wrap.b32 %r23492, %r23498, %r23497, %r23499; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23496, %r23497, %r23498, %r23499; + // end inline asm + mov.u32 %r23507, 27; + // begin inline asm + shf.l.wrap.b32 %r23500, %r23506, %r23505, %r23507; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23504, %r23505, %r23506, %r23507; + // end inline asm + mov.u32 %r23515, 14; + // begin inline asm + shf.l.wrap.b32 %r23508, %r23514, %r23513, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23512, %r23513, %r23514, %r23515; + // end inline asm + mov.u32 %r23523, 2; + // begin inline asm + shf.l.wrap.b32 %r23516, %r23522, %r23521, %r23523; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23520, %r23521, %r23522, %r23523; + // end inline asm + mov.u32 %r23531, 55; + // begin inline asm + shf.l.wrap.b32 %r23524, %r23530, %r23529, %r23531; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23528, %r23529, %r23530, %r23531; + // end inline asm + mov.u32 %r23539, 45; + // begin inline asm + shf.l.wrap.b32 %r23532, %r23538, %r23537, %r23539; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23536, %r23537, %r23538, %r23539; + // end inline asm + mov.u32 %r23547, 36; + // begin inline asm + shf.l.wrap.b32 %r23540, %r23546, %r23545, %r23547; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23544, %r23545, %r23546, %r23547; + // end inline asm + mov.u32 %r23555, 28; + // begin inline asm + shf.l.wrap.b32 %r23548, %r23554, %r23553, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23552, %r23553, %r23554, %r23555; + // end inline asm + mov.u32 %r23563, 21; + // begin inline asm + shf.l.wrap.b32 %r23556, %r23562, %r23561, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23560, %r23561, %r23562, %r23563; + // end inline asm + mov.u32 %r23571, 15; + // begin inline asm + shf.l.wrap.b32 %r23564, %r23570, %r23569, %r23571; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23568, %r23569, %r23570, %r23571; + // end inline asm + mov.u32 %r23579, 10; + // begin inline asm + shf.l.wrap.b32 %r23572, %r23578, %r23577, %r23579; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23576, %r23577, %r23578, %r23579; + // end inline asm + mov.u32 %r23587, 6; + // begin inline asm + shf.l.wrap.b32 %r23580, %r23586, %r23585, %r23587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23584, %r23585, %r23586, %r23587; + // end inline asm + mov.u32 %r23595, 3; + // begin inline asm + shf.l.wrap.b32 %r23588, %r23594, 
%r23593, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23592, %r23593, %r23594, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23596, %r23602, %r23601, %r23281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23600, %r23601, %r23602, %r23281; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23604, %r23639, %r23412, %r23460, 0xD2; + lop3.b32 %r23605, %r23642, %r23416, %r23464, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31053, %r23412, %r23460, %r23556, 0xD2; + lop3.b32 %r31054, %r23416, %r23464, %r23560, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31049, %r23460, %r23556, %r23508, 0xD2; + lop3.b32 %r31050, %r23464, %r23560, %r23512, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31045, %r23556, %r23508, %r23639, 0xD2; + lop3.b32 %r31046, %r23560, %r23512, %r23642, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31043, %r23508, %r23639, %r23412, 0xD2; + lop3.b32 %r31044, %r23512, %r23642, %r23416, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31039, %r23548, %r23420, %r23588, 0xD2; + lop3.b32 %r31040, %r23552, %r23424, %r23592, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31051, %r23420, %r23588, %r23532, 0xD2; + lop3.b32 %r31052, %r23424, %r23592, %r23536, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31047, %r23588, %r23532, %r23428, 0xD2; + lop3.b32 %r31048, %r23592, %r23536, %r23432, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31019, %r23532, %r23428, %r23548, 0xD2; + lop3.b32 %r31020, %r23536, %r23432, %r23552, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r31019, %r31020}; + // begin inline asm + // chi + lop3.b32 %r31011, %r23428, %r23548, %r23420, 0xD2; + lop3.b32 %r31012, %r23432, %r23552, %r23424, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r31011, %r31012}; + // begin inline asm + // chi + lop3.b32 %r31037, %r23596, %r23580, %r23468, 0xD2; + lop3.b32 %r31038, %r23600, %r23584, %r23472, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r31037, %r31038}; + // begin inline asm + // chi + lop3.b32 %r31031, %r23580, %r23468, %r23476, 0xD2; + lop3.b32 %r31032, %r23584, %r23472, %r23480, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r31031, %r31032}; + // begin inline asm + // chi + lop3.b32 %r31025, %r23468, %r23476, %r23444, 0xD2; + lop3.b32 %r31026, %r23472, %r23480, %r23448, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r31025, %r31026}; + // begin inline asm + // chi + lop3.b32 %r31017, %r23476, %r23444, %r23596, 0xD2; + lop3.b32 %r31018, %r23480, %r23448, %r23600, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r31017, %r31018}; + // begin inline asm + // chi + lop3.b32 %r31009, %r23444, %r23596, %r23580, 0xD2; + lop3.b32 %r31010, %r23448, %r23600, %r23584, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r31009, %r31010}; + // begin inline asm + // chi + lop3.b32 %r31035, %r23500, %r23540, %r23572, 0xD2; + lop3.b32 %r31036, %r23504, %r23544, %r23576, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r31035, %r31036}; + // begin inline asm + // chi + lop3.b32 %r31029, %r23540, %r23572, %r23564, 0xD2; + lop3.b32 %r31030, %r23544, %r23576, %r23568, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r31029, %r31030}; + // begin inline asm + // chi + lop3.b32 %r31023, %r23572, %r23564, %r23484, 0xD2; + lop3.b32 %r31024, %r23576, 
%r23568, %r23488, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r31023, %r31024}; + // begin inline asm + // chi + lop3.b32 %r31015, %r23564, %r23484, %r23500, 0xD2; + lop3.b32 %r31016, %r23568, %r23488, %r23504, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r31015, %r31016}; + // begin inline asm + // chi + lop3.b32 %r31007, %r23484, %r23500, %r23540, 0xD2; + lop3.b32 %r31008, %r23488, %r23504, %r23544, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r31007, %r31008}; + // begin inline asm + // chi + lop3.b32 %r31033, %r23452, %r23524, %r23436, 0xD2; + lop3.b32 %r31034, %r23456, %r23528, %r23440, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r31033, %r31034}; + // begin inline asm + // chi + lop3.b32 %r31027, %r23524, %r23436, %r23492, 0xD2; + lop3.b32 %r31028, %r23528, %r23440, %r23496, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r31027, %r31028}; + // begin inline asm + // chi + lop3.b32 %r31021, %r23436, %r23492, %r23516, 0xD2; + lop3.b32 %r31022, %r23440, %r23496, %r23520, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r31021, %r31022}; + // begin inline asm + // chi + lop3.b32 %r31013, %r23492, %r23516, %r23452, 0xD2; + lop3.b32 %r31014, %r23496, %r23520, %r23456, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r31013, %r31014}; + // begin inline asm + // chi + lop3.b32 %r31005, %r23516, %r23452, %r23524, 0xD2; + lop3.b32 %r31006, %r23520, %r23456, %r23528, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r31005, %r31006}; + mul.wide.s32 %rd1010, %r31055, 8; + mov.u64 %rd1011, keccak_round_constants; + cvta.const.u64 %rd1012, %rd1011; + add.s64 %rd1009, %rd1012, %rd1010; + // begin inline asm + ld.global.nc.v2.u32 {%r23804,%r23805}, [%rd1009]; + // end inline asm + xor.b32 %r31041, %r23604, %r23804; + xor.b32 %r31042, %r23605, %r23805; + add.s32 %r31055, %r31055, 1; + setp.lt.u32 %p43, %r31055, 23; + @%p43 bra $L__BB2_76; + + add.u64 %rd270, %SPL, 1912; + st.local.v2.u32 [%rd2+32], {%r31053, %r31054}; + st.local.v2.u32 [%rd2+72], {%r31051, %r31052}; + st.local.v2.u32 [%rd2+40], {%r31049, %r31050}; + st.local.v2.u32 [%rd2+80], {%r31047, %r31048}; + st.local.v2.u32 [%rd2+48], {%r31045, %r31046}; + st.local.v2.u32 [%rd2+56], {%r31043, %r31044}; + st.local.v2.u32 [%rd2+24], {%r31041, %r31042}; + // begin inline asm + // xor5 + lop3.b32 %r23816, %r31041, %r31039, %r31037, 0x96; + lop3.b32 %r23816, %r23816, %r31035, %r31033, 0x96; + lop3.b32 %r23817, %r31042, %r31040, %r31038, 0x96; + lop3.b32 %r23817, %r23817, %r31036, %r31034, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23828, %r31053, %r31051, %r31031, 0x96; + lop3.b32 %r23828, %r23828, %r31029, %r31027, 0x96; + lop3.b32 %r23829, %r31054, %r31052, %r31032, 0x96; + lop3.b32 %r23829, %r23829, %r31030, %r31028, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23840, %r31049, %r31047, %r31025, 0x96; + lop3.b32 %r23840, %r23840, %r31023, %r31021, 0x96; + lop3.b32 %r23841, %r31050, %r31048, %r31026, 0x96; + lop3.b32 %r23841, %r23841, %r31024, %r31022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23852, %r31045, %r31019, %r31017, 0x96; + lop3.b32 %r23852, %r23852, %r31015, %r31013, 0x96; + lop3.b32 %r23853, %r31046, %r31020, %r31018, 0x96; + lop3.b32 %r23853, %r23853, %r31016, %r31014, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23864, %r31043, %r31011, %r31009, 0x96; + lop3.b32 %r23864, %r23864, %r31007, %r31005, 0x96; + lop3.b32 %r23865, %r31044, 
%r31012, %r31010, 0x96; + lop3.b32 %r23865, %r23865, %r31008, %r31006, 0x96; + // end inline asm + mov.u32 %r31070, 1; + // begin inline asm + shf.l.wrap.b32 %r23876, %r23829, %r23828, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23880, %r23828, %r23829, %r31070; + // end inline asm + xor.b32 %r24095, %r23876, %r23864; + xor.b32 %r24096, %r23880, %r23865; + xor.b32 %r24023, %r31041, %r24095; + xor.b32 %r24026, %r31042, %r24096; + xor.b32 %r23986, %r31038, %r24096; + xor.b32 %r23985, %r31037, %r24095; + st.local.v2.u32 [%rd2+104], {%r23985, %r23986}; + // begin inline asm + shf.l.wrap.b32 %r23884, %r23841, %r23840, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23888, %r23840, %r23841, %r31070; + // end inline asm + xor.b32 %r24097, %r23884, %r23816; + xor.b32 %r24098, %r23888, %r23817; + xor.b32 %r23922, %r31051, %r24097; + xor.b32 %r23921, %r31052, %r24098; + xor.b32 %r23961, %r31030, %r24098; + xor.b32 %r23962, %r31029, %r24097; + st.local.v2.u32 [%rd2+152], {%r23962, %r23961}; + // begin inline asm + shf.l.wrap.b32 %r23892, %r23853, %r23852, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23896, %r23852, %r23853, %r31070; + // end inline asm + xor.b32 %r24099, %r23892, %r23828; + xor.b32 %r24100, %r23896, %r23829; + xor.b32 %r23945, %r31026, %r24100; + xor.b32 %r23946, %r31025, %r24099; + st.local.v2.u32 [%rd2+120], {%r23946, %r23945}; + xor.b32 %r23937, %r31022, %r24100; + xor.b32 %r23938, %r31021, %r24099; + st.local.v2.u32 [%rd2+200], {%r23938, %r23937}; + // begin inline asm + shf.l.wrap.b32 %r23900, %r23865, %r23864, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23904, %r23864, %r23865, %r31070; + // end inline asm + xor.b32 %r24101, %r23900, %r23840; + xor.b32 %r24102, %r23904, %r23841; + xor.b32 %r23969, %r31045, %r24101; + xor.b32 %r23970, %r31046, %r24102; + xor.b32 %r23978, %r31016, %r24102; + xor.b32 %r23977, %r31015, %r24101; + st.local.v2.u32 [%rd2+168], {%r23977, %r23978}; + // begin inline asm + shf.l.wrap.b32 %r23908, %r23817, %r23816, %r31070; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23912, %r23816, %r23817, %r31070; + // end inline asm + xor.b32 %r24103, %r23908, %r23852; + xor.b32 %r24104, %r23912, %r23853; + xor.b32 %r23929, %r31011, %r24103; + xor.b32 %r23930, %r31012, %r24104; + xor.b32 %r23954, %r31006, %r24104; + xor.b32 %r23953, %r31005, %r24103; + st.local.v2.u32 [%rd2+216], {%r23953, %r23954}; + // begin inline asm + shf.l.wrap.b32 %r23916, %r23922, %r23921, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23920, %r23921, %r23922, %r23419; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23924, %r23930, %r23929, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23928, %r23929, %r23930, %r23427; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23936, %r23937, %r23938, %r23435; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23932, %r23938, %r23937, %r23435; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r23932, %r23936}; + // begin inline asm + shf.l.wrap.b32 %r23940, %r23946, %r23945, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23944, %r23945, %r23946, %r23467; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23948, %r23954, %r23953, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23952, %r23953, %r23954, %r23515; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23960, %r23961, %r23962, %r23539; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r23956, %r23962, %r23961, %r23539; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r23956, %r23960}; + // begin inline asm + shf.l.wrap.b32 %r23964, %r23970, %r23969, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23968, %r23969, %r23970, %r23555; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23972, %r23978, %r23977, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23976, %r23977, %r23978, %r23563; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23980, %r23986, %r23985, %r23595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23984, %r23985, %r23986, %r23595; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23988, %r24023, %r23916, %r23940, 0xD2; + lop3.b32 %r23989, %r24026, %r23920, %r23944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31188, %r23916, %r23940, %r23972, 0xD2; + lop3.b32 %r31189, %r23920, %r23944, %r23976, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r31188, %r31189}; + // begin inline asm + // chi + lop3.b32 %r31184, %r23940, %r23972, %r23948, 0xD2; + lop3.b32 %r31185, %r23944, %r23976, %r23952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r31184, %r31185}; + // begin inline asm + // chi + lop3.b32 %r31180, %r23972, %r23948, %r24023, 0xD2; + lop3.b32 %r31181, %r23976, %r23952, %r24026, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r31180, %r31181}; + // begin inline asm + // chi + lop3.b32 %r31178, %r23948, %r24023, %r23916, 0xD2; + lop3.b32 %r31179, %r23952, %r24026, %r23920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r31178, %r31179}; + // begin inline asm + // chi + lop3.b32 %r31174, %r23964, %r23924, %r23980, 0xD2; + lop3.b32 %r31175, %r23968, %r23928, %r23984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r31174, %r31175}; + // begin inline asm + // chi + lop3.b32 %r31186, %r23924, %r23980, %r23956, 0xD2; + lop3.b32 %r31187, %r23928, %r23984, %r23960, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r31186, %r31187}; + // begin inline asm + // chi + lop3.b32 %r31182, %r23980, %r23956, %r23932, 0xD2; + lop3.b32 %r31183, %r23984, %r23960, %r23936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r31182, %r31183}; + add.s64 %rd1013, %rd1012, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r24052,%r24053}, [%rd1013]; + // end inline asm + xor.b32 %r31176, %r23988, %r24052; + xor.b32 %r31177, %r23989, %r24053; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + st.local.u64 [%rd270], %rd361; + mov.u64 %rd1017, 1179641; + st.local.u64 [%rd270+8], %rd1017; + st.local.u32 [%rd270+16], %r3344; + ld.global.u64 %rd1018, [%rd221]; + ld.global.u64 %rd1019, [%rd221+8]; + ld.global.u64 %rd1020, [%rd221+16]; + ld.global.u64 %rd1021, [%rd221+24]; + ld.global.u64 %rd1022, [%rd221+32]; + ld.global.u64 %rd1023, [%rd221+40]; + ld.global.u64 %rd1024, [%rd221+48]; + ld.global.u64 %rd1025, [%rd221+56]; + st.local.u64 [%rd270+32], %rd1019; + st.local.u64 [%rd270+40], %rd1020; + st.local.u64 [%rd270+48], %rd1021; + st.local.u64 [%rd270+56], %rd1022; + st.local.u64 [%rd270+64], %rd1023; + st.local.u64 [%rd270+72], %rd1024; + st.local.u64 [%rd270+80], %rd1025; + cvt.u32.u64 %r24105, %rd1018; + xor.b32 %r24106, %r3344, %r24105; + st.local.u64 [%rd270+24], %rd1018; + st.local.u32 [%rd270+24], %r24106; + mov.u32 %r31056, 0; + st.local.v2.u32 [%rd270+96], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+104], {%r31056, %r31056}; + 
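+ // second message for this thread (index %r3344 = 2*%r46 + 1): the rest of the
+ // scratch state at %rd270 is zeroed below, mirroring the %rd2 setup above.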
st.local.v2.u32 [%rd270+112], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+120], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+128], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+136], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+144], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+152], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+160], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+168], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+176], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+184], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+192], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+200], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+208], {%r31056, %r31056}; + st.local.v2.u32 [%rd270+216], {%r31056, %r31056}; + mov.u32 %r31071, -2147483648; + st.local.v2.u32 [%rd270+88], {%r31070, %r31071}; + ld.local.v2.u32 {%r31092, %r31093}, [%rd270+24]; + mov.b64 {%r31090, %r31091}, %rd1023; + shr.u64 %rd1026, %rd1019, 32; + cvt.u32.u64 %r31104, %rd1019; + cvt.u32.u64 %r31105, %rd1026; + shr.u64 %rd1027, %rd1024, 32; + cvt.u32.u64 %r31102, %rd1024; + cvt.u32.u64 %r31103, %rd1027; + shr.u64 %rd1028, %rd1020, 32; + cvt.u32.u64 %r31100, %rd1020; + cvt.u32.u64 %r31101, %rd1028; + shr.u64 %rd1029, %rd1025, 32; + cvt.u32.u64 %r31098, %rd1025; + cvt.u32.u64 %r31099, %rd1029; + shr.u64 %rd1030, %rd1021, 32; + cvt.u32.u64 %r31096, %rd1021; + cvt.u32.u64 %r31097, %rd1030; + shr.u64 %rd1031, %rd1022, 32; + cvt.u32.u64 %r31094, %rd1022; + cvt.u32.u64 %r31095, %rd1031; + mov.u32 %r31057, %r31056; + mov.u32 %r31058, %r31056; + mov.u32 %r31059, %r31056; + mov.u32 %r31060, %r31056; + mov.u32 %r31061, %r31056; + mov.u32 %r31062, %r31056; + mov.u32 %r31063, %r31056; + mov.u32 %r31064, %r31056; + mov.u32 %r31065, %r31056; + mov.u32 %r31066, %r31056; + mov.u32 %r31067, %r31056; + mov.u32 %r31068, %r31056; + mov.u32 %r31069, %r31056; + mov.u32 %r31072, %r31056; + mov.u32 %r31073, %r31056; + mov.u32 %r31074, %r31056; + mov.u32 %r31075, %r31056; + mov.u32 %r31076, %r31056; + mov.u32 %r31077, %r31056; + mov.u32 %r31078, %r31056; + mov.u32 %r31079, %r31056; + mov.u32 %r31080, %r31056; + mov.u32 %r31081, %r31056; + mov.u32 %r31082, %r31056; + mov.u32 %r31083, %r31056; + mov.u32 %r31084, %r31056; + mov.u32 %r31085, %r31056; + mov.u32 %r31086, %r31056; + mov.u32 %r31087, %r31056; + mov.u32 %r31088, %r31056; + mov.u32 %r31089, %r31056; + mov.u32 %r31106, %r31056; + +$L__BB2_78: + mov.u32 %r29797, 1; + mov.u64 %rd1296, keccak_round_constants; + cvta.const.u64 %rd1295, %rd1296; + // begin inline asm + // xor5 + lop3.b32 %r24109, %r31092, %r31090, %r31088, 0x96; + lop3.b32 %r24109, %r24109, %r31086, %r31084, 0x96; + lop3.b32 %r24110, %r31093, %r31091, %r31089, 0x96; + lop3.b32 %r24110, %r24110, %r31087, %r31085, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24121, %r31104, %r31102, %r31082, 0x96; + lop3.b32 %r24121, %r24121, %r31080, %r31078, 0x96; + lop3.b32 %r24122, %r31105, %r31103, %r31083, 0x96; + lop3.b32 %r24122, %r24122, %r31081, %r31079, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24133, %r31100, %r31098, %r31076, 0x96; + lop3.b32 %r24133, %r24133, %r31074, %r31072, 0x96; + lop3.b32 %r24134, %r31101, %r31099, %r31077, 0x96; + lop3.b32 %r24134, %r24134, %r31075, %r31073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24145, %r31096, %r31070, %r31068, 0x96; + lop3.b32 %r24145, %r24145, %r31066, %r31064, 0x96; + lop3.b32 %r24146, %r31097, %r31071, %r31069, 0x96; + lop3.b32 %r24146, %r24146, %r31067, %r31065, 0x96; + // end inline asm + // begin 
inline asm + // xor5 + lop3.b32 %r24157, %r31094, %r31062, %r31060, 0x96; + lop3.b32 %r24157, %r24157, %r31058, %r31056, 0x96; + lop3.b32 %r24158, %r31095, %r31063, %r31061, 0x96; + lop3.b32 %r24158, %r24158, %r31059, %r31057, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24169, %r24122, %r24121, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24173, %r24121, %r24122, %r29797; + // end inline asm + xor.b32 %r24603, %r24169, %r24157; + xor.b32 %r24604, %r24173, %r24158; + xor.b32 %r24436, %r31092, %r24603; + xor.b32 %r24439, %r31093, %r24604; + xor.b32 %r24343, %r31090, %r24603; + xor.b32 %r24342, %r31091, %r24604; + xor.b32 %r24390, %r31088, %r24603; + xor.b32 %r24391, %r31089, %r24604; + xor.b32 %r24295, %r31086, %r24603; + xor.b32 %r24294, %r31087, %r24604; + xor.b32 %r24246, %r31084, %r24603; + xor.b32 %r24247, %r31085, %r24604; + // begin inline asm + shf.l.wrap.b32 %r24177, %r24134, %r24133, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24181, %r24133, %r24134, %r29797; + // end inline asm + xor.b32 %r24605, %r24177, %r24109; + xor.b32 %r24606, %r24181, %r24110; + xor.b32 %r24398, %r31104, %r24605; + xor.b32 %r24399, %r31105, %r24606; + xor.b32 %r24215, %r31102, %r24605; + xor.b32 %r24214, %r31103, %r24606; + xor.b32 %r24374, %r31082, %r24605; + xor.b32 %r24375, %r31083, %r24606; + xor.b32 %r24335, %r31080, %r24605; + xor.b32 %r24334, %r31081, %r24606; + xor.b32 %r24318, %r31078, %r24605; + xor.b32 %r24319, %r31079, %r24606; + // begin inline asm + shf.l.wrap.b32 %r24185, %r24146, %r24145, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24189, %r24145, %r24146, %r29797; + // end inline asm + xor.b32 %r24607, %r24185, %r24121; + xor.b32 %r24608, %r24189, %r24122; + xor.b32 %r24255, %r31100, %r24607; + xor.b32 %r24254, %r31101, %r24608; + xor.b32 %r24382, %r31098, %r24607; + xor.b32 %r24383, %r31099, %r24608; + xor.b32 %r24263, %r31076, %r24607; + xor.b32 %r24262, %r31077, %r24608; + xor.b32 %r24366, %r31074, %r24607; + xor.b32 %r24367, %r31075, %r24608; + xor.b32 %r24231, %r31072, %r24607; + xor.b32 %r24230, %r31073, %r24608; + // begin inline asm + shf.l.wrap.b32 %r24193, %r24158, %r24157, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24197, %r24157, %r24158, %r29797; + // end inline asm + xor.b32 %r24609, %r24193, %r24133; + xor.b32 %r24610, %r24197, %r24134; + xor.b32 %r24350, %r31096, %r24609; + xor.b32 %r24351, %r31097, %r24610; + xor.b32 %r24327, %r31070, %r24609; + xor.b32 %r24326, %r31071, %r24610; + xor.b32 %r24270, %r31068, %r24609; + xor.b32 %r24271, %r31069, %r24610; + xor.b32 %r24358, %r31066, %r24609; + xor.b32 %r24359, %r31067, %r24610; + xor.b32 %r24287, %r31064, %r24609; + xor.b32 %r24286, %r31065, %r24610; + // begin inline asm + shf.l.wrap.b32 %r24201, %r24110, %r24109, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24205, %r24109, %r24110, %r29797; + // end inline asm + xor.b32 %r24611, %r24201, %r24145; + xor.b32 %r24612, %r24205, %r24146; + xor.b32 %r24302, %r31094, %r24611; + xor.b32 %r24303, %r31095, %r24612; + xor.b32 %r24222, %r31062, %r24611; + xor.b32 %r24223, %r31063, %r24612; + xor.b32 %r24239, %r31060, %r24611; + xor.b32 %r24238, %r31061, %r24612; + xor.b32 %r24278, %r31058, %r24611; + xor.b32 %r24279, %r31059, %r24612; + xor.b32 %r24310, %r31056, %r24611; + xor.b32 %r24311, %r31057, %r24612; + mov.u32 %r24216, 44; + // begin inline asm + shf.l.wrap.b32 %r24209, %r24215, %r24214, %r24216; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r24213, %r24214, %r24215, %r24216; + // end inline asm + mov.u32 %r24224, 20; + // begin inline asm + shf.l.wrap.b32 %r24217, %r24223, %r24222, %r24224; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24221, %r24222, %r24223, %r24224; + // end inline asm + mov.u32 %r24232, 61; + // begin inline asm + shf.l.wrap.b32 %r24225, %r24231, %r24230, %r24232; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24229, %r24230, %r24231, %r24232; + // end inline asm + mov.u32 %r24240, 39; + // begin inline asm + shf.l.wrap.b32 %r24233, %r24239, %r24238, %r24240; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24237, %r24238, %r24239, %r24240; + // end inline asm + mov.u32 %r24248, 18; + // begin inline asm + shf.l.wrap.b32 %r24241, %r24247, %r24246, %r24248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24245, %r24246, %r24247, %r24248; + // end inline asm + mov.u32 %r24256, 62; + // begin inline asm + shf.l.wrap.b32 %r24249, %r24255, %r24254, %r24256; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24253, %r24254, %r24255, %r24256; + // end inline asm + mov.u32 %r24264, 43; + // begin inline asm + shf.l.wrap.b32 %r24257, %r24263, %r24262, %r24264; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24261, %r24262, %r24263, %r24264; + // end inline asm + mov.u32 %r24272, 25; + // begin inline asm + shf.l.wrap.b32 %r24265, %r24271, %r24270, %r24272; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24269, %r24270, %r24271, %r24272; + // end inline asm + mov.u32 %r24280, 8; + // begin inline asm + shf.l.wrap.b32 %r24273, %r24279, %r24278, %r24280; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24277, %r24278, %r24279, %r24280; + // end inline asm + mov.u32 %r24288, 56; + // begin inline asm + shf.l.wrap.b32 %r24281, %r24287, %r24286, %r24288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24285, %r24286, %r24287, %r24288; + // end inline asm + mov.u32 %r24296, 41; + // begin inline asm + shf.l.wrap.b32 %r24289, %r24295, %r24294, %r24296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24293, %r24294, %r24295, %r24296; + // end inline asm + mov.u32 %r24304, 27; + // begin inline asm + shf.l.wrap.b32 %r24297, %r24303, %r24302, %r24304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24301, %r24302, %r24303, %r24304; + // end inline asm + mov.u32 %r24312, 14; + // begin inline asm + shf.l.wrap.b32 %r24305, %r24311, %r24310, %r24312; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24309, %r24310, %r24311, %r24312; + // end inline asm + mov.u32 %r24320, 2; + // begin inline asm + shf.l.wrap.b32 %r24313, %r24319, %r24318, %r24320; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24317, %r24318, %r24319, %r24320; + // end inline asm + mov.u32 %r24328, 55; + // begin inline asm + shf.l.wrap.b32 %r24321, %r24327, %r24326, %r24328; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24325, %r24326, %r24327, %r24328; + // end inline asm + mov.u32 %r24336, 45; + // begin inline asm + shf.l.wrap.b32 %r24329, %r24335, %r24334, %r24336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24333, %r24334, %r24335, %r24336; + // end inline asm + mov.u32 %r24344, 36; + // begin inline asm + shf.l.wrap.b32 %r24337, %r24343, %r24342, %r24344; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24341, %r24342, %r24343, %r24344; + // end inline asm + mov.u32 %r24352, 28; + // begin inline asm + shf.l.wrap.b32 
%r24345, %r24351, %r24350, %r24352; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24349, %r24350, %r24351, %r24352; + // end inline asm + mov.u32 %r24360, 21; + // begin inline asm + shf.l.wrap.b32 %r24353, %r24359, %r24358, %r24360; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24357, %r24358, %r24359, %r24360; + // end inline asm + mov.u32 %r24368, 15; + // begin inline asm + shf.l.wrap.b32 %r24361, %r24367, %r24366, %r24368; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24365, %r24366, %r24367, %r24368; + // end inline asm + mov.u32 %r24376, 10; + // begin inline asm + shf.l.wrap.b32 %r24369, %r24375, %r24374, %r24376; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24373, %r24374, %r24375, %r24376; + // end inline asm + mov.u32 %r24384, 6; + // begin inline asm + shf.l.wrap.b32 %r24377, %r24383, %r24382, %r24384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24381, %r24382, %r24383, %r24384; + // end inline asm + mov.u32 %r24392, 3; + // begin inline asm + shf.l.wrap.b32 %r24385, %r24391, %r24390, %r24392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24389, %r24390, %r24391, %r24392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24393, %r24399, %r24398, %r29797; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24397, %r24398, %r24399, %r29797; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24401, %r24436, %r24209, %r24257, 0xD2; + lop3.b32 %r24402, %r24439, %r24213, %r24261, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31104, %r24209, %r24257, %r24353, 0xD2; + lop3.b32 %r31105, %r24213, %r24261, %r24357, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31100, %r24257, %r24353, %r24305, 0xD2; + lop3.b32 %r31101, %r24261, %r24357, %r24309, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31096, %r24353, %r24305, %r24436, 0xD2; + lop3.b32 %r31097, %r24357, %r24309, %r24439, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31094, %r24305, %r24436, %r24209, 0xD2; + lop3.b32 %r31095, %r24309, %r24439, %r24213, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31090, %r24345, %r24217, %r24385, 0xD2; + lop3.b32 %r31091, %r24349, %r24221, %r24389, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31102, %r24217, %r24385, %r24329, 0xD2; + lop3.b32 %r31103, %r24221, %r24389, %r24333, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31098, %r24385, %r24329, %r24225, 0xD2; + lop3.b32 %r31099, %r24389, %r24333, %r24229, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31070, %r24329, %r24225, %r24345, 0xD2; + lop3.b32 %r31071, %r24333, %r24229, %r24349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r31070, %r31071}; + // begin inline asm + // chi + lop3.b32 %r31062, %r24225, %r24345, %r24217, 0xD2; + lop3.b32 %r31063, %r24229, %r24349, %r24221, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r31062, %r31063}; + // begin inline asm + // chi + lop3.b32 %r31088, %r24393, %r24377, %r24265, 0xD2; + lop3.b32 %r31089, %r24397, %r24381, %r24269, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+104], {%r31088, %r31089}; + // begin inline asm + // chi + lop3.b32 %r31082, %r24377, %r24265, %r24273, 0xD2; + lop3.b32 %r31083, %r24381, %r24269, %r24277, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+112], {%r31082, %r31083}; + // begin inline asm + // chi + lop3.b32 %r31076, %r24265, 
%r24273, %r24241, 0xD2; + lop3.b32 %r31077, %r24269, %r24277, %r24245, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+120], {%r31076, %r31077}; + // begin inline asm + // chi + lop3.b32 %r31068, %r24273, %r24241, %r24393, 0xD2; + lop3.b32 %r31069, %r24277, %r24245, %r24397, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+128], {%r31068, %r31069}; + // begin inline asm + // chi + lop3.b32 %r31060, %r24241, %r24393, %r24377, 0xD2; + lop3.b32 %r31061, %r24245, %r24397, %r24381, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+136], {%r31060, %r31061}; + // begin inline asm + // chi + lop3.b32 %r31086, %r24297, %r24337, %r24369, 0xD2; + lop3.b32 %r31087, %r24301, %r24341, %r24373, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+144], {%r31086, %r31087}; + // begin inline asm + // chi + lop3.b32 %r31080, %r24337, %r24369, %r24361, 0xD2; + lop3.b32 %r31081, %r24341, %r24373, %r24365, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+152], {%r31080, %r31081}; + // begin inline asm + // chi + lop3.b32 %r31074, %r24369, %r24361, %r24281, 0xD2; + lop3.b32 %r31075, %r24373, %r24365, %r24285, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+160], {%r31074, %r31075}; + // begin inline asm + // chi + lop3.b32 %r31066, %r24361, %r24281, %r24297, 0xD2; + lop3.b32 %r31067, %r24365, %r24285, %r24301, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+168], {%r31066, %r31067}; + // begin inline asm + // chi + lop3.b32 %r31058, %r24281, %r24297, %r24337, 0xD2; + lop3.b32 %r31059, %r24285, %r24301, %r24341, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+176], {%r31058, %r31059}; + // begin inline asm + // chi + lop3.b32 %r31084, %r24249, %r24321, %r24233, 0xD2; + lop3.b32 %r31085, %r24253, %r24325, %r24237, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+184], {%r31084, %r31085}; + // begin inline asm + // chi + lop3.b32 %r31078, %r24321, %r24233, %r24289, 0xD2; + lop3.b32 %r31079, %r24325, %r24237, %r24293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+192], {%r31078, %r31079}; + // begin inline asm + // chi + lop3.b32 %r31072, %r24233, %r24289, %r24313, 0xD2; + lop3.b32 %r31073, %r24237, %r24293, %r24317, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+200], {%r31072, %r31073}; + // begin inline asm + // chi + lop3.b32 %r31064, %r24289, %r24313, %r24249, 0xD2; + lop3.b32 %r31065, %r24293, %r24317, %r24253, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+208], {%r31064, %r31065}; + // begin inline asm + // chi + lop3.b32 %r31056, %r24313, %r24249, %r24321, 0xD2; + lop3.b32 %r31057, %r24317, %r24253, %r24325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+216], {%r31056, %r31057}; + mul.wide.s32 %rd1033, %r31106, 8; + add.s64 %rd1032, %rd1295, %rd1033; + // begin inline asm + ld.global.nc.v2.u32 {%r24601,%r24602}, [%rd1032]; + // end inline asm + xor.b32 %r31092, %r24401, %r24601; + xor.b32 %r31093, %r24402, %r24602; + add.s32 %r31106, %r31106, 1; + setp.lt.u32 %p44, %r31106, 23; + @%p44 bra $L__BB2_78; + + mov.u64 %rd1284, keccak_round_constants; + cvta.const.u64 %rd1283, %rd1284; + add.s64 %rd1282, %rd1283, 184; + mov.u32 %r29795, 3; + mov.u32 %r29794, 21; + mov.u32 %r29793, 28; + mov.u32 %r29792, 45; + mov.u32 %r29791, 14; + mov.u32 %r29790, 43; + mov.u32 %r29789, 61; + mov.u32 %r29788, 20; + mov.u32 %r29787, 44; + mov.u32 %r31139, 0; + mov.u32 %r24712, 1; + st.local.v2.u32 [%rd270+32], {%r31104, %r31105}; + st.local.v2.u32 [%rd270+72], {%r31102, %r31103}; + st.local.v2.u32 [%rd270+40], {%r31100, %r31101}; + st.local.v2.u32 [%rd270+80], {%r31098, %r31099}; + 
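+ // peeled final round of the second instance; its iota constant is RC[23],
+ // taken from keccak_round_constants + 184 (23 * 8 bytes).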
st.local.v2.u32 [%rd270+48], {%r31096, %r31097}; + st.local.v2.u32 [%rd270+56], {%r31094, %r31095}; + st.local.v2.u32 [%rd270+24], {%r31092, %r31093}; + // begin inline asm + // xor5 + lop3.b32 %r24613, %r31092, %r31090, %r31088, 0x96; + lop3.b32 %r24613, %r24613, %r31086, %r31084, 0x96; + lop3.b32 %r24614, %r31093, %r31091, %r31089, 0x96; + lop3.b32 %r24614, %r24614, %r31087, %r31085, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24625, %r31104, %r31102, %r31082, 0x96; + lop3.b32 %r24625, %r24625, %r31080, %r31078, 0x96; + lop3.b32 %r24626, %r31105, %r31103, %r31083, 0x96; + lop3.b32 %r24626, %r24626, %r31081, %r31079, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24637, %r31100, %r31098, %r31076, 0x96; + lop3.b32 %r24637, %r24637, %r31074, %r31072, 0x96; + lop3.b32 %r24638, %r31101, %r31099, %r31077, 0x96; + lop3.b32 %r24638, %r24638, %r31075, %r31073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24649, %r31096, %r31070, %r31068, 0x96; + lop3.b32 %r24649, %r24649, %r31066, %r31064, 0x96; + lop3.b32 %r24650, %r31097, %r31071, %r31069, 0x96; + lop3.b32 %r24650, %r24650, %r31067, %r31065, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24661, %r31094, %r31062, %r31060, 0x96; + lop3.b32 %r24661, %r24661, %r31058, %r31056, 0x96; + lop3.b32 %r24662, %r31095, %r31063, %r31061, 0x96; + lop3.b32 %r24662, %r24662, %r31059, %r31057, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24673, %r24626, %r24625, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24677, %r24625, %r24626, %r24712; + // end inline asm + xor.b32 %r24852, %r24673, %r24661; + xor.b32 %r24853, %r24677, %r24662; + xor.b32 %r24820, %r31092, %r24852; + xor.b32 %r24823, %r31093, %r24853; + xor.b32 %r24783, %r31089, %r24853; + xor.b32 %r24782, %r31088, %r24852; + st.local.v2.u32 [%rd270+104], {%r24782, %r24783}; + // begin inline asm + shf.l.wrap.b32 %r24681, %r24638, %r24637, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24685, %r24637, %r24638, %r24712; + // end inline asm + xor.b32 %r24854, %r24681, %r24613; + xor.b32 %r24855, %r24685, %r24614; + xor.b32 %r24719, %r31102, %r24854; + xor.b32 %r24718, %r31103, %r24855; + xor.b32 %r24758, %r31081, %r24855; + xor.b32 %r24759, %r31080, %r24854; + st.local.v2.u32 [%rd270+152], {%r24759, %r24758}; + // begin inline asm + shf.l.wrap.b32 %r24689, %r24650, %r24649, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24693, %r24649, %r24650, %r24712; + // end inline asm + xor.b32 %r24856, %r24689, %r24625; + xor.b32 %r24857, %r24693, %r24626; + xor.b32 %r24742, %r31077, %r24857; + xor.b32 %r24743, %r31076, %r24856; + st.local.v2.u32 [%rd270+120], {%r24743, %r24742}; + xor.b32 %r24734, %r31073, %r24857; + xor.b32 %r24735, %r31072, %r24856; + st.local.v2.u32 [%rd270+200], {%r24735, %r24734}; + // begin inline asm + shf.l.wrap.b32 %r24697, %r24662, %r24661, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24701, %r24661, %r24662, %r24712; + // end inline asm + xor.b32 %r24858, %r24697, %r24637; + xor.b32 %r24859, %r24701, %r24638; + xor.b32 %r24766, %r31096, %r24858; + xor.b32 %r24767, %r31097, %r24859; + xor.b32 %r24775, %r31067, %r24859; + xor.b32 %r24774, %r31066, %r24858; + st.local.v2.u32 [%rd270+168], {%r24774, %r24775}; + // begin inline asm + shf.l.wrap.b32 %r24705, %r24614, %r24613, %r24712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24709, %r24613, %r24614, %r24712; + // 
end inline asm + xor.b32 %r24860, %r24705, %r24649; + xor.b32 %r24861, %r24709, %r24650; + xor.b32 %r24726, %r31062, %r24860; + xor.b32 %r24727, %r31063, %r24861; + xor.b32 %r24751, %r31057, %r24861; + xor.b32 %r24750, %r31056, %r24860; + st.local.v2.u32 [%rd270+216], {%r24750, %r24751}; + // begin inline asm + shf.l.wrap.b32 %r24713, %r24719, %r24718, %r29787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24717, %r24718, %r24719, %r29787; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24721, %r24727, %r24726, %r29788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24725, %r24726, %r24727, %r29788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24733, %r24734, %r24735, %r29789; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24729, %r24735, %r24734, %r29789; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r24729, %r24733}; + // begin inline asm + shf.l.wrap.b32 %r24737, %r24743, %r24742, %r29790; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24741, %r24742, %r24743, %r29790; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24745, %r24751, %r24750, %r29791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24749, %r24750, %r24751, %r29791; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24757, %r24758, %r24759, %r29792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24753, %r24759, %r24758, %r29792; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r24753, %r24757}; + // begin inline asm + shf.l.wrap.b32 %r24761, %r24767, %r24766, %r29793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24765, %r24766, %r24767, %r29793; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24769, %r24775, %r24774, %r29794; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24773, %r24774, %r24775, %r29794; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24777, %r24783, %r24782, %r29795; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24781, %r24782, %r24783, %r29795; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24785, %r24820, %r24713, %r24737, 0xD2; + lop3.b32 %r24786, %r24823, %r24717, %r24741, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31239, %r24713, %r24737, %r24769, 0xD2; + lop3.b32 %r31240, %r24717, %r24741, %r24773, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+32], {%r31239, %r31240}; + // begin inline asm + // chi + lop3.b32 %r31235, %r24737, %r24769, %r24745, 0xD2; + lop3.b32 %r31236, %r24741, %r24773, %r24749, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+40], {%r31235, %r31236}; + // begin inline asm + // chi + lop3.b32 %r31231, %r24769, %r24745, %r24820, 0xD2; + lop3.b32 %r31232, %r24773, %r24749, %r24823, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+48], {%r31231, %r31232}; + // begin inline asm + // chi + lop3.b32 %r31229, %r24745, %r24820, %r24713, 0xD2; + lop3.b32 %r31230, %r24749, %r24823, %r24717, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+56], {%r31229, %r31230}; + // begin inline asm + // chi + lop3.b32 %r31225, %r24761, %r24721, %r24777, 0xD2; + lop3.b32 %r31226, %r24765, %r24725, %r24781, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+64], {%r31225, %r31226}; + // begin inline asm + // chi + lop3.b32 %r31237, %r24721, %r24777, %r24753, 0xD2; + lop3.b32 %r31238, %r24725, %r24781, %r24757, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+72], {%r31237, %r31238}; + // begin inline asm + // chi + lop3.b32 
%r31233, %r24777, %r24753, %r24729, 0xD2; + lop3.b32 %r31234, %r24781, %r24757, %r24733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+80], {%r31233, %r31234}; + // begin inline asm + ld.global.nc.v2.u32 {%r24849,%r24850}, [%rd1282]; + // end inline asm + xor.b32 %r31227, %r24785, %r24849; + xor.b32 %r31228, %r24786, %r24850; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + add.s64 %rd273, %rd270, 24; + add.s64 %rd274, %rd2, 24; + +$L__BB2_80: + or.b32 %r29796, %r3343, 1; + cvta.to.global.u64 %rd1267, %rd361; + shl.b32 %r24862, %r31139, 2; + cvt.u64.u32 %rd1041, %r24862; + and.b64 %rd1042, %rd1041, 60; + add.s64 %rd1043, %rd274, %rd1042; + xor.b32 %r24863, %r3343, %r31139; + mul.lo.s32 %r24864, %r24863, 16777619; + ld.local.u32 %r24865, [%rd1043]; + xor.b32 %r24866, %r24864, %r24865; + mul.wide.u32 %rd1044, %r24866, -954391867; + shr.u64 %rd1045, %rd1044, 32; + cvt.u32.u64 %r24867, %rd1045; + sub.s32 %r24868, %r24866, %r24867; + shr.u32 %r24869, %r24868, 1; + add.s32 %r24870, %r24869, %r24867; + shr.u32 %r24871, %r24870, 20; + mul.lo.s32 %r24872, %r24871, 1179641; + sub.s32 %r24873, %r24866, %r24872; + mul.wide.u32 %rd1046, %r24873, 64; + add.s64 %rd1047, %rd1267, %rd1046; + mul.lo.s32 %r24874, %r31176, 16777619; + ld.global.u32 %r24875, [%rd1047]; + xor.b32 %r31176, %r24874, %r24875; + mul.lo.s32 %r24876, %r31177, 16777619; + ld.global.u32 %r24877, [%rd1047+4]; + xor.b32 %r31177, %r24876, %r24877; + mul.lo.s32 %r24878, %r31188, 16777619; + ld.global.u32 %r24879, [%rd1047+8]; + mul.lo.s32 %r24880, %r31189, 16777619; + ld.global.u32 %r24881, [%rd1047+12]; + xor.b32 %r24882, %r24880, %r24881; + xor.b32 %r31188, %r24878, %r24879; + mov.b64 %rd1048, {%r31188, %r24882}; + mul.lo.s32 %r24883, %r31184, 16777619; + ld.global.u32 %r24884, [%rd1047+16]; + mul.lo.s32 %r24885, %r31185, 16777619; + ld.global.u32 %r24886, [%rd1047+20]; + xor.b32 %r24887, %r24885, %r24886; + xor.b32 %r31184, %r24883, %r24884; + mov.b64 %rd1049, {%r31184, %r24887}; + mul.lo.s32 %r24888, %r31180, 16777619; + ld.global.u32 %r24889, [%rd1047+24]; + mul.lo.s32 %r24890, %r31181, 16777619; + ld.global.u32 %r24891, [%rd1047+28]; + xor.b32 %r24892, %r24890, %r24891; + xor.b32 %r31180, %r24888, %r24889; + mov.b64 %rd1050, {%r31180, %r24892}; + mul.lo.s32 %r24893, %r31178, 16777619; + ld.global.u32 %r24894, [%rd1047+32]; + mul.lo.s32 %r24895, %r31179, 16777619; + ld.global.u32 %r24896, [%rd1047+36]; + xor.b32 %r24897, %r24895, %r24896; + xor.b32 %r31178, %r24893, %r24894; + mov.b64 %rd1051, {%r31178, %r24897}; + mul.lo.s32 %r24898, %r31174, 16777619; + ld.global.u32 %r24899, [%rd1047+40]; + xor.b32 %r31174, %r24898, %r24899; + mul.lo.s32 %r24900, %r31175, 16777619; + ld.global.u32 %r24901, [%rd1047+44]; + xor.b32 %r31175, %r24900, %r24901; + mul.lo.s32 %r24902, %r31186, 16777619; + ld.global.u32 %r24903, [%rd1047+48]; + mul.lo.s32 %r24904, %r31187, 16777619; + ld.global.u32 %r24905, [%rd1047+52]; + xor.b32 %r24906, %r24904, %r24905; + xor.b32 %r31186, %r24902, %r24903; + mov.b64 %rd1052, {%r31186, %r24906}; + mul.lo.s32 %r24907, %r31182, 16777619; + ld.global.u32 %r24908, [%rd1047+56]; + mul.lo.s32 %r24909, %r31183, 16777619; + ld.global.u32 %r24910, [%rd1047+60]; + xor.b32 %r24911, %r24909, %r24910; + xor.b32 %r31182, %r24907, %r24908; + mov.b64 %rd1053, {%r31182, %r24911}; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + st.local.v2.u32 [%rd2+32], {%r31188, %r24882}; + st.local.v2.u32 [%rd2+40], {%r31184, %r24887}; + st.local.v2.u32 [%rd2+48], {%r31180, %r24892}; + st.local.v2.u32 [%rd2+56], {%r31178, %r24897}; + 
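Editor's note: the loop at $L__BB2_80 appears to mix the local state with words from a global table: each 32-bit lane is multiplied by 16777619 (the 32-bit FNV prime) and XORed with the loaded word, and the table index is first reduced modulo 1179641 via a strength-reduced division (mul.wide.u32 by a precomputed reciprocal, a shift/add fix-up, then mul.lo/sub for the remainder). A hedged CUDA sketch of the scalar operations; identifier names are illustrative, not from the source:

    #include <cstdint>

    // One lane of the mixing step: h = (h * FNV_PRIME) ^ table_word.
    __device__ __forceinline__ uint32_t fnv_mix(uint32_t h, uint32_t w) {
        return (h * 16777619u) ^ w;
    }

    // The index reduction that the PTX open-codes with a magic-number divide.
    __device__ __forceinline__ uint32_t reduce_index(uint32_t x) {
        return x % 1179641u;
    }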
st.local.v2.u32 [%rd2+64], {%r31174, %r31175}; + st.local.v2.u32 [%rd2+72], {%r31186, %r24906}; + st.local.v2.u32 [%rd2+80], {%r31182, %r24911}; + add.s64 %rd1054, %rd273, %rd1042; + xor.b32 %r24912, %r29796, %r31139; + mul.lo.s32 %r24913, %r24912, 16777619; + ld.local.u32 %r24914, [%rd1054]; + xor.b32 %r24915, %r24913, %r24914; + mul.wide.u32 %rd1055, %r24915, -954391867; + shr.u64 %rd1056, %rd1055, 32; + cvt.u32.u64 %r24916, %rd1056; + sub.s32 %r24917, %r24915, %r24916; + shr.u32 %r24918, %r24917, 1; + add.s32 %r24919, %r24918, %r24916; + shr.u32 %r24920, %r24919, 20; + mul.lo.s32 %r24921, %r24920, 1179641; + sub.s32 %r24922, %r24915, %r24921; + mul.wide.u32 %rd1057, %r24922, 64; + add.s64 %rd1058, %rd1267, %rd1057; + mul.lo.s32 %r24923, %r31227, 16777619; + ld.global.u32 %r24924, [%rd1058]; + xor.b32 %r31227, %r24923, %r24924; + mul.lo.s32 %r24925, %r31228, 16777619; + ld.global.u32 %r24926, [%rd1058+4]; + xor.b32 %r31228, %r24925, %r24926; + mul.lo.s32 %r24927, %r31239, 16777619; + ld.global.u32 %r24928, [%rd1058+8]; + mul.lo.s32 %r24929, %r31240, 16777619; + ld.global.u32 %r24930, [%rd1058+12]; + xor.b32 %r24931, %r24929, %r24930; + xor.b32 %r31239, %r24927, %r24928; + mov.b64 %rd1059, {%r31239, %r24931}; + mul.lo.s32 %r24932, %r31235, 16777619; + ld.global.u32 %r24933, [%rd1058+16]; + mul.lo.s32 %r24934, %r31236, 16777619; + ld.global.u32 %r24935, [%rd1058+20]; + xor.b32 %r24936, %r24934, %r24935; + xor.b32 %r31235, %r24932, %r24933; + mov.b64 %rd1060, {%r31235, %r24936}; + mul.lo.s32 %r24937, %r31231, 16777619; + ld.global.u32 %r24938, [%rd1058+24]; + mul.lo.s32 %r24939, %r31232, 16777619; + ld.global.u32 %r24940, [%rd1058+28]; + xor.b32 %r24941, %r24939, %r24940; + xor.b32 %r31231, %r24937, %r24938; + mov.b64 %rd1061, {%r31231, %r24941}; + mul.lo.s32 %r24942, %r31229, 16777619; + ld.global.u32 %r24943, [%rd1058+32]; + mul.lo.s32 %r24944, %r31230, 16777619; + ld.global.u32 %r24945, [%rd1058+36]; + xor.b32 %r24946, %r24944, %r24945; + xor.b32 %r31229, %r24942, %r24943; + mov.b64 %rd1062, {%r31229, %r24946}; + mul.lo.s32 %r24947, %r31225, 16777619; + ld.global.u32 %r24948, [%rd1058+40]; + xor.b32 %r31225, %r24947, %r24948; + mul.lo.s32 %r24949, %r31226, 16777619; + ld.global.u32 %r24950, [%rd1058+44]; + xor.b32 %r31226, %r24949, %r24950; + mul.lo.s32 %r24951, %r31237, 16777619; + ld.global.u32 %r24952, [%rd1058+48]; + mul.lo.s32 %r24953, %r31238, 16777619; + ld.global.u32 %r24954, [%rd1058+52]; + xor.b32 %r24955, %r24953, %r24954; + xor.b32 %r31237, %r24951, %r24952; + mov.b64 %rd1063, {%r31237, %r24955}; + mul.lo.s32 %r24956, %r31233, 16777619; + ld.global.u32 %r24957, [%rd1058+56]; + mul.lo.s32 %r24958, %r31234, 16777619; + ld.global.u32 %r24959, [%rd1058+60]; + xor.b32 %r24960, %r24958, %r24959; + xor.b32 %r31233, %r24956, %r24957; + mov.b64 %rd1064, {%r31233, %r24960}; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + st.local.v2.u32 [%rd270+32], {%r31239, %r24931}; + st.local.v2.u32 [%rd270+40], {%r31235, %r24936}; + st.local.v2.u32 [%rd270+48], {%r31231, %r24941}; + st.local.v2.u32 [%rd270+56], {%r31229, %r24946}; + st.local.v2.u32 [%rd270+64], {%r31225, %r31226}; + st.local.v2.u32 [%rd270+72], {%r31237, %r24955}; + st.local.v2.u32 [%rd270+80], {%r31233, %r24960}; + add.s32 %r31139, %r31139, 1; + setp.lt.u32 %p45, %r31139, 512; + shr.u64 %rd1065, %rd1048, 32; + cvt.u32.u64 %r31189, %rd1065; + shr.u64 %rd1066, %rd1049, 32; + cvt.u32.u64 %r31185, %rd1066; + shr.u64 %rd1067, %rd1050, 32; + cvt.u32.u64 %r31181, %rd1067; + shr.u64 %rd1068, %rd1051, 32; + cvt.u32.u64 %r31179, 
%rd1068; + shr.u64 %rd1069, %rd1052, 32; + cvt.u32.u64 %r31187, %rd1069; + shr.u64 %rd1070, %rd1053, 32; + cvt.u32.u64 %r31183, %rd1070; + shr.u64 %rd1071, %rd1059, 32; + cvt.u32.u64 %r31240, %rd1071; + shr.u64 %rd1072, %rd1060, 32; + cvt.u32.u64 %r31236, %rd1072; + shr.u64 %rd1073, %rd1061, 32; + cvt.u32.u64 %r31232, %rd1073; + shr.u64 %rd1074, %rd1062, 32; + cvt.u32.u64 %r31230, %rd1074; + shr.u64 %rd1075, %rd1063, 32; + cvt.u32.u64 %r31238, %rd1075; + shr.u64 %rd1076, %rd1064, 32; + cvt.u32.u64 %r31234, %rd1076; + @%p45 bra $L__BB2_80; + + mov.u32 %r31140, 0; + st.local.v2.u32 [%rd2+96], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+104], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+112], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+120], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+128], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+136], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+144], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+152], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+160], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+168], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+176], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+184], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+192], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+200], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+208], {%r31140, %r31140}; + st.local.v2.u32 [%rd2+216], {%r31140, %r31140}; + mov.u32 %r31155, -2147483648; + mov.u32 %r31154, 1; + st.local.v2.u32 [%rd2+88], {%r31154, %r31155}; + mov.u32 %r31141, %r31140; + mov.u32 %r31142, %r31140; + mov.u32 %r31143, %r31140; + mov.u32 %r31144, %r31140; + mov.u32 %r31145, %r31140; + mov.u32 %r31146, %r31140; + mov.u32 %r31147, %r31140; + mov.u32 %r31148, %r31140; + mov.u32 %r31149, %r31140; + mov.u32 %r31150, %r31140; + mov.u32 %r31151, %r31140; + mov.u32 %r31152, %r31140; + mov.u32 %r31153, %r31140; + mov.u32 %r31156, %r31140; + mov.u32 %r31157, %r31140; + mov.u32 %r31158, %r31140; + mov.u32 %r31159, %r31140; + mov.u32 %r31160, %r31140; + mov.u32 %r31161, %r31140; + mov.u32 %r31162, %r31140; + mov.u32 %r31163, %r31140; + mov.u32 %r31164, %r31140; + mov.u32 %r31165, %r31140; + mov.u32 %r31166, %r31140; + mov.u32 %r31167, %r31140; + mov.u32 %r31168, %r31140; + mov.u32 %r31169, %r31140; + mov.u32 %r31170, %r31140; + mov.u32 %r31171, %r31140; + mov.u32 %r31172, %r31140; + mov.u32 %r31173, %r31140; + mov.u32 %r31190, %r31140; + +$L__BB2_82: + mov.u32 %r29807, 1; + mov.u64 %rd1286, keccak_round_constants; + cvta.const.u64 %rd1285, %rd1286; + // begin inline asm + // xor5 + lop3.b32 %r25002, %r31176, %r31174, %r31172, 0x96; + lop3.b32 %r25002, %r25002, %r31170, %r31168, 0x96; + lop3.b32 %r25003, %r31177, %r31175, %r31173, 0x96; + lop3.b32 %r25003, %r25003, %r31171, %r31169, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25014, %r31188, %r31186, %r31166, 0x96; + lop3.b32 %r25014, %r25014, %r31164, %r31162, 0x96; + lop3.b32 %r25015, %r31189, %r31187, %r31167, 0x96; + lop3.b32 %r25015, %r25015, %r31165, %r31163, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25026, %r31184, %r31182, %r31160, 0x96; + lop3.b32 %r25026, %r25026, %r31158, %r31156, 0x96; + lop3.b32 %r25027, %r31185, %r31183, %r31161, 0x96; + lop3.b32 %r25027, %r25027, %r31159, %r31157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25038, %r31180, %r31154, %r31152, 0x96; + lop3.b32 %r25038, %r25038, %r31150, %r31148, 0x96; + lop3.b32 %r25039, %r31181, %r31155, %r31153, 0x96; + lop3.b32 %r25039, %r25039, %r31151, %r31149, 0x96; + // end inline asm + // begin 
inline asm + // xor5 + lop3.b32 %r25050, %r31178, %r31146, %r31144, 0x96; + lop3.b32 %r25050, %r25050, %r31142, %r31140, 0x96; + lop3.b32 %r25051, %r31179, %r31147, %r31145, 0x96; + lop3.b32 %r25051, %r25051, %r31143, %r31141, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25062, %r25015, %r25014, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25066, %r25014, %r25015, %r29807; + // end inline asm + xor.b32 %r25496, %r25062, %r25050; + xor.b32 %r25497, %r25066, %r25051; + xor.b32 %r25329, %r31176, %r25496; + xor.b32 %r25332, %r31177, %r25497; + xor.b32 %r25236, %r31174, %r25496; + xor.b32 %r25235, %r31175, %r25497; + xor.b32 %r25283, %r31172, %r25496; + xor.b32 %r25284, %r31173, %r25497; + xor.b32 %r25188, %r31170, %r25496; + xor.b32 %r25187, %r31171, %r25497; + xor.b32 %r25139, %r31168, %r25496; + xor.b32 %r25140, %r31169, %r25497; + // begin inline asm + shf.l.wrap.b32 %r25070, %r25027, %r25026, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25074, %r25026, %r25027, %r29807; + // end inline asm + xor.b32 %r25498, %r25070, %r25002; + xor.b32 %r25499, %r25074, %r25003; + xor.b32 %r25291, %r31188, %r25498; + xor.b32 %r25292, %r31189, %r25499; + xor.b32 %r25108, %r31186, %r25498; + xor.b32 %r25107, %r31187, %r25499; + xor.b32 %r25267, %r31166, %r25498; + xor.b32 %r25268, %r31167, %r25499; + xor.b32 %r25228, %r31164, %r25498; + xor.b32 %r25227, %r31165, %r25499; + xor.b32 %r25211, %r31162, %r25498; + xor.b32 %r25212, %r31163, %r25499; + // begin inline asm + shf.l.wrap.b32 %r25078, %r25039, %r25038, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25082, %r25038, %r25039, %r29807; + // end inline asm + xor.b32 %r25500, %r25078, %r25014; + xor.b32 %r25501, %r25082, %r25015; + xor.b32 %r25148, %r31184, %r25500; + xor.b32 %r25147, %r31185, %r25501; + xor.b32 %r25275, %r31182, %r25500; + xor.b32 %r25276, %r31183, %r25501; + xor.b32 %r25156, %r31160, %r25500; + xor.b32 %r25155, %r31161, %r25501; + xor.b32 %r25259, %r31158, %r25500; + xor.b32 %r25260, %r31159, %r25501; + xor.b32 %r25124, %r31156, %r25500; + xor.b32 %r25123, %r31157, %r25501; + // begin inline asm + shf.l.wrap.b32 %r25086, %r25051, %r25050, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25090, %r25050, %r25051, %r29807; + // end inline asm + xor.b32 %r25502, %r25086, %r25026; + xor.b32 %r25503, %r25090, %r25027; + xor.b32 %r25243, %r31180, %r25502; + xor.b32 %r25244, %r31181, %r25503; + xor.b32 %r25220, %r31154, %r25502; + xor.b32 %r25219, %r31155, %r25503; + xor.b32 %r25163, %r31152, %r25502; + xor.b32 %r25164, %r31153, %r25503; + xor.b32 %r25251, %r31150, %r25502; + xor.b32 %r25252, %r31151, %r25503; + xor.b32 %r25180, %r31148, %r25502; + xor.b32 %r25179, %r31149, %r25503; + // begin inline asm + shf.l.wrap.b32 %r25094, %r25003, %r25002, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25098, %r25002, %r25003, %r29807; + // end inline asm + xor.b32 %r25504, %r25094, %r25038; + xor.b32 %r25505, %r25098, %r25039; + xor.b32 %r25195, %r31178, %r25504; + xor.b32 %r25196, %r31179, %r25505; + xor.b32 %r25115, %r31146, %r25504; + xor.b32 %r25116, %r31147, %r25505; + xor.b32 %r25132, %r31144, %r25504; + xor.b32 %r25131, %r31145, %r25505; + xor.b32 %r25171, %r31142, %r25504; + xor.b32 %r25172, %r31143, %r25505; + xor.b32 %r25203, %r31140, %r25504; + xor.b32 %r25204, %r31141, %r25505; + mov.u32 %r25109, 44; + // begin inline asm + shf.l.wrap.b32 %r25102, %r25108, %r25107, %r25109; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r25106, %r25107, %r25108, %r25109; + // end inline asm + mov.u32 %r25117, 20; + // begin inline asm + shf.l.wrap.b32 %r25110, %r25116, %r25115, %r25117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25114, %r25115, %r25116, %r25117; + // end inline asm + mov.u32 %r25125, 61; + // begin inline asm + shf.l.wrap.b32 %r25118, %r25124, %r25123, %r25125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25122, %r25123, %r25124, %r25125; + // end inline asm + mov.u32 %r25133, 39; + // begin inline asm + shf.l.wrap.b32 %r25126, %r25132, %r25131, %r25133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25130, %r25131, %r25132, %r25133; + // end inline asm + mov.u32 %r25141, 18; + // begin inline asm + shf.l.wrap.b32 %r25134, %r25140, %r25139, %r25141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25138, %r25139, %r25140, %r25141; + // end inline asm + mov.u32 %r25149, 62; + // begin inline asm + shf.l.wrap.b32 %r25142, %r25148, %r25147, %r25149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25146, %r25147, %r25148, %r25149; + // end inline asm + mov.u32 %r25157, 43; + // begin inline asm + shf.l.wrap.b32 %r25150, %r25156, %r25155, %r25157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25154, %r25155, %r25156, %r25157; + // end inline asm + mov.u32 %r25165, 25; + // begin inline asm + shf.l.wrap.b32 %r25158, %r25164, %r25163, %r25165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25162, %r25163, %r25164, %r25165; + // end inline asm + mov.u32 %r25173, 8; + // begin inline asm + shf.l.wrap.b32 %r25166, %r25172, %r25171, %r25173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25170, %r25171, %r25172, %r25173; + // end inline asm + mov.u32 %r25181, 56; + // begin inline asm + shf.l.wrap.b32 %r25174, %r25180, %r25179, %r25181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25178, %r25179, %r25180, %r25181; + // end inline asm + mov.u32 %r25189, 41; + // begin inline asm + shf.l.wrap.b32 %r25182, %r25188, %r25187, %r25189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25186, %r25187, %r25188, %r25189; + // end inline asm + mov.u32 %r25197, 27; + // begin inline asm + shf.l.wrap.b32 %r25190, %r25196, %r25195, %r25197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25194, %r25195, %r25196, %r25197; + // end inline asm + mov.u32 %r25205, 14; + // begin inline asm + shf.l.wrap.b32 %r25198, %r25204, %r25203, %r25205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25202, %r25203, %r25204, %r25205; + // end inline asm + mov.u32 %r25213, 2; + // begin inline asm + shf.l.wrap.b32 %r25206, %r25212, %r25211, %r25213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25210, %r25211, %r25212, %r25213; + // end inline asm + mov.u32 %r25221, 55; + // begin inline asm + shf.l.wrap.b32 %r25214, %r25220, %r25219, %r25221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25218, %r25219, %r25220, %r25221; + // end inline asm + mov.u32 %r25229, 45; + // begin inline asm + shf.l.wrap.b32 %r25222, %r25228, %r25227, %r25229; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25226, %r25227, %r25228, %r25229; + // end inline asm + mov.u32 %r25237, 36; + // begin inline asm + shf.l.wrap.b32 %r25230, %r25236, %r25235, %r25237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25234, %r25235, %r25236, %r25237; + // end inline asm + mov.u32 %r25245, 28; + // begin inline asm + shf.l.wrap.b32 
%r25238, %r25244, %r25243, %r25245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25242, %r25243, %r25244, %r25245; + // end inline asm + mov.u32 %r25253, 21; + // begin inline asm + shf.l.wrap.b32 %r25246, %r25252, %r25251, %r25253; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25250, %r25251, %r25252, %r25253; + // end inline asm + mov.u32 %r25261, 15; + // begin inline asm + shf.l.wrap.b32 %r25254, %r25260, %r25259, %r25261; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25258, %r25259, %r25260, %r25261; + // end inline asm + mov.u32 %r25269, 10; + // begin inline asm + shf.l.wrap.b32 %r25262, %r25268, %r25267, %r25269; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25266, %r25267, %r25268, %r25269; + // end inline asm + mov.u32 %r25277, 6; + // begin inline asm + shf.l.wrap.b32 %r25270, %r25276, %r25275, %r25277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25274, %r25275, %r25276, %r25277; + // end inline asm + mov.u32 %r25285, 3; + // begin inline asm + shf.l.wrap.b32 %r25278, %r25284, %r25283, %r25285; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25282, %r25283, %r25284, %r25285; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25286, %r25292, %r25291, %r29807; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25290, %r25291, %r25292, %r29807; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25294, %r25329, %r25102, %r25150, 0xD2; + lop3.b32 %r25295, %r25332, %r25106, %r25154, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31188, %r25102, %r25150, %r25246, 0xD2; + lop3.b32 %r31189, %r25106, %r25154, %r25250, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31184, %r25150, %r25246, %r25198, 0xD2; + lop3.b32 %r31185, %r25154, %r25250, %r25202, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31180, %r25246, %r25198, %r25329, 0xD2; + lop3.b32 %r31181, %r25250, %r25202, %r25332, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31178, %r25198, %r25329, %r25102, 0xD2; + lop3.b32 %r31179, %r25202, %r25332, %r25106, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31174, %r25238, %r25110, %r25278, 0xD2; + lop3.b32 %r31175, %r25242, %r25114, %r25282, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31186, %r25110, %r25278, %r25222, 0xD2; + lop3.b32 %r31187, %r25114, %r25282, %r25226, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31182, %r25278, %r25222, %r25118, 0xD2; + lop3.b32 %r31183, %r25282, %r25226, %r25122, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31154, %r25222, %r25118, %r25238, 0xD2; + lop3.b32 %r31155, %r25226, %r25122, %r25242, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r31154, %r31155}; + // begin inline asm + // chi + lop3.b32 %r31146, %r25118, %r25238, %r25110, 0xD2; + lop3.b32 %r31147, %r25122, %r25242, %r25114, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r31146, %r31147}; + // begin inline asm + // chi + lop3.b32 %r31172, %r25286, %r25270, %r25158, 0xD2; + lop3.b32 %r31173, %r25290, %r25274, %r25162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r31172, %r31173}; + // begin inline asm + // chi + lop3.b32 %r31166, %r25270, %r25158, %r25166, 0xD2; + lop3.b32 %r31167, %r25274, %r25162, %r25170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r31166, %r31167}; + // begin inline asm + // chi + lop3.b32 %r31160, %r25158, %r25166, 
%r25134, 0xD2; + lop3.b32 %r31161, %r25162, %r25170, %r25138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r31160, %r31161}; + // begin inline asm + // chi + lop3.b32 %r31152, %r25166, %r25134, %r25286, 0xD2; + lop3.b32 %r31153, %r25170, %r25138, %r25290, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r31152, %r31153}; + // begin inline asm + // chi + lop3.b32 %r31144, %r25134, %r25286, %r25270, 0xD2; + lop3.b32 %r31145, %r25138, %r25290, %r25274, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r31144, %r31145}; + // begin inline asm + // chi + lop3.b32 %r31170, %r25190, %r25230, %r25262, 0xD2; + lop3.b32 %r31171, %r25194, %r25234, %r25266, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r31170, %r31171}; + // begin inline asm + // chi + lop3.b32 %r31164, %r25230, %r25262, %r25254, 0xD2; + lop3.b32 %r31165, %r25234, %r25266, %r25258, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r31164, %r31165}; + // begin inline asm + // chi + lop3.b32 %r31158, %r25262, %r25254, %r25174, 0xD2; + lop3.b32 %r31159, %r25266, %r25258, %r25178, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r31158, %r31159}; + // begin inline asm + // chi + lop3.b32 %r31150, %r25254, %r25174, %r25190, 0xD2; + lop3.b32 %r31151, %r25258, %r25178, %r25194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r31150, %r31151}; + // begin inline asm + // chi + lop3.b32 %r31142, %r25174, %r25190, %r25230, 0xD2; + lop3.b32 %r31143, %r25178, %r25194, %r25234, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r31142, %r31143}; + // begin inline asm + // chi + lop3.b32 %r31168, %r25142, %r25214, %r25126, 0xD2; + lop3.b32 %r31169, %r25146, %r25218, %r25130, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r31168, %r31169}; + // begin inline asm + // chi + lop3.b32 %r31162, %r25214, %r25126, %r25182, 0xD2; + lop3.b32 %r31163, %r25218, %r25130, %r25186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r31162, %r31163}; + // begin inline asm + // chi + lop3.b32 %r31156, %r25126, %r25182, %r25206, 0xD2; + lop3.b32 %r31157, %r25130, %r25186, %r25210, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r31156, %r31157}; + // begin inline asm + // chi + lop3.b32 %r31148, %r25182, %r25206, %r25142, 0xD2; + lop3.b32 %r31149, %r25186, %r25210, %r25146, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r31148, %r31149}; + // begin inline asm + // chi + lop3.b32 %r31140, %r25206, %r25142, %r25214, 0xD2; + lop3.b32 %r31141, %r25210, %r25146, %r25218, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r31140, %r31141}; + mul.wide.s32 %rd1080, %r31190, 8; + add.s64 %rd1079, %rd1285, %rd1080; + // begin inline asm + ld.global.nc.v2.u32 {%r25494,%r25495}, [%rd1079]; + // end inline asm + xor.b32 %r31176, %r25294, %r25494; + xor.b32 %r31177, %r25295, %r25495; + add.s32 %r31190, %r31190, 1; + setp.lt.u32 %p46, %r31190, 23; + @%p46 bra $L__BB2_82; + + mov.u32 %r29806, 3; + mov.u32 %r29805, 21; + mov.u32 %r29804, 28; + mov.u32 %r29803, 45; + mov.u32 %r29802, 14; + mov.u32 %r29801, 43; + mov.u32 %r29800, 61; + mov.u32 %r29799, 20; + mov.u32 %r29798, 44; + mov.u64 %rd1289, keccak_round_constants; + cvta.const.u64 %rd1288, %rd1289; + add.s64 %rd1287, %rd1288, 184; + st.local.v2.u32 [%rd2+32], {%r31188, %r31189}; + st.local.v2.u32 [%rd2+72], {%r31186, %r31187}; + st.local.v2.u32 [%rd2+40], {%r31184, %r31185}; + st.local.v2.u32 [%rd2+80], {%r31182, %r31183}; + st.local.v2.u32 [%rd2+48], {%r31180, %r31181}; + st.local.v2.u32 [%rd2+56], {%r31178, 
%r31179}; + st.local.v2.u32 [%rd2+24], {%r31176, %r31177}; + // begin inline asm + // xor5 + lop3.b32 %r25506, %r31176, %r31174, %r31172, 0x96; + lop3.b32 %r25506, %r25506, %r31170, %r31168, 0x96; + lop3.b32 %r25507, %r31177, %r31175, %r31173, 0x96; + lop3.b32 %r25507, %r25507, %r31171, %r31169, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25518, %r31188, %r31186, %r31166, 0x96; + lop3.b32 %r25518, %r25518, %r31164, %r31162, 0x96; + lop3.b32 %r25519, %r31189, %r31187, %r31167, 0x96; + lop3.b32 %r25519, %r25519, %r31165, %r31163, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25530, %r31184, %r31182, %r31160, 0x96; + lop3.b32 %r25530, %r25530, %r31158, %r31156, 0x96; + lop3.b32 %r25531, %r31185, %r31183, %r31161, 0x96; + lop3.b32 %r25531, %r25531, %r31159, %r31157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25542, %r31180, %r31154, %r31152, 0x96; + lop3.b32 %r25542, %r25542, %r31150, %r31148, 0x96; + lop3.b32 %r25543, %r31181, %r31155, %r31153, 0x96; + lop3.b32 %r25543, %r25543, %r31151, %r31149, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25554, %r31178, %r31146, %r31144, 0x96; + lop3.b32 %r25554, %r25554, %r31142, %r31140, 0x96; + lop3.b32 %r25555, %r31179, %r31147, %r31145, 0x96; + lop3.b32 %r25555, %r25555, %r31143, %r31141, 0x96; + // end inline asm + mov.u32 %r31205, 1; + // begin inline asm + shf.l.wrap.b32 %r25566, %r25519, %r25518, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25570, %r25518, %r25519, %r31205; + // end inline asm + xor.b32 %r25785, %r25566, %r25554; + xor.b32 %r25786, %r25570, %r25555; + xor.b32 %r25713, %r31176, %r25785; + xor.b32 %r25716, %r31177, %r25786; + xor.b32 %r25676, %r31173, %r25786; + xor.b32 %r25675, %r31172, %r25785; + st.local.v2.u32 [%rd2+104], {%r25675, %r25676}; + // begin inline asm + shf.l.wrap.b32 %r25574, %r25531, %r25530, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25578, %r25530, %r25531, %r31205; + // end inline asm + xor.b32 %r25787, %r25574, %r25506; + xor.b32 %r25788, %r25578, %r25507; + xor.b32 %r25612, %r31186, %r25787; + xor.b32 %r25611, %r31187, %r25788; + xor.b32 %r25651, %r31165, %r25788; + xor.b32 %r25652, %r31164, %r25787; + st.local.v2.u32 [%rd2+152], {%r25652, %r25651}; + // begin inline asm + shf.l.wrap.b32 %r25582, %r25543, %r25542, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25586, %r25542, %r25543, %r31205; + // end inline asm + xor.b32 %r25789, %r25582, %r25518; + xor.b32 %r25790, %r25586, %r25519; + xor.b32 %r25635, %r31161, %r25790; + xor.b32 %r25636, %r31160, %r25789; + st.local.v2.u32 [%rd2+120], {%r25636, %r25635}; + xor.b32 %r25627, %r31157, %r25790; + xor.b32 %r25628, %r31156, %r25789; + st.local.v2.u32 [%rd2+200], {%r25628, %r25627}; + // begin inline asm + shf.l.wrap.b32 %r25590, %r25555, %r25554, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25594, %r25554, %r25555, %r31205; + // end inline asm + xor.b32 %r25791, %r25590, %r25530; + xor.b32 %r25792, %r25594, %r25531; + xor.b32 %r25659, %r31180, %r25791; + xor.b32 %r25660, %r31181, %r25792; + xor.b32 %r25668, %r31151, %r25792; + xor.b32 %r25667, %r31150, %r25791; + st.local.v2.u32 [%rd2+168], {%r25667, %r25668}; + // begin inline asm + shf.l.wrap.b32 %r25598, %r25507, %r25506, %r31205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25602, %r25506, %r25507, %r31205; + // end inline asm + xor.b32 %r25793, %r25598, %r25542; + xor.b32 %r25794, %r25602, 
%r25543; + xor.b32 %r25619, %r31146, %r25793; + xor.b32 %r25620, %r31147, %r25794; + xor.b32 %r25644, %r31141, %r25794; + xor.b32 %r25643, %r31140, %r25793; + st.local.v2.u32 [%rd2+216], {%r25643, %r25644}; + // begin inline asm + shf.l.wrap.b32 %r25606, %r25612, %r25611, %r29798; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25610, %r25611, %r25612, %r29798; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25614, %r25620, %r25619, %r29799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25618, %r25619, %r25620, %r29799; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25626, %r25627, %r25628, %r29800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25622, %r25628, %r25627, %r29800; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r25622, %r25626}; + // begin inline asm + shf.l.wrap.b32 %r25630, %r25636, %r25635, %r29801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25634, %r25635, %r25636, %r29801; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25638, %r25644, %r25643, %r29802; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25642, %r25643, %r25644, %r29802; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25650, %r25651, %r25652, %r29803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25646, %r25652, %r25651, %r29803; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r25646, %r25650}; + // begin inline asm + shf.l.wrap.b32 %r25654, %r25660, %r25659, %r29804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25658, %r25659, %r25660, %r29804; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25662, %r25668, %r25667, %r29805; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25666, %r25667, %r25668, %r29805; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25670, %r25676, %r25675, %r29806; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25674, %r25675, %r25676, %r29806; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25678, %r25713, %r25606, %r25630, 0xD2; + lop3.b32 %r25679, %r25716, %r25610, %r25634, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25686, %r25606, %r25630, %r25662, 0xD2; + lop3.b32 %r25687, %r25610, %r25634, %r25666, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r25686, %r25687}; + // begin inline asm + // chi + lop3.b32 %r25694, %r25630, %r25662, %r25638, 0xD2; + lop3.b32 %r25695, %r25634, %r25666, %r25642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r25694, %r25695}; + // begin inline asm + // chi + lop3.b32 %r25702, %r25662, %r25638, %r25713, 0xD2; + lop3.b32 %r25703, %r25666, %r25642, %r25716, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r25702, %r25703}; + // begin inline asm + // chi + lop3.b32 %r25710, %r25638, %r25713, %r25606, 0xD2; + lop3.b32 %r25711, %r25642, %r25716, %r25610, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r25710, %r25711}; + // begin inline asm + // chi + lop3.b32 %r25718, %r25654, %r25614, %r25670, 0xD2; + lop3.b32 %r25719, %r25658, %r25618, %r25674, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r25718, %r25719}; + // begin inline asm + // chi + lop3.b32 %r25726, %r25614, %r25670, %r25646, 0xD2; + lop3.b32 %r25727, %r25618, %r25674, %r25650, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r25726, %r25727}; + // begin inline asm + // chi + lop3.b32 %r25734, %r25670, %r25646, %r25622, 0xD2; + lop3.b32 %r25735, %r25674, %r25650, %r25626, 0xD2; + // end 
inline asm + st.local.v2.u32 [%rd2+80], {%r25734, %r25735}; + // begin inline asm + ld.global.nc.v2.u32 {%r25742,%r25743}, [%rd1287]; + // end inline asm + xor.b32 %r25795, %r25679, %r25743; + xor.b32 %r25796, %r25678, %r25742; + mov.b64 %rd1349, {%r25796, %r25795}; + mov.b64 %rd1350, {%r25686, %r25687}; + mov.b64 %rd1351, {%r25694, %r25695}; + mov.b64 %rd1352, {%r25702, %r25703}; + mov.b64 %rd1353, {%r25710, %r25711}; + mov.b64 %rd1354, {%r25718, %r25719}; + mov.b64 %rd1355, {%r25726, %r25727}; + mov.b64 %rd1356, {%r25734, %r25735}; + mov.u32 %r31191, 0; + st.local.v2.u32 [%rd2+24], {%r25796, %r25795}; + st.local.v2.u32 [%rd270+96], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+104], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+112], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+120], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+128], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+136], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+144], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+152], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+160], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+168], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+176], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+184], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+192], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+200], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+208], {%r31191, %r31191}; + st.local.v2.u32 [%rd270+216], {%r31191, %r31191}; + mov.u32 %r31206, -2147483648; + st.local.v2.u32 [%rd270+88], {%r31205, %r31206}; + mov.u32 %r31192, %r31191; + mov.u32 %r31193, %r31191; + mov.u32 %r31194, %r31191; + mov.u32 %r31195, %r31191; + mov.u32 %r31196, %r31191; + mov.u32 %r31197, %r31191; + mov.u32 %r31198, %r31191; + mov.u32 %r31199, %r31191; + mov.u32 %r31200, %r31191; + mov.u32 %r31201, %r31191; + mov.u32 %r31202, %r31191; + mov.u32 %r31203, %r31191; + mov.u32 %r31204, %r31191; + mov.u32 %r31207, %r31191; + mov.u32 %r31208, %r31191; + mov.u32 %r31209, %r31191; + mov.u32 %r31210, %r31191; + mov.u32 %r31211, %r31191; + mov.u32 %r31212, %r31191; + mov.u32 %r31213, %r31191; + mov.u32 %r31214, %r31191; + mov.u32 %r31215, %r31191; + mov.u32 %r31216, %r31191; + mov.u32 %r31217, %r31191; + mov.u32 %r31218, %r31191; + mov.u32 %r31219, %r31191; + mov.u32 %r31220, %r31191; + mov.u32 %r31221, %r31191; + mov.u32 %r31222, %r31191; + mov.u32 %r31223, %r31191; + mov.u32 %r31224, %r31191; + mov.u32 %r31241, %r31191; + +$L__BB2_84: + mov.u32 %r29817, 1; + mov.u64 %rd1291, keccak_round_constants; + cvta.const.u64 %rd1290, %rd1291; + // begin inline asm + // xor5 + lop3.b32 %r25797, %r31227, %r31225, %r31223, 0x96; + lop3.b32 %r25797, %r25797, %r31221, %r31219, 0x96; + lop3.b32 %r25798, %r31228, %r31226, %r31224, 0x96; + lop3.b32 %r25798, %r25798, %r31222, %r31220, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25809, %r31239, %r31237, %r31217, 0x96; + lop3.b32 %r25809, %r25809, %r31215, %r31213, 0x96; + lop3.b32 %r25810, %r31240, %r31238, %r31218, 0x96; + lop3.b32 %r25810, %r25810, %r31216, %r31214, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25821, %r31235, %r31233, %r31211, 0x96; + lop3.b32 %r25821, %r25821, %r31209, %r31207, 0x96; + lop3.b32 %r25822, %r31236, %r31234, %r31212, 0x96; + lop3.b32 %r25822, %r25822, %r31210, %r31208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25833, %r31231, %r31205, %r31203, 0x96; + lop3.b32 %r25833, %r25833, %r31201, %r31199, 0x96; + lop3.b32 %r25834, %r31232, %r31206, %r31204, 0x96; + lop3.b32 %r25834, %r25834, 
%r31202, %r31200, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25845, %r31229, %r31197, %r31195, 0x96; + lop3.b32 %r25845, %r25845, %r31193, %r31191, 0x96; + lop3.b32 %r25846, %r31230, %r31198, %r31196, 0x96; + lop3.b32 %r25846, %r25846, %r31194, %r31192, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25857, %r25810, %r25809, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25861, %r25809, %r25810, %r29817; + // end inline asm + xor.b32 %r26291, %r25857, %r25845; + xor.b32 %r26292, %r25861, %r25846; + xor.b32 %r26124, %r31227, %r26291; + xor.b32 %r26127, %r31228, %r26292; + xor.b32 %r26031, %r31225, %r26291; + xor.b32 %r26030, %r31226, %r26292; + xor.b32 %r26078, %r31223, %r26291; + xor.b32 %r26079, %r31224, %r26292; + xor.b32 %r25983, %r31221, %r26291; + xor.b32 %r25982, %r31222, %r26292; + xor.b32 %r25934, %r31219, %r26291; + xor.b32 %r25935, %r31220, %r26292; + // begin inline asm + shf.l.wrap.b32 %r25865, %r25822, %r25821, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25869, %r25821, %r25822, %r29817; + // end inline asm + xor.b32 %r26293, %r25865, %r25797; + xor.b32 %r26294, %r25869, %r25798; + xor.b32 %r26086, %r31239, %r26293; + xor.b32 %r26087, %r31240, %r26294; + xor.b32 %r25903, %r31237, %r26293; + xor.b32 %r25902, %r31238, %r26294; + xor.b32 %r26062, %r31217, %r26293; + xor.b32 %r26063, %r31218, %r26294; + xor.b32 %r26023, %r31215, %r26293; + xor.b32 %r26022, %r31216, %r26294; + xor.b32 %r26006, %r31213, %r26293; + xor.b32 %r26007, %r31214, %r26294; + // begin inline asm + shf.l.wrap.b32 %r25873, %r25834, %r25833, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25877, %r25833, %r25834, %r29817; + // end inline asm + xor.b32 %r26295, %r25873, %r25809; + xor.b32 %r26296, %r25877, %r25810; + xor.b32 %r25943, %r31235, %r26295; + xor.b32 %r25942, %r31236, %r26296; + xor.b32 %r26070, %r31233, %r26295; + xor.b32 %r26071, %r31234, %r26296; + xor.b32 %r25951, %r31211, %r26295; + xor.b32 %r25950, %r31212, %r26296; + xor.b32 %r26054, %r31209, %r26295; + xor.b32 %r26055, %r31210, %r26296; + xor.b32 %r25919, %r31207, %r26295; + xor.b32 %r25918, %r31208, %r26296; + // begin inline asm + shf.l.wrap.b32 %r25881, %r25846, %r25845, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25885, %r25845, %r25846, %r29817; + // end inline asm + xor.b32 %r26297, %r25881, %r25821; + xor.b32 %r26298, %r25885, %r25822; + xor.b32 %r26038, %r31231, %r26297; + xor.b32 %r26039, %r31232, %r26298; + xor.b32 %r26015, %r31205, %r26297; + xor.b32 %r26014, %r31206, %r26298; + xor.b32 %r25958, %r31203, %r26297; + xor.b32 %r25959, %r31204, %r26298; + xor.b32 %r26046, %r31201, %r26297; + xor.b32 %r26047, %r31202, %r26298; + xor.b32 %r25975, %r31199, %r26297; + xor.b32 %r25974, %r31200, %r26298; + // begin inline asm + shf.l.wrap.b32 %r25889, %r25798, %r25797, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25893, %r25797, %r25798, %r29817; + // end inline asm + xor.b32 %r26299, %r25889, %r25833; + xor.b32 %r26300, %r25893, %r25834; + xor.b32 %r25990, %r31229, %r26299; + xor.b32 %r25991, %r31230, %r26300; + xor.b32 %r25910, %r31197, %r26299; + xor.b32 %r25911, %r31198, %r26300; + xor.b32 %r25927, %r31195, %r26299; + xor.b32 %r25926, %r31196, %r26300; + xor.b32 %r25966, %r31193, %r26299; + xor.b32 %r25967, %r31194, %r26300; + xor.b32 %r25998, %r31191, %r26299; + xor.b32 %r25999, %r31192, %r26300; + mov.u32 %r25904, 44; + // begin inline asm + shf.l.wrap.b32 %r25897, 
%r25903, %r25902, %r25904; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25901, %r25902, %r25903, %r25904; + // end inline asm + mov.u32 %r25912, 20; + // begin inline asm + shf.l.wrap.b32 %r25905, %r25911, %r25910, %r25912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25909, %r25910, %r25911, %r25912; + // end inline asm + mov.u32 %r25920, 61; + // begin inline asm + shf.l.wrap.b32 %r25913, %r25919, %r25918, %r25920; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25917, %r25918, %r25919, %r25920; + // end inline asm + mov.u32 %r25928, 39; + // begin inline asm + shf.l.wrap.b32 %r25921, %r25927, %r25926, %r25928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25925, %r25926, %r25927, %r25928; + // end inline asm + mov.u32 %r25936, 18; + // begin inline asm + shf.l.wrap.b32 %r25929, %r25935, %r25934, %r25936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25933, %r25934, %r25935, %r25936; + // end inline asm + mov.u32 %r25944, 62; + // begin inline asm + shf.l.wrap.b32 %r25937, %r25943, %r25942, %r25944; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25941, %r25942, %r25943, %r25944; + // end inline asm + mov.u32 %r25952, 43; + // begin inline asm + shf.l.wrap.b32 %r25945, %r25951, %r25950, %r25952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25949, %r25950, %r25951, %r25952; + // end inline asm + mov.u32 %r25960, 25; + // begin inline asm + shf.l.wrap.b32 %r25953, %r25959, %r25958, %r25960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25957, %r25958, %r25959, %r25960; + // end inline asm + mov.u32 %r25968, 8; + // begin inline asm + shf.l.wrap.b32 %r25961, %r25967, %r25966, %r25968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25965, %r25966, %r25967, %r25968; + // end inline asm + mov.u32 %r25976, 56; + // begin inline asm + shf.l.wrap.b32 %r25969, %r25975, %r25974, %r25976; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25973, %r25974, %r25975, %r25976; + // end inline asm + mov.u32 %r25984, 41; + // begin inline asm + shf.l.wrap.b32 %r25977, %r25983, %r25982, %r25984; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25981, %r25982, %r25983, %r25984; + // end inline asm + mov.u32 %r25992, 27; + // begin inline asm + shf.l.wrap.b32 %r25985, %r25991, %r25990, %r25992; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25989, %r25990, %r25991, %r25992; + // end inline asm + mov.u32 %r26000, 14; + // begin inline asm + shf.l.wrap.b32 %r25993, %r25999, %r25998, %r26000; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25997, %r25998, %r25999, %r26000; + // end inline asm + mov.u32 %r26008, 2; + // begin inline asm + shf.l.wrap.b32 %r26001, %r26007, %r26006, %r26008; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26005, %r26006, %r26007, %r26008; + // end inline asm + mov.u32 %r26016, 55; + // begin inline asm + shf.l.wrap.b32 %r26009, %r26015, %r26014, %r26016; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26013, %r26014, %r26015, %r26016; + // end inline asm + mov.u32 %r26024, 45; + // begin inline asm + shf.l.wrap.b32 %r26017, %r26023, %r26022, %r26024; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26021, %r26022, %r26023, %r26024; + // end inline asm + mov.u32 %r26032, 36; + // begin inline asm + shf.l.wrap.b32 %r26025, %r26031, %r26030, %r26032; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26029, %r26030, %r26031, %r26032; + // end inline asm + 
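Editor's note: the runs of shf.l.wrap.b32 pairs above are the rho rotations, each 64-bit lane rotated by a fixed per-lane offset (44, 20, 61, 39, ... in the order emitted here) using two 32-bit funnel shifts. shf.l.wrap takes the shift count mod 32, and for offsets of 32 or more the two result halves simply swap roles, which is why both operand orders are always computed. A CUDA sketch under those assumptions:

    #include <cstdint>

    // 64-bit rotate-left built from 32-bit halves, as the shf.l.wrap pairs do.
    __device__ __forceinline__ void rotl64_halves(uint32_t lo, uint32_t hi,
                                                  uint32_t r,
                                                  uint32_t* out_lo,
                                                  uint32_t* out_hi) {
        uint32_t a = __funnelshift_l(lo, hi, r);  // high word of (hi:lo) << (r & 31)
        uint32_t b = __funnelshift_l(hi, lo, r);  // high word of (lo:hi) << (r & 31)
        if (r & 32) { *out_hi = b; *out_lo = a; } // offsets >= 32 swap the halves
        else        { *out_hi = a; *out_lo = b; }
    }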
mov.u32 %r26040, 28; + // begin inline asm + shf.l.wrap.b32 %r26033, %r26039, %r26038, %r26040; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26037, %r26038, %r26039, %r26040; + // end inline asm + mov.u32 %r26048, 21; + // begin inline asm + shf.l.wrap.b32 %r26041, %r26047, %r26046, %r26048; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26045, %r26046, %r26047, %r26048; + // end inline asm + mov.u32 %r26056, 15; + // begin inline asm + shf.l.wrap.b32 %r26049, %r26055, %r26054, %r26056; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26053, %r26054, %r26055, %r26056; + // end inline asm + mov.u32 %r26064, 10; + // begin inline asm + shf.l.wrap.b32 %r26057, %r26063, %r26062, %r26064; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26061, %r26062, %r26063, %r26064; + // end inline asm + mov.u32 %r26072, 6; + // begin inline asm + shf.l.wrap.b32 %r26065, %r26071, %r26070, %r26072; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26069, %r26070, %r26071, %r26072; + // end inline asm + mov.u32 %r26080, 3; + // begin inline asm + shf.l.wrap.b32 %r26073, %r26079, %r26078, %r26080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26077, %r26078, %r26079, %r26080; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26081, %r26087, %r26086, %r29817; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26085, %r26086, %r26087, %r29817; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26089, %r26124, %r25897, %r25945, 0xD2; + lop3.b32 %r26090, %r26127, %r25901, %r25949, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31239, %r25897, %r25945, %r26041, 0xD2; + lop3.b32 %r31240, %r25901, %r25949, %r26045, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31235, %r25945, %r26041, %r25993, 0xD2; + lop3.b32 %r31236, %r25949, %r26045, %r25997, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31231, %r26041, %r25993, %r26124, 0xD2; + lop3.b32 %r31232, %r26045, %r25997, %r26127, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31229, %r25993, %r26124, %r25897, 0xD2; + lop3.b32 %r31230, %r25997, %r26127, %r25901, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31225, %r26033, %r25905, %r26073, 0xD2; + lop3.b32 %r31226, %r26037, %r25909, %r26077, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31237, %r25905, %r26073, %r26017, 0xD2; + lop3.b32 %r31238, %r25909, %r26077, %r26021, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31233, %r26073, %r26017, %r25913, 0xD2; + lop3.b32 %r31234, %r26077, %r26021, %r25917, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31205, %r26017, %r25913, %r26033, 0xD2; + lop3.b32 %r31206, %r26021, %r25917, %r26037, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r31205, %r31206}; + // begin inline asm + // chi + lop3.b32 %r31197, %r25913, %r26033, %r25905, 0xD2; + lop3.b32 %r31198, %r25917, %r26037, %r25909, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r31197, %r31198}; + // begin inline asm + // chi + lop3.b32 %r31223, %r26081, %r26065, %r25953, 0xD2; + lop3.b32 %r31224, %r26085, %r26069, %r25957, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+104], {%r31223, %r31224}; + // begin inline asm + // chi + lop3.b32 %r31217, %r26065, %r25953, %r25961, 0xD2; + lop3.b32 %r31218, %r26069, %r25957, %r25965, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+112], {%r31217, %r31218}; 
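Editor's note: the `// xor5` blocks at the top of each round body compute the theta column parities, five lanes of a column XORed together as two cascaded lop3.b32 per 32-bit half. The immediate 0x96 is the three-input XOR lookup table (0xF0 ^ 0xCC ^ 0xAA = 0x96), so two lop3 cover all five inputs. Equivalent CUDA, as a sketch:

    #include <cstdint>

    // Theta column parity; two lop3.b32 (immLut 0x96) per 32-bit half.
    __device__ __forceinline__ uint64_t xor5(uint64_t a, uint64_t b, uint64_t c,
                                             uint64_t d, uint64_t e) {
        return a ^ b ^ c ^ d ^ e;
    }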
+ // begin inline asm + // chi + lop3.b32 %r31211, %r25953, %r25961, %r25929, 0xD2; + lop3.b32 %r31212, %r25957, %r25965, %r25933, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+120], {%r31211, %r31212}; + // begin inline asm + // chi + lop3.b32 %r31203, %r25961, %r25929, %r26081, 0xD2; + lop3.b32 %r31204, %r25965, %r25933, %r26085, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+128], {%r31203, %r31204}; + // begin inline asm + // chi + lop3.b32 %r31195, %r25929, %r26081, %r26065, 0xD2; + lop3.b32 %r31196, %r25933, %r26085, %r26069, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+136], {%r31195, %r31196}; + // begin inline asm + // chi + lop3.b32 %r31221, %r25985, %r26025, %r26057, 0xD2; + lop3.b32 %r31222, %r25989, %r26029, %r26061, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+144], {%r31221, %r31222}; + // begin inline asm + // chi + lop3.b32 %r31215, %r26025, %r26057, %r26049, 0xD2; + lop3.b32 %r31216, %r26029, %r26061, %r26053, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+152], {%r31215, %r31216}; + // begin inline asm + // chi + lop3.b32 %r31209, %r26057, %r26049, %r25969, 0xD2; + lop3.b32 %r31210, %r26061, %r26053, %r25973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+160], {%r31209, %r31210}; + // begin inline asm + // chi + lop3.b32 %r31201, %r26049, %r25969, %r25985, 0xD2; + lop3.b32 %r31202, %r26053, %r25973, %r25989, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+168], {%r31201, %r31202}; + // begin inline asm + // chi + lop3.b32 %r31193, %r25969, %r25985, %r26025, 0xD2; + lop3.b32 %r31194, %r25973, %r25989, %r26029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+176], {%r31193, %r31194}; + // begin inline asm + // chi + lop3.b32 %r31219, %r25937, %r26009, %r25921, 0xD2; + lop3.b32 %r31220, %r25941, %r26013, %r25925, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+184], {%r31219, %r31220}; + // begin inline asm + // chi + lop3.b32 %r31213, %r26009, %r25921, %r25977, 0xD2; + lop3.b32 %r31214, %r26013, %r25925, %r25981, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+192], {%r31213, %r31214}; + // begin inline asm + // chi + lop3.b32 %r31207, %r25921, %r25977, %r26001, 0xD2; + lop3.b32 %r31208, %r25925, %r25981, %r26005, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+200], {%r31207, %r31208}; + // begin inline asm + // chi + lop3.b32 %r31199, %r25977, %r26001, %r25937, 0xD2; + lop3.b32 %r31200, %r25981, %r26005, %r25941, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+208], {%r31199, %r31200}; + // begin inline asm + // chi + lop3.b32 %r31191, %r26001, %r25937, %r26009, 0xD2; + lop3.b32 %r31192, %r26005, %r25941, %r26013, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+216], {%r31191, %r31192}; + mul.wide.s32 %rd1091, %r31241, 8; + add.s64 %rd1090, %rd1290, %rd1091; + // begin inline asm + ld.global.nc.v2.u32 {%r26289,%r26290}, [%rd1090]; + // end inline asm + xor.b32 %r31227, %r26089, %r26289; + xor.b32 %r31228, %r26090, %r26290; + add.s32 %r31241, %r31241, 1; + setp.lt.u32 %p47, %r31241, 23; + @%p47 bra $L__BB2_84; + + mov.u32 %r29816, 3; + mov.u32 %r29815, 21; + mov.u32 %r29814, 28; + mov.u32 %r29813, 45; + mov.u32 %r29812, 14; + mov.u32 %r29811, 43; + mov.u32 %r29810, 61; + mov.u32 %r29809, 20; + mov.u32 %r29808, 44; + mov.u64 %rd1294, keccak_round_constants; + cvta.const.u64 %rd1293, %rd1294; + add.s64 %rd1292, %rd1293, 184; + mov.u32 %r26400, 1; + st.local.v2.u32 [%rd270+32], {%r31239, %r31240}; + st.local.v2.u32 [%rd270+72], {%r31237, %r31238}; + st.local.v2.u32 [%rd270+40], {%r31235, %r31236}; + 
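Editor's note: the branch structure here suggests the 24 rounds of Keccak-f[1600] are compiled as 23 iterations of the $L__BB2_84 loop plus one fully unrolled final round: the loop exits once the counter reaches 23 (setp.lt.u32 ... 23), and the straight-line code that follows fetches its iota constant directly from keccak_round_constants at byte offset 184, i.e. entry 23 (23 * 8 bytes, the add.s64 ..., 184 just above). A sketch of that shape; keccak_round is a hypothetical helper, not a symbol from the source:

    #include <cstdint>

    __constant__ uint64_t keccak_round_constants[24];

    // Hypothetical single-round helper (theta/rho/pi/chi plus the iota XOR).
    __device__ void keccak_round(uint64_t s[25], uint64_t rc);

    __device__ void keccak_f1600(uint64_t s[25]) {
        for (int i = 0; i < 23; ++i)                  // the $L__BB2_84 loop
            keccak_round(s, keccak_round_constants[i]);
        keccak_round(s, keccak_round_constants[23]);  // peeled final round
    }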
st.local.v2.u32 [%rd270+80], {%r31233, %r31234}; + st.local.v2.u32 [%rd270+48], {%r31231, %r31232}; + st.local.v2.u32 [%rd270+56], {%r31229, %r31230}; + st.local.v2.u32 [%rd270+24], {%r31227, %r31228}; + // begin inline asm + // xor5 + lop3.b32 %r26301, %r31227, %r31225, %r31223, 0x96; + lop3.b32 %r26301, %r26301, %r31221, %r31219, 0x96; + lop3.b32 %r26302, %r31228, %r31226, %r31224, 0x96; + lop3.b32 %r26302, %r26302, %r31222, %r31220, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26313, %r31239, %r31237, %r31217, 0x96; + lop3.b32 %r26313, %r26313, %r31215, %r31213, 0x96; + lop3.b32 %r26314, %r31240, %r31238, %r31218, 0x96; + lop3.b32 %r26314, %r26314, %r31216, %r31214, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26325, %r31235, %r31233, %r31211, 0x96; + lop3.b32 %r26325, %r26325, %r31209, %r31207, 0x96; + lop3.b32 %r26326, %r31236, %r31234, %r31212, 0x96; + lop3.b32 %r26326, %r26326, %r31210, %r31208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26337, %r31231, %r31205, %r31203, 0x96; + lop3.b32 %r26337, %r26337, %r31201, %r31199, 0x96; + lop3.b32 %r26338, %r31232, %r31206, %r31204, 0x96; + lop3.b32 %r26338, %r26338, %r31202, %r31200, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26349, %r31229, %r31197, %r31195, 0x96; + lop3.b32 %r26349, %r26349, %r31193, %r31191, 0x96; + lop3.b32 %r26350, %r31230, %r31198, %r31196, 0x96; + lop3.b32 %r26350, %r26350, %r31194, %r31192, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26361, %r26314, %r26313, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26365, %r26313, %r26314, %r26400; + // end inline asm + xor.b32 %r26539, %r26361, %r26349; + xor.b32 %r26540, %r26365, %r26350; + xor.b32 %r26508, %r31227, %r26539; + xor.b32 %r26511, %r31228, %r26540; + xor.b32 %r26471, %r31224, %r26540; + xor.b32 %r26470, %r31223, %r26539; + st.local.v2.u32 [%rd270+104], {%r26470, %r26471}; + // begin inline asm + shf.l.wrap.b32 %r26369, %r26326, %r26325, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26373, %r26325, %r26326, %r26400; + // end inline asm + xor.b32 %r26541, %r26369, %r26301; + xor.b32 %r26542, %r26373, %r26302; + xor.b32 %r26407, %r31237, %r26541; + xor.b32 %r26406, %r31238, %r26542; + xor.b32 %r26446, %r31216, %r26542; + xor.b32 %r26447, %r31215, %r26541; + st.local.v2.u32 [%rd270+152], {%r26447, %r26446}; + // begin inline asm + shf.l.wrap.b32 %r26377, %r26338, %r26337, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26381, %r26337, %r26338, %r26400; + // end inline asm + xor.b32 %r26543, %r26377, %r26313; + xor.b32 %r26544, %r26381, %r26314; + xor.b32 %r26430, %r31212, %r26544; + xor.b32 %r26431, %r31211, %r26543; + st.local.v2.u32 [%rd270+120], {%r26431, %r26430}; + xor.b32 %r26422, %r31208, %r26544; + xor.b32 %r26423, %r31207, %r26543; + st.local.v2.u32 [%rd270+200], {%r26423, %r26422}; + // begin inline asm + shf.l.wrap.b32 %r26385, %r26350, %r26349, %r26400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26389, %r26349, %r26350, %r26400; + // end inline asm + xor.b32 %r26545, %r26385, %r26325; + xor.b32 %r26546, %r26389, %r26326; + xor.b32 %r26454, %r31231, %r26545; + xor.b32 %r26455, %r31232, %r26546; + xor.b32 %r26463, %r31202, %r26546; + xor.b32 %r26462, %r31201, %r26545; + st.local.v2.u32 [%rd270+168], {%r26462, %r26463}; + // begin inline asm + shf.l.wrap.b32 %r26393, %r26302, %r26301, %r26400; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r26397, %r26301, %r26302, %r26400; + // end inline asm + xor.b32 %r26547, %r26393, %r26337; + xor.b32 %r26548, %r26397, %r26338; + xor.b32 %r26414, %r31197, %r26547; + xor.b32 %r26415, %r31198, %r26548; + xor.b32 %r26439, %r31192, %r26548; + xor.b32 %r26438, %r31191, %r26547; + st.local.v2.u32 [%rd270+216], {%r26438, %r26439}; + // begin inline asm + shf.l.wrap.b32 %r26401, %r26407, %r26406, %r29808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26405, %r26406, %r26407, %r29808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26409, %r26415, %r26414, %r29809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26413, %r26414, %r26415, %r29809; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26421, %r26422, %r26423, %r29810; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26417, %r26423, %r26422, %r29810; + // end inline asm + st.local.v2.u32 [%rd270+96], {%r26417, %r26421}; + // begin inline asm + shf.l.wrap.b32 %r26425, %r26431, %r26430, %r29811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26429, %r26430, %r26431, %r29811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26433, %r26439, %r26438, %r29812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26437, %r26438, %r26439, %r29812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26445, %r26446, %r26447, %r29813; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26441, %r26447, %r26446, %r29813; + // end inline asm + st.local.v2.u32 [%rd270+88], {%r26441, %r26445}; + // begin inline asm + shf.l.wrap.b32 %r26449, %r26455, %r26454, %r29814; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26453, %r26454, %r26455, %r29814; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26457, %r26463, %r26462, %r29815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26461, %r26462, %r26463, %r29815; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26465, %r26471, %r26470, %r29816; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26469, %r26470, %r26471, %r29816; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26473, %r26508, %r26401, %r26425, 0xD2; + lop3.b32 %r26474, %r26511, %r26405, %r26429, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26481, %r26401, %r26425, %r26457, 0xD2; + lop3.b32 %r26482, %r26405, %r26429, %r26461, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+32], {%r26481, %r26482}; + // begin inline asm + // chi + lop3.b32 %r26489, %r26425, %r26457, %r26433, 0xD2; + lop3.b32 %r26490, %r26429, %r26461, %r26437, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+40], {%r26489, %r26490}; + // begin inline asm + // chi + lop3.b32 %r26497, %r26457, %r26433, %r26508, 0xD2; + lop3.b32 %r26498, %r26461, %r26437, %r26511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+48], {%r26497, %r26498}; + // begin inline asm + // chi + lop3.b32 %r26505, %r26433, %r26508, %r26401, 0xD2; + lop3.b32 %r26506, %r26437, %r26511, %r26405, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+56], {%r26505, %r26506}; + // begin inline asm + // chi + lop3.b32 %r26513, %r26449, %r26409, %r26465, 0xD2; + lop3.b32 %r26514, %r26453, %r26413, %r26469, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+64], {%r26513, %r26514}; + // begin inline asm + // chi + lop3.b32 %r26521, %r26409, %r26465, %r26441, 0xD2; + lop3.b32 %r26522, %r26413, %r26469, %r26445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+72], {%r26521, 
%r26522}; + // begin inline asm + // chi + lop3.b32 %r26529, %r26465, %r26441, %r26417, 0xD2; + lop3.b32 %r26530, %r26469, %r26445, %r26421, 0xD2; + // end inline asm + st.local.v2.u32 [%rd270+80], {%r26529, %r26530}; + // begin inline asm + ld.global.nc.v2.u32 {%r26537,%r26538}, [%rd1292]; + // end inline asm + xor.b32 %r26549, %r26474, %r26538; + xor.b32 %r26550, %r26473, %r26537; + st.local.v2.u32 [%rd270+24], {%r26550, %r26549}; + mov.b64 %rd1358, {%r26481, %r26482}; + mov.b64 %rd1359, {%r26489, %r26490}; + mov.b64 %rd1362, {%r26513, %r26514}; + mov.b64 %rd1363, {%r26521, %r26522}; + mov.b64 %rd1364, {%r26529, %r26530}; + mov.b64 %rd1357, {%r26550, %r26549}; + mov.b64 %rd1360, {%r26497, %r26498}; + mov.b64 %rd1361, {%r26505, %r26506}; + bra.uni $L__BB2_86; + +$L__BB2_64: + st.local.u64 [%rd2], %rd361; + mov.u64 %rd881, 1179641; + st.local.u64 [%rd2+8], %rd881; + st.local.u32 [%rd2+16], %r3343; + ld.global.u64 %rd882, [%rd220]; + ld.global.u64 %rd883, [%rd220+8]; + ld.global.u64 %rd884, [%rd220+16]; + ld.global.u64 %rd885, [%rd220+24]; + ld.global.u64 %rd886, [%rd220+32]; + ld.global.u64 %rd887, [%rd220+40]; + ld.global.u64 %rd888, [%rd220+48]; + ld.global.u64 %rd889, [%rd220+56]; + st.local.u64 [%rd2+24], %rd882; + st.local.u64 [%rd2+32], %rd883; + st.local.u64 [%rd2+40], %rd884; + st.local.u64 [%rd2+48], %rd885; + st.local.u64 [%rd2+56], %rd886; + st.local.u64 [%rd2+64], %rd887; + st.local.u64 [%rd2+72], %rd888; + st.local.u64 [%rd2+80], %rd889; + cvt.u32.u64 %r20023, %rd882; + xor.b32 %r20024, %r3343, %r20023; + st.local.u32 [%rd2+24], %r20024; + mov.u32 %r30768, 0; + st.local.v2.u32 [%rd2+96], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+104], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+112], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+120], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+128], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+136], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+144], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+152], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+160], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+168], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+176], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+184], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+192], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+200], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+208], {%r30768, %r30768}; + st.local.v2.u32 [%rd2+216], {%r30768, %r30768}; + mov.u32 %r30783, -2147483648; + mov.u32 %r19996, 1; + st.local.v2.u32 [%rd2+88], {%r19996, %r30783}; + ld.local.v2.u32 {%r30804, %r30805}, [%rd2+24]; + mov.b64 {%r30802, %r30803}, %rd887; + shr.u64 %rd890, %rd883, 32; + cvt.u32.u64 %r30816, %rd883; + cvt.u32.u64 %r30817, %rd890; + shr.u64 %rd891, %rd888, 32; + cvt.u32.u64 %r30814, %rd888; + cvt.u32.u64 %r30815, %rd891; + shr.u64 %rd892, %rd884, 32; + cvt.u32.u64 %r30812, %rd884; + cvt.u32.u64 %r30813, %rd892; + shr.u64 %rd893, %rd889, 32; + cvt.u32.u64 %r30810, %rd889; + cvt.u32.u64 %r30811, %rd893; + shr.u64 %rd894, %rd885, 32; + cvt.u32.u64 %r30808, %rd885; + cvt.u32.u64 %r30809, %rd894; + shr.u64 %rd895, %rd886, 32; + cvt.u32.u64 %r30806, %rd886; + cvt.u32.u64 %r30807, %rd895; + mov.u32 %r30769, %r30768; + mov.u32 %r30770, %r30768; + mov.u32 %r30771, %r30768; + mov.u32 %r30772, %r30768; + mov.u32 %r30773, %r30768; + mov.u32 %r30774, %r30768; + mov.u32 %r30775, %r30768; + mov.u32 %r30776, %r30768; + mov.u32 %r30777, %r30768; + mov.u32 %r30778, %r30768; + mov.u32 %r30779, %r30768; + mov.u32 %r30780, %r30768; + mov.u32 %r30781, %r30768; + mov.u32 %r30782, %r19996; + mov.u32 %r30784, 
%r30768; + mov.u32 %r30785, %r30768; + mov.u32 %r30786, %r30768; + mov.u32 %r30787, %r30768; + mov.u32 %r30788, %r30768; + mov.u32 %r30789, %r30768; + mov.u32 %r30790, %r30768; + mov.u32 %r30791, %r30768; + mov.u32 %r30792, %r30768; + mov.u32 %r30793, %r30768; + mov.u32 %r30794, %r30768; + mov.u32 %r30795, %r30768; + mov.u32 %r30796, %r30768; + mov.u32 %r30797, %r30768; + mov.u32 %r30798, %r30768; + mov.u32 %r30799, %r30768; + mov.u32 %r30800, %r30768; + mov.u32 %r30801, %r30768; + mov.u32 %r30818, %r30768; + +$L__BB2_65: + // begin inline asm + // xor5 + lop3.b32 %r20027, %r30804, %r30802, %r30800, 0x96; + lop3.b32 %r20027, %r20027, %r30798, %r30796, 0x96; + lop3.b32 %r20028, %r30805, %r30803, %r30801, 0x96; + lop3.b32 %r20028, %r20028, %r30799, %r30797, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20039, %r30816, %r30814, %r30794, 0x96; + lop3.b32 %r20039, %r20039, %r30792, %r30790, 0x96; + lop3.b32 %r20040, %r30817, %r30815, %r30795, 0x96; + lop3.b32 %r20040, %r20040, %r30793, %r30791, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20051, %r30812, %r30810, %r30788, 0x96; + lop3.b32 %r20051, %r20051, %r30786, %r30784, 0x96; + lop3.b32 %r20052, %r30813, %r30811, %r30789, 0x96; + lop3.b32 %r20052, %r20052, %r30787, %r30785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20063, %r30808, %r30782, %r30780, 0x96; + lop3.b32 %r20063, %r20063, %r30778, %r30776, 0x96; + lop3.b32 %r20064, %r30809, %r30783, %r30781, 0x96; + lop3.b32 %r20064, %r20064, %r30779, %r30777, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20075, %r30806, %r30774, %r30772, 0x96; + lop3.b32 %r20075, %r20075, %r30770, %r30768, 0x96; + lop3.b32 %r20076, %r30807, %r30775, %r30773, 0x96; + lop3.b32 %r20076, %r20076, %r30771, %r30769, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20087, %r20040, %r20039, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20091, %r20039, %r20040, %r19996; + // end inline asm + xor.b32 %r20521, %r20087, %r20075; + xor.b32 %r20522, %r20091, %r20076; + xor.b32 %r20354, %r30804, %r20521; + xor.b32 %r20357, %r30805, %r20522; + xor.b32 %r20261, %r30802, %r20521; + xor.b32 %r20260, %r30803, %r20522; + xor.b32 %r20308, %r30800, %r20521; + xor.b32 %r20309, %r30801, %r20522; + xor.b32 %r20213, %r30798, %r20521; + xor.b32 %r20212, %r30799, %r20522; + xor.b32 %r20164, %r30796, %r20521; + xor.b32 %r20165, %r30797, %r20522; + // begin inline asm + shf.l.wrap.b32 %r20095, %r20052, %r20051, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20099, %r20051, %r20052, %r19996; + // end inline asm + xor.b32 %r20523, %r20095, %r20027; + xor.b32 %r20524, %r20099, %r20028; + xor.b32 %r20316, %r30816, %r20523; + xor.b32 %r20317, %r30817, %r20524; + xor.b32 %r20133, %r30814, %r20523; + xor.b32 %r20132, %r30815, %r20524; + xor.b32 %r20292, %r30794, %r20523; + xor.b32 %r20293, %r30795, %r20524; + xor.b32 %r20253, %r30792, %r20523; + xor.b32 %r20252, %r30793, %r20524; + xor.b32 %r20236, %r30790, %r20523; + xor.b32 %r20237, %r30791, %r20524; + // begin inline asm + shf.l.wrap.b32 %r20103, %r20064, %r20063, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20107, %r20063, %r20064, %r19996; + // end inline asm + xor.b32 %r20525, %r20103, %r20039; + xor.b32 %r20526, %r20107, %r20040; + xor.b32 %r20173, %r30812, %r20525; + xor.b32 %r20172, %r30813, %r20526; + xor.b32 %r20300, %r30810, %r20525; + xor.b32 %r20301, %r30811, %r20526; + xor.b32 %r20181, 
%r30788, %r20525; + xor.b32 %r20180, %r30789, %r20526; + xor.b32 %r20284, %r30786, %r20525; + xor.b32 %r20285, %r30787, %r20526; + xor.b32 %r20149, %r30784, %r20525; + xor.b32 %r20148, %r30785, %r20526; + // begin inline asm + shf.l.wrap.b32 %r20111, %r20076, %r20075, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20115, %r20075, %r20076, %r19996; + // end inline asm + xor.b32 %r20527, %r20111, %r20051; + xor.b32 %r20528, %r20115, %r20052; + xor.b32 %r20268, %r30808, %r20527; + xor.b32 %r20269, %r30809, %r20528; + xor.b32 %r20245, %r30782, %r20527; + xor.b32 %r20244, %r30783, %r20528; + xor.b32 %r20188, %r30780, %r20527; + xor.b32 %r20189, %r30781, %r20528; + xor.b32 %r20276, %r30778, %r20527; + xor.b32 %r20277, %r30779, %r20528; + xor.b32 %r20205, %r30776, %r20527; + xor.b32 %r20204, %r30777, %r20528; + // begin inline asm + shf.l.wrap.b32 %r20119, %r20028, %r20027, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20123, %r20027, %r20028, %r19996; + // end inline asm + xor.b32 %r20529, %r20119, %r20063; + xor.b32 %r20530, %r20123, %r20064; + xor.b32 %r20220, %r30806, %r20529; + xor.b32 %r20221, %r30807, %r20530; + xor.b32 %r20140, %r30774, %r20529; + xor.b32 %r20141, %r30775, %r20530; + xor.b32 %r20157, %r30772, %r20529; + xor.b32 %r20156, %r30773, %r20530; + xor.b32 %r20196, %r30770, %r20529; + xor.b32 %r20197, %r30771, %r20530; + xor.b32 %r20228, %r30768, %r20529; + xor.b32 %r20229, %r30769, %r20530; + mov.u32 %r20134, 44; + // begin inline asm + shf.l.wrap.b32 %r20127, %r20133, %r20132, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20131, %r20132, %r20133, %r20134; + // end inline asm + mov.u32 %r20142, 20; + // begin inline asm + shf.l.wrap.b32 %r20135, %r20141, %r20140, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20139, %r20140, %r20141, %r20142; + // end inline asm + mov.u32 %r20150, 61; + // begin inline asm + shf.l.wrap.b32 %r20143, %r20149, %r20148, %r20150; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20147, %r20148, %r20149, %r20150; + // end inline asm + mov.u32 %r20158, 39; + // begin inline asm + shf.l.wrap.b32 %r20151, %r20157, %r20156, %r20158; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20155, %r20156, %r20157, %r20158; + // end inline asm + mov.u32 %r20166, 18; + // begin inline asm + shf.l.wrap.b32 %r20159, %r20165, %r20164, %r20166; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20163, %r20164, %r20165, %r20166; + // end inline asm + mov.u32 %r20174, 62; + // begin inline asm + shf.l.wrap.b32 %r20167, %r20173, %r20172, %r20174; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20171, %r20172, %r20173, %r20174; + // end inline asm + mov.u32 %r20182, 43; + // begin inline asm + shf.l.wrap.b32 %r20175, %r20181, %r20180, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20179, %r20180, %r20181, %r20182; + // end inline asm + mov.u32 %r20190, 25; + // begin inline asm + shf.l.wrap.b32 %r20183, %r20189, %r20188, %r20190; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20187, %r20188, %r20189, %r20190; + // end inline asm + mov.u32 %r20198, 8; + // begin inline asm + shf.l.wrap.b32 %r20191, %r20197, %r20196, %r20198; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20195, %r20196, %r20197, %r20198; + // end inline asm + mov.u32 %r20206, 56; + // begin inline asm + shf.l.wrap.b32 %r20199, %r20205, %r20204, %r20206; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20203, 
%r20204, %r20205, %r20206; + // end inline asm + mov.u32 %r20214, 41; + // begin inline asm + shf.l.wrap.b32 %r20207, %r20213, %r20212, %r20214; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20211, %r20212, %r20213, %r20214; + // end inline asm + mov.u32 %r20222, 27; + // begin inline asm + shf.l.wrap.b32 %r20215, %r20221, %r20220, %r20222; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20219, %r20220, %r20221, %r20222; + // end inline asm + mov.u32 %r20230, 14; + // begin inline asm + shf.l.wrap.b32 %r20223, %r20229, %r20228, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20227, %r20228, %r20229, %r20230; + // end inline asm + mov.u32 %r20238, 2; + // begin inline asm + shf.l.wrap.b32 %r20231, %r20237, %r20236, %r20238; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20235, %r20236, %r20237, %r20238; + // end inline asm + mov.u32 %r20246, 55; + // begin inline asm + shf.l.wrap.b32 %r20239, %r20245, %r20244, %r20246; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20243, %r20244, %r20245, %r20246; + // end inline asm + mov.u32 %r20254, 45; + // begin inline asm + shf.l.wrap.b32 %r20247, %r20253, %r20252, %r20254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20251, %r20252, %r20253, %r20254; + // end inline asm + mov.u32 %r20262, 36; + // begin inline asm + shf.l.wrap.b32 %r20255, %r20261, %r20260, %r20262; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20259, %r20260, %r20261, %r20262; + // end inline asm + mov.u32 %r20270, 28; + // begin inline asm + shf.l.wrap.b32 %r20263, %r20269, %r20268, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20267, %r20268, %r20269, %r20270; + // end inline asm + mov.u32 %r20278, 21; + // begin inline asm + shf.l.wrap.b32 %r20271, %r20277, %r20276, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20275, %r20276, %r20277, %r20278; + // end inline asm + mov.u32 %r20286, 15; + // begin inline asm + shf.l.wrap.b32 %r20279, %r20285, %r20284, %r20286; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20283, %r20284, %r20285, %r20286; + // end inline asm + mov.u32 %r20294, 10; + // begin inline asm + shf.l.wrap.b32 %r20287, %r20293, %r20292, %r20294; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20291, %r20292, %r20293, %r20294; + // end inline asm + mov.u32 %r20302, 6; + // begin inline asm + shf.l.wrap.b32 %r20295, %r20301, %r20300, %r20302; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20299, %r20300, %r20301, %r20302; + // end inline asm + mov.u32 %r20310, 3; + // begin inline asm + shf.l.wrap.b32 %r20303, %r20309, %r20308, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20307, %r20308, %r20309, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20311, %r20317, %r20316, %r19996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20315, %r20316, %r20317, %r19996; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20319, %r20354, %r20127, %r20175, 0xD2; + lop3.b32 %r20320, %r20357, %r20131, %r20179, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30816, %r20127, %r20175, %r20271, 0xD2; + lop3.b32 %r30817, %r20131, %r20179, %r20275, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30812, %r20175, %r20271, %r20223, 0xD2; + lop3.b32 %r30813, %r20179, %r20275, %r20227, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30808, %r20271, %r20223, %r20354, 
0xD2; + lop3.b32 %r30809, %r20275, %r20227, %r20357, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30806, %r20223, %r20354, %r20127, 0xD2; + lop3.b32 %r30807, %r20227, %r20357, %r20131, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30802, %r20263, %r20135, %r20303, 0xD2; + lop3.b32 %r30803, %r20267, %r20139, %r20307, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30814, %r20135, %r20303, %r20247, 0xD2; + lop3.b32 %r30815, %r20139, %r20307, %r20251, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30810, %r20303, %r20247, %r20143, 0xD2; + lop3.b32 %r30811, %r20307, %r20251, %r20147, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30782, %r20247, %r20143, %r20263, 0xD2; + lop3.b32 %r30783, %r20251, %r20147, %r20267, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30782, %r30783}; + // begin inline asm + // chi + lop3.b32 %r30774, %r20143, %r20263, %r20135, 0xD2; + lop3.b32 %r30775, %r20147, %r20267, %r20139, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30774, %r30775}; + // begin inline asm + // chi + lop3.b32 %r30800, %r20311, %r20295, %r20183, 0xD2; + lop3.b32 %r30801, %r20315, %r20299, %r20187, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30800, %r30801}; + // begin inline asm + // chi + lop3.b32 %r30794, %r20295, %r20183, %r20191, 0xD2; + lop3.b32 %r30795, %r20299, %r20187, %r20195, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30794, %r30795}; + // begin inline asm + // chi + lop3.b32 %r30788, %r20183, %r20191, %r20159, 0xD2; + lop3.b32 %r30789, %r20187, %r20195, %r20163, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30788, %r30789}; + // begin inline asm + // chi + lop3.b32 %r30780, %r20191, %r20159, %r20311, 0xD2; + lop3.b32 %r30781, %r20195, %r20163, %r20315, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30780, %r30781}; + // begin inline asm + // chi + lop3.b32 %r30772, %r20159, %r20311, %r20295, 0xD2; + lop3.b32 %r30773, %r20163, %r20315, %r20299, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30772, %r30773}; + // begin inline asm + // chi + lop3.b32 %r30798, %r20215, %r20255, %r20287, 0xD2; + lop3.b32 %r30799, %r20219, %r20259, %r20291, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30798, %r30799}; + // begin inline asm + // chi + lop3.b32 %r30792, %r20255, %r20287, %r20279, 0xD2; + lop3.b32 %r30793, %r20259, %r20291, %r20283, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30792, %r30793}; + // begin inline asm + // chi + lop3.b32 %r30786, %r20287, %r20279, %r20199, 0xD2; + lop3.b32 %r30787, %r20291, %r20283, %r20203, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30786, %r30787}; + // begin inline asm + // chi + lop3.b32 %r30778, %r20279, %r20199, %r20215, 0xD2; + lop3.b32 %r30779, %r20283, %r20203, %r20219, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30778, %r30779}; + // begin inline asm + // chi + lop3.b32 %r30770, %r20199, %r20215, %r20255, 0xD2; + lop3.b32 %r30771, %r20203, %r20219, %r20259, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30770, %r30771}; + // begin inline asm + // chi + lop3.b32 %r30796, %r20167, %r20239, %r20151, 0xD2; + lop3.b32 %r30797, %r20171, %r20243, %r20155, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30796, %r30797}; + // begin inline asm + // chi + lop3.b32 %r30790, %r20239, %r20151, %r20207, 0xD2; + lop3.b32 %r30791, %r20243, %r20155, %r20211, 0xD2; + // end inline asm + 
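// chi step: lop3.b32 with immediate 0xD2 computes a ^ (~b & c) on each 32-bit half of a 64-bit lane. +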
st.local.v2.u32 [%rd2+192], {%r30790, %r30791}; + // begin inline asm + // chi + lop3.b32 %r30784, %r20151, %r20207, %r20231, 0xD2; + lop3.b32 %r30785, %r20155, %r20211, %r20235, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30784, %r30785}; + // begin inline asm + // chi + lop3.b32 %r30776, %r20207, %r20231, %r20167, 0xD2; + lop3.b32 %r30777, %r20211, %r20235, %r20171, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30776, %r30777}; + // begin inline asm + // chi + lop3.b32 %r30768, %r20231, %r20167, %r20239, 0xD2; + lop3.b32 %r30769, %r20235, %r20171, %r20243, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30768, %r30769}; + mul.wide.s32 %rd899, %r30818, 8; + mov.u64 %rd900, keccak_round_constants; + cvta.const.u64 %rd901, %rd900; + add.s64 %rd896, %rd901, %rd899; + // begin inline asm + ld.global.nc.v2.u32 {%r20519,%r20520}, [%rd896]; + // end inline asm + xor.b32 %r30804, %r20319, %r20519; + xor.b32 %r30805, %r20320, %r20520; + add.s32 %r30818, %r30818, 1; + setp.lt.u32 %p38, %r30818, 23; + @%p38 bra $L__BB2_65; + + st.local.v2.u32 [%rd2+32], {%r30816, %r30817}; + st.local.v2.u32 [%rd2+72], {%r30814, %r30815}; + st.local.v2.u32 [%rd2+40], {%r30812, %r30813}; + st.local.v2.u32 [%rd2+80], {%r30810, %r30811}; + st.local.v2.u32 [%rd2+48], {%r30808, %r30809}; + st.local.v2.u32 [%rd2+56], {%r30806, %r30807}; + st.local.v2.u32 [%rd2+24], {%r30804, %r30805}; + // begin inline asm + // xor5 + lop3.b32 %r20531, %r30804, %r30802, %r30800, 0x96; + lop3.b32 %r20531, %r20531, %r30798, %r30796, 0x96; + lop3.b32 %r20532, %r30805, %r30803, %r30801, 0x96; + lop3.b32 %r20532, %r20532, %r30799, %r30797, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20543, %r30816, %r30814, %r30794, 0x96; + lop3.b32 %r20543, %r20543, %r30792, %r30790, 0x96; + lop3.b32 %r20544, %r30817, %r30815, %r30795, 0x96; + lop3.b32 %r20544, %r20544, %r30793, %r30791, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20555, %r30812, %r30810, %r30788, 0x96; + lop3.b32 %r20555, %r20555, %r30786, %r30784, 0x96; + lop3.b32 %r20556, %r30813, %r30811, %r30789, 0x96; + lop3.b32 %r20556, %r20556, %r30787, %r30785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20567, %r30808, %r30782, %r30780, 0x96; + lop3.b32 %r20567, %r20567, %r30778, %r30776, 0x96; + lop3.b32 %r20568, %r30809, %r30783, %r30781, 0x96; + lop3.b32 %r20568, %r20568, %r30779, %r30777, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20579, %r30806, %r30774, %r30772, 0x96; + lop3.b32 %r20579, %r20579, %r30770, %r30768, 0x96; + lop3.b32 %r20580, %r30807, %r30775, %r30773, 0x96; + lop3.b32 %r20580, %r20580, %r30771, %r30769, 0x96; + // end inline asm + mov.u32 %r30833, 1; + // begin inline asm + shf.l.wrap.b32 %r20591, %r20544, %r20543, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20595, %r20543, %r20544, %r30833; + // end inline asm + xor.b32 %r20810, %r20591, %r20579; + xor.b32 %r20811, %r20595, %r20580; + xor.b32 %r20738, %r30804, %r20810; + xor.b32 %r20741, %r30805, %r20811; + xor.b32 %r20701, %r30801, %r20811; + xor.b32 %r20700, %r30800, %r20810; + st.local.v2.u32 [%rd2+104], {%r20700, %r20701}; + // begin inline asm + shf.l.wrap.b32 %r20599, %r20556, %r20555, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20603, %r20555, %r20556, %r30833; + // end inline asm + xor.b32 %r20812, %r20599, %r20531; + xor.b32 %r20813, %r20603, %r20532; + xor.b32 %r20637, %r30814, %r20812; + xor.b32 %r20636, %r30815, %r20813; 
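+ // theta step: the rotated column parities from the xor5/shf sequences above are folded back into every lane of the state.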
+ xor.b32 %r20676, %r30793, %r20813; + xor.b32 %r20677, %r30792, %r20812; + st.local.v2.u32 [%rd2+152], {%r20677, %r20676}; + // begin inline asm + shf.l.wrap.b32 %r20607, %r20568, %r20567, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20611, %r20567, %r20568, %r30833; + // end inline asm + xor.b32 %r20814, %r20607, %r20543; + xor.b32 %r20815, %r20611, %r20544; + xor.b32 %r20660, %r30789, %r20815; + xor.b32 %r20661, %r30788, %r20814; + st.local.v2.u32 [%rd2+120], {%r20661, %r20660}; + xor.b32 %r20652, %r30785, %r20815; + xor.b32 %r20653, %r30784, %r20814; + st.local.v2.u32 [%rd2+200], {%r20653, %r20652}; + // begin inline asm + shf.l.wrap.b32 %r20615, %r20580, %r20579, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20619, %r20579, %r20580, %r30833; + // end inline asm + xor.b32 %r20816, %r20615, %r20555; + xor.b32 %r20817, %r20619, %r20556; + xor.b32 %r20684, %r30808, %r20816; + xor.b32 %r20685, %r30809, %r20817; + xor.b32 %r20693, %r30779, %r20817; + xor.b32 %r20692, %r30778, %r20816; + st.local.v2.u32 [%rd2+168], {%r20692, %r20693}; + // begin inline asm + shf.l.wrap.b32 %r20623, %r20532, %r20531, %r30833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20627, %r20531, %r20532, %r30833; + // end inline asm + xor.b32 %r20818, %r20623, %r20567; + xor.b32 %r20819, %r20627, %r20568; + xor.b32 %r20644, %r30774, %r20818; + xor.b32 %r20645, %r30775, %r20819; + xor.b32 %r20669, %r30769, %r20819; + xor.b32 %r20668, %r30768, %r20818; + st.local.v2.u32 [%rd2+216], {%r20668, %r20669}; + // begin inline asm + shf.l.wrap.b32 %r20631, %r20637, %r20636, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20635, %r20636, %r20637, %r20134; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20639, %r20645, %r20644, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20643, %r20644, %r20645, %r20142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20651, %r20652, %r20653, %r20150; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20647, %r20653, %r20652, %r20150; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r20647, %r20651}; + // begin inline asm + shf.l.wrap.b32 %r20655, %r20661, %r20660, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20659, %r20660, %r20661, %r20182; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20663, %r20669, %r20668, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20667, %r20668, %r20669, %r20230; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20675, %r20676, %r20677, %r20254; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20671, %r20677, %r20676, %r20254; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r20671, %r20675}; + // begin inline asm + shf.l.wrap.b32 %r20679, %r20685, %r20684, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20683, %r20684, %r20685, %r20270; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20687, %r20693, %r20692, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20691, %r20692, %r20693, %r20278; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20695, %r20701, %r20700, %r20310; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20699, %r20700, %r20701, %r20310; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20703, %r20738, %r20631, %r20655, 0xD2; + lop3.b32 %r20704, %r20741, %r20635, %r20659, 0xD2; + // end inline asm + // begin inline asm + // 
chi + lop3.b32 %r30951, %r20631, %r20655, %r20687, 0xD2; + lop3.b32 %r30952, %r20635, %r20659, %r20691, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r30951, %r30952}; + // begin inline asm + // chi + lop3.b32 %r30947, %r20655, %r20687, %r20663, 0xD2; + lop3.b32 %r30948, %r20659, %r20691, %r20667, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r30947, %r30948}; + // begin inline asm + // chi + lop3.b32 %r30943, %r20687, %r20663, %r20738, 0xD2; + lop3.b32 %r30944, %r20691, %r20667, %r20741, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r30943, %r30944}; + // begin inline asm + // chi + lop3.b32 %r30941, %r20663, %r20738, %r20631, 0xD2; + lop3.b32 %r30942, %r20667, %r20741, %r20635, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r30941, %r30942}; + // begin inline asm + // chi + lop3.b32 %r30937, %r20679, %r20639, %r20695, 0xD2; + lop3.b32 %r30938, %r20683, %r20643, %r20699, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r30937, %r30938}; + // begin inline asm + // chi + lop3.b32 %r30949, %r20639, %r20695, %r20671, 0xD2; + lop3.b32 %r30950, %r20643, %r20699, %r20675, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r30949, %r30950}; + // begin inline asm + // chi + lop3.b32 %r30945, %r20695, %r20671, %r20647, 0xD2; + lop3.b32 %r30946, %r20699, %r20675, %r20651, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r30945, %r30946}; + add.s64 %rd902, %rd901, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r20767,%r20768}, [%rd902]; + // end inline asm + xor.b32 %r30939, %r20703, %r20767; + xor.b32 %r30940, %r20704, %r20768; + add.u64 %rd908, %SPL, 1912; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + st.local.u64 [%rd908], %rd361; + mov.u64 %rd909, 1179641; + st.local.u64 [%rd908+8], %rd909; + add.s32 %r20820, %r3343, 1; + st.local.u32 [%rd908+16], %r20820; + ld.global.u64 %rd910, [%rd221]; + ld.global.u64 %rd911, [%rd221+8]; + ld.global.u64 %rd912, [%rd221+16]; + ld.global.u64 %rd913, [%rd221+24]; + ld.global.u64 %rd914, [%rd221+32]; + ld.global.u64 %rd915, [%rd221+40]; + ld.global.u64 %rd916, [%rd221+48]; + ld.global.u64 %rd917, [%rd221+56]; + st.local.u64 [%rd908+32], %rd911; + st.local.u64 [%rd908+40], %rd912; + st.local.u64 [%rd908+48], %rd913; + st.local.u64 [%rd908+56], %rd914; + st.local.u64 [%rd908+64], %rd915; + st.local.u64 [%rd908+72], %rd916; + st.local.u64 [%rd908+80], %rd917; + cvt.u32.u64 %r20821, %rd910; + xor.b32 %r20822, %r20820, %r20821; + st.local.u64 [%rd908+24], %rd910; + st.local.u32 [%rd908+24], %r20822; + mov.u32 %r30819, 0; + st.local.v2.u32 [%rd908+96], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+104], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+112], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+120], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+128], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+136], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+144], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+152], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+160], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+168], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+176], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+184], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+192], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+200], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+208], {%r30819, %r30819}; + st.local.v2.u32 [%rd908+216], {%r30819, %r30819}; + mov.u32 %r30834, -2147483648; + st.local.v2.u32 [%rd908+88], {%r30833, %r30834}; + ld.local.v2.u32 {%r30855, %r30856}, [%rd908+24]; + mov.b64 {%r30853, 
%r30854}, %rd915; + shr.u64 %rd918, %rd911, 32; + cvt.u32.u64 %r30867, %rd911; + cvt.u32.u64 %r30868, %rd918; + shr.u64 %rd919, %rd916, 32; + cvt.u32.u64 %r30865, %rd916; + cvt.u32.u64 %r30866, %rd919; + shr.u64 %rd920, %rd912, 32; + cvt.u32.u64 %r30863, %rd912; + cvt.u32.u64 %r30864, %rd920; + shr.u64 %rd921, %rd917, 32; + cvt.u32.u64 %r30861, %rd917; + cvt.u32.u64 %r30862, %rd921; + shr.u64 %rd922, %rd913, 32; + cvt.u32.u64 %r30859, %rd913; + cvt.u32.u64 %r30860, %rd922; + shr.u64 %rd923, %rd914, 32; + cvt.u32.u64 %r30857, %rd914; + cvt.u32.u64 %r30858, %rd923; + mov.u32 %r30820, %r30819; + mov.u32 %r30821, %r30819; + mov.u32 %r30822, %r30819; + mov.u32 %r30823, %r30819; + mov.u32 %r30824, %r30819; + mov.u32 %r30825, %r30819; + mov.u32 %r30826, %r30819; + mov.u32 %r30827, %r30819; + mov.u32 %r30828, %r30819; + mov.u32 %r30829, %r30819; + mov.u32 %r30830, %r30819; + mov.u32 %r30831, %r30819; + mov.u32 %r30832, %r30819; + mov.u32 %r30835, %r30819; + mov.u32 %r30836, %r30819; + mov.u32 %r30837, %r30819; + mov.u32 %r30838, %r30819; + mov.u32 %r30839, %r30819; + mov.u32 %r30840, %r30819; + mov.u32 %r30841, %r30819; + mov.u32 %r30842, %r30819; + mov.u32 %r30843, %r30819; + mov.u32 %r30844, %r30819; + mov.u32 %r30845, %r30819; + mov.u32 %r30846, %r30819; + mov.u32 %r30847, %r30819; + mov.u32 %r30848, %r30819; + mov.u32 %r30849, %r30819; + mov.u32 %r30850, %r30819; + mov.u32 %r30851, %r30819; + mov.u32 %r30852, %r30819; + mov.u32 %r30869, %r30819; + +$L__BB2_67: + mov.u32 %r29766, 1; + // begin inline asm + // xor5 + lop3.b32 %r20825, %r30855, %r30853, %r30851, 0x96; + lop3.b32 %r20825, %r20825, %r30849, %r30847, 0x96; + lop3.b32 %r20826, %r30856, %r30854, %r30852, 0x96; + lop3.b32 %r20826, %r20826, %r30850, %r30848, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20837, %r30867, %r30865, %r30845, 0x96; + lop3.b32 %r20837, %r20837, %r30843, %r30841, 0x96; + lop3.b32 %r20838, %r30868, %r30866, %r30846, 0x96; + lop3.b32 %r20838, %r20838, %r30844, %r30842, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20849, %r30863, %r30861, %r30839, 0x96; + lop3.b32 %r20849, %r20849, %r30837, %r30835, 0x96; + lop3.b32 %r20850, %r30864, %r30862, %r30840, 0x96; + lop3.b32 %r20850, %r20850, %r30838, %r30836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20861, %r30859, %r30833, %r30831, 0x96; + lop3.b32 %r20861, %r20861, %r30829, %r30827, 0x96; + lop3.b32 %r20862, %r30860, %r30834, %r30832, 0x96; + lop3.b32 %r20862, %r20862, %r30830, %r30828, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20873, %r30857, %r30825, %r30823, 0x96; + lop3.b32 %r20873, %r20873, %r30821, %r30819, 0x96; + lop3.b32 %r20874, %r30858, %r30826, %r30824, 0x96; + lop3.b32 %r20874, %r20874, %r30822, %r30820, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20885, %r20838, %r20837, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20889, %r20837, %r20838, %r29766; + // end inline asm + xor.b32 %r21319, %r20885, %r20873; + xor.b32 %r21320, %r20889, %r20874; + xor.b32 %r21152, %r30855, %r21319; + xor.b32 %r21155, %r30856, %r21320; + xor.b32 %r21059, %r30853, %r21319; + xor.b32 %r21058, %r30854, %r21320; + xor.b32 %r21106, %r30851, %r21319; + xor.b32 %r21107, %r30852, %r21320; + xor.b32 %r21011, %r30849, %r21319; + xor.b32 %r21010, %r30850, %r21320; + xor.b32 %r20962, %r30847, %r21319; + xor.b32 %r20963, %r30848, %r21320; + // begin inline asm + shf.l.wrap.b32 %r20893, %r20850, %r20849, %r29766; + // end inline asm + 
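// Each 64-bit rotate is emitted as a pair of 32-bit funnel shifts (shf.l.wrap.b32) over the two halves of a lane. +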
// begin inline asm + shf.l.wrap.b32 %r20897, %r20849, %r20850, %r29766; + // end inline asm + xor.b32 %r21321, %r20893, %r20825; + xor.b32 %r21322, %r20897, %r20826; + xor.b32 %r21114, %r30867, %r21321; + xor.b32 %r21115, %r30868, %r21322; + xor.b32 %r20931, %r30865, %r21321; + xor.b32 %r20930, %r30866, %r21322; + xor.b32 %r21090, %r30845, %r21321; + xor.b32 %r21091, %r30846, %r21322; + xor.b32 %r21051, %r30843, %r21321; + xor.b32 %r21050, %r30844, %r21322; + xor.b32 %r21034, %r30841, %r21321; + xor.b32 %r21035, %r30842, %r21322; + // begin inline asm + shf.l.wrap.b32 %r20901, %r20862, %r20861, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20905, %r20861, %r20862, %r29766; + // end inline asm + xor.b32 %r21323, %r20901, %r20837; + xor.b32 %r21324, %r20905, %r20838; + xor.b32 %r20971, %r30863, %r21323; + xor.b32 %r20970, %r30864, %r21324; + xor.b32 %r21098, %r30861, %r21323; + xor.b32 %r21099, %r30862, %r21324; + xor.b32 %r20979, %r30839, %r21323; + xor.b32 %r20978, %r30840, %r21324; + xor.b32 %r21082, %r30837, %r21323; + xor.b32 %r21083, %r30838, %r21324; + xor.b32 %r20947, %r30835, %r21323; + xor.b32 %r20946, %r30836, %r21324; + // begin inline asm + shf.l.wrap.b32 %r20909, %r20874, %r20873, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20913, %r20873, %r20874, %r29766; + // end inline asm + xor.b32 %r21325, %r20909, %r20849; + xor.b32 %r21326, %r20913, %r20850; + xor.b32 %r21066, %r30859, %r21325; + xor.b32 %r21067, %r30860, %r21326; + xor.b32 %r21043, %r30833, %r21325; + xor.b32 %r21042, %r30834, %r21326; + xor.b32 %r20986, %r30831, %r21325; + xor.b32 %r20987, %r30832, %r21326; + xor.b32 %r21074, %r30829, %r21325; + xor.b32 %r21075, %r30830, %r21326; + xor.b32 %r21003, %r30827, %r21325; + xor.b32 %r21002, %r30828, %r21326; + // begin inline asm + shf.l.wrap.b32 %r20917, %r20826, %r20825, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20921, %r20825, %r20826, %r29766; + // end inline asm + xor.b32 %r21327, %r20917, %r20861; + xor.b32 %r21328, %r20921, %r20862; + xor.b32 %r21018, %r30857, %r21327; + xor.b32 %r21019, %r30858, %r21328; + xor.b32 %r20938, %r30825, %r21327; + xor.b32 %r20939, %r30826, %r21328; + xor.b32 %r20955, %r30823, %r21327; + xor.b32 %r20954, %r30824, %r21328; + xor.b32 %r20994, %r30821, %r21327; + xor.b32 %r20995, %r30822, %r21328; + xor.b32 %r21026, %r30819, %r21327; + xor.b32 %r21027, %r30820, %r21328; + mov.u32 %r20932, 44; + // begin inline asm + shf.l.wrap.b32 %r20925, %r20931, %r20930, %r20932; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20929, %r20930, %r20931, %r20932; + // end inline asm + mov.u32 %r20940, 20; + // begin inline asm + shf.l.wrap.b32 %r20933, %r20939, %r20938, %r20940; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20937, %r20938, %r20939, %r20940; + // end inline asm + mov.u32 %r20948, 61; + // begin inline asm + shf.l.wrap.b32 %r20941, %r20947, %r20946, %r20948; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20945, %r20946, %r20947, %r20948; + // end inline asm + mov.u32 %r20956, 39; + // begin inline asm + shf.l.wrap.b32 %r20949, %r20955, %r20954, %r20956; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20953, %r20954, %r20955, %r20956; + // end inline asm + mov.u32 %r20964, 18; + // begin inline asm + shf.l.wrap.b32 %r20957, %r20963, %r20962, %r20964; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20961, %r20962, %r20963, %r20964; + // end inline asm + mov.u32 %r20972, 62; + // begin inline asm + 
shf.l.wrap.b32 %r20965, %r20971, %r20970, %r20972; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20969, %r20970, %r20971, %r20972; + // end inline asm + mov.u32 %r20980, 43; + // begin inline asm + shf.l.wrap.b32 %r20973, %r20979, %r20978, %r20980; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20977, %r20978, %r20979, %r20980; + // end inline asm + mov.u32 %r20988, 25; + // begin inline asm + shf.l.wrap.b32 %r20981, %r20987, %r20986, %r20988; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20985, %r20986, %r20987, %r20988; + // end inline asm + mov.u32 %r20996, 8; + // begin inline asm + shf.l.wrap.b32 %r20989, %r20995, %r20994, %r20996; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20993, %r20994, %r20995, %r20996; + // end inline asm + mov.u32 %r21004, 56; + // begin inline asm + shf.l.wrap.b32 %r20997, %r21003, %r21002, %r21004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21001, %r21002, %r21003, %r21004; + // end inline asm + mov.u32 %r21012, 41; + // begin inline asm + shf.l.wrap.b32 %r21005, %r21011, %r21010, %r21012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21009, %r21010, %r21011, %r21012; + // end inline asm + mov.u32 %r21020, 27; + // begin inline asm + shf.l.wrap.b32 %r21013, %r21019, %r21018, %r21020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21017, %r21018, %r21019, %r21020; + // end inline asm + mov.u32 %r21028, 14; + // begin inline asm + shf.l.wrap.b32 %r21021, %r21027, %r21026, %r21028; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21025, %r21026, %r21027, %r21028; + // end inline asm + mov.u32 %r21036, 2; + // begin inline asm + shf.l.wrap.b32 %r21029, %r21035, %r21034, %r21036; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21033, %r21034, %r21035, %r21036; + // end inline asm + mov.u32 %r21044, 55; + // begin inline asm + shf.l.wrap.b32 %r21037, %r21043, %r21042, %r21044; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21041, %r21042, %r21043, %r21044; + // end inline asm + mov.u32 %r21052, 45; + // begin inline asm + shf.l.wrap.b32 %r21045, %r21051, %r21050, %r21052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21049, %r21050, %r21051, %r21052; + // end inline asm + mov.u32 %r21060, 36; + // begin inline asm + shf.l.wrap.b32 %r21053, %r21059, %r21058, %r21060; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21057, %r21058, %r21059, %r21060; + // end inline asm + mov.u32 %r21068, 28; + // begin inline asm + shf.l.wrap.b32 %r21061, %r21067, %r21066, %r21068; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21065, %r21066, %r21067, %r21068; + // end inline asm + mov.u32 %r21076, 21; + // begin inline asm + shf.l.wrap.b32 %r21069, %r21075, %r21074, %r21076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21073, %r21074, %r21075, %r21076; + // end inline asm + mov.u32 %r21084, 15; + // begin inline asm + shf.l.wrap.b32 %r21077, %r21083, %r21082, %r21084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21081, %r21082, %r21083, %r21084; + // end inline asm + mov.u32 %r21092, 10; + // begin inline asm + shf.l.wrap.b32 %r21085, %r21091, %r21090, %r21092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21089, %r21090, %r21091, %r21092; + // end inline asm + mov.u32 %r21100, 6; + // begin inline asm + shf.l.wrap.b32 %r21093, %r21099, %r21098, %r21100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21097, %r21098, %r21099, %r21100; + 
// end inline asm + mov.u32 %r21108, 3; + // begin inline asm + shf.l.wrap.b32 %r21101, %r21107, %r21106, %r21108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21105, %r21106, %r21107, %r21108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21109, %r21115, %r21114, %r29766; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21113, %r21114, %r21115, %r29766; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21117, %r21152, %r20925, %r20973, 0xD2; + lop3.b32 %r21118, %r21155, %r20929, %r20977, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30867, %r20925, %r20973, %r21069, 0xD2; + lop3.b32 %r30868, %r20929, %r20977, %r21073, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30863, %r20973, %r21069, %r21021, 0xD2; + lop3.b32 %r30864, %r20977, %r21073, %r21025, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30859, %r21069, %r21021, %r21152, 0xD2; + lop3.b32 %r30860, %r21073, %r21025, %r21155, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30857, %r21021, %r21152, %r20925, 0xD2; + lop3.b32 %r30858, %r21025, %r21155, %r20929, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30853, %r21061, %r20933, %r21101, 0xD2; + lop3.b32 %r30854, %r21065, %r20937, %r21105, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30865, %r20933, %r21101, %r21045, 0xD2; + lop3.b32 %r30866, %r20937, %r21105, %r21049, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30861, %r21101, %r21045, %r20941, 0xD2; + lop3.b32 %r30862, %r21105, %r21049, %r20945, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30833, %r21045, %r20941, %r21061, 0xD2; + lop3.b32 %r30834, %r21049, %r20945, %r21065, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r30833, %r30834}; + // begin inline asm + // chi + lop3.b32 %r30825, %r20941, %r21061, %r20933, 0xD2; + lop3.b32 %r30826, %r20945, %r21065, %r20937, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r30825, %r30826}; + // begin inline asm + // chi + lop3.b32 %r30851, %r21109, %r21093, %r20981, 0xD2; + lop3.b32 %r30852, %r21113, %r21097, %r20985, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+104], {%r30851, %r30852}; + // begin inline asm + // chi + lop3.b32 %r30845, %r21093, %r20981, %r20989, 0xD2; + lop3.b32 %r30846, %r21097, %r20985, %r20993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+112], {%r30845, %r30846}; + // begin inline asm + // chi + lop3.b32 %r30839, %r20981, %r20989, %r20957, 0xD2; + lop3.b32 %r30840, %r20985, %r20993, %r20961, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+120], {%r30839, %r30840}; + // begin inline asm + // chi + lop3.b32 %r30831, %r20989, %r20957, %r21109, 0xD2; + lop3.b32 %r30832, %r20993, %r20961, %r21113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+128], {%r30831, %r30832}; + // begin inline asm + // chi + lop3.b32 %r30823, %r20957, %r21109, %r21093, 0xD2; + lop3.b32 %r30824, %r20961, %r21113, %r21097, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+136], {%r30823, %r30824}; + // begin inline asm + // chi + lop3.b32 %r30849, %r21013, %r21053, %r21085, 0xD2; + lop3.b32 %r30850, %r21017, %r21057, %r21089, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+144], {%r30849, %r30850}; + // begin inline asm + // chi + lop3.b32 %r30843, %r21053, %r21085, %r21077, 0xD2; + lop3.b32 %r30844, %r21057, %r21089, %r21081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+152], {%r30843, %r30844}; 
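+ // Second sponge instance: same round structure as the first, with results written into the second local state buffer (%rd908).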
+ // begin inline asm + // chi + lop3.b32 %r30837, %r21085, %r21077, %r20997, 0xD2; + lop3.b32 %r30838, %r21089, %r21081, %r21001, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+160], {%r30837, %r30838}; + // begin inline asm + // chi + lop3.b32 %r30829, %r21077, %r20997, %r21013, 0xD2; + lop3.b32 %r30830, %r21081, %r21001, %r21017, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+168], {%r30829, %r30830}; + // begin inline asm + // chi + lop3.b32 %r30821, %r20997, %r21013, %r21053, 0xD2; + lop3.b32 %r30822, %r21001, %r21017, %r21057, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+176], {%r30821, %r30822}; + // begin inline asm + // chi + lop3.b32 %r30847, %r20965, %r21037, %r20949, 0xD2; + lop3.b32 %r30848, %r20969, %r21041, %r20953, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+184], {%r30847, %r30848}; + // begin inline asm + // chi + lop3.b32 %r30841, %r21037, %r20949, %r21005, 0xD2; + lop3.b32 %r30842, %r21041, %r20953, %r21009, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+192], {%r30841, %r30842}; + // begin inline asm + // chi + lop3.b32 %r30835, %r20949, %r21005, %r21029, 0xD2; + lop3.b32 %r30836, %r20953, %r21009, %r21033, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+200], {%r30835, %r30836}; + // begin inline asm + // chi + lop3.b32 %r30827, %r21005, %r21029, %r20965, 0xD2; + lop3.b32 %r30828, %r21009, %r21033, %r20969, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+208], {%r30827, %r30828}; + // begin inline asm + // chi + lop3.b32 %r30819, %r21029, %r20965, %r21037, 0xD2; + lop3.b32 %r30820, %r21033, %r20969, %r21041, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+216], {%r30819, %r30820}; + mul.wide.s32 %rd927, %r30869, 8; + add.s64 %rd924, %rd901, %rd927; + // begin inline asm + ld.global.nc.v2.u32 {%r21317,%r21318}, [%rd924]; + // end inline asm + xor.b32 %r30855, %r21117, %r21317; + xor.b32 %r30856, %r21118, %r21318; + add.s32 %r30869, %r30869, 1; + setp.lt.u32 %p39, %r30869, 23; + @%p39 bra $L__BB2_67; + + mov.u32 %r29764, 3; + mov.u32 %r29763, 21; + mov.u32 %r29762, 28; + mov.u32 %r29761, 45; + mov.u32 %r29760, 14; + mov.u32 %r29759, 43; + mov.u32 %r29758, 61; + mov.u32 %r29757, 20; + mov.u32 %r29756, 44; + mov.u32 %r30902, 0; + mov.u32 %r21428, 1; + st.local.v2.u32 [%rd908+32], {%r30867, %r30868}; + st.local.v2.u32 [%rd908+72], {%r30865, %r30866}; + st.local.v2.u32 [%rd908+40], {%r30863, %r30864}; + st.local.v2.u32 [%rd908+80], {%r30861, %r30862}; + st.local.v2.u32 [%rd908+48], {%r30859, %r30860}; + st.local.v2.u32 [%rd908+56], {%r30857, %r30858}; + st.local.v2.u32 [%rd908+24], {%r30855, %r30856}; + // begin inline asm + // xor5 + lop3.b32 %r21329, %r30855, %r30853, %r30851, 0x96; + lop3.b32 %r21329, %r21329, %r30849, %r30847, 0x96; + lop3.b32 %r21330, %r30856, %r30854, %r30852, 0x96; + lop3.b32 %r21330, %r21330, %r30850, %r30848, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21341, %r30867, %r30865, %r30845, 0x96; + lop3.b32 %r21341, %r21341, %r30843, %r30841, 0x96; + lop3.b32 %r21342, %r30868, %r30866, %r30846, 0x96; + lop3.b32 %r21342, %r21342, %r30844, %r30842, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21353, %r30863, %r30861, %r30839, 0x96; + lop3.b32 %r21353, %r21353, %r30837, %r30835, 0x96; + lop3.b32 %r21354, %r30864, %r30862, %r30840, 0x96; + lop3.b32 %r21354, %r21354, %r30838, %r30836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21365, %r30859, %r30833, %r30831, 0x96; + lop3.b32 %r21365, %r21365, %r30829, %r30827, 0x96; + lop3.b32 %r21366, 
%r30860, %r30834, %r30832, 0x96; + lop3.b32 %r21366, %r21366, %r30830, %r30828, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21377, %r30857, %r30825, %r30823, 0x96; + lop3.b32 %r21377, %r21377, %r30821, %r30819, 0x96; + lop3.b32 %r21378, %r30858, %r30826, %r30824, 0x96; + lop3.b32 %r21378, %r21378, %r30822, %r30820, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21389, %r21342, %r21341, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21393, %r21341, %r21342, %r21428; + // end inline asm + xor.b32 %r21568, %r21389, %r21377; + xor.b32 %r21569, %r21393, %r21378; + xor.b32 %r21536, %r30855, %r21568; + xor.b32 %r21539, %r30856, %r21569; + xor.b32 %r21499, %r30852, %r21569; + xor.b32 %r21498, %r30851, %r21568; + st.local.v2.u32 [%rd908+104], {%r21498, %r21499}; + // begin inline asm + shf.l.wrap.b32 %r21397, %r21354, %r21353, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21401, %r21353, %r21354, %r21428; + // end inline asm + xor.b32 %r21570, %r21397, %r21329; + xor.b32 %r21571, %r21401, %r21330; + xor.b32 %r21435, %r30865, %r21570; + xor.b32 %r21434, %r30866, %r21571; + xor.b32 %r21474, %r30844, %r21571; + xor.b32 %r21475, %r30843, %r21570; + st.local.v2.u32 [%rd908+152], {%r21475, %r21474}; + // begin inline asm + shf.l.wrap.b32 %r21405, %r21366, %r21365, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21409, %r21365, %r21366, %r21428; + // end inline asm + xor.b32 %r21572, %r21405, %r21341; + xor.b32 %r21573, %r21409, %r21342; + xor.b32 %r21458, %r30840, %r21573; + xor.b32 %r21459, %r30839, %r21572; + st.local.v2.u32 [%rd908+120], {%r21459, %r21458}; + xor.b32 %r21450, %r30836, %r21573; + xor.b32 %r21451, %r30835, %r21572; + st.local.v2.u32 [%rd908+200], {%r21451, %r21450}; + // begin inline asm + shf.l.wrap.b32 %r21413, %r21378, %r21377, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21417, %r21377, %r21378, %r21428; + // end inline asm + xor.b32 %r21574, %r21413, %r21353; + xor.b32 %r21575, %r21417, %r21354; + xor.b32 %r21482, %r30859, %r21574; + xor.b32 %r21483, %r30860, %r21575; + xor.b32 %r21491, %r30830, %r21575; + xor.b32 %r21490, %r30829, %r21574; + st.local.v2.u32 [%rd908+168], {%r21490, %r21491}; + // begin inline asm + shf.l.wrap.b32 %r21421, %r21330, %r21329, %r21428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21425, %r21329, %r21330, %r21428; + // end inline asm + xor.b32 %r21576, %r21421, %r21365; + xor.b32 %r21577, %r21425, %r21366; + xor.b32 %r21442, %r30825, %r21576; + xor.b32 %r21443, %r30826, %r21577; + xor.b32 %r21467, %r30820, %r21577; + xor.b32 %r21466, %r30819, %r21576; + st.local.v2.u32 [%rd908+216], {%r21466, %r21467}; + // begin inline asm + shf.l.wrap.b32 %r21429, %r21435, %r21434, %r29756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21433, %r21434, %r21435, %r29756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21437, %r21443, %r21442, %r29757; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21441, %r21442, %r21443, %r29757; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21449, %r21450, %r21451, %r29758; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21445, %r21451, %r21450, %r29758; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r21445, %r21449}; + // begin inline asm + shf.l.wrap.b32 %r21453, %r21459, %r21458, %r29759; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21457, %r21458, %r21459, %r29759; + // end inline asm + 
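// rho rotations for this unrolled final round use the offsets preloaded above (44, 20, 61, 43, 14, 45, 28, 21, 3). +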
// begin inline asm + shf.l.wrap.b32 %r21461, %r21467, %r21466, %r29760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21465, %r21466, %r21467, %r29760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21473, %r21474, %r21475, %r29761; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21469, %r21475, %r21474, %r29761; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r21469, %r21473}; + // begin inline asm + shf.l.wrap.b32 %r21477, %r21483, %r21482, %r29762; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21481, %r21482, %r21483, %r29762; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21485, %r21491, %r21490, %r29763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21489, %r21490, %r21491, %r29763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21493, %r21499, %r21498, %r29764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21497, %r21498, %r21499, %r29764; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21501, %r21536, %r21429, %r21453, 0xD2; + lop3.b32 %r21502, %r21539, %r21433, %r21457, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31002, %r21429, %r21453, %r21485, 0xD2; + lop3.b32 %r31003, %r21433, %r21457, %r21489, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+32], {%r31002, %r31003}; + // begin inline asm + // chi + lop3.b32 %r30998, %r21453, %r21485, %r21461, 0xD2; + lop3.b32 %r30999, %r21457, %r21489, %r21465, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+40], {%r30998, %r30999}; + // begin inline asm + // chi + lop3.b32 %r30994, %r21485, %r21461, %r21536, 0xD2; + lop3.b32 %r30995, %r21489, %r21465, %r21539, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+48], {%r30994, %r30995}; + // begin inline asm + // chi + lop3.b32 %r30992, %r21461, %r21536, %r21429, 0xD2; + lop3.b32 %r30993, %r21465, %r21539, %r21433, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+56], {%r30992, %r30993}; + // begin inline asm + // chi + lop3.b32 %r30988, %r21477, %r21437, %r21493, 0xD2; + lop3.b32 %r30989, %r21481, %r21441, %r21497, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+64], {%r30988, %r30989}; + // begin inline asm + // chi + lop3.b32 %r31000, %r21437, %r21493, %r21469, 0xD2; + lop3.b32 %r31001, %r21441, %r21497, %r21473, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+72], {%r31000, %r31001}; + // begin inline asm + // chi + lop3.b32 %r30996, %r21493, %r21469, %r21445, 0xD2; + lop3.b32 %r30997, %r21497, %r21473, %r21449, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+80], {%r30996, %r30997}; + // begin inline asm + ld.global.nc.v2.u32 {%r21565,%r21566}, [%rd902]; + // end inline asm + xor.b32 %r30990, %r21501, %r21565; + xor.b32 %r30991, %r21502, %r21566; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + add.s64 %rd242, %rd908, 24; + add.s64 %rd243, %rd2, 24; + +$L__BB2_69: + add.s32 %r29765, %r3343, 1; + cvta.to.global.u64 %rd1258, %rd361; + shl.b32 %r21578, %r30902, 2; + cvt.u64.u32 %rd935, %r21578; + and.b64 %rd936, %rd935, 60; + add.s64 %rd937, %rd243, %rd936; + xor.b32 %r21579, %r3343, %r30902; + mul.lo.s32 %r21580, %r21579, 16777619; + ld.local.u32 %r21581, [%rd937]; + xor.b32 %r21582, %r21580, %r21581; + mul.wide.u32 %rd938, %r21582, -954391867; + shr.u64 %rd939, %rd938, 32; + cvt.u32.u64 %r21583, %rd939; + sub.s32 %r21584, %r21582, %r21583; + shr.u32 %r21585, %r21584, 1; + add.s32 %r21586, %r21585, %r21583; + shr.u32 %r21587, %r21586, 20; + mul.lo.s32 %r21588, %r21587, 1179641; + sub.s32 %r21589, %r21582, 
%r21588; + mul.wide.u32 %rd940, %r21589, 64; + add.s64 %rd941, %rd1258, %rd940; + mul.lo.s32 %r21590, %r30939, 16777619; + ld.global.u32 %r21591, [%rd941]; + xor.b32 %r30939, %r21590, %r21591; + mul.lo.s32 %r21592, %r30940, 16777619; + ld.global.u32 %r21593, [%rd941+4]; + xor.b32 %r30940, %r21592, %r21593; + mul.lo.s32 %r21594, %r30951, 16777619; + ld.global.u32 %r21595, [%rd941+8]; + mul.lo.s32 %r21596, %r30952, 16777619; + ld.global.u32 %r21597, [%rd941+12]; + xor.b32 %r21598, %r21596, %r21597; + xor.b32 %r30951, %r21594, %r21595; + mov.b64 %rd942, {%r30951, %r21598}; + mul.lo.s32 %r21599, %r30947, 16777619; + ld.global.u32 %r21600, [%rd941+16]; + mul.lo.s32 %r21601, %r30948, 16777619; + ld.global.u32 %r21602, [%rd941+20]; + xor.b32 %r21603, %r21601, %r21602; + xor.b32 %r30947, %r21599, %r21600; + mov.b64 %rd943, {%r30947, %r21603}; + mul.lo.s32 %r21604, %r30943, 16777619; + ld.global.u32 %r21605, [%rd941+24]; + mul.lo.s32 %r21606, %r30944, 16777619; + ld.global.u32 %r21607, [%rd941+28]; + xor.b32 %r21608, %r21606, %r21607; + xor.b32 %r30943, %r21604, %r21605; + mov.b64 %rd944, {%r30943, %r21608}; + mul.lo.s32 %r21609, %r30941, 16777619; + ld.global.u32 %r21610, [%rd941+32]; + mul.lo.s32 %r21611, %r30942, 16777619; + ld.global.u32 %r21612, [%rd941+36]; + xor.b32 %r21613, %r21611, %r21612; + xor.b32 %r30941, %r21609, %r21610; + mov.b64 %rd945, {%r30941, %r21613}; + mul.lo.s32 %r21614, %r30937, 16777619; + ld.global.u32 %r21615, [%rd941+40]; + xor.b32 %r30937, %r21614, %r21615; + mul.lo.s32 %r21616, %r30938, 16777619; + ld.global.u32 %r21617, [%rd941+44]; + xor.b32 %r30938, %r21616, %r21617; + mul.lo.s32 %r21618, %r30949, 16777619; + ld.global.u32 %r21619, [%rd941+48]; + mul.lo.s32 %r21620, %r30950, 16777619; + ld.global.u32 %r21621, [%rd941+52]; + xor.b32 %r21622, %r21620, %r21621; + xor.b32 %r30949, %r21618, %r21619; + mov.b64 %rd946, {%r30949, %r21622}; + mul.lo.s32 %r21623, %r30945, 16777619; + ld.global.u32 %r21624, [%rd941+56]; + mul.lo.s32 %r21625, %r30946, 16777619; + ld.global.u32 %r21626, [%rd941+60]; + xor.b32 %r21627, %r21625, %r21626; + xor.b32 %r30945, %r21623, %r21624; + mov.b64 %rd947, {%r30945, %r21627}; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + st.local.v2.u32 [%rd2+32], {%r30951, %r21598}; + st.local.v2.u32 [%rd2+40], {%r30947, %r21603}; + st.local.v2.u32 [%rd2+48], {%r30943, %r21608}; + st.local.v2.u32 [%rd2+56], {%r30941, %r21613}; + st.local.v2.u32 [%rd2+64], {%r30937, %r30938}; + st.local.v2.u32 [%rd2+72], {%r30949, %r21622}; + st.local.v2.u32 [%rd2+80], {%r30945, %r21627}; + add.s64 %rd948, %rd242, %rd936; + xor.b32 %r21628, %r29765, %r30902; + mul.lo.s32 %r21629, %r21628, 16777619; + ld.local.u32 %r21630, [%rd948]; + xor.b32 %r21631, %r21629, %r21630; + mul.wide.u32 %rd949, %r21631, -954391867; + shr.u64 %rd950, %rd949, 32; + cvt.u32.u64 %r21632, %rd950; + sub.s32 %r21633, %r21631, %r21632; + shr.u32 %r21634, %r21633, 1; + add.s32 %r21635, %r21634, %r21632; + shr.u32 %r21636, %r21635, 20; + mul.lo.s32 %r21637, %r21636, 1179641; + sub.s32 %r21638, %r21631, %r21637; + mul.wide.u32 %rd951, %r21638, 64; + add.s64 %rd952, %rd1258, %rd951; + mul.lo.s32 %r21639, %r30990, 16777619; + ld.global.u32 %r21640, [%rd952]; + xor.b32 %r30990, %r21639, %r21640; + mul.lo.s32 %r21641, %r30991, 16777619; + ld.global.u32 %r21642, [%rd952+4]; + xor.b32 %r30991, %r21641, %r21642; + mul.lo.s32 %r21643, %r31002, 16777619; + ld.global.u32 %r21644, [%rd952+8]; + mul.lo.s32 %r21645, %r31003, 16777619; + ld.global.u32 %r21646, [%rd952+12]; + xor.b32 %r21647, %r21645, %r21646; + 
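// Dataset lookup: the hash is reduced mod 1179641 via the multiply-by-reciprocal sequence above, scaled by the 64-byte row size, and each state word is FNV-mixed (prime 16777619) with the indexed row. +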
xor.b32 %r31002, %r21643, %r21644; + mov.b64 %rd953, {%r31002, %r21647}; + mul.lo.s32 %r21648, %r30998, 16777619; + ld.global.u32 %r21649, [%rd952+16]; + mul.lo.s32 %r21650, %r30999, 16777619; + ld.global.u32 %r21651, [%rd952+20]; + xor.b32 %r21652, %r21650, %r21651; + xor.b32 %r30998, %r21648, %r21649; + mov.b64 %rd954, {%r30998, %r21652}; + mul.lo.s32 %r21653, %r30994, 16777619; + ld.global.u32 %r21654, [%rd952+24]; + mul.lo.s32 %r21655, %r30995, 16777619; + ld.global.u32 %r21656, [%rd952+28]; + xor.b32 %r21657, %r21655, %r21656; + xor.b32 %r30994, %r21653, %r21654; + mov.b64 %rd955, {%r30994, %r21657}; + mul.lo.s32 %r21658, %r30992, 16777619; + ld.global.u32 %r21659, [%rd952+32]; + mul.lo.s32 %r21660, %r30993, 16777619; + ld.global.u32 %r21661, [%rd952+36]; + xor.b32 %r21662, %r21660, %r21661; + xor.b32 %r30992, %r21658, %r21659; + mov.b64 %rd956, {%r30992, %r21662}; + mul.lo.s32 %r21663, %r30988, 16777619; + ld.global.u32 %r21664, [%rd952+40]; + xor.b32 %r30988, %r21663, %r21664; + mul.lo.s32 %r21665, %r30989, 16777619; + ld.global.u32 %r21666, [%rd952+44]; + xor.b32 %r30989, %r21665, %r21666; + mul.lo.s32 %r21667, %r31000, 16777619; + ld.global.u32 %r21668, [%rd952+48]; + mul.lo.s32 %r21669, %r31001, 16777619; + ld.global.u32 %r21670, [%rd952+52]; + xor.b32 %r21671, %r21669, %r21670; + xor.b32 %r31000, %r21667, %r21668; + mov.b64 %rd957, {%r31000, %r21671}; + mul.lo.s32 %r21672, %r30996, 16777619; + ld.global.u32 %r21673, [%rd952+56]; + mul.lo.s32 %r21674, %r30997, 16777619; + ld.global.u32 %r21675, [%rd952+60]; + xor.b32 %r21676, %r21674, %r21675; + xor.b32 %r30996, %r21672, %r21673; + mov.b64 %rd958, {%r30996, %r21676}; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + st.local.v2.u32 [%rd908+32], {%r31002, %r21647}; + st.local.v2.u32 [%rd908+40], {%r30998, %r21652}; + st.local.v2.u32 [%rd908+48], {%r30994, %r21657}; + st.local.v2.u32 [%rd908+56], {%r30992, %r21662}; + st.local.v2.u32 [%rd908+64], {%r30988, %r30989}; + st.local.v2.u32 [%rd908+72], {%r31000, %r21671}; + st.local.v2.u32 [%rd908+80], {%r30996, %r21676}; + add.s32 %r30902, %r30902, 1; + setp.lt.u32 %p40, %r30902, 512; + shr.u64 %rd959, %rd942, 32; + cvt.u32.u64 %r30952, %rd959; + shr.u64 %rd960, %rd943, 32; + cvt.u32.u64 %r30948, %rd960; + shr.u64 %rd961, %rd944, 32; + cvt.u32.u64 %r30944, %rd961; + shr.u64 %rd962, %rd945, 32; + cvt.u32.u64 %r30942, %rd962; + shr.u64 %rd963, %rd946, 32; + cvt.u32.u64 %r30950, %rd963; + shr.u64 %rd964, %rd947, 32; + cvt.u32.u64 %r30946, %rd964; + shr.u64 %rd965, %rd953, 32; + cvt.u32.u64 %r31003, %rd965; + shr.u64 %rd966, %rd954, 32; + cvt.u32.u64 %r30999, %rd966; + shr.u64 %rd967, %rd955, 32; + cvt.u32.u64 %r30995, %rd967; + shr.u64 %rd968, %rd956, 32; + cvt.u32.u64 %r30993, %rd968; + shr.u64 %rd969, %rd957, 32; + cvt.u32.u64 %r31001, %rd969; + shr.u64 %rd970, %rd958, 32; + cvt.u32.u64 %r30997, %rd970; + @%p40 bra $L__BB2_69; + + mov.u32 %r30903, 0; + st.local.v2.u32 [%rd2+96], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+104], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+112], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+120], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+128], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+136], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+144], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+152], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+160], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+168], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+176], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+184], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+192], {%r30903, 
%r30903}; + st.local.v2.u32 [%rd2+200], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+208], {%r30903, %r30903}; + st.local.v2.u32 [%rd2+216], {%r30903, %r30903}; + mov.u32 %r30918, -2147483648; + mov.u32 %r30917, 1; + st.local.v2.u32 [%rd2+88], {%r30917, %r30918}; + mov.u32 %r30904, %r30903; + mov.u32 %r30905, %r30903; + mov.u32 %r30906, %r30903; + mov.u32 %r30907, %r30903; + mov.u32 %r30908, %r30903; + mov.u32 %r30909, %r30903; + mov.u32 %r30910, %r30903; + mov.u32 %r30911, %r30903; + mov.u32 %r30912, %r30903; + mov.u32 %r30913, %r30903; + mov.u32 %r30914, %r30903; + mov.u32 %r30915, %r30903; + mov.u32 %r30916, %r30903; + mov.u32 %r30919, %r30903; + mov.u32 %r30920, %r30903; + mov.u32 %r30921, %r30903; + mov.u32 %r30922, %r30903; + mov.u32 %r30923, %r30903; + mov.u32 %r30924, %r30903; + mov.u32 %r30925, %r30903; + mov.u32 %r30926, %r30903; + mov.u32 %r30927, %r30903; + mov.u32 %r30928, %r30903; + mov.u32 %r30929, %r30903; + mov.u32 %r30930, %r30903; + mov.u32 %r30931, %r30903; + mov.u32 %r30932, %r30903; + mov.u32 %r30933, %r30903; + mov.u32 %r30934, %r30903; + mov.u32 %r30935, %r30903; + mov.u32 %r30936, %r30903; + mov.u32 %r30953, %r30903; + +$L__BB2_71: + mov.u32 %r29776, 1; + mov.u64 %rd1281, keccak_round_constants; + cvta.const.u64 %rd1280, %rd1281; + // begin inline asm + // xor5 + lop3.b32 %r21718, %r30939, %r30937, %r30935, 0x96; + lop3.b32 %r21718, %r21718, %r30933, %r30931, 0x96; + lop3.b32 %r21719, %r30940, %r30938, %r30936, 0x96; + lop3.b32 %r21719, %r21719, %r30934, %r30932, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21730, %r30951, %r30949, %r30929, 0x96; + lop3.b32 %r21730, %r21730, %r30927, %r30925, 0x96; + lop3.b32 %r21731, %r30952, %r30950, %r30930, 0x96; + lop3.b32 %r21731, %r21731, %r30928, %r30926, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21742, %r30947, %r30945, %r30923, 0x96; + lop3.b32 %r21742, %r21742, %r30921, %r30919, 0x96; + lop3.b32 %r21743, %r30948, %r30946, %r30924, 0x96; + lop3.b32 %r21743, %r21743, %r30922, %r30920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21754, %r30943, %r30917, %r30915, 0x96; + lop3.b32 %r21754, %r21754, %r30913, %r30911, 0x96; + lop3.b32 %r21755, %r30944, %r30918, %r30916, 0x96; + lop3.b32 %r21755, %r21755, %r30914, %r30912, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21766, %r30941, %r30909, %r30907, 0x96; + lop3.b32 %r21766, %r21766, %r30905, %r30903, 0x96; + lop3.b32 %r21767, %r30942, %r30910, %r30908, 0x96; + lop3.b32 %r21767, %r21767, %r30906, %r30904, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21778, %r21731, %r21730, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21782, %r21730, %r21731, %r29776; + // end inline asm + xor.b32 %r22212, %r21778, %r21766; + xor.b32 %r22213, %r21782, %r21767; + xor.b32 %r22045, %r30939, %r22212; + xor.b32 %r22048, %r30940, %r22213; + xor.b32 %r21952, %r30937, %r22212; + xor.b32 %r21951, %r30938, %r22213; + xor.b32 %r21999, %r30935, %r22212; + xor.b32 %r22000, %r30936, %r22213; + xor.b32 %r21904, %r30933, %r22212; + xor.b32 %r21903, %r30934, %r22213; + xor.b32 %r21855, %r30931, %r22212; + xor.b32 %r21856, %r30932, %r22213; + // begin inline asm + shf.l.wrap.b32 %r21786, %r21743, %r21742, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21790, %r21742, %r21743, %r29776; + // end inline asm + xor.b32 %r22214, %r21786, %r21718; + xor.b32 %r22215, %r21790, %r21719; + xor.b32 %r22007, %r30951, %r22214; + xor.b32 %r22008, 
%r30952, %r22215; + xor.b32 %r21824, %r30949, %r22214; + xor.b32 %r21823, %r30950, %r22215; + xor.b32 %r21983, %r30929, %r22214; + xor.b32 %r21984, %r30930, %r22215; + xor.b32 %r21944, %r30927, %r22214; + xor.b32 %r21943, %r30928, %r22215; + xor.b32 %r21927, %r30925, %r22214; + xor.b32 %r21928, %r30926, %r22215; + // begin inline asm + shf.l.wrap.b32 %r21794, %r21755, %r21754, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21798, %r21754, %r21755, %r29776; + // end inline asm + xor.b32 %r22216, %r21794, %r21730; + xor.b32 %r22217, %r21798, %r21731; + xor.b32 %r21864, %r30947, %r22216; + xor.b32 %r21863, %r30948, %r22217; + xor.b32 %r21991, %r30945, %r22216; + xor.b32 %r21992, %r30946, %r22217; + xor.b32 %r21872, %r30923, %r22216; + xor.b32 %r21871, %r30924, %r22217; + xor.b32 %r21975, %r30921, %r22216; + xor.b32 %r21976, %r30922, %r22217; + xor.b32 %r21840, %r30919, %r22216; + xor.b32 %r21839, %r30920, %r22217; + // begin inline asm + shf.l.wrap.b32 %r21802, %r21767, %r21766, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21806, %r21766, %r21767, %r29776; + // end inline asm + xor.b32 %r22218, %r21802, %r21742; + xor.b32 %r22219, %r21806, %r21743; + xor.b32 %r21959, %r30943, %r22218; + xor.b32 %r21960, %r30944, %r22219; + xor.b32 %r21936, %r30917, %r22218; + xor.b32 %r21935, %r30918, %r22219; + xor.b32 %r21879, %r30915, %r22218; + xor.b32 %r21880, %r30916, %r22219; + xor.b32 %r21967, %r30913, %r22218; + xor.b32 %r21968, %r30914, %r22219; + xor.b32 %r21896, %r30911, %r22218; + xor.b32 %r21895, %r30912, %r22219; + // begin inline asm + shf.l.wrap.b32 %r21810, %r21719, %r21718, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21814, %r21718, %r21719, %r29776; + // end inline asm + xor.b32 %r22220, %r21810, %r21754; + xor.b32 %r22221, %r21814, %r21755; + xor.b32 %r21911, %r30941, %r22220; + xor.b32 %r21912, %r30942, %r22221; + xor.b32 %r21831, %r30909, %r22220; + xor.b32 %r21832, %r30910, %r22221; + xor.b32 %r21848, %r30907, %r22220; + xor.b32 %r21847, %r30908, %r22221; + xor.b32 %r21887, %r30905, %r22220; + xor.b32 %r21888, %r30906, %r22221; + xor.b32 %r21919, %r30903, %r22220; + xor.b32 %r21920, %r30904, %r22221; + mov.u32 %r21825, 44; + // begin inline asm + shf.l.wrap.b32 %r21818, %r21824, %r21823, %r21825; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21822, %r21823, %r21824, %r21825; + // end inline asm + mov.u32 %r21833, 20; + // begin inline asm + shf.l.wrap.b32 %r21826, %r21832, %r21831, %r21833; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21830, %r21831, %r21832, %r21833; + // end inline asm + mov.u32 %r21841, 61; + // begin inline asm + shf.l.wrap.b32 %r21834, %r21840, %r21839, %r21841; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21838, %r21839, %r21840, %r21841; + // end inline asm + mov.u32 %r21849, 39; + // begin inline asm + shf.l.wrap.b32 %r21842, %r21848, %r21847, %r21849; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21846, %r21847, %r21848, %r21849; + // end inline asm + mov.u32 %r21857, 18; + // begin inline asm + shf.l.wrap.b32 %r21850, %r21856, %r21855, %r21857; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21854, %r21855, %r21856, %r21857; + // end inline asm + mov.u32 %r21865, 62; + // begin inline asm + shf.l.wrap.b32 %r21858, %r21864, %r21863, %r21865; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21862, %r21863, %r21864, %r21865; + // end inline asm + mov.u32 %r21873, 43; + // begin inline asm + 
shf.l.wrap.b32 %r21866, %r21872, %r21871, %r21873; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21870, %r21871, %r21872, %r21873; + // end inline asm + mov.u32 %r21881, 25; + // begin inline asm + shf.l.wrap.b32 %r21874, %r21880, %r21879, %r21881; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21878, %r21879, %r21880, %r21881; + // end inline asm + mov.u32 %r21889, 8; + // begin inline asm + shf.l.wrap.b32 %r21882, %r21888, %r21887, %r21889; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21886, %r21887, %r21888, %r21889; + // end inline asm + mov.u32 %r21897, 56; + // begin inline asm + shf.l.wrap.b32 %r21890, %r21896, %r21895, %r21897; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21894, %r21895, %r21896, %r21897; + // end inline asm + mov.u32 %r21905, 41; + // begin inline asm + shf.l.wrap.b32 %r21898, %r21904, %r21903, %r21905; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21902, %r21903, %r21904, %r21905; + // end inline asm + mov.u32 %r21913, 27; + // begin inline asm + shf.l.wrap.b32 %r21906, %r21912, %r21911, %r21913; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21910, %r21911, %r21912, %r21913; + // end inline asm + mov.u32 %r21921, 14; + // begin inline asm + shf.l.wrap.b32 %r21914, %r21920, %r21919, %r21921; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21918, %r21919, %r21920, %r21921; + // end inline asm + mov.u32 %r21929, 2; + // begin inline asm + shf.l.wrap.b32 %r21922, %r21928, %r21927, %r21929; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21926, %r21927, %r21928, %r21929; + // end inline asm + mov.u32 %r21937, 55; + // begin inline asm + shf.l.wrap.b32 %r21930, %r21936, %r21935, %r21937; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21934, %r21935, %r21936, %r21937; + // end inline asm + mov.u32 %r21945, 45; + // begin inline asm + shf.l.wrap.b32 %r21938, %r21944, %r21943, %r21945; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21942, %r21943, %r21944, %r21945; + // end inline asm + mov.u32 %r21953, 36; + // begin inline asm + shf.l.wrap.b32 %r21946, %r21952, %r21951, %r21953; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21950, %r21951, %r21952, %r21953; + // end inline asm + mov.u32 %r21961, 28; + // begin inline asm + shf.l.wrap.b32 %r21954, %r21960, %r21959, %r21961; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21958, %r21959, %r21960, %r21961; + // end inline asm + mov.u32 %r21969, 21; + // begin inline asm + shf.l.wrap.b32 %r21962, %r21968, %r21967, %r21969; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21966, %r21967, %r21968, %r21969; + // end inline asm + mov.u32 %r21977, 15; + // begin inline asm + shf.l.wrap.b32 %r21970, %r21976, %r21975, %r21977; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21974, %r21975, %r21976, %r21977; + // end inline asm + mov.u32 %r21985, 10; + // begin inline asm + shf.l.wrap.b32 %r21978, %r21984, %r21983, %r21985; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21982, %r21983, %r21984, %r21985; + // end inline asm + mov.u32 %r21993, 6; + // begin inline asm + shf.l.wrap.b32 %r21986, %r21992, %r21991, %r21993; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21990, %r21991, %r21992, %r21993; + // end inline asm + mov.u32 %r22001, 3; + // begin inline asm + shf.l.wrap.b32 %r21994, %r22000, %r21999, %r22001; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21998, %r21999, %r22000, %r22001; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r22002, %r22008, %r22007, %r29776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22006, %r22007, %r22008, %r29776; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22010, %r22045, %r21818, %r21866, 0xD2; + lop3.b32 %r22011, %r22048, %r21822, %r21870, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30951, %r21818, %r21866, %r21962, 0xD2; + lop3.b32 %r30952, %r21822, %r21870, %r21966, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30947, %r21866, %r21962, %r21914, 0xD2; + lop3.b32 %r30948, %r21870, %r21966, %r21918, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30943, %r21962, %r21914, %r22045, 0xD2; + lop3.b32 %r30944, %r21966, %r21918, %r22048, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30941, %r21914, %r22045, %r21818, 0xD2; + lop3.b32 %r30942, %r21918, %r22048, %r21822, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30937, %r21954, %r21826, %r21994, 0xD2; + lop3.b32 %r30938, %r21958, %r21830, %r21998, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30949, %r21826, %r21994, %r21938, 0xD2; + lop3.b32 %r30950, %r21830, %r21998, %r21942, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30945, %r21994, %r21938, %r21834, 0xD2; + lop3.b32 %r30946, %r21998, %r21942, %r21838, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30917, %r21938, %r21834, %r21954, 0xD2; + lop3.b32 %r30918, %r21942, %r21838, %r21958, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r30917, %r30918}; + // begin inline asm + // chi + lop3.b32 %r30909, %r21834, %r21954, %r21826, 0xD2; + lop3.b32 %r30910, %r21838, %r21958, %r21830, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r30909, %r30910}; + // begin inline asm + // chi + lop3.b32 %r30935, %r22002, %r21986, %r21874, 0xD2; + lop3.b32 %r30936, %r22006, %r21990, %r21878, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+104], {%r30935, %r30936}; + // begin inline asm + // chi + lop3.b32 %r30929, %r21986, %r21874, %r21882, 0xD2; + lop3.b32 %r30930, %r21990, %r21878, %r21886, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+112], {%r30929, %r30930}; + // begin inline asm + // chi + lop3.b32 %r30923, %r21874, %r21882, %r21850, 0xD2; + lop3.b32 %r30924, %r21878, %r21886, %r21854, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+120], {%r30923, %r30924}; + // begin inline asm + // chi + lop3.b32 %r30915, %r21882, %r21850, %r22002, 0xD2; + lop3.b32 %r30916, %r21886, %r21854, %r22006, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+128], {%r30915, %r30916}; + // begin inline asm + // chi + lop3.b32 %r30907, %r21850, %r22002, %r21986, 0xD2; + lop3.b32 %r30908, %r21854, %r22006, %r21990, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+136], {%r30907, %r30908}; + // begin inline asm + // chi + lop3.b32 %r30933, %r21906, %r21946, %r21978, 0xD2; + lop3.b32 %r30934, %r21910, %r21950, %r21982, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+144], {%r30933, %r30934}; + // begin inline asm + // chi + lop3.b32 %r30927, %r21946, %r21978, %r21970, 0xD2; + lop3.b32 %r30928, %r21950, %r21982, %r21974, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+152], {%r30927, %r30928}; + // begin inline asm + // chi + lop3.b32 %r30921, %r21978, %r21970, %r21890, 0xD2; + lop3.b32 %r30922, %r21982, %r21974, %r21894, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+160], {%r30921, %r30922}; + // begin inline asm + 
// chi + lop3.b32 %r30913, %r21970, %r21890, %r21906, 0xD2; + lop3.b32 %r30914, %r21974, %r21894, %r21910, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+168], {%r30913, %r30914}; + // begin inline asm + // chi + lop3.b32 %r30905, %r21890, %r21906, %r21946, 0xD2; + lop3.b32 %r30906, %r21894, %r21910, %r21950, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+176], {%r30905, %r30906}; + // begin inline asm + // chi + lop3.b32 %r30931, %r21858, %r21930, %r21842, 0xD2; + lop3.b32 %r30932, %r21862, %r21934, %r21846, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+184], {%r30931, %r30932}; + // begin inline asm + // chi + lop3.b32 %r30925, %r21930, %r21842, %r21898, 0xD2; + lop3.b32 %r30926, %r21934, %r21846, %r21902, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+192], {%r30925, %r30926}; + // begin inline asm + // chi + lop3.b32 %r30919, %r21842, %r21898, %r21922, 0xD2; + lop3.b32 %r30920, %r21846, %r21902, %r21926, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+200], {%r30919, %r30920}; + // begin inline asm + // chi + lop3.b32 %r30911, %r21898, %r21922, %r21858, 0xD2; + lop3.b32 %r30912, %r21902, %r21926, %r21862, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+208], {%r30911, %r30912}; + // begin inline asm + // chi + lop3.b32 %r30903, %r21922, %r21858, %r21930, 0xD2; + lop3.b32 %r30904, %r21926, %r21862, %r21934, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+216], {%r30903, %r30904}; + mul.wide.s32 %rd974, %r30953, 8; + add.s64 %rd973, %rd1280, %rd974; + // begin inline asm + ld.global.nc.v2.u32 {%r22210,%r22211}, [%rd973]; + // end inline asm + xor.b32 %r30939, %r22010, %r22210; + xor.b32 %r30940, %r22011, %r22211; + add.s32 %r30953, %r30953, 1; + setp.lt.u32 %p41, %r30953, 23; + @%p41 bra $L__BB2_71; + + mov.u32 %r29775, 3; + mov.u32 %r29774, 21; + mov.u32 %r29773, 28; + mov.u32 %r29772, 45; + mov.u32 %r29771, 14; + mov.u32 %r29770, 43; + mov.u32 %r29769, 61; + mov.u32 %r29768, 20; + mov.u32 %r29767, 44; + mov.u64 %rd1274, keccak_round_constants; + cvta.const.u64 %rd1273, %rd1274; + add.s64 %rd1272, %rd1273, 184; + st.local.v2.u32 [%rd2+32], {%r30951, %r30952}; + st.local.v2.u32 [%rd2+72], {%r30949, %r30950}; + st.local.v2.u32 [%rd2+40], {%r30947, %r30948}; + st.local.v2.u32 [%rd2+80], {%r30945, %r30946}; + st.local.v2.u32 [%rd2+48], {%r30943, %r30944}; + st.local.v2.u32 [%rd2+56], {%r30941, %r30942}; + st.local.v2.u32 [%rd2+24], {%r30939, %r30940}; + // begin inline asm + // xor5 + lop3.b32 %r22222, %r30939, %r30937, %r30935, 0x96; + lop3.b32 %r22222, %r22222, %r30933, %r30931, 0x96; + lop3.b32 %r22223, %r30940, %r30938, %r30936, 0x96; + lop3.b32 %r22223, %r22223, %r30934, %r30932, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22234, %r30951, %r30949, %r30929, 0x96; + lop3.b32 %r22234, %r22234, %r30927, %r30925, 0x96; + lop3.b32 %r22235, %r30952, %r30950, %r30930, 0x96; + lop3.b32 %r22235, %r22235, %r30928, %r30926, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22246, %r30947, %r30945, %r30923, 0x96; + lop3.b32 %r22246, %r22246, %r30921, %r30919, 0x96; + lop3.b32 %r22247, %r30948, %r30946, %r30924, 0x96; + lop3.b32 %r22247, %r22247, %r30922, %r30920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22258, %r30943, %r30917, %r30915, 0x96; + lop3.b32 %r22258, %r22258, %r30913, %r30911, 0x96; + lop3.b32 %r22259, %r30944, %r30918, %r30916, 0x96; + lop3.b32 %r22259, %r22259, %r30914, %r30912, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22270, %r30941, %r30909, %r30907, 0x96; + 
lop3.b32 %r22270, %r22270, %r30905, %r30903, 0x96; + lop3.b32 %r22271, %r30942, %r30910, %r30908, 0x96; + lop3.b32 %r22271, %r22271, %r30906, %r30904, 0x96; + // end inline asm + mov.u32 %r30968, 1; + // begin inline asm + shf.l.wrap.b32 %r22282, %r22235, %r22234, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22286, %r22234, %r22235, %r30968; + // end inline asm + xor.b32 %r22501, %r22282, %r22270; + xor.b32 %r22502, %r22286, %r22271; + xor.b32 %r22429, %r30939, %r22501; + xor.b32 %r22432, %r30940, %r22502; + xor.b32 %r22392, %r30936, %r22502; + xor.b32 %r22391, %r30935, %r22501; + st.local.v2.u32 [%rd2+104], {%r22391, %r22392}; + // begin inline asm + shf.l.wrap.b32 %r22290, %r22247, %r22246, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22294, %r22246, %r22247, %r30968; + // end inline asm + xor.b32 %r22503, %r22290, %r22222; + xor.b32 %r22504, %r22294, %r22223; + xor.b32 %r22328, %r30949, %r22503; + xor.b32 %r22327, %r30950, %r22504; + xor.b32 %r22367, %r30928, %r22504; + xor.b32 %r22368, %r30927, %r22503; + st.local.v2.u32 [%rd2+152], {%r22368, %r22367}; + // begin inline asm + shf.l.wrap.b32 %r22298, %r22259, %r22258, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22302, %r22258, %r22259, %r30968; + // end inline asm + xor.b32 %r22505, %r22298, %r22234; + xor.b32 %r22506, %r22302, %r22235; + xor.b32 %r22351, %r30924, %r22506; + xor.b32 %r22352, %r30923, %r22505; + st.local.v2.u32 [%rd2+120], {%r22352, %r22351}; + xor.b32 %r22343, %r30920, %r22506; + xor.b32 %r22344, %r30919, %r22505; + st.local.v2.u32 [%rd2+200], {%r22344, %r22343}; + // begin inline asm + shf.l.wrap.b32 %r22306, %r22271, %r22270, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22310, %r22270, %r22271, %r30968; + // end inline asm + xor.b32 %r22507, %r22306, %r22246; + xor.b32 %r22508, %r22310, %r22247; + xor.b32 %r22375, %r30943, %r22507; + xor.b32 %r22376, %r30944, %r22508; + xor.b32 %r22384, %r30914, %r22508; + xor.b32 %r22383, %r30913, %r22507; + st.local.v2.u32 [%rd2+168], {%r22383, %r22384}; + // begin inline asm + shf.l.wrap.b32 %r22314, %r22223, %r22222, %r30968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22318, %r22222, %r22223, %r30968; + // end inline asm + xor.b32 %r22509, %r22314, %r22258; + xor.b32 %r22510, %r22318, %r22259; + xor.b32 %r22335, %r30909, %r22509; + xor.b32 %r22336, %r30910, %r22510; + xor.b32 %r22360, %r30904, %r22510; + xor.b32 %r22359, %r30903, %r22509; + st.local.v2.u32 [%rd2+216], {%r22359, %r22360}; + // begin inline asm + shf.l.wrap.b32 %r22322, %r22328, %r22327, %r29767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22326, %r22327, %r22328, %r29767; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22330, %r22336, %r22335, %r29768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22334, %r22335, %r22336, %r29768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22342, %r22343, %r22344, %r29769; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22338, %r22344, %r22343, %r29769; + // end inline asm + st.local.v2.u32 [%rd2+96], {%r22338, %r22342}; + // begin inline asm + shf.l.wrap.b32 %r22346, %r22352, %r22351, %r29770; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22350, %r22351, %r22352, %r29770; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22354, %r22360, %r22359, %r29771; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22358, %r22359, %r22360, %r29771; + // end inline 
asm + // begin inline asm + shf.l.wrap.b32 %r22366, %r22367, %r22368, %r29772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22362, %r22368, %r22367, %r29772; + // end inline asm + st.local.v2.u32 [%rd2+88], {%r22362, %r22366}; + // begin inline asm + shf.l.wrap.b32 %r22370, %r22376, %r22375, %r29773; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22374, %r22375, %r22376, %r29773; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22378, %r22384, %r22383, %r29774; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22382, %r22383, %r22384, %r29774; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22386, %r22392, %r22391, %r29775; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22390, %r22391, %r22392, %r29775; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22394, %r22429, %r22322, %r22346, 0xD2; + lop3.b32 %r22395, %r22432, %r22326, %r22350, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22402, %r22322, %r22346, %r22378, 0xD2; + lop3.b32 %r22403, %r22326, %r22350, %r22382, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+32], {%r22402, %r22403}; + // begin inline asm + // chi + lop3.b32 %r22410, %r22346, %r22378, %r22354, 0xD2; + lop3.b32 %r22411, %r22350, %r22382, %r22358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+40], {%r22410, %r22411}; + // begin inline asm + // chi + lop3.b32 %r22418, %r22378, %r22354, %r22429, 0xD2; + lop3.b32 %r22419, %r22382, %r22358, %r22432, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+48], {%r22418, %r22419}; + // begin inline asm + // chi + lop3.b32 %r22426, %r22354, %r22429, %r22322, 0xD2; + lop3.b32 %r22427, %r22358, %r22432, %r22326, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+56], {%r22426, %r22427}; + // begin inline asm + // chi + lop3.b32 %r22434, %r22370, %r22330, %r22386, 0xD2; + lop3.b32 %r22435, %r22374, %r22334, %r22390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+64], {%r22434, %r22435}; + // begin inline asm + // chi + lop3.b32 %r22442, %r22330, %r22386, %r22362, 0xD2; + lop3.b32 %r22443, %r22334, %r22390, %r22366, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+72], {%r22442, %r22443}; + // begin inline asm + // chi + lop3.b32 %r22450, %r22386, %r22362, %r22338, 0xD2; + lop3.b32 %r22451, %r22390, %r22366, %r22342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd2+80], {%r22450, %r22451}; + // begin inline asm + ld.global.nc.v2.u32 {%r22458,%r22459}, [%rd1272]; + // end inline asm + xor.b32 %r22511, %r22395, %r22459; + xor.b32 %r22512, %r22394, %r22458; + mov.b64 %rd1349, {%r22512, %r22511}; + mov.b64 %rd1350, {%r22402, %r22403}; + mov.b64 %rd1351, {%r22410, %r22411}; + mov.b64 %rd1352, {%r22418, %r22419}; + mov.b64 %rd1353, {%r22426, %r22427}; + mov.b64 %rd1354, {%r22434, %r22435}; + mov.b64 %rd1355, {%r22442, %r22443}; + mov.b64 %rd1356, {%r22450, %r22451}; + mov.u32 %r30954, 0; + st.local.v2.u32 [%rd2+24], {%r22512, %r22511}; + st.local.v2.u32 [%rd908+96], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+104], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+112], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+120], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+128], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+136], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+144], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+152], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+160], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+168], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+176], {%r30954, %r30954}; + st.local.v2.u32 
[%rd908+184], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+192], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+200], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+208], {%r30954, %r30954}; + st.local.v2.u32 [%rd908+216], {%r30954, %r30954}; + mov.u32 %r30969, -2147483648; + st.local.v2.u32 [%rd908+88], {%r30968, %r30969}; + mov.u32 %r30955, %r30954; + mov.u32 %r30956, %r30954; + mov.u32 %r30957, %r30954; + mov.u32 %r30958, %r30954; + mov.u32 %r30959, %r30954; + mov.u32 %r30960, %r30954; + mov.u32 %r30961, %r30954; + mov.u32 %r30962, %r30954; + mov.u32 %r30963, %r30954; + mov.u32 %r30964, %r30954; + mov.u32 %r30965, %r30954; + mov.u32 %r30966, %r30954; + mov.u32 %r30967, %r30954; + mov.u32 %r30970, %r30954; + mov.u32 %r30971, %r30954; + mov.u32 %r30972, %r30954; + mov.u32 %r30973, %r30954; + mov.u32 %r30974, %r30954; + mov.u32 %r30975, %r30954; + mov.u32 %r30976, %r30954; + mov.u32 %r30977, %r30954; + mov.u32 %r30978, %r30954; + mov.u32 %r30979, %r30954; + mov.u32 %r30980, %r30954; + mov.u32 %r30981, %r30954; + mov.u32 %r30982, %r30954; + mov.u32 %r30983, %r30954; + mov.u32 %r30984, %r30954; + mov.u32 %r30985, %r30954; + mov.u32 %r30986, %r30954; + mov.u32 %r30987, %r30954; + mov.u32 %r31004, %r30954; + +$L__BB2_73: + mov.u32 %r29786, 1; + mov.u64 %rd1276, keccak_round_constants; + cvta.const.u64 %rd1275, %rd1276; + // begin inline asm + // xor5 + lop3.b32 %r22513, %r30990, %r30988, %r30986, 0x96; + lop3.b32 %r22513, %r22513, %r30984, %r30982, 0x96; + lop3.b32 %r22514, %r30991, %r30989, %r30987, 0x96; + lop3.b32 %r22514, %r22514, %r30985, %r30983, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22525, %r31002, %r31000, %r30980, 0x96; + lop3.b32 %r22525, %r22525, %r30978, %r30976, 0x96; + lop3.b32 %r22526, %r31003, %r31001, %r30981, 0x96; + lop3.b32 %r22526, %r22526, %r30979, %r30977, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22537, %r30998, %r30996, %r30974, 0x96; + lop3.b32 %r22537, %r22537, %r30972, %r30970, 0x96; + lop3.b32 %r22538, %r30999, %r30997, %r30975, 0x96; + lop3.b32 %r22538, %r22538, %r30973, %r30971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22549, %r30994, %r30968, %r30966, 0x96; + lop3.b32 %r22549, %r22549, %r30964, %r30962, 0x96; + lop3.b32 %r22550, %r30995, %r30969, %r30967, 0x96; + lop3.b32 %r22550, %r22550, %r30965, %r30963, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22561, %r30992, %r30960, %r30958, 0x96; + lop3.b32 %r22561, %r22561, %r30956, %r30954, 0x96; + lop3.b32 %r22562, %r30993, %r30961, %r30959, 0x96; + lop3.b32 %r22562, %r22562, %r30957, %r30955, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22573, %r22526, %r22525, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22577, %r22525, %r22526, %r29786; + // end inline asm + xor.b32 %r23007, %r22573, %r22561; + xor.b32 %r23008, %r22577, %r22562; + xor.b32 %r22840, %r30990, %r23007; + xor.b32 %r22843, %r30991, %r23008; + xor.b32 %r22747, %r30988, %r23007; + xor.b32 %r22746, %r30989, %r23008; + xor.b32 %r22794, %r30986, %r23007; + xor.b32 %r22795, %r30987, %r23008; + xor.b32 %r22699, %r30984, %r23007; + xor.b32 %r22698, %r30985, %r23008; + xor.b32 %r22650, %r30982, %r23007; + xor.b32 %r22651, %r30983, %r23008; + // begin inline asm + shf.l.wrap.b32 %r22581, %r22538, %r22537, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22585, %r22537, %r22538, %r29786; + // end inline asm + xor.b32 %r23009, %r22581, %r22513; + xor.b32 %r23010, %r22585, %r22514; 
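+ // [editor annotation, not part of the generated PTX] Keccak-f[1600] theta step:
+ // lop3.b32 with immediate 0x96 is a three-input XOR (two of them form each
+ // "xor5" column parity), and each shf.l.wrap.b32 pair emulates a 64-bit
+ // rotate-left-by-1 as two 32-bit funnel shifts over a lane's (lo, hi) halves.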
+ xor.b32 %r22802, %r31002, %r23009; + xor.b32 %r22803, %r31003, %r23010; + xor.b32 %r22619, %r31000, %r23009; + xor.b32 %r22618, %r31001, %r23010; + xor.b32 %r22778, %r30980, %r23009; + xor.b32 %r22779, %r30981, %r23010; + xor.b32 %r22739, %r30978, %r23009; + xor.b32 %r22738, %r30979, %r23010; + xor.b32 %r22722, %r30976, %r23009; + xor.b32 %r22723, %r30977, %r23010; + // begin inline asm + shf.l.wrap.b32 %r22589, %r22550, %r22549, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22593, %r22549, %r22550, %r29786; + // end inline asm + xor.b32 %r23011, %r22589, %r22525; + xor.b32 %r23012, %r22593, %r22526; + xor.b32 %r22659, %r30998, %r23011; + xor.b32 %r22658, %r30999, %r23012; + xor.b32 %r22786, %r30996, %r23011; + xor.b32 %r22787, %r30997, %r23012; + xor.b32 %r22667, %r30974, %r23011; + xor.b32 %r22666, %r30975, %r23012; + xor.b32 %r22770, %r30972, %r23011; + xor.b32 %r22771, %r30973, %r23012; + xor.b32 %r22635, %r30970, %r23011; + xor.b32 %r22634, %r30971, %r23012; + // begin inline asm + shf.l.wrap.b32 %r22597, %r22562, %r22561, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22601, %r22561, %r22562, %r29786; + // end inline asm + xor.b32 %r23013, %r22597, %r22537; + xor.b32 %r23014, %r22601, %r22538; + xor.b32 %r22754, %r30994, %r23013; + xor.b32 %r22755, %r30995, %r23014; + xor.b32 %r22731, %r30968, %r23013; + xor.b32 %r22730, %r30969, %r23014; + xor.b32 %r22674, %r30966, %r23013; + xor.b32 %r22675, %r30967, %r23014; + xor.b32 %r22762, %r30964, %r23013; + xor.b32 %r22763, %r30965, %r23014; + xor.b32 %r22691, %r30962, %r23013; + xor.b32 %r22690, %r30963, %r23014; + // begin inline asm + shf.l.wrap.b32 %r22605, %r22514, %r22513, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22609, %r22513, %r22514, %r29786; + // end inline asm + xor.b32 %r23015, %r22605, %r22549; + xor.b32 %r23016, %r22609, %r22550; + xor.b32 %r22706, %r30992, %r23015; + xor.b32 %r22707, %r30993, %r23016; + xor.b32 %r22626, %r30960, %r23015; + xor.b32 %r22627, %r30961, %r23016; + xor.b32 %r22643, %r30958, %r23015; + xor.b32 %r22642, %r30959, %r23016; + xor.b32 %r22682, %r30956, %r23015; + xor.b32 %r22683, %r30957, %r23016; + xor.b32 %r22714, %r30954, %r23015; + xor.b32 %r22715, %r30955, %r23016; + mov.u32 %r22620, 44; + // begin inline asm + shf.l.wrap.b32 %r22613, %r22619, %r22618, %r22620; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22617, %r22618, %r22619, %r22620; + // end inline asm + mov.u32 %r22628, 20; + // begin inline asm + shf.l.wrap.b32 %r22621, %r22627, %r22626, %r22628; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22625, %r22626, %r22627, %r22628; + // end inline asm + mov.u32 %r22636, 61; + // begin inline asm + shf.l.wrap.b32 %r22629, %r22635, %r22634, %r22636; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22633, %r22634, %r22635, %r22636; + // end inline asm + mov.u32 %r22644, 39; + // begin inline asm + shf.l.wrap.b32 %r22637, %r22643, %r22642, %r22644; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22641, %r22642, %r22643, %r22644; + // end inline asm + mov.u32 %r22652, 18; + // begin inline asm + shf.l.wrap.b32 %r22645, %r22651, %r22650, %r22652; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22649, %r22650, %r22651, %r22652; + // end inline asm + mov.u32 %r22660, 62; + // begin inline asm + shf.l.wrap.b32 %r22653, %r22659, %r22658, %r22660; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22657, %r22658, %r22659, %r22660; + // end inline asm + 
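// [editor annotation, not part of the generated PTX] rho/pi step: the run of
+ // mov.u32 immediates here (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14,
+ // 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, together with the rotate-by-1 cases)
+ // matches the standard Keccak rotation offsets; each 64-bit rotation is again
+ // a pair of funnel shifts.
+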
mov.u32 %r22668, 43; + // begin inline asm + shf.l.wrap.b32 %r22661, %r22667, %r22666, %r22668; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22665, %r22666, %r22667, %r22668; + // end inline asm + mov.u32 %r22676, 25; + // begin inline asm + shf.l.wrap.b32 %r22669, %r22675, %r22674, %r22676; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22673, %r22674, %r22675, %r22676; + // end inline asm + mov.u32 %r22684, 8; + // begin inline asm + shf.l.wrap.b32 %r22677, %r22683, %r22682, %r22684; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22681, %r22682, %r22683, %r22684; + // end inline asm + mov.u32 %r22692, 56; + // begin inline asm + shf.l.wrap.b32 %r22685, %r22691, %r22690, %r22692; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22689, %r22690, %r22691, %r22692; + // end inline asm + mov.u32 %r22700, 41; + // begin inline asm + shf.l.wrap.b32 %r22693, %r22699, %r22698, %r22700; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22697, %r22698, %r22699, %r22700; + // end inline asm + mov.u32 %r22708, 27; + // begin inline asm + shf.l.wrap.b32 %r22701, %r22707, %r22706, %r22708; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22705, %r22706, %r22707, %r22708; + // end inline asm + mov.u32 %r22716, 14; + // begin inline asm + shf.l.wrap.b32 %r22709, %r22715, %r22714, %r22716; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22713, %r22714, %r22715, %r22716; + // end inline asm + mov.u32 %r22724, 2; + // begin inline asm + shf.l.wrap.b32 %r22717, %r22723, %r22722, %r22724; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22721, %r22722, %r22723, %r22724; + // end inline asm + mov.u32 %r22732, 55; + // begin inline asm + shf.l.wrap.b32 %r22725, %r22731, %r22730, %r22732; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22729, %r22730, %r22731, %r22732; + // end inline asm + mov.u32 %r22740, 45; + // begin inline asm + shf.l.wrap.b32 %r22733, %r22739, %r22738, %r22740; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22737, %r22738, %r22739, %r22740; + // end inline asm + mov.u32 %r22748, 36; + // begin inline asm + shf.l.wrap.b32 %r22741, %r22747, %r22746, %r22748; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22745, %r22746, %r22747, %r22748; + // end inline asm + mov.u32 %r22756, 28; + // begin inline asm + shf.l.wrap.b32 %r22749, %r22755, %r22754, %r22756; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22753, %r22754, %r22755, %r22756; + // end inline asm + mov.u32 %r22764, 21; + // begin inline asm + shf.l.wrap.b32 %r22757, %r22763, %r22762, %r22764; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22761, %r22762, %r22763, %r22764; + // end inline asm + mov.u32 %r22772, 15; + // begin inline asm + shf.l.wrap.b32 %r22765, %r22771, %r22770, %r22772; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22769, %r22770, %r22771, %r22772; + // end inline asm + mov.u32 %r22780, 10; + // begin inline asm + shf.l.wrap.b32 %r22773, %r22779, %r22778, %r22780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22777, %r22778, %r22779, %r22780; + // end inline asm + mov.u32 %r22788, 6; + // begin inline asm + shf.l.wrap.b32 %r22781, %r22787, %r22786, %r22788; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22785, %r22786, %r22787, %r22788; + // end inline asm + mov.u32 %r22796, 3; + // begin inline asm + shf.l.wrap.b32 %r22789, %r22795, %r22794, %r22796; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r22793, %r22794, %r22795, %r22796; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22797, %r22803, %r22802, %r29786; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22801, %r22802, %r22803, %r29786; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22805, %r22840, %r22613, %r22661, 0xD2; + lop3.b32 %r22806, %r22843, %r22617, %r22665, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31002, %r22613, %r22661, %r22757, 0xD2; + lop3.b32 %r31003, %r22617, %r22665, %r22761, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30998, %r22661, %r22757, %r22709, 0xD2; + lop3.b32 %r30999, %r22665, %r22761, %r22713, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30994, %r22757, %r22709, %r22840, 0xD2; + lop3.b32 %r30995, %r22761, %r22713, %r22843, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30992, %r22709, %r22840, %r22613, 0xD2; + lop3.b32 %r30993, %r22713, %r22843, %r22617, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30988, %r22749, %r22621, %r22789, 0xD2; + lop3.b32 %r30989, %r22753, %r22625, %r22793, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r31000, %r22621, %r22789, %r22733, 0xD2; + lop3.b32 %r31001, %r22625, %r22793, %r22737, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30996, %r22789, %r22733, %r22629, 0xD2; + lop3.b32 %r30997, %r22793, %r22737, %r22633, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30968, %r22733, %r22629, %r22749, 0xD2; + lop3.b32 %r30969, %r22737, %r22633, %r22753, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r30968, %r30969}; + // begin inline asm + // chi + lop3.b32 %r30960, %r22629, %r22749, %r22621, 0xD2; + lop3.b32 %r30961, %r22633, %r22753, %r22625, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r30960, %r30961}; + // begin inline asm + // chi + lop3.b32 %r30986, %r22797, %r22781, %r22669, 0xD2; + lop3.b32 %r30987, %r22801, %r22785, %r22673, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+104], {%r30986, %r30987}; + // begin inline asm + // chi + lop3.b32 %r30980, %r22781, %r22669, %r22677, 0xD2; + lop3.b32 %r30981, %r22785, %r22673, %r22681, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+112], {%r30980, %r30981}; + // begin inline asm + // chi + lop3.b32 %r30974, %r22669, %r22677, %r22645, 0xD2; + lop3.b32 %r30975, %r22673, %r22681, %r22649, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+120], {%r30974, %r30975}; + // begin inline asm + // chi + lop3.b32 %r30966, %r22677, %r22645, %r22797, 0xD2; + lop3.b32 %r30967, %r22681, %r22649, %r22801, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+128], {%r30966, %r30967}; + // begin inline asm + // chi + lop3.b32 %r30958, %r22645, %r22797, %r22781, 0xD2; + lop3.b32 %r30959, %r22649, %r22801, %r22785, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+136], {%r30958, %r30959}; + // begin inline asm + // chi + lop3.b32 %r30984, %r22701, %r22741, %r22773, 0xD2; + lop3.b32 %r30985, %r22705, %r22745, %r22777, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+144], {%r30984, %r30985}; + // begin inline asm + // chi + lop3.b32 %r30978, %r22741, %r22773, %r22765, 0xD2; + lop3.b32 %r30979, %r22745, %r22777, %r22769, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+152], {%r30978, %r30979}; + // begin inline asm + // chi + lop3.b32 %r30972, %r22773, %r22765, %r22685, 0xD2; + lop3.b32 %r30973, %r22777, %r22769, %r22689, 0xD2; + // end inline asm + 
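// [editor annotation, not part of the generated PTX] chi step: lop3.b32 with
+ // immediate 0xD2 computes a ^ (~b & c) in a single instruction (truth table
+ // 0b11010010), applied independently to each 32-bit half of a lane; the results
+ // are written straight back into the local sponge state below.
+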
st.local.v2.u32 [%rd908+160], {%r30972, %r30973}; + // begin inline asm + // chi + lop3.b32 %r30964, %r22765, %r22685, %r22701, 0xD2; + lop3.b32 %r30965, %r22769, %r22689, %r22705, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+168], {%r30964, %r30965}; + // begin inline asm + // chi + lop3.b32 %r30956, %r22685, %r22701, %r22741, 0xD2; + lop3.b32 %r30957, %r22689, %r22705, %r22745, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+176], {%r30956, %r30957}; + // begin inline asm + // chi + lop3.b32 %r30982, %r22653, %r22725, %r22637, 0xD2; + lop3.b32 %r30983, %r22657, %r22729, %r22641, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+184], {%r30982, %r30983}; + // begin inline asm + // chi + lop3.b32 %r30976, %r22725, %r22637, %r22693, 0xD2; + lop3.b32 %r30977, %r22729, %r22641, %r22697, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+192], {%r30976, %r30977}; + // begin inline asm + // chi + lop3.b32 %r30970, %r22637, %r22693, %r22717, 0xD2; + lop3.b32 %r30971, %r22641, %r22697, %r22721, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+200], {%r30970, %r30971}; + // begin inline asm + // chi + lop3.b32 %r30962, %r22693, %r22717, %r22653, 0xD2; + lop3.b32 %r30963, %r22697, %r22721, %r22657, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+208], {%r30962, %r30963}; + // begin inline asm + // chi + lop3.b32 %r30954, %r22717, %r22653, %r22725, 0xD2; + lop3.b32 %r30955, %r22721, %r22657, %r22729, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+216], {%r30954, %r30955}; + mul.wide.s32 %rd985, %r31004, 8; + add.s64 %rd984, %rd1275, %rd985; + // begin inline asm + ld.global.nc.v2.u32 {%r23005,%r23006}, [%rd984]; + // end inline asm + xor.b32 %r30990, %r22805, %r23005; + xor.b32 %r30991, %r22806, %r23006; + add.s32 %r31004, %r31004, 1; + setp.lt.u32 %p42, %r31004, 23; + @%p42 bra $L__BB2_73; + + mov.u32 %r29785, 3; + mov.u32 %r29784, 21; + mov.u32 %r29783, 28; + mov.u32 %r29782, 45; + mov.u32 %r29781, 14; + mov.u32 %r29780, 43; + mov.u32 %r29779, 61; + mov.u32 %r29778, 20; + mov.u32 %r29777, 44; + mov.u64 %rd1279, keccak_round_constants; + cvta.const.u64 %rd1278, %rd1279; + add.s64 %rd1277, %rd1278, 184; + mov.u32 %r23116, 1; + st.local.v2.u32 [%rd908+32], {%r31002, %r31003}; + st.local.v2.u32 [%rd908+72], {%r31000, %r31001}; + st.local.v2.u32 [%rd908+40], {%r30998, %r30999}; + st.local.v2.u32 [%rd908+80], {%r30996, %r30997}; + st.local.v2.u32 [%rd908+48], {%r30994, %r30995}; + st.local.v2.u32 [%rd908+56], {%r30992, %r30993}; + st.local.v2.u32 [%rd908+24], {%r30990, %r30991}; + // begin inline asm + // xor5 + lop3.b32 %r23017, %r30990, %r30988, %r30986, 0x96; + lop3.b32 %r23017, %r23017, %r30984, %r30982, 0x96; + lop3.b32 %r23018, %r30991, %r30989, %r30987, 0x96; + lop3.b32 %r23018, %r23018, %r30985, %r30983, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23029, %r31002, %r31000, %r30980, 0x96; + lop3.b32 %r23029, %r23029, %r30978, %r30976, 0x96; + lop3.b32 %r23030, %r31003, %r31001, %r30981, 0x96; + lop3.b32 %r23030, %r23030, %r30979, %r30977, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23041, %r30998, %r30996, %r30974, 0x96; + lop3.b32 %r23041, %r23041, %r30972, %r30970, 0x96; + lop3.b32 %r23042, %r30999, %r30997, %r30975, 0x96; + lop3.b32 %r23042, %r23042, %r30973, %r30971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23053, %r30994, %r30968, %r30966, 0x96; + lop3.b32 %r23053, %r23053, %r30964, %r30962, 0x96; + lop3.b32 %r23054, %r30995, %r30969, %r30967, 0x96; + lop3.b32 %r23054, %r23054, 
%r30965, %r30963, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23065, %r30992, %r30960, %r30958, 0x96; + lop3.b32 %r23065, %r23065, %r30956, %r30954, 0x96; + lop3.b32 %r23066, %r30993, %r30961, %r30959, 0x96; + lop3.b32 %r23066, %r23066, %r30957, %r30955, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23077, %r23030, %r23029, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23081, %r23029, %r23030, %r23116; + // end inline asm + xor.b32 %r23255, %r23077, %r23065; + xor.b32 %r23256, %r23081, %r23066; + xor.b32 %r23224, %r30990, %r23255; + xor.b32 %r23227, %r30991, %r23256; + xor.b32 %r23187, %r30987, %r23256; + xor.b32 %r23186, %r30986, %r23255; + st.local.v2.u32 [%rd908+104], {%r23186, %r23187}; + // begin inline asm + shf.l.wrap.b32 %r23085, %r23042, %r23041, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23089, %r23041, %r23042, %r23116; + // end inline asm + xor.b32 %r23257, %r23085, %r23017; + xor.b32 %r23258, %r23089, %r23018; + xor.b32 %r23123, %r31000, %r23257; + xor.b32 %r23122, %r31001, %r23258; + xor.b32 %r23162, %r30979, %r23258; + xor.b32 %r23163, %r30978, %r23257; + st.local.v2.u32 [%rd908+152], {%r23163, %r23162}; + // begin inline asm + shf.l.wrap.b32 %r23093, %r23054, %r23053, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23097, %r23053, %r23054, %r23116; + // end inline asm + xor.b32 %r23259, %r23093, %r23029; + xor.b32 %r23260, %r23097, %r23030; + xor.b32 %r23146, %r30975, %r23260; + xor.b32 %r23147, %r30974, %r23259; + st.local.v2.u32 [%rd908+120], {%r23147, %r23146}; + xor.b32 %r23138, %r30971, %r23260; + xor.b32 %r23139, %r30970, %r23259; + st.local.v2.u32 [%rd908+200], {%r23139, %r23138}; + // begin inline asm + shf.l.wrap.b32 %r23101, %r23066, %r23065, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23105, %r23065, %r23066, %r23116; + // end inline asm + xor.b32 %r23261, %r23101, %r23041; + xor.b32 %r23262, %r23105, %r23042; + xor.b32 %r23170, %r30994, %r23261; + xor.b32 %r23171, %r30995, %r23262; + xor.b32 %r23179, %r30965, %r23262; + xor.b32 %r23178, %r30964, %r23261; + st.local.v2.u32 [%rd908+168], {%r23178, %r23179}; + // begin inline asm + shf.l.wrap.b32 %r23109, %r23018, %r23017, %r23116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23113, %r23017, %r23018, %r23116; + // end inline asm + xor.b32 %r23263, %r23109, %r23053; + xor.b32 %r23264, %r23113, %r23054; + xor.b32 %r23130, %r30960, %r23263; + xor.b32 %r23131, %r30961, %r23264; + xor.b32 %r23155, %r30955, %r23264; + xor.b32 %r23154, %r30954, %r23263; + st.local.v2.u32 [%rd908+216], {%r23154, %r23155}; + // begin inline asm + shf.l.wrap.b32 %r23117, %r23123, %r23122, %r29777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23121, %r23122, %r23123, %r29777; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23125, %r23131, %r23130, %r29778; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23129, %r23130, %r23131, %r29778; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23137, %r23138, %r23139, %r29779; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23133, %r23139, %r23138, %r29779; + // end inline asm + st.local.v2.u32 [%rd908+96], {%r23133, %r23137}; + // begin inline asm + shf.l.wrap.b32 %r23141, %r23147, %r23146, %r29780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23145, %r23146, %r23147, %r29780; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23149, %r23155, 
%r23154, %r29781; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23153, %r23154, %r23155, %r29781; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23161, %r23162, %r23163, %r29782; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23157, %r23163, %r23162, %r29782; + // end inline asm + st.local.v2.u32 [%rd908+88], {%r23157, %r23161}; + // begin inline asm + shf.l.wrap.b32 %r23165, %r23171, %r23170, %r29783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23169, %r23170, %r23171, %r29783; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23173, %r23179, %r23178, %r29784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23177, %r23178, %r23179, %r29784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23181, %r23187, %r23186, %r29785; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23185, %r23186, %r23187, %r29785; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23189, %r23224, %r23117, %r23141, 0xD2; + lop3.b32 %r23190, %r23227, %r23121, %r23145, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23197, %r23117, %r23141, %r23173, 0xD2; + lop3.b32 %r23198, %r23121, %r23145, %r23177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+32], {%r23197, %r23198}; + // begin inline asm + // chi + lop3.b32 %r23205, %r23141, %r23173, %r23149, 0xD2; + lop3.b32 %r23206, %r23145, %r23177, %r23153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+40], {%r23205, %r23206}; + // begin inline asm + // chi + lop3.b32 %r23213, %r23173, %r23149, %r23224, 0xD2; + lop3.b32 %r23214, %r23177, %r23153, %r23227, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+48], {%r23213, %r23214}; + // begin inline asm + // chi + lop3.b32 %r23221, %r23149, %r23224, %r23117, 0xD2; + lop3.b32 %r23222, %r23153, %r23227, %r23121, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+56], {%r23221, %r23222}; + // begin inline asm + // chi + lop3.b32 %r23229, %r23165, %r23125, %r23181, 0xD2; + lop3.b32 %r23230, %r23169, %r23129, %r23185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+64], {%r23229, %r23230}; + // begin inline asm + // chi + lop3.b32 %r23237, %r23125, %r23181, %r23157, 0xD2; + lop3.b32 %r23238, %r23129, %r23185, %r23161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+72], {%r23237, %r23238}; + // begin inline asm + // chi + lop3.b32 %r23245, %r23181, %r23157, %r23133, 0xD2; + lop3.b32 %r23246, %r23185, %r23161, %r23137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd908+80], {%r23245, %r23246}; + // begin inline asm + ld.global.nc.v2.u32 {%r23253,%r23254}, [%rd1277]; + // end inline asm + xor.b32 %r23265, %r23190, %r23254; + xor.b32 %r23266, %r23189, %r23253; + st.local.v2.u32 [%rd908+24], {%r23266, %r23265}; + mov.b64 %rd1358, {%r23197, %r23198}; + mov.b64 %rd1359, {%r23205, %r23206}; + mov.b64 %rd1362, {%r23229, %r23230}; + mov.b64 %rd1363, {%r23237, %r23238}; + mov.b64 %rd1364, {%r23245, %r23246}; + mov.b64 %rd1357, {%r23266, %r23265}; + mov.b64 %rd1360, {%r23213, %r23214}; + mov.b64 %rd1361, {%r23221, %r23222}; + st.global.u64 [%rd222], %rd1349; + st.global.u64 [%rd222+8], %rd1350; + st.global.u64 [%rd222+16], %rd1351; + st.global.u64 [%rd222+24], %rd1352; + st.global.u64 [%rd222+32], %rd1353; + st.global.u64 [%rd222+40], %rd1354; + st.global.u64 [%rd222+48], %rd1355; + st.global.u64 [%rd222+56], %rd1356; + st.global.v2.u32 [%rd222+64], {%r23266, %r23265}; + st.global.v2.u32 [%rd222+72], {%r23197, %r23198}; + st.global.v2.u32 [%rd222+80], {%r23205, %r23206}; + 
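// [editor annotation, not part of the generated PTX] Both squeezed 64-byte
+ // Keccak states appear to be flushed to the global result buffer at [%rd222]
+ // here: offsets 0..56 hold the first state's words, offsets 64..120 the second.
+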
st.global.v2.u32 [%rd222+88], {%r23213, %r23214}; + st.global.v2.u32 [%rd222+96], {%r23221, %r23222}; + st.global.v2.u32 [%rd222+104], {%r23229, %r23230}; + st.global.v2.u32 [%rd222+112], {%r23237, %r23238}; + st.global.v2.u32 [%rd222+120], {%r23245, %r23246}; + +$L__BB2_86: + mul.lo.s32 %r26551, %r12, 16777619; + mov.b64 {%r26552, %r26553}, %rd1333; + mul.lo.s32 %r26554, %r13, 16777619; + xor.b32 %r26555, %r26551, %r26552; + xor.b32 %r26556, %r26554, %r26553; + mov.b64 %rd1099, {%r26555, %r26556}; + mov.b64 {%r26557, %r26558}, %rd1349; + xor.b32 %r26559, %r26558, %r13; + xor.b32 %r26560, %r26557, %r12; + mov.b64 %rd1100, {%r26560, %r26559}; + mul.lo.s32 %r26561, %r14, 16777619; + mov.b64 {%r26562, %r26563}, %rd1334; + mul.lo.s32 %r26564, %r15, 16777619; + xor.b32 %r26565, %r26564, %r26563; + xor.b32 %r26566, %r26561, %r26562; + mov.b64 %rd1101, {%r26566, %r26565}; + mov.b64 {%r26567, %r26568}, %rd1350; + xor.b32 %r26569, %r26568, %r15; + xor.b32 %r26570, %r26567, %r14; + mov.b64 %rd1102, {%r26570, %r26569}; + mul.lo.s32 %r26571, %r16, 16777619; + mov.b64 {%r26572, %r26573}, %rd1335; + mul.lo.s32 %r26574, %r17, 16777619; + xor.b32 %r26575, %r26574, %r26573; + xor.b32 %r26576, %r26571, %r26572; + mov.b64 %rd1103, {%r26576, %r26575}; + mov.b64 {%r26577, %r26578}, %rd1351; + xor.b32 %r26579, %r26578, %r17; + xor.b32 %r26580, %r26577, %r16; + mov.b64 %rd1104, {%r26580, %r26579}; + mul.lo.s32 %r26581, %r18, 16777619; + mov.b64 {%r26582, %r26583}, %rd1336; + mul.lo.s32 %r26584, %r19, 16777619; + xor.b32 %r26585, %r26584, %r26583; + xor.b32 %r26586, %r26581, %r26582; + mov.b64 %rd1105, {%r26586, %r26585}; + mov.b64 {%r26587, %r26588}, %rd1352; + xor.b32 %r26589, %r26588, %r19; + xor.b32 %r26590, %r26587, %r18; + mov.b64 %rd1106, {%r26590, %r26589}; + mul.lo.s32 %r26591, %r20, 16777619; + mov.b64 {%r26592, %r26593}, %rd1337; + mul.lo.s32 %r26594, %r21, 16777619; + xor.b32 %r26595, %r26594, %r26593; + xor.b32 %r26596, %r26591, %r26592; + mov.b64 %rd1107, {%r26596, %r26595}; + mov.b64 {%r26597, %r26598}, %rd1353; + xor.b32 %r26599, %r26598, %r21; + xor.b32 %r26600, %r26597, %r20; + mov.b64 %rd1108, {%r26600, %r26599}; + mul.lo.s32 %r26601, %r22, 16777619; + mov.b64 {%r26602, %r26603}, %rd1338; + mul.lo.s32 %r26604, %r23, 16777619; + xor.b32 %r26605, %r26604, %r26603; + xor.b32 %r26606, %r26601, %r26602; + mov.b64 %rd1109, {%r26606, %r26605}; + mov.b64 {%r26607, %r26608}, %rd1354; + xor.b32 %r26609, %r26608, %r23; + xor.b32 %r26610, %r26607, %r22; + mov.b64 %rd1110, {%r26610, %r26609}; + mul.lo.s32 %r26611, %r24, 16777619; + mov.b64 {%r26612, %r26613}, %rd1339; + mul.lo.s32 %r26614, %r25, 16777619; + xor.b32 %r26615, %r26614, %r26613; + xor.b32 %r26616, %r26611, %r26612; + mov.b64 %rd1111, {%r26616, %r26615}; + mov.b64 {%r26617, %r26618}, %rd1355; + xor.b32 %r26619, %r26618, %r25; + xor.b32 %r26620, %r26617, %r24; + mov.b64 %rd1112, {%r26620, %r26619}; + mul.lo.s32 %r26621, %r26, 16777619; + mov.b64 {%r26622, %r26623}, %rd1340; + mul.lo.s32 %r26624, %r27, 16777619; + xor.b32 %r26625, %r26624, %r26623; + xor.b32 %r26626, %r26621, %r26622; + mov.b64 %rd1113, {%r26626, %r26625}; + mov.b64 {%r26627, %r26628}, %rd1356; + xor.b32 %r26629, %r26628, %r27; + xor.b32 %r26630, %r26627, %r26; + mov.b64 %rd1114, {%r26630, %r26629}; + mul.lo.s32 %r26631, %r28, 16777619; + mov.b64 {%r26632, %r26633}, %rd1341; + mul.lo.s32 %r26634, %r29, 16777619; + xor.b32 %r26635, %r26634, %r26633; + xor.b32 %r26636, %r26631, %r26632; + mov.b64 %rd1115, {%r26636, %r26635}; + mov.b64 {%r26637, %r26638}, %rd1357; + xor.b32 %r26639, 
%r26638, %r29; + xor.b32 %r26640, %r26637, %r28; + mov.b64 %rd1116, {%r26640, %r26639}; + mul.lo.s32 %r26641, %r30, 16777619; + mov.b64 {%r26642, %r26643}, %rd1342; + mul.lo.s32 %r26644, %r31, 16777619; + xor.b32 %r26645, %r26644, %r26643; + xor.b32 %r26646, %r26641, %r26642; + mov.b64 %rd1117, {%r26646, %r26645}; + mov.b64 {%r26647, %r26648}, %rd1358; + xor.b32 %r26649, %r26648, %r31; + xor.b32 %r26650, %r26647, %r30; + mov.b64 %rd1118, {%r26650, %r26649}; + mul.lo.s32 %r26651, %r32, 16777619; + mov.b64 {%r26652, %r26653}, %rd1343; + mul.lo.s32 %r26654, %r33, 16777619; + xor.b32 %r26655, %r26654, %r26653; + xor.b32 %r26656, %r26651, %r26652; + mov.b64 %rd1119, {%r26656, %r26655}; + mov.b64 {%r26657, %r26658}, %rd1359; + xor.b32 %r26659, %r26658, %r33; + xor.b32 %r26660, %r26657, %r32; + mov.b64 %rd1120, {%r26660, %r26659}; + mul.lo.s32 %r26661, %r34, 16777619; + mov.b64 {%r26662, %r26663}, %rd1344; + mul.lo.s32 %r26664, %r35, 16777619; + xor.b32 %r26665, %r26664, %r26663; + xor.b32 %r26666, %r26661, %r26662; + mov.b64 %rd1121, {%r26666, %r26665}; + mov.b64 {%r26667, %r26668}, %rd1360; + xor.b32 %r26669, %r26668, %r35; + xor.b32 %r26670, %r26667, %r34; + mov.b64 %rd1122, {%r26670, %r26669}; + mul.lo.s32 %r26671, %r36, 16777619; + mov.b64 {%r26672, %r26673}, %rd1345; + mul.lo.s32 %r26674, %r37, 16777619; + xor.b32 %r26675, %r26674, %r26673; + xor.b32 %r26676, %r26671, %r26672; + mov.b64 %rd1123, {%r26676, %r26675}; + mov.b64 {%r26677, %r26678}, %rd1361; + xor.b32 %r26679, %r26678, %r37; + xor.b32 %r26680, %r26677, %r36; + mov.b64 %rd1124, {%r26680, %r26679}; + mul.lo.s32 %r26681, %r38, 16777619; + mov.b64 {%r26682, %r26683}, %rd1346; + mul.lo.s32 %r26684, %r39, 16777619; + xor.b32 %r26685, %r26684, %r26683; + xor.b32 %r26686, %r26681, %r26682; + mov.b64 %rd1125, {%r26686, %r26685}; + mov.b64 {%r26687, %r26688}, %rd1362; + xor.b32 %r26689, %r26688, %r39; + xor.b32 %r26690, %r26687, %r38; + mov.b64 %rd1126, {%r26690, %r26689}; + mul.lo.s32 %r26691, %r40, 16777619; + mov.b64 {%r26692, %r26693}, %rd1347; + mul.lo.s32 %r26694, %r41, 16777619; + xor.b32 %r26695, %r26694, %r26693; + xor.b32 %r26696, %r26691, %r26692; + mov.b64 %rd1127, {%r26696, %r26695}; + mov.b64 {%r26697, %r26698}, %rd1363; + xor.b32 %r26699, %r26698, %r41; + xor.b32 %r26700, %r26697, %r40; + mov.b64 %rd1128, {%r26700, %r26699}; + mul.lo.s32 %r26701, %r42, 16777619; + mov.b64 {%r26702, %r26703}, %rd1348; + mul.lo.s32 %r26704, %r43, 16777619; + xor.b32 %r26705, %r26704, %r26703; + xor.b32 %r26706, %r26701, %r26702; + mov.b64 %rd1129, {%r26706, %r26705}; + mov.b64 {%r26707, %r26708}, %rd1364; + xor.b32 %r26709, %r26708, %r43; + xor.b32 %r26710, %r26707, %r42; + mov.b64 %rd1130, {%r26710, %r26709}; + mul.lo.s64 %rd1131, %rd1317, %rd1099; + add.s64 %rd1316, %rd1131, %rd1100; + mul.lo.s64 %rd1132, %rd1318, %rd1101; + add.s64 %rd1315, %rd1132, %rd1102; + mul.lo.s64 %rd1133, %rd1319, %rd1103; + add.s64 %rd1314, %rd1133, %rd1104; + mul.lo.s64 %rd1134, %rd1320, %rd1105; + add.s64 %rd1313, %rd1134, %rd1106; + mul.lo.s64 %rd1135, %rd1321, %rd1107; + add.s64 %rd1312, %rd1135, %rd1108; + mul.lo.s64 %rd1136, %rd1322, %rd1109; + add.s64 %rd1311, %rd1136, %rd1110; + mul.lo.s64 %rd1137, %rd1323, %rd1111; + add.s64 %rd1310, %rd1137, %rd1112; + mul.lo.s64 %rd1138, %rd1324, %rd1113; + add.s64 %rd1309, %rd1138, %rd1114; + mul.lo.s64 %rd1139, %rd1325, %rd1115; + add.s64 %rd1308, %rd1139, %rd1116; + mul.lo.s64 %rd1140, %rd1326, %rd1117; + add.s64 %rd1307, %rd1140, %rd1118; + mul.lo.s64 %rd1141, %rd1327, %rd1119; + add.s64 %rd1306, %rd1141, %rd1120; + 
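// [editor annotation, not part of the generated PTX] Combine phase: each of the
+ // 16 running 64-bit accumulators appears to be updated as acc = acc * m + k,
+ // where m and k (editor labels, not names from the PTX) are the FNV-mixed and
+ // Keccak-mixed words built above; the outer loop then repeats until %r29819
+ // reaches 32. Afterwards the accumulators are fed to BLAKE3: the 1779033703 /
+ // 0x6A09E667 family of constants below is the standard BLAKE3 (and SHA-256) IV,
+ // and _Z20blake3_hasher_updateP13blake3_hasherPKvy demangles to
+ // blake3_hasher_update(blake3_hasher*, void const*, unsigned long long).
+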
mul.lo.s64 %rd1142, %rd1328, %rd1121; + add.s64 %rd1305, %rd1142, %rd1122; + mul.lo.s64 %rd1143, %rd1329, %rd1123; + add.s64 %rd1304, %rd1143, %rd1124; + mul.lo.s64 %rd1144, %rd1330, %rd1125; + add.s64 %rd1303, %rd1144, %rd1126; + mul.lo.s64 %rd1145, %rd1331, %rd1127; + add.s64 %rd1302, %rd1145, %rd1128; + mul.lo.s64 %rd1146, %rd1332, %rd1129; + add.s64 %rd1301, %rd1146, %rd1130; + add.s32 %r29819, %r29819, 1; + setp.lt.u32 %p48, %r29819, 32; + @%p48 bra $L__BB2_11; + + add.u64 %rd1259, %SPL, 2000; + add.u64 %rd1256, %SP, 2000; + add.u64 %rd1255, %SP, 0; + mov.u64 %rd1147, 0; + mov.b64 {%r26711, %r26712}, %rd1316; + mul.lo.s32 %r26713, %r26711, 16777619; + xor.b32 %r26714, %r26713, %r26712; + mul.lo.s32 %r26715, %r26714, 16777619; + mov.b64 {%r26716, %r26717}, %rd1315; + xor.b32 %r26718, %r26715, %r26716; + mul.lo.s32 %r26719, %r26718, 16777619; + mov.b64 {%r26720, %r26721}, %rd1314; + mul.lo.s32 %r26722, %r26720, 16777619; + xor.b32 %r26723, %r26722, %r26721; + mul.lo.s32 %r26724, %r26723, 16777619; + mov.b64 {%r26725, %r26726}, %rd1313; + xor.b32 %r26727, %r26724, %r26725; + mul.lo.s32 %r26728, %r26727, 16777619; + mov.b64 {%r26729, %r26730}, %rd1312; + mul.lo.s32 %r26731, %r26729, 16777619; + xor.b32 %r26732, %r26731, %r26730; + mul.lo.s32 %r26733, %r26732, 16777619; + mov.b64 {%r26734, %r26735}, %rd1311; + xor.b32 %r26736, %r26733, %r26734; + mul.lo.s32 %r26737, %r26736, 16777619; + mov.b64 {%r26738, %r26739}, %rd1310; + mul.lo.s32 %r26740, %r26738, 16777619; + xor.b32 %r26741, %r26740, %r26739; + mul.lo.s32 %r26742, %r26741, 16777619; + mov.b64 {%r26743, %r26744}, %rd1309; + xor.b32 %r26745, %r26742, %r26743; + mul.lo.s32 %r26746, %r26745, 16777619; + mov.b64 {%r26747, %r26748}, %rd1308; + mul.lo.s32 %r26749, %r26747, 16777619; + xor.b32 %r26750, %r26749, %r26748; + mul.lo.s32 %r26751, %r26750, 16777619; + mov.b64 {%r26752, %r26753}, %rd1307; + xor.b32 %r26754, %r26751, %r26752; + mul.lo.s32 %r26755, %r26754, 16777619; + mov.b64 {%r26756, %r26757}, %rd1306; + mul.lo.s32 %r26758, %r26756, 16777619; + xor.b32 %r26759, %r26758, %r26757; + mul.lo.s32 %r26760, %r26759, 16777619; + mov.b64 {%r26761, %r26762}, %rd1305; + xor.b32 %r26763, %r26760, %r26761; + mul.lo.s32 %r26764, %r26763, 16777619; + mov.b64 {%r26765, %r26766}, %rd1304; + mul.lo.s32 %r26767, %r26765, 16777619; + xor.b32 %r26768, %r26767, %r26766; + mul.lo.s32 %r26769, %r26768, 16777619; + mov.b64 {%r26770, %r26771}, %rd1303; + xor.b32 %r26772, %r26769, %r26770; + mul.lo.s32 %r26773, %r26772, 16777619; + mov.b64 {%r26774, %r26775}, %rd1302; + mul.lo.s32 %r26776, %r26774, 16777619; + xor.b32 %r26777, %r26776, %r26775; + mul.lo.s32 %r26778, %r26777, 16777619; + mov.b64 {%r26779, %r26780}, %rd1301; + xor.b32 %r26781, %r26778, %r26779; + mul.lo.s32 %r26782, %r26781, 16777619; + mov.u32 %r26783, 0; + st.local.v4.u32 [%rd1259+32], {%r26783, %r26783, %r26783, %r26783}; + st.local.v4.u32 [%rd1259+48], {%r26783, %r26783, %r26783, %r26783}; + st.local.v4.u32 [%rd1259+64], {%r26783, %r26783, %r26783, %r26783}; + xor.b32 %r26784, %r26746, %r26744; + xor.b32 %r26785, %r26737, %r26735; + xor.b32 %r26786, %r26728, %r26726; + xor.b32 %r26787, %r26719, %r26717; + st.local.v4.u32 [%rd1259], {%r26787, %r26786, %r26785, %r26784}; + xor.b32 %r26788, %r26782, %r26780; + xor.b32 %r26789, %r26773, %r26771; + xor.b32 %r26790, %r26764, %r26762; + xor.b32 %r26791, %r26755, %r26753; + st.local.v4.u32 [%rd1259+16], {%r26791, %r26790, %r26789, %r26788}; + mov.u32 %r26792, -1150833019; + mov.u32 %r26793, 1779033703; + st.local.v2.u32 [%rd2], {%r26793, 
%r26792}; + mov.u32 %r26794, -1521486534; + mov.u32 %r26795, 1013904242; + st.local.v2.u32 [%rd2+8], {%r26795, %r26794}; + mov.u32 %r26796, -1694144372; + mov.u32 %r26797, 1359893119; + st.local.v2.u32 [%rd2+16], {%r26797, %r26796}; + mov.u32 %r26798, 1541459225; + mov.u32 %r26799, 528734635; + st.local.v2.u32 [%rd2+24], {%r26799, %r26798}; + st.local.v2.u32 [%rd2+32], {%r26793, %r26792}; + st.local.v2.u32 [%rd2+40], {%r26795, %r26794}; + st.local.v2.u32 [%rd2+48], {%r26797, %r26796}; + st.local.v2.u32 [%rd2+56], {%r26799, %r26798}; + st.local.u64 [%rd2+64], %rd1147; + st.local.v2.u32 [%rd2+72], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+80], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+88], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+96], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+104], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+112], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+120], {%r26783, %r26783}; + st.local.v2.u32 [%rd2+128], {%r26783, %r26783}; + mov.u16 %rs498, 0; + st.local.v2.u8 [%rd2+136], {%rs498, %rs498}; + st.local.u8 [%rd2+138], %rs498; + st.local.u8 [%rd2+144], %rs498; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1255; + .param .b64 param1; + st.param.b64 [param1+0], %rd1256; + call.uni + _Z20blake3_hasher_updateP13blake3_hasherPKvy, + ( + param0, + param1 + ); + } // callseq 3 + ld.local.u8 %rd1367, [%rd2+144]; + setp.eq.s64 %p49, %rd1367, 0; + @%p49 bra $L__BB2_95; + + ld.local.v2.u8 {%rs862, %rs500}, [%rd2+136]; + cvt.u32.u16 %r26800, %rs500; + mul.wide.u32 %rd1151, %r26800, 64; + cvt.u64.u16 %rd1152, %rs862; + neg.s64 %rd1153, %rd1152; + setp.eq.s64 %p50, %rd1151, %rd1153; + @%p50 bra $L__BB2_90; + bra.uni $L__BB2_89; + +$L__BB2_90: + add.s64 %rd1367, %rd1367, -2; + shl.b64 %rd1155, %rd1367, 5; + add.s64 %rd1158, %rd2, %rd1155; + ld.local.u8 %rs665, [%rd2+138]; + mov.u64 %rd1368, 0; + or.b16 %rs732, %rs665, 4; + ld.local.v2.u32 {%r31257, %r31256}, [%rd2]; + ld.local.v2.u32 {%r31255, %r31254}, [%rd2+8]; + ld.local.v2.u32 {%r31253, %r31252}, [%rd2+16]; + ld.local.v2.u32 {%r31251, %r31250}, [%rd2+24]; + ld.local.u8 %rs798, [%rd1158+145]; + ld.local.u8 %rs799, [%rd1158+146]; + ld.local.u8 %rs800, [%rd1158+147]; + ld.local.u8 %rs801, [%rd1158+148]; + ld.local.u8 %rs802, [%rd1158+149]; + ld.local.u8 %rs803, [%rd1158+150]; + ld.local.u8 %rs804, [%rd1158+151]; + ld.local.u8 %rs805, [%rd1158+152]; + ld.local.u8 %rs806, [%rd1158+153]; + ld.local.u8 %rs807, [%rd1158+154]; + ld.local.u8 %rs808, [%rd1158+155]; + ld.local.u8 %rs809, [%rd1158+156]; + ld.local.u8 %rs810, [%rd1158+157]; + ld.local.u8 %rs811, [%rd1158+158]; + ld.local.u8 %rs812, [%rd1158+159]; + ld.local.u8 %rs813, [%rd1158+160]; + ld.local.u8 %rs814, [%rd1158+161]; + ld.local.u8 %rs815, [%rd1158+162]; + ld.local.u8 %rs816, [%rd1158+163]; + ld.local.u8 %rs817, [%rd1158+164]; + ld.local.u8 %rs818, [%rd1158+165]; + ld.local.u8 %rs819, [%rd1158+166]; + ld.local.u8 %rs820, [%rd1158+167]; + ld.local.u8 %rs821, [%rd1158+168]; + ld.local.u8 %rs822, [%rd1158+169]; + ld.local.u8 %rs823, [%rd1158+170]; + ld.local.u8 %rs824, [%rd1158+171]; + ld.local.u8 %rs825, [%rd1158+172]; + ld.local.u8 %rs826, [%rd1158+173]; + ld.local.u8 %rs827, [%rd1158+174]; + ld.local.u8 %rs828, [%rd1158+175]; + ld.local.u8 %rs829, [%rd1158+176]; + ld.local.u8 %rs830, [%rd1158+177]; + ld.local.u8 %rs831, [%rd1158+178]; + ld.local.u8 %rs832, [%rd1158+179]; + ld.local.u8 %rs833, [%rd1158+180]; + ld.local.u8 %rs834, [%rd1158+181]; + ld.local.u8 %rs835, [%rd1158+182]; + ld.local.u8 %rs836, 
[%rd1158+183]; + ld.local.u8 %rs837, [%rd1158+184]; + ld.local.u8 %rs838, [%rd1158+185]; + ld.local.u8 %rs839, [%rd1158+186]; + ld.local.u8 %rs840, [%rd1158+187]; + ld.local.u8 %rs841, [%rd1158+188]; + ld.local.u8 %rs842, [%rd1158+189]; + ld.local.u8 %rs843, [%rd1158+190]; + ld.local.u8 %rs844, [%rd1158+191]; + ld.local.u8 %rs845, [%rd1158+192]; + ld.local.u8 %rs846, [%rd1158+193]; + ld.local.u8 %rs847, [%rd1158+194]; + ld.local.u8 %rs848, [%rd1158+195]; + ld.local.u8 %rs849, [%rd1158+196]; + ld.local.u8 %rs850, [%rd1158+197]; + ld.local.u8 %rs851, [%rd1158+198]; + ld.local.u8 %rs852, [%rd1158+199]; + ld.local.v4.u16 {%rs853, %rs855, %rs857, %rs859}, [%rd1158+200]; + shr.u16 %rs854, %rs853, 8; + shr.u16 %rs856, %rs855, 8; + shr.u16 %rs858, %rs857, 8; + shr.u16 %rs860, %rs859, 8; + ld.local.u8 %rs861, [%rd1158+208]; + mov.u16 %rs862, 64; + bra.uni $L__BB2_91; + +$L__BB2_95: + ld.local.v4.u8 {%rs568, %rs569, %rs570, %rs571}, [%rd2+136]; + setp.eq.s16 %p54, %rs569, 0; + selp.u16 %rs573, 1, 0, %p54; + ld.local.v2.u32 {%r28817, %r28818}, [%rd2+32]; + ld.local.v2.u32 {%r28821, %r28822}, [%rd2+40]; + ld.local.v2.u32 {%r28825, %r28826}, [%rd2+48]; + ld.local.v2.u32 {%r28829, %r28830}, [%rd2+56]; + ld.local.v4.u16 {%rs574, %rs575, %rs576, %rs577}, [%rd2+72]; + shr.u16 %rs579, %rs574, 8; + shr.u16 %rs581, %rs575, 8; + shr.u16 %rs583, %rs576, 8; + shr.u16 %rs585, %rs577, 8; + ld.local.v4.u16 {%rs586, %rs587, %rs588, %rs589}, [%rd2+80]; + shr.u16 %rs591, %rs586, 8; + shr.u16 %rs593, %rs587, 8; + shr.u16 %rs595, %rs588, 8; + shr.u16 %rs597, %rs589, 8; + ld.local.v4.u16 {%rs598, %rs599, %rs600, %rs601}, [%rd2+88]; + shr.u16 %rs603, %rs598, 8; + shr.u16 %rs605, %rs599, 8; + shr.u16 %rs607, %rs600, 8; + shr.u16 %rs609, %rs601, 8; + ld.local.v4.u16 {%rs610, %rs611, %rs612, %rs613}, [%rd2+96]; + shr.u16 %rs615, %rs610, 8; + shr.u16 %rs617, %rs611, 8; + shr.u16 %rs619, %rs612, 8; + shr.u16 %rs621, %rs613, 8; + ld.local.v4.u16 {%rs622, %rs623, %rs624, %rs625}, [%rd2+104]; + shr.u16 %rs627, %rs622, 8; + shr.u16 %rs629, %rs623, 8; + shr.u16 %rs631, %rs624, 8; + shr.u16 %rs633, %rs625, 8; + ld.local.v4.u16 {%rs634, %rs635, %rs636, %rs637}, [%rd2+112]; + shr.u16 %rs639, %rs634, 8; + shr.u16 %rs641, %rs635, 8; + shr.u16 %rs643, %rs636, 8; + shr.u16 %rs645, %rs637, 8; + ld.local.v4.u16 {%rs646, %rs647, %rs648, %rs649}, [%rd2+120]; + shr.u16 %rs651, %rs646, 8; + shr.u16 %rs653, %rs647, 8; + ld.local.v2.u8 {%rs655, %rs656}, [%rd2+126]; + ld.local.u16 %r28833, [%rd2+132]; + ld.local.v2.u8 {%rs659, %rs660}, [%rd2+134]; + or.b16 %rs663, %rs570, %rs573; + or.b16 %rs664, %rs663, 10; + cvt.u32.u16 %r28834, %rs574; + and.b32 %r28835, %r28834, 255; + cvt.u32.u16 %r28836, %rs579; + prmt.b32 %r28837, %r28836, %r28835, 30212; + cvt.u32.u16 %r28838, %rs575; + prmt.b32 %r28839, %r28838, %r28837, 28756; + cvt.u32.u16 %r28840, %rs581; + prmt.b32 %r28841, %r28840, %r28839, 1620; + cvt.u32.u16 %r28842, %rs576; + and.b32 %r28843, %r28842, 255; + cvt.u32.u16 %r28844, %rs583; + prmt.b32 %r28845, %r28844, %r28843, 30212; + cvt.u32.u16 %r28846, %rs577; + prmt.b32 %r28847, %r28846, %r28845, 28756; + cvt.u32.u16 %r28848, %rs585; + prmt.b32 %r28849, %r28848, %r28847, 1620; + cvt.u32.u16 %r28850, %rs586; + and.b32 %r28851, %r28850, 255; + cvt.u32.u16 %r28852, %rs591; + prmt.b32 %r28853, %r28852, %r28851, 30212; + cvt.u32.u16 %r28854, %rs587; + prmt.b32 %r28855, %r28854, %r28853, 28756; + cvt.u32.u16 %r28856, %rs593; + prmt.b32 %r28857, %r28856, %r28855, 1620; + cvt.u32.u16 %r28858, %rs588; + and.b32 %r28859, %r28858, 255; + cvt.u32.u16 
%r28860, %rs595; + prmt.b32 %r28861, %r28860, %r28859, 30212; + cvt.u32.u16 %r28862, %rs589; + prmt.b32 %r28863, %r28862, %r28861, 28756; + cvt.u32.u16 %r28864, %rs597; + prmt.b32 %r28865, %r28864, %r28863, 1620; + cvt.u32.u16 %r28866, %rs598; + and.b32 %r28867, %r28866, 255; + cvt.u32.u16 %r28868, %rs603; + prmt.b32 %r28869, %r28868, %r28867, 30212; + cvt.u32.u16 %r28870, %rs599; + prmt.b32 %r28871, %r28870, %r28869, 28756; + cvt.u32.u16 %r28872, %rs605; + prmt.b32 %r28873, %r28872, %r28871, 1620; + cvt.u32.u16 %r28874, %rs600; + and.b32 %r28875, %r28874, 255; + cvt.u32.u16 %r28876, %rs607; + prmt.b32 %r28877, %r28876, %r28875, 30212; + cvt.u32.u16 %r28878, %rs601; + prmt.b32 %r28879, %r28878, %r28877, 28756; + cvt.u32.u16 %r28880, %rs609; + prmt.b32 %r28881, %r28880, %r28879, 1620; + cvt.u32.u16 %r28882, %rs610; + and.b32 %r28883, %r28882, 255; + cvt.u32.u16 %r28884, %rs615; + prmt.b32 %r28885, %r28884, %r28883, 30212; + cvt.u32.u16 %r28886, %rs611; + prmt.b32 %r28887, %r28886, %r28885, 28756; + cvt.u32.u16 %r28888, %rs617; + prmt.b32 %r28889, %r28888, %r28887, 1620; + cvt.u32.u16 %r28890, %rs612; + and.b32 %r28891, %r28890, 255; + cvt.u32.u16 %r28892, %rs619; + prmt.b32 %r28893, %r28892, %r28891, 30212; + cvt.u32.u16 %r28894, %rs613; + prmt.b32 %r28895, %r28894, %r28893, 28756; + cvt.u32.u16 %r28896, %rs621; + prmt.b32 %r28897, %r28896, %r28895, 1620; + cvt.u32.u16 %r28898, %rs622; + and.b32 %r28899, %r28898, 255; + cvt.u32.u16 %r28900, %rs627; + prmt.b32 %r28901, %r28900, %r28899, 30212; + cvt.u32.u16 %r28902, %rs623; + prmt.b32 %r28903, %r28902, %r28901, 28756; + cvt.u32.u16 %r28904, %rs629; + prmt.b32 %r28905, %r28904, %r28903, 1620; + cvt.u32.u16 %r28906, %rs624; + and.b32 %r28907, %r28906, 255; + cvt.u32.u16 %r28908, %rs631; + prmt.b32 %r28909, %r28908, %r28907, 30212; + cvt.u32.u16 %r28910, %rs625; + prmt.b32 %r28911, %r28910, %r28909, 28756; + cvt.u32.u16 %r28912, %rs633; + prmt.b32 %r28913, %r28912, %r28911, 1620; + cvt.u32.u16 %r28914, %rs634; + and.b32 %r28915, %r28914, 255; + cvt.u32.u16 %r28916, %rs639; + prmt.b32 %r28917, %r28916, %r28915, 30212; + cvt.u32.u16 %r28918, %rs635; + prmt.b32 %r28919, %r28918, %r28917, 28756; + cvt.u32.u16 %r28920, %rs641; + prmt.b32 %r28921, %r28920, %r28919, 1620; + cvt.u32.u16 %r28922, %rs636; + and.b32 %r28923, %r28922, 255; + cvt.u32.u16 %r28924, %rs643; + prmt.b32 %r28925, %r28924, %r28923, 30212; + cvt.u32.u16 %r28926, %rs637; + prmt.b32 %r28927, %r28926, %r28925, 28756; + cvt.u32.u16 %r28928, %rs645; + prmt.b32 %r28929, %r28928, %r28927, 1620; + cvt.u32.u16 %r28930, %rs646; + and.b32 %r28931, %r28930, 255; + cvt.u32.u16 %r28932, %rs651; + prmt.b32 %r28933, %r28932, %r28931, 30212; + cvt.u32.u16 %r28934, %rs647; + prmt.b32 %r28935, %r28934, %r28933, 28756; + cvt.u32.u16 %r28936, %rs653; + prmt.b32 %r28937, %r28936, %r28935, 1620; + cvt.u32.u16 %r28938, %rs648; + and.b32 %r28939, %r28938, 255; + ld.local.u8 %r28940, [%rd2+125]; + prmt.b32 %r28941, %r28940, %r28939, 30212; + cvt.u32.u16 %r28942, %rs655; + prmt.b32 %r28943, %r28942, %r28941, 28756; + cvt.u32.u16 %r28944, %rs656; + prmt.b32 %r28945, %r28944, %r28943, 1620; + ld.local.u32 %r28946, [%rd2+128]; + cvt.u32.u16 %r28947, %rs659; + prmt.b32 %r28948, %r28947, %r28833, 28756; + cvt.u32.u16 %r28949, %rs660; + prmt.b32 %r28950, %r28949, %r28948, 1620; + cvt.u32.u16 %r28951, %rs568; + cvt.u32.u16 %r28952, %rs664; + and.b32 %r28953, %r28952, 255; + add.s32 %r28954, %r28825, %r28817; + add.s32 %r28955, %r28954, %r28841; + add.s32 %r28956, %r28849, %r28955; + add.s32 %r28957, %r28826, 
%r28818; + add.s32 %r28958, %r28957, %r28857; + add.s32 %r28959, %r28865, %r28958; + add.s32 %r28960, %r28829, %r28821; + add.s32 %r28961, %r28960, %r28873; + xor.b32 %r28962, %r28961, %r28951; + shr.u32 %r28963, %r28961, 16; + shl.b32 %r28964, %r28962, 16; + or.b32 %r28965, %r28964, %r28963; + add.s32 %r28966, %r28965, 1013904242; + xor.b32 %r28967, %r28966, %r28829; + shf.l.wrap.b32 %r28968, %r28967, %r28967, 20; + add.s32 %r28969, %r28881, %r28961; + add.s32 %r28970, %r28969, %r28968; + xor.b32 %r28971, %r28970, %r28965; + shf.l.wrap.b32 %r28972, %r28971, %r28971, 24; + add.s32 %r28973, %r28972, %r28966; + xor.b32 %r28974, %r28973, %r28968; + shf.l.wrap.b32 %r28975, %r28974, %r28974, 25; + add.s32 %r28976, %r28830, %r28822; + add.s32 %r28977, %r28976, %r28889; + xor.b32 %r28978, %r28977, %r28953; + shr.u32 %r28979, %r28977, 16; + shl.b32 %r28980, %r28978, 16; + or.b32 %r28981, %r28980, %r28979; + add.s32 %r28982, %r28981, -1521486534; + xor.b32 %r28983, %r28982, %r28830; + shf.l.wrap.b32 %r28984, %r28983, %r28983, 20; + add.s32 %r28985, %r28897, %r28977; + add.s32 %r28986, %r28985, %r28984; + xor.b32 %r28987, %r28986, %r28981; + shf.l.wrap.b32 %r28988, %r28987, %r28987, 24; + add.s32 %r28989, %r28988, %r28982; + xor.b32 %r28990, %r28989, %r28984; + shf.l.wrap.b32 %r28991, %r28990, %r28990, 25; + add.s32 %r28992, %r28921, %r28975; + add.s32 %r28993, %r28991, %r28970; + add.s32 %r28994, %r28993, %r28937; + add.s32 %r28995, %r28945, %r28994; + add.s32 %r28996, %r28946, %r28986; + shf.l.wrap.b32 %r28997, %r28955, %r28955, 16; + add.s32 %r28998, %r28997, 1779033703; + xor.b32 %r28999, %r28998, %r28825; + shf.l.wrap.b32 %r29000, %r28999, %r28999, 20; + add.s32 %r29001, %r28956, %r29000; + xor.b32 %r29002, %r29001, %r28997; + shf.l.wrap.b32 %r29003, %r29002, %r29002, 24; + add.s32 %r29004, %r29003, %r28998; + xor.b32 %r29005, %r29004, %r29000; + shf.l.wrap.b32 %r29006, %r29005, %r29005, 25; + shf.l.wrap.b32 %r29007, %r28958, %r28958, 16; + add.s32 %r29008, %r29007, -1150833019; + xor.b32 %r29009, %r29008, %r28826; + shf.l.wrap.b32 %r29010, %r29009, %r29009, 20; + add.s32 %r29011, %r28959, %r29010; + xor.b32 %r29012, %r29011, %r29007; + shf.l.wrap.b32 %r29013, %r29012, %r29012, 24; + add.s32 %r29014, %r29013, %r29008; + xor.b32 %r29015, %r29014, %r29010; + shf.l.wrap.b32 %r29016, %r29015, %r29015, 25; + add.s32 %r29017, %r29001, %r28905; + add.s32 %r29018, %r29017, %r29016; + xor.b32 %r29019, %r29018, %r28988; + shf.l.wrap.b32 %r29020, %r29019, %r29019, 16; + add.s32 %r29021, %r29020, %r28973; + xor.b32 %r29022, %r29021, %r29016; + shf.l.wrap.b32 %r29023, %r29022, %r29022, 20; + add.s32 %r29024, %r29018, %r28913; + add.s32 %r29025, %r29024, %r29023; + xor.b32 %r29026, %r29025, %r29020; + shf.l.wrap.b32 %r29027, %r29026, %r29026, 24; + add.s32 %r29028, %r29027, %r29021; + xor.b32 %r29029, %r29028, %r29023; + shf.l.wrap.b32 %r29030, %r29029, %r29029, 25; + add.s32 %r29031, %r28992, %r29011; + xor.b32 %r29032, %r29003, %r29031; + shf.l.wrap.b32 %r29033, %r29032, %r29032, 16; + add.s32 %r29034, %r29033, %r28989; + xor.b32 %r29035, %r29034, %r28975; + shf.l.wrap.b32 %r29036, %r29035, %r29035, 20; + add.s32 %r29037, %r29031, %r28929; + add.s32 %r29038, %r29037, %r29036; + xor.b32 %r29039, %r29038, %r29033; + shf.l.wrap.b32 %r29040, %r29039, %r29039, 24; + add.s32 %r29041, %r29040, %r29034; + xor.b32 %r29042, %r29041, %r29036; + shf.l.wrap.b32 %r29043, %r29042, %r29042, 25; + xor.b32 %r29044, %r29013, %r28994; + shf.l.wrap.b32 %r29045, %r29044, %r29044, 16; + add.s32 %r29046, %r29045, %r29004; + 
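+ // annotation (not compiler output): this is the $L__BB2_95 path, taken when the
+ // chaining-value stack count loaded from [%rd2+144] is zero. The eight words
+ // stored into the hasher earlier (1779033703 = 0x6A09E667 ... 1541459225 =
+ // 0x5BE0CD19) are the BLAKE3 IV (identical to the SHA-256 initial hash words),
+ // and the same constants reappear inline here as v[8..11] of the compression
+ // state. The earlier call target _Z20blake3_hasher_updateP13blake3_hasherPKvy
+ // demangles to blake3_hasher_update(blake3_hasher*, const void*, unsigned long
+ // long). The add/xor/shf.l.wrap quads from here on are unrolled BLAKE3 G
+ // functions: left-rotates by 16, 20, 24, 25 are the spec's right-rotates
+ // 16, 12, 8, 7, and the or.b16 ..., 10 above sets CHUNK_END|ROOT domain flags.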
xor.b32 %r29047, %r29046, %r28991; + shf.l.wrap.b32 %r29048, %r29047, %r29047, 20; + add.s32 %r29049, %r28995, %r29048; + xor.b32 %r29050, %r29049, %r29045; + shf.l.wrap.b32 %r29051, %r29050, %r29050, 24; + add.s32 %r29052, %r29051, %r29046; + xor.b32 %r29053, %r29052, %r29048; + shf.l.wrap.b32 %r29054, %r29053, %r29053, 25; + add.s32 %r29055, %r28996, %r29006; + xor.b32 %r29056, %r29055, %r28972; + shf.l.wrap.b32 %r29057, %r29056, %r29056, 16; + add.s32 %r29058, %r29057, %r29014; + xor.b32 %r29059, %r29058, %r29006; + shf.l.wrap.b32 %r29060, %r29059, %r29059, 20; + add.s32 %r29061, %r29055, %r28950; + add.s32 %r29062, %r29061, %r29060; + xor.b32 %r29063, %r29062, %r29057; + shf.l.wrap.b32 %r29064, %r29063, %r29063, 24; + add.s32 %r29065, %r29064, %r29058; + xor.b32 %r29066, %r29065, %r29060; + shf.l.wrap.b32 %r29067, %r29066, %r29066, 25; + add.s32 %r29068, %r29025, %r28857; + add.s32 %r29069, %r29068, %r29067; + xor.b32 %r29070, %r29069, %r29040; + shf.l.wrap.b32 %r29071, %r29070, %r29070, 16; + add.s32 %r29072, %r29071, %r29052; + xor.b32 %r29073, %r29072, %r29067; + shf.l.wrap.b32 %r29074, %r29073, %r29073, 20; + add.s32 %r29075, %r29069, %r28889; + add.s32 %r29076, %r29075, %r29074; + xor.b32 %r29077, %r29076, %r29071; + shf.l.wrap.b32 %r29078, %r29077, %r29077, 24; + add.s32 %r29079, %r29078, %r29072; + xor.b32 %r29080, %r29079, %r29074; + shf.l.wrap.b32 %r29081, %r29080, %r29080, 25; + add.s32 %r29082, %r29038, %r28865; + add.s32 %r29083, %r29082, %r29030; + xor.b32 %r29084, %r29083, %r29051; + shf.l.wrap.b32 %r29085, %r29084, %r29084, 16; + add.s32 %r29086, %r29085, %r29065; + xor.b32 %r29087, %r29086, %r29030; + shf.l.wrap.b32 %r29088, %r29087, %r29087, 20; + add.s32 %r29089, %r29083, %r28921; + add.s32 %r29090, %r29089, %r29088; + xor.b32 %r29091, %r29090, %r29085; + shf.l.wrap.b32 %r29092, %r29091, %r29091, 24; + add.s32 %r29093, %r29092, %r29086; + xor.b32 %r29094, %r29093, %r29088; + shf.l.wrap.b32 %r29095, %r29094, %r29094, 25; + add.s32 %r29096, %r29049, %r28897; + add.s32 %r29097, %r29096, %r29043; + xor.b32 %r29098, %r29064, %r29097; + shf.l.wrap.b32 %r29099, %r29098, %r29098, 16; + add.s32 %r29100, %r29099, %r29028; + xor.b32 %r29101, %r29100, %r29043; + shf.l.wrap.b32 %r29102, %r29101, %r29101, 20; + add.s32 %r29103, %r29097, %r28841; + add.s32 %r29104, %r29103, %r29102; + xor.b32 %r29105, %r29104, %r29099; + shf.l.wrap.b32 %r29106, %r29105, %r29105, 24; + add.s32 %r29107, %r29106, %r29100; + xor.b32 %r29108, %r29107, %r29102; + shf.l.wrap.b32 %r29109, %r29108, %r29108, 25; + add.s32 %r29110, %r29062, %r28873; + add.s32 %r29111, %r29110, %r29054; + xor.b32 %r29112, %r29027, %r29111; + shf.l.wrap.b32 %r29113, %r29112, %r29112, 16; + add.s32 %r29114, %r29113, %r29041; + xor.b32 %r29115, %r29114, %r29054; + shf.l.wrap.b32 %r29116, %r29115, %r29115, 20; + add.s32 %r29117, %r29111, %r28945; + add.s32 %r29118, %r29117, %r29116; + xor.b32 %r29119, %r29118, %r29113; + shf.l.wrap.b32 %r29120, %r29119, %r29119, 24; + add.s32 %r29121, %r29120, %r29114; + xor.b32 %r29122, %r29121, %r29116; + shf.l.wrap.b32 %r29123, %r29122, %r29122, 25; + add.s32 %r29124, %r29076, %r28849; + add.s32 %r29125, %r29124, %r29095; + xor.b32 %r29126, %r29125, %r29120; + shf.l.wrap.b32 %r29127, %r29126, %r29126, 16; + add.s32 %r29128, %r29127, %r29107; + xor.b32 %r29129, %r29128, %r29095; + shf.l.wrap.b32 %r29130, %r29129, %r29129, 20; + add.s32 %r29131, %r29125, %r28929; + add.s32 %r29132, %r29131, %r29130; + xor.b32 %r29133, %r29132, %r29127; + shf.l.wrap.b32 %r29134, %r29133, %r29133, 24; + add.s32 
%r29135, %r29134, %r29128; + xor.b32 %r29136, %r29135, %r29130; + shf.l.wrap.b32 %r29137, %r29136, %r29136, 25; + add.s32 %r29138, %r29109, %r28937; + add.s32 %r29139, %r29138, %r29090; + xor.b32 %r29140, %r29078, %r29139; + shf.l.wrap.b32 %r29141, %r29140, %r29140, 16; + add.s32 %r29142, %r29141, %r29121; + xor.b32 %r29143, %r29142, %r29109; + shf.l.wrap.b32 %r29144, %r29143, %r29143, 20; + add.s32 %r29145, %r29139, %r28881; + add.s32 %r29146, %r29145, %r29144; + xor.b32 %r29147, %r29146, %r29141; + shf.l.wrap.b32 %r29148, %r29147, %r29147, 24; + add.s32 %r29149, %r29148, %r29142; + xor.b32 %r29150, %r29149, %r29144; + shf.l.wrap.b32 %r29151, %r29150, %r29150, 25; + add.s32 %r29152, %r29104, %r28913; + add.s32 %r29153, %r29152, %r29123; + xor.b32 %r29154, %r29092, %r29153; + shf.l.wrap.b32 %r29155, %r29154, %r29154, 16; + add.s32 %r29156, %r29155, %r29079; + xor.b32 %r29157, %r29156, %r29123; + shf.l.wrap.b32 %r29158, %r29157, %r29157, 20; + add.s32 %r29159, %r29153, %r28946; + add.s32 %r29160, %r29159, %r29158; + xor.b32 %r29161, %r29160, %r29155; + shf.l.wrap.b32 %r29162, %r29161, %r29161, 24; + add.s32 %r29163, %r29162, %r29156; + xor.b32 %r29164, %r29163, %r29158; + shf.l.wrap.b32 %r29165, %r29164, %r29164, 25; + add.s32 %r29166, %r29118, %r28950; + add.s32 %r29167, %r29166, %r29081; + xor.b32 %r29168, %r29167, %r29106; + shf.l.wrap.b32 %r29169, %r29168, %r29168, 16; + add.s32 %r29170, %r29169, %r29093; + xor.b32 %r29171, %r29170, %r29081; + shf.l.wrap.b32 %r29172, %r29171, %r29171, 20; + add.s32 %r29173, %r29167, %r28905; + add.s32 %r29174, %r29173, %r29172; + xor.b32 %r29175, %r29174, %r29169; + shf.l.wrap.b32 %r29176, %r29175, %r29175, 24; + add.s32 %r29177, %r29176, %r29170; + xor.b32 %r29178, %r29177, %r29172; + shf.l.wrap.b32 %r29179, %r29178, %r29178, 25; + add.s32 %r29180, %r29132, %r28865; + add.s32 %r29181, %r29180, %r29179; + xor.b32 %r29182, %r29181, %r29148; + shf.l.wrap.b32 %r29183, %r29182, %r29182, 16; + add.s32 %r29184, %r29183, %r29163; + xor.b32 %r29185, %r29184, %r29179; + shf.l.wrap.b32 %r29186, %r29185, %r29185, 20; + add.s32 %r29187, %r29181, %r28873; + add.s32 %r29188, %r29187, %r29186; + xor.b32 %r29189, %r29188, %r29183; + shf.l.wrap.b32 %r29190, %r29189, %r29189, 24; + add.s32 %r29191, %r29190, %r29184; + xor.b32 %r29192, %r29191, %r29186; + shf.l.wrap.b32 %r29193, %r29192, %r29192, 25; + add.s32 %r29194, %r29146, %r28921; + add.s32 %r29195, %r29194, %r29137; + xor.b32 %r29196, %r29195, %r29162; + shf.l.wrap.b32 %r29197, %r29196, %r29196, 16; + add.s32 %r29198, %r29197, %r29177; + xor.b32 %r29199, %r29198, %r29137; + shf.l.wrap.b32 %r29200, %r29199, %r29199, 20; + add.s32 %r29201, %r29195, %r28937; + add.s32 %r29202, %r29201, %r29200; + xor.b32 %r29203, %r29202, %r29197; + shf.l.wrap.b32 %r29204, %r29203, %r29203, 24; + add.s32 %r29205, %r29204, %r29198; + xor.b32 %r29206, %r29205, %r29200; + shf.l.wrap.b32 %r29207, %r29206, %r29206, 25; + add.s32 %r29208, %r29160, %r28945; + add.s32 %r29209, %r29208, %r29151; + xor.b32 %r29210, %r29176, %r29209; + shf.l.wrap.b32 %r29211, %r29210, %r29210, 16; + add.s32 %r29212, %r29211, %r29135; + xor.b32 %r29213, %r29212, %r29151; + shf.l.wrap.b32 %r29214, %r29213, %r29213, 20; + add.s32 %r29215, %r29209, %r28857; + add.s32 %r29216, %r29215, %r29214; + xor.b32 %r29217, %r29216, %r29211; + shf.l.wrap.b32 %r29218, %r29217, %r29217, 24; + add.s32 %r29219, %r29218, %r29212; + xor.b32 %r29220, %r29219, %r29214; + shf.l.wrap.b32 %r29221, %r29220, %r29220, 25; + add.s32 %r29222, %r29174, %r28897; + add.s32 %r29223, %r29222, 
%r29165; + xor.b32 %r29224, %r29134, %r29223; + shf.l.wrap.b32 %r29225, %r29224, %r29224, 16; + add.s32 %r29226, %r29225, %r29149; + xor.b32 %r29227, %r29226, %r29165; + shf.l.wrap.b32 %r29228, %r29227, %r29227, 20; + add.s32 %r29229, %r29223, %r28946; + add.s32 %r29230, %r29229, %r29228; + xor.b32 %r29231, %r29230, %r29225; + shf.l.wrap.b32 %r29232, %r29231, %r29231, 24; + add.s32 %r29233, %r29232, %r29226; + xor.b32 %r29234, %r29233, %r29228; + shf.l.wrap.b32 %r29235, %r29234, %r29234, 25; + add.s32 %r29236, %r29188, %r28889; + add.s32 %r29237, %r29236, %r29207; + xor.b32 %r29238, %r29237, %r29232; + shf.l.wrap.b32 %r29239, %r29238, %r29238, 16; + add.s32 %r29240, %r29239, %r29219; + xor.b32 %r29241, %r29240, %r29207; + shf.l.wrap.b32 %r29242, %r29241, %r29241, 20; + add.s32 %r29243, %r29237, %r28881; + add.s32 %r29244, %r29243, %r29242; + xor.b32 %r29245, %r29244, %r29239; + shf.l.wrap.b32 %r29246, %r29245, %r29245, 24; + add.s32 %r29247, %r29246, %r29240; + xor.b32 %r29248, %r29247, %r29242; + shf.l.wrap.b32 %r29249, %r29248, %r29248, 25; + add.s32 %r29250, %r29221, %r28913; + add.s32 %r29251, %r29250, %r29202; + xor.b32 %r29252, %r29190, %r29251; + shf.l.wrap.b32 %r29253, %r29252, %r29252, 16; + add.s32 %r29254, %r29253, %r29233; + xor.b32 %r29255, %r29254, %r29221; + shf.l.wrap.b32 %r29256, %r29255, %r29255, 20; + add.s32 %r29257, %r29251, %r28841; + add.s32 %r29258, %r29257, %r29256; + xor.b32 %r29259, %r29258, %r29253; + shf.l.wrap.b32 %r29260, %r29259, %r29259, 24; + add.s32 %r29261, %r29260, %r29254; + xor.b32 %r29262, %r29261, %r29256; + shf.l.wrap.b32 %r29263, %r29262, %r29262, 25; + add.s32 %r29264, %r29216, %r28929; + add.s32 %r29265, %r29264, %r29235; + xor.b32 %r29266, %r29204, %r29265; + shf.l.wrap.b32 %r29267, %r29266, %r29266, 16; + add.s32 %r29268, %r29267, %r29191; + xor.b32 %r29269, %r29268, %r29235; + shf.l.wrap.b32 %r29270, %r29269, %r29269, 20; + add.s32 %r29271, %r29265, %r28950; + add.s32 %r29272, %r29271, %r29270; + xor.b32 %r29273, %r29272, %r29267; + shf.l.wrap.b32 %r29274, %r29273, %r29273, 24; + add.s32 %r29275, %r29274, %r29268; + xor.b32 %r29276, %r29275, %r29270; + shf.l.wrap.b32 %r29277, %r29276, %r29276, 25; + add.s32 %r29278, %r29230, %r28905; + add.s32 %r29279, %r29278, %r29193; + xor.b32 %r29280, %r29279, %r29218; + shf.l.wrap.b32 %r29281, %r29280, %r29280, 16; + add.s32 %r29282, %r29281, %r29205; + xor.b32 %r29283, %r29282, %r29193; + shf.l.wrap.b32 %r29284, %r29283, %r29283, 20; + add.s32 %r29285, %r29279, %r28849; + add.s32 %r29286, %r29285, %r29284; + xor.b32 %r29287, %r29286, %r29281; + shf.l.wrap.b32 %r29288, %r29287, %r29287, 24; + add.s32 %r29289, %r29288, %r29282; + xor.b32 %r29290, %r29289, %r29284; + shf.l.wrap.b32 %r29291, %r29290, %r29290, 25; + add.s32 %r29292, %r29244, %r28921; + add.s32 %r29293, %r29292, %r29291; + xor.b32 %r29294, %r29293, %r29260; + shf.l.wrap.b32 %r29295, %r29294, %r29294, 16; + add.s32 %r29296, %r29295, %r29275; + xor.b32 %r29297, %r29296, %r29291; + shf.l.wrap.b32 %r29298, %r29297, %r29297, 20; + add.s32 %r29299, %r29293, %r28897; + add.s32 %r29300, %r29299, %r29298; + xor.b32 %r29301, %r29300, %r29295; + shf.l.wrap.b32 %r29302, %r29301, %r29301, 24; + add.s32 %r29303, %r29302, %r29296; + xor.b32 %r29304, %r29303, %r29298; + shf.l.wrap.b32 %r29305, %r29304, %r29304, 25; + add.s32 %r29306, %r29258, %r28937; + add.s32 %r29307, %r29306, %r29249; + xor.b32 %r29308, %r29307, %r29274; + shf.l.wrap.b32 %r29309, %r29308, %r29308, 16; + add.s32 %r29310, %r29309, %r29289; + xor.b32 %r29311, %r29310, %r29249; + 
shf.l.wrap.b32 %r29312, %r29311, %r29311, 20; + add.s32 %r29313, %r29307, %r28913; + add.s32 %r29314, %r29313, %r29312; + xor.b32 %r29315, %r29314, %r29309; + shf.l.wrap.b32 %r29316, %r29315, %r29315, 24; + add.s32 %r29317, %r29316, %r29310; + xor.b32 %r29318, %r29317, %r29312; + shf.l.wrap.b32 %r29319, %r29318, %r29318, 25; + add.s32 %r29320, %r29272, %r28946; + add.s32 %r29321, %r29320, %r29263; + xor.b32 %r29322, %r29288, %r29321; + shf.l.wrap.b32 %r29323, %r29322, %r29322, 16; + add.s32 %r29324, %r29323, %r29247; + xor.b32 %r29325, %r29324, %r29263; + shf.l.wrap.b32 %r29326, %r29325, %r29325, 20; + add.s32 %r29327, %r29321, %r28865; + add.s32 %r29328, %r29327, %r29326; + xor.b32 %r29329, %r29328, %r29323; + shf.l.wrap.b32 %r29330, %r29329, %r29329, 24; + add.s32 %r29331, %r29330, %r29324; + xor.b32 %r29332, %r29331, %r29326; + shf.l.wrap.b32 %r29333, %r29332, %r29332, 25; + add.s32 %r29334, %r29286, %r28945; + add.s32 %r29335, %r29334, %r29277; + xor.b32 %r29336, %r29246, %r29335; + shf.l.wrap.b32 %r29337, %r29336, %r29336, 16; + add.s32 %r29338, %r29337, %r29261; + xor.b32 %r29339, %r29338, %r29277; + shf.l.wrap.b32 %r29340, %r29339, %r29339, 20; + add.s32 %r29341, %r29335, %r28950; + add.s32 %r29342, %r29341, %r29340; + xor.b32 %r29343, %r29342, %r29337; + shf.l.wrap.b32 %r29344, %r29343, %r29343, 24; + add.s32 %r29345, %r29344, %r29338; + xor.b32 %r29346, %r29345, %r29340; + shf.l.wrap.b32 %r29347, %r29346, %r29346, 25; + add.s32 %r29348, %r29300, %r28873; + add.s32 %r29349, %r29348, %r29319; + xor.b32 %r29350, %r29349, %r29344; + shf.l.wrap.b32 %r29351, %r29350, %r29350, 16; + add.s32 %r29352, %r29351, %r29331; + xor.b32 %r29353, %r29352, %r29319; + shf.l.wrap.b32 %r29354, %r29353, %r29353, 20; + add.s32 %r29355, %r29349, %r28841; + add.s32 %r29356, %r29355, %r29354; + xor.b32 %r29357, %r29356, %r29351; + shf.l.wrap.b32 %r29358, %r29357, %r29357, 24; + add.s32 %r29359, %r29358, %r29352; + xor.b32 %r29360, %r29359, %r29354; + shf.l.wrap.b32 %r29361, %r29360, %r29360, 25; + add.s32 %r29362, %r29333, %r28929; + add.s32 %r29363, %r29362, %r29314; + xor.b32 %r29364, %r29302, %r29363; + shf.l.wrap.b32 %r29365, %r29364, %r29364, 16; + add.s32 %r29366, %r29365, %r29345; + xor.b32 %r29367, %r29366, %r29333; + shf.l.wrap.b32 %r29368, %r29367, %r29367, 20; + add.s32 %r29369, %r29363, %r28857; + add.s32 %r29370, %r29369, %r29368; + xor.b32 %r29371, %r29370, %r29365; + shf.l.wrap.b32 %r29372, %r29371, %r29371, 24; + add.s32 %r29373, %r29372, %r29366; + xor.b32 %r29374, %r29373, %r29368; + shf.l.wrap.b32 %r29375, %r29374, %r29374, 25; + add.s32 %r29376, %r29328, %r28881; + add.s32 %r29377, %r29376, %r29347; + xor.b32 %r29378, %r29316, %r29377; + shf.l.wrap.b32 %r29379, %r29378, %r29378, 16; + add.s32 %r29380, %r29379, %r29303; + xor.b32 %r29381, %r29380, %r29347; + shf.l.wrap.b32 %r29382, %r29381, %r29381, 20; + add.s32 %r29383, %r29377, %r28905; + add.s32 %r29384, %r29383, %r29382; + xor.b32 %r29385, %r29384, %r29379; + shf.l.wrap.b32 %r29386, %r29385, %r29385, 24; + add.s32 %r29387, %r29386, %r29380; + xor.b32 %r29388, %r29387, %r29382; + shf.l.wrap.b32 %r29389, %r29388, %r29388, 25; + add.s32 %r29390, %r29342, %r28849; + add.s32 %r29391, %r29390, %r29305; + xor.b32 %r29392, %r29391, %r29330; + shf.l.wrap.b32 %r29393, %r29392, %r29392, 16; + add.s32 %r29394, %r29393, %r29317; + xor.b32 %r29395, %r29394, %r29305; + shf.l.wrap.b32 %r29396, %r29395, %r29395, 20; + add.s32 %r29397, %r29391, %r28889; + add.s32 %r29398, %r29397, %r29396; + xor.b32 %r29399, %r29398, %r29393; + shf.l.wrap.b32 %r29400, 
%r29399, %r29399, 24; + add.s32 %r29401, %r29400, %r29394; + xor.b32 %r29402, %r29401, %r29396; + shf.l.wrap.b32 %r29403, %r29402, %r29402, 25; + add.s32 %r29404, %r29356, %r28937; + add.s32 %r29405, %r29404, %r29403; + xor.b32 %r29406, %r29405, %r29372; + shf.l.wrap.b32 %r29407, %r29406, %r29406, 16; + add.s32 %r29408, %r29407, %r29387; + xor.b32 %r29409, %r29408, %r29403; + shf.l.wrap.b32 %r29410, %r29409, %r29409, 20; + add.s32 %r29411, %r29405, %r28945; + add.s32 %r29412, %r29411, %r29410; + xor.b32 %r29413, %r29412, %r29407; + shf.l.wrap.b32 %r29414, %r29413, %r29413, 24; + add.s32 %r29415, %r29414, %r29408; + xor.b32 %r29416, %r29415, %r29410; + shf.l.wrap.b32 %r29417, %r29416, %r29416, 25; + add.s32 %r29418, %r29370, %r28913; + add.s32 %r29419, %r29418, %r29361; + xor.b32 %r29420, %r29419, %r29386; + shf.l.wrap.b32 %r29421, %r29420, %r29420, 16; + add.s32 %r29422, %r29421, %r29401; + xor.b32 %r29423, %r29422, %r29361; + shf.l.wrap.b32 %r29424, %r29423, %r29423, 20; + add.s32 %r29425, %r29419, %r28929; + add.s32 %r29426, %r29425, %r29424; + xor.b32 %r29427, %r29426, %r29421; + shf.l.wrap.b32 %r29428, %r29427, %r29427, 24; + add.s32 %r29429, %r29428, %r29422; + xor.b32 %r29430, %r29429, %r29424; + shf.l.wrap.b32 %r29431, %r29430, %r29430, 25; + add.s32 %r29432, %r29384, %r28950; + add.s32 %r29433, %r29432, %r29375; + xor.b32 %r29434, %r29400, %r29433; + shf.l.wrap.b32 %r29435, %r29434, %r29434, 16; + add.s32 %r29436, %r29435, %r29359; + xor.b32 %r29437, %r29436, %r29375; + shf.l.wrap.b32 %r29438, %r29437, %r29437, 20; + add.s32 %r29439, %r29433, %r28921; + add.s32 %r29440, %r29439, %r29438; + xor.b32 %r29441, %r29440, %r29435; + shf.l.wrap.b32 %r29442, %r29441, %r29441, 24; + add.s32 %r29443, %r29442, %r29436; + xor.b32 %r29444, %r29443, %r29438; + shf.l.wrap.b32 %r29445, %r29444, %r29444, 25; + add.s32 %r29446, %r29398, %r28946; + add.s32 %r29447, %r29446, %r29389; + xor.b32 %r29448, %r29358, %r29447; + shf.l.wrap.b32 %r29449, %r29448, %r29448, 16; + add.s32 %r29450, %r29449, %r29373; + xor.b32 %r29451, %r29450, %r29389; + shf.l.wrap.b32 %r29452, %r29451, %r29451, 20; + add.s32 %r29453, %r29447, %r28905; + add.s32 %r29454, %r29453, %r29452; + xor.b32 %r29455, %r29454, %r29449; + shf.l.wrap.b32 %r29456, %r29455, %r29455, 24; + add.s32 %r29457, %r29456, %r29450; + xor.b32 %r29458, %r29457, %r29452; + shf.l.wrap.b32 %r29459, %r29458, %r29458, 25; + add.s32 %r29460, %r29412, %r28897; + add.s32 %r29461, %r29460, %r29431; + xor.b32 %r29462, %r29461, %r29456; + shf.l.wrap.b32 %r29463, %r29462, %r29462, 16; + add.s32 %r29464, %r29463, %r29443; + xor.b32 %r29465, %r29464, %r29431; + shf.l.wrap.b32 %r29466, %r29465, %r29465, 20; + add.s32 %r29467, %r29461, %r28857; + add.s32 %r29468, %r29467, %r29466; + xor.b32 %r29469, %r29468, %r29463; + shf.l.wrap.b32 %r29470, %r29469, %r29469, 24; + add.s32 %r29471, %r29470, %r29464; + xor.b32 %r29472, %r29471, %r29466; + shf.l.wrap.b32 %r29473, %r29472, %r29472, 25; + add.s32 %r29474, %r29445, %r28881; + add.s32 %r29475, %r29474, %r29426; + xor.b32 %r29476, %r29414, %r29475; + shf.l.wrap.b32 %r29477, %r29476, %r29476, 16; + add.s32 %r29478, %r29477, %r29457; + xor.b32 %r29479, %r29478, %r29445; + shf.l.wrap.b32 %r29480, %r29479, %r29479, 20; + add.s32 %r29481, %r29475, %r28865; + add.s32 %r29482, %r29481, %r29480; + xor.b32 %r29483, %r29482, %r29477; + shf.l.wrap.b32 %r29484, %r29483, %r29483, 24; + add.s32 %r29485, %r29484, %r29478; + xor.b32 %r29486, %r29485, %r29480; + shf.l.wrap.b32 %r29487, %r29486, %r29486, 25; + add.s32 %r29488, %r29440, %r28841; + 
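+ // annotation (not compiler output): BLAKE3 compression runs 7 such rounds;
+ // each unrolled round repeats the same eight G applications, differing only in
+ // which of the sixteen message words (%r28841 ... %r28950, assembled from
+ // bytes via prmt.b32 above) feed each call, per BLAKE3's fixed message
+ // permutation.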
add.s32 %r29489, %r29488, %r29459; + xor.b32 %r29490, %r29428, %r29489; + shf.l.wrap.b32 %r29491, %r29490, %r29490, 16; + add.s32 %r29492, %r29491, %r29415; + xor.b32 %r29493, %r29492, %r29459; + shf.l.wrap.b32 %r29494, %r29493, %r29493, 20; + add.s32 %r29495, %r29489, %r28849; + add.s32 %r29496, %r29495, %r29494; + xor.b32 %r29497, %r29496, %r29491; + shf.l.wrap.b32 %r29498, %r29497, %r29497, 24; + add.s32 %r29499, %r29498, %r29492; + xor.b32 %r29500, %r29499, %r29494; + shf.l.wrap.b32 %r29501, %r29500, %r29500, 25; + add.s32 %r29502, %r29454, %r28889; + add.s32 %r29503, %r29502, %r29417; + xor.b32 %r29504, %r29503, %r29442; + shf.l.wrap.b32 %r29505, %r29504, %r29504, 16; + add.s32 %r29506, %r29505, %r29429; + xor.b32 %r29507, %r29506, %r29417; + shf.l.wrap.b32 %r29508, %r29507, %r29507, 20; + add.s32 %r29509, %r29503, %r28873; + add.s32 %r29510, %r29509, %r29508; + xor.b32 %r29511, %r29510, %r29505; + shf.l.wrap.b32 %r29512, %r29511, %r29511, 24; + add.s32 %r29513, %r29512, %r29506; + xor.b32 %r29514, %r29513, %r29508; + shf.l.wrap.b32 %r29515, %r29514, %r29514, 25; + add.s32 %r29516, %r29468, %r28913; + add.s32 %r29517, %r29516, %r29515; + xor.b32 %r29518, %r29517, %r29484; + shf.l.wrap.b32 %r29519, %r29518, %r29518, 16; + add.s32 %r29520, %r29519, %r29499; + xor.b32 %r29521, %r29520, %r29515; + shf.l.wrap.b32 %r29522, %r29521, %r29521, 20; + add.s32 %r29523, %r29517, %r28946; + add.s32 %r29524, %r29523, %r29522; + xor.b32 %r29525, %r29524, %r29519; + shf.l.wrap.b32 %r29526, %r29525, %r29525, 24; + add.s32 %r29527, %r29526, %r29520; + xor.b32 %r29528, %r29527, %r29522; + shf.l.wrap.b32 %r29529, %r29528, %r29528, 25; + add.s32 %r29530, %r29482, %r28929; + add.s32 %r29531, %r29530, %r29473; + xor.b32 %r29532, %r29531, %r29498; + shf.l.wrap.b32 %r29533, %r29532, %r29532, 16; + add.s32 %r29534, %r29533, %r29513; + xor.b32 %r29535, %r29534, %r29473; + shf.l.wrap.b32 %r29536, %r29535, %r29535, 20; + add.s32 %r29537, %r29531, %r28881; + add.s32 %r29538, %r29537, %r29536; + xor.b32 %r29539, %r29538, %r29533; + shf.l.wrap.b32 %r29540, %r29539, %r29539, 24; + add.s32 %r29541, %r29540, %r29534; + xor.b32 %r29542, %r29541, %r29536; + shf.l.wrap.b32 %r29543, %r29542, %r29542, 25; + add.s32 %r29544, %r29496, %r28905; + add.s32 %r29545, %r29544, %r29487; + xor.b32 %r29546, %r29512, %r29545; + shf.l.wrap.b32 %r29547, %r29546, %r29546, 16; + add.s32 %r29548, %r29547, %r29471; + xor.b32 %r29549, %r29548, %r29487; + shf.l.wrap.b32 %r29550, %r29549, %r29549, 20; + add.s32 %r29551, %r29545, %r28937; + add.s32 %r29552, %r29551, %r29550; + xor.b32 %r29553, %r29552, %r29547; + shf.l.wrap.b32 %r29554, %r29553, %r29553, 24; + add.s32 %r29555, %r29554, %r29548; + xor.b32 %r29556, %r29555, %r29550; + shf.l.wrap.b32 %r29557, %r29556, %r29556, 25; + add.s32 %r29558, %r29510, %r28950; + add.s32 %r29559, %r29558, %r29501; + xor.b32 %r29560, %r29470, %r29559; + shf.l.wrap.b32 %r29561, %r29560, %r29560, 16; + add.s32 %r29562, %r29561, %r29485; + xor.b32 %r29563, %r29562, %r29501; + shf.l.wrap.b32 %r29564, %r29563, %r29563, 20; + add.s32 %r29565, %r29559, %r28849; + add.s32 %r29566, %r29565, %r29564; + xor.b32 %r29567, %r29566, %r29561; + shf.l.wrap.b32 %r29568, %r29567, %r29567, 24; + add.s32 %r29569, %r29568, %r29562; + xor.b32 %r29570, %r29569, %r29564; + shf.l.wrap.b32 %r29571, %r29570, %r29570, 25; + add.s32 %r29572, %r29524, %r28945; + add.s32 %r29573, %r29572, %r29543; + xor.b32 %r29574, %r29573, %r29568; + shf.l.wrap.b32 %r29575, %r29574, %r29574, 16; + add.s32 %r29576, %r29575, %r29555; + xor.b32 %r29577, 
%r29576, %r29543; + shf.l.wrap.b32 %r29578, %r29577, %r29577, 20; + add.s32 %r29579, %r29573, %r28865; + add.s32 %r29580, %r29579, %r29578; + xor.b32 %r29581, %r29580, %r29575; + shf.l.wrap.b32 %r29582, %r29581, %r29581, 24; + add.s32 %r29583, %r29582, %r29576; + xor.b32 %r29584, %r29583, %r29578; + shf.l.wrap.b32 %r29585, %r29584, %r29584, 25; + add.s32 %r29586, %r29557, %r28841; + add.s32 %r29587, %r29586, %r29538; + xor.b32 %r29588, %r29526, %r29587; + shf.l.wrap.b32 %r29589, %r29588, %r29588, 16; + add.s32 %r29590, %r29589, %r29569; + xor.b32 %r29591, %r29590, %r29557; + shf.l.wrap.b32 %r29592, %r29591, %r29591, 20; + add.s32 %r29593, %r29587, %r28921; + add.s32 %r29594, %r29593, %r29592; + xor.b32 %r29595, %r29594, %r29589; + shf.l.wrap.b32 %r29596, %r29595, %r29595, 24; + add.s32 %r29597, %r29596, %r29590; + xor.b32 %r29598, %r29597, %r29592; + shf.l.wrap.b32 %r29599, %r29598, %r29598, 25; + add.s32 %r29600, %r29552, %r28857; + add.s32 %r29601, %r29600, %r29571; + xor.b32 %r29602, %r29540, %r29601; + shf.l.wrap.b32 %r29603, %r29602, %r29602, 16; + add.s32 %r29604, %r29603, %r29527; + xor.b32 %r29605, %r29604, %r29571; + shf.l.wrap.b32 %r29606, %r29605, %r29605, 20; + add.s32 %r29607, %r29601, %r28889; + add.s32 %r29608, %r29607, %r29606; + xor.b32 %r29609, %r29608, %r29603; + shf.l.wrap.b32 %r29610, %r29609, %r29609, 24; + add.s32 %r29611, %r29610, %r29604; + xor.b32 %r29612, %r29611, %r29606; + shf.l.wrap.b32 %r29613, %r29612, %r29612, 25; + add.s32 %r29614, %r29566, %r28873; + add.s32 %r29615, %r29614, %r29529; + xor.b32 %r29616, %r29615, %r29554; + shf.l.wrap.b32 %r29617, %r29616, %r29616, 16; + add.s32 %r29618, %r29617, %r29541; + xor.b32 %r29619, %r29618, %r29529; + shf.l.wrap.b32 %r29620, %r29619, %r29619, 20; + add.s32 %r29621, %r29615, %r28897; + add.s32 %r29622, %r29621, %r29620; + xor.b32 %r29623, %r29622, %r29617; + shf.l.wrap.b32 %r29624, %r29623, %r29623, 24; + add.s32 %r29625, %r29624, %r29618; + xor.b32 %r29626, %r29625, %r29620; + shf.l.wrap.b32 %r29627, %r29626, %r29626, 25; + add.s32 %r29628, %r29580, %r28929; + add.s32 %r29629, %r29628, %r29627; + xor.b32 %r29630, %r29629, %r29596; + shf.l.wrap.b32 %r29631, %r29630, %r29630, 16; + add.s32 %r29632, %r29631, %r29611; + xor.b32 %r29633, %r29632, %r29627; + shf.l.wrap.b32 %r29634, %r29633, %r29633, 20; + add.s32 %r29635, %r29629, %r28950; + add.s32 %r29636, %r29635, %r29634; + xor.b32 %r29637, %r29636, %r29631; + shf.l.wrap.b32 %r29638, %r29637, %r29637, 24; + add.s32 %r29639, %r29638, %r29632; + xor.b32 %r29640, %r29639, %r29634; + shf.l.wrap.b32 %r29641, %r29640, %r29640, 25; + add.s32 %r29642, %r29594, %r28881; + add.s32 %r29643, %r29642, %r29585; + xor.b32 %r29644, %r29643, %r29610; + shf.l.wrap.b32 %r29645, %r29644, %r29644, 16; + add.s32 %r29646, %r29645, %r29625; + xor.b32 %r29647, %r29646, %r29585; + shf.l.wrap.b32 %r29648, %r29647, %r29647, 20; + add.s32 %r29649, %r29643, %r28841; + add.s32 %r29650, %r29649, %r29648; + xor.b32 %r29651, %r29650, %r29645; + shf.l.wrap.b32 %r29652, %r29651, %r29651, 24; + add.s32 %r29653, %r29652, %r29646; + xor.b32 %r29654, %r29653, %r29648; + shf.l.wrap.b32 %r29655, %r29654, %r29654, 25; + add.s32 %r29656, %r29608, %r28849; + add.s32 %r29657, %r29656, %r29599; + xor.b32 %r29658, %r29624, %r29657; + shf.l.wrap.b32 %r29659, %r29658, %r29658, 16; + add.s32 %r29660, %r29659, %r29583; + xor.b32 %r29661, %r29660, %r29599; + shf.l.wrap.b32 %r29662, %r29661, %r29661, 20; + add.s32 %r29663, %r29657, %r28913; + add.s32 %r29664, %r29663, %r29662; + xor.b32 %r29665, %r29664, %r29659; + 
shf.l.wrap.b32 %r29666, %r29665, %r29665, 24; + add.s32 %r29667, %r29666, %r29660; + xor.b32 %r29668, %r29667, %r29662; + shf.l.wrap.b32 %r29669, %r29668, %r29668, 25; + add.s32 %r29670, %r29622, %r28905; + add.s32 %r29671, %r29670, %r29613; + xor.b32 %r29672, %r29582, %r29671; + shf.l.wrap.b32 %r29673, %r29672, %r29672, 16; + add.s32 %r29674, %r29673, %r29597; + xor.b32 %r29675, %r29674, %r29613; + shf.l.wrap.b32 %r29676, %r29675, %r29675, 20; + add.s32 %r29677, %r29671, %r28889; + add.s32 %r29678, %r29677, %r29676; + xor.b32 %r29679, %r29678, %r29673; + shf.l.wrap.b32 %r29680, %r29679, %r29679, 24; + add.s32 %r29681, %r29680, %r29674; + xor.b32 %r29682, %r29681, %r29676; + shf.l.wrap.b32 %r29683, %r29682, %r29682, 25; + add.s32 %r29684, %r29636, %r28946; + add.s32 %r29685, %r29684, %r29655; + xor.b32 %r29686, %r29685, %r29680; + shf.l.wrap.b32 %r29687, %r29686, %r29686, 16; + add.s32 %r29688, %r29687, %r29667; + xor.b32 %r29689, %r29688, %r29655; + shf.l.wrap.b32 %r29690, %r29689, %r29689, 20; + add.s32 %r29691, %r29685, %r28921; + add.s32 %r29692, %r29691, %r29690; + xor.b32 %r29693, %r29692, %r29687; + shf.l.wrap.b32 %r29694, %r29693, %r29693, 24; + add.s32 %r29695, %r29694, %r29688; + xor.b32 %r29696, %r29695, %r29690; + shf.l.wrap.b32 %r29697, %r29696, %r29696, 25; + add.s32 %r29698, %r29669, %r28857; + add.s32 %r29699, %r29698, %r29650; + xor.b32 %r29700, %r29638, %r29699; + shf.l.wrap.b32 %r29701, %r29700, %r29700, 16; + add.s32 %r29702, %r29701, %r29681; + xor.b32 %r29703, %r29702, %r29669; + shf.l.wrap.b32 %r29704, %r29703, %r29703, 20; + add.s32 %r29705, %r29699, %r28937; + add.s32 %r29706, %r29705, %r29704; + xor.b32 %r29707, %r29706, %r29701; + shf.l.wrap.b32 %r29708, %r29707, %r29707, 24; + add.s32 %r29709, %r29708, %r29702; + xor.b32 %r29710, %r29709, %r29704; + shf.l.wrap.b32 %r29711, %r29710, %r29710, 25; + add.s32 %r29712, %r29664, %r28865; + add.s32 %r29713, %r29712, %r29683; + xor.b32 %r29714, %r29652, %r29713; + shf.l.wrap.b32 %r29715, %r29714, %r29714, 16; + add.s32 %r29716, %r29715, %r29639; + xor.b32 %r29717, %r29716, %r29683; + shf.l.wrap.b32 %r29718, %r29717, %r29717, 20; + add.s32 %r29719, %r29713, %r28873; + add.s32 %r29720, %r29719, %r29718; + xor.b32 %r29721, %r29720, %r29715; + shf.l.wrap.b32 %r29722, %r29721, %r29721, 24; + add.s32 %r29723, %r29722, %r29716; + xor.b32 %r29724, %r29723, %r29718; + shf.l.wrap.b32 %r29725, %r29724, %r29724, 25; + add.s32 %r29726, %r29678, %r28897; + add.s32 %r29727, %r29726, %r29641; + xor.b32 %r29728, %r29727, %r29666; + shf.l.wrap.b32 %r29729, %r29728, %r29728, 16; + add.s32 %r29730, %r29729, %r29653; + xor.b32 %r29731, %r29730, %r29641; + shf.l.wrap.b32 %r29732, %r29731, %r29731, 20; + add.s32 %r29733, %r29727, %r28945; + add.s32 %r29734, %r29733, %r29732; + xor.b32 %r29735, %r29734, %r29729; + shf.l.wrap.b32 %r29736, %r29735, %r29735, 24; + add.s32 %r29737, %r29736, %r29730; + xor.b32 %r29738, %r29737, %r29732; + shf.l.wrap.b32 %r29739, %r29738, %r29738, 25; + xor.b32 %r29740, %r29692, %r29723; + cvt.u64.u32 %rd1207, %r29740; + xor.b32 %r29741, %r29737, %r29706; + and.b32 %r29742, %r29741, 255; + cvt.u64.u32 %rd1208, %r29742; + cvt.u64.u32 %rd1209, %r29741; + shl.b64 %rd1210, %rd1209, 32; + and.b64 %rd1211, %rd1210, 280375465082880; + and.b64 %rd1212, %rd1210, 71776119061217280; + shr.u32 %r29743, %r29741, 24; + cvt.u64.u32 %rd1213, %r29743; + shl.b64 %rd1214, %rd1213, 56; + bfi.b64 %rd1215, %rd1208, %rd1207, 32, 32; + or.b64 %rd1216, %rd1215, %rd1211; + or.b64 %rd1217, %rd1216, %rd1212; + or.b64 %rd353, %rd1217, %rd1214; + 
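+ // annotation (not compiler output): BLAKE3 output finalization - each output
+ // word is v[i] ^ v[i+8]; the bfi.b64 and the and.b64 masks
+ // 0x0000FF0000000000 / 0x00FF000000000000 splice two such 32-bit words, byte
+ // by byte, into each 64-bit result (%rd353 here, then %rd352, %rd1370,
+ // %rd1369 below).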
xor.b32 %r29744, %r29695, %r29720; + cvt.u64.u32 %rd1218, %r29744; + xor.b32 %r29745, %r29734, %r29709; + and.b32 %r29746, %r29745, 255; + cvt.u64.u32 %rd1219, %r29746; + cvt.u64.u32 %rd1220, %r29745; + shl.b64 %rd1221, %rd1220, 32; + and.b64 %rd1222, %rd1221, 280375465082880; + and.b64 %rd1223, %rd1221, 71776119061217280; + shr.u32 %r29747, %r29745, 24; + cvt.u64.u32 %rd1224, %r29747; + shl.b64 %rd1225, %rd1224, 56; + bfi.b64 %rd1226, %rd1219, %rd1218, 32, 32; + or.b64 %rd1227, %rd1226, %rd1222; + or.b64 %rd1228, %rd1227, %rd1223; + or.b64 %rd352, %rd1228, %rd1225; + xor.b32 %r29748, %r29739, %r29708; + cvt.u64.u32 %rd1229, %r29748; + xor.b32 %r29749, %r29697, %r29722; + and.b32 %r29750, %r29749, 255; + cvt.u64.u32 %rd1230, %r29750; + cvt.u64.u32 %rd1231, %r29749; + shl.b64 %rd1232, %rd1231, 32; + and.b64 %rd1233, %rd1232, 280375465082880; + and.b64 %rd1234, %rd1232, 71776119061217280; + shr.u32 %r29751, %r29749, 24; + cvt.u64.u32 %rd1235, %r29751; + shl.b64 %rd1236, %rd1235, 56; + bfi.b64 %rd1237, %rd1230, %rd1229, 32, 32; + or.b64 %rd1238, %rd1237, %rd1233; + or.b64 %rd1239, %rd1238, %rd1234; + or.b64 %rd1370, %rd1239, %rd1236; + xor.b32 %r29752, %r29736, %r29711; + cvt.u64.u32 %rd1240, %r29752; + xor.b32 %r29753, %r29694, %r29725; + and.b32 %r29754, %r29753, 255; + cvt.u64.u32 %rd1241, %r29754; + cvt.u64.u32 %rd1242, %r29753; + shl.b64 %rd1243, %rd1242, 32; + and.b64 %rd1244, %rd1243, 280375465082880; + and.b64 %rd1245, %rd1243, 71776119061217280; + shr.u32 %r29755, %r29753, 24; + cvt.u64.u32 %rd1246, %r29755; + shl.b64 %rd1247, %rd1246, 56; + bfi.b64 %rd1248, %rd1241, %rd1240, 32, 32; + or.b64 %rd1249, %rd1248, %rd1244; + or.b64 %rd1250, %rd1249, %rd1245; + or.b64 %rd1369, %rd1250, %rd1247; + bra.uni $L__BB2_96; + +$L__BB2_89: + setp.eq.s16 %p51, %rs500, 0; + selp.u16 %rs502, 1, 0, %p51; + ld.local.u8 %rs665, [%rd2+138]; + or.b16 %rs503, %rs665, %rs502; + or.b16 %rs732, %rs503, 2; + ld.local.u64 %rd1368, [%rd2+64]; + ld.local.v2.u32 {%r31257, %r31256}, [%rd2+32]; + ld.local.v2.u32 {%r31255, %r31254}, [%rd2+40]; + ld.local.v2.u32 {%r31253, %r31252}, [%rd2+48]; + ld.local.v2.u32 {%r31251, %r31250}, [%rd2+56]; + ld.local.v4.u16 {%rs798, %rs800, %rs802, %rs804}, [%rd2+72]; + shr.u16 %rs799, %rs798, 8; + shr.u16 %rs801, %rs800, 8; + shr.u16 %rs803, %rs802, 8; + shr.u16 %rs805, %rs804, 8; + ld.local.v4.u16 {%rs806, %rs808, %rs810, %rs812}, [%rd2+80]; + shr.u16 %rs807, %rs806, 8; + shr.u16 %rs809, %rs808, 8; + shr.u16 %rs811, %rs810, 8; + shr.u16 %rs813, %rs812, 8; + ld.local.v4.u16 {%rs814, %rs816, %rs818, %rs820}, [%rd2+88]; + shr.u16 %rs815, %rs814, 8; + shr.u16 %rs817, %rs816, 8; + shr.u16 %rs819, %rs818, 8; + shr.u16 %rs821, %rs820, 8; + ld.local.v4.u16 {%rs822, %rs824, %rs826, %rs828}, [%rd2+96]; + shr.u16 %rs823, %rs822, 8; + shr.u16 %rs825, %rs824, 8; + shr.u16 %rs827, %rs826, 8; + shr.u16 %rs829, %rs828, 8; + ld.local.v4.u16 {%rs830, %rs832, %rs834, %rs836}, [%rd2+104]; + shr.u16 %rs831, %rs830, 8; + shr.u16 %rs833, %rs832, 8; + shr.u16 %rs835, %rs834, 8; + shr.u16 %rs837, %rs836, 8; + ld.local.v4.u16 {%rs838, %rs840, %rs842, %rs844}, [%rd2+112]; + shr.u16 %rs839, %rs838, 8; + shr.u16 %rs841, %rs840, 8; + shr.u16 %rs843, %rs842, 8; + shr.u16 %rs845, %rs844, 8; + ld.local.v4.u8 {%rs846, %rs847, %rs848, %rs849}, [%rd2+120]; + ld.local.v2.u8 {%rs850, %rs851}, [%rd2+124]; + ld.local.v2.u8 {%rs852, %rs853}, [%rd2+126]; + ld.local.v4.u8 {%rs854, %rs855, %rs856, %rs857}, [%rd2+128]; + ld.local.v2.u8 {%rs858, %rs859}, [%rd2+132]; + ld.local.v2.u8 {%rs860, %rs861}, [%rd2+134]; + +$L__BB2_91: 
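+ // annotation (not compiler output): merge path over the hasher's
+ // chaining-value stack. %rd1367 counts remaining 32-byte stack entries
+ // (indexed via shl.b64 ..., 5); while entries remain, $L__BB2_93 reloads a
+ // stored CV and recompresses it with the PARENT flag (or.b16 ..., 4),
+ // following BLAKE3's tree-hashing finalization.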
+ setp.eq.s64 %p52, %rd1367, 0; + mov.u32 %r31258, %r31257; + mov.u32 %r31259, %r31256; + mov.u32 %r31260, %r31255; + mov.u32 %r31261, %r31254; + mov.u32 %r31262, %r31253; + mov.u32 %r31263, %r31252; + mov.u32 %r31264, %r31251; + mov.u32 %r31265, %r31250; + mov.u16 %rs863, %rs732; + @%p52 bra $L__BB2_94; + + or.b16 %rs863, %rs665, 4; + ld.local.v2.u32 {%r31258, %r31259}, [%rd2]; + ld.local.v2.u32 {%r31260, %r31261}, [%rd2+8]; + ld.local.v2.u32 {%r31262, %r31263}, [%rd2+16]; + ld.local.v2.u32 {%r31264, %r31265}, [%rd2+24]; + mov.u16 %rs766, %rs829; + mov.u16 %rs767, %rs828; + mov.u16 %rs768, %rs827; + mov.u16 %rs769, %rs826; + mov.u16 %rs770, %rs825; + mov.u16 %rs771, %rs824; + mov.u16 %rs772, %rs823; + mov.u16 %rs773, %rs822; + mov.u16 %rs774, %rs821; + mov.u16 %rs775, %rs820; + mov.u16 %rs776, %rs819; + mov.u16 %rs777, %rs818; + mov.u16 %rs778, %rs817; + mov.u16 %rs779, %rs816; + mov.u16 %rs780, %rs815; + mov.u16 %rs781, %rs814; + mov.u16 %rs782, %rs813; + mov.u16 %rs783, %rs812; + mov.u16 %rs784, %rs811; + mov.u16 %rs785, %rs810; + mov.u16 %rs786, %rs809; + mov.u16 %rs787, %rs808; + mov.u16 %rs788, %rs807; + mov.u16 %rs789, %rs806; + mov.u16 %rs790, %rs805; + mov.u16 %rs791, %rs804; + mov.u16 %rs792, %rs803; + mov.u16 %rs793, %rs802; + mov.u16 %rs794, %rs801; + mov.u16 %rs795, %rs800; + mov.u16 %rs796, %rs799; + mov.u16 %rs797, %rs798; + +$L__BB2_93: + add.s64 %rd1367, %rd1367, -1; + shl.b64 %rd1160, %rd1367, 5; + add.s64 %rd1161, %rd2, %rd1160; + ld.local.u8 %rs798, [%rd1161+145]; + mov.u64 %rd1159, 0; + ld.local.u8 %rs799, [%rd1161+146]; + ld.local.u8 %rs800, [%rd1161+147]; + ld.local.u8 %rs801, [%rd1161+148]; + ld.local.u8 %rs802, [%rd1161+149]; + ld.local.u8 %rs803, [%rd1161+150]; + ld.local.u8 %rs804, [%rd1161+151]; + ld.local.u8 %rs805, [%rd1161+152]; + ld.local.u8 %rs806, [%rd1161+153]; + ld.local.u8 %rs807, [%rd1161+154]; + ld.local.u8 %rs808, [%rd1161+155]; + ld.local.u8 %rs809, [%rd1161+156]; + ld.local.u8 %rs810, [%rd1161+157]; + ld.local.u8 %rs811, [%rd1161+158]; + ld.local.u8 %rs812, [%rd1161+159]; + ld.local.u8 %rs813, [%rd1161+160]; + ld.local.u8 %rs814, [%rd1161+161]; + ld.local.u8 %rs815, [%rd1161+162]; + ld.local.u8 %rs816, [%rd1161+163]; + ld.local.u8 %rs817, [%rd1161+164]; + ld.local.u8 %rs818, [%rd1161+165]; + ld.local.u8 %rs819, [%rd1161+166]; + ld.local.u8 %rs820, [%rd1161+167]; + ld.local.u8 %rs821, [%rd1161+168]; + ld.local.u8 %rs822, [%rd1161+169]; + ld.local.u8 %rs823, [%rd1161+170]; + ld.local.u8 %rs824, [%rd1161+171]; + ld.local.u8 %rs825, [%rd1161+172]; + ld.local.u8 %rs826, [%rd1161+173]; + ld.local.u8 %rs827, [%rd1161+174]; + ld.local.u8 %rs828, [%rd1161+175]; + ld.local.u8 %rs829, [%rd1161+176]; + cvt.u32.u16 %r26825, %rs797; + and.b32 %r26826, %r26825, 255; + cvt.u32.u16 %r26827, %rs796; + prmt.b32 %r26828, %r26827, %r26826, 30212; + cvt.u32.u16 %r26829, %rs795; + shl.b32 %r26830, %r26829, 16; + and.b32 %r26831, %r26830, 16711680; + or.b32 %r26832, %r26828, %r26831; + cvt.u32.u16 %r26833, %rs794; + shl.b32 %r26834, %r26833, 24; + or.b32 %r26835, %r26832, %r26834; + cvt.u32.u16 %r26836, %rs793; + and.b32 %r26837, %r26836, 255; + cvt.u32.u16 %r26838, %rs792; + prmt.b32 %r26839, %r26838, %r26837, 30212; + cvt.u32.u16 %r26840, %rs791; + shl.b32 %r26841, %r26840, 16; + and.b32 %r26842, %r26841, 16711680; + or.b32 %r26843, %r26839, %r26842; + cvt.u32.u16 %r26844, %rs790; + shl.b32 %r26845, %r26844, 24; + or.b32 %r26846, %r26843, %r26845; + cvt.u32.u16 %r26847, %rs789; + and.b32 %r26848, %r26847, 255; + cvt.u32.u16 %r26849, %rs788; + prmt.b32 %r26850, %r26849, 
%r26848, 30212; + cvt.u32.u16 %r26851, %rs787; + shl.b32 %r26852, %r26851, 16; + and.b32 %r26853, %r26852, 16711680; + or.b32 %r26854, %r26850, %r26853; + cvt.u32.u16 %r26855, %rs786; + shl.b32 %r26856, %r26855, 24; + or.b32 %r26857, %r26854, %r26856; + cvt.u32.u16 %r26858, %rs785; + and.b32 %r26859, %r26858, 255; + cvt.u32.u16 %r26860, %rs784; + prmt.b32 %r26861, %r26860, %r26859, 30212; + cvt.u32.u16 %r26862, %rs783; + shl.b32 %r26863, %r26862, 16; + and.b32 %r26864, %r26863, 16711680; + or.b32 %r26865, %r26861, %r26864; + cvt.u32.u16 %r26866, %rs782; + shl.b32 %r26867, %r26866, 24; + or.b32 %r26868, %r26865, %r26867; + cvt.u32.u16 %r26869, %rs781; + and.b32 %r26870, %r26869, 255; + cvt.u32.u16 %r26871, %rs780; + prmt.b32 %r26872, %r26871, %r26870, 30212; + cvt.u32.u16 %r26873, %rs779; + shl.b32 %r26874, %r26873, 16; + and.b32 %r26875, %r26874, 16711680; + or.b32 %r26876, %r26872, %r26875; + cvt.u32.u16 %r26877, %rs778; + shl.b32 %r26878, %r26877, 24; + or.b32 %r26879, %r26876, %r26878; + cvt.u32.u16 %r26880, %rs777; + and.b32 %r26881, %r26880, 255; + cvt.u32.u16 %r26882, %rs776; + prmt.b32 %r26883, %r26882, %r26881, 30212; + cvt.u32.u16 %r26884, %rs775; + shl.b32 %r26885, %r26884, 16; + and.b32 %r26886, %r26885, 16711680; + or.b32 %r26887, %r26883, %r26886; + cvt.u32.u16 %r26888, %rs774; + shl.b32 %r26889, %r26888, 24; + or.b32 %r26890, %r26887, %r26889; + cvt.u32.u16 %r26891, %rs773; + and.b32 %r26892, %r26891, 255; + cvt.u32.u16 %r26893, %rs772; + prmt.b32 %r26894, %r26893, %r26892, 30212; + cvt.u32.u16 %r26895, %rs771; + shl.b32 %r26896, %r26895, 16; + and.b32 %r26897, %r26896, 16711680; + or.b32 %r26898, %r26894, %r26897; + cvt.u32.u16 %r26899, %rs770; + shl.b32 %r26900, %r26899, 24; + or.b32 %r26901, %r26898, %r26900; + cvt.u32.u16 %r26902, %rs769; + and.b32 %r26903, %r26902, 255; + cvt.u32.u16 %r26904, %rs768; + prmt.b32 %r26905, %r26904, %r26903, 30212; + cvt.u32.u16 %r26906, %rs767; + shl.b32 %r26907, %r26906, 16; + and.b32 %r26908, %r26907, 16711680; + or.b32 %r26909, %r26905, %r26908; + cvt.u32.u16 %r26910, %rs766; + shl.b32 %r26911, %r26910, 24; + or.b32 %r26912, %r26909, %r26911; + cvt.u32.u16 %r26913, %rs830; + and.b32 %r26914, %r26913, 255; + cvt.u32.u16 %r26915, %rs831; + prmt.b32 %r26916, %r26915, %r26914, 30212; + cvt.u32.u16 %r26917, %rs832; + shl.b32 %r26918, %r26917, 16; + and.b32 %r26919, %r26918, 16711680; + or.b32 %r26920, %r26916, %r26919; + cvt.u32.u16 %r26921, %rs833; + shl.b32 %r26922, %r26921, 24; + or.b32 %r26923, %r26920, %r26922; + cvt.u32.u16 %r26924, %rs834; + and.b32 %r26925, %r26924, 255; + cvt.u32.u16 %r26926, %rs835; + prmt.b32 %r26927, %r26926, %r26925, 30212; + cvt.u32.u16 %r26928, %rs836; + shl.b32 %r26929, %r26928, 16; + and.b32 %r26930, %r26929, 16711680; + or.b32 %r26931, %r26927, %r26930; + cvt.u32.u16 %r26932, %rs837; + shl.b32 %r26933, %r26932, 24; + or.b32 %r26934, %r26931, %r26933; + cvt.u32.u16 %r26935, %rs838; + and.b32 %r26936, %r26935, 255; + cvt.u32.u16 %r26937, %rs839; + prmt.b32 %r26938, %r26937, %r26936, 30212; + cvt.u32.u16 %r26939, %rs840; + shl.b32 %r26940, %r26939, 16; + and.b32 %r26941, %r26940, 16711680; + or.b32 %r26942, %r26938, %r26941; + cvt.u32.u16 %r26943, %rs841; + shl.b32 %r26944, %r26943, 24; + or.b32 %r26945, %r26942, %r26944; + cvt.u32.u16 %r26946, %rs842; + and.b32 %r26947, %r26946, 255; + cvt.u32.u16 %r26948, %rs843; + prmt.b32 %r26949, %r26948, %r26947, 30212; + cvt.u32.u16 %r26950, %rs844; + shl.b32 %r26951, %r26950, 16; + and.b32 %r26952, %r26951, 16711680; + or.b32 %r26953, %r26949, %r26952; + cvt.u32.u16 
%r26954, %rs845; + shl.b32 %r26955, %r26954, 24; + or.b32 %r26956, %r26953, %r26955; + cvt.u32.u16 %r26957, %rs846; + and.b32 %r26958, %r26957, 255; + cvt.u32.u16 %r26959, %rs847; + prmt.b32 %r26960, %r26959, %r26958, 30212; + cvt.u32.u16 %r26961, %rs848; + shl.b32 %r26962, %r26961, 16; + and.b32 %r26963, %r26962, 16711680; + or.b32 %r26964, %r26960, %r26963; + cvt.u32.u16 %r26965, %rs849; + shl.b32 %r26966, %r26965, 24; + or.b32 %r26967, %r26964, %r26966; + cvt.u32.u16 %r26968, %rs850; + and.b32 %r26969, %r26968, 255; + cvt.u32.u16 %r26970, %rs851; + prmt.b32 %r26971, %r26970, %r26969, 30212; + cvt.u32.u16 %r26972, %rs852; + shl.b32 %r26973, %r26972, 16; + and.b32 %r26974, %r26973, 16711680; + or.b32 %r26975, %r26971, %r26974; + cvt.u32.u16 %r26976, %rs853; + shl.b32 %r26977, %r26976, 24; + or.b32 %r26978, %r26975, %r26977; + cvt.u32.u16 %r26979, %rs854; + and.b32 %r26980, %r26979, 255; + cvt.u32.u16 %r26981, %rs855; + prmt.b32 %r26982, %r26981, %r26980, 30212; + cvt.u32.u16 %r26983, %rs856; + shl.b32 %r26984, %r26983, 16; + and.b32 %r26985, %r26984, 16711680; + or.b32 %r26986, %r26982, %r26985; + cvt.u32.u16 %r26987, %rs857; + shl.b32 %r26988, %r26987, 24; + or.b32 %r26989, %r26986, %r26988; + cvt.u32.u16 %r26990, %rs858; + and.b32 %r26991, %r26990, 255; + cvt.u32.u16 %r26992, %rs859; + prmt.b32 %r26993, %r26992, %r26991, 30212; + cvt.u32.u16 %r26994, %rs860; + shl.b32 %r26995, %r26994, 16; + and.b32 %r26996, %r26995, 16711680; + or.b32 %r26997, %r26993, %r26996; + cvt.u32.u16 %r26998, %rs861; + shl.b32 %r26999, %r26998, 24; + or.b32 %r27000, %r26997, %r26999; + shr.u64 %rd1162, %rd1368, 32; + cvt.u32.u64 %r27001, %rd1162; + add.s32 %r27002, %r31257, %r26835; + add.s32 %r27003, %r27002, %r31253; + cvt.u32.u64 %r27004, %rd1368; + xor.b32 %r27005, %r27003, %r27004; + shf.l.wrap.b32 %r27006, %r27005, %r27005, 16; + add.s32 %r27007, %r27006, 1779033703; + xor.b32 %r27008, %r27007, %r31253; + shf.l.wrap.b32 %r27009, %r27008, %r27008, 20; + add.s32 %r27010, %r27003, %r26846; + add.s32 %r27011, %r27010, %r27009; + xor.b32 %r27012, %r27011, %r27006; + shf.l.wrap.b32 %r27013, %r27012, %r27012, 24; + add.s32 %r27014, %r27013, %r27007; + xor.b32 %r27015, %r27014, %r27009; + shf.l.wrap.b32 %r27016, %r27015, %r27015, 25; + add.s32 %r27017, %r31256, %r26857; + add.s32 %r27018, %r27017, %r31252; + xor.b32 %r27019, %r27018, %r27001; + shf.l.wrap.b32 %r27020, %r27019, %r27019, 16; + add.s32 %r27021, %r27020, -1150833019; + xor.b32 %r27022, %r27021, %r31252; + shf.l.wrap.b32 %r27023, %r27022, %r27022, 20; + add.s32 %r27024, %r27018, %r26868; + add.s32 %r27025, %r27024, %r27023; + xor.b32 %r27026, %r27025, %r27020; + shf.l.wrap.b32 %r27027, %r27026, %r27026, 24; + add.s32 %r27028, %r27027, %r27021; + xor.b32 %r27029, %r27028, %r27023; + shf.l.wrap.b32 %r27030, %r27029, %r27029, 25; + add.s32 %r27031, %r31255, %r26879; + add.s32 %r27032, %r27031, %r31251; + cvt.u32.u16 %r27033, %rs862; + and.b32 %r27034, %r27033, 255; + xor.b32 %r27035, %r27032, %r27034; + shr.u32 %r27036, %r27032, 16; + shl.b32 %r27037, %r27035, 16; + or.b32 %r27038, %r27037, %r27036; + add.s32 %r27039, %r27038, 1013904242; + xor.b32 %r27040, %r27039, %r31251; + shf.l.wrap.b32 %r27041, %r27040, %r27040, 20; + add.s32 %r27042, %r27032, %r26890; + add.s32 %r27043, %r27042, %r27041; + xor.b32 %r27044, %r27043, %r27038; + shf.l.wrap.b32 %r27045, %r27044, %r27044, 24; + add.s32 %r27046, %r27045, %r27039; + xor.b32 %r27047, %r27046, %r27041; + shf.l.wrap.b32 %r27048, %r27047, %r27047, 25; + add.s32 %r27049, %r31254, %r26901; + add.s32 %r27050, 
%r27049, %r31250; + cvt.u32.u16 %r27051, %rs732; + and.b32 %r27052, %r27051, 255; + xor.b32 %r27053, %r27050, %r27052; + shr.u32 %r27054, %r27050, 16; + shl.b32 %r27055, %r27053, 16; + or.b32 %r27056, %r27055, %r27054; + add.s32 %r27057, %r27056, -1521486534; + xor.b32 %r27058, %r27057, %r31250; + shf.l.wrap.b32 %r27059, %r27058, %r27058, 20; + add.s32 %r27060, %r27050, %r26912; + add.s32 %r27061, %r27060, %r27059; + xor.b32 %r27062, %r27061, %r27056; + shf.l.wrap.b32 %r27063, %r27062, %r27062, 24; + add.s32 %r27064, %r27063, %r27057; + xor.b32 %r27065, %r27064, %r27059; + shf.l.wrap.b32 %r27066, %r27065, %r27065, 25; + add.s32 %r27067, %r27011, %r26923; + add.s32 %r27068, %r27067, %r27030; + xor.b32 %r27069, %r27068, %r27063; + shf.l.wrap.b32 %r27070, %r27069, %r27069, 16; + add.s32 %r27071, %r27070, %r27046; + xor.b32 %r27072, %r27071, %r27030; + shf.l.wrap.b32 %r27073, %r27072, %r27072, 20; + add.s32 %r27074, %r27068, %r26934; + add.s32 %r27075, %r27074, %r27073; + xor.b32 %r27076, %r27075, %r27070; + shf.l.wrap.b32 %r27077, %r27076, %r27076, 24; + add.s32 %r27078, %r27077, %r27071; + xor.b32 %r27079, %r27078, %r27073; + shf.l.wrap.b32 %r27080, %r27079, %r27079, 25; + add.s32 %r27081, %r27025, %r26945; + add.s32 %r27082, %r27081, %r27048; + xor.b32 %r27083, %r27082, %r27013; + shf.l.wrap.b32 %r27084, %r27083, %r27083, 16; + add.s32 %r27085, %r27084, %r27064; + xor.b32 %r27086, %r27085, %r27048; + shf.l.wrap.b32 %r27087, %r27086, %r27086, 20; + add.s32 %r27088, %r27082, %r26956; + add.s32 %r27089, %r27088, %r27087; + xor.b32 %r27090, %r27089, %r27084; + shf.l.wrap.b32 %r27091, %r27090, %r27090, 24; + add.s32 %r27092, %r27091, %r27085; + xor.b32 %r27093, %r27092, %r27087; + shf.l.wrap.b32 %r27094, %r27093, %r27093, 25; + add.s32 %r27095, %r27043, %r26967; + add.s32 %r27096, %r27095, %r27066; + xor.b32 %r27097, %r27096, %r27027; + shf.l.wrap.b32 %r27098, %r27097, %r27097, 16; + add.s32 %r27099, %r27098, %r27014; + xor.b32 %r27100, %r27099, %r27066; + shf.l.wrap.b32 %r27101, %r27100, %r27100, 20; + add.s32 %r27102, %r27096, %r26978; + add.s32 %r27103, %r27102, %r27101; + xor.b32 %r27104, %r27103, %r27098; + shf.l.wrap.b32 %r27105, %r27104, %r27104, 24; + add.s32 %r27106, %r27105, %r27099; + xor.b32 %r27107, %r27106, %r27101; + shf.l.wrap.b32 %r27108, %r27107, %r27107, 25; + add.s32 %r27109, %r27061, %r26989; + add.s32 %r27110, %r27109, %r27016; + xor.b32 %r27111, %r27110, %r27045; + shf.l.wrap.b32 %r27112, %r27111, %r27111, 16; + add.s32 %r27113, %r27112, %r27028; + xor.b32 %r27114, %r27113, %r27016; + shf.l.wrap.b32 %r27115, %r27114, %r27114, 20; + add.s32 %r27116, %r27110, %r27000; + add.s32 %r27117, %r27116, %r27115; + xor.b32 %r27118, %r27117, %r27112; + shf.l.wrap.b32 %r27119, %r27118, %r27118, 24; + add.s32 %r27120, %r27119, %r27113; + xor.b32 %r27121, %r27120, %r27115; + shf.l.wrap.b32 %r27122, %r27121, %r27121, 25; + add.s32 %r27123, %r27075, %r26857; + add.s32 %r27124, %r27123, %r27122; + xor.b32 %r27125, %r27124, %r27091; + shf.l.wrap.b32 %r27126, %r27125, %r27125, 16; + add.s32 %r27127, %r27126, %r27106; + xor.b32 %r27128, %r27127, %r27122; + shf.l.wrap.b32 %r27129, %r27128, %r27128, 20; + add.s32 %r27130, %r27124, %r26901; + add.s32 %r27131, %r27130, %r27129; + xor.b32 %r27132, %r27131, %r27126; + shf.l.wrap.b32 %r27133, %r27132, %r27132, 24; + add.s32 %r27134, %r27133, %r27127; + xor.b32 %r27135, %r27134, %r27129; + shf.l.wrap.b32 %r27136, %r27135, %r27135, 25; + add.s32 %r27137, %r27089, %r26868; + add.s32 %r27138, %r27137, %r27080; + xor.b32 %r27139, %r27138, %r27105; + 
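+ // Quarter-round structure (BLAKE-family G function): add a message word into a,
+ // then mix with xor and rotate; shf.l.wrap by 16/20/24/25 implements the
+ // right-rotates by 16/12/8/7. The 32-bit constants 1779033703, -1150833019,
+ // 1013904242, -1521486534 are the IV words 0x6A09E667, 0xBB67AE85, 0x3C6EF372,
+ // 0xA54FF53A (SHA-256 h0-h3), folded into the first adds of the column steps.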
shf.l.wrap.b32 %r27140, %r27139, %r27139, 16; + add.s32 %r27141, %r27140, %r27120; + xor.b32 %r27142, %r27141, %r27080; + shf.l.wrap.b32 %r27143, %r27142, %r27142, 20; + add.s32 %r27144, %r27138, %r26945; + add.s32 %r27145, %r27144, %r27143; + xor.b32 %r27146, %r27145, %r27140; + shf.l.wrap.b32 %r27147, %r27146, %r27146, 24; + add.s32 %r27148, %r27147, %r27141; + xor.b32 %r27149, %r27148, %r27143; + shf.l.wrap.b32 %r27150, %r27149, %r27149, 25; + add.s32 %r27151, %r27103, %r26912; + add.s32 %r27152, %r27151, %r27094; + xor.b32 %r27153, %r27152, %r27119; + shf.l.wrap.b32 %r27154, %r27153, %r27153, 16; + add.s32 %r27155, %r27154, %r27078; + xor.b32 %r27156, %r27155, %r27094; + shf.l.wrap.b32 %r27157, %r27156, %r27156, 20; + add.s32 %r27158, %r27152, %r26835; + add.s32 %r27159, %r27158, %r27157; + xor.b32 %r27160, %r27159, %r27154; + shf.l.wrap.b32 %r27161, %r27160, %r27160, 24; + add.s32 %r27162, %r27161, %r27155; + xor.b32 %r27163, %r27162, %r27157; + shf.l.wrap.b32 %r27164, %r27163, %r27163, 25; + add.s32 %r27165, %r27117, %r26879; + add.s32 %r27166, %r27165, %r27108; + xor.b32 %r27167, %r27166, %r27077; + shf.l.wrap.b32 %r27168, %r27167, %r27167, 16; + add.s32 %r27169, %r27168, %r27092; + xor.b32 %r27170, %r27169, %r27108; + shf.l.wrap.b32 %r27171, %r27170, %r27170, 20; + add.s32 %r27172, %r27166, %r26978; + add.s32 %r27173, %r27172, %r27171; + xor.b32 %r27174, %r27173, %r27168; + shf.l.wrap.b32 %r27175, %r27174, %r27174, 24; + add.s32 %r27176, %r27175, %r27169; + xor.b32 %r27177, %r27176, %r27171; + shf.l.wrap.b32 %r27178, %r27177, %r27177, 25; + add.s32 %r27179, %r27131, %r26846; + add.s32 %r27180, %r27179, %r27150; + xor.b32 %r27181, %r27180, %r27175; + shf.l.wrap.b32 %r27182, %r27181, %r27181, 16; + add.s32 %r27183, %r27182, %r27162; + xor.b32 %r27184, %r27183, %r27150; + shf.l.wrap.b32 %r27185, %r27184, %r27184, 20; + add.s32 %r27186, %r27180, %r26956; + add.s32 %r27187, %r27186, %r27185; + xor.b32 %r27188, %r27187, %r27182; + shf.l.wrap.b32 %r27189, %r27188, %r27188, 24; + add.s32 %r27190, %r27189, %r27183; + xor.b32 %r27191, %r27190, %r27185; + shf.l.wrap.b32 %r27192, %r27191, %r27191, 25; + add.s32 %r27193, %r27145, %r26967; + add.s32 %r27194, %r27193, %r27164; + xor.b32 %r27195, %r27194, %r27133; + shf.l.wrap.b32 %r27196, %r27195, %r27195, 16; + add.s32 %r27197, %r27196, %r27176; + xor.b32 %r27198, %r27197, %r27164; + shf.l.wrap.b32 %r27199, %r27198, %r27198, 20; + add.s32 %r27200, %r27194, %r26890; + add.s32 %r27201, %r27200, %r27199; + xor.b32 %r27202, %r27201, %r27196; + shf.l.wrap.b32 %r27203, %r27202, %r27202, 24; + add.s32 %r27204, %r27203, %r27197; + xor.b32 %r27205, %r27204, %r27199; + shf.l.wrap.b32 %r27206, %r27205, %r27205, 25; + add.s32 %r27207, %r27159, %r26934; + add.s32 %r27208, %r27207, %r27178; + xor.b32 %r27209, %r27208, %r27147; + shf.l.wrap.b32 %r27210, %r27209, %r27209, 16; + add.s32 %r27211, %r27210, %r27134; + xor.b32 %r27212, %r27211, %r27178; + shf.l.wrap.b32 %r27213, %r27212, %r27212, 20; + add.s32 %r27214, %r27208, %r26989; + add.s32 %r27215, %r27214, %r27213; + xor.b32 %r27216, %r27215, %r27210; + shf.l.wrap.b32 %r27217, %r27216, %r27216, 24; + add.s32 %r27218, %r27217, %r27211; + xor.b32 %r27219, %r27218, %r27213; + shf.l.wrap.b32 %r27220, %r27219, %r27219, 25; + add.s32 %r27221, %r27173, %r27000; + add.s32 %r27222, %r27221, %r27136; + xor.b32 %r27223, %r27222, %r27161; + shf.l.wrap.b32 %r27224, %r27223, %r27223, 16; + add.s32 %r27225, %r27224, %r27148; + xor.b32 %r27226, %r27225, %r27136; + shf.l.wrap.b32 %r27227, %r27226, %r27226, 20; + add.s32 
%r27228, %r27222, %r26923; + add.s32 %r27229, %r27228, %r27227; + xor.b32 %r27230, %r27229, %r27224; + shf.l.wrap.b32 %r27231, %r27230, %r27230, 24; + add.s32 %r27232, %r27231, %r27225; + xor.b32 %r27233, %r27232, %r27227; + shf.l.wrap.b32 %r27234, %r27233, %r27233, 25; + add.s32 %r27235, %r27187, %r26868; + add.s32 %r27236, %r27235, %r27234; + xor.b32 %r27237, %r27236, %r27203; + shf.l.wrap.b32 %r27238, %r27237, %r27237, 16; + add.s32 %r27239, %r27238, %r27218; + xor.b32 %r27240, %r27239, %r27234; + shf.l.wrap.b32 %r27241, %r27240, %r27240, 20; + add.s32 %r27242, %r27236, %r26879; + add.s32 %r27243, %r27242, %r27241; + xor.b32 %r27244, %r27243, %r27238; + shf.l.wrap.b32 %r27245, %r27244, %r27244, 24; + add.s32 %r27246, %r27245, %r27239; + xor.b32 %r27247, %r27246, %r27241; + shf.l.wrap.b32 %r27248, %r27247, %r27247, 25; + add.s32 %r27249, %r27201, %r26945; + add.s32 %r27250, %r27249, %r27192; + xor.b32 %r27251, %r27250, %r27217; + shf.l.wrap.b32 %r27252, %r27251, %r27251, 16; + add.s32 %r27253, %r27252, %r27232; + xor.b32 %r27254, %r27253, %r27192; + shf.l.wrap.b32 %r27255, %r27254, %r27254, 20; + add.s32 %r27256, %r27250, %r26967; + add.s32 %r27257, %r27256, %r27255; + xor.b32 %r27258, %r27257, %r27252; + shf.l.wrap.b32 %r27259, %r27258, %r27258, 24; + add.s32 %r27260, %r27259, %r27253; + xor.b32 %r27261, %r27260, %r27255; + shf.l.wrap.b32 %r27262, %r27261, %r27261, 25; + add.s32 %r27263, %r27215, %r26978; + add.s32 %r27264, %r27263, %r27206; + xor.b32 %r27265, %r27264, %r27231; + shf.l.wrap.b32 %r27266, %r27265, %r27265, 16; + add.s32 %r27267, %r27266, %r27190; + xor.b32 %r27268, %r27267, %r27206; + shf.l.wrap.b32 %r27269, %r27268, %r27268, 20; + add.s32 %r27270, %r27264, %r26857; + add.s32 %r27271, %r27270, %r27269; + xor.b32 %r27272, %r27271, %r27266; + shf.l.wrap.b32 %r27273, %r27272, %r27272, 24; + add.s32 %r27274, %r27273, %r27267; + xor.b32 %r27275, %r27274, %r27269; + shf.l.wrap.b32 %r27276, %r27275, %r27275, 25; + add.s32 %r27277, %r27229, %r26912; + add.s32 %r27278, %r27277, %r27220; + xor.b32 %r27279, %r27278, %r27189; + shf.l.wrap.b32 %r27280, %r27279, %r27279, 16; + add.s32 %r27281, %r27280, %r27204; + xor.b32 %r27282, %r27281, %r27220; + shf.l.wrap.b32 %r27283, %r27282, %r27282, 20; + add.s32 %r27284, %r27278, %r26989; + add.s32 %r27285, %r27284, %r27283; + xor.b32 %r27286, %r27285, %r27280; + shf.l.wrap.b32 %r27287, %r27286, %r27286, 24; + add.s32 %r27288, %r27287, %r27281; + xor.b32 %r27289, %r27288, %r27283; + shf.l.wrap.b32 %r27290, %r27289, %r27289, 25; + add.s32 %r27291, %r27243, %r26901; + add.s32 %r27292, %r27291, %r27262; + xor.b32 %r27293, %r27292, %r27287; + shf.l.wrap.b32 %r27294, %r27293, %r27293, 16; + add.s32 %r27295, %r27294, %r27274; + xor.b32 %r27296, %r27295, %r27262; + shf.l.wrap.b32 %r27297, %r27296, %r27296, 20; + add.s32 %r27298, %r27292, %r26890; + add.s32 %r27299, %r27298, %r27297; + xor.b32 %r27300, %r27299, %r27294; + shf.l.wrap.b32 %r27301, %r27300, %r27300, 24; + add.s32 %r27302, %r27301, %r27295; + xor.b32 %r27303, %r27302, %r27297; + shf.l.wrap.b32 %r27304, %r27303, %r27303, 25; + add.s32 %r27305, %r27257, %r26934; + add.s32 %r27306, %r27305, %r27276; + xor.b32 %r27307, %r27306, %r27245; + shf.l.wrap.b32 %r27308, %r27307, %r27307, 16; + add.s32 %r27309, %r27308, %r27288; + xor.b32 %r27310, %r27309, %r27276; + shf.l.wrap.b32 %r27311, %r27310, %r27310, 20; + add.s32 %r27312, %r27306, %r26835; + add.s32 %r27313, %r27312, %r27311; + xor.b32 %r27314, %r27313, %r27308; + shf.l.wrap.b32 %r27315, %r27314, %r27314, 24; + add.s32 %r27316, %r27315, 
%r27309; + xor.b32 %r27317, %r27316, %r27311; + shf.l.wrap.b32 %r27318, %r27317, %r27317, 25; + add.s32 %r27319, %r27271, %r26956; + add.s32 %r27320, %r27319, %r27290; + xor.b32 %r27321, %r27320, %r27259; + shf.l.wrap.b32 %r27322, %r27321, %r27321, 16; + add.s32 %r27323, %r27322, %r27246; + xor.b32 %r27324, %r27323, %r27290; + shf.l.wrap.b32 %r27325, %r27324, %r27324, 20; + add.s32 %r27326, %r27320, %r27000; + add.s32 %r27327, %r27326, %r27325; + xor.b32 %r27328, %r27327, %r27322; + shf.l.wrap.b32 %r27329, %r27328, %r27328, 24; + add.s32 %r27330, %r27329, %r27323; + xor.b32 %r27331, %r27330, %r27325; + shf.l.wrap.b32 %r27332, %r27331, %r27331, 25; + add.s32 %r27333, %r27285, %r26923; + add.s32 %r27334, %r27333, %r27248; + xor.b32 %r27335, %r27334, %r27273; + shf.l.wrap.b32 %r27336, %r27335, %r27335, 16; + add.s32 %r27337, %r27336, %r27260; + xor.b32 %r27338, %r27337, %r27248; + shf.l.wrap.b32 %r27339, %r27338, %r27338, 20; + add.s32 %r27340, %r27334, %r26846; + add.s32 %r27341, %r27340, %r27339; + xor.b32 %r27342, %r27341, %r27336; + shf.l.wrap.b32 %r27343, %r27342, %r27342, 24; + add.s32 %r27344, %r27343, %r27337; + xor.b32 %r27345, %r27344, %r27339; + shf.l.wrap.b32 %r27346, %r27345, %r27345, 25; + add.s32 %r27347, %r27299, %r26945; + add.s32 %r27348, %r27347, %r27346; + xor.b32 %r27349, %r27348, %r27315; + shf.l.wrap.b32 %r27350, %r27349, %r27349, 16; + add.s32 %r27351, %r27350, %r27330; + xor.b32 %r27352, %r27351, %r27346; + shf.l.wrap.b32 %r27353, %r27352, %r27352, 20; + add.s32 %r27354, %r27348, %r26912; + add.s32 %r27355, %r27354, %r27353; + xor.b32 %r27356, %r27355, %r27350; + shf.l.wrap.b32 %r27357, %r27356, %r27356, 24; + add.s32 %r27358, %r27357, %r27351; + xor.b32 %r27359, %r27358, %r27353; + shf.l.wrap.b32 %r27360, %r27359, %r27359, 25; + add.s32 %r27361, %r27313, %r26967; + add.s32 %r27362, %r27361, %r27304; + xor.b32 %r27363, %r27362, %r27329; + shf.l.wrap.b32 %r27364, %r27363, %r27363, 16; + add.s32 %r27365, %r27364, %r27344; + xor.b32 %r27366, %r27365, %r27304; + shf.l.wrap.b32 %r27367, %r27366, %r27366, 20; + add.s32 %r27368, %r27362, %r26934; + add.s32 %r27369, %r27368, %r27367; + xor.b32 %r27370, %r27369, %r27364; + shf.l.wrap.b32 %r27371, %r27370, %r27370, 24; + add.s32 %r27372, %r27371, %r27365; + xor.b32 %r27373, %r27372, %r27367; + shf.l.wrap.b32 %r27374, %r27373, %r27373, 25; + add.s32 %r27375, %r27327, %r26989; + add.s32 %r27376, %r27375, %r27318; + xor.b32 %r27377, %r27376, %r27343; + shf.l.wrap.b32 %r27378, %r27377, %r27377, 16; + add.s32 %r27379, %r27378, %r27302; + xor.b32 %r27380, %r27379, %r27318; + shf.l.wrap.b32 %r27381, %r27380, %r27380, 20; + add.s32 %r27382, %r27376, %r26868; + add.s32 %r27383, %r27382, %r27381; + xor.b32 %r27384, %r27383, %r27378; + shf.l.wrap.b32 %r27385, %r27384, %r27384, 24; + add.s32 %r27386, %r27385, %r27379; + xor.b32 %r27387, %r27386, %r27381; + shf.l.wrap.b32 %r27388, %r27387, %r27387, 25; + add.s32 %r27389, %r27341, %r26978; + add.s32 %r27390, %r27389, %r27332; + xor.b32 %r27391, %r27390, %r27301; + shf.l.wrap.b32 %r27392, %r27391, %r27391, 16; + add.s32 %r27393, %r27392, %r27316; + xor.b32 %r27394, %r27393, %r27332; + shf.l.wrap.b32 %r27395, %r27394, %r27394, 20; + add.s32 %r27396, %r27390, %r27000; + add.s32 %r27397, %r27396, %r27395; + xor.b32 %r27398, %r27397, %r27392; + shf.l.wrap.b32 %r27399, %r27398, %r27398, 24; + add.s32 %r27400, %r27399, %r27393; + xor.b32 %r27401, %r27400, %r27395; + shf.l.wrap.b32 %r27402, %r27401, %r27401, 25; + add.s32 %r27403, %r27355, %r26879; + add.s32 %r27404, %r27403, %r27374; + xor.b32 
%r27405, %r27404, %r27399; + shf.l.wrap.b32 %r27406, %r27405, %r27405, 16; + add.s32 %r27407, %r27406, %r27386; + xor.b32 %r27408, %r27407, %r27374; + shf.l.wrap.b32 %r27409, %r27408, %r27408, 20; + add.s32 %r27410, %r27404, %r26835; + add.s32 %r27411, %r27410, %r27409; + xor.b32 %r27412, %r27411, %r27406; + shf.l.wrap.b32 %r27413, %r27412, %r27412, 24; + add.s32 %r27414, %r27413, %r27407; + xor.b32 %r27415, %r27414, %r27409; + shf.l.wrap.b32 %r27416, %r27415, %r27415, 25; + add.s32 %r27417, %r27369, %r26956; + add.s32 %r27418, %r27417, %r27388; + xor.b32 %r27419, %r27418, %r27357; + shf.l.wrap.b32 %r27420, %r27419, %r27419, 16; + add.s32 %r27421, %r27420, %r27400; + xor.b32 %r27422, %r27421, %r27388; + shf.l.wrap.b32 %r27423, %r27422, %r27422, 20; + add.s32 %r27424, %r27418, %r26857; + add.s32 %r27425, %r27424, %r27423; + xor.b32 %r27426, %r27425, %r27420; + shf.l.wrap.b32 %r27427, %r27426, %r27426, 24; + add.s32 %r27428, %r27427, %r27421; + xor.b32 %r27429, %r27428, %r27423; + shf.l.wrap.b32 %r27430, %r27429, %r27429, 25; + add.s32 %r27431, %r27383, %r26890; + add.s32 %r27432, %r27431, %r27402; + xor.b32 %r27433, %r27432, %r27371; + shf.l.wrap.b32 %r27434, %r27433, %r27433, 16; + add.s32 %r27435, %r27434, %r27358; + xor.b32 %r27436, %r27435, %r27402; + shf.l.wrap.b32 %r27437, %r27436, %r27436, 20; + add.s32 %r27438, %r27432, %r26923; + add.s32 %r27439, %r27438, %r27437; + xor.b32 %r27440, %r27439, %r27434; + shf.l.wrap.b32 %r27441, %r27440, %r27440, 24; + add.s32 %r27442, %r27441, %r27435; + xor.b32 %r27443, %r27442, %r27437; + shf.l.wrap.b32 %r27444, %r27443, %r27443, 25; + add.s32 %r27445, %r27397, %r26846; + add.s32 %r27446, %r27445, %r27360; + xor.b32 %r27447, %r27446, %r27385; + shf.l.wrap.b32 %r27448, %r27447, %r27447, 16; + add.s32 %r27449, %r27448, %r27372; + xor.b32 %r27450, %r27449, %r27360; + shf.l.wrap.b32 %r27451, %r27450, %r27450, 20; + add.s32 %r27452, %r27446, %r26901; + add.s32 %r27453, %r27452, %r27451; + xor.b32 %r27454, %r27453, %r27448; + shf.l.wrap.b32 %r27455, %r27454, %r27454, 24; + add.s32 %r27456, %r27455, %r27449; + xor.b32 %r27457, %r27456, %r27451; + shf.l.wrap.b32 %r27458, %r27457, %r27457, 25; + add.s32 %r27459, %r27411, %r26967; + add.s32 %r27460, %r27459, %r27458; + xor.b32 %r27461, %r27460, %r27427; + shf.l.wrap.b32 %r27462, %r27461, %r27461, 16; + add.s32 %r27463, %r27462, %r27442; + xor.b32 %r27464, %r27463, %r27458; + shf.l.wrap.b32 %r27465, %r27464, %r27464, 20; + add.s32 %r27466, %r27460, %r26978; + add.s32 %r27467, %r27466, %r27465; + xor.b32 %r27468, %r27467, %r27462; + shf.l.wrap.b32 %r27469, %r27468, %r27468, 24; + add.s32 %r27470, %r27469, %r27463; + xor.b32 %r27471, %r27470, %r27465; + shf.l.wrap.b32 %r27472, %r27471, %r27471, 25; + add.s32 %r27473, %r27425, %r26934; + add.s32 %r27474, %r27473, %r27416; + xor.b32 %r27475, %r27474, %r27441; + shf.l.wrap.b32 %r27476, %r27475, %r27475, 16; + add.s32 %r27477, %r27476, %r27456; + xor.b32 %r27478, %r27477, %r27416; + shf.l.wrap.b32 %r27479, %r27478, %r27478, 20; + add.s32 %r27480, %r27474, %r26956; + add.s32 %r27481, %r27480, %r27479; + xor.b32 %r27482, %r27481, %r27476; + shf.l.wrap.b32 %r27483, %r27482, %r27482, 24; + add.s32 %r27484, %r27483, %r27477; + xor.b32 %r27485, %r27484, %r27479; + shf.l.wrap.b32 %r27486, %r27485, %r27485, 25; + add.s32 %r27487, %r27439, %r27000; + add.s32 %r27488, %r27487, %r27430; + xor.b32 %r27489, %r27488, %r27455; + shf.l.wrap.b32 %r27490, %r27489, %r27489, 16; + add.s32 %r27491, %r27490, %r27414; + xor.b32 %r27492, %r27491, %r27430; + shf.l.wrap.b32 %r27493, 
%r27492, %r27492, 20; + add.s32 %r27494, %r27488, %r26945; + add.s32 %r27495, %r27494, %r27493; + xor.b32 %r27496, %r27495, %r27490; + shf.l.wrap.b32 %r27497, %r27496, %r27496, 24; + add.s32 %r27498, %r27497, %r27491; + xor.b32 %r27499, %r27498, %r27493; + shf.l.wrap.b32 %r27500, %r27499, %r27499, 25; + add.s32 %r27501, %r27453, %r26989; + add.s32 %r27502, %r27501, %r27444; + xor.b32 %r27503, %r27502, %r27413; + shf.l.wrap.b32 %r27504, %r27503, %r27503, 16; + add.s32 %r27505, %r27504, %r27428; + xor.b32 %r27506, %r27505, %r27444; + shf.l.wrap.b32 %r27507, %r27506, %r27506, 20; + add.s32 %r27508, %r27502, %r26923; + add.s32 %r27509, %r27508, %r27507; + xor.b32 %r27510, %r27509, %r27504; + shf.l.wrap.b32 %r27511, %r27510, %r27510, 24; + add.s32 %r27512, %r27511, %r27505; + xor.b32 %r27513, %r27512, %r27507; + shf.l.wrap.b32 %r27514, %r27513, %r27513, 25; + add.s32 %r27515, %r27467, %r26912; + add.s32 %r27516, %r27515, %r27486; + xor.b32 %r27517, %r27516, %r27511; + shf.l.wrap.b32 %r27518, %r27517, %r27517, 16; + add.s32 %r27519, %r27518, %r27498; + xor.b32 %r27520, %r27519, %r27486; + shf.l.wrap.b32 %r27521, %r27520, %r27520, 20; + add.s32 %r27522, %r27516, %r26857; + add.s32 %r27523, %r27522, %r27521; + xor.b32 %r27524, %r27523, %r27518; + shf.l.wrap.b32 %r27525, %r27524, %r27524, 24; + add.s32 %r27526, %r27525, %r27519; + xor.b32 %r27527, %r27526, %r27521; + shf.l.wrap.b32 %r27528, %r27527, %r27527, 25; + add.s32 %r27529, %r27481, %r26890; + add.s32 %r27530, %r27529, %r27500; + xor.b32 %r27531, %r27530, %r27469; + shf.l.wrap.b32 %r27532, %r27531, %r27531, 16; + add.s32 %r27533, %r27532, %r27512; + xor.b32 %r27534, %r27533, %r27500; + shf.l.wrap.b32 %r27535, %r27534, %r27534, 20; + add.s32 %r27536, %r27530, %r26868; + add.s32 %r27537, %r27536, %r27535; + xor.b32 %r27538, %r27537, %r27532; + shf.l.wrap.b32 %r27539, %r27538, %r27538, 24; + add.s32 %r27540, %r27539, %r27533; + xor.b32 %r27541, %r27540, %r27535; + shf.l.wrap.b32 %r27542, %r27541, %r27541, 25; + add.s32 %r27543, %r27495, %r26835; + add.s32 %r27544, %r27543, %r27514; + xor.b32 %r27545, %r27544, %r27483; + shf.l.wrap.b32 %r27546, %r27545, %r27545, 16; + add.s32 %r27547, %r27546, %r27470; + xor.b32 %r27548, %r27547, %r27514; + shf.l.wrap.b32 %r27549, %r27548, %r27548, 20; + add.s32 %r27550, %r27544, %r26846; + add.s32 %r27551, %r27550, %r27549; + xor.b32 %r27552, %r27551, %r27546; + shf.l.wrap.b32 %r27553, %r27552, %r27552, 24; + add.s32 %r27554, %r27553, %r27547; + xor.b32 %r27555, %r27554, %r27549; + shf.l.wrap.b32 %r27556, %r27555, %r27555, 25; + add.s32 %r27557, %r27509, %r26901; + add.s32 %r27558, %r27557, %r27472; + xor.b32 %r27559, %r27558, %r27497; + shf.l.wrap.b32 %r27560, %r27559, %r27559, 16; + add.s32 %r27561, %r27560, %r27484; + xor.b32 %r27562, %r27561, %r27472; + shf.l.wrap.b32 %r27563, %r27562, %r27562, 20; + add.s32 %r27564, %r27558, %r26879; + add.s32 %r27565, %r27564, %r27563; + xor.b32 %r27566, %r27565, %r27560; + shf.l.wrap.b32 %r27567, %r27566, %r27566, 24; + add.s32 %r27568, %r27567, %r27561; + xor.b32 %r27569, %r27568, %r27563; + shf.l.wrap.b32 %r27570, %r27569, %r27569, 25; + add.s32 %r27571, %r27523, %r26934; + add.s32 %r27572, %r27571, %r27570; + xor.b32 %r27573, %r27572, %r27539; + shf.l.wrap.b32 %r27574, %r27573, %r27573, 16; + add.s32 %r27575, %r27574, %r27554; + xor.b32 %r27576, %r27575, %r27570; + shf.l.wrap.b32 %r27577, %r27576, %r27576, 20; + add.s32 %r27578, %r27572, %r26989; + add.s32 %r27579, %r27578, %r27577; + xor.b32 %r27580, %r27579, %r27574; + shf.l.wrap.b32 %r27581, %r27580, %r27580, 24; + 
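+ // Successive rounds repeat the same quarter-round pattern; only the order in
+ // which the packed message words %r26835-%r27000 are injected changes.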
add.s32 %r27582, %r27581, %r27575; + xor.b32 %r27583, %r27582, %r27577; + shf.l.wrap.b32 %r27584, %r27583, %r27583, 25; + add.s32 %r27585, %r27537, %r26956; + add.s32 %r27586, %r27585, %r27528; + xor.b32 %r27587, %r27586, %r27553; + shf.l.wrap.b32 %r27588, %r27587, %r27587, 16; + add.s32 %r27589, %r27588, %r27568; + xor.b32 %r27590, %r27589, %r27528; + shf.l.wrap.b32 %r27591, %r27590, %r27590, 20; + add.s32 %r27592, %r27586, %r26890; + add.s32 %r27593, %r27592, %r27591; + xor.b32 %r27594, %r27593, %r27588; + shf.l.wrap.b32 %r27595, %r27594, %r27594, 24; + add.s32 %r27596, %r27595, %r27589; + xor.b32 %r27597, %r27596, %r27591; + shf.l.wrap.b32 %r27598, %r27597, %r27597, 25; + add.s32 %r27599, %r27551, %r26923; + add.s32 %r27600, %r27599, %r27542; + xor.b32 %r27601, %r27600, %r27567; + shf.l.wrap.b32 %r27602, %r27601, %r27601, 16; + add.s32 %r27603, %r27602, %r27526; + xor.b32 %r27604, %r27603, %r27542; + shf.l.wrap.b32 %r27605, %r27604, %r27604, 20; + add.s32 %r27606, %r27600, %r26967; + add.s32 %r27607, %r27606, %r27605; + xor.b32 %r27608, %r27607, %r27602; + shf.l.wrap.b32 %r27609, %r27608, %r27608, 24; + add.s32 %r27610, %r27609, %r27603; + xor.b32 %r27611, %r27610, %r27605; + shf.l.wrap.b32 %r27612, %r27611, %r27611, 25; + add.s32 %r27613, %r27565, %r27000; + add.s32 %r27614, %r27613, %r27556; + xor.b32 %r27615, %r27614, %r27525; + shf.l.wrap.b32 %r27616, %r27615, %r27615, 16; + add.s32 %r27617, %r27616, %r27540; + xor.b32 %r27618, %r27617, %r27556; + shf.l.wrap.b32 %r27619, %r27618, %r27618, 20; + add.s32 %r27620, %r27614, %r26846; + add.s32 %r27621, %r27620, %r27619; + xor.b32 %r27622, %r27621, %r27616; + shf.l.wrap.b32 %r27623, %r27622, %r27622, 24; + add.s32 %r27624, %r27623, %r27617; + xor.b32 %r27625, %r27624, %r27619; + shf.l.wrap.b32 %r27626, %r27625, %r27625, 25; + add.s32 %r27627, %r27579, %r26978; + add.s32 %r27628, %r27627, %r27598; + xor.b32 %r27629, %r27628, %r27623; + shf.l.wrap.b32 %r27630, %r27629, %r27629, 16; + add.s32 %r27631, %r27630, %r27610; + xor.b32 %r27632, %r27631, %r27598; + shf.l.wrap.b32 %r27633, %r27632, %r27632, 20; + add.s32 %r27634, %r27628, %r26868; + add.s32 %r27635, %r27634, %r27633; + xor.b32 %r27636, %r27635, %r27630; + shf.l.wrap.b32 %r27637, %r27636, %r27636, 24; + add.s32 %r27638, %r27637, %r27631; + xor.b32 %r27639, %r27638, %r27633; + shf.l.wrap.b32 %r27640, %r27639, %r27639, 25; + add.s32 %r27641, %r27593, %r26835; + add.s32 %r27642, %r27641, %r27612; + xor.b32 %r27643, %r27642, %r27581; + shf.l.wrap.b32 %r27644, %r27643, %r27643, 16; + add.s32 %r27645, %r27644, %r27624; + xor.b32 %r27646, %r27645, %r27612; + shf.l.wrap.b32 %r27647, %r27646, %r27646, 20; + add.s32 %r27648, %r27642, %r26945; + add.s32 %r27649, %r27648, %r27647; + xor.b32 %r27650, %r27649, %r27644; + shf.l.wrap.b32 %r27651, %r27650, %r27650, 24; + add.s32 %r27652, %r27651, %r27645; + xor.b32 %r27653, %r27652, %r27647; + shf.l.wrap.b32 %r27654, %r27653, %r27653, 25; + add.s32 %r27655, %r27607, %r26857; + add.s32 %r27656, %r27655, %r27626; + xor.b32 %r27657, %r27656, %r27595; + shf.l.wrap.b32 %r27658, %r27657, %r27657, 16; + add.s32 %r27659, %r27658, %r27582; + xor.b32 %r27660, %r27659, %r27626; + shf.l.wrap.b32 %r27661, %r27660, %r27660, 20; + add.s32 %r27662, %r27656, %r26901; + add.s32 %r27663, %r27662, %r27661; + xor.b32 %r27664, %r27663, %r27658; + shf.l.wrap.b32 %r27665, %r27664, %r27664, 24; + add.s32 %r27666, %r27665, %r27659; + xor.b32 %r27667, %r27666, %r27661; + shf.l.wrap.b32 %r27668, %r27667, %r27667, 25; + add.s32 %r27669, %r27621, %r26879; + add.s32 %r27670, 
%r27669, %r27584; + xor.b32 %r27671, %r27670, %r27609; + shf.l.wrap.b32 %r27672, %r27671, %r27671, 16; + add.s32 %r27673, %r27672, %r27596; + xor.b32 %r27674, %r27673, %r27584; + shf.l.wrap.b32 %r27675, %r27674, %r27674, 20; + add.s32 %r27676, %r27670, %r26912; + add.s32 %r27677, %r27676, %r27675; + xor.b32 %r27678, %r27677, %r27672; + shf.l.wrap.b32 %r27679, %r27678, %r27678, 24; + add.s32 %r27680, %r27679, %r27673; + xor.b32 %r27681, %r27680, %r27675; + shf.l.wrap.b32 %r27682, %r27681, %r27681, 25; + add.s32 %r27683, %r27635, %r26956; + add.s32 %r27684, %r27683, %r27682; + xor.b32 %r27685, %r27684, %r27651; + shf.l.wrap.b32 %r27686, %r27685, %r27685, 16; + add.s32 %r27687, %r27686, %r27666; + xor.b32 %r27688, %r27687, %r27682; + shf.l.wrap.b32 %r27689, %r27688, %r27688, 20; + add.s32 %r27690, %r27684, %r27000; + add.s32 %r27691, %r27690, %r27689; + xor.b32 %r27692, %r27691, %r27686; + shf.l.wrap.b32 %r27693, %r27692, %r27692, 24; + add.s32 %r27694, %r27693, %r27687; + xor.b32 %r27695, %r27694, %r27689; + shf.l.wrap.b32 %r27696, %r27695, %r27695, 25; + add.s32 %r27697, %r27649, %r26890; + add.s32 %r27698, %r27697, %r27640; + xor.b32 %r27699, %r27698, %r27665; + shf.l.wrap.b32 %r27700, %r27699, %r27699, 16; + add.s32 %r27701, %r27700, %r27680; + xor.b32 %r27702, %r27701, %r27640; + shf.l.wrap.b32 %r27703, %r27702, %r27702, 20; + add.s32 %r27704, %r27698, %r26835; + add.s32 %r27705, %r27704, %r27703; + xor.b32 %r27706, %r27705, %r27700; + shf.l.wrap.b32 %r27707, %r27706, %r27706, 24; + add.s32 %r27708, %r27707, %r27701; + xor.b32 %r27709, %r27708, %r27703; + shf.l.wrap.b32 %r27710, %r27709, %r27709, 25; + add.s32 %r27711, %r27663, %r26846; + add.s32 %r27712, %r27711, %r27654; + xor.b32 %r27713, %r27712, %r27679; + shf.l.wrap.b32 %r27714, %r27713, %r27713, 16; + add.s32 %r27715, %r27714, %r27638; + xor.b32 %r27716, %r27715, %r27654; + shf.l.wrap.b32 %r27717, %r27716, %r27716, 20; + add.s32 %r27718, %r27712, %r26934; + add.s32 %r27719, %r27718, %r27717; + xor.b32 %r27720, %r27719, %r27714; + shf.l.wrap.b32 %r27721, %r27720, %r27720, 24; + add.s32 %r27722, %r27721, %r27715; + xor.b32 %r27723, %r27722, %r27717; + shf.l.wrap.b32 %r27724, %r27723, %r27723, 25; + add.s32 %r27725, %r27677, %r26923; + add.s32 %r27726, %r27725, %r27668; + xor.b32 %r27727, %r27726, %r27637; + shf.l.wrap.b32 %r27728, %r27727, %r27727, 16; + add.s32 %r27729, %r27728, %r27652; + xor.b32 %r27730, %r27729, %r27668; + shf.l.wrap.b32 %r27731, %r27730, %r27730, 20; + add.s32 %r27732, %r27726, %r26901; + add.s32 %r27733, %r27732, %r27731; + xor.b32 %r27734, %r27733, %r27728; + shf.l.wrap.b32 %r27735, %r27734, %r27734, 24; + add.s32 %r27736, %r27735, %r27729; + xor.b32 %r27737, %r27736, %r27731; + shf.l.wrap.b32 %r27738, %r27737, %r27737, 25; + add.s32 %r27739, %r27691, %r26989; + add.s32 %r27740, %r27739, %r27710; + xor.b32 %r27741, %r27740, %r27735; + shf.l.wrap.b32 %r27742, %r27741, %r27741, 16; + add.s32 %r27743, %r27742, %r27722; + xor.b32 %r27744, %r27743, %r27710; + shf.l.wrap.b32 %r27745, %r27744, %r27744, 20; + add.s32 %r27746, %r27740, %r26945; + add.s32 %r27747, %r27746, %r27745; + xor.b32 %r27748, %r27747, %r27742; + shr.u32 %r27749, %r27748, 8; + shf.l.wrap.b32 %r27750, %r27748, %r27748, 24; + add.s32 %r27751, %r27750, %r27743; + xor.b32 %r27752, %r27751, %r27745; + shr.u32 %r27753, %r27752, 7; + shf.l.wrap.b32 %r27754, %r27752, %r27752, 25; + add.s32 %r27755, %r27705, %r26857; + add.s32 %r27756, %r27755, %r27724; + xor.b32 %r27757, %r27756, %r27693; + shf.l.wrap.b32 %r27758, %r27757, %r27757, 16; + add.s32 
%r27759, %r27758, %r27736; + xor.b32 %r27760, %r27759, %r27724; + shf.l.wrap.b32 %r27761, %r27760, %r27760, 20; + add.s32 %r27762, %r27756, %r26967; + add.s32 %r27763, %r27762, %r27761; + xor.b32 %r27764, %r27763, %r27758; + shr.u32 %r27765, %r27764, 8; + shf.l.wrap.b32 %r27766, %r27764, %r27764, 24; + add.s32 %r27767, %r27766, %r27759; + xor.b32 %r27768, %r27767, %r27761; + shr.u32 %r27769, %r27768, 7; + shf.l.wrap.b32 %r27770, %r27768, %r27768, 25; + add.s32 %r27771, %r27719, %r26868; + add.s32 %r27772, %r27771, %r27738; + xor.b32 %r27773, %r27772, %r27707; + shf.l.wrap.b32 %r27774, %r27773, %r27773, 16; + add.s32 %r27775, %r27774, %r27694; + xor.b32 %r27776, %r27775, %r27738; + shf.l.wrap.b32 %r27777, %r27776, %r27776, 20; + add.s32 %r27778, %r27772, %r26879; + add.s32 %r27779, %r27778, %r27777; + xor.b32 %r27780, %r27779, %r27774; + shr.u32 %r27781, %r27780, 8; + shf.l.wrap.b32 %r27782, %r27780, %r27780, 24; + add.s32 %r27783, %r27782, %r27775; + xor.b32 %r27784, %r27783, %r27777; + shr.u32 %r27785, %r27784, 7; + shf.l.wrap.b32 %r27786, %r27784, %r27784, 25; + add.s32 %r27787, %r27733, %r26912; + add.s32 %r27788, %r27787, %r27696; + xor.b32 %r27789, %r27788, %r27721; + shf.l.wrap.b32 %r27790, %r27789, %r27789, 16; + add.s32 %r27791, %r27790, %r27708; + xor.b32 %r27792, %r27791, %r27696; + shf.l.wrap.b32 %r27793, %r27792, %r27792, 20; + add.s32 %r27794, %r27788, %r26978; + add.s32 %r27795, %r27794, %r27793; + xor.b32 %r27796, %r27795, %r27790; + shr.u32 %r27797, %r27796, 8; + shf.l.wrap.b32 %r27798, %r27796, %r27796, 24; + add.s32 %r27799, %r27798, %r27791; + xor.b32 %r27800, %r27799, %r27793; + shr.u32 %r27801, %r27800, 7; + shf.l.wrap.b32 %r27802, %r27800, %r27800, 25; + xor.b32 %r27803, %r27783, %r27747; + xor.b32 %r27804, %r27799, %r27763; + xor.b32 %r27805, %r27751, %r27779; + xor.b32 %r27806, %r27767, %r27795; + xor.b32 %r27807, %r27802, %r27766; + xor.b32 %r27808, %r27754, %r27782; + xor.b32 %r27809, %r27770, %r27798; + xor.b32 %r27810, %r27786, %r27750; + cvt.u16.u32 %rs551, %r27783; + cvt.u16.u32 %rs552, %r27747; + xor.b16 %rs830, %rs551, %rs552; + shr.u32 %r27811, %r27803, 8; + cvt.u16.u32 %rs831, %r27811; + shr.u32 %r27812, %r27803, 16; + cvt.u16.u32 %rs832, %r27812; + shr.u32 %r27813, %r27803, 24; + cvt.u16.u32 %rs833, %r27813; + cvt.u16.u32 %rs553, %r27799; + cvt.u16.u32 %rs554, %r27763; + xor.b16 %rs834, %rs553, %rs554; + shr.u32 %r27814, %r27804, 8; + cvt.u16.u32 %rs835, %r27814; + shr.u32 %r27815, %r27804, 16; + cvt.u16.u32 %rs836, %r27815; + shr.u32 %r27816, %r27804, 24; + cvt.u16.u32 %rs837, %r27816; + cvt.u16.u32 %rs555, %r27779; + cvt.u16.u32 %rs556, %r27751; + xor.b16 %rs838, %rs556, %rs555; + shr.u32 %r27817, %r27805, 8; + cvt.u16.u32 %rs839, %r27817; + shr.u32 %r27818, %r27805, 16; + cvt.u16.u32 %rs840, %r27818; + shr.u32 %r27819, %r27805, 24; + cvt.u16.u32 %rs841, %r27819; + cvt.u16.u32 %rs557, %r27767; + cvt.u16.u32 %rs558, %r27795; + xor.b16 %rs842, %rs557, %rs558; + shr.u32 %r27820, %r27806, 8; + cvt.u16.u32 %rs843, %r27820; + shr.u32 %r27821, %r27806, 16; + cvt.u16.u32 %rs844, %r27821; + shr.u32 %r27822, %r27806, 24; + cvt.u16.u32 %rs845, %r27822; + cvt.u16.u32 %rs559, %r27801; + cvt.u16.u32 %rs560, %r27765; + xor.b16 %rs846, %rs559, %rs560; + shr.u32 %r27823, %r27807, 8; + cvt.u16.u32 %rs847, %r27823; + shr.u32 %r27824, %r27807, 16; + cvt.u16.u32 %rs848, %r27824; + shr.u32 %r27825, %r27807, 24; + cvt.u16.u32 %rs849, %r27825; + cvt.u16.u32 %rs561, %r27781; + cvt.u16.u32 %rs562, %r27753; + xor.b16 %rs850, %rs562, %rs561; + shr.u32 %r27826, %r27808, 8; + 
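+ // Feed-forward: xor the two 8-word halves of the working state (%r27747..%r27810)
+ // and unpack each 32-bit result into bytes %rs830-%rs861, which are carried into
+ // the next iteration as the chaining value.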
cvt.u16.u32 %rs851, %r27826;
+ shr.u32 %r27827, %r27808, 16;
+ cvt.u16.u32 %rs852, %r27827;
+ shr.u32 %r27828, %r27808, 24;
+ cvt.u16.u32 %rs853, %r27828;
+ cvt.u16.u32 %rs563, %r27797;
+ cvt.u16.u32 %rs564, %r27769;
+ xor.b16 %rs854, %rs564, %rs563;
+ shr.u32 %r27829, %r27809, 8;
+ cvt.u16.u32 %rs855, %r27829;
+ shr.u32 %r27830, %r27809, 16;
+ cvt.u16.u32 %rs856, %r27830;
+ shr.u32 %r27831, %r27809, 24;
+ cvt.u16.u32 %rs857, %r27831;
+ cvt.u16.u32 %rs565, %r27749;
+ cvt.u16.u32 %rs566, %r27785;
+ xor.b16 %rs858, %rs566, %rs565;
+ shr.u32 %r27832, %r27810, 8;
+ cvt.u16.u32 %rs859, %r27832;
+ shr.u32 %r27833, %r27810, 16;
+ cvt.u16.u32 %rs860, %r27833;
+ shr.u32 %r27834, %r27810, 24;
+ cvt.u16.u32 %rs861, %r27834;
+ setp.ne.s64 %p53, %rd1367, 0;
+ mov.u16 %rs862, 64;
+ mov.u16 %rs732, %rs863;
+ mov.u16 %rs766, %rs829;
+ mov.u16 %rs767, %rs828;
+ mov.u16 %rs768, %rs827;
+ mov.u16 %rs769, %rs826;
+ mov.u16 %rs770, %rs825;
+ mov.u16 %rs771, %rs824;
+ mov.u16 %rs772, %rs823;
+ mov.u16 %rs773, %rs822;
+ mov.u16 %rs774, %rs821;
+ mov.u16 %rs775, %rs820;
+ mov.u16 %rs776, %rs819;
+ mov.u16 %rs777, %rs818;
+ mov.u16 %rs778, %rs817;
+ mov.u16 %rs779, %rs816;
+ mov.u16 %rs780, %rs815;
+ mov.u16 %rs781, %rs814;
+ mov.u16 %rs782, %rs813;
+ mov.u16 %rs783, %rs812;
+ mov.u16 %rs784, %rs811;
+ mov.u16 %rs785, %rs810;
+ mov.u16 %rs786, %rs809;
+ mov.u16 %rs787, %rs808;
+ mov.u16 %rs788, %rs807;
+ mov.u16 %rs789, %rs806;
+ mov.u16 %rs790, %rs805;
+ mov.u16 %rs791, %rs804;
+ mov.u16 %rs792, %rs803;
+ mov.u16 %rs793, %rs802;
+ mov.u16 %rs794, %rs801;
+ mov.u16 %rs795, %rs800;
+ mov.u16 %rs796, %rs799;
+ mov.u16 %rs797, %rs798;
+ mov.u64 %rd1368, %rd1159;
+ mov.u32 %r31250, %r31265;
+ mov.u32 %r31251, %r31264;
+ mov.u32 %r31252, %r31263;
+ mov.u32 %r31253, %r31262;
+ mov.u32 %r31254, %r31261;
+ mov.u32 %r31255, %r31260;
+ mov.u32 %r31256, %r31259;
+ mov.u32 %r31257, %r31258;
+ @%p53 bra $L__BB2_93;
+
+$L__BB2_94:
+ cvt.u32.u16 %r27835, %rs798;
+ and.b32 %r27836, %r27835, 255;
+ cvt.u32.u16 %r27837, %rs799;
+ prmt.b32 %r27838, %r27837, %r27836, 30212;
+ cvt.u32.u16 %r27839, %rs800;
+ shl.b32 %r27840, %r27839, 16;
+ and.b32 %r27841, %r27840, 16711680;
+ or.b32 %r27842, %r27838, %r27841;
+ cvt.u32.u16 %r27843, %rs801;
+ shl.b32 %r27844, %r27843, 24;
+ or.b32 %r27845, %r27842, %r27844;
+ cvt.u32.u16 %r27846, %rs802;
+ and.b32 %r27847, %r27846, 255;
+ cvt.u32.u16 %r27848, %rs803;
+ prmt.b32 %r27849, %r27848, %r27847, 30212;
+ cvt.u32.u16 %r27850, %rs804;
+ shl.b32 %r27851, %r27850, 16;
+ and.b32 %r27852, %r27851, 16711680;
+ or.b32 %r27853, %r27849, %r27852;
+ cvt.u32.u16 %r27854, %rs805;
+ shl.b32 %r27855, %r27854, 24;
+ or.b32 %r27856, %r27853, %r27855;
+ cvt.u32.u16 %r27857, %rs806;
+ and.b32 %r27858, %r27857, 255;
+ cvt.u32.u16 %r27859, %rs807;
+ prmt.b32 %r27860, %r27859, %r27858, 30212;
+ cvt.u32.u16 %r27861, %rs808;
+ shl.b32 %r27862, %r27861, 16;
+ and.b32 %r27863, %r27862, 16711680;
+ or.b32 %r27864, %r27860, %r27863;
+ cvt.u32.u16 %r27865, %rs809;
+ shl.b32 %r27866, %r27865, 24;
+ or.b32 %r27867, %r27864, %r27866;
+ cvt.u32.u16 %r27868, %rs810;
+ and.b32 %r27869, %r27868, 255;
+ cvt.u32.u16 %r27870, %rs811;
+ prmt.b32 %r27871, %r27870, %r27869, 30212;
+ cvt.u32.u16 %r27872, %rs812;
+ shl.b32 %r27873, %r27872, 16;
+ and.b32 %r27874, %r27873, 16711680;
+ or.b32 %r27875, %r27871, %r27874;
+ cvt.u32.u16 %r27876, %rs813;
+ shl.b32 %r27877, %r27876, 24;
+ or.b32 %r27878, %r27875, %r27877;
+ cvt.u32.u16 %r27879, %rs814;
+ and.b32 %r27880, %r27879, 255;
+ cvt.u32.u16 %r27881, %rs815;
+ prmt.b32 %r27882, %r27881,
%r27880, 30212; + cvt.u32.u16 %r27883, %rs816; + shl.b32 %r27884, %r27883, 16; + and.b32 %r27885, %r27884, 16711680; + or.b32 %r27886, %r27882, %r27885; + cvt.u32.u16 %r27887, %rs817; + shl.b32 %r27888, %r27887, 24; + or.b32 %r27889, %r27886, %r27888; + cvt.u32.u16 %r27890, %rs818; + and.b32 %r27891, %r27890, 255; + cvt.u32.u16 %r27892, %rs819; + prmt.b32 %r27893, %r27892, %r27891, 30212; + cvt.u32.u16 %r27894, %rs820; + shl.b32 %r27895, %r27894, 16; + and.b32 %r27896, %r27895, 16711680; + or.b32 %r27897, %r27893, %r27896; + cvt.u32.u16 %r27898, %rs821; + shl.b32 %r27899, %r27898, 24; + or.b32 %r27900, %r27897, %r27899; + cvt.u32.u16 %r27901, %rs822; + and.b32 %r27902, %r27901, 255; + cvt.u32.u16 %r27903, %rs823; + prmt.b32 %r27904, %r27903, %r27902, 30212; + cvt.u32.u16 %r27905, %rs824; + shl.b32 %r27906, %r27905, 16; + and.b32 %r27907, %r27906, 16711680; + or.b32 %r27908, %r27904, %r27907; + cvt.u32.u16 %r27909, %rs825; + shl.b32 %r27910, %r27909, 24; + or.b32 %r27911, %r27908, %r27910; + cvt.u32.u16 %r27912, %rs826; + and.b32 %r27913, %r27912, 255; + cvt.u32.u16 %r27914, %rs827; + prmt.b32 %r27915, %r27914, %r27913, 30212; + cvt.u32.u16 %r27916, %rs828; + shl.b32 %r27917, %r27916, 16; + and.b32 %r27918, %r27917, 16711680; + or.b32 %r27919, %r27915, %r27918; + cvt.u32.u16 %r27920, %rs829; + shl.b32 %r27921, %r27920, 24; + or.b32 %r27922, %r27919, %r27921; + cvt.u32.u16 %r27923, %rs830; + and.b32 %r27924, %r27923, 255; + cvt.u32.u16 %r27925, %rs831; + prmt.b32 %r27926, %r27925, %r27924, 30212; + cvt.u32.u16 %r27927, %rs832; + shl.b32 %r27928, %r27927, 16; + and.b32 %r27929, %r27928, 16711680; + or.b32 %r27930, %r27926, %r27929; + cvt.u32.u16 %r27931, %rs833; + shl.b32 %r27932, %r27931, 24; + or.b32 %r27933, %r27930, %r27932; + cvt.u32.u16 %r27934, %rs834; + and.b32 %r27935, %r27934, 255; + cvt.u32.u16 %r27936, %rs835; + prmt.b32 %r27937, %r27936, %r27935, 30212; + cvt.u32.u16 %r27938, %rs836; + shl.b32 %r27939, %r27938, 16; + and.b32 %r27940, %r27939, 16711680; + or.b32 %r27941, %r27937, %r27940; + cvt.u32.u16 %r27942, %rs837; + shl.b32 %r27943, %r27942, 24; + or.b32 %r27944, %r27941, %r27943; + cvt.u32.u16 %r27945, %rs838; + and.b32 %r27946, %r27945, 255; + cvt.u32.u16 %r27947, %rs839; + prmt.b32 %r27948, %r27947, %r27946, 30212; + cvt.u32.u16 %r27949, %rs840; + shl.b32 %r27950, %r27949, 16; + and.b32 %r27951, %r27950, 16711680; + or.b32 %r27952, %r27948, %r27951; + cvt.u32.u16 %r27953, %rs841; + shl.b32 %r27954, %r27953, 24; + or.b32 %r27955, %r27952, %r27954; + cvt.u32.u16 %r27956, %rs842; + and.b32 %r27957, %r27956, 255; + cvt.u32.u16 %r27958, %rs843; + prmt.b32 %r27959, %r27958, %r27957, 30212; + cvt.u32.u16 %r27960, %rs844; + shl.b32 %r27961, %r27960, 16; + and.b32 %r27962, %r27961, 16711680; + or.b32 %r27963, %r27959, %r27962; + cvt.u32.u16 %r27964, %rs845; + shl.b32 %r27965, %r27964, 24; + or.b32 %r27966, %r27963, %r27965; + cvt.u32.u16 %r27967, %rs846; + and.b32 %r27968, %r27967, 255; + cvt.u32.u16 %r27969, %rs847; + prmt.b32 %r27970, %r27969, %r27968, 30212; + cvt.u32.u16 %r27971, %rs848; + shl.b32 %r27972, %r27971, 16; + and.b32 %r27973, %r27972, 16711680; + or.b32 %r27974, %r27970, %r27973; + cvt.u32.u16 %r27975, %rs849; + shl.b32 %r27976, %r27975, 24; + or.b32 %r27977, %r27974, %r27976; + cvt.u32.u16 %r27978, %rs850; + and.b32 %r27979, %r27978, 255; + cvt.u32.u16 %r27980, %rs851; + prmt.b32 %r27981, %r27980, %r27979, 30212; + cvt.u32.u16 %r27982, %rs852; + shl.b32 %r27983, %r27982, 16; + and.b32 %r27984, %r27983, 16711680; + or.b32 %r27985, %r27981, %r27984; + cvt.u32.u16 
%r27986, %rs853; + shl.b32 %r27987, %r27986, 24; + or.b32 %r27988, %r27985, %r27987; + cvt.u32.u16 %r27989, %rs854; + and.b32 %r27990, %r27989, 255; + cvt.u32.u16 %r27991, %rs855; + prmt.b32 %r27992, %r27991, %r27990, 30212; + cvt.u32.u16 %r27993, %rs856; + shl.b32 %r27994, %r27993, 16; + and.b32 %r27995, %r27994, 16711680; + or.b32 %r27996, %r27992, %r27995; + cvt.u32.u16 %r27997, %rs857; + shl.b32 %r27998, %r27997, 24; + or.b32 %r27999, %r27996, %r27998; + cvt.u32.u16 %r28000, %rs858; + and.b32 %r28001, %r28000, 255; + cvt.u32.u16 %r28002, %rs859; + prmt.b32 %r28003, %r28002, %r28001, 30212; + cvt.u32.u16 %r28004, %rs860; + shl.b32 %r28005, %r28004, 16; + and.b32 %r28006, %r28005, 16711680; + or.b32 %r28007, %r28003, %r28006; + cvt.u32.u16 %r28008, %rs861; + shl.b32 %r28009, %r28008, 24; + or.b32 %r28010, %r28007, %r28009; + or.b16 %rs567, %rs863, 8; + cvt.u32.u16 %r28011, %rs567; + and.b32 %r28012, %r28011, 255; + add.s32 %r28013, %r31262, %r31258; + add.s32 %r28014, %r28013, %r27845; + add.s32 %r28015, %r27856, %r28014; + add.s32 %r28016, %r31263, %r31259; + add.s32 %r28017, %r28016, %r27867; + add.s32 %r28018, %r27878, %r28017; + add.s32 %r28019, %r31264, %r31260; + add.s32 %r28020, %r28019, %r27889; + cvt.u32.u16 %r28021, %rs862; + and.b32 %r28022, %r28021, 255; + xor.b32 %r28023, %r28020, %r28022; + shr.u32 %r28024, %r28020, 16; + shl.b32 %r28025, %r28023, 16; + or.b32 %r28026, %r28025, %r28024; + add.s32 %r28027, %r28026, 1013904242; + xor.b32 %r28028, %r28027, %r31264; + shf.l.wrap.b32 %r28029, %r28028, %r28028, 20; + add.s32 %r28030, %r27900, %r28020; + add.s32 %r28031, %r28030, %r28029; + xor.b32 %r28032, %r28031, %r28026; + shf.l.wrap.b32 %r28033, %r28032, %r28032, 24; + add.s32 %r28034, %r28033, %r28027; + xor.b32 %r28035, %r28034, %r28029; + shf.l.wrap.b32 %r28036, %r28035, %r28035, 25; + add.s32 %r28037, %r31265, %r31261; + add.s32 %r28038, %r28037, %r27911; + xor.b32 %r28039, %r28038, %r28012; + shr.u32 %r28040, %r28038, 16; + shl.b32 %r28041, %r28039, 16; + or.b32 %r28042, %r28041, %r28040; + add.s32 %r28043, %r28042, -1521486534; + xor.b32 %r28044, %r28043, %r31265; + shf.l.wrap.b32 %r28045, %r28044, %r28044, 20; + add.s32 %r28046, %r27922, %r28038; + add.s32 %r28047, %r28046, %r28045; + xor.b32 %r28048, %r28047, %r28042; + shf.l.wrap.b32 %r28049, %r28048, %r28048, 24; + add.s32 %r28050, %r28049, %r28043; + xor.b32 %r28051, %r28050, %r28045; + shf.l.wrap.b32 %r28052, %r28051, %r28051, 25; + add.s32 %r28053, %r28036, %r27955; + add.s32 %r28054, %r28031, %r27977; + add.s32 %r28055, %r28054, %r28052; + add.s32 %r28056, %r28055, %r27988; + add.s32 %r28057, %r28047, %r27999; + shf.l.wrap.b32 %r28058, %r28014, %r28014, 16; + add.s32 %r28059, %r28058, 1779033703; + xor.b32 %r28060, %r28059, %r31262; + shf.l.wrap.b32 %r28061, %r28060, %r28060, 20; + add.s32 %r28062, %r28015, %r28061; + xor.b32 %r28063, %r28062, %r28058; + shf.l.wrap.b32 %r28064, %r28063, %r28063, 24; + add.s32 %r28065, %r28064, %r28059; + xor.b32 %r28066, %r28065, %r28061; + shf.l.wrap.b32 %r28067, %r28066, %r28066, 25; + shf.l.wrap.b32 %r28068, %r28017, %r28017, 16; + add.s32 %r28069, %r28068, -1150833019; + xor.b32 %r28070, %r28069, %r31263; + shf.l.wrap.b32 %r28071, %r28070, %r28070, 20; + add.s32 %r28072, %r28018, %r28071; + xor.b32 %r28073, %r28072, %r28068; + shf.l.wrap.b32 %r28074, %r28073, %r28073, 24; + add.s32 %r28075, %r28074, %r28069; + xor.b32 %r28076, %r28075, %r28071; + shf.l.wrap.b32 %r28077, %r28076, %r28076, 25; + add.s32 %r28078, %r28062, %r27933; + add.s32 %r28079, %r28078, %r28077; + xor.b32 
%r28080, %r28079, %r28049; + shf.l.wrap.b32 %r28081, %r28080, %r28080, 16; + add.s32 %r28082, %r28081, %r28034; + xor.b32 %r28083, %r28082, %r28077; + shf.l.wrap.b32 %r28084, %r28083, %r28083, 20; + add.s32 %r28085, %r28079, %r27944; + add.s32 %r28086, %r28085, %r28084; + xor.b32 %r28087, %r28086, %r28081; + shf.l.wrap.b32 %r28088, %r28087, %r28087, 24; + add.s32 %r28089, %r28088, %r28082; + xor.b32 %r28090, %r28089, %r28084; + shf.l.wrap.b32 %r28091, %r28090, %r28090, 25; + add.s32 %r28092, %r28053, %r28072; + xor.b32 %r28093, %r28064, %r28092; + shf.l.wrap.b32 %r28094, %r28093, %r28093, 16; + add.s32 %r28095, %r28094, %r28050; + xor.b32 %r28096, %r28095, %r28036; + shf.l.wrap.b32 %r28097, %r28096, %r28096, 20; + add.s32 %r28098, %r28092, %r27966; + add.s32 %r28099, %r28098, %r28097; + xor.b32 %r28100, %r28099, %r28094; + shf.l.wrap.b32 %r28101, %r28100, %r28100, 24; + add.s32 %r28102, %r28101, %r28095; + xor.b32 %r28103, %r28102, %r28097; + shf.l.wrap.b32 %r28104, %r28103, %r28103, 25; + xor.b32 %r28105, %r28074, %r28055; + shf.l.wrap.b32 %r28106, %r28105, %r28105, 16; + add.s32 %r28107, %r28106, %r28065; + xor.b32 %r28108, %r28107, %r28052; + shf.l.wrap.b32 %r28109, %r28108, %r28108, 20; + add.s32 %r28110, %r28056, %r28109; + xor.b32 %r28111, %r28110, %r28106; + shf.l.wrap.b32 %r28112, %r28111, %r28111, 24; + add.s32 %r28113, %r28112, %r28107; + xor.b32 %r28114, %r28113, %r28109; + shf.l.wrap.b32 %r28115, %r28114, %r28114, 25; + add.s32 %r28116, %r28057, %r28067; + xor.b32 %r28117, %r28116, %r28033; + shf.l.wrap.b32 %r28118, %r28117, %r28117, 16; + add.s32 %r28119, %r28118, %r28075; + xor.b32 %r28120, %r28119, %r28067; + shf.l.wrap.b32 %r28121, %r28120, %r28120, 20; + add.s32 %r28122, %r28116, %r28010; + add.s32 %r28123, %r28122, %r28121; + xor.b32 %r28124, %r28123, %r28118; + shf.l.wrap.b32 %r28125, %r28124, %r28124, 24; + add.s32 %r28126, %r28125, %r28119; + xor.b32 %r28127, %r28126, %r28121; + shf.l.wrap.b32 %r28128, %r28127, %r28127, 25; + add.s32 %r28129, %r28086, %r27867; + add.s32 %r28130, %r28129, %r28128; + xor.b32 %r28131, %r28130, %r28101; + shf.l.wrap.b32 %r28132, %r28131, %r28131, 16; + add.s32 %r28133, %r28132, %r28113; + xor.b32 %r28134, %r28133, %r28128; + shf.l.wrap.b32 %r28135, %r28134, %r28134, 20; + add.s32 %r28136, %r28130, %r27911; + add.s32 %r28137, %r28136, %r28135; + xor.b32 %r28138, %r28137, %r28132; + shf.l.wrap.b32 %r28139, %r28138, %r28138, 24; + add.s32 %r28140, %r28139, %r28133; + xor.b32 %r28141, %r28140, %r28135; + shf.l.wrap.b32 %r28142, %r28141, %r28141, 25; + add.s32 %r28143, %r28099, %r27878; + add.s32 %r28144, %r28143, %r28091; + xor.b32 %r28145, %r28144, %r28112; + shf.l.wrap.b32 %r28146, %r28145, %r28145, 16; + add.s32 %r28147, %r28146, %r28126; + xor.b32 %r28148, %r28147, %r28091; + shf.l.wrap.b32 %r28149, %r28148, %r28148, 20; + add.s32 %r28150, %r28144, %r27955; + add.s32 %r28151, %r28150, %r28149; + xor.b32 %r28152, %r28151, %r28146; + shf.l.wrap.b32 %r28153, %r28152, %r28152, 24; + add.s32 %r28154, %r28153, %r28147; + xor.b32 %r28155, %r28154, %r28149; + shf.l.wrap.b32 %r28156, %r28155, %r28155, 25; + add.s32 %r28157, %r28110, %r27922; + add.s32 %r28158, %r28157, %r28104; + xor.b32 %r28159, %r28125, %r28158; + shf.l.wrap.b32 %r28160, %r28159, %r28159, 16; + add.s32 %r28161, %r28160, %r28089; + xor.b32 %r28162, %r28161, %r28104; + shf.l.wrap.b32 %r28163, %r28162, %r28162, 20; + add.s32 %r28164, %r28158, %r27845; + add.s32 %r28165, %r28164, %r28163; + xor.b32 %r28166, %r28165, %r28160; + shf.l.wrap.b32 %r28167, %r28166, %r28166, 24; + add.s32 
%r28168, %r28167, %r28161; + xor.b32 %r28169, %r28168, %r28163; + shf.l.wrap.b32 %r28170, %r28169, %r28169, 25; + add.s32 %r28171, %r28123, %r27889; + add.s32 %r28172, %r28171, %r28115; + xor.b32 %r28173, %r28088, %r28172; + shf.l.wrap.b32 %r28174, %r28173, %r28173, 16; + add.s32 %r28175, %r28174, %r28102; + xor.b32 %r28176, %r28175, %r28115; + shf.l.wrap.b32 %r28177, %r28176, %r28176, 20; + add.s32 %r28178, %r28172, %r27988; + add.s32 %r28179, %r28178, %r28177; + xor.b32 %r28180, %r28179, %r28174; + shf.l.wrap.b32 %r28181, %r28180, %r28180, 24; + add.s32 %r28182, %r28181, %r28175; + xor.b32 %r28183, %r28182, %r28177; + shf.l.wrap.b32 %r28184, %r28183, %r28183, 25; + add.s32 %r28185, %r28137, %r27856; + add.s32 %r28186, %r28185, %r28156; + xor.b32 %r28187, %r28186, %r28181; + shf.l.wrap.b32 %r28188, %r28187, %r28187, 16; + add.s32 %r28189, %r28188, %r28168; + xor.b32 %r28190, %r28189, %r28156; + shf.l.wrap.b32 %r28191, %r28190, %r28190, 20; + add.s32 %r28192, %r28186, %r27966; + add.s32 %r28193, %r28192, %r28191; + xor.b32 %r28194, %r28193, %r28188; + shf.l.wrap.b32 %r28195, %r28194, %r28194, 24; + add.s32 %r28196, %r28195, %r28189; + xor.b32 %r28197, %r28196, %r28191; + shf.l.wrap.b32 %r28198, %r28197, %r28197, 25; + add.s32 %r28199, %r28170, %r27977; + add.s32 %r28200, %r28199, %r28151; + xor.b32 %r28201, %r28139, %r28200; + shf.l.wrap.b32 %r28202, %r28201, %r28201, 16; + add.s32 %r28203, %r28202, %r28182; + xor.b32 %r28204, %r28203, %r28170; + shf.l.wrap.b32 %r28205, %r28204, %r28204, 20; + add.s32 %r28206, %r28200, %r27900; + add.s32 %r28207, %r28206, %r28205; + xor.b32 %r28208, %r28207, %r28202; + shf.l.wrap.b32 %r28209, %r28208, %r28208, 24; + add.s32 %r28210, %r28209, %r28203; + xor.b32 %r28211, %r28210, %r28205; + shf.l.wrap.b32 %r28212, %r28211, %r28211, 25; + add.s32 %r28213, %r28165, %r27944; + add.s32 %r28214, %r28213, %r28184; + xor.b32 %r28215, %r28153, %r28214; + shf.l.wrap.b32 %r28216, %r28215, %r28215, 16; + add.s32 %r28217, %r28216, %r28140; + xor.b32 %r28218, %r28217, %r28184; + shf.l.wrap.b32 %r28219, %r28218, %r28218, 20; + add.s32 %r28220, %r28214, %r27999; + add.s32 %r28221, %r28220, %r28219; + xor.b32 %r28222, %r28221, %r28216; + shf.l.wrap.b32 %r28223, %r28222, %r28222, 24; + add.s32 %r28224, %r28223, %r28217; + xor.b32 %r28225, %r28224, %r28219; + shf.l.wrap.b32 %r28226, %r28225, %r28225, 25; + add.s32 %r28227, %r28179, %r28010; + add.s32 %r28228, %r28227, %r28142; + xor.b32 %r28229, %r28228, %r28167; + shf.l.wrap.b32 %r28230, %r28229, %r28229, 16; + add.s32 %r28231, %r28230, %r28154; + xor.b32 %r28232, %r28231, %r28142; + shf.l.wrap.b32 %r28233, %r28232, %r28232, 20; + add.s32 %r28234, %r28228, %r27933; + add.s32 %r28235, %r28234, %r28233; + xor.b32 %r28236, %r28235, %r28230; + shf.l.wrap.b32 %r28237, %r28236, %r28236, 24; + add.s32 %r28238, %r28237, %r28231; + xor.b32 %r28239, %r28238, %r28233; + shf.l.wrap.b32 %r28240, %r28239, %r28239, 25; + add.s32 %r28241, %r28193, %r27878; + add.s32 %r28242, %r28241, %r28240; + xor.b32 %r28243, %r28242, %r28209; + shf.l.wrap.b32 %r28244, %r28243, %r28243, 16; + add.s32 %r28245, %r28244, %r28224; + xor.b32 %r28246, %r28245, %r28240; + shf.l.wrap.b32 %r28247, %r28246, %r28246, 20; + add.s32 %r28248, %r28242, %r27889; + add.s32 %r28249, %r28248, %r28247; + xor.b32 %r28250, %r28249, %r28244; + shf.l.wrap.b32 %r28251, %r28250, %r28250, 24; + add.s32 %r28252, %r28251, %r28245; + xor.b32 %r28253, %r28252, %r28247; + shf.l.wrap.b32 %r28254, %r28253, %r28253, 25; + add.s32 %r28255, %r28207, %r27955; + add.s32 %r28256, %r28255, 
%r28198; + xor.b32 %r28257, %r28256, %r28223; + shf.l.wrap.b32 %r28258, %r28257, %r28257, 16; + add.s32 %r28259, %r28258, %r28238; + xor.b32 %r28260, %r28259, %r28198; + shf.l.wrap.b32 %r28261, %r28260, %r28260, 20; + add.s32 %r28262, %r28256, %r27977; + add.s32 %r28263, %r28262, %r28261; + xor.b32 %r28264, %r28263, %r28258; + shf.l.wrap.b32 %r28265, %r28264, %r28264, 24; + add.s32 %r28266, %r28265, %r28259; + xor.b32 %r28267, %r28266, %r28261; + shf.l.wrap.b32 %r28268, %r28267, %r28267, 25; + add.s32 %r28269, %r28221, %r27988; + add.s32 %r28270, %r28269, %r28212; + xor.b32 %r28271, %r28237, %r28270; + shf.l.wrap.b32 %r28272, %r28271, %r28271, 16; + add.s32 %r28273, %r28272, %r28196; + xor.b32 %r28274, %r28273, %r28212; + shf.l.wrap.b32 %r28275, %r28274, %r28274, 20; + add.s32 %r28276, %r28270, %r27867; + add.s32 %r28277, %r28276, %r28275; + xor.b32 %r28278, %r28277, %r28272; + shf.l.wrap.b32 %r28279, %r28278, %r28278, 24; + add.s32 %r28280, %r28279, %r28273; + xor.b32 %r28281, %r28280, %r28275; + shf.l.wrap.b32 %r28282, %r28281, %r28281, 25; + add.s32 %r28283, %r28235, %r27922; + add.s32 %r28284, %r28283, %r28226; + xor.b32 %r28285, %r28195, %r28284; + shf.l.wrap.b32 %r28286, %r28285, %r28285, 16; + add.s32 %r28287, %r28286, %r28210; + xor.b32 %r28288, %r28287, %r28226; + shf.l.wrap.b32 %r28289, %r28288, %r28288, 20; + add.s32 %r28290, %r28284, %r27999; + add.s32 %r28291, %r28290, %r28289; + xor.b32 %r28292, %r28291, %r28286; + shf.l.wrap.b32 %r28293, %r28292, %r28292, 24; + add.s32 %r28294, %r28293, %r28287; + xor.b32 %r28295, %r28294, %r28289; + shf.l.wrap.b32 %r28296, %r28295, %r28295, 25; + add.s32 %r28297, %r28249, %r27911; + add.s32 %r28298, %r28297, %r28268; + xor.b32 %r28299, %r28298, %r28293; + shf.l.wrap.b32 %r28300, %r28299, %r28299, 16; + add.s32 %r28301, %r28300, %r28280; + xor.b32 %r28302, %r28301, %r28268; + shf.l.wrap.b32 %r28303, %r28302, %r28302, 20; + add.s32 %r28304, %r28298, %r27900; + add.s32 %r28305, %r28304, %r28303; + xor.b32 %r28306, %r28305, %r28300; + shf.l.wrap.b32 %r28307, %r28306, %r28306, 24; + add.s32 %r28308, %r28307, %r28301; + xor.b32 %r28309, %r28308, %r28303; + shf.l.wrap.b32 %r28310, %r28309, %r28309, 25; + add.s32 %r28311, %r28282, %r27944; + add.s32 %r28312, %r28311, %r28263; + xor.b32 %r28313, %r28251, %r28312; + shf.l.wrap.b32 %r28314, %r28313, %r28313, 16; + add.s32 %r28315, %r28314, %r28294; + xor.b32 %r28316, %r28315, %r28282; + shf.l.wrap.b32 %r28317, %r28316, %r28316, 20; + add.s32 %r28318, %r28312, %r27845; + add.s32 %r28319, %r28318, %r28317; + xor.b32 %r28320, %r28319, %r28314; + shf.l.wrap.b32 %r28321, %r28320, %r28320, 24; + add.s32 %r28322, %r28321, %r28315; + xor.b32 %r28323, %r28322, %r28317; + shf.l.wrap.b32 %r28324, %r28323, %r28323, 25; + add.s32 %r28325, %r28277, %r27966; + add.s32 %r28326, %r28325, %r28296; + xor.b32 %r28327, %r28265, %r28326; + shf.l.wrap.b32 %r28328, %r28327, %r28327, 16; + add.s32 %r28329, %r28328, %r28252; + xor.b32 %r28330, %r28329, %r28296; + shf.l.wrap.b32 %r28331, %r28330, %r28330, 20; + add.s32 %r28332, %r28326, %r28010; + add.s32 %r28333, %r28332, %r28331; + xor.b32 %r28334, %r28333, %r28328; + shf.l.wrap.b32 %r28335, %r28334, %r28334, 24; + add.s32 %r28336, %r28335, %r28329; + xor.b32 %r28337, %r28336, %r28331; + shf.l.wrap.b32 %r28338, %r28337, %r28337, 25; + add.s32 %r28339, %r28291, %r27933; + add.s32 %r28340, %r28339, %r28254; + xor.b32 %r28341, %r28340, %r28279; + shf.l.wrap.b32 %r28342, %r28341, %r28341, 16; + add.s32 %r28343, %r28342, %r28266; + xor.b32 %r28344, %r28343, %r28254; + 
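+ // Final compression (label $L__BB2_94, reached when %rd1367 == 0): the same
+ // rounds over the last block; its flag byte was or-ed with 8 above
+ // (or.b16 %rs567, %rs863, 8) before being mixed into the state.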
shf.l.wrap.b32 %r28345, %r28344, %r28344, 20; + add.s32 %r28346, %r28340, %r27856; + add.s32 %r28347, %r28346, %r28345; + xor.b32 %r28348, %r28347, %r28342; + shf.l.wrap.b32 %r28349, %r28348, %r28348, 24; + add.s32 %r28350, %r28349, %r28343; + xor.b32 %r28351, %r28350, %r28345; + shf.l.wrap.b32 %r28352, %r28351, %r28351, 25; + add.s32 %r28353, %r28305, %r27955; + add.s32 %r28354, %r28353, %r28352; + xor.b32 %r28355, %r28354, %r28321; + shf.l.wrap.b32 %r28356, %r28355, %r28355, 16; + add.s32 %r28357, %r28356, %r28336; + xor.b32 %r28358, %r28357, %r28352; + shf.l.wrap.b32 %r28359, %r28358, %r28358, 20; + add.s32 %r28360, %r28354, %r27922; + add.s32 %r28361, %r28360, %r28359; + xor.b32 %r28362, %r28361, %r28356; + shf.l.wrap.b32 %r28363, %r28362, %r28362, 24; + add.s32 %r28364, %r28363, %r28357; + xor.b32 %r28365, %r28364, %r28359; + shf.l.wrap.b32 %r28366, %r28365, %r28365, 25; + add.s32 %r28367, %r28319, %r27977; + add.s32 %r28368, %r28367, %r28310; + xor.b32 %r28369, %r28368, %r28335; + shf.l.wrap.b32 %r28370, %r28369, %r28369, 16; + add.s32 %r28371, %r28370, %r28350; + xor.b32 %r28372, %r28371, %r28310; + shf.l.wrap.b32 %r28373, %r28372, %r28372, 20; + add.s32 %r28374, %r28368, %r27944; + add.s32 %r28375, %r28374, %r28373; + xor.b32 %r28376, %r28375, %r28370; + shf.l.wrap.b32 %r28377, %r28376, %r28376, 24; + add.s32 %r28378, %r28377, %r28371; + xor.b32 %r28379, %r28378, %r28373; + shf.l.wrap.b32 %r28380, %r28379, %r28379, 25; + add.s32 %r28381, %r28333, %r27999; + add.s32 %r28382, %r28381, %r28324; + xor.b32 %r28383, %r28349, %r28382; + shf.l.wrap.b32 %r28384, %r28383, %r28383, 16; + add.s32 %r28385, %r28384, %r28308; + xor.b32 %r28386, %r28385, %r28324; + shf.l.wrap.b32 %r28387, %r28386, %r28386, 20; + add.s32 %r28388, %r28382, %r27878; + add.s32 %r28389, %r28388, %r28387; + xor.b32 %r28390, %r28389, %r28384; + shf.l.wrap.b32 %r28391, %r28390, %r28390, 24; + add.s32 %r28392, %r28391, %r28385; + xor.b32 %r28393, %r28392, %r28387; + shf.l.wrap.b32 %r28394, %r28393, %r28393, 25; + add.s32 %r28395, %r28347, %r27988; + add.s32 %r28396, %r28395, %r28338; + xor.b32 %r28397, %r28307, %r28396; + shf.l.wrap.b32 %r28398, %r28397, %r28397, 16; + add.s32 %r28399, %r28398, %r28322; + xor.b32 %r28400, %r28399, %r28338; + shf.l.wrap.b32 %r28401, %r28400, %r28400, 20; + add.s32 %r28402, %r28396, %r28010; + add.s32 %r28403, %r28402, %r28401; + xor.b32 %r28404, %r28403, %r28398; + shf.l.wrap.b32 %r28405, %r28404, %r28404, 24; + add.s32 %r28406, %r28405, %r28399; + xor.b32 %r28407, %r28406, %r28401; + shf.l.wrap.b32 %r28408, %r28407, %r28407, 25; + add.s32 %r28409, %r28361, %r27889; + add.s32 %r28410, %r28409, %r28380; + xor.b32 %r28411, %r28410, %r28405; + shf.l.wrap.b32 %r28412, %r28411, %r28411, 16; + add.s32 %r28413, %r28412, %r28392; + xor.b32 %r28414, %r28413, %r28380; + shf.l.wrap.b32 %r28415, %r28414, %r28414, 20; + add.s32 %r28416, %r28410, %r27845; + add.s32 %r28417, %r28416, %r28415; + xor.b32 %r28418, %r28417, %r28412; + shf.l.wrap.b32 %r28419, %r28418, %r28418, 24; + add.s32 %r28420, %r28419, %r28413; + xor.b32 %r28421, %r28420, %r28415; + shf.l.wrap.b32 %r28422, %r28421, %r28421, 25; + add.s32 %r28423, %r28394, %r27966; + add.s32 %r28424, %r28423, %r28375; + xor.b32 %r28425, %r28363, %r28424; + shf.l.wrap.b32 %r28426, %r28425, %r28425, 16; + add.s32 %r28427, %r28426, %r28406; + xor.b32 %r28428, %r28427, %r28394; + shf.l.wrap.b32 %r28429, %r28428, %r28428, 20; + add.s32 %r28430, %r28424, %r27867; + add.s32 %r28431, %r28430, %r28429; + xor.b32 %r28432, %r28431, %r28426; + shf.l.wrap.b32 %r28433, 
%r28432, %r28432, 24; + add.s32 %r28434, %r28433, %r28427; + xor.b32 %r28435, %r28434, %r28429; + shf.l.wrap.b32 %r28436, %r28435, %r28435, 25; + add.s32 %r28437, %r28389, %r27900; + add.s32 %r28438, %r28437, %r28408; + xor.b32 %r28439, %r28377, %r28438; + shf.l.wrap.b32 %r28440, %r28439, %r28439, 16; + add.s32 %r28441, %r28440, %r28364; + xor.b32 %r28442, %r28441, %r28408; + shf.l.wrap.b32 %r28443, %r28442, %r28442, 20; + add.s32 %r28444, %r28438, %r27933; + add.s32 %r28445, %r28444, %r28443; + xor.b32 %r28446, %r28445, %r28440; + shf.l.wrap.b32 %r28447, %r28446, %r28446, 24; + add.s32 %r28448, %r28447, %r28441; + xor.b32 %r28449, %r28448, %r28443; + shf.l.wrap.b32 %r28450, %r28449, %r28449, 25; + add.s32 %r28451, %r28403, %r27856; + add.s32 %r28452, %r28451, %r28366; + xor.b32 %r28453, %r28452, %r28391; + shf.l.wrap.b32 %r28454, %r28453, %r28453, 16; + add.s32 %r28455, %r28454, %r28378; + xor.b32 %r28456, %r28455, %r28366; + shf.l.wrap.b32 %r28457, %r28456, %r28456, 20; + add.s32 %r28458, %r28452, %r27911; + add.s32 %r28459, %r28458, %r28457; + xor.b32 %r28460, %r28459, %r28454; + shf.l.wrap.b32 %r28461, %r28460, %r28460, 24; + add.s32 %r28462, %r28461, %r28455; + xor.b32 %r28463, %r28462, %r28457; + shf.l.wrap.b32 %r28464, %r28463, %r28463, 25; + add.s32 %r28465, %r28417, %r27977; + add.s32 %r28466, %r28465, %r28464; + xor.b32 %r28467, %r28466, %r28433; + shf.l.wrap.b32 %r28468, %r28467, %r28467, 16; + add.s32 %r28469, %r28468, %r28448; + xor.b32 %r28470, %r28469, %r28464; + shf.l.wrap.b32 %r28471, %r28470, %r28470, 20; + add.s32 %r28472, %r28466, %r27988; + add.s32 %r28473, %r28472, %r28471; + xor.b32 %r28474, %r28473, %r28468; + shf.l.wrap.b32 %r28475, %r28474, %r28474, 24; + add.s32 %r28476, %r28475, %r28469; + xor.b32 %r28477, %r28476, %r28471; + shf.l.wrap.b32 %r28478, %r28477, %r28477, 25; + add.s32 %r28479, %r28431, %r27944; + add.s32 %r28480, %r28479, %r28422; + xor.b32 %r28481, %r28480, %r28447; + shf.l.wrap.b32 %r28482, %r28481, %r28481, 16; + add.s32 %r28483, %r28482, %r28462; + xor.b32 %r28484, %r28483, %r28422; + shf.l.wrap.b32 %r28485, %r28484, %r28484, 20; + add.s32 %r28486, %r28480, %r27966; + add.s32 %r28487, %r28486, %r28485; + xor.b32 %r28488, %r28487, %r28482; + shf.l.wrap.b32 %r28489, %r28488, %r28488, 24; + add.s32 %r28490, %r28489, %r28483; + xor.b32 %r28491, %r28490, %r28485; + shf.l.wrap.b32 %r28492, %r28491, %r28491, 25; + add.s32 %r28493, %r28445, %r28010; + add.s32 %r28494, %r28493, %r28436; + xor.b32 %r28495, %r28461, %r28494; + shf.l.wrap.b32 %r28496, %r28495, %r28495, 16; + add.s32 %r28497, %r28496, %r28420; + xor.b32 %r28498, %r28497, %r28436; + shf.l.wrap.b32 %r28499, %r28498, %r28498, 20; + add.s32 %r28500, %r28494, %r27955; + add.s32 %r28501, %r28500, %r28499; + xor.b32 %r28502, %r28501, %r28496; + shf.l.wrap.b32 %r28503, %r28502, %r28502, 24; + add.s32 %r28504, %r28503, %r28497; + xor.b32 %r28505, %r28504, %r28499; + shf.l.wrap.b32 %r28506, %r28505, %r28505, 25; + add.s32 %r28507, %r28459, %r27999; + add.s32 %r28508, %r28507, %r28450; + xor.b32 %r28509, %r28419, %r28508; + shf.l.wrap.b32 %r28510, %r28509, %r28509, 16; + add.s32 %r28511, %r28510, %r28434; + xor.b32 %r28512, %r28511, %r28450; + shf.l.wrap.b32 %r28513, %r28512, %r28512, 20; + add.s32 %r28514, %r28508, %r27933; + add.s32 %r28515, %r28514, %r28513; + xor.b32 %r28516, %r28515, %r28510; + shf.l.wrap.b32 %r28517, %r28516, %r28516, 24; + add.s32 %r28518, %r28517, %r28511; + xor.b32 %r28519, %r28518, %r28513; + shf.l.wrap.b32 %r28520, %r28519, %r28519, 25; + add.s32 %r28521, %r28473, %r27922; + 
add.s32 %r28522, %r28521, %r28492; + xor.b32 %r28523, %r28522, %r28517; + shf.l.wrap.b32 %r28524, %r28523, %r28523, 16; + add.s32 %r28525, %r28524, %r28504; + xor.b32 %r28526, %r28525, %r28492; + shf.l.wrap.b32 %r28527, %r28526, %r28526, 20; + add.s32 %r28528, %r28522, %r27867; + add.s32 %r28529, %r28528, %r28527; + xor.b32 %r28530, %r28529, %r28524; + shf.l.wrap.b32 %r28531, %r28530, %r28530, 24; + add.s32 %r28532, %r28531, %r28525; + xor.b32 %r28533, %r28532, %r28527; + shf.l.wrap.b32 %r28534, %r28533, %r28533, 25; + add.s32 %r28535, %r28506, %r27900; + add.s32 %r28536, %r28535, %r28487; + xor.b32 %r28537, %r28475, %r28536; + shf.l.wrap.b32 %r28538, %r28537, %r28537, 16; + add.s32 %r28539, %r28538, %r28518; + xor.b32 %r28540, %r28539, %r28506; + shf.l.wrap.b32 %r28541, %r28540, %r28540, 20; + add.s32 %r28542, %r28536, %r27878; + add.s32 %r28543, %r28542, %r28541; + xor.b32 %r28544, %r28543, %r28538; + shf.l.wrap.b32 %r28545, %r28544, %r28544, 24; + add.s32 %r28546, %r28545, %r28539; + xor.b32 %r28547, %r28546, %r28541; + shf.l.wrap.b32 %r28548, %r28547, %r28547, 25; + add.s32 %r28549, %r28501, %r27845; + add.s32 %r28550, %r28549, %r28520; + xor.b32 %r28551, %r28489, %r28550; + shf.l.wrap.b32 %r28552, %r28551, %r28551, 16; + add.s32 %r28553, %r28552, %r28476; + xor.b32 %r28554, %r28553, %r28520; + shf.l.wrap.b32 %r28555, %r28554, %r28554, 20; + add.s32 %r28556, %r28550, %r27856; + add.s32 %r28557, %r28556, %r28555; + xor.b32 %r28558, %r28557, %r28552; + shf.l.wrap.b32 %r28559, %r28558, %r28558, 24; + add.s32 %r28560, %r28559, %r28553; + xor.b32 %r28561, %r28560, %r28555; + shf.l.wrap.b32 %r28562, %r28561, %r28561, 25; + add.s32 %r28563, %r28515, %r27911; + add.s32 %r28564, %r28563, %r28478; + xor.b32 %r28565, %r28564, %r28503; + shf.l.wrap.b32 %r28566, %r28565, %r28565, 16; + add.s32 %r28567, %r28566, %r28490; + xor.b32 %r28568, %r28567, %r28478; + shf.l.wrap.b32 %r28569, %r28568, %r28568, 20; + add.s32 %r28570, %r28564, %r27889; + add.s32 %r28571, %r28570, %r28569; + xor.b32 %r28572, %r28571, %r28566; + shf.l.wrap.b32 %r28573, %r28572, %r28572, 24; + add.s32 %r28574, %r28573, %r28567; + xor.b32 %r28575, %r28574, %r28569; + shf.l.wrap.b32 %r28576, %r28575, %r28575, 25; + add.s32 %r28577, %r28529, %r27944; + add.s32 %r28578, %r28577, %r28576; + xor.b32 %r28579, %r28578, %r28545; + shf.l.wrap.b32 %r28580, %r28579, %r28579, 16; + add.s32 %r28581, %r28580, %r28560; + xor.b32 %r28582, %r28581, %r28576; + shf.l.wrap.b32 %r28583, %r28582, %r28582, 20; + add.s32 %r28584, %r28578, %r27999; + add.s32 %r28585, %r28584, %r28583; + xor.b32 %r28586, %r28585, %r28580; + shf.l.wrap.b32 %r28587, %r28586, %r28586, 24; + add.s32 %r28588, %r28587, %r28581; + xor.b32 %r28589, %r28588, %r28583; + shf.l.wrap.b32 %r28590, %r28589, %r28589, 25; + add.s32 %r28591, %r28543, %r27966; + add.s32 %r28592, %r28591, %r28534; + xor.b32 %r28593, %r28592, %r28559; + shf.l.wrap.b32 %r28594, %r28593, %r28593, 16; + add.s32 %r28595, %r28594, %r28574; + xor.b32 %r28596, %r28595, %r28534; + shf.l.wrap.b32 %r28597, %r28596, %r28596, 20; + add.s32 %r28598, %r28592, %r27900; + add.s32 %r28599, %r28598, %r28597; + xor.b32 %r28600, %r28599, %r28594; + shf.l.wrap.b32 %r28601, %r28600, %r28600, 24; + add.s32 %r28602, %r28601, %r28595; + xor.b32 %r28603, %r28602, %r28597; + shf.l.wrap.b32 %r28604, %r28603, %r28603, 25; + add.s32 %r28605, %r28557, %r27933; + add.s32 %r28606, %r28605, %r28548; + xor.b32 %r28607, %r28573, %r28606; + shf.l.wrap.b32 %r28608, %r28607, %r28607, 16; + add.s32 %r28609, %r28608, %r28532; + xor.b32 %r28610, 
%r28609, %r28548; + shf.l.wrap.b32 %r28611, %r28610, %r28610, 20; + add.s32 %r28612, %r28606, %r27977; + add.s32 %r28613, %r28612, %r28611; + xor.b32 %r28614, %r28613, %r28608; + shf.l.wrap.b32 %r28615, %r28614, %r28614, 24; + add.s32 %r28616, %r28615, %r28609; + xor.b32 %r28617, %r28616, %r28611; + shf.l.wrap.b32 %r28618, %r28617, %r28617, 25; + add.s32 %r28619, %r28571, %r28010; + add.s32 %r28620, %r28619, %r28562; + xor.b32 %r28621, %r28531, %r28620; + shf.l.wrap.b32 %r28622, %r28621, %r28621, 16; + add.s32 %r28623, %r28622, %r28546; + xor.b32 %r28624, %r28623, %r28562; + shf.l.wrap.b32 %r28625, %r28624, %r28624, 20; + add.s32 %r28626, %r28620, %r27856; + add.s32 %r28627, %r28626, %r28625; + xor.b32 %r28628, %r28627, %r28622; + shf.l.wrap.b32 %r28629, %r28628, %r28628, 24; + add.s32 %r28630, %r28629, %r28623; + xor.b32 %r28631, %r28630, %r28625; + shf.l.wrap.b32 %r28632, %r28631, %r28631, 25; + add.s32 %r28633, %r28585, %r27988; + add.s32 %r28634, %r28633, %r28604; + xor.b32 %r28635, %r28634, %r28629; + shf.l.wrap.b32 %r28636, %r28635, %r28635, 16; + add.s32 %r28637, %r28636, %r28616; + xor.b32 %r28638, %r28637, %r28604; + shf.l.wrap.b32 %r28639, %r28638, %r28638, 20; + add.s32 %r28640, %r28634, %r27878; + add.s32 %r28641, %r28640, %r28639; + xor.b32 %r28642, %r28641, %r28636; + shf.l.wrap.b32 %r28643, %r28642, %r28642, 24; + add.s32 %r28644, %r28643, %r28637; + xor.b32 %r28645, %r28644, %r28639; + shf.l.wrap.b32 %r28646, %r28645, %r28645, 25; + add.s32 %r28647, %r28618, %r27845; + add.s32 %r28648, %r28647, %r28599; + xor.b32 %r28649, %r28587, %r28648; + shf.l.wrap.b32 %r28650, %r28649, %r28649, 16; + add.s32 %r28651, %r28650, %r28630; + xor.b32 %r28652, %r28651, %r28618; + shf.l.wrap.b32 %r28653, %r28652, %r28652, 20; + add.s32 %r28654, %r28648, %r27955; + add.s32 %r28655, %r28654, %r28653; + xor.b32 %r28656, %r28655, %r28650; + shf.l.wrap.b32 %r28657, %r28656, %r28656, 24; + add.s32 %r28658, %r28657, %r28651; + xor.b32 %r28659, %r28658, %r28653; + shf.l.wrap.b32 %r28660, %r28659, %r28659, 25; + add.s32 %r28661, %r28613, %r27867; + add.s32 %r28662, %r28661, %r28632; + xor.b32 %r28663, %r28601, %r28662; + shf.l.wrap.b32 %r28664, %r28663, %r28663, 16; + add.s32 %r28665, %r28664, %r28588; + xor.b32 %r28666, %r28665, %r28632; + shf.l.wrap.b32 %r28667, %r28666, %r28666, 20; + add.s32 %r28668, %r28662, %r27911; + add.s32 %r28669, %r28668, %r28667; + xor.b32 %r28670, %r28669, %r28664; + shf.l.wrap.b32 %r28671, %r28670, %r28670, 24; + add.s32 %r28672, %r28671, %r28665; + xor.b32 %r28673, %r28672, %r28667; + shf.l.wrap.b32 %r28674, %r28673, %r28673, 25; + add.s32 %r28675, %r28627, %r27889; + add.s32 %r28676, %r28675, %r28590; + xor.b32 %r28677, %r28676, %r28615; + shf.l.wrap.b32 %r28678, %r28677, %r28677, 16; + add.s32 %r28679, %r28678, %r28602; + xor.b32 %r28680, %r28679, %r28590; + shf.l.wrap.b32 %r28681, %r28680, %r28680, 20; + add.s32 %r28682, %r28676, %r27922; + add.s32 %r28683, %r28682, %r28681; + xor.b32 %r28684, %r28683, %r28678; + shf.l.wrap.b32 %r28685, %r28684, %r28684, 24; + add.s32 %r28686, %r28685, %r28679; + xor.b32 %r28687, %r28686, %r28681; + shf.l.wrap.b32 %r28688, %r28687, %r28687, 25; + add.s32 %r28689, %r28641, %r27966; + add.s32 %r28690, %r28689, %r28688; + xor.b32 %r28691, %r28690, %r28657; + shf.l.wrap.b32 %r28692, %r28691, %r28691, 16; + add.s32 %r28693, %r28692, %r28672; + xor.b32 %r28694, %r28693, %r28688; + shf.l.wrap.b32 %r28695, %r28694, %r28694, 20; + add.s32 %r28696, %r28690, %r28010; + add.s32 %r28697, %r28696, %r28695; + xor.b32 %r28698, %r28697, %r28692; + 
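// After the last round below, the working state is folded together (the
// closing xor.b32 pairs) and packed into four 64-bit limbs %rd353, %rd352,
// %rd1370 and %rd1369. Blocks $L__BB2_96..$L__BB2_103 then compare the
// 256-bit result against the constant `target`, one limb at a time from the
// most significant (target+24) down to the least significant (target):
// equal limbs fall through to the next comparison, and the first unequal
// limb decides %p59 = (hash < target). A passing hash publishes its nonce
// through a single atom.global.cas.b64 against 0, so only the first winning
// thread records a solution.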
shf.l.wrap.b32 %r28699, %r28698, %r28698, 24; + add.s32 %r28700, %r28699, %r28693; + xor.b32 %r28701, %r28700, %r28695; + shf.l.wrap.b32 %r28702, %r28701, %r28701, 25; + add.s32 %r28703, %r28655, %r27900; + add.s32 %r28704, %r28703, %r28646; + xor.b32 %r28705, %r28704, %r28671; + shf.l.wrap.b32 %r28706, %r28705, %r28705, 16; + add.s32 %r28707, %r28706, %r28686; + xor.b32 %r28708, %r28707, %r28646; + shf.l.wrap.b32 %r28709, %r28708, %r28708, 20; + add.s32 %r28710, %r28704, %r27845; + add.s32 %r28711, %r28710, %r28709; + xor.b32 %r28712, %r28711, %r28706; + shf.l.wrap.b32 %r28713, %r28712, %r28712, 24; + add.s32 %r28714, %r28713, %r28707; + xor.b32 %r28715, %r28714, %r28709; + shf.l.wrap.b32 %r28716, %r28715, %r28715, 25; + add.s32 %r28717, %r28669, %r27856; + add.s32 %r28718, %r28717, %r28660; + xor.b32 %r28719, %r28685, %r28718; + shf.l.wrap.b32 %r28720, %r28719, %r28719, 16; + add.s32 %r28721, %r28720, %r28644; + xor.b32 %r28722, %r28721, %r28660; + shf.l.wrap.b32 %r28723, %r28722, %r28722, 20; + add.s32 %r28724, %r28718, %r27944; + add.s32 %r28725, %r28724, %r28723; + xor.b32 %r28726, %r28725, %r28720; + shf.l.wrap.b32 %r28727, %r28726, %r28726, 24; + add.s32 %r28728, %r28727, %r28721; + xor.b32 %r28729, %r28728, %r28723; + shf.l.wrap.b32 %r28730, %r28729, %r28729, 25; + add.s32 %r28731, %r28683, %r27933; + add.s32 %r28732, %r28731, %r28674; + xor.b32 %r28733, %r28643, %r28732; + shf.l.wrap.b32 %r28734, %r28733, %r28733, 16; + add.s32 %r28735, %r28734, %r28658; + xor.b32 %r28736, %r28735, %r28674; + shf.l.wrap.b32 %r28737, %r28736, %r28736, 20; + add.s32 %r28738, %r28732, %r27911; + add.s32 %r28739, %r28738, %r28737; + xor.b32 %r28740, %r28739, %r28734; + shf.l.wrap.b32 %r28741, %r28740, %r28740, 24; + add.s32 %r28742, %r28741, %r28735; + xor.b32 %r28743, %r28742, %r28737; + shf.l.wrap.b32 %r28744, %r28743, %r28743, 25; + add.s32 %r28745, %r28697, %r27999; + add.s32 %r28746, %r28745, %r28716; + xor.b32 %r28747, %r28746, %r28741; + shf.l.wrap.b32 %r28748, %r28747, %r28747, 16; + add.s32 %r28749, %r28748, %r28728; + xor.b32 %r28750, %r28749, %r28716; + shf.l.wrap.b32 %r28751, %r28750, %r28750, 20; + add.s32 %r28752, %r28746, %r27955; + add.s32 %r28753, %r28752, %r28751; + xor.b32 %r28754, %r28753, %r28748; + shf.l.wrap.b32 %r28755, %r28754, %r28754, 24; + add.s32 %r28756, %r28755, %r28749; + xor.b32 %r28757, %r28756, %r28751; + shf.l.wrap.b32 %r28758, %r28757, %r28757, 25; + add.s32 %r28759, %r28730, %r27867; + add.s32 %r28760, %r28759, %r28711; + xor.b32 %r28761, %r28699, %r28760; + shf.l.wrap.b32 %r28762, %r28761, %r28761, 16; + add.s32 %r28763, %r28762, %r28742; + xor.b32 %r28764, %r28763, %r28730; + shf.l.wrap.b32 %r28765, %r28764, %r28764, 20; + add.s32 %r28766, %r28760, %r27977; + add.s32 %r28767, %r28766, %r28765; + xor.b32 %r28768, %r28767, %r28762; + shf.l.wrap.b32 %r28769, %r28768, %r28768, 24; + add.s32 %r28770, %r28769, %r28763; + xor.b32 %r28771, %r28770, %r28765; + shf.l.wrap.b32 %r28772, %r28771, %r28771, 25; + add.s32 %r28773, %r28725, %r27878; + add.s32 %r28774, %r28773, %r28744; + xor.b32 %r28775, %r28713, %r28774; + shf.l.wrap.b32 %r28776, %r28775, %r28775, 16; + add.s32 %r28777, %r28776, %r28700; + xor.b32 %r28778, %r28777, %r28744; + shf.l.wrap.b32 %r28779, %r28778, %r28778, 20; + add.s32 %r28780, %r28774, %r27889; + add.s32 %r28781, %r28780, %r28779; + xor.b32 %r28782, %r28781, %r28776; + shf.l.wrap.b32 %r28783, %r28782, %r28782, 24; + add.s32 %r28784, %r28783, %r28777; + xor.b32 %r28785, %r28784, %r28779; + shf.l.wrap.b32 %r28786, %r28785, %r28785, 25; + add.s32 
%r28787, %r28739, %r27922; + add.s32 %r28788, %r28787, %r28702; + xor.b32 %r28789, %r28788, %r28727; + shf.l.wrap.b32 %r28790, %r28789, %r28789, 16; + add.s32 %r28791, %r28790, %r28714; + xor.b32 %r28792, %r28791, %r28702; + shf.l.wrap.b32 %r28793, %r28792, %r28792, 20; + add.s32 %r28794, %r28788, %r27988; + add.s32 %r28795, %r28794, %r28793; + xor.b32 %r28796, %r28795, %r28790; + shf.l.wrap.b32 %r28797, %r28796, %r28796, 24; + add.s32 %r28798, %r28797, %r28791; + xor.b32 %r28799, %r28798, %r28793; + shf.l.wrap.b32 %r28800, %r28799, %r28799, 25; + xor.b32 %r28801, %r28753, %r28784; + cvt.u64.u32 %rd1163, %r28801; + xor.b32 %r28802, %r28798, %r28767; + and.b32 %r28803, %r28802, 255; + cvt.u64.u32 %rd1164, %r28803; + cvt.u64.u32 %rd1165, %r28802; + shl.b64 %rd1166, %rd1165, 32; + and.b64 %rd1167, %rd1166, 280375465082880; + and.b64 %rd1168, %rd1166, 71776119061217280; + shr.u32 %r28804, %r28802, 24; + cvt.u64.u32 %rd1169, %r28804; + shl.b64 %rd1170, %rd1169, 56; + bfi.b64 %rd1171, %rd1164, %rd1163, 32, 32; + or.b64 %rd1172, %rd1171, %rd1167; + or.b64 %rd1173, %rd1172, %rd1168; + or.b64 %rd353, %rd1173, %rd1170; + xor.b32 %r28805, %r28756, %r28781; + cvt.u64.u32 %rd1174, %r28805; + xor.b32 %r28806, %r28795, %r28770; + and.b32 %r28807, %r28806, 255; + cvt.u64.u32 %rd1175, %r28807; + cvt.u64.u32 %rd1176, %r28806; + shl.b64 %rd1177, %rd1176, 32; + and.b64 %rd1178, %rd1177, 280375465082880; + and.b64 %rd1179, %rd1177, 71776119061217280; + shr.u32 %r28808, %r28806, 24; + cvt.u64.u32 %rd1180, %r28808; + shl.b64 %rd1181, %rd1180, 56; + bfi.b64 %rd1182, %rd1175, %rd1174, 32, 32; + or.b64 %rd1183, %rd1182, %rd1178; + or.b64 %rd1184, %rd1183, %rd1179; + or.b64 %rd352, %rd1184, %rd1181; + xor.b32 %r28809, %r28800, %r28769; + cvt.u64.u32 %rd1185, %r28809; + xor.b32 %r28810, %r28758, %r28783; + and.b32 %r28811, %r28810, 255; + cvt.u64.u32 %rd1186, %r28811; + cvt.u64.u32 %rd1187, %r28810; + shl.b64 %rd1188, %rd1187, 32; + and.b64 %rd1189, %rd1188, 280375465082880; + and.b64 %rd1190, %rd1188, 71776119061217280; + shr.u32 %r28812, %r28810, 24; + cvt.u64.u32 %rd1191, %r28812; + shl.b64 %rd1192, %rd1191, 56; + bfi.b64 %rd1193, %rd1186, %rd1185, 32, 32; + or.b64 %rd1194, %rd1193, %rd1189; + or.b64 %rd1195, %rd1194, %rd1190; + or.b64 %rd1370, %rd1195, %rd1192; + xor.b32 %r28813, %r28797, %r28772; + cvt.u64.u32 %rd1196, %r28813; + xor.b32 %r28814, %r28755, %r28786; + and.b32 %r28815, %r28814, 255; + cvt.u64.u32 %rd1197, %r28815; + cvt.u64.u32 %rd1198, %r28814; + shl.b64 %rd1199, %rd1198, 32; + and.b64 %rd1200, %rd1199, 280375465082880; + and.b64 %rd1201, %rd1199, 71776119061217280; + shr.u32 %r28816, %r28814, 24; + cvt.u64.u32 %rd1202, %r28816; + shl.b64 %rd1203, %rd1202, 56; + bfi.b64 %rd1204, %rd1197, %rd1196, 32, 32; + or.b64 %rd1205, %rd1204, %rd1200; + or.b64 %rd1206, %rd1205, %rd1201; + or.b64 %rd1369, %rd1206, %rd1203; + +$L__BB2_96: + ld.const.u64 %rd354, [target+24]; + setp.eq.s64 %p55, %rd1369, %rd354; + @%p55 bra $L__BB2_98; + bra.uni $L__BB2_97; + +$L__BB2_98: + ld.const.u64 %rd355, [target+16]; + setp.eq.s64 %p56, %rd1370, %rd355; + @%p56 bra $L__BB2_100; + bra.uni $L__BB2_99; + +$L__BB2_100: + ld.const.u64 %rd356, [target+8]; + setp.eq.s64 %p57, %rd352, %rd356; + @%p57 bra $L__BB2_102; + bra.uni $L__BB2_101; + +$L__BB2_102: + ld.const.u64 %rd1251, [target]; + setp.lt.u64 %p59, %rd353, %rd1251; + bra.uni $L__BB2_103; + +$L__BB2_97: + setp.lt.u64 %p59, %rd1369, %rd354; + bra.uni $L__BB2_103; + +$L__BB2_99: + setp.lt.u64 %p59, %rd1370, %rd355; + bra.uni $L__BB2_103; + +$L__BB2_101: + setp.lt.u64 %p59, 
%rd352, %rd356; + +$L__BB2_103: + not.pred %p58, %p59; + @%p58 bra $L__BB2_105; + + ld.param.u64 %rd1264, [heavy_hash_param_0]; + ld.param.u64 %rd1263, [heavy_hash_param_1]; + and.b64 %rd1262, %rd1299, %rd1264; + or.b64 %rd1261, %rd1262, %rd1263; + ld.param.u64 %rd1257, [heavy_hash_param_5]; + cvta.to.global.u64 %rd1252, %rd1257; + mov.u64 %rd1253, 0; + atom.global.cas.b64 %rd1254, [%rd1252], %rd1253, %rd1261; + +$L__BB2_105: + ret; + +} + diff --git a/plugins/cuda/resources/kaspa-cuda-sm61.ptx b/plugins/cuda/resources/kaspa-cuda-sm61.ptx deleted file mode 100644 index 0ad05d0..0000000 --- a/plugins/cuda/resources/kaspa-cuda-sm61.ptx +++ /dev/null @@ -1,7123 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-31833905 -// Cuda compilation tools, release 11.8, V11.8.89 -// Based on NVVM 7.0.1 -// - -.version 7.8 -.target sm_61 -.address_size 64 - - // .globl heavy_hash -.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; -.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; -.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; -.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; -.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; -.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; -.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 4 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; -.const .align 8 .b8 target[32]; -.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 
35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; -.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; - -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 -) -{ - .local .align 8 .b8 __local_depot0[1912]; - .reg .b64 %SP; - .reg .b64 %SPL; - .reg .pred %p<17>; - .reg .b16 %rs<113>; - .reg .b32 %r<6250>; - .reg .b64 %rd<466>; - - - mov.u64 %SPL, __local_depot0; - ld.param.u8 %rs11, [heavy_hash_param_3]; - ld.param.u64 %rd78, [heavy_hash_param_0]; - ld.param.u64 %rd79, [heavy_hash_param_1]; - ld.param.u64 %rd80, [heavy_hash_param_2]; - ld.param.u64 %rd81, [heavy_hash_param_4]; - ld.param.u64 %rd82, [heavy_hash_param_5]; - cvta.to.global.u64 %rd1, %rd81; - cvta.to.global.u64 %rd2, %rd82; - add.u64 %rd3, %SPL, 0; - mov.u32 %r17, %ntid.x; - mov.u32 %r18, %ctaid.x; - mov.u32 %r19, %tid.x; - mad.lo.s32 %r20, %r18, %r17, %r19; - cvt.s64.s32 %rd4, %r20; - setp.ge.u64 %p6, %rd4, %rd80; - @%p6 bra $L__BB0_19; - - cvt.u32.u64 %r21, %rd4; - setp.ne.s32 %p7, %r21, 0; - @%p7 bra $L__BB0_3; - - mov.u64 %rd84, 0; - st.global.u64 [%rd2], %rd84; - -$L__BB0_3: - setp.eq.s16 %p8, %rs11, 0; - @%p8 bra $L__BB0_5; - - shl.b64 %rd85, %rd4, 5; - add.s64 %rd86, %rd1, %rd85; - ld.global.v2.u64 {%rd87, %rd88}, [%rd86]; - mul.lo.s64 %rd91, %rd88, 5; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd91, 7; - shr.b64 %rhs, %rd91, 57; - add.u64 %rd92, %lhs, %rhs; - } - mul.lo.s64 %rd439, %rd92, 9; - shl.b64 %rd93, %rd88, 17; - ld.global.v2.u64 {%rd94, %rd95}, [%rd86+16]; - xor.b64 %rd98, %rd94, %rd87; - xor.b64 %rd99, %rd95, %rd88; - xor.b64 %rd100, %rd88, %rd98; - xor.b64 %rd101, %rd87, %rd99; - st.global.v2.u64 [%rd86], {%rd101, %rd100}; - { - .reg .b32 %dummy; - mov.b64 {%r22,%dummy}, %rd99; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r23}, %rd99; - } - shf.r.wrap.b32 %r24, %r23, %r22, 19; - shf.r.wrap.b32 %r25, %r22, %r23, 19; - mov.b64 %rd102, {%r25, %r24}; - xor.b64 %rd103, %rd98, %rd93; - st.global.v2.u64 [%rd86+16], {%rd103, %rd102}; - bra.uni $L__BB0_6; - -$L__BB0_5: - ld.global.u64 %rd104, [%rd1]; - xor.b64 %rd439, %rd104, %rd4; - -$L__BB0_6: - and.b64 %rd105, %rd439, %rd78; - or.b64 %rd8, %rd105, %rd79; - 
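// Nonce derivation: global thread 0 first clears the result word, then each
// thread either advances its private xoshiro256 state when
// heavy_hash_param_3 is non-zero (the mul-by-5 / rotate-left-7 / mul-by-9
// sequence is the xoshiro256** output scrambler, and the s1 << 17 xor chain
// plus the 64-bit rotate -- shf.r.wrap by 19 on the 32-bit halves -- is the
// state update), or falls back to seed ^ thread_id in $L__BB0_5. The raw
// value is reduced to the final nonce in %rd8 as
// (value & heavy_hash_param_0) | heavy_hash_param_1.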
mov.b64 {%r26, %r27}, %rd8; - mov.u64 %rd106, 0; - ld.const.u64 %rd107, [hash_header]; - cvt.u32.u64 %r28, %rd107; - shr.u64 %rd108, %rd107, 8; - cvt.u32.u64 %r29, %rd108; - shr.u64 %rd109, %rd107, 16; - cvt.u32.u64 %r30, %rd109; - shr.u64 %rd110, %rd107, 32; - cvt.u32.u64 %r31, %rd110; - shr.u64 %rd111, %rd107, 40; - cvt.u32.u64 %r32, %rd111; - shr.u64 %rd112, %rd107, 48; - cvt.u32.u64 %r33, %rd112; - ld.const.u64 %rd113, [hash_header+8]; - cvt.u32.u64 %r34, %rd113; - shr.u64 %rd114, %rd113, 8; - cvt.u32.u64 %r35, %rd114; - shr.u64 %rd115, %rd113, 16; - cvt.u32.u64 %r36, %rd115; - shr.u64 %rd116, %rd113, 32; - cvt.u32.u64 %r37, %rd116; - shr.u64 %rd117, %rd113, 40; - cvt.u32.u64 %r38, %rd117; - shr.u64 %rd118, %rd113, 48; - cvt.u32.u64 %r39, %rd118; - ld.const.u64 %rd119, [hash_header+16]; - cvt.u32.u64 %r40, %rd119; - shr.u64 %rd120, %rd119, 8; - cvt.u32.u64 %r41, %rd120; - shr.u64 %rd121, %rd119, 16; - cvt.u32.u64 %r42, %rd121; - shr.u64 %rd122, %rd119, 32; - cvt.u32.u64 %r43, %rd122; - shr.u64 %rd123, %rd119, 40; - cvt.u32.u64 %r44, %rd123; - shr.u64 %rd124, %rd119, 48; - cvt.u32.u64 %r45, %rd124; - ld.const.u64 %rd125, [hash_header+24]; - cvt.u32.u64 %r46, %rd125; - shr.u64 %rd126, %rd125, 8; - cvt.u32.u64 %r47, %rd126; - shr.u64 %rd127, %rd125, 16; - cvt.u32.u64 %r48, %rd127; - shr.u64 %rd128, %rd125, 32; - cvt.u32.u64 %r49, %rd128; - shr.u64 %rd129, %rd125, 40; - cvt.u32.u64 %r50, %rd129; - shr.u64 %rd130, %rd125, 48; - cvt.u32.u64 %r51, %rd130; - ld.const.v4.u16 {%rs12, %rs13, %rs14, %rs15}, [hash_header+32]; - shr.u16 %rs17, %rs12, 8; - shr.u16 %rs19, %rs13, 8; - shr.u16 %rs21, %rs14, 8; - shr.u16 %rs23, %rs15, 8; - ld.const.v4.u16 {%rs24, %rs25, %rs26, %rs27}, [hash_header+40]; - shr.u16 %rs29, %rs24, 8; - shr.u16 %rs31, %rs25, 8; - shr.u16 %rs33, %rs26, 8; - shr.u16 %rs35, %rs27, 8; - ld.const.v4.u16 {%rs36, %rs37, %rs38, %rs39}, [hash_header+48]; - shr.u16 %rs41, %rs36, 8; - shr.u16 %rs43, %rs37, 8; - shr.u16 %rs45, %rs38, 8; - shr.u16 %rs47, %rs39, 8; - ld.const.v4.u16 {%rs48, %rs49, %rs50, %rs51}, [hash_header+56]; - shr.u16 %rs53, %rs48, 8; - shr.u16 %rs55, %rs49, 8; - shr.u16 %rs57, %rs50, 8; - shr.u16 %rs59, %rs51, 8; - ld.const.u64 %rd131, [hash_header+64]; - mov.b64 {%r52, %r53}, %rd131; - mov.u32 %r54, -1150833019; - mov.u32 %r55, 1779033703; - st.local.v2.u32 [%rd3], {%r55, %r54}; - mov.u32 %r56, -1521486534; - mov.u32 %r57, 1013904242; - st.local.v2.u32 [%rd3+8], {%r57, %r56}; - mov.u32 %r58, -1694144372; - mov.u32 %r59, 1359893119; - st.local.v2.u32 [%rd3+16], {%r59, %r58}; - mov.u32 %r60, 1541459225; - mov.u32 %r61, 528734635; - st.local.v2.u32 [%rd3+24], {%r61, %r60}; - st.local.u64 [%rd3+64], %rd106; - mov.u32 %r62, 0; - st.local.v2.u32 [%rd3+88], {%r62, %r62}; - st.local.v2.u32 [%rd3+96], {%r62, %r62}; - st.local.v2.u32 [%rd3+104], {%r62, %r62}; - st.local.v2.u32 [%rd3+112], {%r62, %r62}; - st.local.v2.u32 [%rd3+120], {%r62, %r62}; - st.local.v2.u32 [%rd3+128], {%r62, %r62}; - mov.u16 %rs60, 0; - st.local.v2.u8 [%rd3+136], {%rs60, %rs60}; - st.local.u8 [%rd3+138], %rs60; - st.local.v2.u32 [%rd3+32], {%r55, %r54}; - st.local.v2.u32 [%rd3+40], {%r57, %r56}; - st.local.v2.u32 [%rd3+48], {%r59, %r58}; - st.local.v2.u32 [%rd3+56], {%r61, %r60}; - st.local.v2.u32 [%rd3+72], {%r62, %r62}; - st.local.v2.u32 [%rd3+80], {%r62, %r62}; - st.local.u8 [%rd3+144], %rs60; - ld.local.v4.u8 {%rs61, %rs62, %rs63, %rs64}, [%rd3+136]; - setp.eq.s16 %p9, %rs62, 0; - selp.u16 %rs68, 1, 0, %p9; - or.b16 %rs69, %rs63, %rs68; - shr.u32 %r63, %r28, 24; - mov.u32 %r64, 64; - prmt.b32 %r65, 
%r28, %r29, %r64; - mov.u32 %r66, 1040; - prmt.b32 %r67, %r65, %r30, %r66; - mov.u32 %r68, 16912; - prmt.b32 %r69, %r67, %r63, %r68; - and.b32 %r70, %r31, 255; - and.b32 %r71, %r32, 255; - prmt.b32 %r72, %r71, %r70, 30212; - shl.b32 %r73, %r33, 16; - and.b32 %r74, %r73, 16711680; - or.b32 %r75, %r72, %r74; - and.b32 %r76, %r31, -16777216; - or.b32 %r77, %r75, %r76; - shr.u32 %r78, %r34, 24; - prmt.b32 %r79, %r34, %r35, %r64; - prmt.b32 %r80, %r79, %r36, %r66; - prmt.b32 %r81, %r80, %r78, %r68; - and.b32 %r82, %r37, 255; - and.b32 %r83, %r38, 255; - prmt.b32 %r84, %r83, %r82, 30212; - shl.b32 %r85, %r39, 16; - and.b32 %r86, %r85, 16711680; - or.b32 %r87, %r84, %r86; - and.b32 %r88, %r37, -16777216; - or.b32 %r89, %r87, %r88; - shr.u32 %r90, %r40, 24; - prmt.b32 %r91, %r40, %r41, %r64; - prmt.b32 %r92, %r91, %r42, %r66; - prmt.b32 %r93, %r92, %r90, %r68; - and.b32 %r94, %r43, 255; - and.b32 %r95, %r44, 255; - prmt.b32 %r96, %r95, %r94, 30212; - shl.b32 %r97, %r45, 16; - and.b32 %r98, %r97, 16711680; - or.b32 %r99, %r96, %r98; - and.b32 %r100, %r43, -16777216; - or.b32 %r101, %r99, %r100; - shr.u32 %r102, %r46, 24; - prmt.b32 %r103, %r46, %r47, %r64; - prmt.b32 %r104, %r103, %r48, %r66; - prmt.b32 %r105, %r104, %r102, %r68; - and.b32 %r106, %r49, 255; - and.b32 %r107, %r50, 255; - prmt.b32 %r108, %r107, %r106, 30212; - shl.b32 %r109, %r51, 16; - and.b32 %r110, %r109, 16711680; - or.b32 %r111, %r108, %r110; - and.b32 %r112, %r49, -16777216; - or.b32 %r113, %r111, %r112; - cvt.u32.u16 %r114, %rs12; - and.b32 %r115, %r114, 255; - cvt.u32.u16 %r116, %rs17; - prmt.b32 %r117, %r116, %r115, 30212; - cvt.u32.u16 %r118, %rs13; - prmt.b32 %r119, %r118, %r117, 28756; - cvt.u32.u16 %r120, %rs19; - prmt.b32 %r121, %r120, %r119, 1620; - cvt.u32.u16 %r122, %rs14; - and.b32 %r123, %r122, 255; - cvt.u32.u16 %r124, %rs21; - prmt.b32 %r125, %r124, %r123, 30212; - cvt.u32.u16 %r126, %rs15; - prmt.b32 %r127, %r126, %r125, 28756; - cvt.u32.u16 %r128, %rs23; - prmt.b32 %r129, %r128, %r127, 1620; - cvt.u32.u16 %r130, %rs24; - and.b32 %r131, %r130, 255; - cvt.u32.u16 %r132, %rs29; - prmt.b32 %r133, %r132, %r131, 30212; - cvt.u32.u16 %r134, %rs25; - prmt.b32 %r135, %r134, %r133, 28756; - cvt.u32.u16 %r136, %rs31; - prmt.b32 %r137, %r136, %r135, 1620; - cvt.u32.u16 %r138, %rs26; - and.b32 %r139, %r138, 255; - cvt.u32.u16 %r140, %rs33; - prmt.b32 %r141, %r140, %r139, 30212; - cvt.u32.u16 %r142, %rs27; - prmt.b32 %r143, %r142, %r141, 28756; - cvt.u32.u16 %r144, %rs35; - prmt.b32 %r145, %r144, %r143, 1620; - cvt.u32.u16 %r146, %rs36; - and.b32 %r147, %r146, 255; - cvt.u32.u16 %r148, %rs41; - prmt.b32 %r149, %r148, %r147, 30212; - cvt.u32.u16 %r150, %rs37; - prmt.b32 %r151, %r150, %r149, 28756; - cvt.u32.u16 %r152, %rs43; - prmt.b32 %r153, %r152, %r151, 1620; - cvt.u32.u16 %r154, %rs38; - and.b32 %r155, %r154, 255; - cvt.u32.u16 %r156, %rs45; - prmt.b32 %r157, %r156, %r155, 30212; - cvt.u32.u16 %r158, %rs39; - prmt.b32 %r159, %r158, %r157, 28756; - cvt.u32.u16 %r160, %rs47; - prmt.b32 %r161, %r160, %r159, 1620; - cvt.u32.u16 %r162, %rs48; - and.b32 %r163, %r162, 255; - cvt.u32.u16 %r164, %rs53; - prmt.b32 %r165, %r164, %r163, 30212; - cvt.u32.u16 %r166, %rs49; - prmt.b32 %r167, %r166, %r165, 28756; - cvt.u32.u16 %r168, %rs55; - prmt.b32 %r169, %r168, %r167, 1620; - cvt.u32.u16 %r170, %rs50; - and.b32 %r171, %r170, 255; - cvt.u32.u16 %r172, %rs57; - prmt.b32 %r173, %r172, %r171, 30212; - cvt.u32.u16 %r174, %rs51; - prmt.b32 %r175, %r174, %r173, 28756; - cvt.u32.u16 %r176, %rs59; - prmt.b32 %r177, %r176, %r175, 1620; - 
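// Hasher setup: the eight 32-bit constants stored above (1779033703,
// -1150833019, 1013904242, -1521486534, 1359893119, -1694144372, 528734635,
// 1541459225) are 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
// 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -- the SHA-256 initial
// values that BLAKE3 reuses as its IV. The ld.const/prmt.b32 sequence
// repacks the 72-byte hash_header from constant memory into little-endian
// 32-bit message words for the first 64-byte block.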
cvt.u32.u16 %r178, %rs69; - and.b32 %r179, %r178, 255; - add.s32 %r180, %r69, -1156040474; - shf.l.wrap.b32 %r181, %r180, %r180, 16; - add.s32 %r182, %r181, 1779033703; - xor.b32 %r183, %r182, 1359893119; - shf.l.wrap.b32 %r184, %r183, %r183, 20; - add.s32 %r185, %r77, %r180; - add.s32 %r186, %r185, %r184; - xor.b32 %r187, %r186, %r181; - shf.l.wrap.b32 %r188, %r187, %r187, 24; - add.s32 %r189, %r188, %r182; - xor.b32 %r190, %r189, %r184; - shf.l.wrap.b32 %r191, %r190, %r190, 25; - add.s32 %r192, %r81, 1449989905; - shf.l.wrap.b32 %r193, %r192, %r192, 16; - add.s32 %r194, %r193, -1150833019; - xor.b32 %r195, %r194, -1694144372; - shf.l.wrap.b32 %r196, %r195, %r195, 20; - add.s32 %r197, %r89, %r192; - add.s32 %r198, %r197, %r196; - xor.b32 %r199, %r198, %r193; - shf.l.wrap.b32 %r200, %r199, %r199, 24; - add.s32 %r201, %r200, %r194; - xor.b32 %r202, %r201, %r196; - shf.l.wrap.b32 %r203, %r202, %r202, 25; - add.s32 %r204, %r93, 1542638877; - shr.u32 %r205, %r204, 16; - shl.b32 %r206, %r204, 16; - xor.b32 %r207, %r206, 4194304; - or.b32 %r208, %r207, %r205; - add.s32 %r209, %r208, 1013904242; - xor.b32 %r210, %r209, 528734635; - shf.l.wrap.b32 %r211, %r210, %r210, 20; - add.s32 %r212, %r101, %r204; - add.s32 %r213, %r212, %r211; - xor.b32 %r214, %r213, %r208; - shf.l.wrap.b32 %r215, %r214, %r214, 24; - add.s32 %r216, %r215, %r209; - xor.b32 %r217, %r216, %r211; - shf.l.wrap.b32 %r218, %r217, %r217, 25; - add.s32 %r219, %r105, 19972691; - xor.b32 %r220, %r219, %r179; - shr.u32 %r221, %r219, 16; - shl.b32 %r222, %r220, 16; - or.b32 %r223, %r222, %r221; - add.s32 %r224, %r223, -1521486534; - xor.b32 %r225, %r224, 1541459225; - shf.l.wrap.b32 %r226, %r225, %r225, 20; - add.s32 %r227, %r113, %r219; - add.s32 %r228, %r227, %r226; - xor.b32 %r229, %r228, %r223; - shf.l.wrap.b32 %r230, %r229, %r229, 24; - add.s32 %r231, %r230, %r224; - xor.b32 %r232, %r231, %r226; - shf.l.wrap.b32 %r233, %r232, %r232, 25; - add.s32 %r234, %r203, %r186; - add.s32 %r235, %r234, %r121; - xor.b32 %r236, %r230, %r235; - shf.l.wrap.b32 %r237, %r236, %r236, 16; - add.s32 %r238, %r237, %r216; - xor.b32 %r239, %r238, %r203; - shf.l.wrap.b32 %r240, %r239, %r239, 20; - add.s32 %r241, %r129, %r235; - add.s32 %r242, %r241, %r240; - xor.b32 %r243, %r242, %r237; - shf.l.wrap.b32 %r244, %r243, %r243, 24; - add.s32 %r245, %r244, %r238; - xor.b32 %r246, %r245, %r240; - shf.l.wrap.b32 %r247, %r246, %r246, 25; - add.s32 %r248, %r218, %r198; - add.s32 %r249, %r248, %r137; - xor.b32 %r250, %r249, %r188; - shf.l.wrap.b32 %r251, %r250, %r250, 16; - add.s32 %r252, %r251, %r231; - xor.b32 %r253, %r252, %r218; - shf.l.wrap.b32 %r254, %r253, %r253, 20; - add.s32 %r255, %r145, %r249; - add.s32 %r256, %r255, %r254; - xor.b32 %r257, %r256, %r251; - shf.l.wrap.b32 %r258, %r257, %r257, 24; - add.s32 %r259, %r258, %r252; - xor.b32 %r260, %r259, %r254; - shf.l.wrap.b32 %r261, %r260, %r260, 25; - add.s32 %r262, %r233, %r213; - add.s32 %r263, %r262, %r153; - xor.b32 %r264, %r263, %r200; - shf.l.wrap.b32 %r265, %r264, %r264, 16; - add.s32 %r266, %r265, %r189; - xor.b32 %r267, %r266, %r233; - shf.l.wrap.b32 %r268, %r267, %r267, 20; - add.s32 %r269, %r161, %r263; - add.s32 %r270, %r269, %r268; - xor.b32 %r271, %r270, %r265; - shf.l.wrap.b32 %r272, %r271, %r271, 24; - add.s32 %r273, %r272, %r266; - xor.b32 %r274, %r273, %r268; - shf.l.wrap.b32 %r275, %r274, %r274, 25; - add.s32 %r276, %r228, %r191; - add.s32 %r277, %r276, %r169; - xor.b32 %r278, %r277, %r215; - shf.l.wrap.b32 %r279, %r278, %r278, 16; - add.s32 %r280, %r279, %r201; - xor.b32 %r281, %r280, 
%r191; - shf.l.wrap.b32 %r282, %r281, %r281, 20; - add.s32 %r283, %r177, %r277; - add.s32 %r284, %r283, %r282; - xor.b32 %r285, %r284, %r279; - shf.l.wrap.b32 %r286, %r285, %r285, 24; - add.s32 %r287, %r286, %r280; - xor.b32 %r288, %r287, %r282; - shf.l.wrap.b32 %r289, %r288, %r288, 25; - add.s32 %r290, %r242, %r81; - add.s32 %r291, %r290, %r289; - xor.b32 %r292, %r291, %r258; - shf.l.wrap.b32 %r293, %r292, %r292, 16; - add.s32 %r294, %r293, %r273; - xor.b32 %r295, %r294, %r289; - shf.l.wrap.b32 %r296, %r295, %r295, 20; - add.s32 %r297, %r291, %r105; - add.s32 %r298, %r297, %r296; - xor.b32 %r299, %r298, %r293; - shf.l.wrap.b32 %r300, %r299, %r299, 24; - add.s32 %r301, %r300, %r294; - xor.b32 %r302, %r301, %r296; - shf.l.wrap.b32 %r303, %r302, %r302, 25; - add.s32 %r304, %r256, %r89; - add.s32 %r305, %r304, %r247; - xor.b32 %r306, %r272, %r305; - shf.l.wrap.b32 %r307, %r306, %r306, 16; - add.s32 %r308, %r287, %r307; - xor.b32 %r309, %r308, %r247; - shf.l.wrap.b32 %r310, %r309, %r309, 20; - add.s32 %r311, %r305, %r137; - add.s32 %r312, %r311, %r310; - xor.b32 %r313, %r312, %r307; - shf.l.wrap.b32 %r314, %r313, %r313, 24; - add.s32 %r315, %r314, %r308; - xor.b32 %r316, %r315, %r310; - shf.l.wrap.b32 %r317, %r316, %r316, 25; - add.s32 %r318, %r261, %r113; - add.s32 %r319, %r318, %r270; - xor.b32 %r320, %r286, %r319; - shf.l.wrap.b32 %r321, %r320, %r320, 16; - add.s32 %r322, %r321, %r245; - xor.b32 %r323, %r322, %r261; - shf.l.wrap.b32 %r324, %r323, %r323, 20; - add.s32 %r325, %r319, %r69; - add.s32 %r326, %r325, %r324; - xor.b32 %r327, %r326, %r321; - shf.l.wrap.b32 %r328, %r327, %r327, 24; - add.s32 %r329, %r328, %r322; - xor.b32 %r330, %r329, %r324; - shf.l.wrap.b32 %r331, %r330, %r330, 25; - add.s32 %r332, %r275, %r93; - add.s32 %r333, %r332, %r284; - xor.b32 %r334, %r333, %r244; - shf.l.wrap.b32 %r335, %r334, %r334, 16; - add.s32 %r336, %r335, %r259; - xor.b32 %r337, %r336, %r275; - shf.l.wrap.b32 %r338, %r337, %r337, 20; - add.s32 %r339, %r333, %r161; - add.s32 %r340, %r339, %r338; - xor.b32 %r341, %r340, %r335; - shf.l.wrap.b32 %r342, %r341, %r341, 24; - add.s32 %r343, %r342, %r336; - xor.b32 %r344, %r343, %r338; - shf.l.wrap.b32 %r345, %r344, %r344, 25; - add.s32 %r346, %r298, %r77; - add.s32 %r347, %r346, %r317; - xor.b32 %r348, %r347, %r342; - shf.l.wrap.b32 %r349, %r348, %r348, 16; - add.s32 %r350, %r349, %r329; - xor.b32 %r351, %r350, %r317; - shf.l.wrap.b32 %r352, %r351, %r351, 20; - add.s32 %r353, %r347, %r145; - add.s32 %r354, %r353, %r352; - xor.b32 %r355, %r354, %r349; - shf.l.wrap.b32 %r356, %r355, %r355, 24; - add.s32 %r357, %r356, %r350; - xor.b32 %r358, %r357, %r352; - shf.l.wrap.b32 %r359, %r358, %r358, 25; - add.s32 %r360, %r312, %r153; - add.s32 %r361, %r360, %r331; - xor.b32 %r362, %r361, %r300; - shf.l.wrap.b32 %r363, %r362, %r362, 16; - add.s32 %r364, %r363, %r343; - xor.b32 %r365, %r364, %r331; - shf.l.wrap.b32 %r366, %r365, %r365, 20; - add.s32 %r367, %r361, %r101; - add.s32 %r368, %r367, %r366; - xor.b32 %r369, %r368, %r363; - shf.l.wrap.b32 %r370, %r369, %r369, 24; - add.s32 %r371, %r370, %r364; - xor.b32 %r372, %r371, %r366; - shf.l.wrap.b32 %r373, %r372, %r372, 25; - add.s32 %r374, %r326, %r129; - add.s32 %r375, %r374, %r345; - xor.b32 %r376, %r375, %r314; - shf.l.wrap.b32 %r377, %r376, %r376, 16; - add.s32 %r378, %r377, %r301; - xor.b32 %r379, %r378, %r345; - shf.l.wrap.b32 %r380, %r379, %r379, 20; - add.s32 %r381, %r375, %r169; - add.s32 %r382, %r381, %r380; - xor.b32 %r383, %r382, %r377; - shf.l.wrap.b32 %r384, %r383, %r383, 24; - add.s32 %r385, %r384, 
%r378; - xor.b32 %r386, %r385, %r380; - shf.l.wrap.b32 %r387, %r386, %r386, 25; - add.s32 %r388, %r340, %r177; - add.s32 %r389, %r388, %r303; - xor.b32 %r390, %r389, %r328; - shf.l.wrap.b32 %r391, %r390, %r390, 16; - add.s32 %r392, %r391, %r315; - xor.b32 %r393, %r392, %r303; - shf.l.wrap.b32 %r394, %r393, %r393, 20; - add.s32 %r395, %r389, %r121; - add.s32 %r396, %r395, %r394; - xor.b32 %r397, %r396, %r391; - shf.l.wrap.b32 %r398, %r397, %r397, 24; - add.s32 %r399, %r398, %r392; - xor.b32 %r400, %r399, %r394; - shf.l.wrap.b32 %r401, %r400, %r400, 25; - add.s32 %r402, %r354, %r89; - add.s32 %r403, %r402, %r401; - xor.b32 %r404, %r403, %r370; - shf.l.wrap.b32 %r405, %r404, %r404, 16; - add.s32 %r406, %r405, %r385; - xor.b32 %r407, %r406, %r401; - shf.l.wrap.b32 %r408, %r407, %r407, 20; - add.s32 %r409, %r403, %r93; - add.s32 %r410, %r409, %r408; - xor.b32 %r411, %r410, %r405; - shf.l.wrap.b32 %r412, %r411, %r411, 24; - add.s32 %r413, %r412, %r406; - xor.b32 %r414, %r413, %r408; - shf.l.wrap.b32 %r415, %r414, %r414, 25; - add.s32 %r416, %r368, %r137; - add.s32 %r417, %r416, %r359; - xor.b32 %r418, %r417, %r384; - shf.l.wrap.b32 %r419, %r418, %r418, 16; - add.s32 %r420, %r419, %r399; - xor.b32 %r421, %r420, %r359; - shf.l.wrap.b32 %r422, %r421, %r421, 20; - add.s32 %r423, %r417, %r153; - add.s32 %r424, %r423, %r422; - xor.b32 %r425, %r424, %r419; - shf.l.wrap.b32 %r426, %r425, %r425, 24; - add.s32 %r427, %r426, %r420; - xor.b32 %r428, %r427, %r422; - shf.l.wrap.b32 %r429, %r428, %r428, 25; - add.s32 %r430, %r382, %r161; - add.s32 %r431, %r430, %r373; - xor.b32 %r432, %r431, %r398; - shf.l.wrap.b32 %r433, %r432, %r432, 16; - add.s32 %r434, %r433, %r357; - xor.b32 %r435, %r434, %r373; - shf.l.wrap.b32 %r436, %r435, %r435, 20; - add.s32 %r437, %r431, %r81; - add.s32 %r438, %r437, %r436; - xor.b32 %r439, %r438, %r433; - shf.l.wrap.b32 %r440, %r439, %r439, 24; - add.s32 %r441, %r440, %r434; - xor.b32 %r442, %r441, %r436; - shf.l.wrap.b32 %r443, %r442, %r442, 25; - add.s32 %r444, %r396, %r113; - add.s32 %r445, %r444, %r387; - xor.b32 %r446, %r445, %r356; - shf.l.wrap.b32 %r447, %r446, %r446, 16; - add.s32 %r448, %r447, %r371; - xor.b32 %r449, %r448, %r387; - shf.l.wrap.b32 %r450, %r449, %r449, 20; - add.s32 %r451, %r445, %r169; - add.s32 %r452, %r451, %r450; - xor.b32 %r453, %r452, %r447; - shf.l.wrap.b32 %r454, %r453, %r453, 24; - add.s32 %r455, %r454, %r448; - xor.b32 %r456, %r455, %r450; - shf.l.wrap.b32 %r457, %r456, %r456, 25; - add.s32 %r458, %r410, %r105; - add.s32 %r459, %r458, %r429; - xor.b32 %r460, %r459, %r454; - shf.l.wrap.b32 %r461, %r460, %r460, 16; - add.s32 %r462, %r461, %r441; - xor.b32 %r463, %r462, %r429; - shf.l.wrap.b32 %r464, %r463, %r463, 20; - add.s32 %r465, %r459, %r101; - add.s32 %r466, %r465, %r464; - xor.b32 %r467, %r466, %r461; - shf.l.wrap.b32 %r468, %r467, %r467, 24; - add.s32 %r469, %r468, %r462; - xor.b32 %r470, %r469, %r464; - shf.l.wrap.b32 %r471, %r470, %r470, 25; - add.s32 %r472, %r424, %r129; - add.s32 %r473, %r472, %r443; - xor.b32 %r474, %r473, %r412; - shf.l.wrap.b32 %r475, %r474, %r474, 16; - add.s32 %r476, %r475, %r455; - xor.b32 %r477, %r476, %r443; - shf.l.wrap.b32 %r478, %r477, %r477, 20; - add.s32 %r479, %r473, %r69; - add.s32 %r480, %r479, %r478; - xor.b32 %r481, %r480, %r475; - shf.l.wrap.b32 %r482, %r481, %r481, 24; - add.s32 %r483, %r482, %r476; - xor.b32 %r484, %r483, %r478; - shf.l.wrap.b32 %r485, %r484, %r484, 25; - add.s32 %r486, %r438, %r145; - add.s32 %r487, %r486, %r457; - xor.b32 %r488, %r487, %r426; - shf.l.wrap.b32 %r489, %r488, %r488, 
16; - add.s32 %r490, %r489, %r413; - xor.b32 %r491, %r490, %r457; - shf.l.wrap.b32 %r492, %r491, %r491, 20; - add.s32 %r493, %r487, %r177; - add.s32 %r494, %r493, %r492; - xor.b32 %r495, %r494, %r489; - shf.l.wrap.b32 %r496, %r495, %r495, 24; - add.s32 %r497, %r496, %r490; - xor.b32 %r498, %r497, %r492; - shf.l.wrap.b32 %r499, %r498, %r498, 25; - add.s32 %r500, %r452, %r121; - add.s32 %r501, %r500, %r415; - xor.b32 %r502, %r501, %r440; - shf.l.wrap.b32 %r503, %r502, %r502, 16; - add.s32 %r504, %r503, %r427; - xor.b32 %r505, %r504, %r415; - shf.l.wrap.b32 %r506, %r505, %r505, 20; - add.s32 %r507, %r501, %r77; - add.s32 %r508, %r507, %r506; - xor.b32 %r509, %r508, %r503; - shf.l.wrap.b32 %r510, %r509, %r509, 24; - add.s32 %r511, %r510, %r504; - xor.b32 %r512, %r511, %r506; - shf.l.wrap.b32 %r513, %r512, %r512, 25; - add.s32 %r514, %r466, %r137; - add.s32 %r515, %r514, %r513; - xor.b32 %r516, %r515, %r482; - shf.l.wrap.b32 %r517, %r516, %r516, 16; - add.s32 %r518, %r517, %r497; - xor.b32 %r519, %r518, %r513; - shf.l.wrap.b32 %r520, %r519, %r519, 20; - add.s32 %r521, %r515, %r113; - add.s32 %r522, %r521, %r520; - xor.b32 %r523, %r522, %r517; - shf.l.wrap.b32 %r524, %r523, %r523, 24; - add.s32 %r525, %r524, %r518; - xor.b32 %r526, %r525, %r520; - shf.l.wrap.b32 %r527, %r526, %r526, 25; - add.s32 %r528, %r480, %r153; - add.s32 %r529, %r528, %r471; - xor.b32 %r530, %r529, %r496; - shf.l.wrap.b32 %r531, %r530, %r530, 16; - add.s32 %r532, %r531, %r511; - xor.b32 %r533, %r532, %r471; - shf.l.wrap.b32 %r534, %r533, %r533, 20; - add.s32 %r535, %r529, %r129; - add.s32 %r536, %r535, %r534; - xor.b32 %r537, %r536, %r531; - shf.l.wrap.b32 %r538, %r537, %r537, 24; - add.s32 %r539, %r538, %r532; - xor.b32 %r540, %r539, %r534; - shf.l.wrap.b32 %r541, %r540, %r540, 25; - add.s32 %r542, %r494, %r169; - add.s32 %r543, %r542, %r485; - xor.b32 %r544, %r543, %r510; - shf.l.wrap.b32 %r545, %r544, %r544, 16; - add.s32 %r546, %r545, %r469; - xor.b32 %r547, %r546, %r485; - shf.l.wrap.b32 %r548, %r547, %r547, 20; - add.s32 %r549, %r543, %r89; - add.s32 %r550, %r549, %r548; - xor.b32 %r551, %r550, %r545; - shf.l.wrap.b32 %r552, %r551, %r551, 24; - add.s32 %r553, %r552, %r546; - xor.b32 %r554, %r553, %r548; - shf.l.wrap.b32 %r555, %r554, %r554, 25; - add.s32 %r556, %r508, %r161; - add.s32 %r557, %r556, %r499; - xor.b32 %r558, %r557, %r468; - shf.l.wrap.b32 %r559, %r558, %r558, 16; - add.s32 %r560, %r559, %r483; - xor.b32 %r561, %r560, %r499; - shf.l.wrap.b32 %r562, %r561, %r561, 20; - add.s32 %r563, %r557, %r177; - add.s32 %r564, %r563, %r562; - xor.b32 %r565, %r564, %r559; - shf.l.wrap.b32 %r566, %r565, %r565, 24; - add.s32 %r567, %r566, %r560; - xor.b32 %r568, %r567, %r562; - shf.l.wrap.b32 %r569, %r568, %r568, 25; - add.s32 %r570, %r522, %r93; - add.s32 %r571, %r570, %r541; - xor.b32 %r572, %r571, %r566; - shf.l.wrap.b32 %r573, %r572, %r572, 16; - add.s32 %r574, %r573, %r553; - xor.b32 %r575, %r574, %r541; - shf.l.wrap.b32 %r576, %r575, %r575, 20; - add.s32 %r577, %r571, %r69; - add.s32 %r578, %r577, %r576; - xor.b32 %r579, %r578, %r573; - shf.l.wrap.b32 %r580, %r579, %r579, 24; - add.s32 %r581, %r580, %r574; - xor.b32 %r582, %r581, %r576; - shf.l.wrap.b32 %r583, %r582, %r582, 25; - add.s32 %r584, %r536, %r145; - add.s32 %r585, %r584, %r555; - xor.b32 %r586, %r585, %r524; - shf.l.wrap.b32 %r587, %r586, %r586, 16; - add.s32 %r588, %r587, %r567; - xor.b32 %r589, %r588, %r555; - shf.l.wrap.b32 %r590, %r589, %r589, 20; - add.s32 %r591, %r585, %r81; - add.s32 %r592, %r591, %r590; - xor.b32 %r593, %r592, %r587; - 
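// Each of the seven rounds of this compression consumes the sixteen message
// words (%r69 .. %r177) in a different order; the permutation applied
// between rounds matches the MSG_SCHEDULE table declared at the top of this
// file (round 2 starts with words 2 and 6, round 3 with words 3 and 4, and
// so on).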
shf.l.wrap.b32 %r594, %r593, %r593, 24; - add.s32 %r595, %r594, %r588; - xor.b32 %r596, %r595, %r590; - shf.l.wrap.b32 %r597, %r596, %r596, 25; - add.s32 %r598, %r550, %r101; - add.s32 %r599, %r598, %r569; - xor.b32 %r600, %r599, %r538; - shf.l.wrap.b32 %r601, %r600, %r600, 16; - add.s32 %r602, %r601, %r525; - xor.b32 %r603, %r602, %r569; - shf.l.wrap.b32 %r604, %r603, %r603, 20; - add.s32 %r605, %r599, %r121; - add.s32 %r606, %r605, %r604; - xor.b32 %r607, %r606, %r601; - shf.l.wrap.b32 %r608, %r607, %r607, 24; - add.s32 %r609, %r608, %r602; - xor.b32 %r610, %r609, %r604; - shf.l.wrap.b32 %r611, %r610, %r610, 25; - add.s32 %r612, %r564, %r77; - add.s32 %r613, %r612, %r527; - xor.b32 %r614, %r613, %r552; - shf.l.wrap.b32 %r615, %r614, %r614, 16; - add.s32 %r616, %r615, %r539; - xor.b32 %r617, %r616, %r527; - shf.l.wrap.b32 %r618, %r617, %r617, 20; - add.s32 %r619, %r613, %r105; - add.s32 %r620, %r619, %r618; - xor.b32 %r621, %r620, %r615; - shf.l.wrap.b32 %r622, %r621, %r621, 24; - add.s32 %r623, %r622, %r616; - xor.b32 %r624, %r623, %r618; - shf.l.wrap.b32 %r625, %r624, %r624, 25; - add.s32 %r626, %r578, %r153; - add.s32 %r627, %r626, %r625; - xor.b32 %r628, %r627, %r594; - shf.l.wrap.b32 %r629, %r628, %r628, 16; - add.s32 %r630, %r629, %r609; - xor.b32 %r631, %r630, %r625; - shf.l.wrap.b32 %r632, %r631, %r631, 20; - add.s32 %r633, %r627, %r161; - add.s32 %r634, %r633, %r632; - xor.b32 %r635, %r634, %r629; - shf.l.wrap.b32 %r636, %r635, %r635, 24; - add.s32 %r637, %r636, %r630; - xor.b32 %r638, %r637, %r632; - shf.l.wrap.b32 %r639, %r638, %r638, 25; - add.s32 %r640, %r592, %r129; - add.s32 %r641, %r640, %r583; - xor.b32 %r642, %r641, %r608; - shf.l.wrap.b32 %r643, %r642, %r642, 16; - add.s32 %r644, %r643, %r623; - xor.b32 %r645, %r644, %r583; - shf.l.wrap.b32 %r646, %r645, %r645, 20; - add.s32 %r647, %r641, %r145; - add.s32 %r648, %r647, %r646; - xor.b32 %r649, %r648, %r643; - shf.l.wrap.b32 %r650, %r649, %r649, 24; - add.s32 %r651, %r650, %r644; - xor.b32 %r652, %r651, %r646; - shf.l.wrap.b32 %r653, %r652, %r652, 25; - add.s32 %r654, %r606, %r177; - add.s32 %r655, %r654, %r597; - xor.b32 %r656, %r655, %r622; - shf.l.wrap.b32 %r657, %r656, %r656, 16; - add.s32 %r658, %r657, %r581; - xor.b32 %r659, %r658, %r597; - shf.l.wrap.b32 %r660, %r659, %r659, 20; - add.s32 %r661, %r655, %r137; - add.s32 %r662, %r661, %r660; - xor.b32 %r663, %r662, %r657; - shf.l.wrap.b32 %r664, %r663, %r663, 24; - add.s32 %r665, %r664, %r658; - xor.b32 %r666, %r665, %r660; - shf.l.wrap.b32 %r667, %r666, %r666, 25; - add.s32 %r668, %r620, %r169; - add.s32 %r669, %r668, %r611; - xor.b32 %r670, %r669, %r580; - shf.l.wrap.b32 %r671, %r670, %r670, 16; - add.s32 %r672, %r671, %r595; - xor.b32 %r673, %r672, %r611; - shf.l.wrap.b32 %r674, %r673, %r673, 20; - add.s32 %r675, %r669, %r121; - add.s32 %r676, %r675, %r674; - xor.b32 %r677, %r676, %r671; - shf.l.wrap.b32 %r678, %r677, %r677, 24; - add.s32 %r679, %r678, %r672; - xor.b32 %r680, %r679, %r674; - shf.l.wrap.b32 %r681, %r680, %r680, 25; - add.s32 %r682, %r634, %r113; - add.s32 %r683, %r682, %r653; - xor.b32 %r684, %r683, %r678; - shf.l.wrap.b32 %r685, %r684, %r684, 16; - add.s32 %r686, %r685, %r665; - xor.b32 %r687, %r686, %r653; - shf.l.wrap.b32 %r688, %r687, %r687, 20; - add.s32 %r689, %r683, %r81; - add.s32 %r690, %r689, %r688; - xor.b32 %r691, %r690, %r685; - shf.l.wrap.b32 %r692, %r691, %r691, 24; - add.s32 %r693, %r692, %r686; - xor.b32 %r694, %r693, %r688; - shf.l.wrap.b32 %r695, %r694, %r694, 25; - add.s32 %r696, %r648, %r101; - add.s32 %r697, %r696, %r667; - 
xor.b32 %r698, %r697, %r636; - shf.l.wrap.b32 %r699, %r698, %r698, 16; - add.s32 %r700, %r699, %r679; - xor.b32 %r701, %r700, %r667; - shf.l.wrap.b32 %r702, %r701, %r701, 20; - add.s32 %r703, %r697, %r89; - add.s32 %r704, %r703, %r702; - xor.b32 %r705, %r704, %r699; - shf.l.wrap.b32 %r706, %r705, %r705, 24; - add.s32 %r707, %r706, %r700; - xor.b32 %r708, %r707, %r702; - shf.l.wrap.b32 %r709, %r708, %r708, 25; - add.s32 %r710, %r662, %r69; - add.s32 %r711, %r710, %r681; - xor.b32 %r712, %r711, %r650; - shf.l.wrap.b32 %r713, %r712, %r712, 16; - add.s32 %r714, %r713, %r637; - xor.b32 %r715, %r714, %r681; - shf.l.wrap.b32 %r716, %r715, %r715, 20; - add.s32 %r717, %r711, %r77; - add.s32 %r718, %r717, %r716; - xor.b32 %r719, %r718, %r713; - shf.l.wrap.b32 %r720, %r719, %r719, 24; - add.s32 %r721, %r720, %r714; - xor.b32 %r722, %r721, %r716; - shf.l.wrap.b32 %r723, %r722, %r722, 25; - add.s32 %r724, %r676, %r105; - add.s32 %r725, %r724, %r639; - xor.b32 %r726, %r725, %r664; - shf.l.wrap.b32 %r727, %r726, %r726, 16; - add.s32 %r728, %r727, %r651; - xor.b32 %r729, %r728, %r639; - shf.l.wrap.b32 %r730, %r729, %r729, 20; - add.s32 %r731, %r725, %r93; - add.s32 %r732, %r731, %r730; - xor.b32 %r733, %r732, %r727; - shf.l.wrap.b32 %r734, %r733, %r733, 24; - add.s32 %r735, %r734, %r728; - xor.b32 %r736, %r735, %r730; - shf.l.wrap.b32 %r737, %r736, %r736, 25; - add.s32 %r738, %r690, %r129; - add.s32 %r739, %r738, %r737; - xor.b32 %r740, %r739, %r706; - shf.l.wrap.b32 %r741, %r740, %r740, 16; - add.s32 %r742, %r741, %r721; - xor.b32 %r743, %r742, %r737; - shf.l.wrap.b32 %r744, %r743, %r743, 20; - add.s32 %r745, %r739, %r169; - add.s32 %r746, %r745, %r744; - xor.b32 %r747, %r746, %r741; - shf.l.wrap.b32 %r748, %r747, %r747, 24; - add.s32 %r749, %r748, %r742; - xor.b32 %r750, %r749, %r744; - shf.l.wrap.b32 %r751, %r750, %r750, 25; - add.s32 %r752, %r704, %r145; - add.s32 %r753, %r752, %r695; - xor.b32 %r754, %r753, %r720; - shf.l.wrap.b32 %r755, %r754, %r754, 16; - add.s32 %r756, %r755, %r735; - xor.b32 %r757, %r756, %r695; - shf.l.wrap.b32 %r758, %r757, %r757, 20; - add.s32 %r759, %r753, %r101; - add.s32 %r760, %r759, %r758; - xor.b32 %r761, %r760, %r755; - shf.l.wrap.b32 %r762, %r761, %r761, 24; - add.s32 %r763, %r762, %r756; - xor.b32 %r764, %r763, %r758; - shf.l.wrap.b32 %r765, %r764, %r764, 25; - add.s32 %r766, %r718, %r121; - add.s32 %r767, %r766, %r709; - xor.b32 %r768, %r767, %r734; - shf.l.wrap.b32 %r769, %r768, %r768, 16; - add.s32 %r770, %r769, %r693; - xor.b32 %r771, %r770, %r709; - shf.l.wrap.b32 %r772, %r771, %r771, 20; - add.s32 %r773, %r767, %r153; - add.s32 %r774, %r773, %r772; - xor.b32 %r775, %r774, %r769; - shf.l.wrap.b32 %r776, %r775, %r775, 24; - add.s32 %r777, %r776, %r770; - xor.b32 %r778, %r777, %r772; - shf.l.wrap.b32 %r779, %r778, %r778, 25; - add.s32 %r780, %r732, %r177; - add.s32 %r781, %r780, %r723; - xor.b32 %r782, %r781, %r692; - shf.l.wrap.b32 %r783, %r782, %r782, 16; - add.s32 %r784, %r783, %r707; - xor.b32 %r785, %r784, %r723; - shf.l.wrap.b32 %r786, %r785, %r785, 20; - add.s32 %r787, %r781, %r77; - add.s32 %r788, %r787, %r786; - xor.b32 %r789, %r788, %r783; - shf.l.wrap.b32 %r790, %r789, %r789, 24; - add.s32 %r791, %r790, %r784; - xor.b32 %r792, %r791, %r786; - shf.l.wrap.b32 %r793, %r792, %r792, 25; - add.s32 %r794, %r746, %r161; - add.s32 %r795, %r794, %r765; - xor.b32 %r796, %r795, %r790; - shf.l.wrap.b32 %r797, %r796, %r796, 16; - add.s32 %r798, %r797, %r777; - xor.b32 %r799, %r798, %r765; - shf.l.wrap.b32 %r800, %r799, %r799, 20; - add.s32 %r801, %r795, %r89; - 
add.s32 %r802, %r801, %r800; - xor.b32 %r803, %r802, %r797; - shf.l.wrap.b32 %r804, %r803, %r803, 24; - add.s32 %r805, %r804, %r798; - xor.b32 %r806, %r805, %r800; - shf.l.wrap.b32 %r807, %r806, %r806, 25; - add.s32 %r808, %r760, %r69; - add.s32 %r809, %r808, %r779; - xor.b32 %r810, %r809, %r748; - shf.l.wrap.b32 %r811, %r810, %r810, 16; - add.s32 %r812, %r811, %r791; - xor.b32 %r813, %r812, %r779; - shf.l.wrap.b32 %r814, %r813, %r813, 20; - add.s32 %r815, %r809, %r137; - add.s32 %r816, %r815, %r814; - xor.b32 %r817, %r816, %r811; - shf.l.wrap.b32 %r818, %r817, %r817, 24; - add.s32 %r819, %r818, %r812; - xor.b32 %r820, %r819, %r814; - shf.l.wrap.b32 %r821, %r820, %r820, 25; - add.s32 %r822, %r774, %r81; - add.s32 %r823, %r822, %r793; - xor.b32 %r824, %r823, %r762; - shf.l.wrap.b32 %r825, %r824, %r824, 16; - add.s32 %r826, %r825, %r749; - xor.b32 %r827, %r826, %r793; - shf.l.wrap.b32 %r828, %r827, %r827, 20; - add.s32 %r829, %r823, %r105; - add.s32 %r830, %r829, %r828; - xor.b32 %r831, %r830, %r825; - shf.l.wrap.b32 %r832, %r831, %r831, 24; - add.s32 %r833, %r832, %r826; - xor.b32 %r834, %r833, %r828; - shf.l.wrap.b32 %r835, %r834, %r834, 25; - add.s32 %r836, %r788, %r93; - add.s32 %r837, %r836, %r751; - xor.b32 %r838, %r837, %r776; - shf.l.wrap.b32 %r839, %r838, %r838, 16; - add.s32 %r840, %r839, %r763; - xor.b32 %r841, %r840, %r751; - shf.l.wrap.b32 %r842, %r841, %r841, 20; - add.s32 %r843, %r837, %r113; - add.s32 %r844, %r843, %r842; - xor.b32 %r845, %r844, %r839; - shf.l.wrap.b32 %r846, %r845, %r845, 24; - add.s32 %r847, %r846, %r840; - xor.b32 %r848, %r847, %r842; - shf.l.wrap.b32 %r849, %r848, %r848, 25; - add.s32 %r850, %r802, %r145; - add.s32 %r851, %r850, %r849; - xor.b32 %r852, %r851, %r818; - shf.l.wrap.b32 %r853, %r852, %r852, 16; - add.s32 %r854, %r853, %r833; - xor.b32 %r855, %r854, %r849; - shf.l.wrap.b32 %r856, %r855, %r855, 20; - add.s32 %r857, %r851, %r177; - add.s32 %r858, %r857, %r856; - xor.b32 %r859, %r858, %r853; - shf.l.wrap.b32 %r860, %r859, %r859, 24; - add.s32 %r861, %r860, %r854; - xor.b32 %r862, %r861, %r856; - shf.l.wrap.b32 %r863, %r862, %r862, 25; - add.s32 %r864, %r816, %r101; - add.s32 %r865, %r864, %r807; - xor.b32 %r866, %r865, %r832; - shf.l.wrap.b32 %r867, %r866, %r866, 16; - add.s32 %r868, %r867, %r847; - xor.b32 %r869, %r868, %r807; - shf.l.wrap.b32 %r870, %r869, %r869, 20; - add.s32 %r871, %r865, %r69; - add.s32 %r872, %r871, %r870; - xor.b32 %r873, %r872, %r867; - shf.l.wrap.b32 %r874, %r873, %r873, 24; - add.s32 %r875, %r874, %r868; - xor.b32 %r876, %r875, %r870; - shf.l.wrap.b32 %r877, %r876, %r876, 25; - add.s32 %r878, %r830, %r77; - add.s32 %r879, %r878, %r821; - xor.b32 %r880, %r879, %r846; - shf.l.wrap.b32 %r881, %r880, %r880, 16; - add.s32 %r882, %r881, %r805; - xor.b32 %r883, %r882, %r821; - shf.l.wrap.b32 %r884, %r883, %r883, 20; - add.s32 %r885, %r879, %r129; - add.s32 %r886, %r885, %r884; - xor.b32 %r887, %r886, %r881; - shf.l.wrap.b32 %r888, %r887, %r887, 24; - add.s32 %r889, %r888, %r882; - xor.b32 %r890, %r889, %r884; - shf.l.wrap.b32 %r891, %r890, %r890, 25; - add.s32 %r892, %r844, %r121; - add.s32 %r893, %r892, %r835; - xor.b32 %r894, %r893, %r804; - shf.l.wrap.b32 %r895, %r894, %r894, 16; - add.s32 %r896, %r895, %r819; - xor.b32 %r897, %r896, %r835; - shf.l.wrap.b32 %r898, %r897, %r897, 20; - add.s32 %r899, %r893, %r105; - add.s32 %r900, %r899, %r898; - xor.b32 %r901, %r900, %r895; - shf.l.wrap.b32 %r902, %r901, %r901, 24; - add.s32 %r903, %r902, %r896; - xor.b32 %r904, %r903, %r898; - shf.l.wrap.b32 %r905, %r904, %r904, 25; - 
[... generated PTX elided from this deleted kernel: the remainder of the unrolled BLAKE3 compression (32-bit add.s32/xor.b32 chains with shf.l.wrap.b32 rotations by 16, 20, 24, and 25), the XOR fold of the sixteen working words into the eight chaining-value words stored to local memory, the 16-byte block-counter bump with a flags byte formed by or.b16 %rs82, %rs73, 10, and a second compression seeded with the BLAKE3 IV constants (1779033703, -1150833019, 1013904242, -1521486534) ...]
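For orientation, the add/xor/rotate chains elided above are a full unrolling of the BLAKE3 quarter-round (rotate-left by 16/20/24/25 is rotate-right by 16/12/8/7). Below is a minimal CUDA sketch of the pattern one group of those instructions encodes; the names are illustrative, not taken from this codebase:

    // Hedged sketch: one BLAKE3 G quarter-round, matching the PTX's
    // add.s32 / xor.b32 / shf.l.wrap.b32 groups.
    #include <cstdint>

    __device__ __forceinline__ uint32_t rotl32(uint32_t x, int n) {
        return (x << n) | (x >> (32 - n));      // lowers to shf.l.wrap.b32
    }

    __device__ void g(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d,
                      uint32_t mx, uint32_t my) {
        a = a + b + mx;  d = rotl32(d ^ a, 16); // rotr 16
        c = c + d;       b = rotl32(b ^ c, 20); // rotr 12
        a = a + b + my;  d = rotl32(d ^ a, 24); // rotr 8
        c = c + d;       b = rotl32(b ^ c, 25); // rotr 7
    }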
[... PTX elided: byte-serialization of the resulting 256-bit state (bfi.b64/shl.b64/or.b64 over four 64-bit words), followed by unpacking of every byte into its two 4-bit nibbles, packed one nibble per byte lane with bfe/bfi/prmt so the vector can be consumed by the dp4a products below ...]
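The bfe/bfi/prmt shuffling summarized above spreads each hash byte's high and low nibble across the byte lanes of a 32-bit word. A hedged sketch of that packing (lane order inferred from the bfi/prmt masks; the helper name is mine):

    // Hedged sketch: pack two hash bytes' nibbles one per byte lane for dp4a.
    #include <cstdint>

    __device__ uint32_t pack_nibbles(uint8_t b0, uint8_t b1) {
        // lanes, low to high: hi(b0), lo(b0), hi(b1), lo(b1)
        return  (uint32_t)(b0 >> 4)
              | ((uint32_t)(b0 & 0xF) << 8)
              | ((uint32_t)(b1 >> 4)  << 16)
              | ((uint32_t)(b1 & 0xF) << 24);
    }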
[... PTX elided: the opening of the fully unrolled matrix-vector product, which continues below. Each matrix row is read as sixteen constant words (ld.const.u32 [matrix+off]) and folded into a running dot product with dp4a.u32.u32; pairs of row sums are then truncated to their top four bits and XOR-combined with the corresponding hash byte via lop3 ... 0x56, which computes (a | b) ^ c ...]
inline asm - ld.const.u32 %r2709, [matrix+968]; - // begin inline asm - dp4a.u32.u32 %r2708, %r2709, %r5878, %r2704; - // end inline asm - ld.const.u32 %r2713, [matrix+972]; - // begin inline asm - dp4a.u32.u32 %r2712, %r2713, %r5882, %r2708; - // end inline asm - ld.const.u32 %r2717, [matrix+976]; - // begin inline asm - dp4a.u32.u32 %r2716, %r2717, %r5886, %r2712; - // end inline asm - ld.const.u32 %r2721, [matrix+980]; - // begin inline asm - dp4a.u32.u32 %r2720, %r2721, %r5890, %r2716; - // end inline asm - ld.const.u32 %r2725, [matrix+984]; - // begin inline asm - dp4a.u32.u32 %r2724, %r2725, %r5894, %r2720; - // end inline asm - ld.const.u32 %r2729, [matrix+988]; - // begin inline asm - dp4a.u32.u32 %r2728, %r2729, %r5898, %r2724; - // end inline asm - ld.const.u32 %r2733, [matrix+992]; - // begin inline asm - dp4a.u32.u32 %r2732, %r2733, %r5902, %r2728; - // end inline asm - ld.const.u32 %r2737, [matrix+996]; - // begin inline asm - dp4a.u32.u32 %r2736, %r2737, %r5906, %r2732; - // end inline asm - ld.const.u32 %r2741, [matrix+1000]; - // begin inline asm - dp4a.u32.u32 %r2740, %r2741, %r5910, %r2736; - // end inline asm - ld.const.u32 %r2745, [matrix+1004]; - // begin inline asm - dp4a.u32.u32 %r2744, %r2745, %r5914, %r2740; - // end inline asm - ld.const.u32 %r2749, [matrix+1008]; - // begin inline asm - dp4a.u32.u32 %r2748, %r2749, %r5918, %r2744; - // end inline asm - ld.const.u32 %r2753, [matrix+1012]; - // begin inline asm - dp4a.u32.u32 %r2752, %r2753, %r5922, %r2748; - // end inline asm - ld.const.u32 %r2757, [matrix+1016]; - // begin inline asm - dp4a.u32.u32 %r2756, %r2757, %r5926, %r2752; - // end inline asm - ld.const.u32 %r2761, [matrix+1020]; - // begin inline asm - dp4a.u32.u32 %r2760, %r2761, %r5930, %r2756; - // end inline asm - shr.u32 %r6084, %r2696, 6; - and.b32 %r2765, %r6084, 240; - shr.u32 %r2766, %r2760, 10; - // begin inline asm - lop3.b32 %r2764, %r2765, %r2766, %r10, 0x56; - // end inline asm - ld.const.u32 %r2769, [matrix+1024]; - // begin inline asm - dp4a.u32.u32 %r2768, %r2769, %r5870, %r6249; - // end inline asm - ld.const.u32 %r2773, [matrix+1028]; - // begin inline asm - dp4a.u32.u32 %r2772, %r2773, %r5874, %r2768; - // end inline asm - ld.const.u32 %r2777, [matrix+1032]; - // begin inline asm - dp4a.u32.u32 %r2776, %r2777, %r5878, %r2772; - // end inline asm - ld.const.u32 %r2781, [matrix+1036]; - // begin inline asm - dp4a.u32.u32 %r2780, %r2781, %r5882, %r2776; - // end inline asm - ld.const.u32 %r2785, [matrix+1040]; - // begin inline asm - dp4a.u32.u32 %r2784, %r2785, %r5886, %r2780; - // end inline asm - ld.const.u32 %r2789, [matrix+1044]; - // begin inline asm - dp4a.u32.u32 %r2788, %r2789, %r5890, %r2784; - // end inline asm - ld.const.u32 %r2793, [matrix+1048]; - // begin inline asm - dp4a.u32.u32 %r2792, %r2793, %r5894, %r2788; - // end inline asm - ld.const.u32 %r2797, [matrix+1052]; - // begin inline asm - dp4a.u32.u32 %r2796, %r2797, %r5898, %r2792; - // end inline asm - ld.const.u32 %r2801, [matrix+1056]; - // begin inline asm - dp4a.u32.u32 %r2800, %r2801, %r5902, %r2796; - // end inline asm - ld.const.u32 %r2805, [matrix+1060]; - // begin inline asm - dp4a.u32.u32 %r2804, %r2805, %r5906, %r2800; - // end inline asm - ld.const.u32 %r2809, [matrix+1064]; - // begin inline asm - dp4a.u32.u32 %r2808, %r2809, %r5910, %r2804; - // end inline asm - ld.const.u32 %r2813, [matrix+1068]; - // begin inline asm - dp4a.u32.u32 %r2812, %r2813, %r5914, %r2808; - // end inline asm - ld.const.u32 %r2817, [matrix+1072]; - // begin inline asm - 
dp4a.u32.u32 %r2816, %r2817, %r5918, %r2812; - // end inline asm - ld.const.u32 %r2821, [matrix+1076]; - // begin inline asm - dp4a.u32.u32 %r2820, %r2821, %r5922, %r2816; - // end inline asm - ld.const.u32 %r2825, [matrix+1080]; - // begin inline asm - dp4a.u32.u32 %r2824, %r2825, %r5926, %r2820; - // end inline asm - ld.const.u32 %r2829, [matrix+1084]; - // begin inline asm - dp4a.u32.u32 %r2828, %r2829, %r5930, %r2824; - // end inline asm - ld.const.u32 %r2833, [matrix+1088]; - // begin inline asm - dp4a.u32.u32 %r2832, %r2833, %r5870, %r6249; - // end inline asm - ld.const.u32 %r2837, [matrix+1092]; - // begin inline asm - dp4a.u32.u32 %r2836, %r2837, %r5874, %r2832; - // end inline asm - ld.const.u32 %r2841, [matrix+1096]; - // begin inline asm - dp4a.u32.u32 %r2840, %r2841, %r5878, %r2836; - // end inline asm - ld.const.u32 %r2845, [matrix+1100]; - // begin inline asm - dp4a.u32.u32 %r2844, %r2845, %r5882, %r2840; - // end inline asm - ld.const.u32 %r2849, [matrix+1104]; - // begin inline asm - dp4a.u32.u32 %r2848, %r2849, %r5886, %r2844; - // end inline asm - ld.const.u32 %r2853, [matrix+1108]; - // begin inline asm - dp4a.u32.u32 %r2852, %r2853, %r5890, %r2848; - // end inline asm - ld.const.u32 %r2857, [matrix+1112]; - // begin inline asm - dp4a.u32.u32 %r2856, %r2857, %r5894, %r2852; - // end inline asm - ld.const.u32 %r2861, [matrix+1116]; - // begin inline asm - dp4a.u32.u32 %r2860, %r2861, %r5898, %r2856; - // end inline asm - ld.const.u32 %r2865, [matrix+1120]; - // begin inline asm - dp4a.u32.u32 %r2864, %r2865, %r5902, %r2860; - // end inline asm - ld.const.u32 %r2869, [matrix+1124]; - // begin inline asm - dp4a.u32.u32 %r2868, %r2869, %r5906, %r2864; - // end inline asm - ld.const.u32 %r2873, [matrix+1128]; - // begin inline asm - dp4a.u32.u32 %r2872, %r2873, %r5910, %r2868; - // end inline asm - ld.const.u32 %r2877, [matrix+1132]; - // begin inline asm - dp4a.u32.u32 %r2876, %r2877, %r5914, %r2872; - // end inline asm - ld.const.u32 %r2881, [matrix+1136]; - // begin inline asm - dp4a.u32.u32 %r2880, %r2881, %r5918, %r2876; - // end inline asm - ld.const.u32 %r2885, [matrix+1140]; - // begin inline asm - dp4a.u32.u32 %r2884, %r2885, %r5922, %r2880; - // end inline asm - ld.const.u32 %r2889, [matrix+1144]; - // begin inline asm - dp4a.u32.u32 %r2888, %r2889, %r5926, %r2884; - // end inline asm - ld.const.u32 %r2893, [matrix+1148]; - // begin inline asm - dp4a.u32.u32 %r2892, %r2893, %r5930, %r2888; - // end inline asm - shr.u32 %r6085, %r2828, 6; - and.b32 %r2897, %r6085, 240; - shr.u32 %r2898, %r2892, 10; - and.b32 %r2899, %r11, 255; - // begin inline asm - lop3.b32 %r2896, %r2897, %r2898, %r2899, 0x56; - // end inline asm - ld.const.u32 %r2901, [matrix+1152]; - // begin inline asm - dp4a.u32.u32 %r2900, %r2901, %r5870, %r6249; - // end inline asm - ld.const.u32 %r2905, [matrix+1156]; - // begin inline asm - dp4a.u32.u32 %r2904, %r2905, %r5874, %r2900; - // end inline asm - ld.const.u32 %r2909, [matrix+1160]; - // begin inline asm - dp4a.u32.u32 %r2908, %r2909, %r5878, %r2904; - // end inline asm - ld.const.u32 %r2913, [matrix+1164]; - // begin inline asm - dp4a.u32.u32 %r2912, %r2913, %r5882, %r2908; - // end inline asm - ld.const.u32 %r2917, [matrix+1168]; - // begin inline asm - dp4a.u32.u32 %r2916, %r2917, %r5886, %r2912; - // end inline asm - ld.const.u32 %r2921, [matrix+1172]; - // begin inline asm - dp4a.u32.u32 %r2920, %r2921, %r5890, %r2916; - // end inline asm - ld.const.u32 %r2925, [matrix+1176]; - // begin inline asm - dp4a.u32.u32 %r2924, %r2925, %r5894, 
%r2920; - // end inline asm - ld.const.u32 %r2929, [matrix+1180]; - // begin inline asm - dp4a.u32.u32 %r2928, %r2929, %r5898, %r2924; - // end inline asm - ld.const.u32 %r2933, [matrix+1184]; - // begin inline asm - dp4a.u32.u32 %r2932, %r2933, %r5902, %r2928; - // end inline asm - ld.const.u32 %r2937, [matrix+1188]; - // begin inline asm - dp4a.u32.u32 %r2936, %r2937, %r5906, %r2932; - // end inline asm - ld.const.u32 %r2941, [matrix+1192]; - // begin inline asm - dp4a.u32.u32 %r2940, %r2941, %r5910, %r2936; - // end inline asm - ld.const.u32 %r2945, [matrix+1196]; - // begin inline asm - dp4a.u32.u32 %r2944, %r2945, %r5914, %r2940; - // end inline asm - ld.const.u32 %r2949, [matrix+1200]; - // begin inline asm - dp4a.u32.u32 %r2948, %r2949, %r5918, %r2944; - // end inline asm - ld.const.u32 %r2953, [matrix+1204]; - // begin inline asm - dp4a.u32.u32 %r2952, %r2953, %r5922, %r2948; - // end inline asm - ld.const.u32 %r2957, [matrix+1208]; - // begin inline asm - dp4a.u32.u32 %r2956, %r2957, %r5926, %r2952; - // end inline asm - ld.const.u32 %r2961, [matrix+1212]; - // begin inline asm - dp4a.u32.u32 %r2960, %r2961, %r5930, %r2956; - // end inline asm - ld.const.u32 %r2965, [matrix+1216]; - // begin inline asm - dp4a.u32.u32 %r2964, %r2965, %r5870, %r6249; - // end inline asm - ld.const.u32 %r2969, [matrix+1220]; - // begin inline asm - dp4a.u32.u32 %r2968, %r2969, %r5874, %r2964; - // end inline asm - ld.const.u32 %r2973, [matrix+1224]; - // begin inline asm - dp4a.u32.u32 %r2972, %r2973, %r5878, %r2968; - // end inline asm - ld.const.u32 %r2977, [matrix+1228]; - // begin inline asm - dp4a.u32.u32 %r2976, %r2977, %r5882, %r2972; - // end inline asm - ld.const.u32 %r2981, [matrix+1232]; - // begin inline asm - dp4a.u32.u32 %r2980, %r2981, %r5886, %r2976; - // end inline asm - ld.const.u32 %r2985, [matrix+1236]; - // begin inline asm - dp4a.u32.u32 %r2984, %r2985, %r5890, %r2980; - // end inline asm - ld.const.u32 %r2989, [matrix+1240]; - // begin inline asm - dp4a.u32.u32 %r2988, %r2989, %r5894, %r2984; - // end inline asm - ld.const.u32 %r2993, [matrix+1244]; - // begin inline asm - dp4a.u32.u32 %r2992, %r2993, %r5898, %r2988; - // end inline asm - ld.const.u32 %r2997, [matrix+1248]; - // begin inline asm - dp4a.u32.u32 %r2996, %r2997, %r5902, %r2992; - // end inline asm - ld.const.u32 %r3001, [matrix+1252]; - // begin inline asm - dp4a.u32.u32 %r3000, %r3001, %r5906, %r2996; - // end inline asm - ld.const.u32 %r3005, [matrix+1256]; - // begin inline asm - dp4a.u32.u32 %r3004, %r3005, %r5910, %r3000; - // end inline asm - ld.const.u32 %r3009, [matrix+1260]; - // begin inline asm - dp4a.u32.u32 %r3008, %r3009, %r5914, %r3004; - // end inline asm - ld.const.u32 %r3013, [matrix+1264]; - // begin inline asm - dp4a.u32.u32 %r3012, %r3013, %r5918, %r3008; - // end inline asm - ld.const.u32 %r3017, [matrix+1268]; - // begin inline asm - dp4a.u32.u32 %r3016, %r3017, %r5922, %r3012; - // end inline asm - ld.const.u32 %r3021, [matrix+1272]; - // begin inline asm - dp4a.u32.u32 %r3020, %r3021, %r5926, %r3016; - // end inline asm - ld.const.u32 %r3025, [matrix+1276]; - // begin inline asm - dp4a.u32.u32 %r3024, %r3025, %r5930, %r3020; - // end inline asm - shr.u32 %r6086, %r2960, 6; - and.b32 %r3029, %r6086, 240; - shr.u32 %r3030, %r3024, 10; - bfe.u32 %r3031, %r11, 8, 8; - // begin inline asm - lop3.b32 %r3028, %r3029, %r3030, %r3031, 0x56; - // end inline asm - ld.const.u32 %r3033, [matrix+1280]; - // begin inline asm - dp4a.u32.u32 %r3032, %r3033, %r5870, %r6249; - // end inline asm - 
ld.const.u32 %r3037, [matrix+1284]; - // begin inline asm - dp4a.u32.u32 %r3036, %r3037, %r5874, %r3032; - // end inline asm - ld.const.u32 %r3041, [matrix+1288]; - // begin inline asm - dp4a.u32.u32 %r3040, %r3041, %r5878, %r3036; - // end inline asm - ld.const.u32 %r3045, [matrix+1292]; - // begin inline asm - dp4a.u32.u32 %r3044, %r3045, %r5882, %r3040; - // end inline asm - ld.const.u32 %r3049, [matrix+1296]; - // begin inline asm - dp4a.u32.u32 %r3048, %r3049, %r5886, %r3044; - // end inline asm - ld.const.u32 %r3053, [matrix+1300]; - // begin inline asm - dp4a.u32.u32 %r3052, %r3053, %r5890, %r3048; - // end inline asm - ld.const.u32 %r3057, [matrix+1304]; - // begin inline asm - dp4a.u32.u32 %r3056, %r3057, %r5894, %r3052; - // end inline asm - ld.const.u32 %r3061, [matrix+1308]; - // begin inline asm - dp4a.u32.u32 %r3060, %r3061, %r5898, %r3056; - // end inline asm - ld.const.u32 %r3065, [matrix+1312]; - // begin inline asm - dp4a.u32.u32 %r3064, %r3065, %r5902, %r3060; - // end inline asm - ld.const.u32 %r3069, [matrix+1316]; - // begin inline asm - dp4a.u32.u32 %r3068, %r3069, %r5906, %r3064; - // end inline asm - ld.const.u32 %r3073, [matrix+1320]; - // begin inline asm - dp4a.u32.u32 %r3072, %r3073, %r5910, %r3068; - // end inline asm - ld.const.u32 %r3077, [matrix+1324]; - // begin inline asm - dp4a.u32.u32 %r3076, %r3077, %r5914, %r3072; - // end inline asm - ld.const.u32 %r3081, [matrix+1328]; - // begin inline asm - dp4a.u32.u32 %r3080, %r3081, %r5918, %r3076; - // end inline asm - ld.const.u32 %r3085, [matrix+1332]; - // begin inline asm - dp4a.u32.u32 %r3084, %r3085, %r5922, %r3080; - // end inline asm - ld.const.u32 %r3089, [matrix+1336]; - // begin inline asm - dp4a.u32.u32 %r3088, %r3089, %r5926, %r3084; - // end inline asm - ld.const.u32 %r3093, [matrix+1340]; - // begin inline asm - dp4a.u32.u32 %r3092, %r3093, %r5930, %r3088; - // end inline asm - ld.const.u32 %r3097, [matrix+1344]; - // begin inline asm - dp4a.u32.u32 %r3096, %r3097, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3101, [matrix+1348]; - // begin inline asm - dp4a.u32.u32 %r3100, %r3101, %r5874, %r3096; - // end inline asm - ld.const.u32 %r3105, [matrix+1352]; - // begin inline asm - dp4a.u32.u32 %r3104, %r3105, %r5878, %r3100; - // end inline asm - ld.const.u32 %r3109, [matrix+1356]; - // begin inline asm - dp4a.u32.u32 %r3108, %r3109, %r5882, %r3104; - // end inline asm - ld.const.u32 %r3113, [matrix+1360]; - // begin inline asm - dp4a.u32.u32 %r3112, %r3113, %r5886, %r3108; - // end inline asm - ld.const.u32 %r3117, [matrix+1364]; - // begin inline asm - dp4a.u32.u32 %r3116, %r3117, %r5890, %r3112; - // end inline asm - ld.const.u32 %r3121, [matrix+1368]; - // begin inline asm - dp4a.u32.u32 %r3120, %r3121, %r5894, %r3116; - // end inline asm - ld.const.u32 %r3125, [matrix+1372]; - // begin inline asm - dp4a.u32.u32 %r3124, %r3125, %r5898, %r3120; - // end inline asm - ld.const.u32 %r3129, [matrix+1376]; - // begin inline asm - dp4a.u32.u32 %r3128, %r3129, %r5902, %r3124; - // end inline asm - ld.const.u32 %r3133, [matrix+1380]; - // begin inline asm - dp4a.u32.u32 %r3132, %r3133, %r5906, %r3128; - // end inline asm - ld.const.u32 %r3137, [matrix+1384]; - // begin inline asm - dp4a.u32.u32 %r3136, %r3137, %r5910, %r3132; - // end inline asm - ld.const.u32 %r3141, [matrix+1388]; - // begin inline asm - dp4a.u32.u32 %r3140, %r3141, %r5914, %r3136; - // end inline asm - ld.const.u32 %r3145, [matrix+1392]; - // begin inline asm - dp4a.u32.u32 %r3144, %r3145, %r5918, %r3140; - // end inline asm 
- ld.const.u32 %r3149, [matrix+1396]; - // begin inline asm - dp4a.u32.u32 %r3148, %r3149, %r5922, %r3144; - // end inline asm - ld.const.u32 %r3153, [matrix+1400]; - // begin inline asm - dp4a.u32.u32 %r3152, %r3153, %r5926, %r3148; - // end inline asm - ld.const.u32 %r3157, [matrix+1404]; - // begin inline asm - dp4a.u32.u32 %r3156, %r3157, %r5930, %r3152; - // end inline asm - shr.u32 %r6087, %r3092, 6; - and.b32 %r3161, %r6087, 240; - shr.u32 %r3162, %r3156, 10; - bfe.u32 %r3163, %r11, 16, 8; - // begin inline asm - lop3.b32 %r3160, %r3161, %r3162, %r3163, 0x56; - // end inline asm - ld.const.u32 %r3165, [matrix+1408]; - // begin inline asm - dp4a.u32.u32 %r3164, %r3165, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3169, [matrix+1412]; - // begin inline asm - dp4a.u32.u32 %r3168, %r3169, %r5874, %r3164; - // end inline asm - ld.const.u32 %r3173, [matrix+1416]; - // begin inline asm - dp4a.u32.u32 %r3172, %r3173, %r5878, %r3168; - // end inline asm - ld.const.u32 %r3177, [matrix+1420]; - // begin inline asm - dp4a.u32.u32 %r3176, %r3177, %r5882, %r3172; - // end inline asm - ld.const.u32 %r3181, [matrix+1424]; - // begin inline asm - dp4a.u32.u32 %r3180, %r3181, %r5886, %r3176; - // end inline asm - ld.const.u32 %r3185, [matrix+1428]; - // begin inline asm - dp4a.u32.u32 %r3184, %r3185, %r5890, %r3180; - // end inline asm - ld.const.u32 %r3189, [matrix+1432]; - // begin inline asm - dp4a.u32.u32 %r3188, %r3189, %r5894, %r3184; - // end inline asm - ld.const.u32 %r3193, [matrix+1436]; - // begin inline asm - dp4a.u32.u32 %r3192, %r3193, %r5898, %r3188; - // end inline asm - ld.const.u32 %r3197, [matrix+1440]; - // begin inline asm - dp4a.u32.u32 %r3196, %r3197, %r5902, %r3192; - // end inline asm - ld.const.u32 %r3201, [matrix+1444]; - // begin inline asm - dp4a.u32.u32 %r3200, %r3201, %r5906, %r3196; - // end inline asm - ld.const.u32 %r3205, [matrix+1448]; - // begin inline asm - dp4a.u32.u32 %r3204, %r3205, %r5910, %r3200; - // end inline asm - ld.const.u32 %r3209, [matrix+1452]; - // begin inline asm - dp4a.u32.u32 %r3208, %r3209, %r5914, %r3204; - // end inline asm - ld.const.u32 %r3213, [matrix+1456]; - // begin inline asm - dp4a.u32.u32 %r3212, %r3213, %r5918, %r3208; - // end inline asm - ld.const.u32 %r3217, [matrix+1460]; - // begin inline asm - dp4a.u32.u32 %r3216, %r3217, %r5922, %r3212; - // end inline asm - ld.const.u32 %r3221, [matrix+1464]; - // begin inline asm - dp4a.u32.u32 %r3220, %r3221, %r5926, %r3216; - // end inline asm - ld.const.u32 %r3225, [matrix+1468]; - // begin inline asm - dp4a.u32.u32 %r3224, %r3225, %r5930, %r3220; - // end inline asm - ld.const.u32 %r3229, [matrix+1472]; - // begin inline asm - dp4a.u32.u32 %r3228, %r3229, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3233, [matrix+1476]; - // begin inline asm - dp4a.u32.u32 %r3232, %r3233, %r5874, %r3228; - // end inline asm - ld.const.u32 %r3237, [matrix+1480]; - // begin inline asm - dp4a.u32.u32 %r3236, %r3237, %r5878, %r3232; - // end inline asm - ld.const.u32 %r3241, [matrix+1484]; - // begin inline asm - dp4a.u32.u32 %r3240, %r3241, %r5882, %r3236; - // end inline asm - ld.const.u32 %r3245, [matrix+1488]; - // begin inline asm - dp4a.u32.u32 %r3244, %r3245, %r5886, %r3240; - // end inline asm - ld.const.u32 %r3249, [matrix+1492]; - // begin inline asm - dp4a.u32.u32 %r3248, %r3249, %r5890, %r3244; - // end inline asm - ld.const.u32 %r3253, [matrix+1496]; - // begin inline asm - dp4a.u32.u32 %r3252, %r3253, %r5894, %r3248; - // end inline asm - ld.const.u32 %r3257, [matrix+1500]; - 
// begin inline asm - dp4a.u32.u32 %r3256, %r3257, %r5898, %r3252; - // end inline asm - ld.const.u32 %r3261, [matrix+1504]; - // begin inline asm - dp4a.u32.u32 %r3260, %r3261, %r5902, %r3256; - // end inline asm - ld.const.u32 %r3265, [matrix+1508]; - // begin inline asm - dp4a.u32.u32 %r3264, %r3265, %r5906, %r3260; - // end inline asm - ld.const.u32 %r3269, [matrix+1512]; - // begin inline asm - dp4a.u32.u32 %r3268, %r3269, %r5910, %r3264; - // end inline asm - ld.const.u32 %r3273, [matrix+1516]; - // begin inline asm - dp4a.u32.u32 %r3272, %r3273, %r5914, %r3268; - // end inline asm - ld.const.u32 %r3277, [matrix+1520]; - // begin inline asm - dp4a.u32.u32 %r3276, %r3277, %r5918, %r3272; - // end inline asm - ld.const.u32 %r3281, [matrix+1524]; - // begin inline asm - dp4a.u32.u32 %r3280, %r3281, %r5922, %r3276; - // end inline asm - ld.const.u32 %r3285, [matrix+1528]; - // begin inline asm - dp4a.u32.u32 %r3284, %r3285, %r5926, %r3280; - // end inline asm - ld.const.u32 %r3289, [matrix+1532]; - // begin inline asm - dp4a.u32.u32 %r3288, %r3289, %r5930, %r3284; - // end inline asm - shr.u32 %r6088, %r3224, 6; - and.b32 %r3293, %r6088, 240; - shr.u32 %r3294, %r3288, 10; - // begin inline asm - lop3.b32 %r3292, %r3293, %r3294, %r3295, 0x56; - // end inline asm - ld.const.u32 %r3297, [matrix+1536]; - // begin inline asm - dp4a.u32.u32 %r3296, %r3297, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3301, [matrix+1540]; - // begin inline asm - dp4a.u32.u32 %r3300, %r3301, %r5874, %r3296; - // end inline asm - ld.const.u32 %r3305, [matrix+1544]; - // begin inline asm - dp4a.u32.u32 %r3304, %r3305, %r5878, %r3300; - // end inline asm - ld.const.u32 %r3309, [matrix+1548]; - // begin inline asm - dp4a.u32.u32 %r3308, %r3309, %r5882, %r3304; - // end inline asm - ld.const.u32 %r3313, [matrix+1552]; - // begin inline asm - dp4a.u32.u32 %r3312, %r3313, %r5886, %r3308; - // end inline asm - ld.const.u32 %r3317, [matrix+1556]; - // begin inline asm - dp4a.u32.u32 %r3316, %r3317, %r5890, %r3312; - // end inline asm - ld.const.u32 %r3321, [matrix+1560]; - // begin inline asm - dp4a.u32.u32 %r3320, %r3321, %r5894, %r3316; - // end inline asm - ld.const.u32 %r3325, [matrix+1564]; - // begin inline asm - dp4a.u32.u32 %r3324, %r3325, %r5898, %r3320; - // end inline asm - ld.const.u32 %r3329, [matrix+1568]; - // begin inline asm - dp4a.u32.u32 %r3328, %r3329, %r5902, %r3324; - // end inline asm - ld.const.u32 %r3333, [matrix+1572]; - // begin inline asm - dp4a.u32.u32 %r3332, %r3333, %r5906, %r3328; - // end inline asm - ld.const.u32 %r3337, [matrix+1576]; - // begin inline asm - dp4a.u32.u32 %r3336, %r3337, %r5910, %r3332; - // end inline asm - ld.const.u32 %r3341, [matrix+1580]; - // begin inline asm - dp4a.u32.u32 %r3340, %r3341, %r5914, %r3336; - // end inline asm - ld.const.u32 %r3345, [matrix+1584]; - // begin inline asm - dp4a.u32.u32 %r3344, %r3345, %r5918, %r3340; - // end inline asm - ld.const.u32 %r3349, [matrix+1588]; - // begin inline asm - dp4a.u32.u32 %r3348, %r3349, %r5922, %r3344; - // end inline asm - ld.const.u32 %r3353, [matrix+1592]; - // begin inline asm - dp4a.u32.u32 %r3352, %r3353, %r5926, %r3348; - // end inline asm - ld.const.u32 %r3357, [matrix+1596]; - // begin inline asm - dp4a.u32.u32 %r3356, %r3357, %r5930, %r3352; - // end inline asm - ld.const.u32 %r3361, [matrix+1600]; - // begin inline asm - dp4a.u32.u32 %r3360, %r3361, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3365, [matrix+1604]; - // begin inline asm - dp4a.u32.u32 %r3364, %r3365, %r5874, %r3360; - // 
end inline asm - ld.const.u32 %r3369, [matrix+1608]; - // begin inline asm - dp4a.u32.u32 %r3368, %r3369, %r5878, %r3364; - // end inline asm - ld.const.u32 %r3373, [matrix+1612]; - // begin inline asm - dp4a.u32.u32 %r3372, %r3373, %r5882, %r3368; - // end inline asm - ld.const.u32 %r3377, [matrix+1616]; - // begin inline asm - dp4a.u32.u32 %r3376, %r3377, %r5886, %r3372; - // end inline asm - ld.const.u32 %r3381, [matrix+1620]; - // begin inline asm - dp4a.u32.u32 %r3380, %r3381, %r5890, %r3376; - // end inline asm - ld.const.u32 %r3385, [matrix+1624]; - // begin inline asm - dp4a.u32.u32 %r3384, %r3385, %r5894, %r3380; - // end inline asm - ld.const.u32 %r3389, [matrix+1628]; - // begin inline asm - dp4a.u32.u32 %r3388, %r3389, %r5898, %r3384; - // end inline asm - ld.const.u32 %r3393, [matrix+1632]; - // begin inline asm - dp4a.u32.u32 %r3392, %r3393, %r5902, %r3388; - // end inline asm - ld.const.u32 %r3397, [matrix+1636]; - // begin inline asm - dp4a.u32.u32 %r3396, %r3397, %r5906, %r3392; - // end inline asm - ld.const.u32 %r3401, [matrix+1640]; - // begin inline asm - dp4a.u32.u32 %r3400, %r3401, %r5910, %r3396; - // end inline asm - ld.const.u32 %r3405, [matrix+1644]; - // begin inline asm - dp4a.u32.u32 %r3404, %r3405, %r5914, %r3400; - // end inline asm - ld.const.u32 %r3409, [matrix+1648]; - // begin inline asm - dp4a.u32.u32 %r3408, %r3409, %r5918, %r3404; - // end inline asm - ld.const.u32 %r3413, [matrix+1652]; - // begin inline asm - dp4a.u32.u32 %r3412, %r3413, %r5922, %r3408; - // end inline asm - ld.const.u32 %r3417, [matrix+1656]; - // begin inline asm - dp4a.u32.u32 %r3416, %r3417, %r5926, %r3412; - // end inline asm - ld.const.u32 %r3421, [matrix+1660]; - // begin inline asm - dp4a.u32.u32 %r3420, %r3421, %r5930, %r3416; - // end inline asm - shr.u32 %r6089, %r3356, 6; - and.b32 %r3425, %r6089, 240; - shr.u32 %r3426, %r3420, 10; - and.b32 %r3427, %r5986, 255; - // begin inline asm - lop3.b32 %r3424, %r3425, %r3426, %r3427, 0x56; - // end inline asm - ld.const.u32 %r3429, [matrix+1664]; - // begin inline asm - dp4a.u32.u32 %r3428, %r3429, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3433, [matrix+1668]; - // begin inline asm - dp4a.u32.u32 %r3432, %r3433, %r5874, %r3428; - // end inline asm - ld.const.u32 %r3437, [matrix+1672]; - // begin inline asm - dp4a.u32.u32 %r3436, %r3437, %r5878, %r3432; - // end inline asm - ld.const.u32 %r3441, [matrix+1676]; - // begin inline asm - dp4a.u32.u32 %r3440, %r3441, %r5882, %r3436; - // end inline asm - ld.const.u32 %r3445, [matrix+1680]; - // begin inline asm - dp4a.u32.u32 %r3444, %r3445, %r5886, %r3440; - // end inline asm - ld.const.u32 %r3449, [matrix+1684]; - // begin inline asm - dp4a.u32.u32 %r3448, %r3449, %r5890, %r3444; - // end inline asm - ld.const.u32 %r3453, [matrix+1688]; - // begin inline asm - dp4a.u32.u32 %r3452, %r3453, %r5894, %r3448; - // end inline asm - ld.const.u32 %r3457, [matrix+1692]; - // begin inline asm - dp4a.u32.u32 %r3456, %r3457, %r5898, %r3452; - // end inline asm - ld.const.u32 %r3461, [matrix+1696]; - // begin inline asm - dp4a.u32.u32 %r3460, %r3461, %r5902, %r3456; - // end inline asm - ld.const.u32 %r3465, [matrix+1700]; - // begin inline asm - dp4a.u32.u32 %r3464, %r3465, %r5906, %r3460; - // end inline asm - ld.const.u32 %r3469, [matrix+1704]; - // begin inline asm - dp4a.u32.u32 %r3468, %r3469, %r5910, %r3464; - // end inline asm - ld.const.u32 %r3473, [matrix+1708]; - // begin inline asm - dp4a.u32.u32 %r3472, %r3473, %r5914, %r3468; - // end inline asm - ld.const.u32 %r3477, 
[matrix+1712]; - // begin inline asm - dp4a.u32.u32 %r3476, %r3477, %r5918, %r3472; - // end inline asm - ld.const.u32 %r3481, [matrix+1716]; - // begin inline asm - dp4a.u32.u32 %r3480, %r3481, %r5922, %r3476; - // end inline asm - ld.const.u32 %r3485, [matrix+1720]; - // begin inline asm - dp4a.u32.u32 %r3484, %r3485, %r5926, %r3480; - // end inline asm - ld.const.u32 %r3489, [matrix+1724]; - // begin inline asm - dp4a.u32.u32 %r3488, %r3489, %r5930, %r3484; - // end inline asm - ld.const.u32 %r3493, [matrix+1728]; - // begin inline asm - dp4a.u32.u32 %r3492, %r3493, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3497, [matrix+1732]; - // begin inline asm - dp4a.u32.u32 %r3496, %r3497, %r5874, %r3492; - // end inline asm - ld.const.u32 %r3501, [matrix+1736]; - // begin inline asm - dp4a.u32.u32 %r3500, %r3501, %r5878, %r3496; - // end inline asm - ld.const.u32 %r3505, [matrix+1740]; - // begin inline asm - dp4a.u32.u32 %r3504, %r3505, %r5882, %r3500; - // end inline asm - ld.const.u32 %r3509, [matrix+1744]; - // begin inline asm - dp4a.u32.u32 %r3508, %r3509, %r5886, %r3504; - // end inline asm - ld.const.u32 %r3513, [matrix+1748]; - // begin inline asm - dp4a.u32.u32 %r3512, %r3513, %r5890, %r3508; - // end inline asm - ld.const.u32 %r3517, [matrix+1752]; - // begin inline asm - dp4a.u32.u32 %r3516, %r3517, %r5894, %r3512; - // end inline asm - ld.const.u32 %r3521, [matrix+1756]; - // begin inline asm - dp4a.u32.u32 %r3520, %r3521, %r5898, %r3516; - // end inline asm - ld.const.u32 %r3525, [matrix+1760]; - // begin inline asm - dp4a.u32.u32 %r3524, %r3525, %r5902, %r3520; - // end inline asm - ld.const.u32 %r3529, [matrix+1764]; - // begin inline asm - dp4a.u32.u32 %r3528, %r3529, %r5906, %r3524; - // end inline asm - ld.const.u32 %r3533, [matrix+1768]; - // begin inline asm - dp4a.u32.u32 %r3532, %r3533, %r5910, %r3528; - // end inline asm - ld.const.u32 %r3537, [matrix+1772]; - // begin inline asm - dp4a.u32.u32 %r3536, %r3537, %r5914, %r3532; - // end inline asm - ld.const.u32 %r3541, [matrix+1776]; - // begin inline asm - dp4a.u32.u32 %r3540, %r3541, %r5918, %r3536; - // end inline asm - ld.const.u32 %r3545, [matrix+1780]; - // begin inline asm - dp4a.u32.u32 %r3544, %r3545, %r5922, %r3540; - // end inline asm - ld.const.u32 %r3549, [matrix+1784]; - // begin inline asm - dp4a.u32.u32 %r3548, %r3549, %r5926, %r3544; - // end inline asm - ld.const.u32 %r3553, [matrix+1788]; - // begin inline asm - dp4a.u32.u32 %r3552, %r3553, %r5930, %r3548; - // end inline asm - shr.u32 %r6090, %r3488, 6; - and.b32 %r3557, %r6090, 240; - shr.u32 %r3558, %r3552, 10; - and.b32 %r3559, %r5991, 255; - // begin inline asm - lop3.b32 %r3556, %r3557, %r3558, %r3559, 0x56; - // end inline asm - ld.const.u32 %r3561, [matrix+1792]; - // begin inline asm - dp4a.u32.u32 %r3560, %r3561, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3565, [matrix+1796]; - // begin inline asm - dp4a.u32.u32 %r3564, %r3565, %r5874, %r3560; - // end inline asm - ld.const.u32 %r3569, [matrix+1800]; - // begin inline asm - dp4a.u32.u32 %r3568, %r3569, %r5878, %r3564; - // end inline asm - ld.const.u32 %r3573, [matrix+1804]; - // begin inline asm - dp4a.u32.u32 %r3572, %r3573, %r5882, %r3568; - // end inline asm - ld.const.u32 %r3577, [matrix+1808]; - // begin inline asm - dp4a.u32.u32 %r3576, %r3577, %r5886, %r3572; - // end inline asm - ld.const.u32 %r3581, [matrix+1812]; - // begin inline asm - dp4a.u32.u32 %r3580, %r3581, %r5890, %r3576; - // end inline asm - ld.const.u32 %r3585, [matrix+1816]; - // begin inline asm - 
dp4a.u32.u32 %r3584, %r3585, %r5894, %r3580; - // end inline asm - ld.const.u32 %r3589, [matrix+1820]; - // begin inline asm - dp4a.u32.u32 %r3588, %r3589, %r5898, %r3584; - // end inline asm - ld.const.u32 %r3593, [matrix+1824]; - // begin inline asm - dp4a.u32.u32 %r3592, %r3593, %r5902, %r3588; - // end inline asm - ld.const.u32 %r3597, [matrix+1828]; - // begin inline asm - dp4a.u32.u32 %r3596, %r3597, %r5906, %r3592; - // end inline asm - ld.const.u32 %r3601, [matrix+1832]; - // begin inline asm - dp4a.u32.u32 %r3600, %r3601, %r5910, %r3596; - // end inline asm - ld.const.u32 %r3605, [matrix+1836]; - // begin inline asm - dp4a.u32.u32 %r3604, %r3605, %r5914, %r3600; - // end inline asm - ld.const.u32 %r3609, [matrix+1840]; - // begin inline asm - dp4a.u32.u32 %r3608, %r3609, %r5918, %r3604; - // end inline asm - ld.const.u32 %r3613, [matrix+1844]; - // begin inline asm - dp4a.u32.u32 %r3612, %r3613, %r5922, %r3608; - // end inline asm - ld.const.u32 %r3617, [matrix+1848]; - // begin inline asm - dp4a.u32.u32 %r3616, %r3617, %r5926, %r3612; - // end inline asm - ld.const.u32 %r3621, [matrix+1852]; - // begin inline asm - dp4a.u32.u32 %r3620, %r3621, %r5930, %r3616; - // end inline asm - ld.const.u32 %r3625, [matrix+1856]; - // begin inline asm - dp4a.u32.u32 %r3624, %r3625, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3629, [matrix+1860]; - // begin inline asm - dp4a.u32.u32 %r3628, %r3629, %r5874, %r3624; - // end inline asm - ld.const.u32 %r3633, [matrix+1864]; - // begin inline asm - dp4a.u32.u32 %r3632, %r3633, %r5878, %r3628; - // end inline asm - ld.const.u32 %r3637, [matrix+1868]; - // begin inline asm - dp4a.u32.u32 %r3636, %r3637, %r5882, %r3632; - // end inline asm - ld.const.u32 %r3641, [matrix+1872]; - // begin inline asm - dp4a.u32.u32 %r3640, %r3641, %r5886, %r3636; - // end inline asm - ld.const.u32 %r3645, [matrix+1876]; - // begin inline asm - dp4a.u32.u32 %r3644, %r3645, %r5890, %r3640; - // end inline asm - ld.const.u32 %r3649, [matrix+1880]; - // begin inline asm - dp4a.u32.u32 %r3648, %r3649, %r5894, %r3644; - // end inline asm - ld.const.u32 %r3653, [matrix+1884]; - // begin inline asm - dp4a.u32.u32 %r3652, %r3653, %r5898, %r3648; - // end inline asm - ld.const.u32 %r3657, [matrix+1888]; - // begin inline asm - dp4a.u32.u32 %r3656, %r3657, %r5902, %r3652; - // end inline asm - ld.const.u32 %r3661, [matrix+1892]; - // begin inline asm - dp4a.u32.u32 %r3660, %r3661, %r5906, %r3656; - // end inline asm - ld.const.u32 %r3665, [matrix+1896]; - // begin inline asm - dp4a.u32.u32 %r3664, %r3665, %r5910, %r3660; - // end inline asm - ld.const.u32 %r3669, [matrix+1900]; - // begin inline asm - dp4a.u32.u32 %r3668, %r3669, %r5914, %r3664; - // end inline asm - ld.const.u32 %r3673, [matrix+1904]; - // begin inline asm - dp4a.u32.u32 %r3672, %r3673, %r5918, %r3668; - // end inline asm - ld.const.u32 %r3677, [matrix+1908]; - // begin inline asm - dp4a.u32.u32 %r3676, %r3677, %r5922, %r3672; - // end inline asm - ld.const.u32 %r3681, [matrix+1912]; - // begin inline asm - dp4a.u32.u32 %r3680, %r3681, %r5926, %r3676; - // end inline asm - ld.const.u32 %r3685, [matrix+1916]; - // begin inline asm - dp4a.u32.u32 %r3684, %r3685, %r5930, %r3680; - // end inline asm - shr.u32 %r6091, %r3620, 6; - and.b32 %r3689, %r6091, 240; - shr.u32 %r3690, %r3684, 10; - and.b32 %r3691, %r5995, 255; - // begin inline asm - lop3.b32 %r3688, %r3689, %r3690, %r3691, 0x56; - // end inline asm - ld.const.u32 %r3693, [matrix+1920]; - // begin inline asm - dp4a.u32.u32 %r3692, %r3693, %r5870, 
%r6249; - // end inline asm - ld.const.u32 %r3697, [matrix+1924]; - // begin inline asm - dp4a.u32.u32 %r3696, %r3697, %r5874, %r3692; - // end inline asm - ld.const.u32 %r3701, [matrix+1928]; - // begin inline asm - dp4a.u32.u32 %r3700, %r3701, %r5878, %r3696; - // end inline asm - ld.const.u32 %r3705, [matrix+1932]; - // begin inline asm - dp4a.u32.u32 %r3704, %r3705, %r5882, %r3700; - // end inline asm - ld.const.u32 %r3709, [matrix+1936]; - // begin inline asm - dp4a.u32.u32 %r3708, %r3709, %r5886, %r3704; - // end inline asm - ld.const.u32 %r3713, [matrix+1940]; - // begin inline asm - dp4a.u32.u32 %r3712, %r3713, %r5890, %r3708; - // end inline asm - ld.const.u32 %r3717, [matrix+1944]; - // begin inline asm - dp4a.u32.u32 %r3716, %r3717, %r5894, %r3712; - // end inline asm - ld.const.u32 %r3721, [matrix+1948]; - // begin inline asm - dp4a.u32.u32 %r3720, %r3721, %r5898, %r3716; - // end inline asm - ld.const.u32 %r3725, [matrix+1952]; - // begin inline asm - dp4a.u32.u32 %r3724, %r3725, %r5902, %r3720; - // end inline asm - ld.const.u32 %r3729, [matrix+1956]; - // begin inline asm - dp4a.u32.u32 %r3728, %r3729, %r5906, %r3724; - // end inline asm - ld.const.u32 %r3733, [matrix+1960]; - // begin inline asm - dp4a.u32.u32 %r3732, %r3733, %r5910, %r3728; - // end inline asm - ld.const.u32 %r3737, [matrix+1964]; - // begin inline asm - dp4a.u32.u32 %r3736, %r3737, %r5914, %r3732; - // end inline asm - ld.const.u32 %r3741, [matrix+1968]; - // begin inline asm - dp4a.u32.u32 %r3740, %r3741, %r5918, %r3736; - // end inline asm - ld.const.u32 %r3745, [matrix+1972]; - // begin inline asm - dp4a.u32.u32 %r3744, %r3745, %r5922, %r3740; - // end inline asm - ld.const.u32 %r3749, [matrix+1976]; - // begin inline asm - dp4a.u32.u32 %r3748, %r3749, %r5926, %r3744; - // end inline asm - ld.const.u32 %r3753, [matrix+1980]; - // begin inline asm - dp4a.u32.u32 %r3752, %r3753, %r5930, %r3748; - // end inline asm - ld.const.u32 %r3757, [matrix+1984]; - // begin inline asm - dp4a.u32.u32 %r3756, %r3757, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3761, [matrix+1988]; - // begin inline asm - dp4a.u32.u32 %r3760, %r3761, %r5874, %r3756; - // end inline asm - ld.const.u32 %r3765, [matrix+1992]; - // begin inline asm - dp4a.u32.u32 %r3764, %r3765, %r5878, %r3760; - // end inline asm - ld.const.u32 %r3769, [matrix+1996]; - // begin inline asm - dp4a.u32.u32 %r3768, %r3769, %r5882, %r3764; - // end inline asm - ld.const.u32 %r3773, [matrix+2000]; - // begin inline asm - dp4a.u32.u32 %r3772, %r3773, %r5886, %r3768; - // end inline asm - ld.const.u32 %r3777, [matrix+2004]; - // begin inline asm - dp4a.u32.u32 %r3776, %r3777, %r5890, %r3772; - // end inline asm - ld.const.u32 %r3781, [matrix+2008]; - // begin inline asm - dp4a.u32.u32 %r3780, %r3781, %r5894, %r3776; - // end inline asm - ld.const.u32 %r3785, [matrix+2012]; - // begin inline asm - dp4a.u32.u32 %r3784, %r3785, %r5898, %r3780; - // end inline asm - ld.const.u32 %r3789, [matrix+2016]; - // begin inline asm - dp4a.u32.u32 %r3788, %r3789, %r5902, %r3784; - // end inline asm - ld.const.u32 %r3793, [matrix+2020]; - // begin inline asm - dp4a.u32.u32 %r3792, %r3793, %r5906, %r3788; - // end inline asm - ld.const.u32 %r3797, [matrix+2024]; - // begin inline asm - dp4a.u32.u32 %r3796, %r3797, %r5910, %r3792; - // end inline asm - ld.const.u32 %r3801, [matrix+2028]; - // begin inline asm - dp4a.u32.u32 %r3800, %r3801, %r5914, %r3796; - // end inline asm - ld.const.u32 %r3805, [matrix+2032]; - // begin inline asm - dp4a.u32.u32 %r3804, %r3805, 
%r5918, %r3800; - // end inline asm - ld.const.u32 %r3809, [matrix+2036]; - // begin inline asm - dp4a.u32.u32 %r3808, %r3809, %r5922, %r3804; - // end inline asm - ld.const.u32 %r3813, [matrix+2040]; - // begin inline asm - dp4a.u32.u32 %r3812, %r3813, %r5926, %r3808; - // end inline asm - ld.const.u32 %r3817, [matrix+2044]; - // begin inline asm - dp4a.u32.u32 %r3816, %r3817, %r5930, %r3812; - // end inline asm - shr.u32 %r6092, %r3752, 6; - and.b32 %r3821, %r6092, 240; - shr.u32 %r3822, %r3816, 10; - // begin inline asm - lop3.b32 %r3820, %r3821, %r3822, %r12, 0x56; - // end inline asm - ld.const.u32 %r3825, [matrix+2048]; - // begin inline asm - dp4a.u32.u32 %r3824, %r3825, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3829, [matrix+2052]; - // begin inline asm - dp4a.u32.u32 %r3828, %r3829, %r5874, %r3824; - // end inline asm - ld.const.u32 %r3833, [matrix+2056]; - // begin inline asm - dp4a.u32.u32 %r3832, %r3833, %r5878, %r3828; - // end inline asm - ld.const.u32 %r3837, [matrix+2060]; - // begin inline asm - dp4a.u32.u32 %r3836, %r3837, %r5882, %r3832; - // end inline asm - ld.const.u32 %r3841, [matrix+2064]; - // begin inline asm - dp4a.u32.u32 %r3840, %r3841, %r5886, %r3836; - // end inline asm - ld.const.u32 %r3845, [matrix+2068]; - // begin inline asm - dp4a.u32.u32 %r3844, %r3845, %r5890, %r3840; - // end inline asm - ld.const.u32 %r3849, [matrix+2072]; - // begin inline asm - dp4a.u32.u32 %r3848, %r3849, %r5894, %r3844; - // end inline asm - ld.const.u32 %r3853, [matrix+2076]; - // begin inline asm - dp4a.u32.u32 %r3852, %r3853, %r5898, %r3848; - // end inline asm - ld.const.u32 %r3857, [matrix+2080]; - // begin inline asm - dp4a.u32.u32 %r3856, %r3857, %r5902, %r3852; - // end inline asm - ld.const.u32 %r3861, [matrix+2084]; - // begin inline asm - dp4a.u32.u32 %r3860, %r3861, %r5906, %r3856; - // end inline asm - ld.const.u32 %r3865, [matrix+2088]; - // begin inline asm - dp4a.u32.u32 %r3864, %r3865, %r5910, %r3860; - // end inline asm - ld.const.u32 %r3869, [matrix+2092]; - // begin inline asm - dp4a.u32.u32 %r3868, %r3869, %r5914, %r3864; - // end inline asm - ld.const.u32 %r3873, [matrix+2096]; - // begin inline asm - dp4a.u32.u32 %r3872, %r3873, %r5918, %r3868; - // end inline asm - ld.const.u32 %r3877, [matrix+2100]; - // begin inline asm - dp4a.u32.u32 %r3876, %r3877, %r5922, %r3872; - // end inline asm - ld.const.u32 %r3881, [matrix+2104]; - // begin inline asm - dp4a.u32.u32 %r3880, %r3881, %r5926, %r3876; - // end inline asm - ld.const.u32 %r3885, [matrix+2108]; - // begin inline asm - dp4a.u32.u32 %r3884, %r3885, %r5930, %r3880; - // end inline asm - ld.const.u32 %r3889, [matrix+2112]; - // begin inline asm - dp4a.u32.u32 %r3888, %r3889, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3893, [matrix+2116]; - // begin inline asm - dp4a.u32.u32 %r3892, %r3893, %r5874, %r3888; - // end inline asm - ld.const.u32 %r3897, [matrix+2120]; - // begin inline asm - dp4a.u32.u32 %r3896, %r3897, %r5878, %r3892; - // end inline asm - ld.const.u32 %r3901, [matrix+2124]; - // begin inline asm - dp4a.u32.u32 %r3900, %r3901, %r5882, %r3896; - // end inline asm - ld.const.u32 %r3905, [matrix+2128]; - // begin inline asm - dp4a.u32.u32 %r3904, %r3905, %r5886, %r3900; - // end inline asm - ld.const.u32 %r3909, [matrix+2132]; - // begin inline asm - dp4a.u32.u32 %r3908, %r3909, %r5890, %r3904; - // end inline asm - ld.const.u32 %r3913, [matrix+2136]; - // begin inline asm - dp4a.u32.u32 %r3912, %r3913, %r5894, %r3908; - // end inline asm - ld.const.u32 %r3917, [matrix+2140]; 
- // begin inline asm - dp4a.u32.u32 %r3916, %r3917, %r5898, %r3912; - // end inline asm - ld.const.u32 %r3921, [matrix+2144]; - // begin inline asm - dp4a.u32.u32 %r3920, %r3921, %r5902, %r3916; - // end inline asm - ld.const.u32 %r3925, [matrix+2148]; - // begin inline asm - dp4a.u32.u32 %r3924, %r3925, %r5906, %r3920; - // end inline asm - ld.const.u32 %r3929, [matrix+2152]; - // begin inline asm - dp4a.u32.u32 %r3928, %r3929, %r5910, %r3924; - // end inline asm - ld.const.u32 %r3933, [matrix+2156]; - // begin inline asm - dp4a.u32.u32 %r3932, %r3933, %r5914, %r3928; - // end inline asm - ld.const.u32 %r3937, [matrix+2160]; - // begin inline asm - dp4a.u32.u32 %r3936, %r3937, %r5918, %r3932; - // end inline asm - ld.const.u32 %r3941, [matrix+2164]; - // begin inline asm - dp4a.u32.u32 %r3940, %r3941, %r5922, %r3936; - // end inline asm - ld.const.u32 %r3945, [matrix+2168]; - // begin inline asm - dp4a.u32.u32 %r3944, %r3945, %r5926, %r3940; - // end inline asm - ld.const.u32 %r3949, [matrix+2172]; - // begin inline asm - dp4a.u32.u32 %r3948, %r3949, %r5930, %r3944; - // end inline asm - shr.u32 %r6093, %r3884, 6; - and.b32 %r3953, %r6093, 240; - shr.u32 %r3954, %r3948, 10; - and.b32 %r3955, %r13, 255; - // begin inline asm - lop3.b32 %r3952, %r3953, %r3954, %r3955, 0x56; - // end inline asm - ld.const.u32 %r3957, [matrix+2176]; - // begin inline asm - dp4a.u32.u32 %r3956, %r3957, %r5870, %r6249; - // end inline asm - ld.const.u32 %r3961, [matrix+2180]; - // begin inline asm - dp4a.u32.u32 %r3960, %r3961, %r5874, %r3956; - // end inline asm - ld.const.u32 %r3965, [matrix+2184]; - // begin inline asm - dp4a.u32.u32 %r3964, %r3965, %r5878, %r3960; - // end inline asm - ld.const.u32 %r3969, [matrix+2188]; - // begin inline asm - dp4a.u32.u32 %r3968, %r3969, %r5882, %r3964; - // end inline asm - ld.const.u32 %r3973, [matrix+2192]; - // begin inline asm - dp4a.u32.u32 %r3972, %r3973, %r5886, %r3968; - // end inline asm - ld.const.u32 %r3977, [matrix+2196]; - // begin inline asm - dp4a.u32.u32 %r3976, %r3977, %r5890, %r3972; - // end inline asm - ld.const.u32 %r3981, [matrix+2200]; - // begin inline asm - dp4a.u32.u32 %r3980, %r3981, %r5894, %r3976; - // end inline asm - ld.const.u32 %r3985, [matrix+2204]; - // begin inline asm - dp4a.u32.u32 %r3984, %r3985, %r5898, %r3980; - // end inline asm - ld.const.u32 %r3989, [matrix+2208]; - // begin inline asm - dp4a.u32.u32 %r3988, %r3989, %r5902, %r3984; - // end inline asm - ld.const.u32 %r3993, [matrix+2212]; - // begin inline asm - dp4a.u32.u32 %r3992, %r3993, %r5906, %r3988; - // end inline asm - ld.const.u32 %r3997, [matrix+2216]; - // begin inline asm - dp4a.u32.u32 %r3996, %r3997, %r5910, %r3992; - // end inline asm - ld.const.u32 %r4001, [matrix+2220]; - // begin inline asm - dp4a.u32.u32 %r4000, %r4001, %r5914, %r3996; - // end inline asm - ld.const.u32 %r4005, [matrix+2224]; - // begin inline asm - dp4a.u32.u32 %r4004, %r4005, %r5918, %r4000; - // end inline asm - ld.const.u32 %r4009, [matrix+2228]; - // begin inline asm - dp4a.u32.u32 %r4008, %r4009, %r5922, %r4004; - // end inline asm - ld.const.u32 %r4013, [matrix+2232]; - // begin inline asm - dp4a.u32.u32 %r4012, %r4013, %r5926, %r4008; - // end inline asm - ld.const.u32 %r4017, [matrix+2236]; - // begin inline asm - dp4a.u32.u32 %r4016, %r4017, %r5930, %r4012; - // end inline asm - ld.const.u32 %r4021, [matrix+2240]; - // begin inline asm - dp4a.u32.u32 %r4020, %r4021, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4025, [matrix+2244]; - // begin inline asm - dp4a.u32.u32 
%r4024, %r4025, %r5874, %r4020; - // end inline asm - ld.const.u32 %r4029, [matrix+2248]; - // begin inline asm - dp4a.u32.u32 %r4028, %r4029, %r5878, %r4024; - // end inline asm - ld.const.u32 %r4033, [matrix+2252]; - // begin inline asm - dp4a.u32.u32 %r4032, %r4033, %r5882, %r4028; - // end inline asm - ld.const.u32 %r4037, [matrix+2256]; - // begin inline asm - dp4a.u32.u32 %r4036, %r4037, %r5886, %r4032; - // end inline asm - ld.const.u32 %r4041, [matrix+2260]; - // begin inline asm - dp4a.u32.u32 %r4040, %r4041, %r5890, %r4036; - // end inline asm - ld.const.u32 %r4045, [matrix+2264]; - // begin inline asm - dp4a.u32.u32 %r4044, %r4045, %r5894, %r4040; - // end inline asm - ld.const.u32 %r4049, [matrix+2268]; - // begin inline asm - dp4a.u32.u32 %r4048, %r4049, %r5898, %r4044; - // end inline asm - ld.const.u32 %r4053, [matrix+2272]; - // begin inline asm - dp4a.u32.u32 %r4052, %r4053, %r5902, %r4048; - // end inline asm - ld.const.u32 %r4057, [matrix+2276]; - // begin inline asm - dp4a.u32.u32 %r4056, %r4057, %r5906, %r4052; - // end inline asm - ld.const.u32 %r4061, [matrix+2280]; - // begin inline asm - dp4a.u32.u32 %r4060, %r4061, %r5910, %r4056; - // end inline asm - ld.const.u32 %r4065, [matrix+2284]; - // begin inline asm - dp4a.u32.u32 %r4064, %r4065, %r5914, %r4060; - // end inline asm - ld.const.u32 %r4069, [matrix+2288]; - // begin inline asm - dp4a.u32.u32 %r4068, %r4069, %r5918, %r4064; - // end inline asm - ld.const.u32 %r4073, [matrix+2292]; - // begin inline asm - dp4a.u32.u32 %r4072, %r4073, %r5922, %r4068; - // end inline asm - ld.const.u32 %r4077, [matrix+2296]; - // begin inline asm - dp4a.u32.u32 %r4076, %r4077, %r5926, %r4072; - // end inline asm - ld.const.u32 %r4081, [matrix+2300]; - // begin inline asm - dp4a.u32.u32 %r4080, %r4081, %r5930, %r4076; - // end inline asm - shr.u32 %r6094, %r4016, 6; - and.b32 %r4085, %r6094, 240; - shr.u32 %r4086, %r4080, 10; - bfe.u32 %r4087, %r13, 8, 8; - // begin inline asm - lop3.b32 %r4084, %r4085, %r4086, %r4087, 0x56; - // end inline asm - ld.const.u32 %r4089, [matrix+2304]; - // begin inline asm - dp4a.u32.u32 %r4088, %r4089, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4093, [matrix+2308]; - // begin inline asm - dp4a.u32.u32 %r4092, %r4093, %r5874, %r4088; - // end inline asm - ld.const.u32 %r4097, [matrix+2312]; - // begin inline asm - dp4a.u32.u32 %r4096, %r4097, %r5878, %r4092; - // end inline asm - ld.const.u32 %r4101, [matrix+2316]; - // begin inline asm - dp4a.u32.u32 %r4100, %r4101, %r5882, %r4096; - // end inline asm - ld.const.u32 %r4105, [matrix+2320]; - // begin inline asm - dp4a.u32.u32 %r4104, %r4105, %r5886, %r4100; - // end inline asm - ld.const.u32 %r4109, [matrix+2324]; - // begin inline asm - dp4a.u32.u32 %r4108, %r4109, %r5890, %r4104; - // end inline asm - ld.const.u32 %r4113, [matrix+2328]; - // begin inline asm - dp4a.u32.u32 %r4112, %r4113, %r5894, %r4108; - // end inline asm - ld.const.u32 %r4117, [matrix+2332]; - // begin inline asm - dp4a.u32.u32 %r4116, %r4117, %r5898, %r4112; - // end inline asm - ld.const.u32 %r4121, [matrix+2336]; - // begin inline asm - dp4a.u32.u32 %r4120, %r4121, %r5902, %r4116; - // end inline asm - ld.const.u32 %r4125, [matrix+2340]; - // begin inline asm - dp4a.u32.u32 %r4124, %r4125, %r5906, %r4120; - // end inline asm - ld.const.u32 %r4129, [matrix+2344]; - // begin inline asm - dp4a.u32.u32 %r4128, %r4129, %r5910, %r4124; - // end inline asm - ld.const.u32 %r4133, [matrix+2348]; - // begin inline asm - dp4a.u32.u32 %r4132, %r4133, %r5914, %r4128; - // end 
inline asm - ld.const.u32 %r4137, [matrix+2352]; - // begin inline asm - dp4a.u32.u32 %r4136, %r4137, %r5918, %r4132; - // end inline asm - ld.const.u32 %r4141, [matrix+2356]; - // begin inline asm - dp4a.u32.u32 %r4140, %r4141, %r5922, %r4136; - // end inline asm - ld.const.u32 %r4145, [matrix+2360]; - // begin inline asm - dp4a.u32.u32 %r4144, %r4145, %r5926, %r4140; - // end inline asm - ld.const.u32 %r4149, [matrix+2364]; - // begin inline asm - dp4a.u32.u32 %r4148, %r4149, %r5930, %r4144; - // end inline asm - ld.const.u32 %r4153, [matrix+2368]; - // begin inline asm - dp4a.u32.u32 %r4152, %r4153, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4157, [matrix+2372]; - // begin inline asm - dp4a.u32.u32 %r4156, %r4157, %r5874, %r4152; - // end inline asm - ld.const.u32 %r4161, [matrix+2376]; - // begin inline asm - dp4a.u32.u32 %r4160, %r4161, %r5878, %r4156; - // end inline asm - ld.const.u32 %r4165, [matrix+2380]; - // begin inline asm - dp4a.u32.u32 %r4164, %r4165, %r5882, %r4160; - // end inline asm - ld.const.u32 %r4169, [matrix+2384]; - // begin inline asm - dp4a.u32.u32 %r4168, %r4169, %r5886, %r4164; - // end inline asm - ld.const.u32 %r4173, [matrix+2388]; - // begin inline asm - dp4a.u32.u32 %r4172, %r4173, %r5890, %r4168; - // end inline asm - ld.const.u32 %r4177, [matrix+2392]; - // begin inline asm - dp4a.u32.u32 %r4176, %r4177, %r5894, %r4172; - // end inline asm - ld.const.u32 %r4181, [matrix+2396]; - // begin inline asm - dp4a.u32.u32 %r4180, %r4181, %r5898, %r4176; - // end inline asm - ld.const.u32 %r4185, [matrix+2400]; - // begin inline asm - dp4a.u32.u32 %r4184, %r4185, %r5902, %r4180; - // end inline asm - ld.const.u32 %r4189, [matrix+2404]; - // begin inline asm - dp4a.u32.u32 %r4188, %r4189, %r5906, %r4184; - // end inline asm - ld.const.u32 %r4193, [matrix+2408]; - // begin inline asm - dp4a.u32.u32 %r4192, %r4193, %r5910, %r4188; - // end inline asm - ld.const.u32 %r4197, [matrix+2412]; - // begin inline asm - dp4a.u32.u32 %r4196, %r4197, %r5914, %r4192; - // end inline asm - ld.const.u32 %r4201, [matrix+2416]; - // begin inline asm - dp4a.u32.u32 %r4200, %r4201, %r5918, %r4196; - // end inline asm - ld.const.u32 %r4205, [matrix+2420]; - // begin inline asm - dp4a.u32.u32 %r4204, %r4205, %r5922, %r4200; - // end inline asm - ld.const.u32 %r4209, [matrix+2424]; - // begin inline asm - dp4a.u32.u32 %r4208, %r4209, %r5926, %r4204; - // end inline asm - ld.const.u32 %r4213, [matrix+2428]; - // begin inline asm - dp4a.u32.u32 %r4212, %r4213, %r5930, %r4208; - // end inline asm - shr.u32 %r6095, %r4148, 6; - and.b32 %r4217, %r6095, 240; - shr.u32 %r4218, %r4212, 10; - bfe.u32 %r4219, %r13, 16, 8; - // begin inline asm - lop3.b32 %r4216, %r4217, %r4218, %r4219, 0x56; - // end inline asm - ld.const.u32 %r4221, [matrix+2432]; - // begin inline asm - dp4a.u32.u32 %r4220, %r4221, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4225, [matrix+2436]; - // begin inline asm - dp4a.u32.u32 %r4224, %r4225, %r5874, %r4220; - // end inline asm - ld.const.u32 %r4229, [matrix+2440]; - // begin inline asm - dp4a.u32.u32 %r4228, %r4229, %r5878, %r4224; - // end inline asm - ld.const.u32 %r4233, [matrix+2444]; - // begin inline asm - dp4a.u32.u32 %r4232, %r4233, %r5882, %r4228; - // end inline asm - ld.const.u32 %r4237, [matrix+2448]; - // begin inline asm - dp4a.u32.u32 %r4236, %r4237, %r5886, %r4232; - // end inline asm - ld.const.u32 %r4241, [matrix+2452]; - // begin inline asm - dp4a.u32.u32 %r4240, %r4241, %r5890, %r4236; - // end inline asm - ld.const.u32 %r4245, 
[matrix+2456]; - // begin inline asm - dp4a.u32.u32 %r4244, %r4245, %r5894, %r4240; - // end inline asm - ld.const.u32 %r4249, [matrix+2460]; - // begin inline asm - dp4a.u32.u32 %r4248, %r4249, %r5898, %r4244; - // end inline asm - ld.const.u32 %r4253, [matrix+2464]; - // begin inline asm - dp4a.u32.u32 %r4252, %r4253, %r5902, %r4248; - // end inline asm - ld.const.u32 %r4257, [matrix+2468]; - // begin inline asm - dp4a.u32.u32 %r4256, %r4257, %r5906, %r4252; - // end inline asm - ld.const.u32 %r4261, [matrix+2472]; - // begin inline asm - dp4a.u32.u32 %r4260, %r4261, %r5910, %r4256; - // end inline asm - ld.const.u32 %r4265, [matrix+2476]; - // begin inline asm - dp4a.u32.u32 %r4264, %r4265, %r5914, %r4260; - // end inline asm - ld.const.u32 %r4269, [matrix+2480]; - // begin inline asm - dp4a.u32.u32 %r4268, %r4269, %r5918, %r4264; - // end inline asm - ld.const.u32 %r4273, [matrix+2484]; - // begin inline asm - dp4a.u32.u32 %r4272, %r4273, %r5922, %r4268; - // end inline asm - ld.const.u32 %r4277, [matrix+2488]; - // begin inline asm - dp4a.u32.u32 %r4276, %r4277, %r5926, %r4272; - // end inline asm - ld.const.u32 %r4281, [matrix+2492]; - // begin inline asm - dp4a.u32.u32 %r4280, %r4281, %r5930, %r4276; - // end inline asm - ld.const.u32 %r4285, [matrix+2496]; - // begin inline asm - dp4a.u32.u32 %r4284, %r4285, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4289, [matrix+2500]; - // begin inline asm - dp4a.u32.u32 %r4288, %r4289, %r5874, %r4284; - // end inline asm - ld.const.u32 %r4293, [matrix+2504]; - // begin inline asm - dp4a.u32.u32 %r4292, %r4293, %r5878, %r4288; - // end inline asm - ld.const.u32 %r4297, [matrix+2508]; - // begin inline asm - dp4a.u32.u32 %r4296, %r4297, %r5882, %r4292; - // end inline asm - ld.const.u32 %r4301, [matrix+2512]; - // begin inline asm - dp4a.u32.u32 %r4300, %r4301, %r5886, %r4296; - // end inline asm - ld.const.u32 %r4305, [matrix+2516]; - // begin inline asm - dp4a.u32.u32 %r4304, %r4305, %r5890, %r4300; - // end inline asm - ld.const.u32 %r4309, [matrix+2520]; - // begin inline asm - dp4a.u32.u32 %r4308, %r4309, %r5894, %r4304; - // end inline asm - ld.const.u32 %r4313, [matrix+2524]; - // begin inline asm - dp4a.u32.u32 %r4312, %r4313, %r5898, %r4308; - // end inline asm - ld.const.u32 %r4317, [matrix+2528]; - // begin inline asm - dp4a.u32.u32 %r4316, %r4317, %r5902, %r4312; - // end inline asm - ld.const.u32 %r4321, [matrix+2532]; - // begin inline asm - dp4a.u32.u32 %r4320, %r4321, %r5906, %r4316; - // end inline asm - ld.const.u32 %r4325, [matrix+2536]; - // begin inline asm - dp4a.u32.u32 %r4324, %r4325, %r5910, %r4320; - // end inline asm - ld.const.u32 %r4329, [matrix+2540]; - // begin inline asm - dp4a.u32.u32 %r4328, %r4329, %r5914, %r4324; - // end inline asm - ld.const.u32 %r4333, [matrix+2544]; - // begin inline asm - dp4a.u32.u32 %r4332, %r4333, %r5918, %r4328; - // end inline asm - ld.const.u32 %r4337, [matrix+2548]; - // begin inline asm - dp4a.u32.u32 %r4336, %r4337, %r5922, %r4332; - // end inline asm - ld.const.u32 %r4341, [matrix+2552]; - // begin inline asm - dp4a.u32.u32 %r4340, %r4341, %r5926, %r4336; - // end inline asm - ld.const.u32 %r4345, [matrix+2556]; - // begin inline asm - dp4a.u32.u32 %r4344, %r4345, %r5930, %r4340; - // end inline asm - shr.u32 %r6096, %r4280, 6; - and.b32 %r4349, %r6096, 240; - shr.u32 %r4350, %r4344, 10; - // begin inline asm - lop3.b32 %r4348, %r4349, %r4350, %r4351, 0x56; - // end inline asm - ld.const.u32 %r4353, [matrix+2560]; - // begin inline asm - dp4a.u32.u32 %r4352, %r4353, 
%r5870, %r6249; - // end inline asm - ld.const.u32 %r4357, [matrix+2564]; - // begin inline asm - dp4a.u32.u32 %r4356, %r4357, %r5874, %r4352; - // end inline asm - ld.const.u32 %r4361, [matrix+2568]; - // begin inline asm - dp4a.u32.u32 %r4360, %r4361, %r5878, %r4356; - // end inline asm - ld.const.u32 %r4365, [matrix+2572]; - // begin inline asm - dp4a.u32.u32 %r4364, %r4365, %r5882, %r4360; - // end inline asm - ld.const.u32 %r4369, [matrix+2576]; - // begin inline asm - dp4a.u32.u32 %r4368, %r4369, %r5886, %r4364; - // end inline asm - ld.const.u32 %r4373, [matrix+2580]; - // begin inline asm - dp4a.u32.u32 %r4372, %r4373, %r5890, %r4368; - // end inline asm - ld.const.u32 %r4377, [matrix+2584]; - // begin inline asm - dp4a.u32.u32 %r4376, %r4377, %r5894, %r4372; - // end inline asm - ld.const.u32 %r4381, [matrix+2588]; - // begin inline asm - dp4a.u32.u32 %r4380, %r4381, %r5898, %r4376; - // end inline asm - ld.const.u32 %r4385, [matrix+2592]; - // begin inline asm - dp4a.u32.u32 %r4384, %r4385, %r5902, %r4380; - // end inline asm - ld.const.u32 %r4389, [matrix+2596]; - // begin inline asm - dp4a.u32.u32 %r4388, %r4389, %r5906, %r4384; - // end inline asm - ld.const.u32 %r4393, [matrix+2600]; - // begin inline asm - dp4a.u32.u32 %r4392, %r4393, %r5910, %r4388; - // end inline asm - ld.const.u32 %r4397, [matrix+2604]; - // begin inline asm - dp4a.u32.u32 %r4396, %r4397, %r5914, %r4392; - // end inline asm - ld.const.u32 %r4401, [matrix+2608]; - // begin inline asm - dp4a.u32.u32 %r4400, %r4401, %r5918, %r4396; - // end inline asm - ld.const.u32 %r4405, [matrix+2612]; - // begin inline asm - dp4a.u32.u32 %r4404, %r4405, %r5922, %r4400; - // end inline asm - ld.const.u32 %r4409, [matrix+2616]; - // begin inline asm - dp4a.u32.u32 %r4408, %r4409, %r5926, %r4404; - // end inline asm - ld.const.u32 %r4413, [matrix+2620]; - // begin inline asm - dp4a.u32.u32 %r4412, %r4413, %r5930, %r4408; - // end inline asm - ld.const.u32 %r4417, [matrix+2624]; - // begin inline asm - dp4a.u32.u32 %r4416, %r4417, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4421, [matrix+2628]; - // begin inline asm - dp4a.u32.u32 %r4420, %r4421, %r5874, %r4416; - // end inline asm - ld.const.u32 %r4425, [matrix+2632]; - // begin inline asm - dp4a.u32.u32 %r4424, %r4425, %r5878, %r4420; - // end inline asm - ld.const.u32 %r4429, [matrix+2636]; - // begin inline asm - dp4a.u32.u32 %r4428, %r4429, %r5882, %r4424; - // end inline asm - ld.const.u32 %r4433, [matrix+2640]; - // begin inline asm - dp4a.u32.u32 %r4432, %r4433, %r5886, %r4428; - // end inline asm - ld.const.u32 %r4437, [matrix+2644]; - // begin inline asm - dp4a.u32.u32 %r4436, %r4437, %r5890, %r4432; - // end inline asm - ld.const.u32 %r4441, [matrix+2648]; - // begin inline asm - dp4a.u32.u32 %r4440, %r4441, %r5894, %r4436; - // end inline asm - ld.const.u32 %r4445, [matrix+2652]; - // begin inline asm - dp4a.u32.u32 %r4444, %r4445, %r5898, %r4440; - // end inline asm - ld.const.u32 %r4449, [matrix+2656]; - // begin inline asm - dp4a.u32.u32 %r4448, %r4449, %r5902, %r4444; - // end inline asm - ld.const.u32 %r4453, [matrix+2660]; - // begin inline asm - dp4a.u32.u32 %r4452, %r4453, %r5906, %r4448; - // end inline asm - ld.const.u32 %r4457, [matrix+2664]; - // begin inline asm - dp4a.u32.u32 %r4456, %r4457, %r5910, %r4452; - // end inline asm - ld.const.u32 %r4461, [matrix+2668]; - // begin inline asm - dp4a.u32.u32 %r4460, %r4461, %r5914, %r4456; - // end inline asm - ld.const.u32 %r4465, [matrix+2672]; - // begin inline asm - dp4a.u32.u32 %r4464, 
%r4465, %r5918, %r4460; - // end inline asm - ld.const.u32 %r4469, [matrix+2676]; - // begin inline asm - dp4a.u32.u32 %r4468, %r4469, %r5922, %r4464; - // end inline asm - ld.const.u32 %r4473, [matrix+2680]; - // begin inline asm - dp4a.u32.u32 %r4472, %r4473, %r5926, %r4468; - // end inline asm - ld.const.u32 %r4477, [matrix+2684]; - // begin inline asm - dp4a.u32.u32 %r4476, %r4477, %r5930, %r4472; - // end inline asm - shr.u32 %r6097, %r4412, 6; - and.b32 %r4481, %r6097, 240; - shr.u32 %r4482, %r4476, 10; - and.b32 %r4483, %r6019, 255; - // begin inline asm - lop3.b32 %r4480, %r4481, %r4482, %r4483, 0x56; - // end inline asm - ld.const.u32 %r4485, [matrix+2688]; - // begin inline asm - dp4a.u32.u32 %r4484, %r4485, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4489, [matrix+2692]; - // begin inline asm - dp4a.u32.u32 %r4488, %r4489, %r5874, %r4484; - // end inline asm - ld.const.u32 %r4493, [matrix+2696]; - // begin inline asm - dp4a.u32.u32 %r4492, %r4493, %r5878, %r4488; - // end inline asm - ld.const.u32 %r4497, [matrix+2700]; - // begin inline asm - dp4a.u32.u32 %r4496, %r4497, %r5882, %r4492; - // end inline asm - ld.const.u32 %r4501, [matrix+2704]; - // begin inline asm - dp4a.u32.u32 %r4500, %r4501, %r5886, %r4496; - // end inline asm - ld.const.u32 %r4505, [matrix+2708]; - // begin inline asm - dp4a.u32.u32 %r4504, %r4505, %r5890, %r4500; - // end inline asm - ld.const.u32 %r4509, [matrix+2712]; - // begin inline asm - dp4a.u32.u32 %r4508, %r4509, %r5894, %r4504; - // end inline asm - ld.const.u32 %r4513, [matrix+2716]; - // begin inline asm - dp4a.u32.u32 %r4512, %r4513, %r5898, %r4508; - // end inline asm - ld.const.u32 %r4517, [matrix+2720]; - // begin inline asm - dp4a.u32.u32 %r4516, %r4517, %r5902, %r4512; - // end inline asm - ld.const.u32 %r4521, [matrix+2724]; - // begin inline asm - dp4a.u32.u32 %r4520, %r4521, %r5906, %r4516; - // end inline asm - ld.const.u32 %r4525, [matrix+2728]; - // begin inline asm - dp4a.u32.u32 %r4524, %r4525, %r5910, %r4520; - // end inline asm - ld.const.u32 %r4529, [matrix+2732]; - // begin inline asm - dp4a.u32.u32 %r4528, %r4529, %r5914, %r4524; - // end inline asm - ld.const.u32 %r4533, [matrix+2736]; - // begin inline asm - dp4a.u32.u32 %r4532, %r4533, %r5918, %r4528; - // end inline asm - ld.const.u32 %r4537, [matrix+2740]; - // begin inline asm - dp4a.u32.u32 %r4536, %r4537, %r5922, %r4532; - // end inline asm - ld.const.u32 %r4541, [matrix+2744]; - // begin inline asm - dp4a.u32.u32 %r4540, %r4541, %r5926, %r4536; - // end inline asm - ld.const.u32 %r4545, [matrix+2748]; - // begin inline asm - dp4a.u32.u32 %r4544, %r4545, %r5930, %r4540; - // end inline asm - ld.const.u32 %r4549, [matrix+2752]; - // begin inline asm - dp4a.u32.u32 %r4548, %r4549, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4553, [matrix+2756]; - // begin inline asm - dp4a.u32.u32 %r4552, %r4553, %r5874, %r4548; - // end inline asm - ld.const.u32 %r4557, [matrix+2760]; - // begin inline asm - dp4a.u32.u32 %r4556, %r4557, %r5878, %r4552; - // end inline asm - ld.const.u32 %r4561, [matrix+2764]; - // begin inline asm - dp4a.u32.u32 %r4560, %r4561, %r5882, %r4556; - // end inline asm - ld.const.u32 %r4565, [matrix+2768]; - // begin inline asm - dp4a.u32.u32 %r4564, %r4565, %r5886, %r4560; - // end inline asm - ld.const.u32 %r4569, [matrix+2772]; - // begin inline asm - dp4a.u32.u32 %r4568, %r4569, %r5890, %r4564; - // end inline asm - ld.const.u32 %r4573, [matrix+2776]; - // begin inline asm - dp4a.u32.u32 %r4572, %r4573, %r5894, %r4568; - // end inline 
asm - ld.const.u32 %r4577, [matrix+2780]; - // begin inline asm - dp4a.u32.u32 %r4576, %r4577, %r5898, %r4572; - // end inline asm - ld.const.u32 %r4581, [matrix+2784]; - // begin inline asm - dp4a.u32.u32 %r4580, %r4581, %r5902, %r4576; - // end inline asm - ld.const.u32 %r4585, [matrix+2788]; - // begin inline asm - dp4a.u32.u32 %r4584, %r4585, %r5906, %r4580; - // end inline asm - ld.const.u32 %r4589, [matrix+2792]; - // begin inline asm - dp4a.u32.u32 %r4588, %r4589, %r5910, %r4584; - // end inline asm - ld.const.u32 %r4593, [matrix+2796]; - // begin inline asm - dp4a.u32.u32 %r4592, %r4593, %r5914, %r4588; - // end inline asm - ld.const.u32 %r4597, [matrix+2800]; - // begin inline asm - dp4a.u32.u32 %r4596, %r4597, %r5918, %r4592; - // end inline asm - ld.const.u32 %r4601, [matrix+2804]; - // begin inline asm - dp4a.u32.u32 %r4600, %r4601, %r5922, %r4596; - // end inline asm - ld.const.u32 %r4605, [matrix+2808]; - // begin inline asm - dp4a.u32.u32 %r4604, %r4605, %r5926, %r4600; - // end inline asm - ld.const.u32 %r4609, [matrix+2812]; - // begin inline asm - dp4a.u32.u32 %r4608, %r4609, %r5930, %r4604; - // end inline asm - shr.u32 %r6098, %r4544, 6; - and.b32 %r4613, %r6098, 240; - shr.u32 %r4614, %r4608, 10; - and.b32 %r4615, %r6024, 255; - // begin inline asm - lop3.b32 %r4612, %r4613, %r4614, %r4615, 0x56; - // end inline asm - ld.const.u32 %r4617, [matrix+2816]; - // begin inline asm - dp4a.u32.u32 %r4616, %r4617, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4621, [matrix+2820]; - // begin inline asm - dp4a.u32.u32 %r4620, %r4621, %r5874, %r4616; - // end inline asm - ld.const.u32 %r4625, [matrix+2824]; - // begin inline asm - dp4a.u32.u32 %r4624, %r4625, %r5878, %r4620; - // end inline asm - ld.const.u32 %r4629, [matrix+2828]; - // begin inline asm - dp4a.u32.u32 %r4628, %r4629, %r5882, %r4624; - // end inline asm - ld.const.u32 %r4633, [matrix+2832]; - // begin inline asm - dp4a.u32.u32 %r4632, %r4633, %r5886, %r4628; - // end inline asm - ld.const.u32 %r4637, [matrix+2836]; - // begin inline asm - dp4a.u32.u32 %r4636, %r4637, %r5890, %r4632; - // end inline asm - ld.const.u32 %r4641, [matrix+2840]; - // begin inline asm - dp4a.u32.u32 %r4640, %r4641, %r5894, %r4636; - // end inline asm - ld.const.u32 %r4645, [matrix+2844]; - // begin inline asm - dp4a.u32.u32 %r4644, %r4645, %r5898, %r4640; - // end inline asm - ld.const.u32 %r4649, [matrix+2848]; - // begin inline asm - dp4a.u32.u32 %r4648, %r4649, %r5902, %r4644; - // end inline asm - ld.const.u32 %r4653, [matrix+2852]; - // begin inline asm - dp4a.u32.u32 %r4652, %r4653, %r5906, %r4648; - // end inline asm - ld.const.u32 %r4657, [matrix+2856]; - // begin inline asm - dp4a.u32.u32 %r4656, %r4657, %r5910, %r4652; - // end inline asm - ld.const.u32 %r4661, [matrix+2860]; - // begin inline asm - dp4a.u32.u32 %r4660, %r4661, %r5914, %r4656; - // end inline asm - ld.const.u32 %r4665, [matrix+2864]; - // begin inline asm - dp4a.u32.u32 %r4664, %r4665, %r5918, %r4660; - // end inline asm - ld.const.u32 %r4669, [matrix+2868]; - // begin inline asm - dp4a.u32.u32 %r4668, %r4669, %r5922, %r4664; - // end inline asm - ld.const.u32 %r4673, [matrix+2872]; - // begin inline asm - dp4a.u32.u32 %r4672, %r4673, %r5926, %r4668; - // end inline asm - ld.const.u32 %r4677, [matrix+2876]; - // begin inline asm - dp4a.u32.u32 %r4676, %r4677, %r5930, %r4672; - // end inline asm - ld.const.u32 %r4681, [matrix+2880]; - // begin inline asm - dp4a.u32.u32 %r4680, %r4681, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4685, 
[matrix+2884]; - // begin inline asm - dp4a.u32.u32 %r4684, %r4685, %r5874, %r4680; - // end inline asm - ld.const.u32 %r4689, [matrix+2888]; - // begin inline asm - dp4a.u32.u32 %r4688, %r4689, %r5878, %r4684; - // end inline asm - ld.const.u32 %r4693, [matrix+2892]; - // begin inline asm - dp4a.u32.u32 %r4692, %r4693, %r5882, %r4688; - // end inline asm - ld.const.u32 %r4697, [matrix+2896]; - // begin inline asm - dp4a.u32.u32 %r4696, %r4697, %r5886, %r4692; - // end inline asm - ld.const.u32 %r4701, [matrix+2900]; - // begin inline asm - dp4a.u32.u32 %r4700, %r4701, %r5890, %r4696; - // end inline asm - ld.const.u32 %r4705, [matrix+2904]; - // begin inline asm - dp4a.u32.u32 %r4704, %r4705, %r5894, %r4700; - // end inline asm - ld.const.u32 %r4709, [matrix+2908]; - // begin inline asm - dp4a.u32.u32 %r4708, %r4709, %r5898, %r4704; - // end inline asm - ld.const.u32 %r4713, [matrix+2912]; - // begin inline asm - dp4a.u32.u32 %r4712, %r4713, %r5902, %r4708; - // end inline asm - ld.const.u32 %r4717, [matrix+2916]; - // begin inline asm - dp4a.u32.u32 %r4716, %r4717, %r5906, %r4712; - // end inline asm - ld.const.u32 %r4721, [matrix+2920]; - // begin inline asm - dp4a.u32.u32 %r4720, %r4721, %r5910, %r4716; - // end inline asm - ld.const.u32 %r4725, [matrix+2924]; - // begin inline asm - dp4a.u32.u32 %r4724, %r4725, %r5914, %r4720; - // end inline asm - ld.const.u32 %r4729, [matrix+2928]; - // begin inline asm - dp4a.u32.u32 %r4728, %r4729, %r5918, %r4724; - // end inline asm - ld.const.u32 %r4733, [matrix+2932]; - // begin inline asm - dp4a.u32.u32 %r4732, %r4733, %r5922, %r4728; - // end inline asm - ld.const.u32 %r4737, [matrix+2936]; - // begin inline asm - dp4a.u32.u32 %r4736, %r4737, %r5926, %r4732; - // end inline asm - ld.const.u32 %r4741, [matrix+2940]; - // begin inline asm - dp4a.u32.u32 %r4740, %r4741, %r5930, %r4736; - // end inline asm - shr.u32 %r6099, %r4676, 6; - and.b32 %r4745, %r6099, 240; - shr.u32 %r4746, %r4740, 10; - and.b32 %r4747, %r6028, 255; - // begin inline asm - lop3.b32 %r4744, %r4745, %r4746, %r4747, 0x56; - // end inline asm - ld.const.u32 %r4749, [matrix+2944]; - // begin inline asm - dp4a.u32.u32 %r4748, %r4749, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4753, [matrix+2948]; - // begin inline asm - dp4a.u32.u32 %r4752, %r4753, %r5874, %r4748; - // end inline asm - ld.const.u32 %r4757, [matrix+2952]; - // begin inline asm - dp4a.u32.u32 %r4756, %r4757, %r5878, %r4752; - // end inline asm - ld.const.u32 %r4761, [matrix+2956]; - // begin inline asm - dp4a.u32.u32 %r4760, %r4761, %r5882, %r4756; - // end inline asm - ld.const.u32 %r4765, [matrix+2960]; - // begin inline asm - dp4a.u32.u32 %r4764, %r4765, %r5886, %r4760; - // end inline asm - ld.const.u32 %r4769, [matrix+2964]; - // begin inline asm - dp4a.u32.u32 %r4768, %r4769, %r5890, %r4764; - // end inline asm - ld.const.u32 %r4773, [matrix+2968]; - // begin inline asm - dp4a.u32.u32 %r4772, %r4773, %r5894, %r4768; - // end inline asm - ld.const.u32 %r4777, [matrix+2972]; - // begin inline asm - dp4a.u32.u32 %r4776, %r4777, %r5898, %r4772; - // end inline asm - ld.const.u32 %r4781, [matrix+2976]; - // begin inline asm - dp4a.u32.u32 %r4780, %r4781, %r5902, %r4776; - // end inline asm - ld.const.u32 %r4785, [matrix+2980]; - // begin inline asm - dp4a.u32.u32 %r4784, %r4785, %r5906, %r4780; - // end inline asm - ld.const.u32 %r4789, [matrix+2984]; - // begin inline asm - dp4a.u32.u32 %r4788, %r4789, %r5910, %r4784; - // end inline asm - ld.const.u32 %r4793, [matrix+2988]; - // begin inline asm - 
dp4a.u32.u32 %r4792, %r4793, %r5914, %r4788; - // end inline asm - ld.const.u32 %r4797, [matrix+2992]; - // begin inline asm - dp4a.u32.u32 %r4796, %r4797, %r5918, %r4792; - // end inline asm - ld.const.u32 %r4801, [matrix+2996]; - // begin inline asm - dp4a.u32.u32 %r4800, %r4801, %r5922, %r4796; - // end inline asm - ld.const.u32 %r4805, [matrix+3000]; - // begin inline asm - dp4a.u32.u32 %r4804, %r4805, %r5926, %r4800; - // end inline asm - ld.const.u32 %r4809, [matrix+3004]; - // begin inline asm - dp4a.u32.u32 %r4808, %r4809, %r5930, %r4804; - // end inline asm - ld.const.u32 %r4813, [matrix+3008]; - // begin inline asm - dp4a.u32.u32 %r4812, %r4813, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4817, [matrix+3012]; - // begin inline asm - dp4a.u32.u32 %r4816, %r4817, %r5874, %r4812; - // end inline asm - ld.const.u32 %r4821, [matrix+3016]; - // begin inline asm - dp4a.u32.u32 %r4820, %r4821, %r5878, %r4816; - // end inline asm - ld.const.u32 %r4825, [matrix+3020]; - // begin inline asm - dp4a.u32.u32 %r4824, %r4825, %r5882, %r4820; - // end inline asm - ld.const.u32 %r4829, [matrix+3024]; - // begin inline asm - dp4a.u32.u32 %r4828, %r4829, %r5886, %r4824; - // end inline asm - ld.const.u32 %r4833, [matrix+3028]; - // begin inline asm - dp4a.u32.u32 %r4832, %r4833, %r5890, %r4828; - // end inline asm - ld.const.u32 %r4837, [matrix+3032]; - // begin inline asm - dp4a.u32.u32 %r4836, %r4837, %r5894, %r4832; - // end inline asm - ld.const.u32 %r4841, [matrix+3036]; - // begin inline asm - dp4a.u32.u32 %r4840, %r4841, %r5898, %r4836; - // end inline asm - ld.const.u32 %r4845, [matrix+3040]; - // begin inline asm - dp4a.u32.u32 %r4844, %r4845, %r5902, %r4840; - // end inline asm - ld.const.u32 %r4849, [matrix+3044]; - // begin inline asm - dp4a.u32.u32 %r4848, %r4849, %r5906, %r4844; - // end inline asm - ld.const.u32 %r4853, [matrix+3048]; - // begin inline asm - dp4a.u32.u32 %r4852, %r4853, %r5910, %r4848; - // end inline asm - ld.const.u32 %r4857, [matrix+3052]; - // begin inline asm - dp4a.u32.u32 %r4856, %r4857, %r5914, %r4852; - // end inline asm - ld.const.u32 %r4861, [matrix+3056]; - // begin inline asm - dp4a.u32.u32 %r4860, %r4861, %r5918, %r4856; - // end inline asm - ld.const.u32 %r4865, [matrix+3060]; - // begin inline asm - dp4a.u32.u32 %r4864, %r4865, %r5922, %r4860; - // end inline asm - ld.const.u32 %r4869, [matrix+3064]; - // begin inline asm - dp4a.u32.u32 %r4868, %r4869, %r5926, %r4864; - // end inline asm - ld.const.u32 %r4873, [matrix+3068]; - // begin inline asm - dp4a.u32.u32 %r4872, %r4873, %r5930, %r4868; - // end inline asm - shr.u32 %r6100, %r4808, 6; - and.b32 %r4877, %r6100, 240; - shr.u32 %r4878, %r4872, 10; - // begin inline asm - lop3.b32 %r4876, %r4877, %r4878, %r14, 0x56; - // end inline asm - ld.const.u32 %r4881, [matrix+3072]; - // begin inline asm - dp4a.u32.u32 %r4880, %r4881, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4885, [matrix+3076]; - // begin inline asm - dp4a.u32.u32 %r4884, %r4885, %r5874, %r4880; - // end inline asm - ld.const.u32 %r4889, [matrix+3080]; - // begin inline asm - dp4a.u32.u32 %r4888, %r4889, %r5878, %r4884; - // end inline asm - ld.const.u32 %r4893, [matrix+3084]; - // begin inline asm - dp4a.u32.u32 %r4892, %r4893, %r5882, %r4888; - // end inline asm - ld.const.u32 %r4897, [matrix+3088]; - // begin inline asm - dp4a.u32.u32 %r4896, %r4897, %r5886, %r4892; - // end inline asm - ld.const.u32 %r4901, [matrix+3092]; - // begin inline asm - dp4a.u32.u32 %r4900, %r4901, %r5890, %r4896; - // end inline asm - 
ld.const.u32 %r4905, [matrix+3096]; - // begin inline asm - dp4a.u32.u32 %r4904, %r4905, %r5894, %r4900; - // end inline asm - ld.const.u32 %r4909, [matrix+3100]; - // begin inline asm - dp4a.u32.u32 %r4908, %r4909, %r5898, %r4904; - // end inline asm - ld.const.u32 %r4913, [matrix+3104]; - // begin inline asm - dp4a.u32.u32 %r4912, %r4913, %r5902, %r4908; - // end inline asm - ld.const.u32 %r4917, [matrix+3108]; - // begin inline asm - dp4a.u32.u32 %r4916, %r4917, %r5906, %r4912; - // end inline asm - ld.const.u32 %r4921, [matrix+3112]; - // begin inline asm - dp4a.u32.u32 %r4920, %r4921, %r5910, %r4916; - // end inline asm - ld.const.u32 %r4925, [matrix+3116]; - // begin inline asm - dp4a.u32.u32 %r4924, %r4925, %r5914, %r4920; - // end inline asm - ld.const.u32 %r4929, [matrix+3120]; - // begin inline asm - dp4a.u32.u32 %r4928, %r4929, %r5918, %r4924; - // end inline asm - ld.const.u32 %r4933, [matrix+3124]; - // begin inline asm - dp4a.u32.u32 %r4932, %r4933, %r5922, %r4928; - // end inline asm - ld.const.u32 %r4937, [matrix+3128]; - // begin inline asm - dp4a.u32.u32 %r4936, %r4937, %r5926, %r4932; - // end inline asm - ld.const.u32 %r4941, [matrix+3132]; - // begin inline asm - dp4a.u32.u32 %r4940, %r4941, %r5930, %r4936; - // end inline asm - ld.const.u32 %r4945, [matrix+3136]; - // begin inline asm - dp4a.u32.u32 %r4944, %r4945, %r5870, %r6249; - // end inline asm - ld.const.u32 %r4949, [matrix+3140]; - // begin inline asm - dp4a.u32.u32 %r4948, %r4949, %r5874, %r4944; - // end inline asm - ld.const.u32 %r4953, [matrix+3144]; - // begin inline asm - dp4a.u32.u32 %r4952, %r4953, %r5878, %r4948; - // end inline asm - ld.const.u32 %r4957, [matrix+3148]; - // begin inline asm - dp4a.u32.u32 %r4956, %r4957, %r5882, %r4952; - // end inline asm - ld.const.u32 %r4961, [matrix+3152]; - // begin inline asm - dp4a.u32.u32 %r4960, %r4961, %r5886, %r4956; - // end inline asm - ld.const.u32 %r4965, [matrix+3156]; - // begin inline asm - dp4a.u32.u32 %r4964, %r4965, %r5890, %r4960; - // end inline asm - ld.const.u32 %r4969, [matrix+3160]; - // begin inline asm - dp4a.u32.u32 %r4968, %r4969, %r5894, %r4964; - // end inline asm - ld.const.u32 %r4973, [matrix+3164]; - // begin inline asm - dp4a.u32.u32 %r4972, %r4973, %r5898, %r4968; - // end inline asm - ld.const.u32 %r4977, [matrix+3168]; - // begin inline asm - dp4a.u32.u32 %r4976, %r4977, %r5902, %r4972; - // end inline asm - ld.const.u32 %r4981, [matrix+3172]; - // begin inline asm - dp4a.u32.u32 %r4980, %r4981, %r5906, %r4976; - // end inline asm - ld.const.u32 %r4985, [matrix+3176]; - // begin inline asm - dp4a.u32.u32 %r4984, %r4985, %r5910, %r4980; - // end inline asm - ld.const.u32 %r4989, [matrix+3180]; - // begin inline asm - dp4a.u32.u32 %r4988, %r4989, %r5914, %r4984; - // end inline asm - ld.const.u32 %r4993, [matrix+3184]; - // begin inline asm - dp4a.u32.u32 %r4992, %r4993, %r5918, %r4988; - // end inline asm - ld.const.u32 %r4997, [matrix+3188]; - // begin inline asm - dp4a.u32.u32 %r4996, %r4997, %r5922, %r4992; - // end inline asm - ld.const.u32 %r5001, [matrix+3192]; - // begin inline asm - dp4a.u32.u32 %r5000, %r5001, %r5926, %r4996; - // end inline asm - ld.const.u32 %r5005, [matrix+3196]; - // begin inline asm - dp4a.u32.u32 %r5004, %r5005, %r5930, %r5000; - // end inline asm - shr.u32 %r6101, %r4940, 6; - and.b32 %r5009, %r6101, 240; - shr.u32 %r5010, %r5004, 10; - and.b32 %r5011, %r6037, 255; - // begin inline asm - lop3.b32 %r5008, %r5009, %r5010, %r5011, 0x56; - // end inline asm - ld.const.u32 %r5013, [matrix+3200]; - // 
begin inline asm - dp4a.u32.u32 %r5012, %r5013, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5017, [matrix+3204]; - // begin inline asm - dp4a.u32.u32 %r5016, %r5017, %r5874, %r5012; - // end inline asm - ld.const.u32 %r5021, [matrix+3208]; - // begin inline asm - dp4a.u32.u32 %r5020, %r5021, %r5878, %r5016; - // end inline asm - ld.const.u32 %r5025, [matrix+3212]; - // begin inline asm - dp4a.u32.u32 %r5024, %r5025, %r5882, %r5020; - // end inline asm - ld.const.u32 %r5029, [matrix+3216]; - // begin inline asm - dp4a.u32.u32 %r5028, %r5029, %r5886, %r5024; - // end inline asm - ld.const.u32 %r5033, [matrix+3220]; - // begin inline asm - dp4a.u32.u32 %r5032, %r5033, %r5890, %r5028; - // end inline asm - ld.const.u32 %r5037, [matrix+3224]; - // begin inline asm - dp4a.u32.u32 %r5036, %r5037, %r5894, %r5032; - // end inline asm - ld.const.u32 %r5041, [matrix+3228]; - // begin inline asm - dp4a.u32.u32 %r5040, %r5041, %r5898, %r5036; - // end inline asm - ld.const.u32 %r5045, [matrix+3232]; - // begin inline asm - dp4a.u32.u32 %r5044, %r5045, %r5902, %r5040; - // end inline asm - ld.const.u32 %r5049, [matrix+3236]; - // begin inline asm - dp4a.u32.u32 %r5048, %r5049, %r5906, %r5044; - // end inline asm - ld.const.u32 %r5053, [matrix+3240]; - // begin inline asm - dp4a.u32.u32 %r5052, %r5053, %r5910, %r5048; - // end inline asm - ld.const.u32 %r5057, [matrix+3244]; - // begin inline asm - dp4a.u32.u32 %r5056, %r5057, %r5914, %r5052; - // end inline asm - ld.const.u32 %r5061, [matrix+3248]; - // begin inline asm - dp4a.u32.u32 %r5060, %r5061, %r5918, %r5056; - // end inline asm - ld.const.u32 %r5065, [matrix+3252]; - // begin inline asm - dp4a.u32.u32 %r5064, %r5065, %r5922, %r5060; - // end inline asm - ld.const.u32 %r5069, [matrix+3256]; - // begin inline asm - dp4a.u32.u32 %r5068, %r5069, %r5926, %r5064; - // end inline asm - ld.const.u32 %r5073, [matrix+3260]; - // begin inline asm - dp4a.u32.u32 %r5072, %r5073, %r5930, %r5068; - // end inline asm - ld.const.u32 %r5077, [matrix+3264]; - // begin inline asm - dp4a.u32.u32 %r5076, %r5077, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5081, [matrix+3268]; - // begin inline asm - dp4a.u32.u32 %r5080, %r5081, %r5874, %r5076; - // end inline asm - ld.const.u32 %r5085, [matrix+3272]; - // begin inline asm - dp4a.u32.u32 %r5084, %r5085, %r5878, %r5080; - // end inline asm - ld.const.u32 %r5089, [matrix+3276]; - // begin inline asm - dp4a.u32.u32 %r5088, %r5089, %r5882, %r5084; - // end inline asm - ld.const.u32 %r5093, [matrix+3280]; - // begin inline asm - dp4a.u32.u32 %r5092, %r5093, %r5886, %r5088; - // end inline asm - ld.const.u32 %r5097, [matrix+3284]; - // begin inline asm - dp4a.u32.u32 %r5096, %r5097, %r5890, %r5092; - // end inline asm - ld.const.u32 %r5101, [matrix+3288]; - // begin inline asm - dp4a.u32.u32 %r5100, %r5101, %r5894, %r5096; - // end inline asm - ld.const.u32 %r5105, [matrix+3292]; - // begin inline asm - dp4a.u32.u32 %r5104, %r5105, %r5898, %r5100; - // end inline asm - ld.const.u32 %r5109, [matrix+3296]; - // begin inline asm - dp4a.u32.u32 %r5108, %r5109, %r5902, %r5104; - // end inline asm - ld.const.u32 %r5113, [matrix+3300]; - // begin inline asm - dp4a.u32.u32 %r5112, %r5113, %r5906, %r5108; - // end inline asm - ld.const.u32 %r5117, [matrix+3304]; - // begin inline asm - dp4a.u32.u32 %r5116, %r5117, %r5910, %r5112; - // end inline asm - ld.const.u32 %r5121, [matrix+3308]; - // begin inline asm - dp4a.u32.u32 %r5120, %r5121, %r5914, %r5116; - // end inline asm - ld.const.u32 %r5125, [matrix+3312]; - 
// begin inline asm - dp4a.u32.u32 %r5124, %r5125, %r5918, %r5120; - // end inline asm - ld.const.u32 %r5129, [matrix+3316]; - // begin inline asm - dp4a.u32.u32 %r5128, %r5129, %r5922, %r5124; - // end inline asm - ld.const.u32 %r5133, [matrix+3320]; - // begin inline asm - dp4a.u32.u32 %r5132, %r5133, %r5926, %r5128; - // end inline asm - ld.const.u32 %r5137, [matrix+3324]; - // begin inline asm - dp4a.u32.u32 %r5136, %r5137, %r5930, %r5132; - // end inline asm - shr.u32 %r6102, %r5072, 6; - and.b32 %r5141, %r6102, 240; - shr.u32 %r5142, %r5136, 10; - and.b32 %r5143, %r6036, 255; - // begin inline asm - lop3.b32 %r5140, %r5141, %r5142, %r5143, 0x56; - // end inline asm - ld.const.u32 %r5145, [matrix+3328]; - // begin inline asm - dp4a.u32.u32 %r5144, %r5145, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5149, [matrix+3332]; - // begin inline asm - dp4a.u32.u32 %r5148, %r5149, %r5874, %r5144; - // end inline asm - ld.const.u32 %r5153, [matrix+3336]; - // begin inline asm - dp4a.u32.u32 %r5152, %r5153, %r5878, %r5148; - // end inline asm - ld.const.u32 %r5157, [matrix+3340]; - // begin inline asm - dp4a.u32.u32 %r5156, %r5157, %r5882, %r5152; - // end inline asm - ld.const.u32 %r5161, [matrix+3344]; - // begin inline asm - dp4a.u32.u32 %r5160, %r5161, %r5886, %r5156; - // end inline asm - ld.const.u32 %r5165, [matrix+3348]; - // begin inline asm - dp4a.u32.u32 %r5164, %r5165, %r5890, %r5160; - // end inline asm - ld.const.u32 %r5169, [matrix+3352]; - // begin inline asm - dp4a.u32.u32 %r5168, %r5169, %r5894, %r5164; - // end inline asm - ld.const.u32 %r5173, [matrix+3356]; - // begin inline asm - dp4a.u32.u32 %r5172, %r5173, %r5898, %r5168; - // end inline asm - ld.const.u32 %r5177, [matrix+3360]; - // begin inline asm - dp4a.u32.u32 %r5176, %r5177, %r5902, %r5172; - // end inline asm - ld.const.u32 %r5181, [matrix+3364]; - // begin inline asm - dp4a.u32.u32 %r5180, %r5181, %r5906, %r5176; - // end inline asm - ld.const.u32 %r5185, [matrix+3368]; - // begin inline asm - dp4a.u32.u32 %r5184, %r5185, %r5910, %r5180; - // end inline asm - ld.const.u32 %r5189, [matrix+3372]; - // begin inline asm - dp4a.u32.u32 %r5188, %r5189, %r5914, %r5184; - // end inline asm - ld.const.u32 %r5193, [matrix+3376]; - // begin inline asm - dp4a.u32.u32 %r5192, %r5193, %r5918, %r5188; - // end inline asm - ld.const.u32 %r5197, [matrix+3380]; - // begin inline asm - dp4a.u32.u32 %r5196, %r5197, %r5922, %r5192; - // end inline asm - ld.const.u32 %r5201, [matrix+3384]; - // begin inline asm - dp4a.u32.u32 %r5200, %r5201, %r5926, %r5196; - // end inline asm - ld.const.u32 %r5205, [matrix+3388]; - // begin inline asm - dp4a.u32.u32 %r5204, %r5205, %r5930, %r5200; - // end inline asm - ld.const.u32 %r5209, [matrix+3392]; - // begin inline asm - dp4a.u32.u32 %r5208, %r5209, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5213, [matrix+3396]; - // begin inline asm - dp4a.u32.u32 %r5212, %r5213, %r5874, %r5208; - // end inline asm - ld.const.u32 %r5217, [matrix+3400]; - // begin inline asm - dp4a.u32.u32 %r5216, %r5217, %r5878, %r5212; - // end inline asm - ld.const.u32 %r5221, [matrix+3404]; - // begin inline asm - dp4a.u32.u32 %r5220, %r5221, %r5882, %r5216; - // end inline asm - ld.const.u32 %r5225, [matrix+3408]; - // begin inline asm - dp4a.u32.u32 %r5224, %r5225, %r5886, %r5220; - // end inline asm - ld.const.u32 %r5229, [matrix+3412]; - // begin inline asm - dp4a.u32.u32 %r5228, %r5229, %r5890, %r5224; - // end inline asm - ld.const.u32 %r5233, [matrix+3416]; - // begin inline asm - dp4a.u32.u32 
%r5232, %r5233, %r5894, %r5228; - // end inline asm - ld.const.u32 %r5237, [matrix+3420]; - // begin inline asm - dp4a.u32.u32 %r5236, %r5237, %r5898, %r5232; - // end inline asm - ld.const.u32 %r5241, [matrix+3424]; - // begin inline asm - dp4a.u32.u32 %r5240, %r5241, %r5902, %r5236; - // end inline asm - ld.const.u32 %r5245, [matrix+3428]; - // begin inline asm - dp4a.u32.u32 %r5244, %r5245, %r5906, %r5240; - // end inline asm - ld.const.u32 %r5249, [matrix+3432]; - // begin inline asm - dp4a.u32.u32 %r5248, %r5249, %r5910, %r5244; - // end inline asm - ld.const.u32 %r5253, [matrix+3436]; - // begin inline asm - dp4a.u32.u32 %r5252, %r5253, %r5914, %r5248; - // end inline asm - ld.const.u32 %r5257, [matrix+3440]; - // begin inline asm - dp4a.u32.u32 %r5256, %r5257, %r5918, %r5252; - // end inline asm - ld.const.u32 %r5261, [matrix+3444]; - // begin inline asm - dp4a.u32.u32 %r5260, %r5261, %r5922, %r5256; - // end inline asm - ld.const.u32 %r5265, [matrix+3448]; - // begin inline asm - dp4a.u32.u32 %r5264, %r5265, %r5926, %r5260; - // end inline asm - ld.const.u32 %r5269, [matrix+3452]; - // begin inline asm - dp4a.u32.u32 %r5268, %r5269, %r5930, %r5264; - // end inline asm - shr.u32 %r6103, %r5204, 6; - and.b32 %r5273, %r6103, 240; - shr.u32 %r5274, %r5268, 10; - and.b32 %r5275, %r6047, 255; - // begin inline asm - lop3.b32 %r5272, %r5273, %r5274, %r5275, 0x56; - // end inline asm - shl.b32 %r6104, %r5272, 16; - and.b32 %r6105, %r6104, 16711680; - cvt.u64.u32 %rd211, %r6105; - ld.const.u32 %r5277, [matrix+3456]; - // begin inline asm - dp4a.u32.u32 %r5276, %r5277, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5281, [matrix+3460]; - // begin inline asm - dp4a.u32.u32 %r5280, %r5281, %r5874, %r5276; - // end inline asm - ld.const.u32 %r5285, [matrix+3464]; - // begin inline asm - dp4a.u32.u32 %r5284, %r5285, %r5878, %r5280; - // end inline asm - ld.const.u32 %r5289, [matrix+3468]; - // begin inline asm - dp4a.u32.u32 %r5288, %r5289, %r5882, %r5284; - // end inline asm - ld.const.u32 %r5293, [matrix+3472]; - // begin inline asm - dp4a.u32.u32 %r5292, %r5293, %r5886, %r5288; - // end inline asm - ld.const.u32 %r5297, [matrix+3476]; - // begin inline asm - dp4a.u32.u32 %r5296, %r5297, %r5890, %r5292; - // end inline asm - ld.const.u32 %r5301, [matrix+3480]; - // begin inline asm - dp4a.u32.u32 %r5300, %r5301, %r5894, %r5296; - // end inline asm - ld.const.u32 %r5305, [matrix+3484]; - // begin inline asm - dp4a.u32.u32 %r5304, %r5305, %r5898, %r5300; - // end inline asm - ld.const.u32 %r5309, [matrix+3488]; - // begin inline asm - dp4a.u32.u32 %r5308, %r5309, %r5902, %r5304; - // end inline asm - ld.const.u32 %r5313, [matrix+3492]; - // begin inline asm - dp4a.u32.u32 %r5312, %r5313, %r5906, %r5308; - // end inline asm - ld.const.u32 %r5317, [matrix+3496]; - // begin inline asm - dp4a.u32.u32 %r5316, %r5317, %r5910, %r5312; - // end inline asm - ld.const.u32 %r5321, [matrix+3500]; - // begin inline asm - dp4a.u32.u32 %r5320, %r5321, %r5914, %r5316; - // end inline asm - ld.const.u32 %r5325, [matrix+3504]; - // begin inline asm - dp4a.u32.u32 %r5324, %r5325, %r5918, %r5320; - // end inline asm - ld.const.u32 %r5329, [matrix+3508]; - // begin inline asm - dp4a.u32.u32 %r5328, %r5329, %r5922, %r5324; - // end inline asm - ld.const.u32 %r5333, [matrix+3512]; - // begin inline asm - dp4a.u32.u32 %r5332, %r5333, %r5926, %r5328; - // end inline asm - ld.const.u32 %r5337, [matrix+3516]; - // begin inline asm - dp4a.u32.u32 %r5336, %r5337, %r5930, %r5332; - // end inline asm - ld.const.u32 
%r5341, [matrix+3520]; - // begin inline asm - dp4a.u32.u32 %r5340, %r5341, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5345, [matrix+3524]; - // begin inline asm - dp4a.u32.u32 %r5344, %r5345, %r5874, %r5340; - // end inline asm - ld.const.u32 %r5349, [matrix+3528]; - // begin inline asm - dp4a.u32.u32 %r5348, %r5349, %r5878, %r5344; - // end inline asm - ld.const.u32 %r5353, [matrix+3532]; - // begin inline asm - dp4a.u32.u32 %r5352, %r5353, %r5882, %r5348; - // end inline asm - ld.const.u32 %r5357, [matrix+3536]; - // begin inline asm - dp4a.u32.u32 %r5356, %r5357, %r5886, %r5352; - // end inline asm - ld.const.u32 %r5361, [matrix+3540]; - // begin inline asm - dp4a.u32.u32 %r5360, %r5361, %r5890, %r5356; - // end inline asm - ld.const.u32 %r5365, [matrix+3544]; - // begin inline asm - dp4a.u32.u32 %r5364, %r5365, %r5894, %r5360; - // end inline asm - ld.const.u32 %r5369, [matrix+3548]; - // begin inline asm - dp4a.u32.u32 %r5368, %r5369, %r5898, %r5364; - // end inline asm - ld.const.u32 %r5373, [matrix+3552]; - // begin inline asm - dp4a.u32.u32 %r5372, %r5373, %r5902, %r5368; - // end inline asm - ld.const.u32 %r5377, [matrix+3556]; - // begin inline asm - dp4a.u32.u32 %r5376, %r5377, %r5906, %r5372; - // end inline asm - ld.const.u32 %r5381, [matrix+3560]; - // begin inline asm - dp4a.u32.u32 %r5380, %r5381, %r5910, %r5376; - // end inline asm - ld.const.u32 %r5385, [matrix+3564]; - // begin inline asm - dp4a.u32.u32 %r5384, %r5385, %r5914, %r5380; - // end inline asm - ld.const.u32 %r5389, [matrix+3568]; - // begin inline asm - dp4a.u32.u32 %r5388, %r5389, %r5918, %r5384; - // end inline asm - ld.const.u32 %r5393, [matrix+3572]; - // begin inline asm - dp4a.u32.u32 %r5392, %r5393, %r5922, %r5388; - // end inline asm - ld.const.u32 %r5397, [matrix+3576]; - // begin inline asm - dp4a.u32.u32 %r5396, %r5397, %r5926, %r5392; - // end inline asm - ld.const.u32 %r5401, [matrix+3580]; - // begin inline asm - dp4a.u32.u32 %r5400, %r5401, %r5930, %r5396; - // end inline asm - shr.u32 %r6106, %r5336, 6; - and.b32 %r5405, %r6106, 240; - shr.u32 %r5406, %r5400, 10; - and.b32 %r5407, %r6050, 255; - // begin inline asm - lop3.b32 %r5404, %r5405, %r5406, %r5407, 0x56; - // end inline asm - shl.b32 %r6107, %r5404, 24; - cvt.u64.u32 %rd212, %r6107; - ld.const.u32 %r5409, [matrix+3584]; - // begin inline asm - dp4a.u32.u32 %r5408, %r5409, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5413, [matrix+3588]; - // begin inline asm - dp4a.u32.u32 %r5412, %r5413, %r5874, %r5408; - // end inline asm - ld.const.u32 %r5417, [matrix+3592]; - // begin inline asm - dp4a.u32.u32 %r5416, %r5417, %r5878, %r5412; - // end inline asm - ld.const.u32 %r5421, [matrix+3596]; - // begin inline asm - dp4a.u32.u32 %r5420, %r5421, %r5882, %r5416; - // end inline asm - ld.const.u32 %r5425, [matrix+3600]; - // begin inline asm - dp4a.u32.u32 %r5424, %r5425, %r5886, %r5420; - // end inline asm - ld.const.u32 %r5429, [matrix+3604]; - // begin inline asm - dp4a.u32.u32 %r5428, %r5429, %r5890, %r5424; - // end inline asm - ld.const.u32 %r5433, [matrix+3608]; - // begin inline asm - dp4a.u32.u32 %r5432, %r5433, %r5894, %r5428; - // end inline asm - ld.const.u32 %r5437, [matrix+3612]; - // begin inline asm - dp4a.u32.u32 %r5436, %r5437, %r5898, %r5432; - // end inline asm - ld.const.u32 %r5441, [matrix+3616]; - // begin inline asm - dp4a.u32.u32 %r5440, %r5441, %r5902, %r5436; - // end inline asm - ld.const.u32 %r5445, [matrix+3620]; - // begin inline asm - dp4a.u32.u32 %r5444, %r5445, %r5906, %r5440; - // end inline 
asm - ld.const.u32 %r5449, [matrix+3624]; - // begin inline asm - dp4a.u32.u32 %r5448, %r5449, %r5910, %r5444; - // end inline asm - ld.const.u32 %r5453, [matrix+3628]; - // begin inline asm - dp4a.u32.u32 %r5452, %r5453, %r5914, %r5448; - // end inline asm - ld.const.u32 %r5457, [matrix+3632]; - // begin inline asm - dp4a.u32.u32 %r5456, %r5457, %r5918, %r5452; - // end inline asm - ld.const.u32 %r5461, [matrix+3636]; - // begin inline asm - dp4a.u32.u32 %r5460, %r5461, %r5922, %r5456; - // end inline asm - ld.const.u32 %r5465, [matrix+3640]; - // begin inline asm - dp4a.u32.u32 %r5464, %r5465, %r5926, %r5460; - // end inline asm - ld.const.u32 %r5469, [matrix+3644]; - // begin inline asm - dp4a.u32.u32 %r5468, %r5469, %r5930, %r5464; - // end inline asm - ld.const.u32 %r5473, [matrix+3648]; - // begin inline asm - dp4a.u32.u32 %r5472, %r5473, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5477, [matrix+3652]; - // begin inline asm - dp4a.u32.u32 %r5476, %r5477, %r5874, %r5472; - // end inline asm - ld.const.u32 %r5481, [matrix+3656]; - // begin inline asm - dp4a.u32.u32 %r5480, %r5481, %r5878, %r5476; - // end inline asm - ld.const.u32 %r5485, [matrix+3660]; - // begin inline asm - dp4a.u32.u32 %r5484, %r5485, %r5882, %r5480; - // end inline asm - ld.const.u32 %r5489, [matrix+3664]; - // begin inline asm - dp4a.u32.u32 %r5488, %r5489, %r5886, %r5484; - // end inline asm - ld.const.u32 %r5493, [matrix+3668]; - // begin inline asm - dp4a.u32.u32 %r5492, %r5493, %r5890, %r5488; - // end inline asm - ld.const.u32 %r5497, [matrix+3672]; - // begin inline asm - dp4a.u32.u32 %r5496, %r5497, %r5894, %r5492; - // end inline asm - ld.const.u32 %r5501, [matrix+3676]; - // begin inline asm - dp4a.u32.u32 %r5500, %r5501, %r5898, %r5496; - // end inline asm - ld.const.u32 %r5505, [matrix+3680]; - // begin inline asm - dp4a.u32.u32 %r5504, %r5505, %r5902, %r5500; - // end inline asm - ld.const.u32 %r5509, [matrix+3684]; - // begin inline asm - dp4a.u32.u32 %r5508, %r5509, %r5906, %r5504; - // end inline asm - ld.const.u32 %r5513, [matrix+3688]; - // begin inline asm - dp4a.u32.u32 %r5512, %r5513, %r5910, %r5508; - // end inline asm - ld.const.u32 %r5517, [matrix+3692]; - // begin inline asm - dp4a.u32.u32 %r5516, %r5517, %r5914, %r5512; - // end inline asm - ld.const.u32 %r5521, [matrix+3696]; - // begin inline asm - dp4a.u32.u32 %r5520, %r5521, %r5918, %r5516; - // end inline asm - ld.const.u32 %r5525, [matrix+3700]; - // begin inline asm - dp4a.u32.u32 %r5524, %r5525, %r5922, %r5520; - // end inline asm - ld.const.u32 %r5529, [matrix+3704]; - // begin inline asm - dp4a.u32.u32 %r5528, %r5529, %r5926, %r5524; - // end inline asm - ld.const.u32 %r5533, [matrix+3708]; - // begin inline asm - dp4a.u32.u32 %r5532, %r5533, %r5930, %r5528; - // end inline asm - shr.u32 %r6108, %r5468, 6; - and.b32 %r5537, %r6108, 240; - shr.u32 %r5538, %r5532, 10; - and.b32 %r5539, %r6056, 255; - // begin inline asm - lop3.b32 %r5536, %r5537, %r5538, %r5539, 0x56; - // end inline asm - ld.const.u32 %r5541, [matrix+3712]; - // begin inline asm - dp4a.u32.u32 %r5540, %r5541, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5545, [matrix+3716]; - // begin inline asm - dp4a.u32.u32 %r5544, %r5545, %r5874, %r5540; - // end inline asm - ld.const.u32 %r5549, [matrix+3720]; - // begin inline asm - dp4a.u32.u32 %r5548, %r5549, %r5878, %r5544; - // end inline asm - ld.const.u32 %r5553, [matrix+3724]; - // begin inline asm - dp4a.u32.u32 %r5552, %r5553, %r5882, %r5548; - // end inline asm - ld.const.u32 %r5557, 
[matrix+3728]; - // begin inline asm - dp4a.u32.u32 %r5556, %r5557, %r5886, %r5552; - // end inline asm - ld.const.u32 %r5561, [matrix+3732]; - // begin inline asm - dp4a.u32.u32 %r5560, %r5561, %r5890, %r5556; - // end inline asm - ld.const.u32 %r5565, [matrix+3736]; - // begin inline asm - dp4a.u32.u32 %r5564, %r5565, %r5894, %r5560; - // end inline asm - ld.const.u32 %r5569, [matrix+3740]; - // begin inline asm - dp4a.u32.u32 %r5568, %r5569, %r5898, %r5564; - // end inline asm - ld.const.u32 %r5573, [matrix+3744]; - // begin inline asm - dp4a.u32.u32 %r5572, %r5573, %r5902, %r5568; - // end inline asm - ld.const.u32 %r5577, [matrix+3748]; - // begin inline asm - dp4a.u32.u32 %r5576, %r5577, %r5906, %r5572; - // end inline asm - ld.const.u32 %r5581, [matrix+3752]; - // begin inline asm - dp4a.u32.u32 %r5580, %r5581, %r5910, %r5576; - // end inline asm - ld.const.u32 %r5585, [matrix+3756]; - // begin inline asm - dp4a.u32.u32 %r5584, %r5585, %r5914, %r5580; - // end inline asm - ld.const.u32 %r5589, [matrix+3760]; - // begin inline asm - dp4a.u32.u32 %r5588, %r5589, %r5918, %r5584; - // end inline asm - ld.const.u32 %r5593, [matrix+3764]; - // begin inline asm - dp4a.u32.u32 %r5592, %r5593, %r5922, %r5588; - // end inline asm - ld.const.u32 %r5597, [matrix+3768]; - // begin inline asm - dp4a.u32.u32 %r5596, %r5597, %r5926, %r5592; - // end inline asm - ld.const.u32 %r5601, [matrix+3772]; - // begin inline asm - dp4a.u32.u32 %r5600, %r5601, %r5930, %r5596; - // end inline asm - ld.const.u32 %r5605, [matrix+3776]; - // begin inline asm - dp4a.u32.u32 %r5604, %r5605, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5609, [matrix+3780]; - // begin inline asm - dp4a.u32.u32 %r5608, %r5609, %r5874, %r5604; - // end inline asm - ld.const.u32 %r5613, [matrix+3784]; - // begin inline asm - dp4a.u32.u32 %r5612, %r5613, %r5878, %r5608; - // end inline asm - ld.const.u32 %r5617, [matrix+3788]; - // begin inline asm - dp4a.u32.u32 %r5616, %r5617, %r5882, %r5612; - // end inline asm - ld.const.u32 %r5621, [matrix+3792]; - // begin inline asm - dp4a.u32.u32 %r5620, %r5621, %r5886, %r5616; - // end inline asm - ld.const.u32 %r5625, [matrix+3796]; - // begin inline asm - dp4a.u32.u32 %r5624, %r5625, %r5890, %r5620; - // end inline asm - ld.const.u32 %r5629, [matrix+3800]; - // begin inline asm - dp4a.u32.u32 %r5628, %r5629, %r5894, %r5624; - // end inline asm - ld.const.u32 %r5633, [matrix+3804]; - // begin inline asm - dp4a.u32.u32 %r5632, %r5633, %r5898, %r5628; - // end inline asm - ld.const.u32 %r5637, [matrix+3808]; - // begin inline asm - dp4a.u32.u32 %r5636, %r5637, %r5902, %r5632; - // end inline asm - ld.const.u32 %r5641, [matrix+3812]; - // begin inline asm - dp4a.u32.u32 %r5640, %r5641, %r5906, %r5636; - // end inline asm - ld.const.u32 %r5645, [matrix+3816]; - // begin inline asm - dp4a.u32.u32 %r5644, %r5645, %r5910, %r5640; - // end inline asm - ld.const.u32 %r5649, [matrix+3820]; - // begin inline asm - dp4a.u32.u32 %r5648, %r5649, %r5914, %r5644; - // end inline asm - ld.const.u32 %r5653, [matrix+3824]; - // begin inline asm - dp4a.u32.u32 %r5652, %r5653, %r5918, %r5648; - // end inline asm - ld.const.u32 %r5657, [matrix+3828]; - // begin inline asm - dp4a.u32.u32 %r5656, %r5657, %r5922, %r5652; - // end inline asm - ld.const.u32 %r5661, [matrix+3832]; - // begin inline asm - dp4a.u32.u32 %r5660, %r5661, %r5926, %r5656; - // end inline asm - ld.const.u32 %r5665, [matrix+3836]; - // begin inline asm - dp4a.u32.u32 %r5664, %r5665, %r5930, %r5660; - // end inline asm - shr.u32 %r6109, 
%r5600, 6; - and.b32 %r5669, %r6109, 240; - shr.u32 %r5670, %r5664, 10; - and.b32 %r5671, %r6060, 255; - // begin inline asm - lop3.b32 %r5668, %r5669, %r5670, %r5671, 0x56; - // end inline asm - ld.const.u32 %r5673, [matrix+3840]; - // begin inline asm - dp4a.u32.u32 %r5672, %r5673, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5677, [matrix+3844]; - // begin inline asm - dp4a.u32.u32 %r5676, %r5677, %r5874, %r5672; - // end inline asm - ld.const.u32 %r5681, [matrix+3848]; - // begin inline asm - dp4a.u32.u32 %r5680, %r5681, %r5878, %r5676; - // end inline asm - ld.const.u32 %r5685, [matrix+3852]; - // begin inline asm - dp4a.u32.u32 %r5684, %r5685, %r5882, %r5680; - // end inline asm - ld.const.u32 %r5689, [matrix+3856]; - // begin inline asm - dp4a.u32.u32 %r5688, %r5689, %r5886, %r5684; - // end inline asm - ld.const.u32 %r5693, [matrix+3860]; - // begin inline asm - dp4a.u32.u32 %r5692, %r5693, %r5890, %r5688; - // end inline asm - ld.const.u32 %r5697, [matrix+3864]; - // begin inline asm - dp4a.u32.u32 %r5696, %r5697, %r5894, %r5692; - // end inline asm - ld.const.u32 %r5701, [matrix+3868]; - // begin inline asm - dp4a.u32.u32 %r5700, %r5701, %r5898, %r5696; - // end inline asm - ld.const.u32 %r5705, [matrix+3872]; - // begin inline asm - dp4a.u32.u32 %r5704, %r5705, %r5902, %r5700; - // end inline asm - ld.const.u32 %r5709, [matrix+3876]; - // begin inline asm - dp4a.u32.u32 %r5708, %r5709, %r5906, %r5704; - // end inline asm - ld.const.u32 %r5713, [matrix+3880]; - // begin inline asm - dp4a.u32.u32 %r5712, %r5713, %r5910, %r5708; - // end inline asm - ld.const.u32 %r5717, [matrix+3884]; - // begin inline asm - dp4a.u32.u32 %r5716, %r5717, %r5914, %r5712; - // end inline asm - ld.const.u32 %r5721, [matrix+3888]; - // begin inline asm - dp4a.u32.u32 %r5720, %r5721, %r5918, %r5716; - // end inline asm - ld.const.u32 %r5725, [matrix+3892]; - // begin inline asm - dp4a.u32.u32 %r5724, %r5725, %r5922, %r5720; - // end inline asm - ld.const.u32 %r5729, [matrix+3896]; - // begin inline asm - dp4a.u32.u32 %r5728, %r5729, %r5926, %r5724; - // end inline asm - ld.const.u32 %r5733, [matrix+3900]; - // begin inline asm - dp4a.u32.u32 %r5732, %r5733, %r5930, %r5728; - // end inline asm - ld.const.u32 %r5737, [matrix+3904]; - // begin inline asm - dp4a.u32.u32 %r5736, %r5737, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5741, [matrix+3908]; - // begin inline asm - dp4a.u32.u32 %r5740, %r5741, %r5874, %r5736; - // end inline asm - ld.const.u32 %r5745, [matrix+3912]; - // begin inline asm - dp4a.u32.u32 %r5744, %r5745, %r5878, %r5740; - // end inline asm - ld.const.u32 %r5749, [matrix+3916]; - // begin inline asm - dp4a.u32.u32 %r5748, %r5749, %r5882, %r5744; - // end inline asm - ld.const.u32 %r5753, [matrix+3920]; - // begin inline asm - dp4a.u32.u32 %r5752, %r5753, %r5886, %r5748; - // end inline asm - ld.const.u32 %r5757, [matrix+3924]; - // begin inline asm - dp4a.u32.u32 %r5756, %r5757, %r5890, %r5752; - // end inline asm - ld.const.u32 %r5761, [matrix+3928]; - // begin inline asm - dp4a.u32.u32 %r5760, %r5761, %r5894, %r5756; - // end inline asm - ld.const.u32 %r5765, [matrix+3932]; - // begin inline asm - dp4a.u32.u32 %r5764, %r5765, %r5898, %r5760; - // end inline asm - ld.const.u32 %r5769, [matrix+3936]; - // begin inline asm - dp4a.u32.u32 %r5768, %r5769, %r5902, %r5764; - // end inline asm - ld.const.u32 %r5773, [matrix+3940]; - // begin inline asm - dp4a.u32.u32 %r5772, %r5773, %r5906, %r5768; - // end inline asm - ld.const.u32 %r5777, [matrix+3944]; - // begin inline 
asm - dp4a.u32.u32 %r5776, %r5777, %r5910, %r5772; - // end inline asm - ld.const.u32 %r5781, [matrix+3948]; - // begin inline asm - dp4a.u32.u32 %r5780, %r5781, %r5914, %r5776; - // end inline asm - ld.const.u32 %r5785, [matrix+3952]; - // begin inline asm - dp4a.u32.u32 %r5784, %r5785, %r5918, %r5780; - // end inline asm - ld.const.u32 %r5789, [matrix+3956]; - // begin inline asm - dp4a.u32.u32 %r5788, %r5789, %r5922, %r5784; - // end inline asm - ld.const.u32 %r5793, [matrix+3960]; - // begin inline asm - dp4a.u32.u32 %r5792, %r5793, %r5926, %r5788; - // end inline asm - ld.const.u32 %r5797, [matrix+3964]; - // begin inline asm - dp4a.u32.u32 %r5796, %r5797, %r5930, %r5792; - // end inline asm - shr.u32 %r6110, %r5732, 6; - and.b32 %r5801, %r6110, 240; - shr.u32 %r5802, %r5796, 10; - and.b32 %r5803, %r6068, 255; - // begin inline asm - lop3.b32 %r5800, %r5801, %r5802, %r5803, 0x56; - // end inline asm - ld.const.u32 %r5805, [matrix+3968]; - // begin inline asm - dp4a.u32.u32 %r5804, %r5805, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5809, [matrix+3972]; - // begin inline asm - dp4a.u32.u32 %r5808, %r5809, %r5874, %r5804; - // end inline asm - ld.const.u32 %r5813, [matrix+3976]; - // begin inline asm - dp4a.u32.u32 %r5812, %r5813, %r5878, %r5808; - // end inline asm - ld.const.u32 %r5817, [matrix+3980]; - // begin inline asm - dp4a.u32.u32 %r5816, %r5817, %r5882, %r5812; - // end inline asm - ld.const.u32 %r5821, [matrix+3984]; - // begin inline asm - dp4a.u32.u32 %r5820, %r5821, %r5886, %r5816; - // end inline asm - ld.const.u32 %r5825, [matrix+3988]; - // begin inline asm - dp4a.u32.u32 %r5824, %r5825, %r5890, %r5820; - // end inline asm - ld.const.u32 %r5829, [matrix+3992]; - // begin inline asm - dp4a.u32.u32 %r5828, %r5829, %r5894, %r5824; - // end inline asm - ld.const.u32 %r5833, [matrix+3996]; - // begin inline asm - dp4a.u32.u32 %r5832, %r5833, %r5898, %r5828; - // end inline asm - ld.const.u32 %r5837, [matrix+4000]; - // begin inline asm - dp4a.u32.u32 %r5836, %r5837, %r5902, %r5832; - // end inline asm - ld.const.u32 %r5841, [matrix+4004]; - // begin inline asm - dp4a.u32.u32 %r5840, %r5841, %r5906, %r5836; - // end inline asm - ld.const.u32 %r5845, [matrix+4008]; - // begin inline asm - dp4a.u32.u32 %r5844, %r5845, %r5910, %r5840; - // end inline asm - ld.const.u32 %r5849, [matrix+4012]; - // begin inline asm - dp4a.u32.u32 %r5848, %r5849, %r5914, %r5844; - // end inline asm - ld.const.u32 %r5853, [matrix+4016]; - // begin inline asm - dp4a.u32.u32 %r5852, %r5853, %r5918, %r5848; - // end inline asm - ld.const.u32 %r5857, [matrix+4020]; - // begin inline asm - dp4a.u32.u32 %r5856, %r5857, %r5922, %r5852; - // end inline asm - ld.const.u32 %r5861, [matrix+4024]; - // begin inline asm - dp4a.u32.u32 %r5860, %r5861, %r5926, %r5856; - // end inline asm - ld.const.u32 %r5865, [matrix+4028]; - // begin inline asm - dp4a.u32.u32 %r5864, %r5865, %r5930, %r5860; - // end inline asm - ld.const.u32 %r5869, [matrix+4032]; - // begin inline asm - dp4a.u32.u32 %r5868, %r5869, %r5870, %r6249; - // end inline asm - ld.const.u32 %r5873, [matrix+4036]; - // begin inline asm - dp4a.u32.u32 %r5872, %r5873, %r5874, %r5868; - // end inline asm - ld.const.u32 %r5877, [matrix+4040]; - // begin inline asm - dp4a.u32.u32 %r5876, %r5877, %r5878, %r5872; - // end inline asm - ld.const.u32 %r5881, [matrix+4044]; - // begin inline asm - dp4a.u32.u32 %r5880, %r5881, %r5882, %r5876; - // end inline asm - ld.const.u32 %r5885, [matrix+4048]; - // begin inline asm - dp4a.u32.u32 %r5884, %r5885, 
%r5886, %r5880; - // end inline asm - ld.const.u32 %r5889, [matrix+4052]; - // begin inline asm - dp4a.u32.u32 %r5888, %r5889, %r5890, %r5884; - // end inline asm - ld.const.u32 %r5893, [matrix+4056]; - // begin inline asm - dp4a.u32.u32 %r5892, %r5893, %r5894, %r5888; - // end inline asm - ld.const.u32 %r5897, [matrix+4060]; - // begin inline asm - dp4a.u32.u32 %r5896, %r5897, %r5898, %r5892; - // end inline asm - ld.const.u32 %r5901, [matrix+4064]; - // begin inline asm - dp4a.u32.u32 %r5900, %r5901, %r5902, %r5896; - // end inline asm - ld.const.u32 %r5905, [matrix+4068]; - // begin inline asm - dp4a.u32.u32 %r5904, %r5905, %r5906, %r5900; - // end inline asm - ld.const.u32 %r5909, [matrix+4072]; - // begin inline asm - dp4a.u32.u32 %r5908, %r5909, %r5910, %r5904; - // end inline asm - ld.const.u32 %r5913, [matrix+4076]; - // begin inline asm - dp4a.u32.u32 %r5912, %r5913, %r5914, %r5908; - // end inline asm - ld.const.u32 %r5917, [matrix+4080]; - // begin inline asm - dp4a.u32.u32 %r5916, %r5917, %r5918, %r5912; - // end inline asm - ld.const.u32 %r5921, [matrix+4084]; - // begin inline asm - dp4a.u32.u32 %r5920, %r5921, %r5922, %r5916; - // end inline asm - ld.const.u32 %r5925, [matrix+4088]; - // begin inline asm - dp4a.u32.u32 %r5924, %r5925, %r5926, %r5920; - // end inline asm - ld.const.u32 %r5929, [matrix+4092]; - // begin inline asm - dp4a.u32.u32 %r5928, %r5929, %r5930, %r5924; - // end inline asm - shr.u32 %r6111, %r5864, 6; - and.b32 %r5933, %r6111, 240; - shr.u32 %r5934, %r5928, 10; - // begin inline asm - lop3.b32 %r5932, %r5933, %r5934, %r5935, 0x56; - // end inline asm - shl.b32 %r6112, %r2236, 24; - cvt.u64.u32 %rd213, %r6112; - shl.b32 %r6113, %r2104, 16; - and.b32 %r6114, %r6113, 16711680; - cvt.u64.u32 %rd214, %r6114; - shl.b32 %r6115, %r1972, 8; - and.b32 %r6116, %r6115, 65280; - cvt.u64.u32 %rd215, %r6116; - shl.b32 %r6117, %r3292, 24; - cvt.u64.u32 %rd216, %r6117; - shl.b32 %r6118, %r3160, 16; - and.b32 %r6119, %r6118, 16711680; - cvt.u64.u32 %rd217, %r6119; - shl.b32 %r6120, %r3028, 8; - and.b32 %r6121, %r6120, 65280; - cvt.u64.u32 %rd218, %r6121; - shl.b32 %r6122, %r4348, 24; - cvt.u64.u32 %rd219, %r6122; - shl.b32 %r6123, %r4216, 16; - and.b32 %r6124, %r6123, 16711680; - cvt.u64.u32 %rd220, %r6124; - shl.b32 %r6125, %r4084, 8; - and.b32 %r6126, %r6125, 65280; - cvt.u64.u32 %rd221, %r6126; - cvt.u64.u32 %rd222, %r2764; - shl.b64 %rd223, %rd222, 56; - cvt.u64.u32 %rd224, %r2632; - shl.b64 %rd225, %rd224, 48; - and.b64 %rd226, %rd225, 71776119061217280; - or.b64 %rd227, %rd223, %rd226; - cvt.u64.u32 %rd228, %r2500; - shl.b64 %rd229, %rd228, 40; - and.b64 %rd230, %rd229, 280375465082880; - or.b64 %rd231, %rd227, %rd230; - cvt.u64.u32 %rd232, %r2368; - shl.b64 %rd233, %rd232, 32; - and.b64 %rd234, %rd233, 1095216660480; - or.b64 %rd235, %rd231, %rd234; - or.b64 %rd236, %rd235, %rd213; - or.b64 %rd237, %rd236, %rd214; - and.b32 %r6127, %r1840, 255; - cvt.u64.u32 %rd238, %r6127; - or.b64 %rd239, %rd237, %rd215; - or.b64 %rd240, %rd239, %rd238; - xor.b64 %rd73, %rd240, 4239941492252378377; - cvt.u64.u32 %rd241, %r3820; - shl.b64 %rd242, %rd241, 56; - cvt.u64.u32 %rd243, %r3688; - shl.b64 %rd244, %rd243, 48; - and.b64 %rd245, %rd244, 71776119061217280; - or.b64 %rd246, %rd242, %rd245; - cvt.u64.u32 %rd247, %r3556; - shl.b64 %rd248, %rd247, 40; - and.b64 %rd249, %rd248, 280375465082880; - or.b64 %rd250, %rd246, %rd249; - cvt.u64.u32 %rd251, %r3424; - shl.b64 %rd252, %rd251, 32; - and.b64 %rd253, %rd252, 1095216660480; - or.b64 %rd254, %rd250, %rd253; - or.b64 %rd255, 
%rd254, %rd216; - or.b64 %rd256, %rd255, %rd217; - and.b32 %r6128, %r2896, 255; - cvt.u64.u32 %rd257, %r6128; - or.b64 %rd258, %rd256, %rd218; - or.b64 %rd259, %rd258, %rd257; - xor.b64 %rd460, %rd259, 8746723911537738262; - cvt.u64.u32 %rd260, %r4876; - shl.b64 %rd261, %rd260, 56; - cvt.u64.u32 %rd262, %r4744; - shl.b64 %rd263, %rd262, 48; - and.b64 %rd264, %rd263, 71776119061217280; - or.b64 %rd265, %rd261, %rd264; - cvt.u64.u32 %rd266, %r4612; - shl.b64 %rd267, %rd266, 40; - and.b64 %rd268, %rd267, 280375465082880; - or.b64 %rd269, %rd265, %rd268; - cvt.u64.u32 %rd270, %r4480; - shl.b64 %rd271, %rd270, 32; - and.b64 %rd272, %rd271, 1095216660480; - or.b64 %rd273, %rd269, %rd272; - or.b64 %rd274, %rd273, %rd219; - or.b64 %rd275, %rd274, %rd220; - and.b32 %r6129, %r3952, 255; - cvt.u64.u32 %rd276, %r6129; - or.b64 %rd277, %rd275, %rd221; - or.b64 %rd278, %rd277, %rd276; - xor.b64 %rd455, %rd278, 8796936657246353646; - cvt.u64.u32 %rd279, %r5932; - shl.b64 %rd280, %rd279, 56; - cvt.u64.u32 %rd281, %r5800; - shl.b64 %rd282, %rd281, 48; - and.b64 %rd283, %rd282, 71776119061217280; - or.b64 %rd284, %rd280, %rd283; - cvt.u64.u32 %rd285, %r5668; - shl.b64 %rd286, %rd285, 40; - and.b64 %rd287, %rd286, 280375465082880; - or.b64 %rd288, %rd284, %rd287; - cvt.u64.u32 %rd289, %r5536; - shl.b64 %rd290, %rd289, 32; - and.b64 %rd291, %rd290, 1095216660480; - or.b64 %rd292, %rd288, %rd291; - or.b64 %rd293, %rd292, %rd212; - shl.b32 %r6130, %r5140, 8; - and.b32 %r6131, %r6130, 65280; - cvt.u64.u32 %rd294, %r6131; - or.b64 %rd295, %rd293, %rd211; - and.b32 %r6132, %r5008, 255; - cvt.u64.u32 %rd296, %r6132; - or.b64 %rd297, %rd295, %rd294; - or.b64 %rd298, %rd297, %rd296; - xor.b64 %rd450, %rd298, 1272090201925444760; - mov.u64 %rd464, 8270816933120786537; - mov.u64 %rd463, -850687345431043546; - mov.u64 %rd462, 8596393687355028144; - mov.u64 %rd461, -4073852189716399785; - mov.u64 %rd459, -4539347866060507718; - mov.u64 %rd458, -3233781605604422593; - mov.u64 %rd457, 570094237299545110; - mov.u64 %rd456, 5171152063242093102; - mov.u64 %rd454, 6782861118970774626; - mov.u64 %rd453, 7812475424661425213; - mov.u64 %rd452, 9119540418498120711; - mov.u64 %rd451, -7873636174015165430; - mov.u64 %rd449, -9207053471590684088; - mov.u64 %rd448, 3370482334374859748; - mov.u64 %rd447, -1544774801229058759; - mov.u64 %rd446, 6096431547456407061; - mov.u64 %rd445, -1792185402154627366; - mov.u64 %rd444, -6864424130110145268; - mov.u64 %rd443, 5690099369266491460; - mov.u64 %rd442, -5074726839974049192; - mov.u64 %rd441, 1592359455985097269; - mov.u64 %rd440, RC; - -$L__BB0_9: - xor.b64 %rd299, %rd464, %rd73; - xor.b64 %rd300, %rd299, %rd463; - xor.b64 %rd301, %rd300, %rd462; - xor.b64 %rd302, %rd301, %rd461; - xor.b64 %rd303, %rd459, %rd460; - xor.b64 %rd304, %rd303, %rd458; - xor.b64 %rd305, %rd304, %rd457; - xor.b64 %rd306, %rd305, %rd456; - xor.b64 %rd307, %rd454, %rd455; - xor.b64 %rd308, %rd307, %rd453; - xor.b64 %rd309, %rd308, %rd452; - xor.b64 %rd310, %rd309, %rd451; - xor.b64 %rd311, %rd449, %rd450; - xor.b64 %rd312, %rd311, %rd448; - xor.b64 %rd313, %rd312, %rd447; - xor.b64 %rd314, %rd313, %rd446; - xor.b64 %rd315, %rd444, %rd445; - xor.b64 %rd316, %rd315, %rd443; - xor.b64 %rd317, %rd316, %rd442; - xor.b64 %rd318, %rd317, %rd441; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6133}, %rd306; - } - { - .reg .b32 %dummy; - mov.b64 {%r6134,%dummy}, %rd306; - } - shf.l.wrap.b32 %r6135, %r6134, %r6133, 1; - shf.l.wrap.b32 %r6136, %r6133, %r6134, 1; - mov.b64 %rd319, {%r6136, %r6135}; - xor.b64 %rd320, %rd318, 
%rd319; - xor.b64 %rd321, %rd320, %rd73; - xor.b64 %rd322, %rd464, %rd320; - xor.b64 %rd323, %rd463, %rd320; - xor.b64 %rd324, %rd462, %rd320; - xor.b64 %rd325, %rd461, %rd320; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6137}, %rd310; - } - { - .reg .b32 %dummy; - mov.b64 {%r6138,%dummy}, %rd310; - } - shf.l.wrap.b32 %r6139, %r6138, %r6137, 1; - shf.l.wrap.b32 %r6140, %r6137, %r6138, 1; - mov.b64 %rd326, {%r6140, %r6139}; - xor.b64 %rd327, %rd326, %rd302; - xor.b64 %rd328, %rd460, %rd327; - xor.b64 %rd329, %rd459, %rd327; - xor.b64 %rd330, %rd458, %rd327; - xor.b64 %rd331, %rd457, %rd327; - xor.b64 %rd332, %rd456, %rd327; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6141}, %rd314; - } - { - .reg .b32 %dummy; - mov.b64 {%r6142,%dummy}, %rd314; - } - shf.l.wrap.b32 %r6143, %r6142, %r6141, 1; - shf.l.wrap.b32 %r6144, %r6141, %r6142, 1; - mov.b64 %rd333, {%r6144, %r6143}; - xor.b64 %rd334, %rd333, %rd306; - xor.b64 %rd335, %rd455, %rd334; - xor.b64 %rd336, %rd454, %rd334; - xor.b64 %rd337, %rd453, %rd334; - xor.b64 %rd338, %rd452, %rd334; - xor.b64 %rd339, %rd451, %rd334; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6145}, %rd318; - } - { - .reg .b32 %dummy; - mov.b64 {%r6146,%dummy}, %rd318; - } - shf.l.wrap.b32 %r6147, %r6146, %r6145, 1; - shf.l.wrap.b32 %r6148, %r6145, %r6146, 1; - mov.b64 %rd340, {%r6148, %r6147}; - xor.b64 %rd341, %rd340, %rd310; - xor.b64 %rd342, %rd450, %rd341; - xor.b64 %rd343, %rd449, %rd341; - xor.b64 %rd344, %rd448, %rd341; - xor.b64 %rd345, %rd447, %rd341; - xor.b64 %rd346, %rd446, %rd341; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6149}, %rd302; - } - { - .reg .b32 %dummy; - mov.b64 {%r6150,%dummy}, %rd302; - } - shf.l.wrap.b32 %r6151, %r6150, %r6149, 1; - shf.l.wrap.b32 %r6152, %r6149, %r6150, 1; - mov.b64 %rd347, {%r6152, %r6151}; - xor.b64 %rd348, %rd314, %rd347; - xor.b64 %rd349, %rd445, %rd348; - xor.b64 %rd350, %rd444, %rd348; - xor.b64 %rd351, %rd443, %rd348; - xor.b64 %rd352, %rd442, %rd348; - xor.b64 %rd353, %rd441, %rd348; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6153}, %rd328; - } - { - .reg .b32 %dummy; - mov.b64 {%r6154,%dummy}, %rd328; - } - shf.l.wrap.b32 %r6155, %r6154, %r6153, 1; - shf.l.wrap.b32 %r6156, %r6153, %r6154, 1; - mov.b64 %rd354, {%r6156, %r6155}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6157}, %rd323; - } - { - .reg .b32 %dummy; - mov.b64 {%r6158,%dummy}, %rd323; - } - shf.l.wrap.b32 %r6159, %r6158, %r6157, 3; - shf.l.wrap.b32 %r6160, %r6157, %r6158, 3; - mov.b64 %rd355, {%r6160, %r6159}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6161}, %rd336; - } - { - .reg .b32 %dummy; - mov.b64 {%r6162,%dummy}, %rd336; - } - shf.l.wrap.b32 %r6163, %r6162, %r6161, 6; - shf.l.wrap.b32 %r6164, %r6161, %r6162, 6; - mov.b64 %rd356, {%r6164, %r6163}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6165}, %rd330; - } - { - .reg .b32 %dummy; - mov.b64 {%r6166,%dummy}, %rd330; - } - shf.l.wrap.b32 %r6167, %r6166, %r6165, 10; - shf.l.wrap.b32 %r6168, %r6165, %r6166, 10; - mov.b64 %rd357, {%r6168, %r6167}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6169}, %rd338; - } - { - .reg .b32 %dummy; - mov.b64 {%r6170,%dummy}, %rd338; - } - shf.l.wrap.b32 %r6171, %r6170, %r6169, 15; - shf.l.wrap.b32 %r6172, %r6169, %r6170, 15; - mov.b64 %rd358, {%r6172, %r6171}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6173}, %rd345; - } - { - .reg .b32 %dummy; - mov.b64 {%r6174,%dummy}, %rd345; - } - shf.l.wrap.b32 %r6175, %r6174, %r6173, 21; - shf.l.wrap.b32 %r6176, %r6173, %r6174, 21; - mov.b64 %rd359, {%r6176, %r6175}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6177}, 
%rd342; - } - { - .reg .b32 %dummy; - mov.b64 {%r6178,%dummy}, %rd342; - } - shf.l.wrap.b32 %r6179, %r6178, %r6177, 28; - shf.l.wrap.b32 %r6180, %r6177, %r6178, 28; - mov.b64 %rd360, {%r6180, %r6179}; - { - .reg .b32 %dummy; - mov.b64 {%r6181,%dummy}, %rd322; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6182}, %rd322; - } - shf.r.wrap.b32 %r6183, %r6182, %r6181, 28; - shf.r.wrap.b32 %r6184, %r6181, %r6182, 28; - mov.b64 %rd361, {%r6184, %r6183}; - { - .reg .b32 %dummy; - mov.b64 {%r6185,%dummy}, %rd331; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6186}, %rd331; - } - shf.r.wrap.b32 %r6187, %r6186, %r6185, 19; - shf.r.wrap.b32 %r6188, %r6185, %r6186, 19; - mov.b64 %rd362, {%r6188, %r6187}; - { - .reg .b32 %dummy; - mov.b64 {%r6189,%dummy}, %rd343; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6190}, %rd343; - } - shf.r.wrap.b32 %r6191, %r6190, %r6189, 9; - shf.r.wrap.b32 %r6192, %r6189, %r6190, 9; - mov.b64 %rd363, {%r6192, %r6191}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6193}, %rd332; - } - { - .reg .b32 %dummy; - mov.b64 {%r6194,%dummy}, %rd332; - } - shf.l.wrap.b32 %r6195, %r6194, %r6193, 2; - shf.l.wrap.b32 %r6196, %r6193, %r6194, 2; - mov.b64 %rd364, {%r6196, %r6195}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6197}, %rd353; - } - { - .reg .b32 %dummy; - mov.b64 {%r6198,%dummy}, %rd353; - } - shf.l.wrap.b32 %r6199, %r6198, %r6197, 14; - shf.l.wrap.b32 %r6200, %r6197, %r6198, 14; - mov.b64 %rd365, {%r6200, %r6199}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6201}, %rd349; - } - { - .reg .b32 %dummy; - mov.b64 {%r6202,%dummy}, %rd349; - } - shf.l.wrap.b32 %r6203, %r6202, %r6201, 27; - shf.l.wrap.b32 %r6204, %r6201, %r6202, 27; - mov.b64 %rd366, {%r6204, %r6203}; - { - .reg .b32 %dummy; - mov.b64 {%r6205,%dummy}, %rd324; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6206}, %rd324; - } - shf.r.wrap.b32 %r6207, %r6206, %r6205, 23; - shf.r.wrap.b32 %r6208, %r6205, %r6206, 23; - mov.b64 %rd367, {%r6208, %r6207}; - { - .reg .b32 %dummy; - mov.b64 {%r6209,%dummy}, %rd346; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6210}, %rd346; - } - shf.r.wrap.b32 %r6211, %r6210, %r6209, 8; - shf.r.wrap.b32 %r6212, %r6209, %r6210, 8; - mov.b64 %rd368, {%r6212, %r6211}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6213}, %rd352; - } - { - .reg .b32 %dummy; - mov.b64 {%r6214,%dummy}, %rd352; - } - shf.l.wrap.b32 %r6215, %r6214, %r6213, 8; - shf.l.wrap.b32 %r6216, %r6213, %r6214, 8; - mov.b64 %rd369, {%r6216, %r6215}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6217}, %rd344; - } - { - .reg .b32 %dummy; - mov.b64 {%r6218,%dummy}, %rd344; - } - shf.l.wrap.b32 %r6219, %r6218, %r6217, 25; - shf.l.wrap.b32 %r6220, %r6217, %r6218, 25; - mov.b64 %rd370, {%r6220, %r6219}; - { - .reg .b32 %dummy; - mov.b64 {%r6221,%dummy}, %rd337; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6222}, %rd337; - } - shf.r.wrap.b32 %r6223, %r6222, %r6221, 21; - shf.r.wrap.b32 %r6224, %r6221, %r6222, 21; - mov.b64 %rd371, {%r6224, %r6223}; - { - .reg .b32 %dummy; - mov.b64 {%r6225,%dummy}, %rd335; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6226}, %rd335; - } - shf.r.wrap.b32 %r6227, %r6226, %r6225, 2; - shf.r.wrap.b32 %r6228, %r6225, %r6226, 2; - mov.b64 %rd372, {%r6228, %r6227}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6229}, %rd325; - } - { - .reg .b32 %dummy; - mov.b64 {%r6230,%dummy}, %rd325; - } - shf.l.wrap.b32 %r6231, %r6230, %r6229, 18; - shf.l.wrap.b32 %r6232, %r6229, %r6230, 18; - mov.b64 %rd373, {%r6232, %r6231}; - { - .reg .b32 %dummy; - mov.b64 {%r6233,%dummy}, %rd351; - } - { - .reg .b32 %dummy; 
- mov.b64 {%dummy,%r6234}, %rd351; - } - shf.r.wrap.b32 %r6235, %r6234, %r6233, 25; - shf.r.wrap.b32 %r6236, %r6233, %r6234, 25; - mov.b64 %rd374, {%r6236, %r6235}; - { - .reg .b32 %dummy; - mov.b64 {%r6237,%dummy}, %rd339; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6238}, %rd339; - } - shf.r.wrap.b32 %r6239, %r6238, %r6237, 3; - shf.r.wrap.b32 %r6240, %r6237, %r6238, 3; - mov.b64 %rd375, {%r6240, %r6239}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6241}, %rd350; - } - { - .reg .b32 %dummy; - mov.b64 {%r6242,%dummy}, %rd350; - } - shf.l.wrap.b32 %r6243, %r6242, %r6241, 20; - shf.l.wrap.b32 %r6244, %r6241, %r6242, 20; - mov.b64 %rd376, {%r6244, %r6243}; - { - .reg .b32 %dummy; - mov.b64 {%r6245,%dummy}, %rd329; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6246}, %rd329; - } - shf.r.wrap.b32 %r6247, %r6246, %r6245, 20; - shf.r.wrap.b32 %r6248, %r6245, %r6246, 20; - mov.b64 %rd377, {%r6248, %r6247}; - not.b64 %rd378, %rd377; - and.b64 %rd379, %rd371, %rd378; - xor.b64 %rd380, %rd379, %rd321; - not.b64 %rd381, %rd371; - and.b64 %rd382, %rd359, %rd381; - xor.b64 %rd460, %rd382, %rd377; - not.b64 %rd383, %rd359; - and.b64 %rd384, %rd365, %rd383; - xor.b64 %rd455, %rd384, %rd371; - not.b64 %rd385, %rd365; - and.b64 %rd386, %rd321, %rd385; - xor.b64 %rd450, %rd386, %rd359; - not.b64 %rd387, %rd321; - and.b64 %rd388, %rd377, %rd387; - xor.b64 %rd445, %rd365, %rd388; - not.b64 %rd389, %rd376; - and.b64 %rd390, %rd355, %rd389; - xor.b64 %rd464, %rd390, %rd360; - not.b64 %rd391, %rd355; - and.b64 %rd392, %rd362, %rd391; - xor.b64 %rd459, %rd392, %rd376; - not.b64 %rd393, %rd362; - and.b64 %rd394, %rd375, %rd393; - xor.b64 %rd454, %rd394, %rd355; - not.b64 %rd395, %rd375; - and.b64 %rd396, %rd360, %rd395; - xor.b64 %rd449, %rd396, %rd362; - not.b64 %rd397, %rd360; - and.b64 %rd398, %rd376, %rd397; - xor.b64 %rd444, %rd375, %rd398; - not.b64 %rd399, %rd356; - and.b64 %rd400, %rd370, %rd399; - xor.b64 %rd463, %rd400, %rd354; - not.b64 %rd401, %rd370; - and.b64 %rd402, %rd369, %rd401; - xor.b64 %rd458, %rd402, %rd356; - not.b64 %rd403, %rd369; - and.b64 %rd404, %rd373, %rd403; - xor.b64 %rd453, %rd404, %rd370; - not.b64 %rd405, %rd373; - and.b64 %rd406, %rd354, %rd405; - xor.b64 %rd448, %rd406, %rd369; - not.b64 %rd407, %rd354; - and.b64 %rd408, %rd356, %rd407; - xor.b64 %rd443, %rd373, %rd408; - not.b64 %rd409, %rd361; - and.b64 %rd410, %rd357, %rd409; - xor.b64 %rd462, %rd410, %rd366; - not.b64 %rd411, %rd357; - and.b64 %rd412, %rd358, %rd411; - xor.b64 %rd457, %rd412, %rd361; - not.b64 %rd413, %rd358; - and.b64 %rd414, %rd368, %rd413; - xor.b64 %rd452, %rd414, %rd357; - not.b64 %rd415, %rd368; - and.b64 %rd416, %rd366, %rd415; - xor.b64 %rd447, %rd416, %rd358; - not.b64 %rd417, %rd366; - and.b64 %rd418, %rd361, %rd417; - xor.b64 %rd442, %rd368, %rd418; - not.b64 %rd419, %rd363; - and.b64 %rd420, %rd374, %rd419; - xor.b64 %rd461, %rd420, %rd372; - not.b64 %rd421, %rd374; - and.b64 %rd422, %rd367, %rd421; - xor.b64 %rd456, %rd422, %rd363; - not.b64 %rd423, %rd367; - and.b64 %rd424, %rd364, %rd423; - xor.b64 %rd451, %rd424, %rd374; - not.b64 %rd425, %rd364; - and.b64 %rd426, %rd372, %rd425; - xor.b64 %rd446, %rd426, %rd367; - not.b64 %rd427, %rd372; - and.b64 %rd428, %rd363, %rd427; - xor.b64 %rd441, %rd364, %rd428; - ld.global.nc.u64 %rd429, [%rd440]; - xor.b64 %rd73, %rd380, %rd429; - add.s64 %rd440, %rd440, 8; - add.s32 %r6249, %r6249, 1; - setp.ne.s32 %p11, %r6249, 24; - @%p11 bra $L__BB0_9; - - ld.const.u64 %rd75, [target+24]; - setp.eq.s64 %p12, %rd450, %rd75; - @%p12 bra $L__BB0_12; - 
bra.uni $L__BB0_11; - -$L__BB0_12: - ld.const.u64 %rd76, [target+16]; - setp.eq.s64 %p13, %rd455, %rd76; - @%p13 bra $L__BB0_14; - bra.uni $L__BB0_13; - -$L__BB0_14: - ld.const.u64 %rd77, [target+8]; - setp.eq.s64 %p14, %rd460, %rd77; - @%p14 bra $L__BB0_16; - bra.uni $L__BB0_15; - -$L__BB0_16: - ld.const.u64 %rd430, [target]; - setp.lt.u64 %p16, %rd73, %rd430; - bra.uni $L__BB0_17; - -$L__BB0_11: - setp.lt.u64 %p16, %rd450, %rd75; - bra.uni $L__BB0_17; - -$L__BB0_13: - setp.lt.u64 %p16, %rd455, %rd76; - bra.uni $L__BB0_17; - -$L__BB0_15: - setp.lt.u64 %p16, %rd460, %rd77; - -$L__BB0_17: - not.pred %p15, %p16; - @%p15 bra $L__BB0_19; - - ld.param.u64 %rd438, [heavy_hash_param_0]; - ld.param.u64 %rd437, [heavy_hash_param_1]; - and.b64 %rd436, %rd439, %rd438; - or.b64 %rd435, %rd436, %rd437; - ld.param.u64 %rd434, [heavy_hash_param_5]; - cvta.to.global.u64 %rd433, %rd434; - mov.u64 %rd431, 0; - atom.global.cas.b64 %rd432, [%rd433], %rd431, %rd435; - -$L__BB0_19: - ret; - -} - diff --git a/plugins/cuda/resources/kaspa-cuda-sm75.ptx b/plugins/cuda/resources/kaspa-cuda-sm75.ptx deleted file mode 100644 index a001843..0000000 --- a/plugins/cuda/resources/kaspa-cuda-sm75.ptx +++ /dev/null @@ -1,7081 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-31833905 -// Cuda compilation tools, release 11.8, V11.8.89 -// Based on NVVM 7.0.1 -// - -.version 7.8 -.target sm_75 -.address_size 64 - - // .globl heavy_hash -.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; -.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; -.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; -.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; -.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; -.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; -.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 4 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; -.const .align 8 .b8 
target[32]; -.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; -.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; - -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 -) -{ - .local .align 8 .b8 __local_depot0[1912]; - .reg .b64 %SP; - .reg .b64 %SPL; - .reg .pred %p<17>; - .reg .b16 %rs<113>; - .reg .b32 %r<6245>; - .reg .b64 %rd<490>; - - - mov.u64 %SPL, __local_depot0; - ld.param.u8 %rs11, [heavy_hash_param_3]; - ld.param.u64 %rd78, [heavy_hash_param_0]; - ld.param.u64 %rd79, [heavy_hash_param_1]; - ld.param.u64 %rd80, [heavy_hash_param_2]; - ld.param.u64 %rd81, [heavy_hash_param_4]; - ld.param.u64 %rd82, [heavy_hash_param_5]; - cvta.to.global.u64 %rd1, %rd81; - cvta.to.global.u64 %rd2, %rd82; - add.u64 %rd3, %SPL, 0; - mov.u32 %r17, %ntid.x; - mov.u32 %r18, %ctaid.x; - mov.u32 %r19, %tid.x; - mad.lo.s32 %r20, %r18, %r17, %r19; - cvt.s64.s32 %rd4, %r20; - setp.ge.u64 %p6, %rd4, %rd80; - @%p6 bra $L__BB0_19; - - cvt.u32.u64 %r21, %rd4; - setp.ne.s32 %p7, %r21, 0; - @%p7 bra $L__BB0_3; - - mov.u64 %rd84, 0; - st.global.u64 [%rd2], %rd84; - -$L__BB0_3: - setp.eq.s16 %p8, %rs11, 0; - @%p8 bra $L__BB0_5; - - shl.b64 %rd85, %rd4, 5; - add.s64 %rd86, %rd1, %rd85; - ld.global.v2.u64 {%rd87, %rd88}, [%rd86]; - mul.lo.s64 %rd91, %rd88, 5; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd91, 7; - shr.b64 %rhs, %rd91, 57; - add.u64 %rd92, %lhs, %rhs; - } - mul.lo.s64 %rd463, %rd92, 9; - shl.b64 %rd93, %rd88, 17; - ld.global.v2.u64 {%rd94, %rd95}, [%rd86+16]; - xor.b64 %rd98, %rd94, %rd87; - xor.b64 %rd99, %rd95, %rd88; - xor.b64 %rd100, %rd88, %rd98; - xor.b64 
%rd101, %rd87, %rd99; - st.global.v2.u64 [%rd86], {%rd101, %rd100}; - { - .reg .b32 %dummy; - mov.b64 {%r22,%dummy}, %rd99; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r23}, %rd99; - } - shf.r.wrap.b32 %r24, %r23, %r22, 19; - shf.r.wrap.b32 %r25, %r22, %r23, 19; - mov.b64 %rd102, {%r25, %r24}; - xor.b64 %rd103, %rd98, %rd93; - st.global.v2.u64 [%rd86+16], {%rd103, %rd102}; - bra.uni $L__BB0_6; - -$L__BB0_5: - ld.global.u64 %rd104, [%rd1]; - xor.b64 %rd463, %rd104, %rd4; - -$L__BB0_6: - and.b64 %rd105, %rd463, %rd78; - or.b64 %rd8, %rd105, %rd79; - mov.b64 {%r26, %r27}, %rd8; - mov.u64 %rd106, 0; - ld.const.u64 %rd107, [hash_header]; - cvt.u32.u64 %r28, %rd107; - shr.u64 %rd108, %rd107, 8; - cvt.u32.u64 %r29, %rd108; - shr.u64 %rd109, %rd107, 16; - cvt.u32.u64 %r30, %rd109; - shr.u64 %rd110, %rd107, 32; - cvt.u32.u64 %r31, %rd110; - shr.u64 %rd111, %rd107, 40; - cvt.u32.u64 %r32, %rd111; - shr.u64 %rd112, %rd107, 48; - cvt.u32.u64 %r33, %rd112; - ld.const.u64 %rd113, [hash_header+8]; - cvt.u32.u64 %r34, %rd113; - shr.u64 %rd114, %rd113, 8; - cvt.u32.u64 %r35, %rd114; - shr.u64 %rd115, %rd113, 16; - cvt.u32.u64 %r36, %rd115; - shr.u64 %rd116, %rd113, 32; - cvt.u32.u64 %r37, %rd116; - shr.u64 %rd117, %rd113, 40; - cvt.u32.u64 %r38, %rd117; - shr.u64 %rd118, %rd113, 48; - cvt.u32.u64 %r39, %rd118; - ld.const.u64 %rd119, [hash_header+16]; - cvt.u32.u64 %r40, %rd119; - shr.u64 %rd120, %rd119, 8; - cvt.u32.u64 %r41, %rd120; - shr.u64 %rd121, %rd119, 16; - cvt.u32.u64 %r42, %rd121; - shr.u64 %rd122, %rd119, 32; - cvt.u32.u64 %r43, %rd122; - shr.u64 %rd123, %rd119, 40; - cvt.u32.u64 %r44, %rd123; - shr.u64 %rd124, %rd119, 48; - cvt.u32.u64 %r45, %rd124; - ld.const.u64 %rd125, [hash_header+24]; - cvt.u32.u64 %r46, %rd125; - shr.u64 %rd126, %rd125, 8; - cvt.u32.u64 %r47, %rd126; - shr.u64 %rd127, %rd125, 16; - cvt.u32.u64 %r48, %rd127; - shr.u64 %rd128, %rd125, 32; - cvt.u32.u64 %r49, %rd128; - shr.u64 %rd129, %rd125, 40; - cvt.u32.u64 %r50, %rd129; - shr.u64 %rd130, %rd125, 48; - cvt.u32.u64 %r51, %rd130; - ld.const.v4.u16 {%rs12, %rs13, %rs14, %rs15}, [hash_header+32]; - shr.u16 %rs17, %rs12, 8; - shr.u16 %rs19, %rs13, 8; - shr.u16 %rs21, %rs14, 8; - shr.u16 %rs23, %rs15, 8; - ld.const.v4.u16 {%rs24, %rs25, %rs26, %rs27}, [hash_header+40]; - shr.u16 %rs29, %rs24, 8; - shr.u16 %rs31, %rs25, 8; - shr.u16 %rs33, %rs26, 8; - shr.u16 %rs35, %rs27, 8; - ld.const.v4.u16 {%rs36, %rs37, %rs38, %rs39}, [hash_header+48]; - shr.u16 %rs41, %rs36, 8; - shr.u16 %rs43, %rs37, 8; - shr.u16 %rs45, %rs38, 8; - shr.u16 %rs47, %rs39, 8; - ld.const.v4.u16 {%rs48, %rs49, %rs50, %rs51}, [hash_header+56]; - shr.u16 %rs53, %rs48, 8; - shr.u16 %rs55, %rs49, 8; - shr.u16 %rs57, %rs50, 8; - shr.u16 %rs59, %rs51, 8; - ld.const.u64 %rd131, [hash_header+64]; - mov.b64 {%r52, %r53}, %rd131; - mov.u32 %r54, -1150833019; - mov.u32 %r55, 1779033703; - st.local.v2.u32 [%rd3], {%r55, %r54}; - mov.u32 %r56, -1521486534; - mov.u32 %r57, 1013904242; - st.local.v2.u32 [%rd3+8], {%r57, %r56}; - mov.u32 %r58, -1694144372; - mov.u32 %r59, 1359893119; - st.local.v2.u32 [%rd3+16], {%r59, %r58}; - mov.u32 %r60, 1541459225; - mov.u32 %r61, 528734635; - st.local.v2.u32 [%rd3+24], {%r61, %r60}; - st.local.u64 [%rd3+64], %rd106; - mov.u32 %r62, 0; - st.local.v2.u32 [%rd3+88], {%r62, %r62}; - st.local.v2.u32 [%rd3+96], {%r62, %r62}; - st.local.v2.u32 [%rd3+104], {%r62, %r62}; - st.local.v2.u32 [%rd3+112], {%r62, %r62}; - st.local.v2.u32 [%rd3+120], {%r62, %r62}; - st.local.v2.u32 [%rd3+128], {%r62, %r62}; - mov.u16 %rs60, 0; - st.local.v2.u8 
[%rd3+136], {%rs60, %rs60}; - st.local.u8 [%rd3+138], %rs60; - st.local.v2.u32 [%rd3+32], {%r55, %r54}; - st.local.v2.u32 [%rd3+40], {%r57, %r56}; - st.local.v2.u32 [%rd3+48], {%r59, %r58}; - st.local.v2.u32 [%rd3+56], {%r61, %r60}; - st.local.v2.u32 [%rd3+72], {%r62, %r62}; - st.local.v2.u32 [%rd3+80], {%r62, %r62}; - st.local.u8 [%rd3+144], %rs60; - ld.local.v4.u8 {%rs61, %rs62, %rs63, %rs64}, [%rd3+136]; - setp.eq.s16 %p9, %rs62, 0; - selp.u16 %rs68, 1, 0, %p9; - or.b16 %rs69, %rs63, %rs68; - shr.u32 %r63, %r28, 24; - mov.u32 %r64, 64; - prmt.b32 %r65, %r28, %r29, %r64; - mov.u32 %r66, 1040; - prmt.b32 %r67, %r65, %r30, %r66; - mov.u32 %r68, 16912; - prmt.b32 %r69, %r67, %r63, %r68; - and.b32 %r70, %r31, 255; - and.b32 %r71, %r32, 255; - prmt.b32 %r72, %r71, %r70, 30212; - shl.b32 %r73, %r33, 16; - and.b32 %r74, %r73, 16711680; - or.b32 %r75, %r72, %r74; - and.b32 %r76, %r31, -16777216; - or.b32 %r77, %r75, %r76; - shr.u32 %r78, %r34, 24; - prmt.b32 %r79, %r34, %r35, %r64; - prmt.b32 %r80, %r79, %r36, %r66; - prmt.b32 %r81, %r80, %r78, %r68; - and.b32 %r82, %r37, 255; - and.b32 %r83, %r38, 255; - prmt.b32 %r84, %r83, %r82, 30212; - shl.b32 %r85, %r39, 16; - and.b32 %r86, %r85, 16711680; - or.b32 %r87, %r84, %r86; - and.b32 %r88, %r37, -16777216; - or.b32 %r89, %r87, %r88; - shr.u32 %r90, %r40, 24; - prmt.b32 %r91, %r40, %r41, %r64; - prmt.b32 %r92, %r91, %r42, %r66; - prmt.b32 %r93, %r92, %r90, %r68; - and.b32 %r94, %r43, 255; - and.b32 %r95, %r44, 255; - prmt.b32 %r96, %r95, %r94, 30212; - shl.b32 %r97, %r45, 16; - and.b32 %r98, %r97, 16711680; - or.b32 %r99, %r96, %r98; - and.b32 %r100, %r43, -16777216; - or.b32 %r101, %r99, %r100; - shr.u32 %r102, %r46, 24; - prmt.b32 %r103, %r46, %r47, %r64; - prmt.b32 %r104, %r103, %r48, %r66; - prmt.b32 %r105, %r104, %r102, %r68; - and.b32 %r106, %r49, 255; - and.b32 %r107, %r50, 255; - prmt.b32 %r108, %r107, %r106, 30212; - shl.b32 %r109, %r51, 16; - and.b32 %r110, %r109, 16711680; - or.b32 %r111, %r108, %r110; - and.b32 %r112, %r49, -16777216; - or.b32 %r113, %r111, %r112; - cvt.u32.u16 %r114, %rs12; - and.b32 %r115, %r114, 255; - cvt.u32.u16 %r116, %rs17; - prmt.b32 %r117, %r116, %r115, 30212; - cvt.u32.u16 %r118, %rs13; - prmt.b32 %r119, %r118, %r117, 28756; - cvt.u32.u16 %r120, %rs19; - prmt.b32 %r121, %r120, %r119, 1620; - cvt.u32.u16 %r122, %rs14; - and.b32 %r123, %r122, 255; - cvt.u32.u16 %r124, %rs21; - prmt.b32 %r125, %r124, %r123, 30212; - cvt.u32.u16 %r126, %rs15; - prmt.b32 %r127, %r126, %r125, 28756; - cvt.u32.u16 %r128, %rs23; - prmt.b32 %r129, %r128, %r127, 1620; - cvt.u32.u16 %r130, %rs24; - and.b32 %r131, %r130, 255; - cvt.u32.u16 %r132, %rs29; - prmt.b32 %r133, %r132, %r131, 30212; - cvt.u32.u16 %r134, %rs25; - prmt.b32 %r135, %r134, %r133, 28756; - cvt.u32.u16 %r136, %rs31; - prmt.b32 %r137, %r136, %r135, 1620; - cvt.u32.u16 %r138, %rs26; - and.b32 %r139, %r138, 255; - cvt.u32.u16 %r140, %rs33; - prmt.b32 %r141, %r140, %r139, 30212; - cvt.u32.u16 %r142, %rs27; - prmt.b32 %r143, %r142, %r141, 28756; - cvt.u32.u16 %r144, %rs35; - prmt.b32 %r145, %r144, %r143, 1620; - cvt.u32.u16 %r146, %rs36; - and.b32 %r147, %r146, 255; - cvt.u32.u16 %r148, %rs41; - prmt.b32 %r149, %r148, %r147, 30212; - cvt.u32.u16 %r150, %rs37; - prmt.b32 %r151, %r150, %r149, 28756; - cvt.u32.u16 %r152, %rs43; - prmt.b32 %r153, %r152, %r151, 1620; - cvt.u32.u16 %r154, %rs38; - and.b32 %r155, %r154, 255; - cvt.u32.u16 %r156, %rs45; - prmt.b32 %r157, %r156, %r155, 30212; - cvt.u32.u16 %r158, %rs39; - prmt.b32 %r159, %r158, %r157, 28756; - cvt.u32.u16 %r160, 
%rs47; - prmt.b32 %r161, %r160, %r159, 1620; - cvt.u32.u16 %r162, %rs48; - and.b32 %r163, %r162, 255; - cvt.u32.u16 %r164, %rs53; - prmt.b32 %r165, %r164, %r163, 30212; - cvt.u32.u16 %r166, %rs49; - prmt.b32 %r167, %r166, %r165, 28756; - cvt.u32.u16 %r168, %rs55; - prmt.b32 %r169, %r168, %r167, 1620; - cvt.u32.u16 %r170, %rs50; - and.b32 %r171, %r170, 255; - cvt.u32.u16 %r172, %rs57; - prmt.b32 %r173, %r172, %r171, 30212; - cvt.u32.u16 %r174, %rs51; - prmt.b32 %r175, %r174, %r173, 28756; - cvt.u32.u16 %r176, %rs59; - prmt.b32 %r177, %r176, %r175, 1620; - cvt.u32.u16 %r178, %rs69; - and.b32 %r179, %r178, 255; - add.s32 %r180, %r69, -1156040474; - shf.l.wrap.b32 %r181, %r180, %r180, 16; - add.s32 %r182, %r181, 1779033703; - xor.b32 %r183, %r182, 1359893119; - shf.l.wrap.b32 %r184, %r183, %r183, 20; - add.s32 %r185, %r77, %r180; - add.s32 %r186, %r185, %r184; - xor.b32 %r187, %r186, %r181; - shf.l.wrap.b32 %r188, %r187, %r187, 24; - add.s32 %r189, %r188, %r182; - xor.b32 %r190, %r189, %r184; - shf.l.wrap.b32 %r191, %r190, %r190, 25; - add.s32 %r192, %r81, 1449989905; - shf.l.wrap.b32 %r193, %r192, %r192, 16; - add.s32 %r194, %r193, -1150833019; - xor.b32 %r195, %r194, -1694144372; - shf.l.wrap.b32 %r196, %r195, %r195, 20; - add.s32 %r197, %r89, %r192; - add.s32 %r198, %r197, %r196; - xor.b32 %r199, %r198, %r193; - shf.l.wrap.b32 %r200, %r199, %r199, 24; - add.s32 %r201, %r200, %r194; - xor.b32 %r202, %r201, %r196; - shf.l.wrap.b32 %r203, %r202, %r202, 25; - add.s32 %r204, %r93, 1542638877; - shr.u32 %r205, %r204, 16; - shl.b32 %r206, %r204, 16; - xor.b32 %r207, %r206, 4194304; - or.b32 %r208, %r207, %r205; - add.s32 %r209, %r208, 1013904242; - xor.b32 %r210, %r209, 528734635; - shf.l.wrap.b32 %r211, %r210, %r210, 20; - add.s32 %r212, %r101, %r204; - add.s32 %r213, %r212, %r211; - xor.b32 %r214, %r213, %r208; - shf.l.wrap.b32 %r215, %r214, %r214, 24; - add.s32 %r216, %r215, %r209; - xor.b32 %r217, %r216, %r211; - shf.l.wrap.b32 %r218, %r217, %r217, 25; - add.s32 %r219, %r105, 19972691; - xor.b32 %r220, %r219, %r179; - shr.u32 %r221, %r219, 16; - shl.b32 %r222, %r220, 16; - or.b32 %r223, %r222, %r221; - add.s32 %r224, %r223, -1521486534; - xor.b32 %r225, %r224, 1541459225; - shf.l.wrap.b32 %r226, %r225, %r225, 20; - add.s32 %r227, %r113, %r219; - add.s32 %r228, %r227, %r226; - xor.b32 %r229, %r228, %r223; - shf.l.wrap.b32 %r230, %r229, %r229, 24; - add.s32 %r231, %r230, %r224; - xor.b32 %r232, %r231, %r226; - shf.l.wrap.b32 %r233, %r232, %r232, 25; - add.s32 %r234, %r203, %r186; - add.s32 %r235, %r234, %r121; - xor.b32 %r236, %r230, %r235; - shf.l.wrap.b32 %r237, %r236, %r236, 16; - add.s32 %r238, %r237, %r216; - xor.b32 %r239, %r238, %r203; - shf.l.wrap.b32 %r240, %r239, %r239, 20; - add.s32 %r241, %r129, %r235; - add.s32 %r242, %r241, %r240; - xor.b32 %r243, %r242, %r237; - shf.l.wrap.b32 %r244, %r243, %r243, 24; - add.s32 %r245, %r244, %r238; - xor.b32 %r246, %r245, %r240; - shf.l.wrap.b32 %r247, %r246, %r246, 25; - add.s32 %r248, %r218, %r198; - add.s32 %r249, %r248, %r137; - xor.b32 %r250, %r249, %r188; - shf.l.wrap.b32 %r251, %r250, %r250, 16; - add.s32 %r252, %r251, %r231; - xor.b32 %r253, %r252, %r218; - shf.l.wrap.b32 %r254, %r253, %r253, 20; - add.s32 %r255, %r145, %r249; - add.s32 %r256, %r255, %r254; - xor.b32 %r257, %r256, %r251; - shf.l.wrap.b32 %r258, %r257, %r257, 24; - add.s32 %r259, %r258, %r252; - xor.b32 %r260, %r259, %r254; - shf.l.wrap.b32 %r261, %r260, %r260, 25; - add.s32 %r262, %r233, %r213; - add.s32 %r263, %r262, %r153; - xor.b32 %r264, %r263, %r200; - shf.l.wrap.b32 
%r265, %r264, %r264, 16; - add.s32 %r266, %r265, %r189; - xor.b32 %r267, %r266, %r233; - shf.l.wrap.b32 %r268, %r267, %r267, 20; - add.s32 %r269, %r161, %r263; - add.s32 %r270, %r269, %r268; - xor.b32 %r271, %r270, %r265; - shf.l.wrap.b32 %r272, %r271, %r271, 24; - add.s32 %r273, %r272, %r266; - xor.b32 %r274, %r273, %r268; - shf.l.wrap.b32 %r275, %r274, %r274, 25; - add.s32 %r276, %r228, %r191; - add.s32 %r277, %r276, %r169; - xor.b32 %r278, %r277, %r215; - shf.l.wrap.b32 %r279, %r278, %r278, 16; - add.s32 %r280, %r279, %r201; - xor.b32 %r281, %r280, %r191; - shf.l.wrap.b32 %r282, %r281, %r281, 20; - add.s32 %r283, %r177, %r277; - add.s32 %r284, %r283, %r282; - xor.b32 %r285, %r284, %r279; - shf.l.wrap.b32 %r286, %r285, %r285, 24; - add.s32 %r287, %r286, %r280; - xor.b32 %r288, %r287, %r282; - shf.l.wrap.b32 %r289, %r288, %r288, 25; - add.s32 %r290, %r242, %r81; - add.s32 %r291, %r290, %r289; - xor.b32 %r292, %r291, %r258; - shf.l.wrap.b32 %r293, %r292, %r292, 16; - add.s32 %r294, %r293, %r273; - xor.b32 %r295, %r294, %r289; - shf.l.wrap.b32 %r296, %r295, %r295, 20; - add.s32 %r297, %r291, %r105; - add.s32 %r298, %r297, %r296; - xor.b32 %r299, %r298, %r293; - shf.l.wrap.b32 %r300, %r299, %r299, 24; - add.s32 %r301, %r300, %r294; - xor.b32 %r302, %r301, %r296; - shf.l.wrap.b32 %r303, %r302, %r302, 25; - add.s32 %r304, %r256, %r89; - add.s32 %r305, %r304, %r247; - xor.b32 %r306, %r272, %r305; - shf.l.wrap.b32 %r307, %r306, %r306, 16; - add.s32 %r308, %r287, %r307; - xor.b32 %r309, %r308, %r247; - shf.l.wrap.b32 %r310, %r309, %r309, 20; - add.s32 %r311, %r305, %r137; - add.s32 %r312, %r311, %r310; - xor.b32 %r313, %r312, %r307; - shf.l.wrap.b32 %r314, %r313, %r313, 24; - add.s32 %r315, %r314, %r308; - xor.b32 %r316, %r315, %r310; - shf.l.wrap.b32 %r317, %r316, %r316, 25; - add.s32 %r318, %r261, %r113; - add.s32 %r319, %r318, %r270; - xor.b32 %r320, %r286, %r319; - shf.l.wrap.b32 %r321, %r320, %r320, 16; - add.s32 %r322, %r321, %r245; - xor.b32 %r323, %r322, %r261; - shf.l.wrap.b32 %r324, %r323, %r323, 20; - add.s32 %r325, %r319, %r69; - add.s32 %r326, %r325, %r324; - xor.b32 %r327, %r326, %r321; - shf.l.wrap.b32 %r328, %r327, %r327, 24; - add.s32 %r329, %r328, %r322; - xor.b32 %r330, %r329, %r324; - shf.l.wrap.b32 %r331, %r330, %r330, 25; - add.s32 %r332, %r275, %r93; - add.s32 %r333, %r332, %r284; - xor.b32 %r334, %r333, %r244; - shf.l.wrap.b32 %r335, %r334, %r334, 16; - add.s32 %r336, %r335, %r259; - xor.b32 %r337, %r336, %r275; - shf.l.wrap.b32 %r338, %r337, %r337, 20; - add.s32 %r339, %r333, %r161; - add.s32 %r340, %r339, %r338; - xor.b32 %r341, %r340, %r335; - shf.l.wrap.b32 %r342, %r341, %r341, 24; - add.s32 %r343, %r342, %r336; - xor.b32 %r344, %r343, %r338; - shf.l.wrap.b32 %r345, %r344, %r344, 25; - add.s32 %r346, %r298, %r77; - add.s32 %r347, %r346, %r317; - xor.b32 %r348, %r347, %r342; - shf.l.wrap.b32 %r349, %r348, %r348, 16; - add.s32 %r350, %r349, %r329; - xor.b32 %r351, %r350, %r317; - shf.l.wrap.b32 %r352, %r351, %r351, 20; - add.s32 %r353, %r347, %r145; - add.s32 %r354, %r353, %r352; - xor.b32 %r355, %r354, %r349; - shf.l.wrap.b32 %r356, %r355, %r355, 24; - add.s32 %r357, %r356, %r350; - xor.b32 %r358, %r357, %r352; - shf.l.wrap.b32 %r359, %r358, %r358, 25; - add.s32 %r360, %r312, %r153; - add.s32 %r361, %r360, %r331; - xor.b32 %r362, %r361, %r300; - shf.l.wrap.b32 %r363, %r362, %r362, 16; - add.s32 %r364, %r363, %r343; - xor.b32 %r365, %r364, %r331; - shf.l.wrap.b32 %r366, %r365, %r365, 20; - add.s32 %r367, %r361, %r101; - add.s32 %r368, %r367, %r366; - xor.b32 %r369, 
%r368, %r363; - shf.l.wrap.b32 %r370, %r369, %r369, 24; - add.s32 %r371, %r370, %r364; - xor.b32 %r372, %r371, %r366; - shf.l.wrap.b32 %r373, %r372, %r372, 25; - add.s32 %r374, %r326, %r129; - add.s32 %r375, %r374, %r345; - xor.b32 %r376, %r375, %r314; - shf.l.wrap.b32 %r377, %r376, %r376, 16; - add.s32 %r378, %r377, %r301; - xor.b32 %r379, %r378, %r345; - shf.l.wrap.b32 %r380, %r379, %r379, 20; - add.s32 %r381, %r375, %r169; - add.s32 %r382, %r381, %r380; - xor.b32 %r383, %r382, %r377; - shf.l.wrap.b32 %r384, %r383, %r383, 24; - add.s32 %r385, %r384, %r378; - xor.b32 %r386, %r385, %r380; - shf.l.wrap.b32 %r387, %r386, %r386, 25; - add.s32 %r388, %r340, %r177; - add.s32 %r389, %r388, %r303; - xor.b32 %r390, %r389, %r328; - shf.l.wrap.b32 %r391, %r390, %r390, 16; - add.s32 %r392, %r391, %r315; - xor.b32 %r393, %r392, %r303; - shf.l.wrap.b32 %r394, %r393, %r393, 20; - add.s32 %r395, %r389, %r121; - add.s32 %r396, %r395, %r394; - xor.b32 %r397, %r396, %r391; - shf.l.wrap.b32 %r398, %r397, %r397, 24; - add.s32 %r399, %r398, %r392; - xor.b32 %r400, %r399, %r394; - shf.l.wrap.b32 %r401, %r400, %r400, 25; - add.s32 %r402, %r354, %r89; - add.s32 %r403, %r402, %r401; - xor.b32 %r404, %r403, %r370; - shf.l.wrap.b32 %r405, %r404, %r404, 16; - add.s32 %r406, %r405, %r385; - xor.b32 %r407, %r406, %r401; - shf.l.wrap.b32 %r408, %r407, %r407, 20; - add.s32 %r409, %r403, %r93; - add.s32 %r410, %r409, %r408; - xor.b32 %r411, %r410, %r405; - shf.l.wrap.b32 %r412, %r411, %r411, 24; - add.s32 %r413, %r412, %r406; - xor.b32 %r414, %r413, %r408; - shf.l.wrap.b32 %r415, %r414, %r414, 25; - add.s32 %r416, %r368, %r137; - add.s32 %r417, %r416, %r359; - xor.b32 %r418, %r417, %r384; - shf.l.wrap.b32 %r419, %r418, %r418, 16; - add.s32 %r420, %r419, %r399; - xor.b32 %r421, %r420, %r359; - shf.l.wrap.b32 %r422, %r421, %r421, 20; - add.s32 %r423, %r417, %r153; - add.s32 %r424, %r423, %r422; - xor.b32 %r425, %r424, %r419; - shf.l.wrap.b32 %r426, %r425, %r425, 24; - add.s32 %r427, %r426, %r420; - xor.b32 %r428, %r427, %r422; - shf.l.wrap.b32 %r429, %r428, %r428, 25; - add.s32 %r430, %r382, %r161; - add.s32 %r431, %r430, %r373; - xor.b32 %r432, %r431, %r398; - shf.l.wrap.b32 %r433, %r432, %r432, 16; - add.s32 %r434, %r433, %r357; - xor.b32 %r435, %r434, %r373; - shf.l.wrap.b32 %r436, %r435, %r435, 20; - add.s32 %r437, %r431, %r81; - add.s32 %r438, %r437, %r436; - xor.b32 %r439, %r438, %r433; - shf.l.wrap.b32 %r440, %r439, %r439, 24; - add.s32 %r441, %r440, %r434; - xor.b32 %r442, %r441, %r436; - shf.l.wrap.b32 %r443, %r442, %r442, 25; - add.s32 %r444, %r396, %r113; - add.s32 %r445, %r444, %r387; - xor.b32 %r446, %r445, %r356; - shf.l.wrap.b32 %r447, %r446, %r446, 16; - add.s32 %r448, %r447, %r371; - xor.b32 %r449, %r448, %r387; - shf.l.wrap.b32 %r450, %r449, %r449, 20; - add.s32 %r451, %r445, %r169; - add.s32 %r452, %r451, %r450; - xor.b32 %r453, %r452, %r447; - shf.l.wrap.b32 %r454, %r453, %r453, 24; - add.s32 %r455, %r454, %r448; - xor.b32 %r456, %r455, %r450; - shf.l.wrap.b32 %r457, %r456, %r456, 25; - add.s32 %r458, %r410, %r105; - add.s32 %r459, %r458, %r429; - xor.b32 %r460, %r459, %r454; - shf.l.wrap.b32 %r461, %r460, %r460, 16; - add.s32 %r462, %r461, %r441; - xor.b32 %r463, %r462, %r429; - shf.l.wrap.b32 %r464, %r463, %r463, 20; - add.s32 %r465, %r459, %r101; - add.s32 %r466, %r465, %r464; - xor.b32 %r467, %r466, %r461; - shf.l.wrap.b32 %r468, %r467, %r467, 24; - add.s32 %r469, %r468, %r462; - xor.b32 %r470, %r469, %r464; - shf.l.wrap.b32 %r471, %r470, %r470, 25; - add.s32 %r472, %r424, %r129; - add.s32 %r473, 
%r472, %r443; - xor.b32 %r474, %r473, %r412; - shf.l.wrap.b32 %r475, %r474, %r474, 16; - add.s32 %r476, %r475, %r455; - xor.b32 %r477, %r476, %r443; - shf.l.wrap.b32 %r478, %r477, %r477, 20; - add.s32 %r479, %r473, %r69; - add.s32 %r480, %r479, %r478; - xor.b32 %r481, %r480, %r475; - shf.l.wrap.b32 %r482, %r481, %r481, 24; - add.s32 %r483, %r482, %r476; - xor.b32 %r484, %r483, %r478; - shf.l.wrap.b32 %r485, %r484, %r484, 25; - add.s32 %r486, %r438, %r145; - add.s32 %r487, %r486, %r457; - xor.b32 %r488, %r487, %r426; - shf.l.wrap.b32 %r489, %r488, %r488, 16; - add.s32 %r490, %r489, %r413; - xor.b32 %r491, %r490, %r457; - shf.l.wrap.b32 %r492, %r491, %r491, 20; - add.s32 %r493, %r487, %r177; - add.s32 %r494, %r493, %r492; - xor.b32 %r495, %r494, %r489; - shf.l.wrap.b32 %r496, %r495, %r495, 24; - add.s32 %r497, %r496, %r490; - xor.b32 %r498, %r497, %r492; - shf.l.wrap.b32 %r499, %r498, %r498, 25; - add.s32 %r500, %r452, %r121; - add.s32 %r501, %r500, %r415; - xor.b32 %r502, %r501, %r440; - shf.l.wrap.b32 %r503, %r502, %r502, 16; - add.s32 %r504, %r503, %r427; - xor.b32 %r505, %r504, %r415; - shf.l.wrap.b32 %r506, %r505, %r505, 20; - add.s32 %r507, %r501, %r77; - add.s32 %r508, %r507, %r506; - xor.b32 %r509, %r508, %r503; - shf.l.wrap.b32 %r510, %r509, %r509, 24; - add.s32 %r511, %r510, %r504; - xor.b32 %r512, %r511, %r506; - shf.l.wrap.b32 %r513, %r512, %r512, 25; - add.s32 %r514, %r466, %r137; - add.s32 %r515, %r514, %r513; - xor.b32 %r516, %r515, %r482; - shf.l.wrap.b32 %r517, %r516, %r516, 16; - add.s32 %r518, %r517, %r497; - xor.b32 %r519, %r518, %r513; - shf.l.wrap.b32 %r520, %r519, %r519, 20; - add.s32 %r521, %r515, %r113; - add.s32 %r522, %r521, %r520; - xor.b32 %r523, %r522, %r517; - shf.l.wrap.b32 %r524, %r523, %r523, 24; - add.s32 %r525, %r524, %r518; - xor.b32 %r526, %r525, %r520; - shf.l.wrap.b32 %r527, %r526, %r526, 25; - add.s32 %r528, %r480, %r153; - add.s32 %r529, %r528, %r471; - xor.b32 %r530, %r529, %r496; - shf.l.wrap.b32 %r531, %r530, %r530, 16; - add.s32 %r532, %r531, %r511; - xor.b32 %r533, %r532, %r471; - shf.l.wrap.b32 %r534, %r533, %r533, 20; - add.s32 %r535, %r529, %r129; - add.s32 %r536, %r535, %r534; - xor.b32 %r537, %r536, %r531; - shf.l.wrap.b32 %r538, %r537, %r537, 24; - add.s32 %r539, %r538, %r532; - xor.b32 %r540, %r539, %r534; - shf.l.wrap.b32 %r541, %r540, %r540, 25; - add.s32 %r542, %r494, %r169; - add.s32 %r543, %r542, %r485; - xor.b32 %r544, %r543, %r510; - shf.l.wrap.b32 %r545, %r544, %r544, 16; - add.s32 %r546, %r545, %r469; - xor.b32 %r547, %r546, %r485; - shf.l.wrap.b32 %r548, %r547, %r547, 20; - add.s32 %r549, %r543, %r89; - add.s32 %r550, %r549, %r548; - xor.b32 %r551, %r550, %r545; - shf.l.wrap.b32 %r552, %r551, %r551, 24; - add.s32 %r553, %r552, %r546; - xor.b32 %r554, %r553, %r548; - shf.l.wrap.b32 %r555, %r554, %r554, 25; - add.s32 %r556, %r508, %r161; - add.s32 %r557, %r556, %r499; - xor.b32 %r558, %r557, %r468; - shf.l.wrap.b32 %r559, %r558, %r558, 16; - add.s32 %r560, %r559, %r483; - xor.b32 %r561, %r560, %r499; - shf.l.wrap.b32 %r562, %r561, %r561, 20; - add.s32 %r563, %r557, %r177; - add.s32 %r564, %r563, %r562; - xor.b32 %r565, %r564, %r559; - shf.l.wrap.b32 %r566, %r565, %r565, 24; - add.s32 %r567, %r566, %r560; - xor.b32 %r568, %r567, %r562; - shf.l.wrap.b32 %r569, %r568, %r568, 25; - add.s32 %r570, %r522, %r93; - add.s32 %r571, %r570, %r541; - xor.b32 %r572, %r571, %r566; - shf.l.wrap.b32 %r573, %r572, %r572, 16; - add.s32 %r574, %r573, %r553; - xor.b32 %r575, %r574, %r541; - shf.l.wrap.b32 %r576, %r575, %r575, 20; - add.s32 %r577, 
%r571, %r69; - add.s32 %r578, %r577, %r576; - xor.b32 %r579, %r578, %r573; - shf.l.wrap.b32 %r580, %r579, %r579, 24; - add.s32 %r581, %r580, %r574; - xor.b32 %r582, %r581, %r576; - shf.l.wrap.b32 %r583, %r582, %r582, 25; - add.s32 %r584, %r536, %r145; - add.s32 %r585, %r584, %r555; - xor.b32 %r586, %r585, %r524; - shf.l.wrap.b32 %r587, %r586, %r586, 16; - add.s32 %r588, %r587, %r567; - xor.b32 %r589, %r588, %r555; - shf.l.wrap.b32 %r590, %r589, %r589, 20; - add.s32 %r591, %r585, %r81; - add.s32 %r592, %r591, %r590; - xor.b32 %r593, %r592, %r587; - shf.l.wrap.b32 %r594, %r593, %r593, 24; - add.s32 %r595, %r594, %r588; - xor.b32 %r596, %r595, %r590; - shf.l.wrap.b32 %r597, %r596, %r596, 25; - add.s32 %r598, %r550, %r101; - add.s32 %r599, %r598, %r569; - xor.b32 %r600, %r599, %r538; - shf.l.wrap.b32 %r601, %r600, %r600, 16; - add.s32 %r602, %r601, %r525; - xor.b32 %r603, %r602, %r569; - shf.l.wrap.b32 %r604, %r603, %r603, 20; - add.s32 %r605, %r599, %r121; - add.s32 %r606, %r605, %r604; - xor.b32 %r607, %r606, %r601; - shf.l.wrap.b32 %r608, %r607, %r607, 24; - add.s32 %r609, %r608, %r602; - xor.b32 %r610, %r609, %r604; - shf.l.wrap.b32 %r611, %r610, %r610, 25; - add.s32 %r612, %r564, %r77; - add.s32 %r613, %r612, %r527; - xor.b32 %r614, %r613, %r552; - shf.l.wrap.b32 %r615, %r614, %r614, 16; - add.s32 %r616, %r615, %r539; - xor.b32 %r617, %r616, %r527; - shf.l.wrap.b32 %r618, %r617, %r617, 20; - add.s32 %r619, %r613, %r105; - add.s32 %r620, %r619, %r618; - xor.b32 %r621, %r620, %r615; - shf.l.wrap.b32 %r622, %r621, %r621, 24; - add.s32 %r623, %r622, %r616; - xor.b32 %r624, %r623, %r618; - shf.l.wrap.b32 %r625, %r624, %r624, 25; - add.s32 %r626, %r578, %r153; - add.s32 %r627, %r626, %r625; - xor.b32 %r628, %r627, %r594; - shf.l.wrap.b32 %r629, %r628, %r628, 16; - add.s32 %r630, %r629, %r609; - xor.b32 %r631, %r630, %r625; - shf.l.wrap.b32 %r632, %r631, %r631, 20; - add.s32 %r633, %r627, %r161; - add.s32 %r634, %r633, %r632; - xor.b32 %r635, %r634, %r629; - shf.l.wrap.b32 %r636, %r635, %r635, 24; - add.s32 %r637, %r636, %r630; - xor.b32 %r638, %r637, %r632; - shf.l.wrap.b32 %r639, %r638, %r638, 25; - add.s32 %r640, %r592, %r129; - add.s32 %r641, %r640, %r583; - xor.b32 %r642, %r641, %r608; - shf.l.wrap.b32 %r643, %r642, %r642, 16; - add.s32 %r644, %r643, %r623; - xor.b32 %r645, %r644, %r583; - shf.l.wrap.b32 %r646, %r645, %r645, 20; - add.s32 %r647, %r641, %r145; - add.s32 %r648, %r647, %r646; - xor.b32 %r649, %r648, %r643; - shf.l.wrap.b32 %r650, %r649, %r649, 24; - add.s32 %r651, %r650, %r644; - xor.b32 %r652, %r651, %r646; - shf.l.wrap.b32 %r653, %r652, %r652, 25; - add.s32 %r654, %r606, %r177; - add.s32 %r655, %r654, %r597; - xor.b32 %r656, %r655, %r622; - shf.l.wrap.b32 %r657, %r656, %r656, 16; - add.s32 %r658, %r657, %r581; - xor.b32 %r659, %r658, %r597; - shf.l.wrap.b32 %r660, %r659, %r659, 20; - add.s32 %r661, %r655, %r137; - add.s32 %r662, %r661, %r660; - xor.b32 %r663, %r662, %r657; - shf.l.wrap.b32 %r664, %r663, %r663, 24; - add.s32 %r665, %r664, %r658; - xor.b32 %r666, %r665, %r660; - shf.l.wrap.b32 %r667, %r666, %r666, 25; - add.s32 %r668, %r620, %r169; - add.s32 %r669, %r668, %r611; - xor.b32 %r670, %r669, %r580; - shf.l.wrap.b32 %r671, %r670, %r670, 16; - add.s32 %r672, %r671, %r595; - xor.b32 %r673, %r672, %r611; - shf.l.wrap.b32 %r674, %r673, %r673, 20; - add.s32 %r675, %r669, %r121; - add.s32 %r676, %r675, %r674; - xor.b32 %r677, %r676, %r671; - shf.l.wrap.b32 %r678, %r677, %r677, 24; - add.s32 %r679, %r678, %r672; - xor.b32 %r680, %r679, %r674; - shf.l.wrap.b32 %r681, %r680, 
%r680, 25; - add.s32 %r682, %r634, %r113; - add.s32 %r683, %r682, %r653; - xor.b32 %r684, %r683, %r678; - shf.l.wrap.b32 %r685, %r684, %r684, 16; - add.s32 %r686, %r685, %r665; - xor.b32 %r687, %r686, %r653; - shf.l.wrap.b32 %r688, %r687, %r687, 20; - add.s32 %r689, %r683, %r81; - add.s32 %r690, %r689, %r688; - xor.b32 %r691, %r690, %r685; - shf.l.wrap.b32 %r692, %r691, %r691, 24; - add.s32 %r693, %r692, %r686; - xor.b32 %r694, %r693, %r688; - shf.l.wrap.b32 %r695, %r694, %r694, 25; - add.s32 %r696, %r648, %r101; - add.s32 %r697, %r696, %r667; - xor.b32 %r698, %r697, %r636; - shf.l.wrap.b32 %r699, %r698, %r698, 16; - add.s32 %r700, %r699, %r679; - xor.b32 %r701, %r700, %r667; - shf.l.wrap.b32 %r702, %r701, %r701, 20; - add.s32 %r703, %r697, %r89; - add.s32 %r704, %r703, %r702; - xor.b32 %r705, %r704, %r699; - shf.l.wrap.b32 %r706, %r705, %r705, 24; - add.s32 %r707, %r706, %r700; - xor.b32 %r708, %r707, %r702; - shf.l.wrap.b32 %r709, %r708, %r708, 25; - add.s32 %r710, %r662, %r69; - add.s32 %r711, %r710, %r681; - xor.b32 %r712, %r711, %r650; - shf.l.wrap.b32 %r713, %r712, %r712, 16; - add.s32 %r714, %r713, %r637; - xor.b32 %r715, %r714, %r681; - shf.l.wrap.b32 %r716, %r715, %r715, 20; - add.s32 %r717, %r711, %r77; - add.s32 %r718, %r717, %r716; - xor.b32 %r719, %r718, %r713; - shf.l.wrap.b32 %r720, %r719, %r719, 24; - add.s32 %r721, %r720, %r714; - xor.b32 %r722, %r721, %r716; - shf.l.wrap.b32 %r723, %r722, %r722, 25; - add.s32 %r724, %r676, %r105; - add.s32 %r725, %r724, %r639; - xor.b32 %r726, %r725, %r664; - shf.l.wrap.b32 %r727, %r726, %r726, 16; - add.s32 %r728, %r727, %r651; - xor.b32 %r729, %r728, %r639; - shf.l.wrap.b32 %r730, %r729, %r729, 20; - add.s32 %r731, %r725, %r93; - add.s32 %r732, %r731, %r730; - xor.b32 %r733, %r732, %r727; - shf.l.wrap.b32 %r734, %r733, %r733, 24; - add.s32 %r735, %r734, %r728; - xor.b32 %r736, %r735, %r730; - shf.l.wrap.b32 %r737, %r736, %r736, 25; - add.s32 %r738, %r690, %r129; - add.s32 %r739, %r738, %r737; - xor.b32 %r740, %r739, %r706; - shf.l.wrap.b32 %r741, %r740, %r740, 16; - add.s32 %r742, %r741, %r721; - xor.b32 %r743, %r742, %r737; - shf.l.wrap.b32 %r744, %r743, %r743, 20; - add.s32 %r745, %r739, %r169; - add.s32 %r746, %r745, %r744; - xor.b32 %r747, %r746, %r741; - shf.l.wrap.b32 %r748, %r747, %r747, 24; - add.s32 %r749, %r748, %r742; - xor.b32 %r750, %r749, %r744; - shf.l.wrap.b32 %r751, %r750, %r750, 25; - add.s32 %r752, %r704, %r145; - add.s32 %r753, %r752, %r695; - xor.b32 %r754, %r753, %r720; - shf.l.wrap.b32 %r755, %r754, %r754, 16; - add.s32 %r756, %r755, %r735; - xor.b32 %r757, %r756, %r695; - shf.l.wrap.b32 %r758, %r757, %r757, 20; - add.s32 %r759, %r753, %r101; - add.s32 %r760, %r759, %r758; - xor.b32 %r761, %r760, %r755; - shf.l.wrap.b32 %r762, %r761, %r761, 24; - add.s32 %r763, %r762, %r756; - xor.b32 %r764, %r763, %r758; - shf.l.wrap.b32 %r765, %r764, %r764, 25; - add.s32 %r766, %r718, %r121; - add.s32 %r767, %r766, %r709; - xor.b32 %r768, %r767, %r734; - shf.l.wrap.b32 %r769, %r768, %r768, 16; - add.s32 %r770, %r769, %r693; - xor.b32 %r771, %r770, %r709; - shf.l.wrap.b32 %r772, %r771, %r771, 20; - add.s32 %r773, %r767, %r153; - add.s32 %r774, %r773, %r772; - xor.b32 %r775, %r774, %r769; - shf.l.wrap.b32 %r776, %r775, %r775, 24; - add.s32 %r777, %r776, %r770; - xor.b32 %r778, %r777, %r772; - shf.l.wrap.b32 %r779, %r778, %r778, 25; - add.s32 %r780, %r732, %r177; - add.s32 %r781, %r780, %r723; - xor.b32 %r782, %r781, %r692; - shf.l.wrap.b32 %r783, %r782, %r782, 16; - add.s32 %r784, %r783, %r707; - xor.b32 %r785, %r784, %r723; - 
shf.l.wrap.b32 %r786, %r785, %r785, 20; - add.s32 %r787, %r781, %r77; - add.s32 %r788, %r787, %r786; - xor.b32 %r789, %r788, %r783; - shf.l.wrap.b32 %r790, %r789, %r789, 24; - add.s32 %r791, %r790, %r784; - xor.b32 %r792, %r791, %r786; - shf.l.wrap.b32 %r793, %r792, %r792, 25; - add.s32 %r794, %r746, %r161; - add.s32 %r795, %r794, %r765; - xor.b32 %r796, %r795, %r790; - shf.l.wrap.b32 %r797, %r796, %r796, 16; - add.s32 %r798, %r797, %r777; - xor.b32 %r799, %r798, %r765; - shf.l.wrap.b32 %r800, %r799, %r799, 20; - add.s32 %r801, %r795, %r89; - add.s32 %r802, %r801, %r800; - xor.b32 %r803, %r802, %r797; - shf.l.wrap.b32 %r804, %r803, %r803, 24; - add.s32 %r805, %r804, %r798; - xor.b32 %r806, %r805, %r800; - shf.l.wrap.b32 %r807, %r806, %r806, 25; - add.s32 %r808, %r760, %r69; - add.s32 %r809, %r808, %r779; - xor.b32 %r810, %r809, %r748; - shf.l.wrap.b32 %r811, %r810, %r810, 16; - add.s32 %r812, %r811, %r791; - xor.b32 %r813, %r812, %r779; - shf.l.wrap.b32 %r814, %r813, %r813, 20; - add.s32 %r815, %r809, %r137; - add.s32 %r816, %r815, %r814; - xor.b32 %r817, %r816, %r811; - shf.l.wrap.b32 %r818, %r817, %r817, 24; - add.s32 %r819, %r818, %r812; - xor.b32 %r820, %r819, %r814; - shf.l.wrap.b32 %r821, %r820, %r820, 25; - add.s32 %r822, %r774, %r81; - add.s32 %r823, %r822, %r793; - xor.b32 %r824, %r823, %r762; - shf.l.wrap.b32 %r825, %r824, %r824, 16; - add.s32 %r826, %r825, %r749; - xor.b32 %r827, %r826, %r793; - shf.l.wrap.b32 %r828, %r827, %r827, 20; - add.s32 %r829, %r823, %r105; - add.s32 %r830, %r829, %r828; - xor.b32 %r831, %r830, %r825; - shf.l.wrap.b32 %r832, %r831, %r831, 24; - add.s32 %r833, %r832, %r826; - xor.b32 %r834, %r833, %r828; - shf.l.wrap.b32 %r835, %r834, %r834, 25; - add.s32 %r836, %r788, %r93; - add.s32 %r837, %r836, %r751; - xor.b32 %r838, %r837, %r776; - shf.l.wrap.b32 %r839, %r838, %r838, 16; - add.s32 %r840, %r839, %r763; - xor.b32 %r841, %r840, %r751; - shf.l.wrap.b32 %r842, %r841, %r841, 20; - add.s32 %r843, %r837, %r113; - add.s32 %r844, %r843, %r842; - xor.b32 %r845, %r844, %r839; - shf.l.wrap.b32 %r846, %r845, %r845, 24; - add.s32 %r847, %r846, %r840; - xor.b32 %r848, %r847, %r842; - shf.l.wrap.b32 %r849, %r848, %r848, 25; - add.s32 %r850, %r802, %r145; - add.s32 %r851, %r850, %r849; - xor.b32 %r852, %r851, %r818; - shf.l.wrap.b32 %r853, %r852, %r852, 16; - add.s32 %r854, %r853, %r833; - xor.b32 %r855, %r854, %r849; - shf.l.wrap.b32 %r856, %r855, %r855, 20; - add.s32 %r857, %r851, %r177; - add.s32 %r858, %r857, %r856; - xor.b32 %r859, %r858, %r853; - shf.l.wrap.b32 %r860, %r859, %r859, 24; - add.s32 %r861, %r860, %r854; - xor.b32 %r862, %r861, %r856; - shf.l.wrap.b32 %r863, %r862, %r862, 25; - add.s32 %r864, %r816, %r101; - add.s32 %r865, %r864, %r807; - xor.b32 %r866, %r865, %r832; - shf.l.wrap.b32 %r867, %r866, %r866, 16; - add.s32 %r868, %r867, %r847; - xor.b32 %r869, %r868, %r807; - shf.l.wrap.b32 %r870, %r869, %r869, 20; - add.s32 %r871, %r865, %r69; - add.s32 %r872, %r871, %r870; - xor.b32 %r873, %r872, %r867; - shf.l.wrap.b32 %r874, %r873, %r873, 24; - add.s32 %r875, %r874, %r868; - xor.b32 %r876, %r875, %r870; - shf.l.wrap.b32 %r877, %r876, %r876, 25; - add.s32 %r878, %r830, %r77; - add.s32 %r879, %r878, %r821; - xor.b32 %r880, %r879, %r846; - shf.l.wrap.b32 %r881, %r880, %r880, 16; - add.s32 %r882, %r881, %r805; - xor.b32 %r883, %r882, %r821; - shf.l.wrap.b32 %r884, %r883, %r883, 20; - add.s32 %r885, %r879, %r129; - add.s32 %r886, %r885, %r884; - xor.b32 %r887, %r886, %r881; - shf.l.wrap.b32 %r888, %r887, %r887, 24; - add.s32 %r889, %r888, %r882; - xor.b32 
%r890, %r889, %r884; - shf.l.wrap.b32 %r891, %r890, %r890, 25; - add.s32 %r892, %r844, %r121; - add.s32 %r893, %r892, %r835; - xor.b32 %r894, %r893, %r804; - shf.l.wrap.b32 %r895, %r894, %r894, 16; - add.s32 %r896, %r895, %r819; - xor.b32 %r897, %r896, %r835; - shf.l.wrap.b32 %r898, %r897, %r897, 20; - add.s32 %r899, %r893, %r105; - add.s32 %r900, %r899, %r898; - xor.b32 %r901, %r900, %r895; - shf.l.wrap.b32 %r902, %r901, %r901, 24; - add.s32 %r903, %r902, %r896; - xor.b32 %r904, %r903, %r898; - shf.l.wrap.b32 %r905, %r904, %r904, 25; - add.s32 %r906, %r858, %r169; - add.s32 %r907, %r906, %r877; - xor.b32 %r908, %r907, %r902; - shf.l.wrap.b32 %r909, %r908, %r908, 16; - add.s32 %r910, %r909, %r889; - xor.b32 %r911, %r910, %r877; - shf.l.wrap.b32 %r912, %r911, %r911, 20; - add.s32 %r913, %r907, %r137; - add.s32 %r914, %r913, %r912; - xor.b32 %r915, %r914, %r909; - shf.l.wrap.b32 %r916, %r915, %r915, 24; - add.s32 %r917, %r916, %r910; - xor.b32 %r918, %r917, %r912; - shf.l.wrap.b32 %r919, %r918, %r918, 25; - add.s32 %r920, %r872, %r81; - add.s32 %r921, %r920, %r891; - xor.b32 %r922, %r921, %r860; - shf.l.wrap.b32 %r923, %r922, %r922, 16; - add.s32 %r924, %r923, %r903; - xor.b32 %r925, %r924, %r891; - shf.l.wrap.b32 %r926, %r925, %r925, 20; - add.s32 %r927, %r921, %r153; - add.s32 %r928, %r927, %r926; - xor.b32 %r929, %r928, %r923; - shf.l.wrap.b32 %r930, %r929, %r929, 24; - add.s32 %r931, %r930, %r924; - xor.b32 %r932, %r931, %r926; - shf.l.wrap.b32 %r933, %r932, %r932, 25; - add.s32 %r934, %r886, %r89; - add.s32 %r935, %r934, %r905; - xor.b32 %r936, %r935, %r874; - shf.l.wrap.b32 %r937, %r936, %r936, 16; - add.s32 %r938, %r937, %r861; - xor.b32 %r939, %r938, %r905; - shf.l.wrap.b32 %r940, %r939, %r939, 20; - add.s32 %r941, %r935, %r93; - add.s32 %r942, %r941, %r940; - xor.b32 %r943, %r942, %r937; - shf.l.wrap.b32 %r944, %r943, %r943, 24; - add.s32 %r945, %r944, %r938; - xor.b32 %r946, %r945, %r940; - shf.l.wrap.b32 %r947, %r946, %r946, 25; - add.s32 %r948, %r900, %r113; - add.s32 %r949, %r948, %r863; - xor.b32 %r950, %r949, %r888; - shf.l.wrap.b32 %r951, %r950, %r950, 16; - add.s32 %r952, %r951, %r875; - xor.b32 %r953, %r952, %r863; - shf.l.wrap.b32 %r954, %r953, %r953, 20; - add.s32 %r955, %r949, %r161; - add.s32 %r956, %r955, %r954; - xor.b32 %r957, %r956, %r951; - shf.l.wrap.b32 %r958, %r957, %r957, 24; - add.s32 %r959, %r958, %r952; - xor.b32 %r960, %r959, %r954; - shf.l.wrap.b32 %r961, %r960, %r960, 25; - xor.b32 %r1, %r945, %r914; - xor.b32 %r2, %r959, %r928; - st.local.v2.u32 [%rd3+32], {%r1, %r2}; - xor.b32 %r3, %r917, %r942; - xor.b32 %r4, %r956, %r931; - st.local.v2.u32 [%rd3+40], {%r3, %r4}; - xor.b32 %r5, %r961, %r930; - xor.b32 %r6, %r919, %r944; - st.local.v2.u32 [%rd3+48], {%r5, %r6}; - xor.b32 %r7, %r958, %r933; - xor.b32 %r8, %r947, %r916; - st.local.v2.u32 [%rd3+56], {%r7, %r8}; - st.local.u64 [%rd3+72], %rd131; - st.local.u64 [%rd3+80], %rd8; - add.s16 %rs1, %rs61, 16; - and.b16 %rs70, %rs1, 255; - add.s16 %rs71, %rs62, 1; - st.local.v2.u8 [%rd3+136], {%rs1, %rs71}; - cvt.u32.u16 %r962, %rs71; - cvt.u32.u16 %r963, %rs70; - prmt.b32 %r964, %r962, %r963, 30212; - cvt.u16.u32 %rs72, %r964; - shr.u16 %rs2, %rs72, 8; - mov.b32 {%rs5, %rs6}, %r53; - mov.b32 {%rs3, %rs4}, %r52; - mov.b32 {%rs9, %rs10}, %r27; - mov.b32 {%rs7, %rs8}, %r26; - setp.eq.s16 %p10, %rs2, 0; - selp.u16 %rs73, 1, 0, %p10; - shr.u16 %rs74, %rs3, 8; - shr.u16 %rs75, %rs4, 8; - shr.u16 %rs76, %rs5, 8; - shr.u16 %rs77, %rs6, 8; - shr.u16 %rs78, %rs7, 8; - shr.u16 %rs79, %rs8, 8; - shr.u16 %rs80, %rs9, 8; - 
shr.u16 %rs81, %rs10, 8; - or.b16 %rs82, %rs73, 10; - cvt.u32.u16 %r965, %rs3; - and.b32 %r966, %r965, 255; - cvt.u32.u16 %r967, %rs74; - prmt.b32 %r968, %r967, %r966, 30212; - cvt.u32.u16 %r969, %rs4; - prmt.b32 %r970, %r969, %r968, 28756; - cvt.u32.u16 %r971, %rs75; - prmt.b32 %r972, %r971, %r970, 1620; - cvt.u32.u16 %r973, %rs5; - and.b32 %r974, %r973, 255; - cvt.u32.u16 %r975, %rs76; - prmt.b32 %r976, %r975, %r974, 30212; - cvt.u32.u16 %r977, %rs6; - prmt.b32 %r978, %r977, %r976, 28756; - cvt.u32.u16 %r979, %rs77; - prmt.b32 %r980, %r979, %r978, 1620; - cvt.u32.u16 %r981, %rs7; - and.b32 %r982, %r981, 255; - cvt.u32.u16 %r983, %rs78; - prmt.b32 %r984, %r983, %r982, 30212; - cvt.u32.u16 %r985, %rs8; - prmt.b32 %r986, %r985, %r984, 28756; - cvt.u32.u16 %r987, %rs79; - prmt.b32 %r988, %r987, %r986, 1620; - cvt.u32.u16 %r989, %rs9; - and.b32 %r990, %r989, 255; - cvt.u32.u16 %r991, %rs80; - prmt.b32 %r992, %r991, %r990, 30212; - cvt.u32.u16 %r993, %rs10; - prmt.b32 %r994, %r993, %r992, 28756; - cvt.u32.u16 %r995, %rs81; - prmt.b32 %r996, %r995, %r994, 1620; - cvt.u32.u16 %r997, %rs82; - add.s32 %r998, %r5, %r1; - add.s32 %r999, %r998, %r972; - add.s32 %r1000, %r980, %r999; - add.s32 %r1001, %r6, %r2; - add.s32 %r1002, %r1001, %r988; - add.s32 %r1003, %r996, %r1002; - add.s32 %r1004, %r7, %r3; - cvt.u32.u16 %r1005, %rs1; - and.b32 %r1006, %r1005, 255; - xor.b32 %r1007, %r1004, %r1006; - shr.u32 %r1008, %r1004, 16; - shl.b32 %r1009, %r1007, 16; - or.b32 %r1010, %r1009, %r1008; - add.s32 %r1011, %r1010, 1013904242; - xor.b32 %r1012, %r1011, %r7; - shf.l.wrap.b32 %r1013, %r1012, %r1012, 20; - add.s32 %r1014, %r1004, %r1013; - xor.b32 %r1015, %r1014, %r1010; - shf.l.wrap.b32 %r1016, %r1015, %r1015, 24; - add.s32 %r1017, %r1016, %r1011; - xor.b32 %r1018, %r1017, %r1013; - shf.l.wrap.b32 %r1019, %r1018, %r1018, 25; - add.s32 %r1020, %r8, %r4; - xor.b32 %r1021, %r1020, %r997; - shr.u32 %r1022, %r1020, 16; - shl.b32 %r1023, %r1021, 16; - or.b32 %r1024, %r1023, %r1022; - add.s32 %r1025, %r1024, -1521486534; - xor.b32 %r1026, %r1025, %r8; - shf.l.wrap.b32 %r1027, %r1026, %r1026, 20; - add.s32 %r1028, %r1020, %r1027; - xor.b32 %r1029, %r1028, %r1024; - shf.l.wrap.b32 %r1030, %r1029, %r1029, 24; - add.s32 %r1031, %r1030, %r1025; - xor.b32 %r1032, %r1031, %r1027; - shf.l.wrap.b32 %r1033, %r1032, %r1032, 25; - add.s32 %r1034, %r1033, %r1014; - shf.l.wrap.b32 %r1035, %r999, %r999, 16; - add.s32 %r1036, %r1035, 1779033703; - xor.b32 %r1037, %r1036, %r5; - shf.l.wrap.b32 %r1038, %r1037, %r1037, 20; - add.s32 %r1039, %r1000, %r1038; - xor.b32 %r1040, %r1039, %r1035; - shf.l.wrap.b32 %r1041, %r1040, %r1040, 24; - add.s32 %r1042, %r1041, %r1036; - xor.b32 %r1043, %r1042, %r1038; - shf.l.wrap.b32 %r1044, %r1043, %r1043, 25; - shf.l.wrap.b32 %r1045, %r1002, %r1002, 16; - add.s32 %r1046, %r1045, -1150833019; - xor.b32 %r1047, %r1046, %r6; - shf.l.wrap.b32 %r1048, %r1047, %r1047, 20; - add.s32 %r1049, %r1003, %r1048; - xor.b32 %r1050, %r1049, %r1045; - shf.l.wrap.b32 %r1051, %r1050, %r1050, 24; - add.s32 %r1052, %r1051, %r1046; - xor.b32 %r1053, %r1052, %r1048; - shf.l.wrap.b32 %r1054, %r1053, %r1053, 25; - add.s32 %r1055, %r1039, %r1054; - xor.b32 %r1056, %r1055, %r1030; - shf.l.wrap.b32 %r1057, %r1056, %r1056, 16; - add.s32 %r1058, %r1057, %r1017; - xor.b32 %r1059, %r1058, %r1054; - shf.l.wrap.b32 %r1060, %r1059, %r1059, 20; - add.s32 %r1061, %r1055, %r1060; - xor.b32 %r1062, %r1061, %r1057; - shf.l.wrap.b32 %r1063, %r1062, %r1062, 24; - add.s32 %r1064, %r1063, %r1058; - xor.b32 %r1065, %r1064, %r1060; - 
shf.l.wrap.b32 %r1066, %r1065, %r1065, 25; - add.s32 %r1067, %r1019, %r1049; - xor.b32 %r1068, %r1041, %r1067; - shf.l.wrap.b32 %r1069, %r1068, %r1068, 16; - add.s32 %r1070, %r1069, %r1031; - xor.b32 %r1071, %r1070, %r1019; - shf.l.wrap.b32 %r1072, %r1071, %r1071, 20; - add.s32 %r1073, %r1067, %r1072; - xor.b32 %r1074, %r1073, %r1069; - shf.l.wrap.b32 %r1075, %r1074, %r1074, 24; - add.s32 %r1076, %r1075, %r1070; - xor.b32 %r1077, %r1076, %r1072; - shf.l.wrap.b32 %r1078, %r1077, %r1077, 25; - xor.b32 %r1079, %r1051, %r1034; - shf.l.wrap.b32 %r1080, %r1079, %r1079, 16; - add.s32 %r1081, %r1080, %r1042; - xor.b32 %r1082, %r1081, %r1033; - shf.l.wrap.b32 %r1083, %r1082, %r1082, 20; - add.s32 %r1084, %r1034, %r1083; - xor.b32 %r1085, %r1084, %r1080; - shf.l.wrap.b32 %r1086, %r1085, %r1085, 24; - add.s32 %r1087, %r1086, %r1081; - xor.b32 %r1088, %r1087, %r1083; - shf.l.wrap.b32 %r1089, %r1088, %r1088, 25; - add.s32 %r1090, %r1028, %r1044; - xor.b32 %r1091, %r1090, %r1016; - shf.l.wrap.b32 %r1092, %r1091, %r1091, 16; - add.s32 %r1093, %r1092, %r1052; - xor.b32 %r1094, %r1093, %r1044; - shf.l.wrap.b32 %r1095, %r1094, %r1094, 20; - add.s32 %r1096, %r1090, %r1095; - xor.b32 %r1097, %r1096, %r1092; - shf.l.wrap.b32 %r1098, %r1097, %r1097, 24; - add.s32 %r1099, %r1098, %r1093; - xor.b32 %r1100, %r1099, %r1095; - shf.l.wrap.b32 %r1101, %r1100, %r1100, 25; - add.s32 %r1102, %r1061, %r988; - add.s32 %r1103, %r1102, %r1101; - xor.b32 %r1104, %r1103, %r1075; - shf.l.wrap.b32 %r1105, %r1104, %r1104, 16; - add.s32 %r1106, %r1105, %r1087; - xor.b32 %r1107, %r1106, %r1101; - shf.l.wrap.b32 %r1108, %r1107, %r1107, 20; - add.s32 %r1109, %r1103, %r1108; - xor.b32 %r1110, %r1109, %r1105; - shf.l.wrap.b32 %r1111, %r1110, %r1110, 24; - add.s32 %r1112, %r1111, %r1106; - xor.b32 %r1113, %r1112, %r1108; - shf.l.wrap.b32 %r1114, %r1113, %r1113, 25; - add.s32 %r1115, %r1073, %r996; - add.s32 %r1116, %r1115, %r1066; - xor.b32 %r1117, %r1116, %r1086; - shf.l.wrap.b32 %r1118, %r1117, %r1117, 16; - add.s32 %r1119, %r1118, %r1099; - xor.b32 %r1120, %r1119, %r1066; - shf.l.wrap.b32 %r1121, %r1120, %r1120, 20; - add.s32 %r1122, %r1116, %r1121; - xor.b32 %r1123, %r1122, %r1118; - shf.l.wrap.b32 %r1124, %r1123, %r1123, 24; - add.s32 %r1125, %r1124, %r1119; - xor.b32 %r1126, %r1125, %r1121; - shf.l.wrap.b32 %r1127, %r1126, %r1126, 25; - add.s32 %r1128, %r1084, %r1078; - xor.b32 %r1129, %r1098, %r1128; - shf.l.wrap.b32 %r1130, %r1129, %r1129, 16; - add.s32 %r1131, %r1130, %r1064; - xor.b32 %r1132, %r1131, %r1078; - shf.l.wrap.b32 %r1133, %r1132, %r1132, 20; - add.s32 %r1134, %r1128, %r972; - add.s32 %r1135, %r1134, %r1133; - xor.b32 %r1136, %r1135, %r1130; - shf.l.wrap.b32 %r1137, %r1136, %r1136, 24; - add.s32 %r1138, %r1137, %r1131; - xor.b32 %r1139, %r1138, %r1133; - shf.l.wrap.b32 %r1140, %r1139, %r1139, 25; - add.s32 %r1141, %r1096, %r1089; - xor.b32 %r1142, %r1063, %r1141; - shf.l.wrap.b32 %r1143, %r1142, %r1142, 16; - add.s32 %r1144, %r1143, %r1076; - xor.b32 %r1145, %r1144, %r1089; - shf.l.wrap.b32 %r1146, %r1145, %r1145, 20; - add.s32 %r1147, %r1141, %r1146; - xor.b32 %r1148, %r1147, %r1143; - shf.l.wrap.b32 %r1149, %r1148, %r1148, 24; - add.s32 %r1150, %r1149, %r1144; - xor.b32 %r1151, %r1150, %r1146; - shf.l.wrap.b32 %r1152, %r1151, %r1151, 25; - add.s32 %r1153, %r1109, %r980; - add.s32 %r1154, %r1153, %r1127; - xor.b32 %r1155, %r1154, %r1149; - shf.l.wrap.b32 %r1156, %r1155, %r1155, 16; - add.s32 %r1157, %r1156, %r1138; - xor.b32 %r1158, %r1157, %r1127; - shf.l.wrap.b32 %r1159, %r1158, %r1158, 20; - add.s32 %r1160, 
%r1154, %r1159; - xor.b32 %r1161, %r1160, %r1156; - shf.l.wrap.b32 %r1162, %r1161, %r1161, 24; - add.s32 %r1163, %r1162, %r1157; - xor.b32 %r1164, %r1163, %r1159; - shf.l.wrap.b32 %r1165, %r1164, %r1164, 25; - add.s32 %r1166, %r1140, %r1122; - xor.b32 %r1167, %r1111, %r1166; - shf.l.wrap.b32 %r1168, %r1167, %r1167, 16; - add.s32 %r1169, %r1168, %r1150; - xor.b32 %r1170, %r1169, %r1140; - shf.l.wrap.b32 %r1171, %r1170, %r1170, 20; - add.s32 %r1172, %r1166, %r1171; - xor.b32 %r1173, %r1172, %r1168; - shf.l.wrap.b32 %r1174, %r1173, %r1173, 24; - add.s32 %r1175, %r1174, %r1169; - xor.b32 %r1176, %r1175, %r1171; - shf.l.wrap.b32 %r1177, %r1176, %r1176, 25; - add.s32 %r1178, %r1135, %r1152; - xor.b32 %r1179, %r1124, %r1178; - shf.l.wrap.b32 %r1180, %r1179, %r1179, 16; - add.s32 %r1181, %r1180, %r1112; - xor.b32 %r1182, %r1181, %r1152; - shf.l.wrap.b32 %r1183, %r1182, %r1182, 20; - add.s32 %r1184, %r1178, %r1183; - xor.b32 %r1185, %r1184, %r1180; - shf.l.wrap.b32 %r1186, %r1185, %r1185, 24; - add.s32 %r1187, %r1186, %r1181; - xor.b32 %r1188, %r1187, %r1183; - shf.l.wrap.b32 %r1189, %r1188, %r1188, 25; - add.s32 %r1190, %r1147, %r1114; - xor.b32 %r1191, %r1190, %r1137; - shf.l.wrap.b32 %r1192, %r1191, %r1191, 16; - add.s32 %r1193, %r1192, %r1125; - xor.b32 %r1194, %r1193, %r1114; - shf.l.wrap.b32 %r1195, %r1194, %r1194, 20; - add.s32 %r1196, %r1190, %r1195; - xor.b32 %r1197, %r1196, %r1192; - shf.l.wrap.b32 %r1198, %r1197, %r1197, 24; - add.s32 %r1199, %r1198, %r1193; - xor.b32 %r1200, %r1199, %r1195; - shf.l.wrap.b32 %r1201, %r1200, %r1200, 25; - add.s32 %r1202, %r1160, %r996; - add.s32 %r1203, %r1202, %r1201; - xor.b32 %r1204, %r1203, %r1174; - shf.l.wrap.b32 %r1205, %r1204, %r1204, 16; - add.s32 %r1206, %r1205, %r1187; - xor.b32 %r1207, %r1206, %r1201; - shf.l.wrap.b32 %r1208, %r1207, %r1207, 20; - add.s32 %r1209, %r1203, %r1208; - xor.b32 %r1210, %r1209, %r1205; - shf.l.wrap.b32 %r1211, %r1210, %r1210, 24; - add.s32 %r1212, %r1211, %r1206; - xor.b32 %r1213, %r1212, %r1208; - shf.l.wrap.b32 %r1214, %r1213, %r1213, 25; - add.s32 %r1215, %r1172, %r1165; - xor.b32 %r1216, %r1215, %r1186; - shf.l.wrap.b32 %r1217, %r1216, %r1216, 16; - add.s32 %r1218, %r1217, %r1199; - xor.b32 %r1219, %r1218, %r1165; - shf.l.wrap.b32 %r1220, %r1219, %r1219, 20; - add.s32 %r1221, %r1215, %r1220; - xor.b32 %r1222, %r1221, %r1217; - shf.l.wrap.b32 %r1223, %r1222, %r1222, 24; - add.s32 %r1224, %r1223, %r1218; - xor.b32 %r1225, %r1224, %r1220; - shf.l.wrap.b32 %r1226, %r1225, %r1225, 25; - add.s32 %r1227, %r1184, %r1177; - xor.b32 %r1228, %r1198, %r1227; - shf.l.wrap.b32 %r1229, %r1228, %r1228, 16; - add.s32 %r1230, %r1229, %r1163; - xor.b32 %r1231, %r1230, %r1177; - shf.l.wrap.b32 %r1232, %r1231, %r1231, 20; - add.s32 %r1233, %r1227, %r988; - add.s32 %r1234, %r1233, %r1232; - xor.b32 %r1235, %r1234, %r1229; - shf.l.wrap.b32 %r1236, %r1235, %r1235, 24; - add.s32 %r1237, %r1236, %r1230; - xor.b32 %r1238, %r1237, %r1232; - shf.l.wrap.b32 %r1239, %r1238, %r1238, 25; - add.s32 %r1240, %r1196, %r1189; - xor.b32 %r1241, %r1162, %r1240; - shf.l.wrap.b32 %r1242, %r1241, %r1241, 16; - add.s32 %r1243, %r1242, %r1175; - xor.b32 %r1244, %r1243, %r1189; - shf.l.wrap.b32 %r1245, %r1244, %r1244, 20; - add.s32 %r1246, %r1240, %r1245; - xor.b32 %r1247, %r1246, %r1242; - shf.l.wrap.b32 %r1248, %r1247, %r1247, 24; - add.s32 %r1249, %r1248, %r1243; - xor.b32 %r1250, %r1249, %r1245; - shf.l.wrap.b32 %r1251, %r1250, %r1250, 25; - add.s32 %r1252, %r1209, %r1226; - xor.b32 %r1253, %r1252, %r1248; - shf.l.wrap.b32 %r1254, %r1253, %r1253, 16; - 
[... on the order of 2,000 further deleted lines of machine-generated PTX elided here for readability; the extraction had flattened the one-instruction-per-line hunk into unreadable run-on lines. The removed kernel body continues the pattern visible above: it finishes the BLAKE3-style compression rounds (the repeating add.s32 / xor.b32 / shf.l.wrap.b32 quarter-round with left-rotations 16, 20, 24 and 25, i.e. right-rotations 16, 12, 8 and 7), XORs the two state halves together and repacks the digest bytes into nibble-packed registers (bfi.b32, prmt.b32, shl/and/or), then walks the constant `matrix` buffer with chained dp4a.u32.u32 dot products (ld.const.u32 [matrix+0] through [matrix+1380] in this excerpt), folding each pair of row accumulators back into a hash byte with a shr / and.b32 240 / or / xor tail. ...]
// end inline asm - ld.const.u32 %r3097, [matrix+1384]; - // begin inline asm - dp4a.u32.u32 %r3096, %r3097, %r5786, %r3092; - // end inline asm - ld.const.u32 %r3101, [matrix+1388]; - // begin inline asm - dp4a.u32.u32 %r3100, %r3101, %r5790, %r3096; - // end inline asm - ld.const.u32 %r3105, [matrix+1392]; - // begin inline asm - dp4a.u32.u32 %r3104, %r3105, %r5794, %r3100; - // end inline asm - ld.const.u32 %r3109, [matrix+1396]; - // begin inline asm - dp4a.u32.u32 %r3108, %r3109, %r5798, %r3104; - // end inline asm - ld.const.u32 %r3113, [matrix+1400]; - // begin inline asm - dp4a.u32.u32 %r3112, %r3113, %r5802, %r3108; - // end inline asm - ld.const.u32 %r3117, [matrix+1404]; - // begin inline asm - dp4a.u32.u32 %r3116, %r3117, %r5806, %r3112; - // end inline asm - shr.u32 %r6010, %r3052, 6; - and.b32 %r6011, %r6010, 240; - shr.u32 %r6012, %r3116, 10; - or.b32 %r6013, %r6012, %r6011; - xor.b32 %r6014, %r5860, %r6013; - ld.const.u32 %r3121, [matrix+1408]; - // begin inline asm - dp4a.u32.u32 %r3120, %r3121, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3125, [matrix+1412]; - // begin inline asm - dp4a.u32.u32 %r3124, %r3125, %r5750, %r3120; - // end inline asm - ld.const.u32 %r3129, [matrix+1416]; - // begin inline asm - dp4a.u32.u32 %r3128, %r3129, %r5754, %r3124; - // end inline asm - ld.const.u32 %r3133, [matrix+1420]; - // begin inline asm - dp4a.u32.u32 %r3132, %r3133, %r5758, %r3128; - // end inline asm - ld.const.u32 %r3137, [matrix+1424]; - // begin inline asm - dp4a.u32.u32 %r3136, %r3137, %r5762, %r3132; - // end inline asm - ld.const.u32 %r3141, [matrix+1428]; - // begin inline asm - dp4a.u32.u32 %r3140, %r3141, %r5766, %r3136; - // end inline asm - ld.const.u32 %r3145, [matrix+1432]; - // begin inline asm - dp4a.u32.u32 %r3144, %r3145, %r5770, %r3140; - // end inline asm - ld.const.u32 %r3149, [matrix+1436]; - // begin inline asm - dp4a.u32.u32 %r3148, %r3149, %r5774, %r3144; - // end inline asm - ld.const.u32 %r3153, [matrix+1440]; - // begin inline asm - dp4a.u32.u32 %r3152, %r3153, %r5778, %r3148; - // end inline asm - ld.const.u32 %r3157, [matrix+1444]; - // begin inline asm - dp4a.u32.u32 %r3156, %r3157, %r5782, %r3152; - // end inline asm - ld.const.u32 %r3161, [matrix+1448]; - // begin inline asm - dp4a.u32.u32 %r3160, %r3161, %r5786, %r3156; - // end inline asm - ld.const.u32 %r3165, [matrix+1452]; - // begin inline asm - dp4a.u32.u32 %r3164, %r3165, %r5790, %r3160; - // end inline asm - ld.const.u32 %r3169, [matrix+1456]; - // begin inline asm - dp4a.u32.u32 %r3168, %r3169, %r5794, %r3164; - // end inline asm - ld.const.u32 %r3173, [matrix+1460]; - // begin inline asm - dp4a.u32.u32 %r3172, %r3173, %r5798, %r3168; - // end inline asm - ld.const.u32 %r3177, [matrix+1464]; - // begin inline asm - dp4a.u32.u32 %r3176, %r3177, %r5802, %r3172; - // end inline asm - ld.const.u32 %r3181, [matrix+1468]; - // begin inline asm - dp4a.u32.u32 %r3180, %r3181, %r5806, %r3176; - // end inline asm - ld.const.u32 %r3185, [matrix+1472]; - // begin inline asm - dp4a.u32.u32 %r3184, %r3185, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3189, [matrix+1476]; - // begin inline asm - dp4a.u32.u32 %r3188, %r3189, %r5750, %r3184; - // end inline asm - ld.const.u32 %r3193, [matrix+1480]; - // begin inline asm - dp4a.u32.u32 %r3192, %r3193, %r5754, %r3188; - // end inline asm - ld.const.u32 %r3197, [matrix+1484]; - // begin inline asm - dp4a.u32.u32 %r3196, %r3197, %r5758, %r3192; - // end inline asm - ld.const.u32 %r3201, [matrix+1488]; - // begin inline asm - dp4a.u32.u32 
%r3200, %r3201, %r5762, %r3196; - // end inline asm - ld.const.u32 %r3205, [matrix+1492]; - // begin inline asm - dp4a.u32.u32 %r3204, %r3205, %r5766, %r3200; - // end inline asm - ld.const.u32 %r3209, [matrix+1496]; - // begin inline asm - dp4a.u32.u32 %r3208, %r3209, %r5770, %r3204; - // end inline asm - ld.const.u32 %r3213, [matrix+1500]; - // begin inline asm - dp4a.u32.u32 %r3212, %r3213, %r5774, %r3208; - // end inline asm - ld.const.u32 %r3217, [matrix+1504]; - // begin inline asm - dp4a.u32.u32 %r3216, %r3217, %r5778, %r3212; - // end inline asm - ld.const.u32 %r3221, [matrix+1508]; - // begin inline asm - dp4a.u32.u32 %r3220, %r3221, %r5782, %r3216; - // end inline asm - ld.const.u32 %r3225, [matrix+1512]; - // begin inline asm - dp4a.u32.u32 %r3224, %r3225, %r5786, %r3220; - // end inline asm - ld.const.u32 %r3229, [matrix+1516]; - // begin inline asm - dp4a.u32.u32 %r3228, %r3229, %r5790, %r3224; - // end inline asm - ld.const.u32 %r3233, [matrix+1520]; - // begin inline asm - dp4a.u32.u32 %r3232, %r3233, %r5794, %r3228; - // end inline asm - ld.const.u32 %r3237, [matrix+1524]; - // begin inline asm - dp4a.u32.u32 %r3236, %r3237, %r5798, %r3232; - // end inline asm - ld.const.u32 %r3241, [matrix+1528]; - // begin inline asm - dp4a.u32.u32 %r3240, %r3241, %r5802, %r3236; - // end inline asm - ld.const.u32 %r3245, [matrix+1532]; - // begin inline asm - dp4a.u32.u32 %r3244, %r3245, %r5806, %r3240; - // end inline asm - shr.u32 %r6015, %r3180, 6; - and.b32 %r6016, %r6015, 240; - shr.u32 %r6017, %r3244, 10; - or.b32 %r6018, %r6017, %r6016; - xor.b32 %r6019, %r5862, %r6018; - ld.const.u32 %r3249, [matrix+1536]; - // begin inline asm - dp4a.u32.u32 %r3248, %r3249, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3253, [matrix+1540]; - // begin inline asm - dp4a.u32.u32 %r3252, %r3253, %r5750, %r3248; - // end inline asm - ld.const.u32 %r3257, [matrix+1544]; - // begin inline asm - dp4a.u32.u32 %r3256, %r3257, %r5754, %r3252; - // end inline asm - ld.const.u32 %r3261, [matrix+1548]; - // begin inline asm - dp4a.u32.u32 %r3260, %r3261, %r5758, %r3256; - // end inline asm - ld.const.u32 %r3265, [matrix+1552]; - // begin inline asm - dp4a.u32.u32 %r3264, %r3265, %r5762, %r3260; - // end inline asm - ld.const.u32 %r3269, [matrix+1556]; - // begin inline asm - dp4a.u32.u32 %r3268, %r3269, %r5766, %r3264; - // end inline asm - ld.const.u32 %r3273, [matrix+1560]; - // begin inline asm - dp4a.u32.u32 %r3272, %r3273, %r5770, %r3268; - // end inline asm - ld.const.u32 %r3277, [matrix+1564]; - // begin inline asm - dp4a.u32.u32 %r3276, %r3277, %r5774, %r3272; - // end inline asm - ld.const.u32 %r3281, [matrix+1568]; - // begin inline asm - dp4a.u32.u32 %r3280, %r3281, %r5778, %r3276; - // end inline asm - ld.const.u32 %r3285, [matrix+1572]; - // begin inline asm - dp4a.u32.u32 %r3284, %r3285, %r5782, %r3280; - // end inline asm - ld.const.u32 %r3289, [matrix+1576]; - // begin inline asm - dp4a.u32.u32 %r3288, %r3289, %r5786, %r3284; - // end inline asm - ld.const.u32 %r3293, [matrix+1580]; - // begin inline asm - dp4a.u32.u32 %r3292, %r3293, %r5790, %r3288; - // end inline asm - ld.const.u32 %r3297, [matrix+1584]; - // begin inline asm - dp4a.u32.u32 %r3296, %r3297, %r5794, %r3292; - // end inline asm - ld.const.u32 %r3301, [matrix+1588]; - // begin inline asm - dp4a.u32.u32 %r3300, %r3301, %r5798, %r3296; - // end inline asm - ld.const.u32 %r3305, [matrix+1592]; - // begin inline asm - dp4a.u32.u32 %r3304, %r3305, %r5802, %r3300; - // end inline asm - ld.const.u32 %r3309, [matrix+1596]; - // 
begin inline asm - dp4a.u32.u32 %r3308, %r3309, %r5806, %r3304; - // end inline asm - ld.const.u32 %r3313, [matrix+1600]; - // begin inline asm - dp4a.u32.u32 %r3312, %r3313, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3317, [matrix+1604]; - // begin inline asm - dp4a.u32.u32 %r3316, %r3317, %r5750, %r3312; - // end inline asm - ld.const.u32 %r3321, [matrix+1608]; - // begin inline asm - dp4a.u32.u32 %r3320, %r3321, %r5754, %r3316; - // end inline asm - ld.const.u32 %r3325, [matrix+1612]; - // begin inline asm - dp4a.u32.u32 %r3324, %r3325, %r5758, %r3320; - // end inline asm - ld.const.u32 %r3329, [matrix+1616]; - // begin inline asm - dp4a.u32.u32 %r3328, %r3329, %r5762, %r3324; - // end inline asm - ld.const.u32 %r3333, [matrix+1620]; - // begin inline asm - dp4a.u32.u32 %r3332, %r3333, %r5766, %r3328; - // end inline asm - ld.const.u32 %r3337, [matrix+1624]; - // begin inline asm - dp4a.u32.u32 %r3336, %r3337, %r5770, %r3332; - // end inline asm - ld.const.u32 %r3341, [matrix+1628]; - // begin inline asm - dp4a.u32.u32 %r3340, %r3341, %r5774, %r3336; - // end inline asm - ld.const.u32 %r3345, [matrix+1632]; - // begin inline asm - dp4a.u32.u32 %r3344, %r3345, %r5778, %r3340; - // end inline asm - ld.const.u32 %r3349, [matrix+1636]; - // begin inline asm - dp4a.u32.u32 %r3348, %r3349, %r5782, %r3344; - // end inline asm - ld.const.u32 %r3353, [matrix+1640]; - // begin inline asm - dp4a.u32.u32 %r3352, %r3353, %r5786, %r3348; - // end inline asm - ld.const.u32 %r3357, [matrix+1644]; - // begin inline asm - dp4a.u32.u32 %r3356, %r3357, %r5790, %r3352; - // end inline asm - ld.const.u32 %r3361, [matrix+1648]; - // begin inline asm - dp4a.u32.u32 %r3360, %r3361, %r5794, %r3356; - // end inline asm - ld.const.u32 %r3365, [matrix+1652]; - // begin inline asm - dp4a.u32.u32 %r3364, %r3365, %r5798, %r3360; - // end inline asm - ld.const.u32 %r3369, [matrix+1656]; - // begin inline asm - dp4a.u32.u32 %r3368, %r3369, %r5802, %r3364; - // end inline asm - ld.const.u32 %r3373, [matrix+1660]; - // begin inline asm - dp4a.u32.u32 %r3372, %r3373, %r5806, %r3368; - // end inline asm - shr.u32 %r6020, %r3308, 6; - and.b32 %r6021, %r6020, 240; - shr.u32 %r6022, %r3372, 10; - or.b32 %r6023, %r6022, %r6021; - cvt.u64.u32 %rd217, %r6023; - xor.b64 %rd218, %rd13, %rd217; - ld.const.u32 %r3377, [matrix+1664]; - // begin inline asm - dp4a.u32.u32 %r3376, %r3377, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3381, [matrix+1668]; - // begin inline asm - dp4a.u32.u32 %r3380, %r3381, %r5750, %r3376; - // end inline asm - ld.const.u32 %r3385, [matrix+1672]; - // begin inline asm - dp4a.u32.u32 %r3384, %r3385, %r5754, %r3380; - // end inline asm - ld.const.u32 %r3389, [matrix+1676]; - // begin inline asm - dp4a.u32.u32 %r3388, %r3389, %r5758, %r3384; - // end inline asm - ld.const.u32 %r3393, [matrix+1680]; - // begin inline asm - dp4a.u32.u32 %r3392, %r3393, %r5762, %r3388; - // end inline asm - ld.const.u32 %r3397, [matrix+1684]; - // begin inline asm - dp4a.u32.u32 %r3396, %r3397, %r5766, %r3392; - // end inline asm - ld.const.u32 %r3401, [matrix+1688]; - // begin inline asm - dp4a.u32.u32 %r3400, %r3401, %r5770, %r3396; - // end inline asm - ld.const.u32 %r3405, [matrix+1692]; - // begin inline asm - dp4a.u32.u32 %r3404, %r3405, %r5774, %r3400; - // end inline asm - ld.const.u32 %r3409, [matrix+1696]; - // begin inline asm - dp4a.u32.u32 %r3408, %r3409, %r5778, %r3404; - // end inline asm - ld.const.u32 %r3413, [matrix+1700]; - // begin inline asm - dp4a.u32.u32 %r3412, %r3413, %r5782, %r3408; - 
// end inline asm - ld.const.u32 %r3417, [matrix+1704]; - // begin inline asm - dp4a.u32.u32 %r3416, %r3417, %r5786, %r3412; - // end inline asm - ld.const.u32 %r3421, [matrix+1708]; - // begin inline asm - dp4a.u32.u32 %r3420, %r3421, %r5790, %r3416; - // end inline asm - ld.const.u32 %r3425, [matrix+1712]; - // begin inline asm - dp4a.u32.u32 %r3424, %r3425, %r5794, %r3420; - // end inline asm - ld.const.u32 %r3429, [matrix+1716]; - // begin inline asm - dp4a.u32.u32 %r3428, %r3429, %r5798, %r3424; - // end inline asm - ld.const.u32 %r3433, [matrix+1720]; - // begin inline asm - dp4a.u32.u32 %r3432, %r3433, %r5802, %r3428; - // end inline asm - ld.const.u32 %r3437, [matrix+1724]; - // begin inline asm - dp4a.u32.u32 %r3436, %r3437, %r5806, %r3432; - // end inline asm - ld.const.u32 %r3441, [matrix+1728]; - // begin inline asm - dp4a.u32.u32 %r3440, %r3441, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3445, [matrix+1732]; - // begin inline asm - dp4a.u32.u32 %r3444, %r3445, %r5750, %r3440; - // end inline asm - ld.const.u32 %r3449, [matrix+1736]; - // begin inline asm - dp4a.u32.u32 %r3448, %r3449, %r5754, %r3444; - // end inline asm - ld.const.u32 %r3453, [matrix+1740]; - // begin inline asm - dp4a.u32.u32 %r3452, %r3453, %r5758, %r3448; - // end inline asm - ld.const.u32 %r3457, [matrix+1744]; - // begin inline asm - dp4a.u32.u32 %r3456, %r3457, %r5762, %r3452; - // end inline asm - ld.const.u32 %r3461, [matrix+1748]; - // begin inline asm - dp4a.u32.u32 %r3460, %r3461, %r5766, %r3456; - // end inline asm - ld.const.u32 %r3465, [matrix+1752]; - // begin inline asm - dp4a.u32.u32 %r3464, %r3465, %r5770, %r3460; - // end inline asm - ld.const.u32 %r3469, [matrix+1756]; - // begin inline asm - dp4a.u32.u32 %r3468, %r3469, %r5774, %r3464; - // end inline asm - ld.const.u32 %r3473, [matrix+1760]; - // begin inline asm - dp4a.u32.u32 %r3472, %r3473, %r5778, %r3468; - // end inline asm - ld.const.u32 %r3477, [matrix+1764]; - // begin inline asm - dp4a.u32.u32 %r3476, %r3477, %r5782, %r3472; - // end inline asm - ld.const.u32 %r3481, [matrix+1768]; - // begin inline asm - dp4a.u32.u32 %r3480, %r3481, %r5786, %r3476; - // end inline asm - ld.const.u32 %r3485, [matrix+1772]; - // begin inline asm - dp4a.u32.u32 %r3484, %r3485, %r5790, %r3480; - // end inline asm - ld.const.u32 %r3489, [matrix+1776]; - // begin inline asm - dp4a.u32.u32 %r3488, %r3489, %r5794, %r3484; - // end inline asm - ld.const.u32 %r3493, [matrix+1780]; - // begin inline asm - dp4a.u32.u32 %r3492, %r3493, %r5798, %r3488; - // end inline asm - ld.const.u32 %r3497, [matrix+1784]; - // begin inline asm - dp4a.u32.u32 %r3496, %r3497, %r5802, %r3492; - // end inline asm - ld.const.u32 %r3501, [matrix+1788]; - // begin inline asm - dp4a.u32.u32 %r3500, %r3501, %r5806, %r3496; - // end inline asm - shr.u32 %r6024, %r3436, 6; - and.b32 %r6025, %r6024, 240; - shr.u32 %r6026, %r3500, 10; - or.b32 %r6027, %r6026, %r6025; - cvt.u64.u32 %rd219, %r6027; - xor.b64 %rd220, %rd14, %rd219; - ld.const.u32 %r3505, [matrix+1792]; - // begin inline asm - dp4a.u32.u32 %r3504, %r3505, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3509, [matrix+1796]; - // begin inline asm - dp4a.u32.u32 %r3508, %r3509, %r5750, %r3504; - // end inline asm - ld.const.u32 %r3513, [matrix+1800]; - // begin inline asm - dp4a.u32.u32 %r3512, %r3513, %r5754, %r3508; - // end inline asm - ld.const.u32 %r3517, [matrix+1804]; - // begin inline asm - dp4a.u32.u32 %r3516, %r3517, %r5758, %r3512; - // end inline asm - ld.const.u32 %r3521, [matrix+1808]; - // begin 
inline asm - dp4a.u32.u32 %r3520, %r3521, %r5762, %r3516; - // end inline asm - ld.const.u32 %r3525, [matrix+1812]; - // begin inline asm - dp4a.u32.u32 %r3524, %r3525, %r5766, %r3520; - // end inline asm - ld.const.u32 %r3529, [matrix+1816]; - // begin inline asm - dp4a.u32.u32 %r3528, %r3529, %r5770, %r3524; - // end inline asm - ld.const.u32 %r3533, [matrix+1820]; - // begin inline asm - dp4a.u32.u32 %r3532, %r3533, %r5774, %r3528; - // end inline asm - ld.const.u32 %r3537, [matrix+1824]; - // begin inline asm - dp4a.u32.u32 %r3536, %r3537, %r5778, %r3532; - // end inline asm - ld.const.u32 %r3541, [matrix+1828]; - // begin inline asm - dp4a.u32.u32 %r3540, %r3541, %r5782, %r3536; - // end inline asm - ld.const.u32 %r3545, [matrix+1832]; - // begin inline asm - dp4a.u32.u32 %r3544, %r3545, %r5786, %r3540; - // end inline asm - ld.const.u32 %r3549, [matrix+1836]; - // begin inline asm - dp4a.u32.u32 %r3548, %r3549, %r5790, %r3544; - // end inline asm - ld.const.u32 %r3553, [matrix+1840]; - // begin inline asm - dp4a.u32.u32 %r3552, %r3553, %r5794, %r3548; - // end inline asm - ld.const.u32 %r3557, [matrix+1844]; - // begin inline asm - dp4a.u32.u32 %r3556, %r3557, %r5798, %r3552; - // end inline asm - ld.const.u32 %r3561, [matrix+1848]; - // begin inline asm - dp4a.u32.u32 %r3560, %r3561, %r5802, %r3556; - // end inline asm - ld.const.u32 %r3565, [matrix+1852]; - // begin inline asm - dp4a.u32.u32 %r3564, %r3565, %r5806, %r3560; - // end inline asm - ld.const.u32 %r3569, [matrix+1856]; - // begin inline asm - dp4a.u32.u32 %r3568, %r3569, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3573, [matrix+1860]; - // begin inline asm - dp4a.u32.u32 %r3572, %r3573, %r5750, %r3568; - // end inline asm - ld.const.u32 %r3577, [matrix+1864]; - // begin inline asm - dp4a.u32.u32 %r3576, %r3577, %r5754, %r3572; - // end inline asm - ld.const.u32 %r3581, [matrix+1868]; - // begin inline asm - dp4a.u32.u32 %r3580, %r3581, %r5758, %r3576; - // end inline asm - ld.const.u32 %r3585, [matrix+1872]; - // begin inline asm - dp4a.u32.u32 %r3584, %r3585, %r5762, %r3580; - // end inline asm - ld.const.u32 %r3589, [matrix+1876]; - // begin inline asm - dp4a.u32.u32 %r3588, %r3589, %r5766, %r3584; - // end inline asm - ld.const.u32 %r3593, [matrix+1880]; - // begin inline asm - dp4a.u32.u32 %r3592, %r3593, %r5770, %r3588; - // end inline asm - ld.const.u32 %r3597, [matrix+1884]; - // begin inline asm - dp4a.u32.u32 %r3596, %r3597, %r5774, %r3592; - // end inline asm - ld.const.u32 %r3601, [matrix+1888]; - // begin inline asm - dp4a.u32.u32 %r3600, %r3601, %r5778, %r3596; - // end inline asm - ld.const.u32 %r3605, [matrix+1892]; - // begin inline asm - dp4a.u32.u32 %r3604, %r3605, %r5782, %r3600; - // end inline asm - ld.const.u32 %r3609, [matrix+1896]; - // begin inline asm - dp4a.u32.u32 %r3608, %r3609, %r5786, %r3604; - // end inline asm - ld.const.u32 %r3613, [matrix+1900]; - // begin inline asm - dp4a.u32.u32 %r3612, %r3613, %r5790, %r3608; - // end inline asm - ld.const.u32 %r3617, [matrix+1904]; - // begin inline asm - dp4a.u32.u32 %r3616, %r3617, %r5794, %r3612; - // end inline asm - ld.const.u32 %r3621, [matrix+1908]; - // begin inline asm - dp4a.u32.u32 %r3620, %r3621, %r5798, %r3616; - // end inline asm - ld.const.u32 %r3625, [matrix+1912]; - // begin inline asm - dp4a.u32.u32 %r3624, %r3625, %r5802, %r3620; - // end inline asm - ld.const.u32 %r3629, [matrix+1916]; - // begin inline asm - dp4a.u32.u32 %r3628, %r3629, %r5806, %r3624; - // end inline asm - shr.u32 %r6028, %r3564, 6; - and.b32 %r6029, 
%r6028, 240; - shr.u32 %r6030, %r3628, 10; - or.b32 %r6031, %r6030, %r6029; - cvt.u64.u32 %rd221, %r6031; - xor.b64 %rd222, %rd15, %rd221; - ld.const.u32 %r3633, [matrix+1920]; - // begin inline asm - dp4a.u32.u32 %r3632, %r3633, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3637, [matrix+1924]; - // begin inline asm - dp4a.u32.u32 %r3636, %r3637, %r5750, %r3632; - // end inline asm - ld.const.u32 %r3641, [matrix+1928]; - // begin inline asm - dp4a.u32.u32 %r3640, %r3641, %r5754, %r3636; - // end inline asm - ld.const.u32 %r3645, [matrix+1932]; - // begin inline asm - dp4a.u32.u32 %r3644, %r3645, %r5758, %r3640; - // end inline asm - ld.const.u32 %r3649, [matrix+1936]; - // begin inline asm - dp4a.u32.u32 %r3648, %r3649, %r5762, %r3644; - // end inline asm - ld.const.u32 %r3653, [matrix+1940]; - // begin inline asm - dp4a.u32.u32 %r3652, %r3653, %r5766, %r3648; - // end inline asm - ld.const.u32 %r3657, [matrix+1944]; - // begin inline asm - dp4a.u32.u32 %r3656, %r3657, %r5770, %r3652; - // end inline asm - ld.const.u32 %r3661, [matrix+1948]; - // begin inline asm - dp4a.u32.u32 %r3660, %r3661, %r5774, %r3656; - // end inline asm - ld.const.u32 %r3665, [matrix+1952]; - // begin inline asm - dp4a.u32.u32 %r3664, %r3665, %r5778, %r3660; - // end inline asm - ld.const.u32 %r3669, [matrix+1956]; - // begin inline asm - dp4a.u32.u32 %r3668, %r3669, %r5782, %r3664; - // end inline asm - ld.const.u32 %r3673, [matrix+1960]; - // begin inline asm - dp4a.u32.u32 %r3672, %r3673, %r5786, %r3668; - // end inline asm - ld.const.u32 %r3677, [matrix+1964]; - // begin inline asm - dp4a.u32.u32 %r3676, %r3677, %r5790, %r3672; - // end inline asm - ld.const.u32 %r3681, [matrix+1968]; - // begin inline asm - dp4a.u32.u32 %r3680, %r3681, %r5794, %r3676; - // end inline asm - ld.const.u32 %r3685, [matrix+1972]; - // begin inline asm - dp4a.u32.u32 %r3684, %r3685, %r5798, %r3680; - // end inline asm - ld.const.u32 %r3689, [matrix+1976]; - // begin inline asm - dp4a.u32.u32 %r3688, %r3689, %r5802, %r3684; - // end inline asm - ld.const.u32 %r3693, [matrix+1980]; - // begin inline asm - dp4a.u32.u32 %r3692, %r3693, %r5806, %r3688; - // end inline asm - ld.const.u32 %r3697, [matrix+1984]; - // begin inline asm - dp4a.u32.u32 %r3696, %r3697, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3701, [matrix+1988]; - // begin inline asm - dp4a.u32.u32 %r3700, %r3701, %r5750, %r3696; - // end inline asm - ld.const.u32 %r3705, [matrix+1992]; - // begin inline asm - dp4a.u32.u32 %r3704, %r3705, %r5754, %r3700; - // end inline asm - ld.const.u32 %r3709, [matrix+1996]; - // begin inline asm - dp4a.u32.u32 %r3708, %r3709, %r5758, %r3704; - // end inline asm - ld.const.u32 %r3713, [matrix+2000]; - // begin inline asm - dp4a.u32.u32 %r3712, %r3713, %r5762, %r3708; - // end inline asm - ld.const.u32 %r3717, [matrix+2004]; - // begin inline asm - dp4a.u32.u32 %r3716, %r3717, %r5766, %r3712; - // end inline asm - ld.const.u32 %r3721, [matrix+2008]; - // begin inline asm - dp4a.u32.u32 %r3720, %r3721, %r5770, %r3716; - // end inline asm - ld.const.u32 %r3725, [matrix+2012]; - // begin inline asm - dp4a.u32.u32 %r3724, %r3725, %r5774, %r3720; - // end inline asm - ld.const.u32 %r3729, [matrix+2016]; - // begin inline asm - dp4a.u32.u32 %r3728, %r3729, %r5778, %r3724; - // end inline asm - ld.const.u32 %r3733, [matrix+2020]; - // begin inline asm - dp4a.u32.u32 %r3732, %r3733, %r5782, %r3728; - // end inline asm - ld.const.u32 %r3737, [matrix+2024]; - // begin inline asm - dp4a.u32.u32 %r3736, %r3737, %r5786, %r3732; - // 
end inline asm - ld.const.u32 %r3741, [matrix+2028]; - // begin inline asm - dp4a.u32.u32 %r3740, %r3741, %r5790, %r3736; - // end inline asm - ld.const.u32 %r3745, [matrix+2032]; - // begin inline asm - dp4a.u32.u32 %r3744, %r3745, %r5794, %r3740; - // end inline asm - ld.const.u32 %r3749, [matrix+2036]; - // begin inline asm - dp4a.u32.u32 %r3748, %r3749, %r5798, %r3744; - // end inline asm - ld.const.u32 %r3753, [matrix+2040]; - // begin inline asm - dp4a.u32.u32 %r3752, %r3753, %r5802, %r3748; - // end inline asm - ld.const.u32 %r3757, [matrix+2044]; - // begin inline asm - dp4a.u32.u32 %r3756, %r3757, %r5806, %r3752; - // end inline asm - shr.u32 %r6032, %r3692, 6; - and.b32 %r6033, %r6032, 240; - ld.const.u32 %r3761, [matrix+2048]; - // begin inline asm - dp4a.u32.u32 %r3760, %r3761, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3765, [matrix+2052]; - // begin inline asm - dp4a.u32.u32 %r3764, %r3765, %r5750, %r3760; - // end inline asm - ld.const.u32 %r3769, [matrix+2056]; - // begin inline asm - dp4a.u32.u32 %r3768, %r3769, %r5754, %r3764; - // end inline asm - ld.const.u32 %r3773, [matrix+2060]; - // begin inline asm - dp4a.u32.u32 %r3772, %r3773, %r5758, %r3768; - // end inline asm - ld.const.u32 %r3777, [matrix+2064]; - // begin inline asm - dp4a.u32.u32 %r3776, %r3777, %r5762, %r3772; - // end inline asm - ld.const.u32 %r3781, [matrix+2068]; - // begin inline asm - dp4a.u32.u32 %r3780, %r3781, %r5766, %r3776; - // end inline asm - ld.const.u32 %r3785, [matrix+2072]; - // begin inline asm - dp4a.u32.u32 %r3784, %r3785, %r5770, %r3780; - // end inline asm - ld.const.u32 %r3789, [matrix+2076]; - // begin inline asm - dp4a.u32.u32 %r3788, %r3789, %r5774, %r3784; - // end inline asm - ld.const.u32 %r3793, [matrix+2080]; - // begin inline asm - dp4a.u32.u32 %r3792, %r3793, %r5778, %r3788; - // end inline asm - ld.const.u32 %r3797, [matrix+2084]; - // begin inline asm - dp4a.u32.u32 %r3796, %r3797, %r5782, %r3792; - // end inline asm - ld.const.u32 %r3801, [matrix+2088]; - // begin inline asm - dp4a.u32.u32 %r3800, %r3801, %r5786, %r3796; - // end inline asm - ld.const.u32 %r3805, [matrix+2092]; - // begin inline asm - dp4a.u32.u32 %r3804, %r3805, %r5790, %r3800; - // end inline asm - ld.const.u32 %r3809, [matrix+2096]; - // begin inline asm - dp4a.u32.u32 %r3808, %r3809, %r5794, %r3804; - // end inline asm - ld.const.u32 %r3813, [matrix+2100]; - // begin inline asm - dp4a.u32.u32 %r3812, %r3813, %r5798, %r3808; - // end inline asm - ld.const.u32 %r3817, [matrix+2104]; - // begin inline asm - dp4a.u32.u32 %r3816, %r3817, %r5802, %r3812; - // end inline asm - ld.const.u32 %r3821, [matrix+2108]; - // begin inline asm - dp4a.u32.u32 %r3820, %r3821, %r5806, %r3816; - // end inline asm - ld.const.u32 %r3825, [matrix+2112]; - // begin inline asm - dp4a.u32.u32 %r3824, %r3825, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3829, [matrix+2116]; - // begin inline asm - dp4a.u32.u32 %r3828, %r3829, %r5750, %r3824; - // end inline asm - ld.const.u32 %r3833, [matrix+2120]; - // begin inline asm - dp4a.u32.u32 %r3832, %r3833, %r5754, %r3828; - // end inline asm - ld.const.u32 %r3837, [matrix+2124]; - // begin inline asm - dp4a.u32.u32 %r3836, %r3837, %r5758, %r3832; - // end inline asm - ld.const.u32 %r3841, [matrix+2128]; - // begin inline asm - dp4a.u32.u32 %r3840, %r3841, %r5762, %r3836; - // end inline asm - ld.const.u32 %r3845, [matrix+2132]; - // begin inline asm - dp4a.u32.u32 %r3844, %r3845, %r5766, %r3840; - // end inline asm - ld.const.u32 %r3849, [matrix+2136]; - // begin 
inline asm - dp4a.u32.u32 %r3848, %r3849, %r5770, %r3844; - // end inline asm - ld.const.u32 %r3853, [matrix+2140]; - // begin inline asm - dp4a.u32.u32 %r3852, %r3853, %r5774, %r3848; - // end inline asm - ld.const.u32 %r3857, [matrix+2144]; - // begin inline asm - dp4a.u32.u32 %r3856, %r3857, %r5778, %r3852; - // end inline asm - ld.const.u32 %r3861, [matrix+2148]; - // begin inline asm - dp4a.u32.u32 %r3860, %r3861, %r5782, %r3856; - // end inline asm - ld.const.u32 %r3865, [matrix+2152]; - // begin inline asm - dp4a.u32.u32 %r3864, %r3865, %r5786, %r3860; - // end inline asm - ld.const.u32 %r3869, [matrix+2156]; - // begin inline asm - dp4a.u32.u32 %r3868, %r3869, %r5790, %r3864; - // end inline asm - ld.const.u32 %r3873, [matrix+2160]; - // begin inline asm - dp4a.u32.u32 %r3872, %r3873, %r5794, %r3868; - // end inline asm - ld.const.u32 %r3877, [matrix+2164]; - // begin inline asm - dp4a.u32.u32 %r3876, %r3877, %r5798, %r3872; - // end inline asm - ld.const.u32 %r3881, [matrix+2168]; - // begin inline asm - dp4a.u32.u32 %r3880, %r3881, %r5802, %r3876; - // end inline asm - ld.const.u32 %r3885, [matrix+2172]; - // begin inline asm - dp4a.u32.u32 %r3884, %r3885, %r5806, %r3880; - // end inline asm - shr.u32 %r6034, %r3820, 6; - and.b32 %r6035, %r6034, 240; - shr.u32 %r6036, %r3884, 10; - or.b32 %r6037, %r6036, %r6035; - xor.b32 %r6038, %r13, %r6037; - ld.const.u32 %r3889, [matrix+2176]; - // begin inline asm - dp4a.u32.u32 %r3888, %r3889, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3893, [matrix+2180]; - // begin inline asm - dp4a.u32.u32 %r3892, %r3893, %r5750, %r3888; - // end inline asm - ld.const.u32 %r3897, [matrix+2184]; - // begin inline asm - dp4a.u32.u32 %r3896, %r3897, %r5754, %r3892; - // end inline asm - ld.const.u32 %r3901, [matrix+2188]; - // begin inline asm - dp4a.u32.u32 %r3900, %r3901, %r5758, %r3896; - // end inline asm - ld.const.u32 %r3905, [matrix+2192]; - // begin inline asm - dp4a.u32.u32 %r3904, %r3905, %r5762, %r3900; - // end inline asm - ld.const.u32 %r3909, [matrix+2196]; - // begin inline asm - dp4a.u32.u32 %r3908, %r3909, %r5766, %r3904; - // end inline asm - ld.const.u32 %r3913, [matrix+2200]; - // begin inline asm - dp4a.u32.u32 %r3912, %r3913, %r5770, %r3908; - // end inline asm - ld.const.u32 %r3917, [matrix+2204]; - // begin inline asm - dp4a.u32.u32 %r3916, %r3917, %r5774, %r3912; - // end inline asm - ld.const.u32 %r3921, [matrix+2208]; - // begin inline asm - dp4a.u32.u32 %r3920, %r3921, %r5778, %r3916; - // end inline asm - ld.const.u32 %r3925, [matrix+2212]; - // begin inline asm - dp4a.u32.u32 %r3924, %r3925, %r5782, %r3920; - // end inline asm - ld.const.u32 %r3929, [matrix+2216]; - // begin inline asm - dp4a.u32.u32 %r3928, %r3929, %r5786, %r3924; - // end inline asm - ld.const.u32 %r3933, [matrix+2220]; - // begin inline asm - dp4a.u32.u32 %r3932, %r3933, %r5790, %r3928; - // end inline asm - ld.const.u32 %r3937, [matrix+2224]; - // begin inline asm - dp4a.u32.u32 %r3936, %r3937, %r5794, %r3932; - // end inline asm - ld.const.u32 %r3941, [matrix+2228]; - // begin inline asm - dp4a.u32.u32 %r3940, %r3941, %r5798, %r3936; - // end inline asm - ld.const.u32 %r3945, [matrix+2232]; - // begin inline asm - dp4a.u32.u32 %r3944, %r3945, %r5802, %r3940; - // end inline asm - ld.const.u32 %r3949, [matrix+2236]; - // begin inline asm - dp4a.u32.u32 %r3948, %r3949, %r5806, %r3944; - // end inline asm - ld.const.u32 %r3953, [matrix+2240]; - // begin inline asm - dp4a.u32.u32 %r3952, %r3953, %r5746, %r6244; - // end inline asm - ld.const.u32 
%r3957, [matrix+2244]; - // begin inline asm - dp4a.u32.u32 %r3956, %r3957, %r5750, %r3952; - // end inline asm - ld.const.u32 %r3961, [matrix+2248]; - // begin inline asm - dp4a.u32.u32 %r3960, %r3961, %r5754, %r3956; - // end inline asm - ld.const.u32 %r3965, [matrix+2252]; - // begin inline asm - dp4a.u32.u32 %r3964, %r3965, %r5758, %r3960; - // end inline asm - ld.const.u32 %r3969, [matrix+2256]; - // begin inline asm - dp4a.u32.u32 %r3968, %r3969, %r5762, %r3964; - // end inline asm - ld.const.u32 %r3973, [matrix+2260]; - // begin inline asm - dp4a.u32.u32 %r3972, %r3973, %r5766, %r3968; - // end inline asm - ld.const.u32 %r3977, [matrix+2264]; - // begin inline asm - dp4a.u32.u32 %r3976, %r3977, %r5770, %r3972; - // end inline asm - ld.const.u32 %r3981, [matrix+2268]; - // begin inline asm - dp4a.u32.u32 %r3980, %r3981, %r5774, %r3976; - // end inline asm - ld.const.u32 %r3985, [matrix+2272]; - // begin inline asm - dp4a.u32.u32 %r3984, %r3985, %r5778, %r3980; - // end inline asm - ld.const.u32 %r3989, [matrix+2276]; - // begin inline asm - dp4a.u32.u32 %r3988, %r3989, %r5782, %r3984; - // end inline asm - ld.const.u32 %r3993, [matrix+2280]; - // begin inline asm - dp4a.u32.u32 %r3992, %r3993, %r5786, %r3988; - // end inline asm - ld.const.u32 %r3997, [matrix+2284]; - // begin inline asm - dp4a.u32.u32 %r3996, %r3997, %r5790, %r3992; - // end inline asm - ld.const.u32 %r4001, [matrix+2288]; - // begin inline asm - dp4a.u32.u32 %r4000, %r4001, %r5794, %r3996; - // end inline asm - ld.const.u32 %r4005, [matrix+2292]; - // begin inline asm - dp4a.u32.u32 %r4004, %r4005, %r5798, %r4000; - // end inline asm - ld.const.u32 %r4009, [matrix+2296]; - // begin inline asm - dp4a.u32.u32 %r4008, %r4009, %r5802, %r4004; - // end inline asm - ld.const.u32 %r4013, [matrix+2300]; - // begin inline asm - dp4a.u32.u32 %r4012, %r4013, %r5806, %r4008; - // end inline asm - shr.u32 %r6039, %r3948, 6; - and.b32 %r6040, %r6039, 240; - shr.u32 %r6041, %r4012, 10; - or.b32 %r6042, %r6041, %r6040; - xor.b32 %r6043, %r5886, %r6042; - ld.const.u32 %r4017, [matrix+2304]; - // begin inline asm - dp4a.u32.u32 %r4016, %r4017, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4021, [matrix+2308]; - // begin inline asm - dp4a.u32.u32 %r4020, %r4021, %r5750, %r4016; - // end inline asm - ld.const.u32 %r4025, [matrix+2312]; - // begin inline asm - dp4a.u32.u32 %r4024, %r4025, %r5754, %r4020; - // end inline asm - ld.const.u32 %r4029, [matrix+2316]; - // begin inline asm - dp4a.u32.u32 %r4028, %r4029, %r5758, %r4024; - // end inline asm - ld.const.u32 %r4033, [matrix+2320]; - // begin inline asm - dp4a.u32.u32 %r4032, %r4033, %r5762, %r4028; - // end inline asm - ld.const.u32 %r4037, [matrix+2324]; - // begin inline asm - dp4a.u32.u32 %r4036, %r4037, %r5766, %r4032; - // end inline asm - ld.const.u32 %r4041, [matrix+2328]; - // begin inline asm - dp4a.u32.u32 %r4040, %r4041, %r5770, %r4036; - // end inline asm - ld.const.u32 %r4045, [matrix+2332]; - // begin inline asm - dp4a.u32.u32 %r4044, %r4045, %r5774, %r4040; - // end inline asm - ld.const.u32 %r4049, [matrix+2336]; - // begin inline asm - dp4a.u32.u32 %r4048, %r4049, %r5778, %r4044; - // end inline asm - ld.const.u32 %r4053, [matrix+2340]; - // begin inline asm - dp4a.u32.u32 %r4052, %r4053, %r5782, %r4048; - // end inline asm - ld.const.u32 %r4057, [matrix+2344]; - // begin inline asm - dp4a.u32.u32 %r4056, %r4057, %r5786, %r4052; - // end inline asm - ld.const.u32 %r4061, [matrix+2348]; - // begin inline asm - dp4a.u32.u32 %r4060, %r4061, %r5790, %r4056; - 
// end inline asm - ld.const.u32 %r4065, [matrix+2352]; - // begin inline asm - dp4a.u32.u32 %r4064, %r4065, %r5794, %r4060; - // end inline asm - ld.const.u32 %r4069, [matrix+2356]; - // begin inline asm - dp4a.u32.u32 %r4068, %r4069, %r5798, %r4064; - // end inline asm - ld.const.u32 %r4073, [matrix+2360]; - // begin inline asm - dp4a.u32.u32 %r4072, %r4073, %r5802, %r4068; - // end inline asm - ld.const.u32 %r4077, [matrix+2364]; - // begin inline asm - dp4a.u32.u32 %r4076, %r4077, %r5806, %r4072; - // end inline asm - ld.const.u32 %r4081, [matrix+2368]; - // begin inline asm - dp4a.u32.u32 %r4080, %r4081, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4085, [matrix+2372]; - // begin inline asm - dp4a.u32.u32 %r4084, %r4085, %r5750, %r4080; - // end inline asm - ld.const.u32 %r4089, [matrix+2376]; - // begin inline asm - dp4a.u32.u32 %r4088, %r4089, %r5754, %r4084; - // end inline asm - ld.const.u32 %r4093, [matrix+2380]; - // begin inline asm - dp4a.u32.u32 %r4092, %r4093, %r5758, %r4088; - // end inline asm - ld.const.u32 %r4097, [matrix+2384]; - // begin inline asm - dp4a.u32.u32 %r4096, %r4097, %r5762, %r4092; - // end inline asm - ld.const.u32 %r4101, [matrix+2388]; - // begin inline asm - dp4a.u32.u32 %r4100, %r4101, %r5766, %r4096; - // end inline asm - ld.const.u32 %r4105, [matrix+2392]; - // begin inline asm - dp4a.u32.u32 %r4104, %r4105, %r5770, %r4100; - // end inline asm - ld.const.u32 %r4109, [matrix+2396]; - // begin inline asm - dp4a.u32.u32 %r4108, %r4109, %r5774, %r4104; - // end inline asm - ld.const.u32 %r4113, [matrix+2400]; - // begin inline asm - dp4a.u32.u32 %r4112, %r4113, %r5778, %r4108; - // end inline asm - ld.const.u32 %r4117, [matrix+2404]; - // begin inline asm - dp4a.u32.u32 %r4116, %r4117, %r5782, %r4112; - // end inline asm - ld.const.u32 %r4121, [matrix+2408]; - // begin inline asm - dp4a.u32.u32 %r4120, %r4121, %r5786, %r4116; - // end inline asm - ld.const.u32 %r4125, [matrix+2412]; - // begin inline asm - dp4a.u32.u32 %r4124, %r4125, %r5790, %r4120; - // end inline asm - ld.const.u32 %r4129, [matrix+2416]; - // begin inline asm - dp4a.u32.u32 %r4128, %r4129, %r5794, %r4124; - // end inline asm - ld.const.u32 %r4133, [matrix+2420]; - // begin inline asm - dp4a.u32.u32 %r4132, %r4133, %r5798, %r4128; - // end inline asm - ld.const.u32 %r4137, [matrix+2424]; - // begin inline asm - dp4a.u32.u32 %r4136, %r4137, %r5802, %r4132; - // end inline asm - ld.const.u32 %r4141, [matrix+2428]; - // begin inline asm - dp4a.u32.u32 %r4140, %r4141, %r5806, %r4136; - // end inline asm - shr.u32 %r6044, %r4076, 6; - and.b32 %r6045, %r6044, 240; - shr.u32 %r6046, %r4140, 10; - or.b32 %r6047, %r6046, %r6045; - xor.b32 %r6048, %r5898, %r6047; - ld.const.u32 %r4145, [matrix+2432]; - // begin inline asm - dp4a.u32.u32 %r4144, %r4145, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4149, [matrix+2436]; - // begin inline asm - dp4a.u32.u32 %r4148, %r4149, %r5750, %r4144; - // end inline asm - ld.const.u32 %r4153, [matrix+2440]; - // begin inline asm - dp4a.u32.u32 %r4152, %r4153, %r5754, %r4148; - // end inline asm - ld.const.u32 %r4157, [matrix+2444]; - // begin inline asm - dp4a.u32.u32 %r4156, %r4157, %r5758, %r4152; - // end inline asm - ld.const.u32 %r4161, [matrix+2448]; - // begin inline asm - dp4a.u32.u32 %r4160, %r4161, %r5762, %r4156; - // end inline asm - ld.const.u32 %r4165, [matrix+2452]; - // begin inline asm - dp4a.u32.u32 %r4164, %r4165, %r5766, %r4160; - // end inline asm - ld.const.u32 %r4169, [matrix+2456]; - // begin inline asm - dp4a.u32.u32 
%r4168, %r4169, %r5770, %r4164; - // end inline asm - ld.const.u32 %r4173, [matrix+2460]; - // begin inline asm - dp4a.u32.u32 %r4172, %r4173, %r5774, %r4168; - // end inline asm - ld.const.u32 %r4177, [matrix+2464]; - // begin inline asm - dp4a.u32.u32 %r4176, %r4177, %r5778, %r4172; - // end inline asm - ld.const.u32 %r4181, [matrix+2468]; - // begin inline asm - dp4a.u32.u32 %r4180, %r4181, %r5782, %r4176; - // end inline asm - ld.const.u32 %r4185, [matrix+2472]; - // begin inline asm - dp4a.u32.u32 %r4184, %r4185, %r5786, %r4180; - // end inline asm - ld.const.u32 %r4189, [matrix+2476]; - // begin inline asm - dp4a.u32.u32 %r4188, %r4189, %r5790, %r4184; - // end inline asm - ld.const.u32 %r4193, [matrix+2480]; - // begin inline asm - dp4a.u32.u32 %r4192, %r4193, %r5794, %r4188; - // end inline asm - ld.const.u32 %r4197, [matrix+2484]; - // begin inline asm - dp4a.u32.u32 %r4196, %r4197, %r5798, %r4192; - // end inline asm - ld.const.u32 %r4201, [matrix+2488]; - // begin inline asm - dp4a.u32.u32 %r4200, %r4201, %r5802, %r4196; - // end inline asm - ld.const.u32 %r4205, [matrix+2492]; - // begin inline asm - dp4a.u32.u32 %r4204, %r4205, %r5806, %r4200; - // end inline asm - ld.const.u32 %r4209, [matrix+2496]; - // begin inline asm - dp4a.u32.u32 %r4208, %r4209, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4213, [matrix+2500]; - // begin inline asm - dp4a.u32.u32 %r4212, %r4213, %r5750, %r4208; - // end inline asm - ld.const.u32 %r4217, [matrix+2504]; - // begin inline asm - dp4a.u32.u32 %r4216, %r4217, %r5754, %r4212; - // end inline asm - ld.const.u32 %r4221, [matrix+2508]; - // begin inline asm - dp4a.u32.u32 %r4220, %r4221, %r5758, %r4216; - // end inline asm - ld.const.u32 %r4225, [matrix+2512]; - // begin inline asm - dp4a.u32.u32 %r4224, %r4225, %r5762, %r4220; - // end inline asm - ld.const.u32 %r4229, [matrix+2516]; - // begin inline asm - dp4a.u32.u32 %r4228, %r4229, %r5766, %r4224; - // end inline asm - ld.const.u32 %r4233, [matrix+2520]; - // begin inline asm - dp4a.u32.u32 %r4232, %r4233, %r5770, %r4228; - // end inline asm - ld.const.u32 %r4237, [matrix+2524]; - // begin inline asm - dp4a.u32.u32 %r4236, %r4237, %r5774, %r4232; - // end inline asm - ld.const.u32 %r4241, [matrix+2528]; - // begin inline asm - dp4a.u32.u32 %r4240, %r4241, %r5778, %r4236; - // end inline asm - ld.const.u32 %r4245, [matrix+2532]; - // begin inline asm - dp4a.u32.u32 %r4244, %r4245, %r5782, %r4240; - // end inline asm - ld.const.u32 %r4249, [matrix+2536]; - // begin inline asm - dp4a.u32.u32 %r4248, %r4249, %r5786, %r4244; - // end inline asm - ld.const.u32 %r4253, [matrix+2540]; - // begin inline asm - dp4a.u32.u32 %r4252, %r4253, %r5790, %r4248; - // end inline asm - ld.const.u32 %r4257, [matrix+2544]; - // begin inline asm - dp4a.u32.u32 %r4256, %r4257, %r5794, %r4252; - // end inline asm - ld.const.u32 %r4261, [matrix+2548]; - // begin inline asm - dp4a.u32.u32 %r4260, %r4261, %r5798, %r4256; - // end inline asm - ld.const.u32 %r4265, [matrix+2552]; - // begin inline asm - dp4a.u32.u32 %r4264, %r4265, %r5802, %r4260; - // end inline asm - ld.const.u32 %r4269, [matrix+2556]; - // begin inline asm - dp4a.u32.u32 %r4268, %r4269, %r5806, %r4264; - // end inline asm - shr.u32 %r6049, %r4204, 6; - and.b32 %r6050, %r6049, 240; - shr.u32 %r6051, %r4268, 10; - or.b32 %r6052, %r6051, %r6050; - xor.b32 %r6053, %r5900, %r6052; - ld.const.u32 %r4273, [matrix+2560]; - // begin inline asm - dp4a.u32.u32 %r4272, %r4273, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4277, [matrix+2564]; - // 
begin inline asm - dp4a.u32.u32 %r4276, %r4277, %r5750, %r4272; - // end inline asm - ld.const.u32 %r4281, [matrix+2568]; - // begin inline asm - dp4a.u32.u32 %r4280, %r4281, %r5754, %r4276; - // end inline asm - ld.const.u32 %r4285, [matrix+2572]; - // begin inline asm - dp4a.u32.u32 %r4284, %r4285, %r5758, %r4280; - // end inline asm - ld.const.u32 %r4289, [matrix+2576]; - // begin inline asm - dp4a.u32.u32 %r4288, %r4289, %r5762, %r4284; - // end inline asm - ld.const.u32 %r4293, [matrix+2580]; - // begin inline asm - dp4a.u32.u32 %r4292, %r4293, %r5766, %r4288; - // end inline asm - ld.const.u32 %r4297, [matrix+2584]; - // begin inline asm - dp4a.u32.u32 %r4296, %r4297, %r5770, %r4292; - // end inline asm - ld.const.u32 %r4301, [matrix+2588]; - // begin inline asm - dp4a.u32.u32 %r4300, %r4301, %r5774, %r4296; - // end inline asm - ld.const.u32 %r4305, [matrix+2592]; - // begin inline asm - dp4a.u32.u32 %r4304, %r4305, %r5778, %r4300; - // end inline asm - ld.const.u32 %r4309, [matrix+2596]; - // begin inline asm - dp4a.u32.u32 %r4308, %r4309, %r5782, %r4304; - // end inline asm - ld.const.u32 %r4313, [matrix+2600]; - // begin inline asm - dp4a.u32.u32 %r4312, %r4313, %r5786, %r4308; - // end inline asm - ld.const.u32 %r4317, [matrix+2604]; - // begin inline asm - dp4a.u32.u32 %r4316, %r4317, %r5790, %r4312; - // end inline asm - ld.const.u32 %r4321, [matrix+2608]; - // begin inline asm - dp4a.u32.u32 %r4320, %r4321, %r5794, %r4316; - // end inline asm - ld.const.u32 %r4325, [matrix+2612]; - // begin inline asm - dp4a.u32.u32 %r4324, %r4325, %r5798, %r4320; - // end inline asm - ld.const.u32 %r4329, [matrix+2616]; - // begin inline asm - dp4a.u32.u32 %r4328, %r4329, %r5802, %r4324; - // end inline asm - ld.const.u32 %r4333, [matrix+2620]; - // begin inline asm - dp4a.u32.u32 %r4332, %r4333, %r5806, %r4328; - // end inline asm - ld.const.u32 %r4337, [matrix+2624]; - // begin inline asm - dp4a.u32.u32 %r4336, %r4337, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4341, [matrix+2628]; - // begin inline asm - dp4a.u32.u32 %r4340, %r4341, %r5750, %r4336; - // end inline asm - ld.const.u32 %r4345, [matrix+2632]; - // begin inline asm - dp4a.u32.u32 %r4344, %r4345, %r5754, %r4340; - // end inline asm - ld.const.u32 %r4349, [matrix+2636]; - // begin inline asm - dp4a.u32.u32 %r4348, %r4349, %r5758, %r4344; - // end inline asm - ld.const.u32 %r4353, [matrix+2640]; - // begin inline asm - dp4a.u32.u32 %r4352, %r4353, %r5762, %r4348; - // end inline asm - ld.const.u32 %r4357, [matrix+2644]; - // begin inline asm - dp4a.u32.u32 %r4356, %r4357, %r5766, %r4352; - // end inline asm - ld.const.u32 %r4361, [matrix+2648]; - // begin inline asm - dp4a.u32.u32 %r4360, %r4361, %r5770, %r4356; - // end inline asm - ld.const.u32 %r4365, [matrix+2652]; - // begin inline asm - dp4a.u32.u32 %r4364, %r4365, %r5774, %r4360; - // end inline asm - ld.const.u32 %r4369, [matrix+2656]; - // begin inline asm - dp4a.u32.u32 %r4368, %r4369, %r5778, %r4364; - // end inline asm - ld.const.u32 %r4373, [matrix+2660]; - // begin inline asm - dp4a.u32.u32 %r4372, %r4373, %r5782, %r4368; - // end inline asm - ld.const.u32 %r4377, [matrix+2664]; - // begin inline asm - dp4a.u32.u32 %r4376, %r4377, %r5786, %r4372; - // end inline asm - ld.const.u32 %r4381, [matrix+2668]; - // begin inline asm - dp4a.u32.u32 %r4380, %r4381, %r5790, %r4376; - // end inline asm - ld.const.u32 %r4385, [matrix+2672]; - // begin inline asm - dp4a.u32.u32 %r4384, %r4385, %r5794, %r4380; - // end inline asm - ld.const.u32 %r4389, [matrix+2676]; - 
// begin inline asm - dp4a.u32.u32 %r4388, %r4389, %r5798, %r4384; - // end inline asm - ld.const.u32 %r4393, [matrix+2680]; - // begin inline asm - dp4a.u32.u32 %r4392, %r4393, %r5802, %r4388; - // end inline asm - ld.const.u32 %r4397, [matrix+2684]; - // begin inline asm - dp4a.u32.u32 %r4396, %r4397, %r5806, %r4392; - // end inline asm - shr.u32 %r6054, %r4332, 6; - and.b32 %r6055, %r6054, 240; - shr.u32 %r6056, %r4396, 10; - or.b32 %r6057, %r6056, %r6055; - cvt.u64.u32 %rd223, %r6057; - xor.b64 %rd224, %rd16, %rd223; - ld.const.u32 %r4401, [matrix+2688]; - // begin inline asm - dp4a.u32.u32 %r4400, %r4401, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4405, [matrix+2692]; - // begin inline asm - dp4a.u32.u32 %r4404, %r4405, %r5750, %r4400; - // end inline asm - ld.const.u32 %r4409, [matrix+2696]; - // begin inline asm - dp4a.u32.u32 %r4408, %r4409, %r5754, %r4404; - // end inline asm - ld.const.u32 %r4413, [matrix+2700]; - // begin inline asm - dp4a.u32.u32 %r4412, %r4413, %r5758, %r4408; - // end inline asm - ld.const.u32 %r4417, [matrix+2704]; - // begin inline asm - dp4a.u32.u32 %r4416, %r4417, %r5762, %r4412; - // end inline asm - ld.const.u32 %r4421, [matrix+2708]; - // begin inline asm - dp4a.u32.u32 %r4420, %r4421, %r5766, %r4416; - // end inline asm - ld.const.u32 %r4425, [matrix+2712]; - // begin inline asm - dp4a.u32.u32 %r4424, %r4425, %r5770, %r4420; - // end inline asm - ld.const.u32 %r4429, [matrix+2716]; - // begin inline asm - dp4a.u32.u32 %r4428, %r4429, %r5774, %r4424; - // end inline asm - ld.const.u32 %r4433, [matrix+2720]; - // begin inline asm - dp4a.u32.u32 %r4432, %r4433, %r5778, %r4428; - // end inline asm - ld.const.u32 %r4437, [matrix+2724]; - // begin inline asm - dp4a.u32.u32 %r4436, %r4437, %r5782, %r4432; - // end inline asm - ld.const.u32 %r4441, [matrix+2728]; - // begin inline asm - dp4a.u32.u32 %r4440, %r4441, %r5786, %r4436; - // end inline asm - ld.const.u32 %r4445, [matrix+2732]; - // begin inline asm - dp4a.u32.u32 %r4444, %r4445, %r5790, %r4440; - // end inline asm - ld.const.u32 %r4449, [matrix+2736]; - // begin inline asm - dp4a.u32.u32 %r4448, %r4449, %r5794, %r4444; - // end inline asm - ld.const.u32 %r4453, [matrix+2740]; - // begin inline asm - dp4a.u32.u32 %r4452, %r4453, %r5798, %r4448; - // end inline asm - ld.const.u32 %r4457, [matrix+2744]; - // begin inline asm - dp4a.u32.u32 %r4456, %r4457, %r5802, %r4452; - // end inline asm - ld.const.u32 %r4461, [matrix+2748]; - // begin inline asm - dp4a.u32.u32 %r4460, %r4461, %r5806, %r4456; - // end inline asm - ld.const.u32 %r4465, [matrix+2752]; - // begin inline asm - dp4a.u32.u32 %r4464, %r4465, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4469, [matrix+2756]; - // begin inline asm - dp4a.u32.u32 %r4468, %r4469, %r5750, %r4464; - // end inline asm - ld.const.u32 %r4473, [matrix+2760]; - // begin inline asm - dp4a.u32.u32 %r4472, %r4473, %r5754, %r4468; - // end inline asm - ld.const.u32 %r4477, [matrix+2764]; - // begin inline asm - dp4a.u32.u32 %r4476, %r4477, %r5758, %r4472; - // end inline asm - ld.const.u32 %r4481, [matrix+2768]; - // begin inline asm - dp4a.u32.u32 %r4480, %r4481, %r5762, %r4476; - // end inline asm - ld.const.u32 %r4485, [matrix+2772]; - // begin inline asm - dp4a.u32.u32 %r4484, %r4485, %r5766, %r4480; - // end inline asm - ld.const.u32 %r4489, [matrix+2776]; - // begin inline asm - dp4a.u32.u32 %r4488, %r4489, %r5770, %r4484; - // end inline asm - ld.const.u32 %r4493, [matrix+2780]; - // begin inline asm - dp4a.u32.u32 %r4492, %r4493, %r5774, %r4488; 
- // end inline asm - ld.const.u32 %r4497, [matrix+2784]; - // begin inline asm - dp4a.u32.u32 %r4496, %r4497, %r5778, %r4492; - // end inline asm - ld.const.u32 %r4501, [matrix+2788]; - // begin inline asm - dp4a.u32.u32 %r4500, %r4501, %r5782, %r4496; - // end inline asm - ld.const.u32 %r4505, [matrix+2792]; - // begin inline asm - dp4a.u32.u32 %r4504, %r4505, %r5786, %r4500; - // end inline asm - ld.const.u32 %r4509, [matrix+2796]; - // begin inline asm - dp4a.u32.u32 %r4508, %r4509, %r5790, %r4504; - // end inline asm - ld.const.u32 %r4513, [matrix+2800]; - // begin inline asm - dp4a.u32.u32 %r4512, %r4513, %r5794, %r4508; - // end inline asm - ld.const.u32 %r4517, [matrix+2804]; - // begin inline asm - dp4a.u32.u32 %r4516, %r4517, %r5798, %r4512; - // end inline asm - ld.const.u32 %r4521, [matrix+2808]; - // begin inline asm - dp4a.u32.u32 %r4520, %r4521, %r5802, %r4516; - // end inline asm - ld.const.u32 %r4525, [matrix+2812]; - // begin inline asm - dp4a.u32.u32 %r4524, %r4525, %r5806, %r4520; - // end inline asm - shr.u32 %r6058, %r4460, 6; - and.b32 %r6059, %r6058, 240; - shr.u32 %r6060, %r4524, 10; - or.b32 %r6061, %r6060, %r6059; - cvt.u64.u32 %rd225, %r6061; - xor.b64 %rd226, %rd17, %rd225; - ld.const.u32 %r4529, [matrix+2816]; - // begin inline asm - dp4a.u32.u32 %r4528, %r4529, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4533, [matrix+2820]; - // begin inline asm - dp4a.u32.u32 %r4532, %r4533, %r5750, %r4528; - // end inline asm - ld.const.u32 %r4537, [matrix+2824]; - // begin inline asm - dp4a.u32.u32 %r4536, %r4537, %r5754, %r4532; - // end inline asm - ld.const.u32 %r4541, [matrix+2828]; - // begin inline asm - dp4a.u32.u32 %r4540, %r4541, %r5758, %r4536; - // end inline asm - ld.const.u32 %r4545, [matrix+2832]; - // begin inline asm - dp4a.u32.u32 %r4544, %r4545, %r5762, %r4540; - // end inline asm - ld.const.u32 %r4549, [matrix+2836]; - // begin inline asm - dp4a.u32.u32 %r4548, %r4549, %r5766, %r4544; - // end inline asm - ld.const.u32 %r4553, [matrix+2840]; - // begin inline asm - dp4a.u32.u32 %r4552, %r4553, %r5770, %r4548; - // end inline asm - ld.const.u32 %r4557, [matrix+2844]; - // begin inline asm - dp4a.u32.u32 %r4556, %r4557, %r5774, %r4552; - // end inline asm - ld.const.u32 %r4561, [matrix+2848]; - // begin inline asm - dp4a.u32.u32 %r4560, %r4561, %r5778, %r4556; - // end inline asm - ld.const.u32 %r4565, [matrix+2852]; - // begin inline asm - dp4a.u32.u32 %r4564, %r4565, %r5782, %r4560; - // end inline asm - ld.const.u32 %r4569, [matrix+2856]; - // begin inline asm - dp4a.u32.u32 %r4568, %r4569, %r5786, %r4564; - // end inline asm - ld.const.u32 %r4573, [matrix+2860]; - // begin inline asm - dp4a.u32.u32 %r4572, %r4573, %r5790, %r4568; - // end inline asm - ld.const.u32 %r4577, [matrix+2864]; - // begin inline asm - dp4a.u32.u32 %r4576, %r4577, %r5794, %r4572; - // end inline asm - ld.const.u32 %r4581, [matrix+2868]; - // begin inline asm - dp4a.u32.u32 %r4580, %r4581, %r5798, %r4576; - // end inline asm - ld.const.u32 %r4585, [matrix+2872]; - // begin inline asm - dp4a.u32.u32 %r4584, %r4585, %r5802, %r4580; - // end inline asm - ld.const.u32 %r4589, [matrix+2876]; - // begin inline asm - dp4a.u32.u32 %r4588, %r4589, %r5806, %r4584; - // end inline asm - ld.const.u32 %r4593, [matrix+2880]; - // begin inline asm - dp4a.u32.u32 %r4592, %r4593, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4597, [matrix+2884]; - // begin inline asm - dp4a.u32.u32 %r4596, %r4597, %r5750, %r4592; - // end inline asm - ld.const.u32 %r4601, [matrix+2888]; - // 
begin inline asm - dp4a.u32.u32 %r4600, %r4601, %r5754, %r4596; - // end inline asm - ld.const.u32 %r4605, [matrix+2892]; - // begin inline asm - dp4a.u32.u32 %r4604, %r4605, %r5758, %r4600; - // end inline asm - ld.const.u32 %r4609, [matrix+2896]; - // begin inline asm - dp4a.u32.u32 %r4608, %r4609, %r5762, %r4604; - // end inline asm - ld.const.u32 %r4613, [matrix+2900]; - // begin inline asm - dp4a.u32.u32 %r4612, %r4613, %r5766, %r4608; - // end inline asm - ld.const.u32 %r4617, [matrix+2904]; - // begin inline asm - dp4a.u32.u32 %r4616, %r4617, %r5770, %r4612; - // end inline asm - ld.const.u32 %r4621, [matrix+2908]; - // begin inline asm - dp4a.u32.u32 %r4620, %r4621, %r5774, %r4616; - // end inline asm - ld.const.u32 %r4625, [matrix+2912]; - // begin inline asm - dp4a.u32.u32 %r4624, %r4625, %r5778, %r4620; - // end inline asm - ld.const.u32 %r4629, [matrix+2916]; - // begin inline asm - dp4a.u32.u32 %r4628, %r4629, %r5782, %r4624; - // end inline asm - ld.const.u32 %r4633, [matrix+2920]; - // begin inline asm - dp4a.u32.u32 %r4632, %r4633, %r5786, %r4628; - // end inline asm - ld.const.u32 %r4637, [matrix+2924]; - // begin inline asm - dp4a.u32.u32 %r4636, %r4637, %r5790, %r4632; - // end inline asm - ld.const.u32 %r4641, [matrix+2928]; - // begin inline asm - dp4a.u32.u32 %r4640, %r4641, %r5794, %r4636; - // end inline asm - ld.const.u32 %r4645, [matrix+2932]; - // begin inline asm - dp4a.u32.u32 %r4644, %r4645, %r5798, %r4640; - // end inline asm - ld.const.u32 %r4649, [matrix+2936]; - // begin inline asm - dp4a.u32.u32 %r4648, %r4649, %r5802, %r4644; - // end inline asm - ld.const.u32 %r4653, [matrix+2940]; - // begin inline asm - dp4a.u32.u32 %r4652, %r4653, %r5806, %r4648; - // end inline asm - shr.u32 %r6062, %r4588, 6; - and.b32 %r6063, %r6062, 240; - shr.u32 %r6064, %r4652, 10; - or.b32 %r6065, %r6064, %r6063; - cvt.u64.u32 %rd227, %r6065; - xor.b64 %rd228, %rd18, %rd227; - ld.const.u32 %r4657, [matrix+2944]; - // begin inline asm - dp4a.u32.u32 %r4656, %r4657, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4661, [matrix+2948]; - // begin inline asm - dp4a.u32.u32 %r4660, %r4661, %r5750, %r4656; - // end inline asm - ld.const.u32 %r4665, [matrix+2952]; - // begin inline asm - dp4a.u32.u32 %r4664, %r4665, %r5754, %r4660; - // end inline asm - ld.const.u32 %r4669, [matrix+2956]; - // begin inline asm - dp4a.u32.u32 %r4668, %r4669, %r5758, %r4664; - // end inline asm - ld.const.u32 %r4673, [matrix+2960]; - // begin inline asm - dp4a.u32.u32 %r4672, %r4673, %r5762, %r4668; - // end inline asm - ld.const.u32 %r4677, [matrix+2964]; - // begin inline asm - dp4a.u32.u32 %r4676, %r4677, %r5766, %r4672; - // end inline asm - ld.const.u32 %r4681, [matrix+2968]; - // begin inline asm - dp4a.u32.u32 %r4680, %r4681, %r5770, %r4676; - // end inline asm - ld.const.u32 %r4685, [matrix+2972]; - // begin inline asm - dp4a.u32.u32 %r4684, %r4685, %r5774, %r4680; - // end inline asm - ld.const.u32 %r4689, [matrix+2976]; - // begin inline asm - dp4a.u32.u32 %r4688, %r4689, %r5778, %r4684; - // end inline asm - ld.const.u32 %r4693, [matrix+2980]; - // begin inline asm - dp4a.u32.u32 %r4692, %r4693, %r5782, %r4688; - // end inline asm - ld.const.u32 %r4697, [matrix+2984]; - // begin inline asm - dp4a.u32.u32 %r4696, %r4697, %r5786, %r4692; - // end inline asm - ld.const.u32 %r4701, [matrix+2988]; - // begin inline asm - dp4a.u32.u32 %r4700, %r4701, %r5790, %r4696; - // end inline asm - ld.const.u32 %r4705, [matrix+2992]; - // begin inline asm - dp4a.u32.u32 %r4704, %r4705, %r5794, %r4700; - 
// end inline asm - ld.const.u32 %r4709, [matrix+2996]; - // begin inline asm - dp4a.u32.u32 %r4708, %r4709, %r5798, %r4704; - // end inline asm - ld.const.u32 %r4713, [matrix+3000]; - // begin inline asm - dp4a.u32.u32 %r4712, %r4713, %r5802, %r4708; - // end inline asm - ld.const.u32 %r4717, [matrix+3004]; - // begin inline asm - dp4a.u32.u32 %r4716, %r4717, %r5806, %r4712; - // end inline asm - ld.const.u32 %r4721, [matrix+3008]; - // begin inline asm - dp4a.u32.u32 %r4720, %r4721, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4725, [matrix+3012]; - // begin inline asm - dp4a.u32.u32 %r4724, %r4725, %r5750, %r4720; - // end inline asm - ld.const.u32 %r4729, [matrix+3016]; - // begin inline asm - dp4a.u32.u32 %r4728, %r4729, %r5754, %r4724; - // end inline asm - ld.const.u32 %r4733, [matrix+3020]; - // begin inline asm - dp4a.u32.u32 %r4732, %r4733, %r5758, %r4728; - // end inline asm - ld.const.u32 %r4737, [matrix+3024]; - // begin inline asm - dp4a.u32.u32 %r4736, %r4737, %r5762, %r4732; - // end inline asm - ld.const.u32 %r4741, [matrix+3028]; - // begin inline asm - dp4a.u32.u32 %r4740, %r4741, %r5766, %r4736; - // end inline asm - ld.const.u32 %r4745, [matrix+3032]; - // begin inline asm - dp4a.u32.u32 %r4744, %r4745, %r5770, %r4740; - // end inline asm - ld.const.u32 %r4749, [matrix+3036]; - // begin inline asm - dp4a.u32.u32 %r4748, %r4749, %r5774, %r4744; - // end inline asm - ld.const.u32 %r4753, [matrix+3040]; - // begin inline asm - dp4a.u32.u32 %r4752, %r4753, %r5778, %r4748; - // end inline asm - ld.const.u32 %r4757, [matrix+3044]; - // begin inline asm - dp4a.u32.u32 %r4756, %r4757, %r5782, %r4752; - // end inline asm - ld.const.u32 %r4761, [matrix+3048]; - // begin inline asm - dp4a.u32.u32 %r4760, %r4761, %r5786, %r4756; - // end inline asm - ld.const.u32 %r4765, [matrix+3052]; - // begin inline asm - dp4a.u32.u32 %r4764, %r4765, %r5790, %r4760; - // end inline asm - ld.const.u32 %r4769, [matrix+3056]; - // begin inline asm - dp4a.u32.u32 %r4768, %r4769, %r5794, %r4764; - // end inline asm - ld.const.u32 %r4773, [matrix+3060]; - // begin inline asm - dp4a.u32.u32 %r4772, %r4773, %r5798, %r4768; - // end inline asm - ld.const.u32 %r4777, [matrix+3064]; - // begin inline asm - dp4a.u32.u32 %r4776, %r4777, %r5802, %r4772; - // end inline asm - ld.const.u32 %r4781, [matrix+3068]; - // begin inline asm - dp4a.u32.u32 %r4780, %r4781, %r5806, %r4776; - // end inline asm - shr.u32 %r6066, %r4716, 6; - and.b32 %r6067, %r6066, 240; - ld.const.u32 %r4785, [matrix+3072]; - // begin inline asm - dp4a.u32.u32 %r4784, %r4785, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4789, [matrix+3076]; - // begin inline asm - dp4a.u32.u32 %r4788, %r4789, %r5750, %r4784; - // end inline asm - ld.const.u32 %r4793, [matrix+3080]; - // begin inline asm - dp4a.u32.u32 %r4792, %r4793, %r5754, %r4788; - // end inline asm - ld.const.u32 %r4797, [matrix+3084]; - // begin inline asm - dp4a.u32.u32 %r4796, %r4797, %r5758, %r4792; - // end inline asm - ld.const.u32 %r4801, [matrix+3088]; - // begin inline asm - dp4a.u32.u32 %r4800, %r4801, %r5762, %r4796; - // end inline asm - ld.const.u32 %r4805, [matrix+3092]; - // begin inline asm - dp4a.u32.u32 %r4804, %r4805, %r5766, %r4800; - // end inline asm - ld.const.u32 %r4809, [matrix+3096]; - // begin inline asm - dp4a.u32.u32 %r4808, %r4809, %r5770, %r4804; - // end inline asm - ld.const.u32 %r4813, [matrix+3100]; - // begin inline asm - dp4a.u32.u32 %r4812, %r4813, %r5774, %r4808; - // end inline asm - ld.const.u32 %r4817, [matrix+3104]; - // 
begin inline asm - dp4a.u32.u32 %r4816, %r4817, %r5778, %r4812; - // end inline asm - ld.const.u32 %r4821, [matrix+3108]; - // begin inline asm - dp4a.u32.u32 %r4820, %r4821, %r5782, %r4816; - // end inline asm - ld.const.u32 %r4825, [matrix+3112]; - // begin inline asm - dp4a.u32.u32 %r4824, %r4825, %r5786, %r4820; - // end inline asm - ld.const.u32 %r4829, [matrix+3116]; - // begin inline asm - dp4a.u32.u32 %r4828, %r4829, %r5790, %r4824; - // end inline asm - ld.const.u32 %r4833, [matrix+3120]; - // begin inline asm - dp4a.u32.u32 %r4832, %r4833, %r5794, %r4828; - // end inline asm - ld.const.u32 %r4837, [matrix+3124]; - // begin inline asm - dp4a.u32.u32 %r4836, %r4837, %r5798, %r4832; - // end inline asm - ld.const.u32 %r4841, [matrix+3128]; - // begin inline asm - dp4a.u32.u32 %r4840, %r4841, %r5802, %r4836; - // end inline asm - ld.const.u32 %r4845, [matrix+3132]; - // begin inline asm - dp4a.u32.u32 %r4844, %r4845, %r5806, %r4840; - // end inline asm - ld.const.u32 %r4849, [matrix+3136]; - // begin inline asm - dp4a.u32.u32 %r4848, %r4849, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4853, [matrix+3140]; - // begin inline asm - dp4a.u32.u32 %r4852, %r4853, %r5750, %r4848; - // end inline asm - ld.const.u32 %r4857, [matrix+3144]; - // begin inline asm - dp4a.u32.u32 %r4856, %r4857, %r5754, %r4852; - // end inline asm - ld.const.u32 %r4861, [matrix+3148]; - // begin inline asm - dp4a.u32.u32 %r4860, %r4861, %r5758, %r4856; - // end inline asm - ld.const.u32 %r4865, [matrix+3152]; - // begin inline asm - dp4a.u32.u32 %r4864, %r4865, %r5762, %r4860; - // end inline asm - ld.const.u32 %r4869, [matrix+3156]; - // begin inline asm - dp4a.u32.u32 %r4868, %r4869, %r5766, %r4864; - // end inline asm - ld.const.u32 %r4873, [matrix+3160]; - // begin inline asm - dp4a.u32.u32 %r4872, %r4873, %r5770, %r4868; - // end inline asm - ld.const.u32 %r4877, [matrix+3164]; - // begin inline asm - dp4a.u32.u32 %r4876, %r4877, %r5774, %r4872; - // end inline asm - ld.const.u32 %r4881, [matrix+3168]; - // begin inline asm - dp4a.u32.u32 %r4880, %r4881, %r5778, %r4876; - // end inline asm - ld.const.u32 %r4885, [matrix+3172]; - // begin inline asm - dp4a.u32.u32 %r4884, %r4885, %r5782, %r4880; - // end inline asm - ld.const.u32 %r4889, [matrix+3176]; - // begin inline asm - dp4a.u32.u32 %r4888, %r4889, %r5786, %r4884; - // end inline asm - ld.const.u32 %r4893, [matrix+3180]; - // begin inline asm - dp4a.u32.u32 %r4892, %r4893, %r5790, %r4888; - // end inline asm - ld.const.u32 %r4897, [matrix+3184]; - // begin inline asm - dp4a.u32.u32 %r4896, %r4897, %r5794, %r4892; - // end inline asm - ld.const.u32 %r4901, [matrix+3188]; - // begin inline asm - dp4a.u32.u32 %r4900, %r4901, %r5798, %r4896; - // end inline asm - ld.const.u32 %r4905, [matrix+3192]; - // begin inline asm - dp4a.u32.u32 %r4904, %r4905, %r5802, %r4900; - // end inline asm - ld.const.u32 %r4909, [matrix+3196]; - // begin inline asm - dp4a.u32.u32 %r4908, %r4909, %r5806, %r4904; - // end inline asm - shr.u32 %r6068, %r4844, 6; - and.b32 %r6069, %r6068, 240; - shr.u32 %r6070, %r4908, 10; - and.b32 %r6071, %r6070, 255; - or.b32 %r6072, %r6071, %r6069; - cvt.u64.u32 %rd229, %r6072; - ld.const.u32 %r4913, [matrix+3200]; - // begin inline asm - dp4a.u32.u32 %r4912, %r4913, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4917, [matrix+3204]; - // begin inline asm - dp4a.u32.u32 %r4916, %r4917, %r5750, %r4912; - // end inline asm - ld.const.u32 %r4921, [matrix+3208]; - // begin inline asm - dp4a.u32.u32 %r4920, %r4921, %r5754, %r4916; - // 
end inline asm - ld.const.u32 %r4925, [matrix+3212]; - // begin inline asm - dp4a.u32.u32 %r4924, %r4925, %r5758, %r4920; - // end inline asm - ld.const.u32 %r4929, [matrix+3216]; - // begin inline asm - dp4a.u32.u32 %r4928, %r4929, %r5762, %r4924; - // end inline asm - ld.const.u32 %r4933, [matrix+3220]; - // begin inline asm - dp4a.u32.u32 %r4932, %r4933, %r5766, %r4928; - // end inline asm - ld.const.u32 %r4937, [matrix+3224]; - // begin inline asm - dp4a.u32.u32 %r4936, %r4937, %r5770, %r4932; - // end inline asm - ld.const.u32 %r4941, [matrix+3228]; - // begin inline asm - dp4a.u32.u32 %r4940, %r4941, %r5774, %r4936; - // end inline asm - ld.const.u32 %r4945, [matrix+3232]; - // begin inline asm - dp4a.u32.u32 %r4944, %r4945, %r5778, %r4940; - // end inline asm - ld.const.u32 %r4949, [matrix+3236]; - // begin inline asm - dp4a.u32.u32 %r4948, %r4949, %r5782, %r4944; - // end inline asm - ld.const.u32 %r4953, [matrix+3240]; - // begin inline asm - dp4a.u32.u32 %r4952, %r4953, %r5786, %r4948; - // end inline asm - ld.const.u32 %r4957, [matrix+3244]; - // begin inline asm - dp4a.u32.u32 %r4956, %r4957, %r5790, %r4952; - // end inline asm - ld.const.u32 %r4961, [matrix+3248]; - // begin inline asm - dp4a.u32.u32 %r4960, %r4961, %r5794, %r4956; - // end inline asm - ld.const.u32 %r4965, [matrix+3252]; - // begin inline asm - dp4a.u32.u32 %r4964, %r4965, %r5798, %r4960; - // end inline asm - ld.const.u32 %r4969, [matrix+3256]; - // begin inline asm - dp4a.u32.u32 %r4968, %r4969, %r5802, %r4964; - // end inline asm - ld.const.u32 %r4973, [matrix+3260]; - // begin inline asm - dp4a.u32.u32 %r4972, %r4973, %r5806, %r4968; - // end inline asm - ld.const.u32 %r4977, [matrix+3264]; - // begin inline asm - dp4a.u32.u32 %r4976, %r4977, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4981, [matrix+3268]; - // begin inline asm - dp4a.u32.u32 %r4980, %r4981, %r5750, %r4976; - // end inline asm - ld.const.u32 %r4985, [matrix+3272]; - // begin inline asm - dp4a.u32.u32 %r4984, %r4985, %r5754, %r4980; - // end inline asm - ld.const.u32 %r4989, [matrix+3276]; - // begin inline asm - dp4a.u32.u32 %r4988, %r4989, %r5758, %r4984; - // end inline asm - ld.const.u32 %r4993, [matrix+3280]; - // begin inline asm - dp4a.u32.u32 %r4992, %r4993, %r5762, %r4988; - // end inline asm - ld.const.u32 %r4997, [matrix+3284]; - // begin inline asm - dp4a.u32.u32 %r4996, %r4997, %r5766, %r4992; - // end inline asm - ld.const.u32 %r5001, [matrix+3288]; - // begin inline asm - dp4a.u32.u32 %r5000, %r5001, %r5770, %r4996; - // end inline asm - ld.const.u32 %r5005, [matrix+3292]; - // begin inline asm - dp4a.u32.u32 %r5004, %r5005, %r5774, %r5000; - // end inline asm - ld.const.u32 %r5009, [matrix+3296]; - // begin inline asm - dp4a.u32.u32 %r5008, %r5009, %r5778, %r5004; - // end inline asm - ld.const.u32 %r5013, [matrix+3300]; - // begin inline asm - dp4a.u32.u32 %r5012, %r5013, %r5782, %r5008; - // end inline asm - ld.const.u32 %r5017, [matrix+3304]; - // begin inline asm - dp4a.u32.u32 %r5016, %r5017, %r5786, %r5012; - // end inline asm - ld.const.u32 %r5021, [matrix+3308]; - // begin inline asm - dp4a.u32.u32 %r5020, %r5021, %r5790, %r5016; - // end inline asm - ld.const.u32 %r5025, [matrix+3312]; - // begin inline asm - dp4a.u32.u32 %r5024, %r5025, %r5794, %r5020; - // end inline asm - ld.const.u32 %r5029, [matrix+3316]; - // begin inline asm - dp4a.u32.u32 %r5028, %r5029, %r5798, %r5024; - // end inline asm - ld.const.u32 %r5033, [matrix+3320]; - // begin inline asm - dp4a.u32.u32 %r5032, %r5033, %r5802, %r5028; - 
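Each pair of row products is then truncated to 4 bits apiece and packed into one output byte; that is what the recurring shr.u32 6 / and.b32 240 / shr.u32 10 / or.b32 sequences compute (the left nibble's shift and mask are folded together by the compiler). An equivalent scalar form, assuming the heavyhash property that each product fits well under 2^14 so the low nibble needs no extra masking:

    // ((hi >> 10) & 0xF) << 4 is folded into (hi >> 6) & 0xF0 above.
    __device__ uint8_t pack_products(uint32_t hi, uint32_t lo) {
        return (uint8_t)(((hi >> 6) & 0xF0) | ((lo >> 10) & 0x0F));
    }

The occasional and.b32 ..., 255 seen in some chains is the same masking, applied before the cvt.u64.u32 widening rather than after the or.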
// end inline asm - ld.const.u32 %r5037, [matrix+3324]; - // begin inline asm - dp4a.u32.u32 %r5036, %r5037, %r5806, %r5032; - // end inline asm - shr.u32 %r6073, %r4972, 6; - and.b32 %r6074, %r6073, 240; - shr.u32 %r6075, %r5036, 10; - or.b32 %r6076, %r6075, %r6074; - cvt.u64.u32 %rd230, %r6076; - xor.b64 %rd231, %rd201, %rd230; - and.b64 %rd232, %rd9, 255; - xor.b64 %rd233, %rd232, %rd229; - ld.const.u32 %r5041, [matrix+3328]; - // begin inline asm - dp4a.u32.u32 %r5040, %r5041, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5045, [matrix+3332]; - // begin inline asm - dp4a.u32.u32 %r5044, %r5045, %r5750, %r5040; - // end inline asm - ld.const.u32 %r5049, [matrix+3336]; - // begin inline asm - dp4a.u32.u32 %r5048, %r5049, %r5754, %r5044; - // end inline asm - ld.const.u32 %r5053, [matrix+3340]; - // begin inline asm - dp4a.u32.u32 %r5052, %r5053, %r5758, %r5048; - // end inline asm - ld.const.u32 %r5057, [matrix+3344]; - // begin inline asm - dp4a.u32.u32 %r5056, %r5057, %r5762, %r5052; - // end inline asm - ld.const.u32 %r5061, [matrix+3348]; - // begin inline asm - dp4a.u32.u32 %r5060, %r5061, %r5766, %r5056; - // end inline asm - ld.const.u32 %r5065, [matrix+3352]; - // begin inline asm - dp4a.u32.u32 %r5064, %r5065, %r5770, %r5060; - // end inline asm - ld.const.u32 %r5069, [matrix+3356]; - // begin inline asm - dp4a.u32.u32 %r5068, %r5069, %r5774, %r5064; - // end inline asm - ld.const.u32 %r5073, [matrix+3360]; - // begin inline asm - dp4a.u32.u32 %r5072, %r5073, %r5778, %r5068; - // end inline asm - ld.const.u32 %r5077, [matrix+3364]; - // begin inline asm - dp4a.u32.u32 %r5076, %r5077, %r5782, %r5072; - // end inline asm - ld.const.u32 %r5081, [matrix+3368]; - // begin inline asm - dp4a.u32.u32 %r5080, %r5081, %r5786, %r5076; - // end inline asm - ld.const.u32 %r5085, [matrix+3372]; - // begin inline asm - dp4a.u32.u32 %r5084, %r5085, %r5790, %r5080; - // end inline asm - ld.const.u32 %r5089, [matrix+3376]; - // begin inline asm - dp4a.u32.u32 %r5088, %r5089, %r5794, %r5084; - // end inline asm - ld.const.u32 %r5093, [matrix+3380]; - // begin inline asm - dp4a.u32.u32 %r5092, %r5093, %r5798, %r5088; - // end inline asm - ld.const.u32 %r5097, [matrix+3384]; - // begin inline asm - dp4a.u32.u32 %r5096, %r5097, %r5802, %r5092; - // end inline asm - ld.const.u32 %r5101, [matrix+3388]; - // begin inline asm - dp4a.u32.u32 %r5100, %r5101, %r5806, %r5096; - // end inline asm - ld.const.u32 %r5105, [matrix+3392]; - // begin inline asm - dp4a.u32.u32 %r5104, %r5105, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5109, [matrix+3396]; - // begin inline asm - dp4a.u32.u32 %r5108, %r5109, %r5750, %r5104; - // end inline asm - ld.const.u32 %r5113, [matrix+3400]; - // begin inline asm - dp4a.u32.u32 %r5112, %r5113, %r5754, %r5108; - // end inline asm - ld.const.u32 %r5117, [matrix+3404]; - // begin inline asm - dp4a.u32.u32 %r5116, %r5117, %r5758, %r5112; - // end inline asm - ld.const.u32 %r5121, [matrix+3408]; - // begin inline asm - dp4a.u32.u32 %r5120, %r5121, %r5762, %r5116; - // end inline asm - ld.const.u32 %r5125, [matrix+3412]; - // begin inline asm - dp4a.u32.u32 %r5124, %r5125, %r5766, %r5120; - // end inline asm - ld.const.u32 %r5129, [matrix+3416]; - // begin inline asm - dp4a.u32.u32 %r5128, %r5129, %r5770, %r5124; - // end inline asm - ld.const.u32 %r5133, [matrix+3420]; - // begin inline asm - dp4a.u32.u32 %r5132, %r5133, %r5774, %r5128; - // end inline asm - ld.const.u32 %r5137, [matrix+3424]; - // begin inline asm - dp4a.u32.u32 %r5136, %r5137, %r5778, %r5132; - // 
end inline asm - ld.const.u32 %r5141, [matrix+3428]; - // begin inline asm - dp4a.u32.u32 %r5140, %r5141, %r5782, %r5136; - // end inline asm - ld.const.u32 %r5145, [matrix+3432]; - // begin inline asm - dp4a.u32.u32 %r5144, %r5145, %r5786, %r5140; - // end inline asm - ld.const.u32 %r5149, [matrix+3436]; - // begin inline asm - dp4a.u32.u32 %r5148, %r5149, %r5790, %r5144; - // end inline asm - ld.const.u32 %r5153, [matrix+3440]; - // begin inline asm - dp4a.u32.u32 %r5152, %r5153, %r5794, %r5148; - // end inline asm - ld.const.u32 %r5157, [matrix+3444]; - // begin inline asm - dp4a.u32.u32 %r5156, %r5157, %r5798, %r5152; - // end inline asm - ld.const.u32 %r5161, [matrix+3448]; - // begin inline asm - dp4a.u32.u32 %r5160, %r5161, %r5802, %r5156; - // end inline asm - ld.const.u32 %r5165, [matrix+3452]; - // begin inline asm - dp4a.u32.u32 %r5164, %r5165, %r5806, %r5160; - // end inline asm - shr.u32 %r6077, %r5100, 6; - and.b32 %r6078, %r6077, 240; - shr.u32 %r6079, %r5164, 10; - or.b32 %r6080, %r6079, %r6078; - cvt.u64.u32 %rd234, %r6080; - xor.b64 %rd235, %rd202, %rd234; - ld.const.u32 %r5169, [matrix+3456]; - // begin inline asm - dp4a.u32.u32 %r5168, %r5169, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5173, [matrix+3460]; - // begin inline asm - dp4a.u32.u32 %r5172, %r5173, %r5750, %r5168; - // end inline asm - ld.const.u32 %r5177, [matrix+3464]; - // begin inline asm - dp4a.u32.u32 %r5176, %r5177, %r5754, %r5172; - // end inline asm - ld.const.u32 %r5181, [matrix+3468]; - // begin inline asm - dp4a.u32.u32 %r5180, %r5181, %r5758, %r5176; - // end inline asm - ld.const.u32 %r5185, [matrix+3472]; - // begin inline asm - dp4a.u32.u32 %r5184, %r5185, %r5762, %r5180; - // end inline asm - ld.const.u32 %r5189, [matrix+3476]; - // begin inline asm - dp4a.u32.u32 %r5188, %r5189, %r5766, %r5184; - // end inline asm - ld.const.u32 %r5193, [matrix+3480]; - // begin inline asm - dp4a.u32.u32 %r5192, %r5193, %r5770, %r5188; - // end inline asm - ld.const.u32 %r5197, [matrix+3484]; - // begin inline asm - dp4a.u32.u32 %r5196, %r5197, %r5774, %r5192; - // end inline asm - ld.const.u32 %r5201, [matrix+3488]; - // begin inline asm - dp4a.u32.u32 %r5200, %r5201, %r5778, %r5196; - // end inline asm - ld.const.u32 %r5205, [matrix+3492]; - // begin inline asm - dp4a.u32.u32 %r5204, %r5205, %r5782, %r5200; - // end inline asm - ld.const.u32 %r5209, [matrix+3496]; - // begin inline asm - dp4a.u32.u32 %r5208, %r5209, %r5786, %r5204; - // end inline asm - ld.const.u32 %r5213, [matrix+3500]; - // begin inline asm - dp4a.u32.u32 %r5212, %r5213, %r5790, %r5208; - // end inline asm - ld.const.u32 %r5217, [matrix+3504]; - // begin inline asm - dp4a.u32.u32 %r5216, %r5217, %r5794, %r5212; - // end inline asm - ld.const.u32 %r5221, [matrix+3508]; - // begin inline asm - dp4a.u32.u32 %r5220, %r5221, %r5798, %r5216; - // end inline asm - ld.const.u32 %r5225, [matrix+3512]; - // begin inline asm - dp4a.u32.u32 %r5224, %r5225, %r5802, %r5220; - // end inline asm - ld.const.u32 %r5229, [matrix+3516]; - // begin inline asm - dp4a.u32.u32 %r5228, %r5229, %r5806, %r5224; - // end inline asm - ld.const.u32 %r5233, [matrix+3520]; - // begin inline asm - dp4a.u32.u32 %r5232, %r5233, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5237, [matrix+3524]; - // begin inline asm - dp4a.u32.u32 %r5236, %r5237, %r5750, %r5232; - // end inline asm - ld.const.u32 %r5241, [matrix+3528]; - // begin inline asm - dp4a.u32.u32 %r5240, %r5241, %r5754, %r5236; - // end inline asm - ld.const.u32 %r5245, [matrix+3532]; - // begin 
inline asm - dp4a.u32.u32 %r5244, %r5245, %r5758, %r5240; - // end inline asm - ld.const.u32 %r5249, [matrix+3536]; - // begin inline asm - dp4a.u32.u32 %r5248, %r5249, %r5762, %r5244; - // end inline asm - ld.const.u32 %r5253, [matrix+3540]; - // begin inline asm - dp4a.u32.u32 %r5252, %r5253, %r5766, %r5248; - // end inline asm - ld.const.u32 %r5257, [matrix+3544]; - // begin inline asm - dp4a.u32.u32 %r5256, %r5257, %r5770, %r5252; - // end inline asm - ld.const.u32 %r5261, [matrix+3548]; - // begin inline asm - dp4a.u32.u32 %r5260, %r5261, %r5774, %r5256; - // end inline asm - ld.const.u32 %r5265, [matrix+3552]; - // begin inline asm - dp4a.u32.u32 %r5264, %r5265, %r5778, %r5260; - // end inline asm - ld.const.u32 %r5269, [matrix+3556]; - // begin inline asm - dp4a.u32.u32 %r5268, %r5269, %r5782, %r5264; - // end inline asm - ld.const.u32 %r5273, [matrix+3560]; - // begin inline asm - dp4a.u32.u32 %r5272, %r5273, %r5786, %r5268; - // end inline asm - ld.const.u32 %r5277, [matrix+3564]; - // begin inline asm - dp4a.u32.u32 %r5276, %r5277, %r5790, %r5272; - // end inline asm - ld.const.u32 %r5281, [matrix+3568]; - // begin inline asm - dp4a.u32.u32 %r5280, %r5281, %r5794, %r5276; - // end inline asm - ld.const.u32 %r5285, [matrix+3572]; - // begin inline asm - dp4a.u32.u32 %r5284, %r5285, %r5798, %r5280; - // end inline asm - ld.const.u32 %r5289, [matrix+3576]; - // begin inline asm - dp4a.u32.u32 %r5288, %r5289, %r5802, %r5284; - // end inline asm - ld.const.u32 %r5293, [matrix+3580]; - // begin inline asm - dp4a.u32.u32 %r5292, %r5293, %r5806, %r5288; - // end inline asm - shr.u32 %r6081, %r5228, 6; - and.b32 %r6082, %r6081, 240; - shr.u32 %r6083, %r5292, 10; - or.b32 %r6084, %r6083, %r6082; - cvt.u64.u32 %rd236, %r6084; - xor.b64 %rd237, %rd203, %rd236; - ld.const.u32 %r5297, [matrix+3584]; - // begin inline asm - dp4a.u32.u32 %r5296, %r5297, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5301, [matrix+3588]; - // begin inline asm - dp4a.u32.u32 %r5300, %r5301, %r5750, %r5296; - // end inline asm - ld.const.u32 %r5305, [matrix+3592]; - // begin inline asm - dp4a.u32.u32 %r5304, %r5305, %r5754, %r5300; - // end inline asm - ld.const.u32 %r5309, [matrix+3596]; - // begin inline asm - dp4a.u32.u32 %r5308, %r5309, %r5758, %r5304; - // end inline asm - ld.const.u32 %r5313, [matrix+3600]; - // begin inline asm - dp4a.u32.u32 %r5312, %r5313, %r5762, %r5308; - // end inline asm - ld.const.u32 %r5317, [matrix+3604]; - // begin inline asm - dp4a.u32.u32 %r5316, %r5317, %r5766, %r5312; - // end inline asm - ld.const.u32 %r5321, [matrix+3608]; - // begin inline asm - dp4a.u32.u32 %r5320, %r5321, %r5770, %r5316; - // end inline asm - ld.const.u32 %r5325, [matrix+3612]; - // begin inline asm - dp4a.u32.u32 %r5324, %r5325, %r5774, %r5320; - // end inline asm - ld.const.u32 %r5329, [matrix+3616]; - // begin inline asm - dp4a.u32.u32 %r5328, %r5329, %r5778, %r5324; - // end inline asm - ld.const.u32 %r5333, [matrix+3620]; - // begin inline asm - dp4a.u32.u32 %r5332, %r5333, %r5782, %r5328; - // end inline asm - ld.const.u32 %r5337, [matrix+3624]; - // begin inline asm - dp4a.u32.u32 %r5336, %r5337, %r5786, %r5332; - // end inline asm - ld.const.u32 %r5341, [matrix+3628]; - // begin inline asm - dp4a.u32.u32 %r5340, %r5341, %r5790, %r5336; - // end inline asm - ld.const.u32 %r5345, [matrix+3632]; - // begin inline asm - dp4a.u32.u32 %r5344, %r5345, %r5794, %r5340; - // end inline asm - ld.const.u32 %r5349, [matrix+3636]; - // begin inline asm - dp4a.u32.u32 %r5348, %r5349, %r5798, %r5344; - // 
end inline asm - ld.const.u32 %r5353, [matrix+3640]; - // begin inline asm - dp4a.u32.u32 %r5352, %r5353, %r5802, %r5348; - // end inline asm - ld.const.u32 %r5357, [matrix+3644]; - // begin inline asm - dp4a.u32.u32 %r5356, %r5357, %r5806, %r5352; - // end inline asm - ld.const.u32 %r5361, [matrix+3648]; - // begin inline asm - dp4a.u32.u32 %r5360, %r5361, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5365, [matrix+3652]; - // begin inline asm - dp4a.u32.u32 %r5364, %r5365, %r5750, %r5360; - // end inline asm - ld.const.u32 %r5369, [matrix+3656]; - // begin inline asm - dp4a.u32.u32 %r5368, %r5369, %r5754, %r5364; - // end inline asm - ld.const.u32 %r5373, [matrix+3660]; - // begin inline asm - dp4a.u32.u32 %r5372, %r5373, %r5758, %r5368; - // end inline asm - ld.const.u32 %r5377, [matrix+3664]; - // begin inline asm - dp4a.u32.u32 %r5376, %r5377, %r5762, %r5372; - // end inline asm - ld.const.u32 %r5381, [matrix+3668]; - // begin inline asm - dp4a.u32.u32 %r5380, %r5381, %r5766, %r5376; - // end inline asm - ld.const.u32 %r5385, [matrix+3672]; - // begin inline asm - dp4a.u32.u32 %r5384, %r5385, %r5770, %r5380; - // end inline asm - ld.const.u32 %r5389, [matrix+3676]; - // begin inline asm - dp4a.u32.u32 %r5388, %r5389, %r5774, %r5384; - // end inline asm - ld.const.u32 %r5393, [matrix+3680]; - // begin inline asm - dp4a.u32.u32 %r5392, %r5393, %r5778, %r5388; - // end inline asm - ld.const.u32 %r5397, [matrix+3684]; - // begin inline asm - dp4a.u32.u32 %r5396, %r5397, %r5782, %r5392; - // end inline asm - ld.const.u32 %r5401, [matrix+3688]; - // begin inline asm - dp4a.u32.u32 %r5400, %r5401, %r5786, %r5396; - // end inline asm - ld.const.u32 %r5405, [matrix+3692]; - // begin inline asm - dp4a.u32.u32 %r5404, %r5405, %r5790, %r5400; - // end inline asm - ld.const.u32 %r5409, [matrix+3696]; - // begin inline asm - dp4a.u32.u32 %r5408, %r5409, %r5794, %r5404; - // end inline asm - ld.const.u32 %r5413, [matrix+3700]; - // begin inline asm - dp4a.u32.u32 %r5412, %r5413, %r5798, %r5408; - // end inline asm - ld.const.u32 %r5417, [matrix+3704]; - // begin inline asm - dp4a.u32.u32 %r5416, %r5417, %r5802, %r5412; - // end inline asm - ld.const.u32 %r5421, [matrix+3708]; - // begin inline asm - dp4a.u32.u32 %r5420, %r5421, %r5806, %r5416; - // end inline asm - shr.u32 %r6085, %r5356, 6; - and.b32 %r6086, %r6085, 240; - shr.u32 %r6087, %r5420, 10; - or.b32 %r6088, %r6087, %r6086; - cvt.u64.u32 %rd238, %r6088; - xor.b64 %rd239, %rd204, %rd238; - ld.const.u32 %r5425, [matrix+3712]; - // begin inline asm - dp4a.u32.u32 %r5424, %r5425, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5429, [matrix+3716]; - // begin inline asm - dp4a.u32.u32 %r5428, %r5429, %r5750, %r5424; - // end inline asm - ld.const.u32 %r5433, [matrix+3720]; - // begin inline asm - dp4a.u32.u32 %r5432, %r5433, %r5754, %r5428; - // end inline asm - ld.const.u32 %r5437, [matrix+3724]; - // begin inline asm - dp4a.u32.u32 %r5436, %r5437, %r5758, %r5432; - // end inline asm - ld.const.u32 %r5441, [matrix+3728]; - // begin inline asm - dp4a.u32.u32 %r5440, %r5441, %r5762, %r5436; - // end inline asm - ld.const.u32 %r5445, [matrix+3732]; - // begin inline asm - dp4a.u32.u32 %r5444, %r5445, %r5766, %r5440; - // end inline asm - ld.const.u32 %r5449, [matrix+3736]; - // begin inline asm - dp4a.u32.u32 %r5448, %r5449, %r5770, %r5444; - // end inline asm - ld.const.u32 %r5453, [matrix+3740]; - // begin inline asm - dp4a.u32.u32 %r5452, %r5453, %r5774, %r5448; - // end inline asm - ld.const.u32 %r5457, [matrix+3744]; - // begin 
inline asm - dp4a.u32.u32 %r5456, %r5457, %r5778, %r5452; - // end inline asm - ld.const.u32 %r5461, [matrix+3748]; - // begin inline asm - dp4a.u32.u32 %r5460, %r5461, %r5782, %r5456; - // end inline asm - ld.const.u32 %r5465, [matrix+3752]; - // begin inline asm - dp4a.u32.u32 %r5464, %r5465, %r5786, %r5460; - // end inline asm - ld.const.u32 %r5469, [matrix+3756]; - // begin inline asm - dp4a.u32.u32 %r5468, %r5469, %r5790, %r5464; - // end inline asm - ld.const.u32 %r5473, [matrix+3760]; - // begin inline asm - dp4a.u32.u32 %r5472, %r5473, %r5794, %r5468; - // end inline asm - ld.const.u32 %r5477, [matrix+3764]; - // begin inline asm - dp4a.u32.u32 %r5476, %r5477, %r5798, %r5472; - // end inline asm - ld.const.u32 %r5481, [matrix+3768]; - // begin inline asm - dp4a.u32.u32 %r5480, %r5481, %r5802, %r5476; - // end inline asm - ld.const.u32 %r5485, [matrix+3772]; - // begin inline asm - dp4a.u32.u32 %r5484, %r5485, %r5806, %r5480; - // end inline asm - ld.const.u32 %r5489, [matrix+3776]; - // begin inline asm - dp4a.u32.u32 %r5488, %r5489, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5493, [matrix+3780]; - // begin inline asm - dp4a.u32.u32 %r5492, %r5493, %r5750, %r5488; - // end inline asm - ld.const.u32 %r5497, [matrix+3784]; - // begin inline asm - dp4a.u32.u32 %r5496, %r5497, %r5754, %r5492; - // end inline asm - ld.const.u32 %r5501, [matrix+3788]; - // begin inline asm - dp4a.u32.u32 %r5500, %r5501, %r5758, %r5496; - // end inline asm - ld.const.u32 %r5505, [matrix+3792]; - // begin inline asm - dp4a.u32.u32 %r5504, %r5505, %r5762, %r5500; - // end inline asm - ld.const.u32 %r5509, [matrix+3796]; - // begin inline asm - dp4a.u32.u32 %r5508, %r5509, %r5766, %r5504; - // end inline asm - ld.const.u32 %r5513, [matrix+3800]; - // begin inline asm - dp4a.u32.u32 %r5512, %r5513, %r5770, %r5508; - // end inline asm - ld.const.u32 %r5517, [matrix+3804]; - // begin inline asm - dp4a.u32.u32 %r5516, %r5517, %r5774, %r5512; - // end inline asm - ld.const.u32 %r5521, [matrix+3808]; - // begin inline asm - dp4a.u32.u32 %r5520, %r5521, %r5778, %r5516; - // end inline asm - ld.const.u32 %r5525, [matrix+3812]; - // begin inline asm - dp4a.u32.u32 %r5524, %r5525, %r5782, %r5520; - // end inline asm - ld.const.u32 %r5529, [matrix+3816]; - // begin inline asm - dp4a.u32.u32 %r5528, %r5529, %r5786, %r5524; - // end inline asm - ld.const.u32 %r5533, [matrix+3820]; - // begin inline asm - dp4a.u32.u32 %r5532, %r5533, %r5790, %r5528; - // end inline asm - ld.const.u32 %r5537, [matrix+3824]; - // begin inline asm - dp4a.u32.u32 %r5536, %r5537, %r5794, %r5532; - // end inline asm - ld.const.u32 %r5541, [matrix+3828]; - // begin inline asm - dp4a.u32.u32 %r5540, %r5541, %r5798, %r5536; - // end inline asm - ld.const.u32 %r5545, [matrix+3832]; - // begin inline asm - dp4a.u32.u32 %r5544, %r5545, %r5802, %r5540; - // end inline asm - ld.const.u32 %r5549, [matrix+3836]; - // begin inline asm - dp4a.u32.u32 %r5548, %r5549, %r5806, %r5544; - // end inline asm - shr.u32 %r6089, %r5484, 6; - and.b32 %r6090, %r6089, 240; - shr.u32 %r6091, %r5548, 10; - or.b32 %r6092, %r6091, %r6090; - cvt.u64.u32 %rd240, %r6092; - xor.b64 %rd241, %rd206, %rd240; - ld.const.u32 %r5553, [matrix+3840]; - // begin inline asm - dp4a.u32.u32 %r5552, %r5553, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5557, [matrix+3844]; - // begin inline asm - dp4a.u32.u32 %r5556, %r5557, %r5750, %r5552; - // end inline asm - ld.const.u32 %r5561, [matrix+3848]; - // begin inline asm - dp4a.u32.u32 %r5560, %r5561, %r5754, %r5556; - // 
end inline asm - ld.const.u32 %r5565, [matrix+3852]; - // begin inline asm - dp4a.u32.u32 %r5564, %r5565, %r5758, %r5560; - // end inline asm - ld.const.u32 %r5569, [matrix+3856]; - // begin inline asm - dp4a.u32.u32 %r5568, %r5569, %r5762, %r5564; - // end inline asm - ld.const.u32 %r5573, [matrix+3860]; - // begin inline asm - dp4a.u32.u32 %r5572, %r5573, %r5766, %r5568; - // end inline asm - ld.const.u32 %r5577, [matrix+3864]; - // begin inline asm - dp4a.u32.u32 %r5576, %r5577, %r5770, %r5572; - // end inline asm - ld.const.u32 %r5581, [matrix+3868]; - // begin inline asm - dp4a.u32.u32 %r5580, %r5581, %r5774, %r5576; - // end inline asm - ld.const.u32 %r5585, [matrix+3872]; - // begin inline asm - dp4a.u32.u32 %r5584, %r5585, %r5778, %r5580; - // end inline asm - ld.const.u32 %r5589, [matrix+3876]; - // begin inline asm - dp4a.u32.u32 %r5588, %r5589, %r5782, %r5584; - // end inline asm - ld.const.u32 %r5593, [matrix+3880]; - // begin inline asm - dp4a.u32.u32 %r5592, %r5593, %r5786, %r5588; - // end inline asm - ld.const.u32 %r5597, [matrix+3884]; - // begin inline asm - dp4a.u32.u32 %r5596, %r5597, %r5790, %r5592; - // end inline asm - ld.const.u32 %r5601, [matrix+3888]; - // begin inline asm - dp4a.u32.u32 %r5600, %r5601, %r5794, %r5596; - // end inline asm - ld.const.u32 %r5605, [matrix+3892]; - // begin inline asm - dp4a.u32.u32 %r5604, %r5605, %r5798, %r5600; - // end inline asm - ld.const.u32 %r5609, [matrix+3896]; - // begin inline asm - dp4a.u32.u32 %r5608, %r5609, %r5802, %r5604; - // end inline asm - ld.const.u32 %r5613, [matrix+3900]; - // begin inline asm - dp4a.u32.u32 %r5612, %r5613, %r5806, %r5608; - // end inline asm - ld.const.u32 %r5617, [matrix+3904]; - // begin inline asm - dp4a.u32.u32 %r5616, %r5617, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5621, [matrix+3908]; - // begin inline asm - dp4a.u32.u32 %r5620, %r5621, %r5750, %r5616; - // end inline asm - ld.const.u32 %r5625, [matrix+3912]; - // begin inline asm - dp4a.u32.u32 %r5624, %r5625, %r5754, %r5620; - // end inline asm - ld.const.u32 %r5629, [matrix+3916]; - // begin inline asm - dp4a.u32.u32 %r5628, %r5629, %r5758, %r5624; - // end inline asm - ld.const.u32 %r5633, [matrix+3920]; - // begin inline asm - dp4a.u32.u32 %r5632, %r5633, %r5762, %r5628; - // end inline asm - ld.const.u32 %r5637, [matrix+3924]; - // begin inline asm - dp4a.u32.u32 %r5636, %r5637, %r5766, %r5632; - // end inline asm - ld.const.u32 %r5641, [matrix+3928]; - // begin inline asm - dp4a.u32.u32 %r5640, %r5641, %r5770, %r5636; - // end inline asm - ld.const.u32 %r5645, [matrix+3932]; - // begin inline asm - dp4a.u32.u32 %r5644, %r5645, %r5774, %r5640; - // end inline asm - ld.const.u32 %r5649, [matrix+3936]; - // begin inline asm - dp4a.u32.u32 %r5648, %r5649, %r5778, %r5644; - // end inline asm - ld.const.u32 %r5653, [matrix+3940]; - // begin inline asm - dp4a.u32.u32 %r5652, %r5653, %r5782, %r5648; - // end inline asm - ld.const.u32 %r5657, [matrix+3944]; - // begin inline asm - dp4a.u32.u32 %r5656, %r5657, %r5786, %r5652; - // end inline asm - ld.const.u32 %r5661, [matrix+3948]; - // begin inline asm - dp4a.u32.u32 %r5660, %r5661, %r5790, %r5656; - // end inline asm - ld.const.u32 %r5665, [matrix+3952]; - // begin inline asm - dp4a.u32.u32 %r5664, %r5665, %r5794, %r5660; - // end inline asm - ld.const.u32 %r5669, [matrix+3956]; - // begin inline asm - dp4a.u32.u32 %r5668, %r5669, %r5798, %r5664; - // end inline asm - ld.const.u32 %r5673, [matrix+3960]; - // begin inline asm - dp4a.u32.u32 %r5672, %r5673, %r5802, %r5668; - 
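A few diff lines below, the shl.b64/and.b64/or.b64 chains gather eight of these packed bytes into each 64-bit word, most-significant byte first (bit 56 down to bit 0), XOR the words with fixed compile-time constants such as 4239941492252378377, and then the mov.u64 block seeds the remaining lanes, which looks like a sponge state precomputed at build time from the fixed prefix. A scalar sketch of the packing under that byte order (pack_lane is an illustrative name):

    __device__ uint64_t pack_lane(const uint8_t b[8]) {
        uint64_t lane = 0;
        for (int i = 0; i < 8; ++i)
            lane = (lane << 8) | b[i];   // b[0] lands in bits 56..63, as in the shl.b64 56 above
        return lane;
    }

Usage, matching the xor.b64 lines below: state_word = pack_lane(bytes) ^ baked_constant.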
// end inline asm - ld.const.u32 %r5677, [matrix+3964]; - // begin inline asm - dp4a.u32.u32 %r5676, %r5677, %r5806, %r5672; - // end inline asm - shr.u32 %r6093, %r5612, 6; - and.b32 %r6094, %r6093, 240; - shr.u32 %r6095, %r5676, 10; - or.b32 %r6096, %r6095, %r6094; - cvt.u64.u32 %rd242, %r6096; - xor.b64 %rd243, %rd208, %rd242; - ld.const.u32 %r5681, [matrix+3968]; - // begin inline asm - dp4a.u32.u32 %r5680, %r5681, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5685, [matrix+3972]; - // begin inline asm - dp4a.u32.u32 %r5684, %r5685, %r5750, %r5680; - // end inline asm - ld.const.u32 %r5689, [matrix+3976]; - // begin inline asm - dp4a.u32.u32 %r5688, %r5689, %r5754, %r5684; - // end inline asm - ld.const.u32 %r5693, [matrix+3980]; - // begin inline asm - dp4a.u32.u32 %r5692, %r5693, %r5758, %r5688; - // end inline asm - ld.const.u32 %r5697, [matrix+3984]; - // begin inline asm - dp4a.u32.u32 %r5696, %r5697, %r5762, %r5692; - // end inline asm - ld.const.u32 %r5701, [matrix+3988]; - // begin inline asm - dp4a.u32.u32 %r5700, %r5701, %r5766, %r5696; - // end inline asm - ld.const.u32 %r5705, [matrix+3992]; - // begin inline asm - dp4a.u32.u32 %r5704, %r5705, %r5770, %r5700; - // end inline asm - ld.const.u32 %r5709, [matrix+3996]; - // begin inline asm - dp4a.u32.u32 %r5708, %r5709, %r5774, %r5704; - // end inline asm - ld.const.u32 %r5713, [matrix+4000]; - // begin inline asm - dp4a.u32.u32 %r5712, %r5713, %r5778, %r5708; - // end inline asm - ld.const.u32 %r5717, [matrix+4004]; - // begin inline asm - dp4a.u32.u32 %r5716, %r5717, %r5782, %r5712; - // end inline asm - ld.const.u32 %r5721, [matrix+4008]; - // begin inline asm - dp4a.u32.u32 %r5720, %r5721, %r5786, %r5716; - // end inline asm - ld.const.u32 %r5725, [matrix+4012]; - // begin inline asm - dp4a.u32.u32 %r5724, %r5725, %r5790, %r5720; - // end inline asm - ld.const.u32 %r5729, [matrix+4016]; - // begin inline asm - dp4a.u32.u32 %r5728, %r5729, %r5794, %r5724; - // end inline asm - ld.const.u32 %r5733, [matrix+4020]; - // begin inline asm - dp4a.u32.u32 %r5732, %r5733, %r5798, %r5728; - // end inline asm - ld.const.u32 %r5737, [matrix+4024]; - // begin inline asm - dp4a.u32.u32 %r5736, %r5737, %r5802, %r5732; - // end inline asm - ld.const.u32 %r5741, [matrix+4028]; - // begin inline asm - dp4a.u32.u32 %r5740, %r5741, %r5806, %r5736; - // end inline asm - ld.const.u32 %r5745, [matrix+4032]; - // begin inline asm - dp4a.u32.u32 %r5744, %r5745, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5749, [matrix+4036]; - // begin inline asm - dp4a.u32.u32 %r5748, %r5749, %r5750, %r5744; - // end inline asm - ld.const.u32 %r5753, [matrix+4040]; - // begin inline asm - dp4a.u32.u32 %r5752, %r5753, %r5754, %r5748; - // end inline asm - ld.const.u32 %r5757, [matrix+4044]; - // begin inline asm - dp4a.u32.u32 %r5756, %r5757, %r5758, %r5752; - // end inline asm - ld.const.u32 %r5761, [matrix+4048]; - // begin inline asm - dp4a.u32.u32 %r5760, %r5761, %r5762, %r5756; - // end inline asm - ld.const.u32 %r5765, [matrix+4052]; - // begin inline asm - dp4a.u32.u32 %r5764, %r5765, %r5766, %r5760; - // end inline asm - ld.const.u32 %r5769, [matrix+4056]; - // begin inline asm - dp4a.u32.u32 %r5768, %r5769, %r5770, %r5764; - // end inline asm - ld.const.u32 %r5773, [matrix+4060]; - // begin inline asm - dp4a.u32.u32 %r5772, %r5773, %r5774, %r5768; - // end inline asm - ld.const.u32 %r5777, [matrix+4064]; - // begin inline asm - dp4a.u32.u32 %r5776, %r5777, %r5778, %r5772; - // end inline asm - ld.const.u32 %r5781, [matrix+4068]; - // 
begin inline asm - dp4a.u32.u32 %r5780, %r5781, %r5782, %r5776; - // end inline asm - ld.const.u32 %r5785, [matrix+4072]; - // begin inline asm - dp4a.u32.u32 %r5784, %r5785, %r5786, %r5780; - // end inline asm - ld.const.u32 %r5789, [matrix+4076]; - // begin inline asm - dp4a.u32.u32 %r5788, %r5789, %r5790, %r5784; - // end inline asm - ld.const.u32 %r5793, [matrix+4080]; - // begin inline asm - dp4a.u32.u32 %r5792, %r5793, %r5794, %r5788; - // end inline asm - ld.const.u32 %r5797, [matrix+4084]; - // begin inline asm - dp4a.u32.u32 %r5796, %r5797, %r5798, %r5792; - // end inline asm - ld.const.u32 %r5801, [matrix+4088]; - // begin inline asm - dp4a.u32.u32 %r5800, %r5801, %r5802, %r5796; - // end inline asm - ld.const.u32 %r5805, [matrix+4092]; - // begin inline asm - dp4a.u32.u32 %r5804, %r5805, %r5806, %r5800; - // end inline asm - shr.u32 %r6097, %r5740, 6; - and.b32 %r6098, %r6097, 240; - shr.u32 %r6099, %r5804, 10; - or.b32 %r6100, %r6099, %r6098; - cvt.u64.u32 %rd244, %r6100; - xor.b64 %rd245, %rd210, %rd244; - shl.b32 %r6101, %r5985, 24; - cvt.u64.u32 %rd246, %r6101; - shl.b32 %r6102, %r5980, 16; - and.b32 %r6103, %r6102, 16711680; - cvt.u64.u32 %rd247, %r6103; - shl.b32 %r6104, %r5975, 8; - and.b32 %r6105, %r6104, 65280; - cvt.u64.u32 %rd248, %r6105; - and.b32 %r6106, %r5970, 255; - cvt.u64.u32 %rd249, %r6106; - shl.b32 %r6107, %r6019, 24; - cvt.u64.u32 %rd250, %r6107; - shl.b32 %r6108, %r6014, 16; - and.b32 %r6109, %r6108, 16711680; - cvt.u64.u32 %rd251, %r6109; - shl.b32 %r6110, %r6009, 8; - and.b32 %r6111, %r6110, 65280; - cvt.u64.u32 %rd252, %r6111; - and.b32 %r6112, %r6004, 255; - cvt.u64.u32 %rd253, %r6112; - shl.b32 %r6113, %r6053, 24; - cvt.u64.u32 %rd254, %r6113; - shl.b32 %r6114, %r6048, 16; - and.b32 %r6115, %r6114, 16711680; - cvt.u64.u32 %rd255, %r6115; - shl.b32 %r6116, %r6043, 8; - and.b32 %r6117, %r6116, 65280; - cvt.u64.u32 %rd256, %r6117; - and.b32 %r6118, %r6038, 255; - cvt.u64.u32 %rd257, %r6118; - shr.u32 %r6119, %r2732, 10; - or.b32 %r6120, %r6119, %r5999; - xor.b32 %r6121, %r10, %r6120; - cvt.u64.u32 %rd258, %r6121; - shl.b64 %rd259, %rd258, 56; - shl.b64 %rd260, %rd216, 48; - and.b64 %rd261, %rd260, 71776119061217280; - or.b64 %rd262, %rd259, %rd261; - shl.b64 %rd263, %rd214, 40; - and.b64 %rd264, %rd263, 280375465082880; - or.b64 %rd265, %rd262, %rd264; - shl.b64 %rd266, %rd212, 32; - and.b64 %rd267, %rd266, 1095216660480; - or.b64 %rd268, %rd265, %rd267; - or.b64 %rd269, %rd268, %rd246; - or.b64 %rd270, %rd269, %rd247; - or.b64 %rd271, %rd270, %rd248; - or.b64 %rd272, %rd271, %rd249; - xor.b64 %rd73, %rd272, 4239941492252378377; - shr.u32 %r6122, %r3756, 10; - or.b32 %r6123, %r6122, %r6033; - xor.b32 %r6124, %r12, %r6123; - cvt.u64.u32 %rd273, %r6124; - shl.b64 %rd274, %rd273, 56; - shl.b64 %rd275, %rd222, 48; - and.b64 %rd276, %rd275, 71776119061217280; - or.b64 %rd277, %rd274, %rd276; - shl.b64 %rd278, %rd220, 40; - and.b64 %rd279, %rd278, 280375465082880; - or.b64 %rd280, %rd277, %rd279; - shl.b64 %rd281, %rd218, 32; - and.b64 %rd282, %rd281, 1095216660480; - or.b64 %rd283, %rd280, %rd282; - or.b64 %rd284, %rd283, %rd250; - or.b64 %rd285, %rd284, %rd251; - or.b64 %rd286, %rd285, %rd252; - or.b64 %rd287, %rd286, %rd253; - xor.b64 %rd484, %rd287, 8746723911537738262; - shr.u32 %r6125, %r4780, 10; - or.b32 %r6126, %r6125, %r6067; - xor.b32 %r6127, %r14, %r6126; - cvt.u64.u32 %rd288, %r6127; - shl.b64 %rd289, %rd288, 56; - shl.b64 %rd290, %rd228, 48; - and.b64 %rd291, %rd290, 71776119061217280; - or.b64 %rd292, %rd289, %rd291; - shl.b64 %rd293, %rd226, 
40; - and.b64 %rd294, %rd293, 280375465082880; - or.b64 %rd295, %rd292, %rd294; - shl.b64 %rd296, %rd224, 32; - and.b64 %rd297, %rd296, 1095216660480; - or.b64 %rd298, %rd295, %rd297; - or.b64 %rd299, %rd298, %rd254; - or.b64 %rd300, %rd299, %rd255; - or.b64 %rd301, %rd300, %rd256; - or.b64 %rd302, %rd301, %rd257; - xor.b64 %rd479, %rd302, 8796936657246353646; - shl.b64 %rd303, %rd245, 56; - shl.b64 %rd304, %rd243, 48; - and.b64 %rd305, %rd304, 71776119061217280; - or.b64 %rd306, %rd303, %rd305; - shl.b64 %rd307, %rd241, 40; - and.b64 %rd308, %rd307, 280375465082880; - or.b64 %rd309, %rd306, %rd308; - shl.b64 %rd310, %rd239, 32; - and.b64 %rd311, %rd310, 1095216660480; - or.b64 %rd312, %rd309, %rd311; - shl.b64 %rd313, %rd237, 24; - and.b64 %rd314, %rd313, 4278190080; - or.b64 %rd315, %rd312, %rd314; - shl.b64 %rd316, %rd235, 16; - and.b64 %rd317, %rd316, 16711680; - shl.b64 %rd318, %rd231, 8; - and.b64 %rd319, %rd318, 65280; - or.b64 %rd320, %rd315, %rd317; - or.b64 %rd321, %rd320, %rd319; - or.b64 %rd322, %rd321, %rd233; - xor.b64 %rd474, %rd322, 1272090201925444760; - mov.u64 %rd488, 8270816933120786537; - mov.u64 %rd487, -850687345431043546; - mov.u64 %rd486, 8596393687355028144; - mov.u64 %rd485, -4073852189716399785; - mov.u64 %rd483, -4539347866060507718; - mov.u64 %rd482, -3233781605604422593; - mov.u64 %rd481, 570094237299545110; - mov.u64 %rd480, 5171152063242093102; - mov.u64 %rd478, 6782861118970774626; - mov.u64 %rd477, 7812475424661425213; - mov.u64 %rd476, 9119540418498120711; - mov.u64 %rd475, -7873636174015165430; - mov.u64 %rd473, -9207053471590684088; - mov.u64 %rd472, 3370482334374859748; - mov.u64 %rd471, -1544774801229058759; - mov.u64 %rd470, 6096431547456407061; - mov.u64 %rd469, -1792185402154627366; - mov.u64 %rd468, -6864424130110145268; - mov.u64 %rd467, 5690099369266491460; - mov.u64 %rd466, -5074726839974049192; - mov.u64 %rd465, 1592359455985097269; - mov.u64 %rd464, RC; - -$L__BB0_9: - xor.b64 %rd323, %rd488, %rd73; - xor.b64 %rd324, %rd323, %rd487; - xor.b64 %rd325, %rd324, %rd486; - xor.b64 %rd326, %rd325, %rd485; - xor.b64 %rd327, %rd483, %rd484; - xor.b64 %rd328, %rd327, %rd482; - xor.b64 %rd329, %rd328, %rd481; - xor.b64 %rd330, %rd329, %rd480; - xor.b64 %rd331, %rd478, %rd479; - xor.b64 %rd332, %rd331, %rd477; - xor.b64 %rd333, %rd332, %rd476; - xor.b64 %rd334, %rd333, %rd475; - xor.b64 %rd335, %rd473, %rd474; - xor.b64 %rd336, %rd335, %rd472; - xor.b64 %rd337, %rd336, %rd471; - xor.b64 %rd338, %rd337, %rd470; - xor.b64 %rd339, %rd468, %rd469; - xor.b64 %rd340, %rd339, %rd467; - xor.b64 %rd341, %rd340, %rd466; - xor.b64 %rd342, %rd341, %rd465; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6128}, %rd330; - } - { - .reg .b32 %dummy; - mov.b64 {%r6129,%dummy}, %rd330; - } - shf.l.wrap.b32 %r6130, %r6129, %r6128, 1; - shf.l.wrap.b32 %r6131, %r6128, %r6129, 1; - mov.b64 %rd343, {%r6131, %r6130}; - xor.b64 %rd344, %rd342, %rd343; - xor.b64 %rd345, %rd344, %rd73; - xor.b64 %rd346, %rd488, %rd344; - xor.b64 %rd347, %rd487, %rd344; - xor.b64 %rd348, %rd486, %rd344; - xor.b64 %rd349, %rd485, %rd344; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6132}, %rd334; - } - { - .reg .b32 %dummy; - mov.b64 {%r6133,%dummy}, %rd334; - } - shf.l.wrap.b32 %r6134, %r6133, %r6132, 1; - shf.l.wrap.b32 %r6135, %r6132, %r6133, 1; - mov.b64 %rd350, {%r6135, %r6134}; - xor.b64 %rd351, %rd350, %rd326; - xor.b64 %rd352, %rd484, %rd351; - xor.b64 %rd353, %rd483, %rd351; - xor.b64 %rd354, %rd482, %rd351; - xor.b64 %rd355, %rd481, %rd351; - xor.b64 %rd356, %rd480, %rd351; - { - .reg .b32 
%dummy; - mov.b64 {%dummy,%r6136}, %rd338; - } - { - .reg .b32 %dummy; - mov.b64 {%r6137,%dummy}, %rd338; - } - shf.l.wrap.b32 %r6138, %r6137, %r6136, 1; - shf.l.wrap.b32 %r6139, %r6136, %r6137, 1; - mov.b64 %rd357, {%r6139, %r6138}; - xor.b64 %rd358, %rd357, %rd330; - xor.b64 %rd359, %rd479, %rd358; - xor.b64 %rd360, %rd478, %rd358; - xor.b64 %rd361, %rd477, %rd358; - xor.b64 %rd362, %rd476, %rd358; - xor.b64 %rd363, %rd475, %rd358; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6140}, %rd342; - } - { - .reg .b32 %dummy; - mov.b64 {%r6141,%dummy}, %rd342; - } - shf.l.wrap.b32 %r6142, %r6141, %r6140, 1; - shf.l.wrap.b32 %r6143, %r6140, %r6141, 1; - mov.b64 %rd364, {%r6143, %r6142}; - xor.b64 %rd365, %rd364, %rd334; - xor.b64 %rd366, %rd474, %rd365; - xor.b64 %rd367, %rd473, %rd365; - xor.b64 %rd368, %rd472, %rd365; - xor.b64 %rd369, %rd471, %rd365; - xor.b64 %rd370, %rd470, %rd365; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6144}, %rd326; - } - { - .reg .b32 %dummy; - mov.b64 {%r6145,%dummy}, %rd326; - } - shf.l.wrap.b32 %r6146, %r6145, %r6144, 1; - shf.l.wrap.b32 %r6147, %r6144, %r6145, 1; - mov.b64 %rd371, {%r6147, %r6146}; - xor.b64 %rd372, %rd338, %rd371; - xor.b64 %rd373, %rd469, %rd372; - xor.b64 %rd374, %rd468, %rd372; - xor.b64 %rd375, %rd467, %rd372; - xor.b64 %rd376, %rd466, %rd372; - xor.b64 %rd377, %rd465, %rd372; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6148}, %rd352; - } - { - .reg .b32 %dummy; - mov.b64 {%r6149,%dummy}, %rd352; - } - shf.l.wrap.b32 %r6150, %r6149, %r6148, 1; - shf.l.wrap.b32 %r6151, %r6148, %r6149, 1; - mov.b64 %rd378, {%r6151, %r6150}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6152}, %rd347; - } - { - .reg .b32 %dummy; - mov.b64 {%r6153,%dummy}, %rd347; - } - shf.l.wrap.b32 %r6154, %r6153, %r6152, 3; - shf.l.wrap.b32 %r6155, %r6152, %r6153, 3; - mov.b64 %rd379, {%r6155, %r6154}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6156}, %rd360; - } - { - .reg .b32 %dummy; - mov.b64 {%r6157,%dummy}, %rd360; - } - shf.l.wrap.b32 %r6158, %r6157, %r6156, 6; - shf.l.wrap.b32 %r6159, %r6156, %r6157, 6; - mov.b64 %rd380, {%r6159, %r6158}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6160}, %rd354; - } - { - .reg .b32 %dummy; - mov.b64 {%r6161,%dummy}, %rd354; - } - shf.l.wrap.b32 %r6162, %r6161, %r6160, 10; - shf.l.wrap.b32 %r6163, %r6160, %r6161, 10; - mov.b64 %rd381, {%r6163, %r6162}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6164}, %rd362; - } - { - .reg .b32 %dummy; - mov.b64 {%r6165,%dummy}, %rd362; - } - shf.l.wrap.b32 %r6166, %r6165, %r6164, 15; - shf.l.wrap.b32 %r6167, %r6164, %r6165, 15; - mov.b64 %rd382, {%r6167, %r6166}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6168}, %rd369; - } - { - .reg .b32 %dummy; - mov.b64 {%r6169,%dummy}, %rd369; - } - shf.l.wrap.b32 %r6170, %r6169, %r6168, 21; - shf.l.wrap.b32 %r6171, %r6168, %r6169, 21; - mov.b64 %rd383, {%r6171, %r6170}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6172}, %rd366; - } - { - .reg .b32 %dummy; - mov.b64 {%r6173,%dummy}, %rd366; - } - shf.l.wrap.b32 %r6174, %r6173, %r6172, 28; - shf.l.wrap.b32 %r6175, %r6172, %r6173, 28; - mov.b64 %rd384, {%r6175, %r6174}; - { - .reg .b32 %dummy; - mov.b64 {%r6176,%dummy}, %rd346; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6177}, %rd346; - } - shf.r.wrap.b32 %r6178, %r6177, %r6176, 28; - shf.r.wrap.b32 %r6179, %r6176, %r6177, 28; - mov.b64 %rd385, {%r6179, %r6178}; - { - .reg .b32 %dummy; - mov.b64 {%r6180,%dummy}, %rd355; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6181}, %rd355; - } - shf.r.wrap.b32 %r6182, %r6181, %r6180, 19; - shf.r.wrap.b32 %r6183, 
%r6180, %r6181, 19; - mov.b64 %rd386, {%r6183, %r6182}; - { - .reg .b32 %dummy; - mov.b64 {%r6184,%dummy}, %rd367; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6185}, %rd367; - } - shf.r.wrap.b32 %r6186, %r6185, %r6184, 9; - shf.r.wrap.b32 %r6187, %r6184, %r6185, 9; - mov.b64 %rd387, {%r6187, %r6186}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6188}, %rd356; - } - { - .reg .b32 %dummy; - mov.b64 {%r6189,%dummy}, %rd356; - } - shf.l.wrap.b32 %r6190, %r6189, %r6188, 2; - shf.l.wrap.b32 %r6191, %r6188, %r6189, 2; - mov.b64 %rd388, {%r6191, %r6190}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6192}, %rd377; - } - { - .reg .b32 %dummy; - mov.b64 {%r6193,%dummy}, %rd377; - } - shf.l.wrap.b32 %r6194, %r6193, %r6192, 14; - shf.l.wrap.b32 %r6195, %r6192, %r6193, 14; - mov.b64 %rd389, {%r6195, %r6194}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6196}, %rd373; - } - { - .reg .b32 %dummy; - mov.b64 {%r6197,%dummy}, %rd373; - } - shf.l.wrap.b32 %r6198, %r6197, %r6196, 27; - shf.l.wrap.b32 %r6199, %r6196, %r6197, 27; - mov.b64 %rd390, {%r6199, %r6198}; - { - .reg .b32 %dummy; - mov.b64 {%r6200,%dummy}, %rd348; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6201}, %rd348; - } - shf.r.wrap.b32 %r6202, %r6201, %r6200, 23; - shf.r.wrap.b32 %r6203, %r6200, %r6201, 23; - mov.b64 %rd391, {%r6203, %r6202}; - { - .reg .b32 %dummy; - mov.b64 {%r6204,%dummy}, %rd370; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6205}, %rd370; - } - shf.r.wrap.b32 %r6206, %r6205, %r6204, 8; - shf.r.wrap.b32 %r6207, %r6204, %r6205, 8; - mov.b64 %rd392, {%r6207, %r6206}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6208}, %rd376; - } - { - .reg .b32 %dummy; - mov.b64 {%r6209,%dummy}, %rd376; - } - shf.l.wrap.b32 %r6210, %r6209, %r6208, 8; - shf.l.wrap.b32 %r6211, %r6208, %r6209, 8; - mov.b64 %rd393, {%r6211, %r6210}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6212}, %rd368; - } - { - .reg .b32 %dummy; - mov.b64 {%r6213,%dummy}, %rd368; - } - shf.l.wrap.b32 %r6214, %r6213, %r6212, 25; - shf.l.wrap.b32 %r6215, %r6212, %r6213, 25; - mov.b64 %rd394, {%r6215, %r6214}; - { - .reg .b32 %dummy; - mov.b64 {%r6216,%dummy}, %rd361; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6217}, %rd361; - } - shf.r.wrap.b32 %r6218, %r6217, %r6216, 21; - shf.r.wrap.b32 %r6219, %r6216, %r6217, 21; - mov.b64 %rd395, {%r6219, %r6218}; - { - .reg .b32 %dummy; - mov.b64 {%r6220,%dummy}, %rd359; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6221}, %rd359; - } - shf.r.wrap.b32 %r6222, %r6221, %r6220, 2; - shf.r.wrap.b32 %r6223, %r6220, %r6221, 2; - mov.b64 %rd396, {%r6223, %r6222}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6224}, %rd349; - } - { - .reg .b32 %dummy; - mov.b64 {%r6225,%dummy}, %rd349; - } - shf.l.wrap.b32 %r6226, %r6225, %r6224, 18; - shf.l.wrap.b32 %r6227, %r6224, %r6225, 18; - mov.b64 %rd397, {%r6227, %r6226}; - { - .reg .b32 %dummy; - mov.b64 {%r6228,%dummy}, %rd375; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6229}, %rd375; - } - shf.r.wrap.b32 %r6230, %r6229, %r6228, 25; - shf.r.wrap.b32 %r6231, %r6228, %r6229, 25; - mov.b64 %rd398, {%r6231, %r6230}; - { - .reg .b32 %dummy; - mov.b64 {%r6232,%dummy}, %rd363; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6233}, %rd363; - } - shf.r.wrap.b32 %r6234, %r6233, %r6232, 3; - shf.r.wrap.b32 %r6235, %r6232, %r6233, 3; - mov.b64 %rd399, {%r6235, %r6234}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6236}, %rd374; - } - { - .reg .b32 %dummy; - mov.b64 {%r6237,%dummy}, %rd374; - } - shf.l.wrap.b32 %r6238, %r6237, %r6236, 20; - shf.l.wrap.b32 %r6239, %r6236, %r6237, 20; - mov.b64 %rd400, 
{%r6239, %r6238}; - { - .reg .b32 %dummy; - mov.b64 {%r6240,%dummy}, %rd353; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6241}, %rd353; - } - shf.r.wrap.b32 %r6242, %r6241, %r6240, 20; - shf.r.wrap.b32 %r6243, %r6240, %r6241, 20; - mov.b64 %rd401, {%r6243, %r6242}; - not.b64 %rd402, %rd401; - and.b64 %rd403, %rd395, %rd402; - xor.b64 %rd404, %rd403, %rd345; - not.b64 %rd405, %rd395; - and.b64 %rd406, %rd383, %rd405; - xor.b64 %rd484, %rd406, %rd401; - not.b64 %rd407, %rd383; - and.b64 %rd408, %rd389, %rd407; - xor.b64 %rd479, %rd408, %rd395; - not.b64 %rd409, %rd389; - and.b64 %rd410, %rd345, %rd409; - xor.b64 %rd474, %rd410, %rd383; - not.b64 %rd411, %rd345; - and.b64 %rd412, %rd401, %rd411; - xor.b64 %rd469, %rd389, %rd412; - not.b64 %rd413, %rd400; - and.b64 %rd414, %rd379, %rd413; - xor.b64 %rd488, %rd414, %rd384; - not.b64 %rd415, %rd379; - and.b64 %rd416, %rd386, %rd415; - xor.b64 %rd483, %rd416, %rd400; - not.b64 %rd417, %rd386; - and.b64 %rd418, %rd399, %rd417; - xor.b64 %rd478, %rd418, %rd379; - not.b64 %rd419, %rd399; - and.b64 %rd420, %rd384, %rd419; - xor.b64 %rd473, %rd420, %rd386; - not.b64 %rd421, %rd384; - and.b64 %rd422, %rd400, %rd421; - xor.b64 %rd468, %rd399, %rd422; - not.b64 %rd423, %rd380; - and.b64 %rd424, %rd394, %rd423; - xor.b64 %rd487, %rd424, %rd378; - not.b64 %rd425, %rd394; - and.b64 %rd426, %rd393, %rd425; - xor.b64 %rd482, %rd426, %rd380; - not.b64 %rd427, %rd393; - and.b64 %rd428, %rd397, %rd427; - xor.b64 %rd477, %rd428, %rd394; - not.b64 %rd429, %rd397; - and.b64 %rd430, %rd378, %rd429; - xor.b64 %rd472, %rd430, %rd393; - not.b64 %rd431, %rd378; - and.b64 %rd432, %rd380, %rd431; - xor.b64 %rd467, %rd397, %rd432; - not.b64 %rd433, %rd385; - and.b64 %rd434, %rd381, %rd433; - xor.b64 %rd486, %rd434, %rd390; - not.b64 %rd435, %rd381; - and.b64 %rd436, %rd382, %rd435; - xor.b64 %rd481, %rd436, %rd385; - not.b64 %rd437, %rd382; - and.b64 %rd438, %rd392, %rd437; - xor.b64 %rd476, %rd438, %rd381; - not.b64 %rd439, %rd392; - and.b64 %rd440, %rd390, %rd439; - xor.b64 %rd471, %rd440, %rd382; - not.b64 %rd441, %rd390; - and.b64 %rd442, %rd385, %rd441; - xor.b64 %rd466, %rd392, %rd442; - not.b64 %rd443, %rd387; - and.b64 %rd444, %rd398, %rd443; - xor.b64 %rd485, %rd444, %rd396; - not.b64 %rd445, %rd398; - and.b64 %rd446, %rd391, %rd445; - xor.b64 %rd480, %rd446, %rd387; - not.b64 %rd447, %rd391; - and.b64 %rd448, %rd388, %rd447; - xor.b64 %rd475, %rd448, %rd398; - not.b64 %rd449, %rd388; - and.b64 %rd450, %rd396, %rd449; - xor.b64 %rd470, %rd450, %rd391; - not.b64 %rd451, %rd396; - and.b64 %rd452, %rd387, %rd451; - xor.b64 %rd465, %rd388, %rd452; - ld.global.nc.u64 %rd453, [%rd464]; - xor.b64 %rd73, %rd404, %rd453; - add.s64 %rd464, %rd464, 8; - add.s32 %r6244, %r6244, 1; - setp.ne.s32 %p11, %r6244, 24; - @%p11 bra $L__BB0_9; - - ld.const.u64 %rd75, [target+24]; - setp.eq.s64 %p12, %rd474, %rd75; - @%p12 bra $L__BB0_12; - bra.uni $L__BB0_11; - -$L__BB0_12: - ld.const.u64 %rd76, [target+16]; - setp.eq.s64 %p13, %rd479, %rd76; - @%p13 bra $L__BB0_14; - bra.uni $L__BB0_13; - -$L__BB0_14: - ld.const.u64 %rd77, [target+8]; - setp.eq.s64 %p14, %rd484, %rd77; - @%p14 bra $L__BB0_16; - bra.uni $L__BB0_15; - -$L__BB0_16: - ld.const.u64 %rd454, [target]; - setp.lt.u64 %p16, %rd73, %rd454; - bra.uni $L__BB0_17; - -$L__BB0_11: - setp.lt.u64 %p16, %rd474, %rd75; - bra.uni $L__BB0_17; - -$L__BB0_13: - setp.lt.u64 %p16, %rd479, %rd76; - bra.uni $L__BB0_17; - -$L__BB0_15: - setp.lt.u64 %p16, %rd484, %rd77; - -$L__BB0_17: - not.pred %p15, %p16; - @%p15 bra $L__BB0_19; - - 
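The $L__BB0_9 body above is one Keccak-f[1600] round per iteration: the xor.b64 reductions are theta, the shf.l/r.wrap pairs implement the 64-bit rotations of rho/pi (each rotation split across two 32-bit funnel shifts), the not/and/xor block is chi, and the ld.global.nc.u64 through the advancing RC pointer is iota; %r6244 counts 24 rounds. The branch cascade that follows ($L__BB0_11 through $L__BB0_17) is a 256-bit "hash < target" comparison, most-significant limb (target+24) first, and the block before $L__BB0_19 publishes the winning nonce. Equivalent logic, with illustrative names (lt256, publish):

    __device__ bool lt256(const uint64_t h[4], const uint64_t t[4]) {
        for (int i = 3; i >= 0; --i)       // limb 3 holds the most significant 64 bits
            if (h[i] != t[i]) return h[i] < t[i];
        return false;                      // all limbs equal: not below the target
    }

    // Only the first finder publishes: compare-and-swap from the 0 sentinel.
    __device__ void publish(unsigned long long* out, unsigned long long nonce) {
        atomicCAS(out, 0ULL, nonce);       // atom.global.cas.b64 in the PTX below
    }

The nonce written is (state & heavy_hash_param_0) | heavy_hash_param_1, i.e. the masked candidate reconstructed in the publish block rather than kept live across the loop.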
ld.param.u64 %rd462, [heavy_hash_param_0]; - ld.param.u64 %rd461, [heavy_hash_param_1]; - and.b64 %rd460, %rd463, %rd462; - or.b64 %rd459, %rd460, %rd461; - ld.param.u64 %rd458, [heavy_hash_param_5]; - cvta.to.global.u64 %rd457, %rd458; - mov.u64 %rd455, 0; - atom.global.cas.b64 %rd456, [%rd457], %rd455, %rd459; - -$L__BB0_19: - ret; - -} - diff --git a/plugins/cuda/resources/kaspa-cuda-sm86.ptx b/plugins/cuda/resources/kaspa-cuda-sm86.ptx deleted file mode 100644 index b1f2fa5..0000000 --- a/plugins/cuda/resources/kaspa-cuda-sm86.ptx +++ /dev/null @@ -1,7081 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-31833905 -// Cuda compilation tools, release 11.8, V11.8.89 -// Based on NVVM 7.0.1 -// - -.version 7.8 -.target sm_86 -.address_size 64 - - // .globl heavy_hash -.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; -.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; -.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; -.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; -.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; -.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; -.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 4 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; -.const .align 8 .b8 target[32]; -.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 
15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; -.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; - -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 -) -{ - .local .align 8 .b8 __local_depot0[1912]; - .reg .b64 %SP; - .reg .b64 %SPL; - .reg .pred %p<17>; - .reg .b16 %rs<113>; - .reg .b32 %r<6245>; - .reg .b64 %rd<490>; - - - mov.u64 %SPL, __local_depot0; - ld.param.u8 %rs11, [heavy_hash_param_3]; - ld.param.u64 %rd78, [heavy_hash_param_0]; - ld.param.u64 %rd79, [heavy_hash_param_1]; - ld.param.u64 %rd80, [heavy_hash_param_2]; - ld.param.u64 %rd81, [heavy_hash_param_4]; - ld.param.u64 %rd82, [heavy_hash_param_5]; - cvta.to.global.u64 %rd1, %rd81; - cvta.to.global.u64 %rd2, %rd82; - add.u64 %rd3, %SPL, 0; - mov.u32 %r17, %ntid.x; - mov.u32 %r18, %ctaid.x; - mov.u32 %r19, %tid.x; - mad.lo.s32 %r20, %r18, %r17, %r19; - cvt.s64.s32 %rd4, %r20; - setp.ge.u64 %p6, %rd4, %rd80; - @%p6 bra $L__BB0_19; - - cvt.u32.u64 %r21, %rd4; - setp.ne.s32 %p7, %r21, 0; - @%p7 bra $L__BB0_3; - - mov.u64 %rd84, 0; - st.global.u64 [%rd2], %rd84; - -$L__BB0_3: - setp.eq.s16 %p8, %rs11, 0; - @%p8 bra $L__BB0_5; - - shl.b64 %rd85, %rd4, 5; - add.s64 %rd86, %rd1, %rd85; - ld.global.v2.u64 {%rd87, %rd88}, [%rd86]; - mul.lo.s64 %rd91, %rd88, 5; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd91, 7; - shr.b64 %rhs, %rd91, 57; - add.u64 %rd92, %lhs, %rhs; - } - mul.lo.s64 %rd463, %rd92, 9; - shl.b64 %rd93, %rd88, 17; - ld.global.v2.u64 {%rd94, %rd95}, [%rd86+16]; - xor.b64 %rd98, %rd94, %rd87; - xor.b64 %rd99, %rd95, %rd88; - xor.b64 %rd100, %rd88, %rd98; - xor.b64 %rd101, %rd87, %rd99; - st.global.v2.u64 [%rd86], {%rd101, %rd100}; - { - .reg .b32 %dummy; - mov.b64 {%r22,%dummy}, %rd99; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r23}, %rd99; - } - shf.r.wrap.b32 %r24, %r23, %r22, 19; - shf.r.wrap.b32 %r25, %r22, %r23, 19; - mov.b64 %rd102, {%r25, %r24}; - xor.b64 %rd103, %rd98, %rd93; - st.global.v2.u64 [%rd86+16], {%rd103, %rd102}; - bra.uni $L__BB0_6; - -$L__BB0_5: - ld.global.u64 %rd104, [%rd1]; - xor.b64 %rd463, %rd104, %rd4; - -$L__BB0_6: - and.b64 %rd105, %rd463, %rd78; - or.b64 %rd8, %rd105, %rd79; - mov.b64 {%r26, %r27}, %rd8; - mov.u64 %rd106, 0; - ld.const.u64 %rd107, [hash_header]; - cvt.u32.u64 
%r28, %rd107; - shr.u64 %rd108, %rd107, 8; - cvt.u32.u64 %r29, %rd108; - shr.u64 %rd109, %rd107, 16; - cvt.u32.u64 %r30, %rd109; - shr.u64 %rd110, %rd107, 32; - cvt.u32.u64 %r31, %rd110; - shr.u64 %rd111, %rd107, 40; - cvt.u32.u64 %r32, %rd111; - shr.u64 %rd112, %rd107, 48; - cvt.u32.u64 %r33, %rd112; - ld.const.u64 %rd113, [hash_header+8]; - cvt.u32.u64 %r34, %rd113; - shr.u64 %rd114, %rd113, 8; - cvt.u32.u64 %r35, %rd114; - shr.u64 %rd115, %rd113, 16; - cvt.u32.u64 %r36, %rd115; - shr.u64 %rd116, %rd113, 32; - cvt.u32.u64 %r37, %rd116; - shr.u64 %rd117, %rd113, 40; - cvt.u32.u64 %r38, %rd117; - shr.u64 %rd118, %rd113, 48; - cvt.u32.u64 %r39, %rd118; - ld.const.u64 %rd119, [hash_header+16]; - cvt.u32.u64 %r40, %rd119; - shr.u64 %rd120, %rd119, 8; - cvt.u32.u64 %r41, %rd120; - shr.u64 %rd121, %rd119, 16; - cvt.u32.u64 %r42, %rd121; - shr.u64 %rd122, %rd119, 32; - cvt.u32.u64 %r43, %rd122; - shr.u64 %rd123, %rd119, 40; - cvt.u32.u64 %r44, %rd123; - shr.u64 %rd124, %rd119, 48; - cvt.u32.u64 %r45, %rd124; - ld.const.u64 %rd125, [hash_header+24]; - cvt.u32.u64 %r46, %rd125; - shr.u64 %rd126, %rd125, 8; - cvt.u32.u64 %r47, %rd126; - shr.u64 %rd127, %rd125, 16; - cvt.u32.u64 %r48, %rd127; - shr.u64 %rd128, %rd125, 32; - cvt.u32.u64 %r49, %rd128; - shr.u64 %rd129, %rd125, 40; - cvt.u32.u64 %r50, %rd129; - shr.u64 %rd130, %rd125, 48; - cvt.u32.u64 %r51, %rd130; - ld.const.v4.u16 {%rs12, %rs13, %rs14, %rs15}, [hash_header+32]; - shr.u16 %rs17, %rs12, 8; - shr.u16 %rs19, %rs13, 8; - shr.u16 %rs21, %rs14, 8; - shr.u16 %rs23, %rs15, 8; - ld.const.v4.u16 {%rs24, %rs25, %rs26, %rs27}, [hash_header+40]; - shr.u16 %rs29, %rs24, 8; - shr.u16 %rs31, %rs25, 8; - shr.u16 %rs33, %rs26, 8; - shr.u16 %rs35, %rs27, 8; - ld.const.v4.u16 {%rs36, %rs37, %rs38, %rs39}, [hash_header+48]; - shr.u16 %rs41, %rs36, 8; - shr.u16 %rs43, %rs37, 8; - shr.u16 %rs45, %rs38, 8; - shr.u16 %rs47, %rs39, 8; - ld.const.v4.u16 {%rs48, %rs49, %rs50, %rs51}, [hash_header+56]; - shr.u16 %rs53, %rs48, 8; - shr.u16 %rs55, %rs49, 8; - shr.u16 %rs57, %rs50, 8; - shr.u16 %rs59, %rs51, 8; - ld.const.u64 %rd131, [hash_header+64]; - mov.b64 {%r52, %r53}, %rd131; - mov.u32 %r54, -1150833019; - mov.u32 %r55, 1779033703; - st.local.v2.u32 [%rd3], {%r55, %r54}; - mov.u32 %r56, -1521486534; - mov.u32 %r57, 1013904242; - st.local.v2.u32 [%rd3+8], {%r57, %r56}; - mov.u32 %r58, -1694144372; - mov.u32 %r59, 1359893119; - st.local.v2.u32 [%rd3+16], {%r59, %r58}; - mov.u32 %r60, 1541459225; - mov.u32 %r61, 528734635; - st.local.v2.u32 [%rd3+24], {%r61, %r60}; - st.local.u64 [%rd3+64], %rd106; - mov.u32 %r62, 0; - st.local.v2.u32 [%rd3+88], {%r62, %r62}; - st.local.v2.u32 [%rd3+96], {%r62, %r62}; - st.local.v2.u32 [%rd3+104], {%r62, %r62}; - st.local.v2.u32 [%rd3+112], {%r62, %r62}; - st.local.v2.u32 [%rd3+120], {%r62, %r62}; - st.local.v2.u32 [%rd3+128], {%r62, %r62}; - mov.u16 %rs60, 0; - st.local.v2.u8 [%rd3+136], {%rs60, %rs60}; - st.local.u8 [%rd3+138], %rs60; - st.local.v2.u32 [%rd3+32], {%r55, %r54}; - st.local.v2.u32 [%rd3+40], {%r57, %r56}; - st.local.v2.u32 [%rd3+48], {%r59, %r58}; - st.local.v2.u32 [%rd3+56], {%r61, %r60}; - st.local.v2.u32 [%rd3+72], {%r62, %r62}; - st.local.v2.u32 [%rd3+80], {%r62, %r62}; - st.local.u8 [%rd3+144], %rs60; - ld.local.v4.u8 {%rs61, %rs62, %rs63, %rs64}, [%rd3+136]; - setp.eq.s16 %p9, %rs62, 0; - selp.u16 %rs68, 1, 0, %p9; - or.b16 %rs69, %rs63, %rs68; - shr.u32 %r63, %r28, 24; - mov.u32 %r64, 64; - prmt.b32 %r65, %r28, %r29, %r64; - mov.u32 %r66, 1040; - prmt.b32 %r67, %r65, %r30, %r66; - mov.u32 %r68, 16912; - 
prmt.b32 %r69, %r67, %r63, %r68; - and.b32 %r70, %r31, 255; - and.b32 %r71, %r32, 255; - prmt.b32 %r72, %r71, %r70, 30212; - shl.b32 %r73, %r33, 16; - and.b32 %r74, %r73, 16711680; - or.b32 %r75, %r72, %r74; - and.b32 %r76, %r31, -16777216; - or.b32 %r77, %r75, %r76; - shr.u32 %r78, %r34, 24; - prmt.b32 %r79, %r34, %r35, %r64; - prmt.b32 %r80, %r79, %r36, %r66; - prmt.b32 %r81, %r80, %r78, %r68; - and.b32 %r82, %r37, 255; - and.b32 %r83, %r38, 255; - prmt.b32 %r84, %r83, %r82, 30212; - shl.b32 %r85, %r39, 16; - and.b32 %r86, %r85, 16711680; - or.b32 %r87, %r84, %r86; - and.b32 %r88, %r37, -16777216; - or.b32 %r89, %r87, %r88; - shr.u32 %r90, %r40, 24; - prmt.b32 %r91, %r40, %r41, %r64; - prmt.b32 %r92, %r91, %r42, %r66; - prmt.b32 %r93, %r92, %r90, %r68; - and.b32 %r94, %r43, 255; - and.b32 %r95, %r44, 255; - prmt.b32 %r96, %r95, %r94, 30212; - shl.b32 %r97, %r45, 16; - and.b32 %r98, %r97, 16711680; - or.b32 %r99, %r96, %r98; - and.b32 %r100, %r43, -16777216; - or.b32 %r101, %r99, %r100; - shr.u32 %r102, %r46, 24; - prmt.b32 %r103, %r46, %r47, %r64; - prmt.b32 %r104, %r103, %r48, %r66; - prmt.b32 %r105, %r104, %r102, %r68; - and.b32 %r106, %r49, 255; - and.b32 %r107, %r50, 255; - prmt.b32 %r108, %r107, %r106, 30212; - shl.b32 %r109, %r51, 16; - and.b32 %r110, %r109, 16711680; - or.b32 %r111, %r108, %r110; - and.b32 %r112, %r49, -16777216; - or.b32 %r113, %r111, %r112; - cvt.u32.u16 %r114, %rs12; - and.b32 %r115, %r114, 255; - cvt.u32.u16 %r116, %rs17; - prmt.b32 %r117, %r116, %r115, 30212; - cvt.u32.u16 %r118, %rs13; - prmt.b32 %r119, %r118, %r117, 28756; - cvt.u32.u16 %r120, %rs19; - prmt.b32 %r121, %r120, %r119, 1620; - cvt.u32.u16 %r122, %rs14; - and.b32 %r123, %r122, 255; - cvt.u32.u16 %r124, %rs21; - prmt.b32 %r125, %r124, %r123, 30212; - cvt.u32.u16 %r126, %rs15; - prmt.b32 %r127, %r126, %r125, 28756; - cvt.u32.u16 %r128, %rs23; - prmt.b32 %r129, %r128, %r127, 1620; - cvt.u32.u16 %r130, %rs24; - and.b32 %r131, %r130, 255; - cvt.u32.u16 %r132, %rs29; - prmt.b32 %r133, %r132, %r131, 30212; - cvt.u32.u16 %r134, %rs25; - prmt.b32 %r135, %r134, %r133, 28756; - cvt.u32.u16 %r136, %rs31; - prmt.b32 %r137, %r136, %r135, 1620; - cvt.u32.u16 %r138, %rs26; - and.b32 %r139, %r138, 255; - cvt.u32.u16 %r140, %rs33; - prmt.b32 %r141, %r140, %r139, 30212; - cvt.u32.u16 %r142, %rs27; - prmt.b32 %r143, %r142, %r141, 28756; - cvt.u32.u16 %r144, %rs35; - prmt.b32 %r145, %r144, %r143, 1620; - cvt.u32.u16 %r146, %rs36; - and.b32 %r147, %r146, 255; - cvt.u32.u16 %r148, %rs41; - prmt.b32 %r149, %r148, %r147, 30212; - cvt.u32.u16 %r150, %rs37; - prmt.b32 %r151, %r150, %r149, 28756; - cvt.u32.u16 %r152, %rs43; - prmt.b32 %r153, %r152, %r151, 1620; - cvt.u32.u16 %r154, %rs38; - and.b32 %r155, %r154, 255; - cvt.u32.u16 %r156, %rs45; - prmt.b32 %r157, %r156, %r155, 30212; - cvt.u32.u16 %r158, %rs39; - prmt.b32 %r159, %r158, %r157, 28756; - cvt.u32.u16 %r160, %rs47; - prmt.b32 %r161, %r160, %r159, 1620; - cvt.u32.u16 %r162, %rs48; - and.b32 %r163, %r162, 255; - cvt.u32.u16 %r164, %rs53; - prmt.b32 %r165, %r164, %r163, 30212; - cvt.u32.u16 %r166, %rs49; - prmt.b32 %r167, %r166, %r165, 28756; - cvt.u32.u16 %r168, %rs55; - prmt.b32 %r169, %r168, %r167, 1620; - cvt.u32.u16 %r170, %rs50; - and.b32 %r171, %r170, 255; - cvt.u32.u16 %r172, %rs57; - prmt.b32 %r173, %r172, %r171, 30212; - cvt.u32.u16 %r174, %rs51; - prmt.b32 %r175, %r174, %r173, 28756; - cvt.u32.u16 %r176, %rs59; - prmt.b32 %r177, %r176, %r175, 1620; - cvt.u32.u16 %r178, %rs69; - and.b32 %r179, %r178, 255; - add.s32 %r180, %r69, -1156040474; - 
shf.l.wrap.b32 %r181, %r180, %r180, 16; - add.s32 %r182, %r181, 1779033703; - xor.b32 %r183, %r182, 1359893119; - shf.l.wrap.b32 %r184, %r183, %r183, 20; - add.s32 %r185, %r77, %r180; - add.s32 %r186, %r185, %r184; - xor.b32 %r187, %r186, %r181; - shf.l.wrap.b32 %r188, %r187, %r187, 24; - add.s32 %r189, %r188, %r182; - xor.b32 %r190, %r189, %r184; - shf.l.wrap.b32 %r191, %r190, %r190, 25; - add.s32 %r192, %r81, 1449989905; - shf.l.wrap.b32 %r193, %r192, %r192, 16; - add.s32 %r194, %r193, -1150833019; - xor.b32 %r195, %r194, -1694144372; - shf.l.wrap.b32 %r196, %r195, %r195, 20; - add.s32 %r197, %r89, %r192; - add.s32 %r198, %r197, %r196; - xor.b32 %r199, %r198, %r193; - shf.l.wrap.b32 %r200, %r199, %r199, 24; - add.s32 %r201, %r200, %r194; - xor.b32 %r202, %r201, %r196; - shf.l.wrap.b32 %r203, %r202, %r202, 25; - add.s32 %r204, %r93, 1542638877; - shr.u32 %r205, %r204, 16; - shl.b32 %r206, %r204, 16; - xor.b32 %r207, %r206, 4194304; - or.b32 %r208, %r207, %r205; - add.s32 %r209, %r208, 1013904242; - xor.b32 %r210, %r209, 528734635; - shf.l.wrap.b32 %r211, %r210, %r210, 20; - add.s32 %r212, %r101, %r204; - add.s32 %r213, %r212, %r211; - xor.b32 %r214, %r213, %r208; - shf.l.wrap.b32 %r215, %r214, %r214, 24; - add.s32 %r216, %r215, %r209; - xor.b32 %r217, %r216, %r211; - shf.l.wrap.b32 %r218, %r217, %r217, 25; - add.s32 %r219, %r105, 19972691; - xor.b32 %r220, %r219, %r179; - shr.u32 %r221, %r219, 16; - shl.b32 %r222, %r220, 16; - or.b32 %r223, %r222, %r221; - add.s32 %r224, %r223, -1521486534; - xor.b32 %r225, %r224, 1541459225; - shf.l.wrap.b32 %r226, %r225, %r225, 20; - add.s32 %r227, %r113, %r219; - add.s32 %r228, %r227, %r226; - xor.b32 %r229, %r228, %r223; - shf.l.wrap.b32 %r230, %r229, %r229, 24; - add.s32 %r231, %r230, %r224; - xor.b32 %r232, %r231, %r226; - shf.l.wrap.b32 %r233, %r232, %r232, 25; - add.s32 %r234, %r203, %r186; - add.s32 %r235, %r234, %r121; - xor.b32 %r236, %r230, %r235; - shf.l.wrap.b32 %r237, %r236, %r236, 16; - add.s32 %r238, %r237, %r216; - xor.b32 %r239, %r238, %r203; - shf.l.wrap.b32 %r240, %r239, %r239, 20; - add.s32 %r241, %r129, %r235; - add.s32 %r242, %r241, %r240; - xor.b32 %r243, %r242, %r237; - shf.l.wrap.b32 %r244, %r243, %r243, 24; - add.s32 %r245, %r244, %r238; - xor.b32 %r246, %r245, %r240; - shf.l.wrap.b32 %r247, %r246, %r246, 25; - add.s32 %r248, %r218, %r198; - add.s32 %r249, %r248, %r137; - xor.b32 %r250, %r249, %r188; - shf.l.wrap.b32 %r251, %r250, %r250, 16; - add.s32 %r252, %r251, %r231; - xor.b32 %r253, %r252, %r218; - shf.l.wrap.b32 %r254, %r253, %r253, 20; - add.s32 %r255, %r145, %r249; - add.s32 %r256, %r255, %r254; - xor.b32 %r257, %r256, %r251; - shf.l.wrap.b32 %r258, %r257, %r257, 24; - add.s32 %r259, %r258, %r252; - xor.b32 %r260, %r259, %r254; - shf.l.wrap.b32 %r261, %r260, %r260, 25; - add.s32 %r262, %r233, %r213; - add.s32 %r263, %r262, %r153; - xor.b32 %r264, %r263, %r200; - shf.l.wrap.b32 %r265, %r264, %r264, 16; - add.s32 %r266, %r265, %r189; - xor.b32 %r267, %r266, %r233; - shf.l.wrap.b32 %r268, %r267, %r267, 20; - add.s32 %r269, %r161, %r263; - add.s32 %r270, %r269, %r268; - xor.b32 %r271, %r270, %r265; - shf.l.wrap.b32 %r272, %r271, %r271, 24; - add.s32 %r273, %r272, %r266; - xor.b32 %r274, %r273, %r268; - shf.l.wrap.b32 %r275, %r274, %r274, 25; - add.s32 %r276, %r228, %r191; - add.s32 %r277, %r276, %r169; - xor.b32 %r278, %r277, %r215; - shf.l.wrap.b32 %r279, %r278, %r278, 16; - add.s32 %r280, %r279, %r201; - xor.b32 %r281, %r280, %r191; - shf.l.wrap.b32 %r282, %r281, %r281, 20; - add.s32 %r283, %r177, %r277; - add.s32 %r284, 
%r283, %r282; - xor.b32 %r285, %r284, %r279; - shf.l.wrap.b32 %r286, %r285, %r285, 24; - add.s32 %r287, %r286, %r280; - xor.b32 %r288, %r287, %r282; - shf.l.wrap.b32 %r289, %r288, %r288, 25; - add.s32 %r290, %r242, %r81; - add.s32 %r291, %r290, %r289; - xor.b32 %r292, %r291, %r258; - shf.l.wrap.b32 %r293, %r292, %r292, 16; - add.s32 %r294, %r293, %r273; - xor.b32 %r295, %r294, %r289; - shf.l.wrap.b32 %r296, %r295, %r295, 20; - add.s32 %r297, %r291, %r105; - add.s32 %r298, %r297, %r296; - xor.b32 %r299, %r298, %r293; - shf.l.wrap.b32 %r300, %r299, %r299, 24; - add.s32 %r301, %r300, %r294; - xor.b32 %r302, %r301, %r296; - shf.l.wrap.b32 %r303, %r302, %r302, 25; - add.s32 %r304, %r256, %r89; - add.s32 %r305, %r304, %r247; - xor.b32 %r306, %r272, %r305; - shf.l.wrap.b32 %r307, %r306, %r306, 16; - add.s32 %r308, %r287, %r307; - xor.b32 %r309, %r308, %r247; - shf.l.wrap.b32 %r310, %r309, %r309, 20; - add.s32 %r311, %r305, %r137; - add.s32 %r312, %r311, %r310; - xor.b32 %r313, %r312, %r307; - shf.l.wrap.b32 %r314, %r313, %r313, 24; - add.s32 %r315, %r314, %r308; - xor.b32 %r316, %r315, %r310; - shf.l.wrap.b32 %r317, %r316, %r316, 25; - add.s32 %r318, %r261, %r113; - add.s32 %r319, %r318, %r270; - xor.b32 %r320, %r286, %r319; - shf.l.wrap.b32 %r321, %r320, %r320, 16; - add.s32 %r322, %r321, %r245; - xor.b32 %r323, %r322, %r261; - shf.l.wrap.b32 %r324, %r323, %r323, 20; - add.s32 %r325, %r319, %r69; - add.s32 %r326, %r325, %r324; - xor.b32 %r327, %r326, %r321; - shf.l.wrap.b32 %r328, %r327, %r327, 24; - add.s32 %r329, %r328, %r322; - xor.b32 %r330, %r329, %r324; - shf.l.wrap.b32 %r331, %r330, %r330, 25; - add.s32 %r332, %r275, %r93; - add.s32 %r333, %r332, %r284; - xor.b32 %r334, %r333, %r244; - shf.l.wrap.b32 %r335, %r334, %r334, 16; - add.s32 %r336, %r335, %r259; - xor.b32 %r337, %r336, %r275; - shf.l.wrap.b32 %r338, %r337, %r337, 20; - add.s32 %r339, %r333, %r161; - add.s32 %r340, %r339, %r338; - xor.b32 %r341, %r340, %r335; - shf.l.wrap.b32 %r342, %r341, %r341, 24; - add.s32 %r343, %r342, %r336; - xor.b32 %r344, %r343, %r338; - shf.l.wrap.b32 %r345, %r344, %r344, 25; - add.s32 %r346, %r298, %r77; - add.s32 %r347, %r346, %r317; - xor.b32 %r348, %r347, %r342; - shf.l.wrap.b32 %r349, %r348, %r348, 16; - add.s32 %r350, %r349, %r329; - xor.b32 %r351, %r350, %r317; - shf.l.wrap.b32 %r352, %r351, %r351, 20; - add.s32 %r353, %r347, %r145; - add.s32 %r354, %r353, %r352; - xor.b32 %r355, %r354, %r349; - shf.l.wrap.b32 %r356, %r355, %r355, 24; - add.s32 %r357, %r356, %r350; - xor.b32 %r358, %r357, %r352; - shf.l.wrap.b32 %r359, %r358, %r358, 25; - add.s32 %r360, %r312, %r153; - add.s32 %r361, %r360, %r331; - xor.b32 %r362, %r361, %r300; - shf.l.wrap.b32 %r363, %r362, %r362, 16; - add.s32 %r364, %r363, %r343; - xor.b32 %r365, %r364, %r331; - shf.l.wrap.b32 %r366, %r365, %r365, 20; - add.s32 %r367, %r361, %r101; - add.s32 %r368, %r367, %r366; - xor.b32 %r369, %r368, %r363; - shf.l.wrap.b32 %r370, %r369, %r369, 24; - add.s32 %r371, %r370, %r364; - xor.b32 %r372, %r371, %r366; - shf.l.wrap.b32 %r373, %r372, %r372, 25; - add.s32 %r374, %r326, %r129; - add.s32 %r375, %r374, %r345; - xor.b32 %r376, %r375, %r314; - shf.l.wrap.b32 %r377, %r376, %r376, 16; - add.s32 %r378, %r377, %r301; - xor.b32 %r379, %r378, %r345; - shf.l.wrap.b32 %r380, %r379, %r379, 20; - add.s32 %r381, %r375, %r169; - add.s32 %r382, %r381, %r380; - xor.b32 %r383, %r382, %r377; - shf.l.wrap.b32 %r384, %r383, %r383, 24; - add.s32 %r385, %r384, %r378; - xor.b32 %r386, %r385, %r380; - shf.l.wrap.b32 %r387, %r386, %r386, 25; - add.s32 %r388, 
%r340, %r177; - add.s32 %r389, %r388, %r303; - xor.b32 %r390, %r389, %r328; - shf.l.wrap.b32 %r391, %r390, %r390, 16; - add.s32 %r392, %r391, %r315; - xor.b32 %r393, %r392, %r303; - shf.l.wrap.b32 %r394, %r393, %r393, 20; - add.s32 %r395, %r389, %r121; - add.s32 %r396, %r395, %r394; - xor.b32 %r397, %r396, %r391; - shf.l.wrap.b32 %r398, %r397, %r397, 24; - add.s32 %r399, %r398, %r392; - xor.b32 %r400, %r399, %r394; - shf.l.wrap.b32 %r401, %r400, %r400, 25; - add.s32 %r402, %r354, %r89; - add.s32 %r403, %r402, %r401; - xor.b32 %r404, %r403, %r370; - shf.l.wrap.b32 %r405, %r404, %r404, 16; - add.s32 %r406, %r405, %r385; - xor.b32 %r407, %r406, %r401; - shf.l.wrap.b32 %r408, %r407, %r407, 20; - add.s32 %r409, %r403, %r93; - add.s32 %r410, %r409, %r408; - xor.b32 %r411, %r410, %r405; - shf.l.wrap.b32 %r412, %r411, %r411, 24; - add.s32 %r413, %r412, %r406; - xor.b32 %r414, %r413, %r408; - shf.l.wrap.b32 %r415, %r414, %r414, 25; - add.s32 %r416, %r368, %r137; - add.s32 %r417, %r416, %r359; - xor.b32 %r418, %r417, %r384; - shf.l.wrap.b32 %r419, %r418, %r418, 16; - add.s32 %r420, %r419, %r399; - xor.b32 %r421, %r420, %r359; - shf.l.wrap.b32 %r422, %r421, %r421, 20; - add.s32 %r423, %r417, %r153; - add.s32 %r424, %r423, %r422; - xor.b32 %r425, %r424, %r419; - shf.l.wrap.b32 %r426, %r425, %r425, 24; - add.s32 %r427, %r426, %r420; - xor.b32 %r428, %r427, %r422; - shf.l.wrap.b32 %r429, %r428, %r428, 25; - add.s32 %r430, %r382, %r161; - add.s32 %r431, %r430, %r373; - xor.b32 %r432, %r431, %r398; - shf.l.wrap.b32 %r433, %r432, %r432, 16; - add.s32 %r434, %r433, %r357; - xor.b32 %r435, %r434, %r373; - shf.l.wrap.b32 %r436, %r435, %r435, 20; - add.s32 %r437, %r431, %r81; - add.s32 %r438, %r437, %r436; - xor.b32 %r439, %r438, %r433; - shf.l.wrap.b32 %r440, %r439, %r439, 24; - add.s32 %r441, %r440, %r434; - xor.b32 %r442, %r441, %r436; - shf.l.wrap.b32 %r443, %r442, %r442, 25; - add.s32 %r444, %r396, %r113; - add.s32 %r445, %r444, %r387; - xor.b32 %r446, %r445, %r356; - shf.l.wrap.b32 %r447, %r446, %r446, 16; - add.s32 %r448, %r447, %r371; - xor.b32 %r449, %r448, %r387; - shf.l.wrap.b32 %r450, %r449, %r449, 20; - add.s32 %r451, %r445, %r169; - add.s32 %r452, %r451, %r450; - xor.b32 %r453, %r452, %r447; - shf.l.wrap.b32 %r454, %r453, %r453, 24; - add.s32 %r455, %r454, %r448; - xor.b32 %r456, %r455, %r450; - shf.l.wrap.b32 %r457, %r456, %r456, 25; - add.s32 %r458, %r410, %r105; - add.s32 %r459, %r458, %r429; - xor.b32 %r460, %r459, %r454; - shf.l.wrap.b32 %r461, %r460, %r460, 16; - add.s32 %r462, %r461, %r441; - xor.b32 %r463, %r462, %r429; - shf.l.wrap.b32 %r464, %r463, %r463, 20; - add.s32 %r465, %r459, %r101; - add.s32 %r466, %r465, %r464; - xor.b32 %r467, %r466, %r461; - shf.l.wrap.b32 %r468, %r467, %r467, 24; - add.s32 %r469, %r468, %r462; - xor.b32 %r470, %r469, %r464; - shf.l.wrap.b32 %r471, %r470, %r470, 25; - add.s32 %r472, %r424, %r129; - add.s32 %r473, %r472, %r443; - xor.b32 %r474, %r473, %r412; - shf.l.wrap.b32 %r475, %r474, %r474, 16; - add.s32 %r476, %r475, %r455; - xor.b32 %r477, %r476, %r443; - shf.l.wrap.b32 %r478, %r477, %r477, 20; - add.s32 %r479, %r473, %r69; - add.s32 %r480, %r479, %r478; - xor.b32 %r481, %r480, %r475; - shf.l.wrap.b32 %r482, %r481, %r481, 24; - add.s32 %r483, %r482, %r476; - xor.b32 %r484, %r483, %r478; - shf.l.wrap.b32 %r485, %r484, %r484, 25; - add.s32 %r486, %r438, %r145; - add.s32 %r487, %r486, %r457; - xor.b32 %r488, %r487, %r426; - shf.l.wrap.b32 %r489, %r488, %r488, 16; - add.s32 %r490, %r489, %r413; - xor.b32 %r491, %r490, %r457; - shf.l.wrap.b32 %r492, %r491, 
%r491, 20; - add.s32 %r493, %r487, %r177; - add.s32 %r494, %r493, %r492; - xor.b32 %r495, %r494, %r489; - shf.l.wrap.b32 %r496, %r495, %r495, 24; - add.s32 %r497, %r496, %r490; - xor.b32 %r498, %r497, %r492; - shf.l.wrap.b32 %r499, %r498, %r498, 25; - add.s32 %r500, %r452, %r121; - add.s32 %r501, %r500, %r415; - xor.b32 %r502, %r501, %r440; - shf.l.wrap.b32 %r503, %r502, %r502, 16; - add.s32 %r504, %r503, %r427; - xor.b32 %r505, %r504, %r415; - shf.l.wrap.b32 %r506, %r505, %r505, 20; - add.s32 %r507, %r501, %r77; - add.s32 %r508, %r507, %r506; - xor.b32 %r509, %r508, %r503; - shf.l.wrap.b32 %r510, %r509, %r509, 24; - add.s32 %r511, %r510, %r504; - xor.b32 %r512, %r511, %r506; - shf.l.wrap.b32 %r513, %r512, %r512, 25; - add.s32 %r514, %r466, %r137; - add.s32 %r515, %r514, %r513; - xor.b32 %r516, %r515, %r482; - shf.l.wrap.b32 %r517, %r516, %r516, 16; - add.s32 %r518, %r517, %r497; - xor.b32 %r519, %r518, %r513; - shf.l.wrap.b32 %r520, %r519, %r519, 20; - add.s32 %r521, %r515, %r113; - add.s32 %r522, %r521, %r520; - xor.b32 %r523, %r522, %r517; - shf.l.wrap.b32 %r524, %r523, %r523, 24; - add.s32 %r525, %r524, %r518; - xor.b32 %r526, %r525, %r520; - shf.l.wrap.b32 %r527, %r526, %r526, 25; - add.s32 %r528, %r480, %r153; - add.s32 %r529, %r528, %r471; - xor.b32 %r530, %r529, %r496; - shf.l.wrap.b32 %r531, %r530, %r530, 16; - add.s32 %r532, %r531, %r511; - xor.b32 %r533, %r532, %r471; - shf.l.wrap.b32 %r534, %r533, %r533, 20; - add.s32 %r535, %r529, %r129; - add.s32 %r536, %r535, %r534; - xor.b32 %r537, %r536, %r531; - shf.l.wrap.b32 %r538, %r537, %r537, 24; - add.s32 %r539, %r538, %r532; - xor.b32 %r540, %r539, %r534; - shf.l.wrap.b32 %r541, %r540, %r540, 25; - add.s32 %r542, %r494, %r169; - add.s32 %r543, %r542, %r485; - xor.b32 %r544, %r543, %r510; - shf.l.wrap.b32 %r545, %r544, %r544, 16; - add.s32 %r546, %r545, %r469; - xor.b32 %r547, %r546, %r485; - shf.l.wrap.b32 %r548, %r547, %r547, 20; - add.s32 %r549, %r543, %r89; - add.s32 %r550, %r549, %r548; - xor.b32 %r551, %r550, %r545; - shf.l.wrap.b32 %r552, %r551, %r551, 24; - add.s32 %r553, %r552, %r546; - xor.b32 %r554, %r553, %r548; - shf.l.wrap.b32 %r555, %r554, %r554, 25; - add.s32 %r556, %r508, %r161; - add.s32 %r557, %r556, %r499; - xor.b32 %r558, %r557, %r468; - shf.l.wrap.b32 %r559, %r558, %r558, 16; - add.s32 %r560, %r559, %r483; - xor.b32 %r561, %r560, %r499; - shf.l.wrap.b32 %r562, %r561, %r561, 20; - add.s32 %r563, %r557, %r177; - add.s32 %r564, %r563, %r562; - xor.b32 %r565, %r564, %r559; - shf.l.wrap.b32 %r566, %r565, %r565, 24; - add.s32 %r567, %r566, %r560; - xor.b32 %r568, %r567, %r562; - shf.l.wrap.b32 %r569, %r568, %r568, 25; - add.s32 %r570, %r522, %r93; - add.s32 %r571, %r570, %r541; - xor.b32 %r572, %r571, %r566; - shf.l.wrap.b32 %r573, %r572, %r572, 16; - add.s32 %r574, %r573, %r553; - xor.b32 %r575, %r574, %r541; - shf.l.wrap.b32 %r576, %r575, %r575, 20; - add.s32 %r577, %r571, %r69; - add.s32 %r578, %r577, %r576; - xor.b32 %r579, %r578, %r573; - shf.l.wrap.b32 %r580, %r579, %r579, 24; - add.s32 %r581, %r580, %r574; - xor.b32 %r582, %r581, %r576; - shf.l.wrap.b32 %r583, %r582, %r582, 25; - add.s32 %r584, %r536, %r145; - add.s32 %r585, %r584, %r555; - xor.b32 %r586, %r585, %r524; - shf.l.wrap.b32 %r587, %r586, %r586, 16; - add.s32 %r588, %r587, %r567; - xor.b32 %r589, %r588, %r555; - shf.l.wrap.b32 %r590, %r589, %r589, 20; - add.s32 %r591, %r585, %r81; - add.s32 %r592, %r591, %r590; - xor.b32 %r593, %r592, %r587; - shf.l.wrap.b32 %r594, %r593, %r593, 24; - add.s32 %r595, %r594, %r588; - xor.b32 %r596, %r595, %r590; - 
shf.l.wrap.b32 %r597, %r596, %r596, 25; - add.s32 %r598, %r550, %r101; - add.s32 %r599, %r598, %r569; - xor.b32 %r600, %r599, %r538; - shf.l.wrap.b32 %r601, %r600, %r600, 16; - add.s32 %r602, %r601, %r525; - xor.b32 %r603, %r602, %r569; - shf.l.wrap.b32 %r604, %r603, %r603, 20; - add.s32 %r605, %r599, %r121; - add.s32 %r606, %r605, %r604; - xor.b32 %r607, %r606, %r601; - shf.l.wrap.b32 %r608, %r607, %r607, 24; - add.s32 %r609, %r608, %r602; - xor.b32 %r610, %r609, %r604; - shf.l.wrap.b32 %r611, %r610, %r610, 25; - add.s32 %r612, %r564, %r77; - add.s32 %r613, %r612, %r527; - xor.b32 %r614, %r613, %r552; - shf.l.wrap.b32 %r615, %r614, %r614, 16; - add.s32 %r616, %r615, %r539; - xor.b32 %r617, %r616, %r527; - shf.l.wrap.b32 %r618, %r617, %r617, 20; - add.s32 %r619, %r613, %r105; - add.s32 %r620, %r619, %r618; - xor.b32 %r621, %r620, %r615; - shf.l.wrap.b32 %r622, %r621, %r621, 24; - add.s32 %r623, %r622, %r616; - xor.b32 %r624, %r623, %r618; - shf.l.wrap.b32 %r625, %r624, %r624, 25; - add.s32 %r626, %r578, %r153; - add.s32 %r627, %r626, %r625; - xor.b32 %r628, %r627, %r594; - shf.l.wrap.b32 %r629, %r628, %r628, 16; - add.s32 %r630, %r629, %r609; - xor.b32 %r631, %r630, %r625; - shf.l.wrap.b32 %r632, %r631, %r631, 20; - add.s32 %r633, %r627, %r161; - add.s32 %r634, %r633, %r632; - xor.b32 %r635, %r634, %r629; - shf.l.wrap.b32 %r636, %r635, %r635, 24; - add.s32 %r637, %r636, %r630; - xor.b32 %r638, %r637, %r632; - shf.l.wrap.b32 %r639, %r638, %r638, 25; - add.s32 %r640, %r592, %r129; - add.s32 %r641, %r640, %r583; - xor.b32 %r642, %r641, %r608; - shf.l.wrap.b32 %r643, %r642, %r642, 16; - add.s32 %r644, %r643, %r623; - xor.b32 %r645, %r644, %r583; - shf.l.wrap.b32 %r646, %r645, %r645, 20; - add.s32 %r647, %r641, %r145; - add.s32 %r648, %r647, %r646; - xor.b32 %r649, %r648, %r643; - shf.l.wrap.b32 %r650, %r649, %r649, 24; - add.s32 %r651, %r650, %r644; - xor.b32 %r652, %r651, %r646; - shf.l.wrap.b32 %r653, %r652, %r652, 25; - add.s32 %r654, %r606, %r177; - add.s32 %r655, %r654, %r597; - xor.b32 %r656, %r655, %r622; - shf.l.wrap.b32 %r657, %r656, %r656, 16; - add.s32 %r658, %r657, %r581; - xor.b32 %r659, %r658, %r597; - shf.l.wrap.b32 %r660, %r659, %r659, 20; - add.s32 %r661, %r655, %r137; - add.s32 %r662, %r661, %r660; - xor.b32 %r663, %r662, %r657; - shf.l.wrap.b32 %r664, %r663, %r663, 24; - add.s32 %r665, %r664, %r658; - xor.b32 %r666, %r665, %r660; - shf.l.wrap.b32 %r667, %r666, %r666, 25; - add.s32 %r668, %r620, %r169; - add.s32 %r669, %r668, %r611; - xor.b32 %r670, %r669, %r580; - shf.l.wrap.b32 %r671, %r670, %r670, 16; - add.s32 %r672, %r671, %r595; - xor.b32 %r673, %r672, %r611; - shf.l.wrap.b32 %r674, %r673, %r673, 20; - add.s32 %r675, %r669, %r121; - add.s32 %r676, %r675, %r674; - xor.b32 %r677, %r676, %r671; - shf.l.wrap.b32 %r678, %r677, %r677, 24; - add.s32 %r679, %r678, %r672; - xor.b32 %r680, %r679, %r674; - shf.l.wrap.b32 %r681, %r680, %r680, 25; - add.s32 %r682, %r634, %r113; - add.s32 %r683, %r682, %r653; - xor.b32 %r684, %r683, %r678; - shf.l.wrap.b32 %r685, %r684, %r684, 16; - add.s32 %r686, %r685, %r665; - xor.b32 %r687, %r686, %r653; - shf.l.wrap.b32 %r688, %r687, %r687, 20; - add.s32 %r689, %r683, %r81; - add.s32 %r690, %r689, %r688; - xor.b32 %r691, %r690, %r685; - shf.l.wrap.b32 %r692, %r691, %r691, 24; - add.s32 %r693, %r692, %r686; - xor.b32 %r694, %r693, %r688; - shf.l.wrap.b32 %r695, %r694, %r694, 25; - add.s32 %r696, %r648, %r101; - add.s32 %r697, %r696, %r667; - xor.b32 %r698, %r697, %r636; - shf.l.wrap.b32 %r699, %r698, %r698, 16; - add.s32 %r700, %r699, %r679; - 
xor.b32 %r701, %r700, %r667; - shf.l.wrap.b32 %r702, %r701, %r701, 20; - add.s32 %r703, %r697, %r89; - add.s32 %r704, %r703, %r702; - xor.b32 %r705, %r704, %r699; - shf.l.wrap.b32 %r706, %r705, %r705, 24; - add.s32 %r707, %r706, %r700; - xor.b32 %r708, %r707, %r702; - shf.l.wrap.b32 %r709, %r708, %r708, 25; - add.s32 %r710, %r662, %r69; - add.s32 %r711, %r710, %r681; - xor.b32 %r712, %r711, %r650; - shf.l.wrap.b32 %r713, %r712, %r712, 16; - add.s32 %r714, %r713, %r637; - xor.b32 %r715, %r714, %r681; - shf.l.wrap.b32 %r716, %r715, %r715, 20; - add.s32 %r717, %r711, %r77; - add.s32 %r718, %r717, %r716; - xor.b32 %r719, %r718, %r713; - shf.l.wrap.b32 %r720, %r719, %r719, 24; - add.s32 %r721, %r720, %r714; - xor.b32 %r722, %r721, %r716; - shf.l.wrap.b32 %r723, %r722, %r722, 25; - add.s32 %r724, %r676, %r105; - add.s32 %r725, %r724, %r639; - xor.b32 %r726, %r725, %r664; - shf.l.wrap.b32 %r727, %r726, %r726, 16; - add.s32 %r728, %r727, %r651; - xor.b32 %r729, %r728, %r639; - shf.l.wrap.b32 %r730, %r729, %r729, 20; - add.s32 %r731, %r725, %r93; - add.s32 %r732, %r731, %r730; - xor.b32 %r733, %r732, %r727; - shf.l.wrap.b32 %r734, %r733, %r733, 24; - add.s32 %r735, %r734, %r728; - xor.b32 %r736, %r735, %r730; - shf.l.wrap.b32 %r737, %r736, %r736, 25; - add.s32 %r738, %r690, %r129; - add.s32 %r739, %r738, %r737; - xor.b32 %r740, %r739, %r706; - shf.l.wrap.b32 %r741, %r740, %r740, 16; - add.s32 %r742, %r741, %r721; - xor.b32 %r743, %r742, %r737; - shf.l.wrap.b32 %r744, %r743, %r743, 20; - add.s32 %r745, %r739, %r169; - add.s32 %r746, %r745, %r744; - xor.b32 %r747, %r746, %r741; - shf.l.wrap.b32 %r748, %r747, %r747, 24; - add.s32 %r749, %r748, %r742; - xor.b32 %r750, %r749, %r744; - shf.l.wrap.b32 %r751, %r750, %r750, 25; - add.s32 %r752, %r704, %r145; - add.s32 %r753, %r752, %r695; - xor.b32 %r754, %r753, %r720; - shf.l.wrap.b32 %r755, %r754, %r754, 16; - add.s32 %r756, %r755, %r735; - xor.b32 %r757, %r756, %r695; - shf.l.wrap.b32 %r758, %r757, %r757, 20; - add.s32 %r759, %r753, %r101; - add.s32 %r760, %r759, %r758; - xor.b32 %r761, %r760, %r755; - shf.l.wrap.b32 %r762, %r761, %r761, 24; - add.s32 %r763, %r762, %r756; - xor.b32 %r764, %r763, %r758; - shf.l.wrap.b32 %r765, %r764, %r764, 25; - add.s32 %r766, %r718, %r121; - add.s32 %r767, %r766, %r709; - xor.b32 %r768, %r767, %r734; - shf.l.wrap.b32 %r769, %r768, %r768, 16; - add.s32 %r770, %r769, %r693; - xor.b32 %r771, %r770, %r709; - shf.l.wrap.b32 %r772, %r771, %r771, 20; - add.s32 %r773, %r767, %r153; - add.s32 %r774, %r773, %r772; - xor.b32 %r775, %r774, %r769; - shf.l.wrap.b32 %r776, %r775, %r775, 24; - add.s32 %r777, %r776, %r770; - xor.b32 %r778, %r777, %r772; - shf.l.wrap.b32 %r779, %r778, %r778, 25; - add.s32 %r780, %r732, %r177; - add.s32 %r781, %r780, %r723; - xor.b32 %r782, %r781, %r692; - shf.l.wrap.b32 %r783, %r782, %r782, 16; - add.s32 %r784, %r783, %r707; - xor.b32 %r785, %r784, %r723; - shf.l.wrap.b32 %r786, %r785, %r785, 20; - add.s32 %r787, %r781, %r77; - add.s32 %r788, %r787, %r786; - xor.b32 %r789, %r788, %r783; - shf.l.wrap.b32 %r790, %r789, %r789, 24; - add.s32 %r791, %r790, %r784; - xor.b32 %r792, %r791, %r786; - shf.l.wrap.b32 %r793, %r792, %r792, 25; - add.s32 %r794, %r746, %r161; - add.s32 %r795, %r794, %r765; - xor.b32 %r796, %r795, %r790; - shf.l.wrap.b32 %r797, %r796, %r796, 16; - add.s32 %r798, %r797, %r777; - xor.b32 %r799, %r798, %r765; - shf.l.wrap.b32 %r800, %r799, %r799, 20; - add.s32 %r801, %r795, %r89; - add.s32 %r802, %r801, %r800; - xor.b32 %r803, %r802, %r797; - shf.l.wrap.b32 %r804, %r803, %r803, 24; - 
add.s32 %r805, %r804, %r798; - xor.b32 %r806, %r805, %r800; - shf.l.wrap.b32 %r807, %r806, %r806, 25; - add.s32 %r808, %r760, %r69; - add.s32 %r809, %r808, %r779; - xor.b32 %r810, %r809, %r748; - shf.l.wrap.b32 %r811, %r810, %r810, 16; - add.s32 %r812, %r811, %r791; - xor.b32 %r813, %r812, %r779; - shf.l.wrap.b32 %r814, %r813, %r813, 20; - add.s32 %r815, %r809, %r137; - add.s32 %r816, %r815, %r814; - xor.b32 %r817, %r816, %r811; - shf.l.wrap.b32 %r818, %r817, %r817, 24; - add.s32 %r819, %r818, %r812; - xor.b32 %r820, %r819, %r814; - shf.l.wrap.b32 %r821, %r820, %r820, 25; - add.s32 %r822, %r774, %r81; - add.s32 %r823, %r822, %r793; - xor.b32 %r824, %r823, %r762; - shf.l.wrap.b32 %r825, %r824, %r824, 16; - add.s32 %r826, %r825, %r749; - xor.b32 %r827, %r826, %r793; - shf.l.wrap.b32 %r828, %r827, %r827, 20; - add.s32 %r829, %r823, %r105; - add.s32 %r830, %r829, %r828; - xor.b32 %r831, %r830, %r825; - shf.l.wrap.b32 %r832, %r831, %r831, 24; - add.s32 %r833, %r832, %r826; - xor.b32 %r834, %r833, %r828; - shf.l.wrap.b32 %r835, %r834, %r834, 25; - add.s32 %r836, %r788, %r93; - add.s32 %r837, %r836, %r751; - xor.b32 %r838, %r837, %r776; - shf.l.wrap.b32 %r839, %r838, %r838, 16; - add.s32 %r840, %r839, %r763; - xor.b32 %r841, %r840, %r751; - shf.l.wrap.b32 %r842, %r841, %r841, 20; - add.s32 %r843, %r837, %r113; - add.s32 %r844, %r843, %r842; - xor.b32 %r845, %r844, %r839; - shf.l.wrap.b32 %r846, %r845, %r845, 24; - add.s32 %r847, %r846, %r840; - xor.b32 %r848, %r847, %r842; - shf.l.wrap.b32 %r849, %r848, %r848, 25; - add.s32 %r850, %r802, %r145; - add.s32 %r851, %r850, %r849; - xor.b32 %r852, %r851, %r818; - shf.l.wrap.b32 %r853, %r852, %r852, 16; - add.s32 %r854, %r853, %r833; - xor.b32 %r855, %r854, %r849; - shf.l.wrap.b32 %r856, %r855, %r855, 20; - add.s32 %r857, %r851, %r177; - add.s32 %r858, %r857, %r856; - xor.b32 %r859, %r858, %r853; - shf.l.wrap.b32 %r860, %r859, %r859, 24; - add.s32 %r861, %r860, %r854; - xor.b32 %r862, %r861, %r856; - shf.l.wrap.b32 %r863, %r862, %r862, 25; - add.s32 %r864, %r816, %r101; - add.s32 %r865, %r864, %r807; - xor.b32 %r866, %r865, %r832; - shf.l.wrap.b32 %r867, %r866, %r866, 16; - add.s32 %r868, %r867, %r847; - xor.b32 %r869, %r868, %r807; - shf.l.wrap.b32 %r870, %r869, %r869, 20; - add.s32 %r871, %r865, %r69; - add.s32 %r872, %r871, %r870; - xor.b32 %r873, %r872, %r867; - shf.l.wrap.b32 %r874, %r873, %r873, 24; - add.s32 %r875, %r874, %r868; - xor.b32 %r876, %r875, %r870; - shf.l.wrap.b32 %r877, %r876, %r876, 25; - add.s32 %r878, %r830, %r77; - add.s32 %r879, %r878, %r821; - xor.b32 %r880, %r879, %r846; - shf.l.wrap.b32 %r881, %r880, %r880, 16; - add.s32 %r882, %r881, %r805; - xor.b32 %r883, %r882, %r821; - shf.l.wrap.b32 %r884, %r883, %r883, 20; - add.s32 %r885, %r879, %r129; - add.s32 %r886, %r885, %r884; - xor.b32 %r887, %r886, %r881; - shf.l.wrap.b32 %r888, %r887, %r887, 24; - add.s32 %r889, %r888, %r882; - xor.b32 %r890, %r889, %r884; - shf.l.wrap.b32 %r891, %r890, %r890, 25; - add.s32 %r892, %r844, %r121; - add.s32 %r893, %r892, %r835; - xor.b32 %r894, %r893, %r804; - shf.l.wrap.b32 %r895, %r894, %r894, 16; - add.s32 %r896, %r895, %r819; - xor.b32 %r897, %r896, %r835; - shf.l.wrap.b32 %r898, %r897, %r897, 20; - add.s32 %r899, %r893, %r105; - add.s32 %r900, %r899, %r898; - xor.b32 %r901, %r900, %r895; - shf.l.wrap.b32 %r902, %r901, %r901, 24; - add.s32 %r903, %r902, %r896; - xor.b32 %r904, %r903, %r898; - shf.l.wrap.b32 %r905, %r904, %r904, 25; - add.s32 %r906, %r858, %r169; - add.s32 %r907, %r906, %r877; - xor.b32 %r908, %r907, %r902; - shf.l.wrap.b32 
%r909, %r908, %r908, 16; - add.s32 %r910, %r909, %r889; - xor.b32 %r911, %r910, %r877; - shf.l.wrap.b32 %r912, %r911, %r911, 20; - add.s32 %r913, %r907, %r137; - add.s32 %r914, %r913, %r912; - xor.b32 %r915, %r914, %r909; - shf.l.wrap.b32 %r916, %r915, %r915, 24; - add.s32 %r917, %r916, %r910; - xor.b32 %r918, %r917, %r912; - shf.l.wrap.b32 %r919, %r918, %r918, 25; - add.s32 %r920, %r872, %r81; - add.s32 %r921, %r920, %r891; - xor.b32 %r922, %r921, %r860; - shf.l.wrap.b32 %r923, %r922, %r922, 16; - add.s32 %r924, %r923, %r903; - xor.b32 %r925, %r924, %r891; - shf.l.wrap.b32 %r926, %r925, %r925, 20; - add.s32 %r927, %r921, %r153; - add.s32 %r928, %r927, %r926; - xor.b32 %r929, %r928, %r923; - shf.l.wrap.b32 %r930, %r929, %r929, 24; - add.s32 %r931, %r930, %r924; - xor.b32 %r932, %r931, %r926; - shf.l.wrap.b32 %r933, %r932, %r932, 25; - add.s32 %r934, %r886, %r89; - add.s32 %r935, %r934, %r905; - xor.b32 %r936, %r935, %r874; - shf.l.wrap.b32 %r937, %r936, %r936, 16; - add.s32 %r938, %r937, %r861; - xor.b32 %r939, %r938, %r905; - shf.l.wrap.b32 %r940, %r939, %r939, 20; - add.s32 %r941, %r935, %r93; - add.s32 %r942, %r941, %r940; - xor.b32 %r943, %r942, %r937; - shf.l.wrap.b32 %r944, %r943, %r943, 24; - add.s32 %r945, %r944, %r938; - xor.b32 %r946, %r945, %r940; - shf.l.wrap.b32 %r947, %r946, %r946, 25; - add.s32 %r948, %r900, %r113; - add.s32 %r949, %r948, %r863; - xor.b32 %r950, %r949, %r888; - shf.l.wrap.b32 %r951, %r950, %r950, 16; - add.s32 %r952, %r951, %r875; - xor.b32 %r953, %r952, %r863; - shf.l.wrap.b32 %r954, %r953, %r953, 20; - add.s32 %r955, %r949, %r161; - add.s32 %r956, %r955, %r954; - xor.b32 %r957, %r956, %r951; - shf.l.wrap.b32 %r958, %r957, %r957, 24; - add.s32 %r959, %r958, %r952; - xor.b32 %r960, %r959, %r954; - shf.l.wrap.b32 %r961, %r960, %r960, 25; - xor.b32 %r1, %r945, %r914; - xor.b32 %r2, %r959, %r928; - st.local.v2.u32 [%rd3+32], {%r1, %r2}; - xor.b32 %r3, %r917, %r942; - xor.b32 %r4, %r956, %r931; - st.local.v2.u32 [%rd3+40], {%r3, %r4}; - xor.b32 %r5, %r961, %r930; - xor.b32 %r6, %r919, %r944; - st.local.v2.u32 [%rd3+48], {%r5, %r6}; - xor.b32 %r7, %r958, %r933; - xor.b32 %r8, %r947, %r916; - st.local.v2.u32 [%rd3+56], {%r7, %r8}; - st.local.u64 [%rd3+72], %rd131; - st.local.u64 [%rd3+80], %rd8; - add.s16 %rs1, %rs61, 16; - and.b16 %rs70, %rs1, 255; - add.s16 %rs71, %rs62, 1; - st.local.v2.u8 [%rd3+136], {%rs1, %rs71}; - cvt.u32.u16 %r962, %rs71; - cvt.u32.u16 %r963, %rs70; - prmt.b32 %r964, %r962, %r963, 30212; - cvt.u16.u32 %rs72, %r964; - shr.u16 %rs2, %rs72, 8; - mov.b32 {%rs5, %rs6}, %r53; - mov.b32 {%rs3, %rs4}, %r52; - mov.b32 {%rs9, %rs10}, %r27; - mov.b32 {%rs7, %rs8}, %r26; - setp.eq.s16 %p10, %rs2, 0; - selp.u16 %rs73, 1, 0, %p10; - shr.u16 %rs74, %rs3, 8; - shr.u16 %rs75, %rs4, 8; - shr.u16 %rs76, %rs5, 8; - shr.u16 %rs77, %rs6, 8; - shr.u16 %rs78, %rs7, 8; - shr.u16 %rs79, %rs8, 8; - shr.u16 %rs80, %rs9, 8; - shr.u16 %rs81, %rs10, 8; - or.b16 %rs82, %rs73, 10; - cvt.u32.u16 %r965, %rs3; - and.b32 %r966, %r965, 255; - cvt.u32.u16 %r967, %rs74; - prmt.b32 %r968, %r967, %r966, 30212; - cvt.u32.u16 %r969, %rs4; - prmt.b32 %r970, %r969, %r968, 28756; - cvt.u32.u16 %r971, %rs75; - prmt.b32 %r972, %r971, %r970, 1620; - cvt.u32.u16 %r973, %rs5; - and.b32 %r974, %r973, 255; - cvt.u32.u16 %r975, %rs76; - prmt.b32 %r976, %r975, %r974, 30212; - cvt.u32.u16 %r977, %rs6; - prmt.b32 %r978, %r977, %r976, 28756; - cvt.u32.u16 %r979, %rs77; - prmt.b32 %r980, %r979, %r978, 1620; - cvt.u32.u16 %r981, %rs7; - and.b32 %r982, %r981, 255; - cvt.u32.u16 %r983, %rs78; - 
prmt.b32 %r984, %r983, %r982, 30212; - cvt.u32.u16 %r985, %rs8; - prmt.b32 %r986, %r985, %r984, 28756; - cvt.u32.u16 %r987, %rs79; - prmt.b32 %r988, %r987, %r986, 1620; - cvt.u32.u16 %r989, %rs9; - and.b32 %r990, %r989, 255; - cvt.u32.u16 %r991, %rs80; - prmt.b32 %r992, %r991, %r990, 30212; - cvt.u32.u16 %r993, %rs10; - prmt.b32 %r994, %r993, %r992, 28756; - cvt.u32.u16 %r995, %rs81; - prmt.b32 %r996, %r995, %r994, 1620; - cvt.u32.u16 %r997, %rs82; - add.s32 %r998, %r5, %r1; - add.s32 %r999, %r998, %r972; - add.s32 %r1000, %r980, %r999; - add.s32 %r1001, %r6, %r2; - add.s32 %r1002, %r1001, %r988; - add.s32 %r1003, %r996, %r1002; - add.s32 %r1004, %r7, %r3; - cvt.u32.u16 %r1005, %rs1; - and.b32 %r1006, %r1005, 255; - xor.b32 %r1007, %r1004, %r1006; - shr.u32 %r1008, %r1004, 16; - shl.b32 %r1009, %r1007, 16; - or.b32 %r1010, %r1009, %r1008; - add.s32 %r1011, %r1010, 1013904242; - xor.b32 %r1012, %r1011, %r7; - shf.l.wrap.b32 %r1013, %r1012, %r1012, 20; - add.s32 %r1014, %r1004, %r1013; - xor.b32 %r1015, %r1014, %r1010; - shf.l.wrap.b32 %r1016, %r1015, %r1015, 24; - add.s32 %r1017, %r1016, %r1011; - xor.b32 %r1018, %r1017, %r1013; - shf.l.wrap.b32 %r1019, %r1018, %r1018, 25; - add.s32 %r1020, %r8, %r4; - xor.b32 %r1021, %r1020, %r997; - shr.u32 %r1022, %r1020, 16; - shl.b32 %r1023, %r1021, 16; - or.b32 %r1024, %r1023, %r1022; - add.s32 %r1025, %r1024, -1521486534; - xor.b32 %r1026, %r1025, %r8; - shf.l.wrap.b32 %r1027, %r1026, %r1026, 20; - add.s32 %r1028, %r1020, %r1027; - xor.b32 %r1029, %r1028, %r1024; - shf.l.wrap.b32 %r1030, %r1029, %r1029, 24; - add.s32 %r1031, %r1030, %r1025; - xor.b32 %r1032, %r1031, %r1027; - shf.l.wrap.b32 %r1033, %r1032, %r1032, 25; - add.s32 %r1034, %r1033, %r1014; - shf.l.wrap.b32 %r1035, %r999, %r999, 16; - add.s32 %r1036, %r1035, 1779033703; - xor.b32 %r1037, %r1036, %r5; - shf.l.wrap.b32 %r1038, %r1037, %r1037, 20; - add.s32 %r1039, %r1000, %r1038; - xor.b32 %r1040, %r1039, %r1035; - shf.l.wrap.b32 %r1041, %r1040, %r1040, 24; - add.s32 %r1042, %r1041, %r1036; - xor.b32 %r1043, %r1042, %r1038; - shf.l.wrap.b32 %r1044, %r1043, %r1043, 25; - shf.l.wrap.b32 %r1045, %r1002, %r1002, 16; - add.s32 %r1046, %r1045, -1150833019; - xor.b32 %r1047, %r1046, %r6; - shf.l.wrap.b32 %r1048, %r1047, %r1047, 20; - add.s32 %r1049, %r1003, %r1048; - xor.b32 %r1050, %r1049, %r1045; - shf.l.wrap.b32 %r1051, %r1050, %r1050, 24; - add.s32 %r1052, %r1051, %r1046; - xor.b32 %r1053, %r1052, %r1048; - shf.l.wrap.b32 %r1054, %r1053, %r1053, 25; - add.s32 %r1055, %r1039, %r1054; - xor.b32 %r1056, %r1055, %r1030; - shf.l.wrap.b32 %r1057, %r1056, %r1056, 16; - add.s32 %r1058, %r1057, %r1017; - xor.b32 %r1059, %r1058, %r1054; - shf.l.wrap.b32 %r1060, %r1059, %r1059, 20; - add.s32 %r1061, %r1055, %r1060; - xor.b32 %r1062, %r1061, %r1057; - shf.l.wrap.b32 %r1063, %r1062, %r1062, 24; - add.s32 %r1064, %r1063, %r1058; - xor.b32 %r1065, %r1064, %r1060; - shf.l.wrap.b32 %r1066, %r1065, %r1065, 25; - add.s32 %r1067, %r1019, %r1049; - xor.b32 %r1068, %r1041, %r1067; - shf.l.wrap.b32 %r1069, %r1068, %r1068, 16; - add.s32 %r1070, %r1069, %r1031; - xor.b32 %r1071, %r1070, %r1019; - shf.l.wrap.b32 %r1072, %r1071, %r1071, 20; - add.s32 %r1073, %r1067, %r1072; - xor.b32 %r1074, %r1073, %r1069; - shf.l.wrap.b32 %r1075, %r1074, %r1074, 24; - add.s32 %r1076, %r1075, %r1070; - xor.b32 %r1077, %r1076, %r1072; - shf.l.wrap.b32 %r1078, %r1077, %r1077, 25; - xor.b32 %r1079, %r1051, %r1034; - shf.l.wrap.b32 %r1080, %r1079, %r1079, 16; - add.s32 %r1081, %r1080, %r1042; - xor.b32 %r1082, %r1081, %r1033; - 
shf.l.wrap.b32 %r1083, %r1082, %r1082, 20; - add.s32 %r1084, %r1034, %r1083; - xor.b32 %r1085, %r1084, %r1080; - shf.l.wrap.b32 %r1086, %r1085, %r1085, 24; - add.s32 %r1087, %r1086, %r1081; - xor.b32 %r1088, %r1087, %r1083; - shf.l.wrap.b32 %r1089, %r1088, %r1088, 25; - add.s32 %r1090, %r1028, %r1044; - xor.b32 %r1091, %r1090, %r1016; - shf.l.wrap.b32 %r1092, %r1091, %r1091, 16; - add.s32 %r1093, %r1092, %r1052; - xor.b32 %r1094, %r1093, %r1044; - shf.l.wrap.b32 %r1095, %r1094, %r1094, 20; - add.s32 %r1096, %r1090, %r1095; - xor.b32 %r1097, %r1096, %r1092; - shf.l.wrap.b32 %r1098, %r1097, %r1097, 24; - add.s32 %r1099, %r1098, %r1093; - xor.b32 %r1100, %r1099, %r1095; - shf.l.wrap.b32 %r1101, %r1100, %r1100, 25; - add.s32 %r1102, %r1061, %r988; - add.s32 %r1103, %r1102, %r1101; - xor.b32 %r1104, %r1103, %r1075; - shf.l.wrap.b32 %r1105, %r1104, %r1104, 16; - add.s32 %r1106, %r1105, %r1087; - xor.b32 %r1107, %r1106, %r1101; - shf.l.wrap.b32 %r1108, %r1107, %r1107, 20; - add.s32 %r1109, %r1103, %r1108; - xor.b32 %r1110, %r1109, %r1105; - shf.l.wrap.b32 %r1111, %r1110, %r1110, 24; - add.s32 %r1112, %r1111, %r1106; - xor.b32 %r1113, %r1112, %r1108; - shf.l.wrap.b32 %r1114, %r1113, %r1113, 25; - add.s32 %r1115, %r1073, %r996; - add.s32 %r1116, %r1115, %r1066; - xor.b32 %r1117, %r1116, %r1086; - shf.l.wrap.b32 %r1118, %r1117, %r1117, 16; - add.s32 %r1119, %r1118, %r1099; - xor.b32 %r1120, %r1119, %r1066; - shf.l.wrap.b32 %r1121, %r1120, %r1120, 20; - add.s32 %r1122, %r1116, %r1121; - xor.b32 %r1123, %r1122, %r1118; - shf.l.wrap.b32 %r1124, %r1123, %r1123, 24; - add.s32 %r1125, %r1124, %r1119; - xor.b32 %r1126, %r1125, %r1121; - shf.l.wrap.b32 %r1127, %r1126, %r1126, 25; - add.s32 %r1128, %r1084, %r1078; - xor.b32 %r1129, %r1098, %r1128; - shf.l.wrap.b32 %r1130, %r1129, %r1129, 16; - add.s32 %r1131, %r1130, %r1064; - xor.b32 %r1132, %r1131, %r1078; - shf.l.wrap.b32 %r1133, %r1132, %r1132, 20; - add.s32 %r1134, %r1128, %r972; - add.s32 %r1135, %r1134, %r1133; - xor.b32 %r1136, %r1135, %r1130; - shf.l.wrap.b32 %r1137, %r1136, %r1136, 24; - add.s32 %r1138, %r1137, %r1131; - xor.b32 %r1139, %r1138, %r1133; - shf.l.wrap.b32 %r1140, %r1139, %r1139, 25; - add.s32 %r1141, %r1096, %r1089; - xor.b32 %r1142, %r1063, %r1141; - shf.l.wrap.b32 %r1143, %r1142, %r1142, 16; - add.s32 %r1144, %r1143, %r1076; - xor.b32 %r1145, %r1144, %r1089; - shf.l.wrap.b32 %r1146, %r1145, %r1145, 20; - add.s32 %r1147, %r1141, %r1146; - xor.b32 %r1148, %r1147, %r1143; - shf.l.wrap.b32 %r1149, %r1148, %r1148, 24; - add.s32 %r1150, %r1149, %r1144; - xor.b32 %r1151, %r1150, %r1146; - shf.l.wrap.b32 %r1152, %r1151, %r1151, 25; - add.s32 %r1153, %r1109, %r980; - add.s32 %r1154, %r1153, %r1127; - xor.b32 %r1155, %r1154, %r1149; - shf.l.wrap.b32 %r1156, %r1155, %r1155, 16; - add.s32 %r1157, %r1156, %r1138; - xor.b32 %r1158, %r1157, %r1127; - shf.l.wrap.b32 %r1159, %r1158, %r1158, 20; - add.s32 %r1160, %r1154, %r1159; - xor.b32 %r1161, %r1160, %r1156; - shf.l.wrap.b32 %r1162, %r1161, %r1161, 24; - add.s32 %r1163, %r1162, %r1157; - xor.b32 %r1164, %r1163, %r1159; - shf.l.wrap.b32 %r1165, %r1164, %r1164, 25; - add.s32 %r1166, %r1140, %r1122; - xor.b32 %r1167, %r1111, %r1166; - shf.l.wrap.b32 %r1168, %r1167, %r1167, 16; - add.s32 %r1169, %r1168, %r1150; - xor.b32 %r1170, %r1169, %r1140; - shf.l.wrap.b32 %r1171, %r1170, %r1170, 20; - add.s32 %r1172, %r1166, %r1171; - xor.b32 %r1173, %r1172, %r1168; - shf.l.wrap.b32 %r1174, %r1173, %r1173, 24; - add.s32 %r1175, %r1174, %r1169; - xor.b32 %r1176, %r1175, %r1171; - shf.l.wrap.b32 %r1177, %r1176, 
%r1176, 25; - add.s32 %r1178, %r1135, %r1152; - xor.b32 %r1179, %r1124, %r1178; - shf.l.wrap.b32 %r1180, %r1179, %r1179, 16; - add.s32 %r1181, %r1180, %r1112; - xor.b32 %r1182, %r1181, %r1152; - shf.l.wrap.b32 %r1183, %r1182, %r1182, 20; - add.s32 %r1184, %r1178, %r1183; - xor.b32 %r1185, %r1184, %r1180; - shf.l.wrap.b32 %r1186, %r1185, %r1185, 24; - add.s32 %r1187, %r1186, %r1181; - xor.b32 %r1188, %r1187, %r1183; - shf.l.wrap.b32 %r1189, %r1188, %r1188, 25; - add.s32 %r1190, %r1147, %r1114; - xor.b32 %r1191, %r1190, %r1137; - shf.l.wrap.b32 %r1192, %r1191, %r1191, 16; - add.s32 %r1193, %r1192, %r1125; - xor.b32 %r1194, %r1193, %r1114; - shf.l.wrap.b32 %r1195, %r1194, %r1194, 20; - add.s32 %r1196, %r1190, %r1195; - xor.b32 %r1197, %r1196, %r1192; - shf.l.wrap.b32 %r1198, %r1197, %r1197, 24; - add.s32 %r1199, %r1198, %r1193; - xor.b32 %r1200, %r1199, %r1195; - shf.l.wrap.b32 %r1201, %r1200, %r1200, 25; - add.s32 %r1202, %r1160, %r996; - add.s32 %r1203, %r1202, %r1201; - xor.b32 %r1204, %r1203, %r1174; - shf.l.wrap.b32 %r1205, %r1204, %r1204, 16; - add.s32 %r1206, %r1205, %r1187; - xor.b32 %r1207, %r1206, %r1201; - shf.l.wrap.b32 %r1208, %r1207, %r1207, 20; - add.s32 %r1209, %r1203, %r1208; - xor.b32 %r1210, %r1209, %r1205; - shf.l.wrap.b32 %r1211, %r1210, %r1210, 24; - add.s32 %r1212, %r1211, %r1206; - xor.b32 %r1213, %r1212, %r1208; - shf.l.wrap.b32 %r1214, %r1213, %r1213, 25; - add.s32 %r1215, %r1172, %r1165; - xor.b32 %r1216, %r1215, %r1186; - shf.l.wrap.b32 %r1217, %r1216, %r1216, 16; - add.s32 %r1218, %r1217, %r1199; - xor.b32 %r1219, %r1218, %r1165; - shf.l.wrap.b32 %r1220, %r1219, %r1219, 20; - add.s32 %r1221, %r1215, %r1220; - xor.b32 %r1222, %r1221, %r1217; - shf.l.wrap.b32 %r1223, %r1222, %r1222, 24; - add.s32 %r1224, %r1223, %r1218; - xor.b32 %r1225, %r1224, %r1220; - shf.l.wrap.b32 %r1226, %r1225, %r1225, 25; - add.s32 %r1227, %r1184, %r1177; - xor.b32 %r1228, %r1198, %r1227; - shf.l.wrap.b32 %r1229, %r1228, %r1228, 16; - add.s32 %r1230, %r1229, %r1163; - xor.b32 %r1231, %r1230, %r1177; - shf.l.wrap.b32 %r1232, %r1231, %r1231, 20; - add.s32 %r1233, %r1227, %r988; - add.s32 %r1234, %r1233, %r1232; - xor.b32 %r1235, %r1234, %r1229; - shf.l.wrap.b32 %r1236, %r1235, %r1235, 24; - add.s32 %r1237, %r1236, %r1230; - xor.b32 %r1238, %r1237, %r1232; - shf.l.wrap.b32 %r1239, %r1238, %r1238, 25; - add.s32 %r1240, %r1196, %r1189; - xor.b32 %r1241, %r1162, %r1240; - shf.l.wrap.b32 %r1242, %r1241, %r1241, 16; - add.s32 %r1243, %r1242, %r1175; - xor.b32 %r1244, %r1243, %r1189; - shf.l.wrap.b32 %r1245, %r1244, %r1244, 20; - add.s32 %r1246, %r1240, %r1245; - xor.b32 %r1247, %r1246, %r1242; - shf.l.wrap.b32 %r1248, %r1247, %r1247, 24; - add.s32 %r1249, %r1248, %r1243; - xor.b32 %r1250, %r1249, %r1245; - shf.l.wrap.b32 %r1251, %r1250, %r1250, 25; - add.s32 %r1252, %r1209, %r1226; - xor.b32 %r1253, %r1252, %r1248; - shf.l.wrap.b32 %r1254, %r1253, %r1253, 16; - add.s32 %r1255, %r1254, %r1237; - xor.b32 %r1256, %r1255, %r1226; - shf.l.wrap.b32 %r1257, %r1256, %r1256, 20; - add.s32 %r1258, %r1252, %r1257; - xor.b32 %r1259, %r1258, %r1254; - shf.l.wrap.b32 %r1260, %r1259, %r1259, 24; - add.s32 %r1261, %r1260, %r1255; - xor.b32 %r1262, %r1261, %r1257; - shf.l.wrap.b32 %r1263, %r1262, %r1262, 25; - add.s32 %r1264, %r1239, %r1221; - xor.b32 %r1265, %r1211, %r1264; - shf.l.wrap.b32 %r1266, %r1265, %r1265, 16; - add.s32 %r1267, %r1266, %r1249; - xor.b32 %r1268, %r1267, %r1239; - shf.l.wrap.b32 %r1269, %r1268, %r1268, 20; - add.s32 %r1270, %r1264, %r972; - add.s32 %r1271, %r1270, %r1269; - xor.b32 %r1272, 
%r1271, %r1266; - shf.l.wrap.b32 %r1273, %r1272, %r1272, 24; - add.s32 %r1274, %r1273, %r1267; - xor.b32 %r1275, %r1274, %r1269; - shf.l.wrap.b32 %r1276, %r1275, %r1275, 25; - add.s32 %r1277, %r1234, %r1251; - xor.b32 %r1278, %r1223, %r1277; - shf.l.wrap.b32 %r1279, %r1278, %r1278, 16; - add.s32 %r1280, %r1279, %r1212; - xor.b32 %r1281, %r1280, %r1251; - shf.l.wrap.b32 %r1282, %r1281, %r1281, 20; - add.s32 %r1283, %r1277, %r1282; - xor.b32 %r1284, %r1283, %r1279; - shf.l.wrap.b32 %r1285, %r1284, %r1284, 24; - add.s32 %r1286, %r1285, %r1280; - xor.b32 %r1287, %r1286, %r1282; - shf.l.wrap.b32 %r1288, %r1287, %r1287, 25; - add.s32 %r1289, %r1246, %r1214; - xor.b32 %r1290, %r1289, %r1236; - shf.l.wrap.b32 %r1291, %r1290, %r1290, 16; - add.s32 %r1292, %r1291, %r1224; - xor.b32 %r1293, %r1292, %r1214; - shf.l.wrap.b32 %r1294, %r1293, %r1293, 20; - add.s32 %r1295, %r1289, %r980; - add.s32 %r1296, %r1295, %r1294; - xor.b32 %r1297, %r1296, %r1291; - shf.l.wrap.b32 %r1298, %r1297, %r1297, 24; - add.s32 %r1299, %r1298, %r1292; - xor.b32 %r1300, %r1299, %r1294; - shf.l.wrap.b32 %r1301, %r1300, %r1300, 25; - add.s32 %r1302, %r1258, %r1301; - xor.b32 %r1303, %r1302, %r1273; - shf.l.wrap.b32 %r1304, %r1303, %r1303, 16; - add.s32 %r1305, %r1304, %r1286; - xor.b32 %r1306, %r1305, %r1301; - shf.l.wrap.b32 %r1307, %r1306, %r1306, 20; - add.s32 %r1308, %r1302, %r1307; - xor.b32 %r1309, %r1308, %r1304; - shf.l.wrap.b32 %r1310, %r1309, %r1309, 24; - add.s32 %r1311, %r1310, %r1305; - xor.b32 %r1312, %r1311, %r1307; - shf.l.wrap.b32 %r1313, %r1312, %r1312, 25; - add.s32 %r1314, %r1271, %r1263; - xor.b32 %r1315, %r1314, %r1285; - shf.l.wrap.b32 %r1316, %r1315, %r1315, 16; - add.s32 %r1317, %r1316, %r1299; - xor.b32 %r1318, %r1317, %r1263; - shf.l.wrap.b32 %r1319, %r1318, %r1318, 20; - add.s32 %r1320, %r1314, %r1319; - xor.b32 %r1321, %r1320, %r1316; - shf.l.wrap.b32 %r1322, %r1321, %r1321, 24; - add.s32 %r1323, %r1322, %r1317; - xor.b32 %r1324, %r1323, %r1319; - shf.l.wrap.b32 %r1325, %r1324, %r1324, 25; - add.s32 %r1326, %r1283, %r1276; - xor.b32 %r1327, %r1298, %r1326; - shf.l.wrap.b32 %r1328, %r1327, %r1327, 16; - add.s32 %r1329, %r1328, %r1261; - xor.b32 %r1330, %r1329, %r1276; - shf.l.wrap.b32 %r1331, %r1330, %r1330, 20; - add.s32 %r1332, %r1326, %r996; - add.s32 %r1333, %r1332, %r1331; - xor.b32 %r1334, %r1333, %r1328; - shf.l.wrap.b32 %r1335, %r1334, %r1334, 24; - add.s32 %r1336, %r1335, %r1329; - xor.b32 %r1337, %r1336, %r1331; - shf.l.wrap.b32 %r1338, %r1337, %r1337, 25; - add.s32 %r1339, %r1296, %r1288; - xor.b32 %r1340, %r1260, %r1339; - shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; - add.s32 %r1342, %r1341, %r1274; - xor.b32 %r1343, %r1342, %r1288; - shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; - add.s32 %r1345, %r1339, %r1344; - xor.b32 %r1346, %r1345, %r1341; - shf.l.wrap.b32 %r1347, %r1346, %r1346, 24; - add.s32 %r1348, %r1347, %r1342; - xor.b32 %r1349, %r1348, %r1344; - shf.l.wrap.b32 %r1350, %r1349, %r1349, 25; - add.s32 %r1351, %r1308, %r1325; - xor.b32 %r1352, %r1351, %r1347; - shf.l.wrap.b32 %r1353, %r1352, %r1352, 16; - add.s32 %r1354, %r1353, %r1336; - xor.b32 %r1355, %r1354, %r1325; - shf.l.wrap.b32 %r1356, %r1355, %r1355, 20; - add.s32 %r1357, %r1351, %r972; - add.s32 %r1358, %r1357, %r1356; - xor.b32 %r1359, %r1358, %r1353; - shf.l.wrap.b32 %r1360, %r1359, %r1359, 24; - add.s32 %r1361, %r1360, %r1354; - xor.b32 %r1362, %r1361, %r1356; - shf.l.wrap.b32 %r1363, %r1362, %r1362, 25; - add.s32 %r1364, %r1338, %r1320; - xor.b32 %r1365, %r1310, %r1364; - shf.l.wrap.b32 %r1366, %r1365, %r1365, 16; - 
add.s32 %r1367, %r1366, %r1348; - xor.b32 %r1368, %r1367, %r1338; - shf.l.wrap.b32 %r1369, %r1368, %r1368, 20; - add.s32 %r1370, %r1364, %r988; - add.s32 %r1371, %r1370, %r1369; - xor.b32 %r1372, %r1371, %r1366; - shf.l.wrap.b32 %r1373, %r1372, %r1372, 24; - add.s32 %r1374, %r1373, %r1367; - xor.b32 %r1375, %r1374, %r1369; - shf.l.wrap.b32 %r1376, %r1375, %r1375, 25; - add.s32 %r1377, %r1333, %r1350; - xor.b32 %r1378, %r1322, %r1377; - shf.l.wrap.b32 %r1379, %r1378, %r1378, 16; - add.s32 %r1380, %r1379, %r1311; - xor.b32 %r1381, %r1380, %r1350; - shf.l.wrap.b32 %r1382, %r1381, %r1381, 20; - add.s32 %r1383, %r1377, %r1382; - xor.b32 %r1384, %r1383, %r1379; - shf.l.wrap.b32 %r1385, %r1384, %r1384, 24; - add.s32 %r1386, %r1385, %r1380; - xor.b32 %r1387, %r1386, %r1382; - shf.l.wrap.b32 %r1388, %r1387, %r1387, 25; - add.s32 %r1389, %r1345, %r980; - add.s32 %r1390, %r1389, %r1313; - xor.b32 %r1391, %r1390, %r1335; - shf.l.wrap.b32 %r1392, %r1391, %r1391, 16; - add.s32 %r1393, %r1392, %r1323; - xor.b32 %r1394, %r1393, %r1313; - shf.l.wrap.b32 %r1395, %r1394, %r1394, 20; - add.s32 %r1396, %r1390, %r1395; - xor.b32 %r1397, %r1396, %r1392; - shf.l.wrap.b32 %r1398, %r1397, %r1397, 24; - add.s32 %r1399, %r1398, %r1393; - xor.b32 %r1400, %r1399, %r1395; - shf.l.wrap.b32 %r1401, %r1400, %r1400, 25; - add.s32 %r1402, %r1358, %r1401; - xor.b32 %r1403, %r1402, %r1373; - shf.l.wrap.b32 %r1404, %r1403, %r1403, 16; - add.s32 %r1405, %r1404, %r1386; - xor.b32 %r1406, %r1405, %r1401; - shf.l.wrap.b32 %r1407, %r1406, %r1406, 20; - add.s32 %r1408, %r1402, %r1407; - xor.b32 %r1409, %r1408, %r1404; - shf.l.wrap.b32 %r1410, %r1409, %r1409, 24; - add.s32 %r1411, %r1410, %r1405; - xor.b32 %r1412, %r1411, %r1407; - shf.l.wrap.b32 %r1413, %r1412, %r1412, 25; - add.s32 %r1414, %r1371, %r1363; - xor.b32 %r1415, %r1414, %r1385; - shf.l.wrap.b32 %r1416, %r1415, %r1415, 16; - add.s32 %r1417, %r1416, %r1399; - xor.b32 %r1418, %r1417, %r1363; - shf.l.wrap.b32 %r1419, %r1418, %r1418, 20; - add.s32 %r1420, %r1414, %r1419; - xor.b32 %r1421, %r1420, %r1416; - shf.l.wrap.b32 %r1422, %r1421, %r1421, 24; - add.s32 %r1423, %r1422, %r1417; - xor.b32 %r1424, %r1423, %r1419; - shf.l.wrap.b32 %r1425, %r1424, %r1424, 25; - add.s32 %r1426, %r1383, %r1376; - xor.b32 %r1427, %r1398, %r1426; - shf.l.wrap.b32 %r1428, %r1427, %r1427, 16; - add.s32 %r1429, %r1428, %r1361; - xor.b32 %r1430, %r1429, %r1376; - shf.l.wrap.b32 %r1431, %r1430, %r1430, 20; - add.s32 %r1432, %r1426, %r1431; - xor.b32 %r1433, %r1432, %r1428; - shf.l.wrap.b32 %r1434, %r1433, %r1433, 24; - add.s32 %r1435, %r1434, %r1429; - xor.b32 %r1436, %r1435, %r1431; - shf.l.wrap.b32 %r1437, %r1436, %r1436, 25; - add.s32 %r1438, %r1396, %r1388; - xor.b32 %r1439, %r1360, %r1438; - shf.l.wrap.b32 %r1440, %r1439, %r1439, 16; - add.s32 %r1441, %r1440, %r1374; - xor.b32 %r1442, %r1441, %r1388; - shf.l.wrap.b32 %r1443, %r1442, %r1442, 20; - add.s32 %r1444, %r1438, %r1443; - xor.b32 %r1445, %r1444, %r1440; - shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; - add.s32 %r1447, %r1446, %r1441; - xor.b32 %r1448, %r1447, %r1443; - shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; - add.s32 %r1450, %r1408, %r1425; - xor.b32 %r1451, %r1450, %r1446; - shf.l.wrap.b32 %r1452, %r1451, %r1451, 16; - add.s32 %r1453, %r1452, %r1435; - xor.b32 %r1454, %r1453, %r1425; - shf.l.wrap.b32 %r1455, %r1454, %r1454, 20; - add.s32 %r1456, %r1450, %r988; - add.s32 %r1457, %r1456, %r1455; - xor.b32 %r1458, %r1457, %r1452; - shf.l.wrap.b32 %r1459, %r1458, %r1458, 24; - add.s32 %r1460, %r1459, %r1453; - xor.b32 %r1461, %r1460, %r1455; 
- shf.l.wrap.b32 %r1462, %r1461, %r1461, 25; - add.s32 %r1463, %r1437, %r1420; - xor.b32 %r1464, %r1410, %r1463; - shf.l.wrap.b32 %r1465, %r1464, %r1464, 16; - add.s32 %r1466, %r1465, %r1447; - xor.b32 %r1467, %r1466, %r1437; - shf.l.wrap.b32 %r1468, %r1467, %r1467, 20; - add.s32 %r1469, %r1463, %r996; - add.s32 %r1470, %r1469, %r1468; - xor.b32 %r1471, %r1470, %r1465; - shf.l.wrap.b32 %r1472, %r1471, %r1471, 24; - add.s32 %r1473, %r1472, %r1466; - xor.b32 %r1474, %r1473, %r1468; - shf.l.wrap.b32 %r1475, %r1474, %r1474, 25; - add.s32 %r1476, %r1432, %r972; - add.s32 %r1477, %r1476, %r1449; - xor.b32 %r1478, %r1422, %r1477; - shf.l.wrap.b32 %r1479, %r1478, %r1478, 16; - add.s32 %r1480, %r1479, %r1411; - xor.b32 %r1481, %r1480, %r1449; - shf.l.wrap.b32 %r1482, %r1481, %r1481, 20; - add.s32 %r1483, %r1477, %r980; - add.s32 %r1484, %r1483, %r1482; - xor.b32 %r1485, %r1484, %r1479; - shf.l.wrap.b32 %r1486, %r1485, %r1485, 24; - add.s32 %r1487, %r1486, %r1480; - xor.b32 %r1488, %r1487, %r1482; - shf.l.wrap.b32 %r1489, %r1488, %r1488, 25; - add.s32 %r1490, %r1444, %r1413; - xor.b32 %r1491, %r1490, %r1434; - shf.l.wrap.b32 %r1492, %r1491, %r1491, 16; - add.s32 %r1493, %r1492, %r1423; - xor.b32 %r1494, %r1493, %r1413; - shf.l.wrap.b32 %r1495, %r1494, %r1494, 20; - add.s32 %r1496, %r1490, %r1495; - xor.b32 %r1497, %r1496, %r1492; - shf.l.wrap.b32 %r1498, %r1497, %r1497, 24; - add.s32 %r1499, %r1498, %r1493; - xor.b32 %r1500, %r1499, %r1495; - shf.l.wrap.b32 %r1501, %r1500, %r1500, 25; - add.s32 %r1502, %r1457, %r1501; - xor.b32 %r1503, %r1502, %r1472; - shf.l.wrap.b32 %r1504, %r1503, %r1503, 16; - add.s32 %r1505, %r1504, %r1487; - xor.b32 %r1506, %r1505, %r1501; - shf.l.wrap.b32 %r1507, %r1506, %r1506, 20; - add.s32 %r1508, %r1502, %r1507; - xor.b32 %r1509, %r1508, %r1504; - shf.l.wrap.b32 %r1510, %r1509, %r1509, 24; - add.s32 %r1511, %r1510, %r1505; - xor.b32 %r1512, %r1511, %r1507; - shf.l.wrap.b32 %r1513, %r1512, %r1512, 25; - add.s32 %r1514, %r1470, %r1462; - xor.b32 %r1515, %r1514, %r1486; - shf.l.wrap.b32 %r1516, %r1515, %r1515, 16; - add.s32 %r1517, %r1516, %r1499; - xor.b32 %r1518, %r1517, %r1462; - shf.l.wrap.b32 %r1519, %r1518, %r1518, 20; - add.s32 %r1520, %r1514, %r1519; - xor.b32 %r1521, %r1520, %r1516; - shf.l.wrap.b32 %r1522, %r1521, %r1521, 24; - add.s32 %r1523, %r1522, %r1517; - xor.b32 %r1524, %r1523, %r1519; - shf.l.wrap.b32 %r1525, %r1524, %r1524, 25; - add.s32 %r1526, %r1484, %r1475; - xor.b32 %r1527, %r1498, %r1526; - shf.l.wrap.b32 %r1528, %r1527, %r1527, 16; - add.s32 %r1529, %r1528, %r1460; - xor.b32 %r1530, %r1529, %r1475; - shf.l.wrap.b32 %r1531, %r1530, %r1530, 20; - add.s32 %r1532, %r1526, %r1531; - xor.b32 %r1533, %r1532, %r1528; - shf.l.wrap.b32 %r1534, %r1533, %r1533, 24; - add.s32 %r1535, %r1534, %r1529; - xor.b32 %r1536, %r1535, %r1531; - shf.l.wrap.b32 %r1537, %r1536, %r1536, 25; - add.s32 %r1538, %r1496, %r1489; - xor.b32 %r1539, %r1459, %r1538; - shf.l.wrap.b32 %r1540, %r1539, %r1539, 16; - add.s32 %r1541, %r1540, %r1473; - xor.b32 %r1542, %r1541, %r1489; - shf.l.wrap.b32 %r1543, %r1542, %r1542, 20; - add.s32 %r1544, %r1538, %r980; - add.s32 %r1545, %r1544, %r1543; - xor.b32 %r1546, %r1545, %r1540; - shf.l.wrap.b32 %r1547, %r1546, %r1546, 24; - add.s32 %r1548, %r1547, %r1541; - xor.b32 %r1549, %r1548, %r1543; - shf.l.wrap.b32 %r1550, %r1549, %r1549, 25; - add.s32 %r1551, %r1508, %r1525; - xor.b32 %r1552, %r1551, %r1547; - shf.l.wrap.b32 %r1553, %r1552, %r1552, 16; - add.s32 %r1554, %r1553, %r1535; - xor.b32 %r1555, %r1554, %r1525; - shf.l.wrap.b32 %r1556, %r1555, 
[hunk condensed — several hundred deleted PTX lines, one instruction per `-` line in the original diff: the tail of the unrolled hash rounds (repeated add.s32 / xor.b32 / shf.l.wrap.b32 groups rotating left by 16, 20, 24 and 25 over %r1551..%r1701), the xor of the two state halves into the 256-bit result words %rd143, %rd155, %rd167 and %rd9, and the bfi.b32 / prmt.b32 repacking of that result's 64 nibbles, one per byte, into the sixteen dp4a operand words %r5746..%r5806.]
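For orientation, a hedged CUDA sketch of what those deleted round groups compute: the add/xor/rotate pattern, with left-rotates of 16, 20, 24 and 25 (equivalently right-rotates of 16, 12, 8 and 7), matches a BLAKE3-style quarter round. Everything below — the names, the signature, the g helper — is an illustrative reconstruction from the instruction pattern, not code taken from this repository.

    #include <cstdint>

    // Rotate right; nvcc lowers this pattern to the funnel shifts
    // (shf.l.wrap.b32) seen in the deleted PTX.
    __device__ __forceinline__ uint32_t rotr32(uint32_t x, uint32_t n) {
        return (x >> n) | (x << (32u - n));
    }

    // One BLAKE3-style quarter round ("G"): mixes four state words with
    // two message words. The deleted PTX is many such groups, fully
    // unrolled with the message words held in registers (%r972, %r996, ...).
    __device__ void g(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d,
                      uint32_t mx, uint32_t my) {
        a += b + mx;
        d = rotr32(d ^ a, 16);   // shf.l.wrap ..., 16
        c += d;
        b = rotr32(b ^ c, 12);   // shf.l.wrap ..., 20
        a += b + my;
        d = rotr32(d ^ a, 8);    // shf.l.wrap ..., 24
        c += d;
        b = rotr32(b ^ c, 7);    // shf.l.wrap ..., 25
    }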
[hunk condensed — the bulk of the deleted PTX: the last of the nibble packing, then sixteen-deep chains of ld.const.u32 [matrix+N] loads feeding dp4a.u32.u32 accumulations (wrapped in // begin/end inline asm markers) against the operand words %r5746..%r5806, one chain per 64-byte matrix row. After each pair of rows the two accumulators are truncated to their top four bits (shr 6 / and 240 for the even row, shr 10 for the odd row), or-combined into one byte, and xor-folded into the corresponding byte of the hash state (%r9, %r11, %r13, %rd10, %rd11, ...). The excerpt ends mid-chain at [matrix+1616].]
- dp4a.u32.u32 %r3328, %r3329, %r5762, %r3324; - // end inline asm - ld.const.u32 %r3333, [matrix+1620]; - // begin inline asm - dp4a.u32.u32 %r3332, %r3333, %r5766, %r3328; - // end inline asm - ld.const.u32 %r3337, [matrix+1624]; - // begin inline asm - dp4a.u32.u32 %r3336, %r3337, %r5770, %r3332; - // end inline asm - ld.const.u32 %r3341, [matrix+1628]; - // begin inline asm - dp4a.u32.u32 %r3340, %r3341, %r5774, %r3336; - // end inline asm - ld.const.u32 %r3345, [matrix+1632]; - // begin inline asm - dp4a.u32.u32 %r3344, %r3345, %r5778, %r3340; - // end inline asm - ld.const.u32 %r3349, [matrix+1636]; - // begin inline asm - dp4a.u32.u32 %r3348, %r3349, %r5782, %r3344; - // end inline asm - ld.const.u32 %r3353, [matrix+1640]; - // begin inline asm - dp4a.u32.u32 %r3352, %r3353, %r5786, %r3348; - // end inline asm - ld.const.u32 %r3357, [matrix+1644]; - // begin inline asm - dp4a.u32.u32 %r3356, %r3357, %r5790, %r3352; - // end inline asm - ld.const.u32 %r3361, [matrix+1648]; - // begin inline asm - dp4a.u32.u32 %r3360, %r3361, %r5794, %r3356; - // end inline asm - ld.const.u32 %r3365, [matrix+1652]; - // begin inline asm - dp4a.u32.u32 %r3364, %r3365, %r5798, %r3360; - // end inline asm - ld.const.u32 %r3369, [matrix+1656]; - // begin inline asm - dp4a.u32.u32 %r3368, %r3369, %r5802, %r3364; - // end inline asm - ld.const.u32 %r3373, [matrix+1660]; - // begin inline asm - dp4a.u32.u32 %r3372, %r3373, %r5806, %r3368; - // end inline asm - shr.u32 %r6020, %r3308, 6; - and.b32 %r6021, %r6020, 240; - shr.u32 %r6022, %r3372, 10; - or.b32 %r6023, %r6022, %r6021; - cvt.u64.u32 %rd217, %r6023; - xor.b64 %rd218, %rd13, %rd217; - ld.const.u32 %r3377, [matrix+1664]; - // begin inline asm - dp4a.u32.u32 %r3376, %r3377, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3381, [matrix+1668]; - // begin inline asm - dp4a.u32.u32 %r3380, %r3381, %r5750, %r3376; - // end inline asm - ld.const.u32 %r3385, [matrix+1672]; - // begin inline asm - dp4a.u32.u32 %r3384, %r3385, %r5754, %r3380; - // end inline asm - ld.const.u32 %r3389, [matrix+1676]; - // begin inline asm - dp4a.u32.u32 %r3388, %r3389, %r5758, %r3384; - // end inline asm - ld.const.u32 %r3393, [matrix+1680]; - // begin inline asm - dp4a.u32.u32 %r3392, %r3393, %r5762, %r3388; - // end inline asm - ld.const.u32 %r3397, [matrix+1684]; - // begin inline asm - dp4a.u32.u32 %r3396, %r3397, %r5766, %r3392; - // end inline asm - ld.const.u32 %r3401, [matrix+1688]; - // begin inline asm - dp4a.u32.u32 %r3400, %r3401, %r5770, %r3396; - // end inline asm - ld.const.u32 %r3405, [matrix+1692]; - // begin inline asm - dp4a.u32.u32 %r3404, %r3405, %r5774, %r3400; - // end inline asm - ld.const.u32 %r3409, [matrix+1696]; - // begin inline asm - dp4a.u32.u32 %r3408, %r3409, %r5778, %r3404; - // end inline asm - ld.const.u32 %r3413, [matrix+1700]; - // begin inline asm - dp4a.u32.u32 %r3412, %r3413, %r5782, %r3408; - // end inline asm - ld.const.u32 %r3417, [matrix+1704]; - // begin inline asm - dp4a.u32.u32 %r3416, %r3417, %r5786, %r3412; - // end inline asm - ld.const.u32 %r3421, [matrix+1708]; - // begin inline asm - dp4a.u32.u32 %r3420, %r3421, %r5790, %r3416; - // end inline asm - ld.const.u32 %r3425, [matrix+1712]; - // begin inline asm - dp4a.u32.u32 %r3424, %r3425, %r5794, %r3420; - // end inline asm - ld.const.u32 %r3429, [matrix+1716]; - // begin inline asm - dp4a.u32.u32 %r3428, %r3429, %r5798, %r3424; - // end inline asm - ld.const.u32 %r3433, [matrix+1720]; - // begin inline asm - dp4a.u32.u32 %r3432, %r3433, %r5802, %r3428; - // end inline asm 
- ld.const.u32 %r3437, [matrix+1724]; - // begin inline asm - dp4a.u32.u32 %r3436, %r3437, %r5806, %r3432; - // end inline asm - ld.const.u32 %r3441, [matrix+1728]; - // begin inline asm - dp4a.u32.u32 %r3440, %r3441, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3445, [matrix+1732]; - // begin inline asm - dp4a.u32.u32 %r3444, %r3445, %r5750, %r3440; - // end inline asm - ld.const.u32 %r3449, [matrix+1736]; - // begin inline asm - dp4a.u32.u32 %r3448, %r3449, %r5754, %r3444; - // end inline asm - ld.const.u32 %r3453, [matrix+1740]; - // begin inline asm - dp4a.u32.u32 %r3452, %r3453, %r5758, %r3448; - // end inline asm - ld.const.u32 %r3457, [matrix+1744]; - // begin inline asm - dp4a.u32.u32 %r3456, %r3457, %r5762, %r3452; - // end inline asm - ld.const.u32 %r3461, [matrix+1748]; - // begin inline asm - dp4a.u32.u32 %r3460, %r3461, %r5766, %r3456; - // end inline asm - ld.const.u32 %r3465, [matrix+1752]; - // begin inline asm - dp4a.u32.u32 %r3464, %r3465, %r5770, %r3460; - // end inline asm - ld.const.u32 %r3469, [matrix+1756]; - // begin inline asm - dp4a.u32.u32 %r3468, %r3469, %r5774, %r3464; - // end inline asm - ld.const.u32 %r3473, [matrix+1760]; - // begin inline asm - dp4a.u32.u32 %r3472, %r3473, %r5778, %r3468; - // end inline asm - ld.const.u32 %r3477, [matrix+1764]; - // begin inline asm - dp4a.u32.u32 %r3476, %r3477, %r5782, %r3472; - // end inline asm - ld.const.u32 %r3481, [matrix+1768]; - // begin inline asm - dp4a.u32.u32 %r3480, %r3481, %r5786, %r3476; - // end inline asm - ld.const.u32 %r3485, [matrix+1772]; - // begin inline asm - dp4a.u32.u32 %r3484, %r3485, %r5790, %r3480; - // end inline asm - ld.const.u32 %r3489, [matrix+1776]; - // begin inline asm - dp4a.u32.u32 %r3488, %r3489, %r5794, %r3484; - // end inline asm - ld.const.u32 %r3493, [matrix+1780]; - // begin inline asm - dp4a.u32.u32 %r3492, %r3493, %r5798, %r3488; - // end inline asm - ld.const.u32 %r3497, [matrix+1784]; - // begin inline asm - dp4a.u32.u32 %r3496, %r3497, %r5802, %r3492; - // end inline asm - ld.const.u32 %r3501, [matrix+1788]; - // begin inline asm - dp4a.u32.u32 %r3500, %r3501, %r5806, %r3496; - // end inline asm - shr.u32 %r6024, %r3436, 6; - and.b32 %r6025, %r6024, 240; - shr.u32 %r6026, %r3500, 10; - or.b32 %r6027, %r6026, %r6025; - cvt.u64.u32 %rd219, %r6027; - xor.b64 %rd220, %rd14, %rd219; - ld.const.u32 %r3505, [matrix+1792]; - // begin inline asm - dp4a.u32.u32 %r3504, %r3505, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3509, [matrix+1796]; - // begin inline asm - dp4a.u32.u32 %r3508, %r3509, %r5750, %r3504; - // end inline asm - ld.const.u32 %r3513, [matrix+1800]; - // begin inline asm - dp4a.u32.u32 %r3512, %r3513, %r5754, %r3508; - // end inline asm - ld.const.u32 %r3517, [matrix+1804]; - // begin inline asm - dp4a.u32.u32 %r3516, %r3517, %r5758, %r3512; - // end inline asm - ld.const.u32 %r3521, [matrix+1808]; - // begin inline asm - dp4a.u32.u32 %r3520, %r3521, %r5762, %r3516; - // end inline asm - ld.const.u32 %r3525, [matrix+1812]; - // begin inline asm - dp4a.u32.u32 %r3524, %r3525, %r5766, %r3520; - // end inline asm - ld.const.u32 %r3529, [matrix+1816]; - // begin inline asm - dp4a.u32.u32 %r3528, %r3529, %r5770, %r3524; - // end inline asm - ld.const.u32 %r3533, [matrix+1820]; - // begin inline asm - dp4a.u32.u32 %r3532, %r3533, %r5774, %r3528; - // end inline asm - ld.const.u32 %r3537, [matrix+1824]; - // begin inline asm - dp4a.u32.u32 %r3536, %r3537, %r5778, %r3532; - // end inline asm - ld.const.u32 %r3541, [matrix+1828]; - // begin inline asm - 
dp4a.u32.u32 %r3540, %r3541, %r5782, %r3536; - // end inline asm - ld.const.u32 %r3545, [matrix+1832]; - // begin inline asm - dp4a.u32.u32 %r3544, %r3545, %r5786, %r3540; - // end inline asm - ld.const.u32 %r3549, [matrix+1836]; - // begin inline asm - dp4a.u32.u32 %r3548, %r3549, %r5790, %r3544; - // end inline asm - ld.const.u32 %r3553, [matrix+1840]; - // begin inline asm - dp4a.u32.u32 %r3552, %r3553, %r5794, %r3548; - // end inline asm - ld.const.u32 %r3557, [matrix+1844]; - // begin inline asm - dp4a.u32.u32 %r3556, %r3557, %r5798, %r3552; - // end inline asm - ld.const.u32 %r3561, [matrix+1848]; - // begin inline asm - dp4a.u32.u32 %r3560, %r3561, %r5802, %r3556; - // end inline asm - ld.const.u32 %r3565, [matrix+1852]; - // begin inline asm - dp4a.u32.u32 %r3564, %r3565, %r5806, %r3560; - // end inline asm - ld.const.u32 %r3569, [matrix+1856]; - // begin inline asm - dp4a.u32.u32 %r3568, %r3569, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3573, [matrix+1860]; - // begin inline asm - dp4a.u32.u32 %r3572, %r3573, %r5750, %r3568; - // end inline asm - ld.const.u32 %r3577, [matrix+1864]; - // begin inline asm - dp4a.u32.u32 %r3576, %r3577, %r5754, %r3572; - // end inline asm - ld.const.u32 %r3581, [matrix+1868]; - // begin inline asm - dp4a.u32.u32 %r3580, %r3581, %r5758, %r3576; - // end inline asm - ld.const.u32 %r3585, [matrix+1872]; - // begin inline asm - dp4a.u32.u32 %r3584, %r3585, %r5762, %r3580; - // end inline asm - ld.const.u32 %r3589, [matrix+1876]; - // begin inline asm - dp4a.u32.u32 %r3588, %r3589, %r5766, %r3584; - // end inline asm - ld.const.u32 %r3593, [matrix+1880]; - // begin inline asm - dp4a.u32.u32 %r3592, %r3593, %r5770, %r3588; - // end inline asm - ld.const.u32 %r3597, [matrix+1884]; - // begin inline asm - dp4a.u32.u32 %r3596, %r3597, %r5774, %r3592; - // end inline asm - ld.const.u32 %r3601, [matrix+1888]; - // begin inline asm - dp4a.u32.u32 %r3600, %r3601, %r5778, %r3596; - // end inline asm - ld.const.u32 %r3605, [matrix+1892]; - // begin inline asm - dp4a.u32.u32 %r3604, %r3605, %r5782, %r3600; - // end inline asm - ld.const.u32 %r3609, [matrix+1896]; - // begin inline asm - dp4a.u32.u32 %r3608, %r3609, %r5786, %r3604; - // end inline asm - ld.const.u32 %r3613, [matrix+1900]; - // begin inline asm - dp4a.u32.u32 %r3612, %r3613, %r5790, %r3608; - // end inline asm - ld.const.u32 %r3617, [matrix+1904]; - // begin inline asm - dp4a.u32.u32 %r3616, %r3617, %r5794, %r3612; - // end inline asm - ld.const.u32 %r3621, [matrix+1908]; - // begin inline asm - dp4a.u32.u32 %r3620, %r3621, %r5798, %r3616; - // end inline asm - ld.const.u32 %r3625, [matrix+1912]; - // begin inline asm - dp4a.u32.u32 %r3624, %r3625, %r5802, %r3620; - // end inline asm - ld.const.u32 %r3629, [matrix+1916]; - // begin inline asm - dp4a.u32.u32 %r3628, %r3629, %r5806, %r3624; - // end inline asm - shr.u32 %r6028, %r3564, 6; - and.b32 %r6029, %r6028, 240; - shr.u32 %r6030, %r3628, 10; - or.b32 %r6031, %r6030, %r6029; - cvt.u64.u32 %rd221, %r6031; - xor.b64 %rd222, %rd15, %rd221; - ld.const.u32 %r3633, [matrix+1920]; - // begin inline asm - dp4a.u32.u32 %r3632, %r3633, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3637, [matrix+1924]; - // begin inline asm - dp4a.u32.u32 %r3636, %r3637, %r5750, %r3632; - // end inline asm - ld.const.u32 %r3641, [matrix+1928]; - // begin inline asm - dp4a.u32.u32 %r3640, %r3641, %r5754, %r3636; - // end inline asm - ld.const.u32 %r3645, [matrix+1932]; - // begin inline asm - dp4a.u32.u32 %r3644, %r3645, %r5758, %r3640; - // end inline asm - 
ld.const.u32 %r3649, [matrix+1936]; - // begin inline asm - dp4a.u32.u32 %r3648, %r3649, %r5762, %r3644; - // end inline asm - ld.const.u32 %r3653, [matrix+1940]; - // begin inline asm - dp4a.u32.u32 %r3652, %r3653, %r5766, %r3648; - // end inline asm - ld.const.u32 %r3657, [matrix+1944]; - // begin inline asm - dp4a.u32.u32 %r3656, %r3657, %r5770, %r3652; - // end inline asm - ld.const.u32 %r3661, [matrix+1948]; - // begin inline asm - dp4a.u32.u32 %r3660, %r3661, %r5774, %r3656; - // end inline asm - ld.const.u32 %r3665, [matrix+1952]; - // begin inline asm - dp4a.u32.u32 %r3664, %r3665, %r5778, %r3660; - // end inline asm - ld.const.u32 %r3669, [matrix+1956]; - // begin inline asm - dp4a.u32.u32 %r3668, %r3669, %r5782, %r3664; - // end inline asm - ld.const.u32 %r3673, [matrix+1960]; - // begin inline asm - dp4a.u32.u32 %r3672, %r3673, %r5786, %r3668; - // end inline asm - ld.const.u32 %r3677, [matrix+1964]; - // begin inline asm - dp4a.u32.u32 %r3676, %r3677, %r5790, %r3672; - // end inline asm - ld.const.u32 %r3681, [matrix+1968]; - // begin inline asm - dp4a.u32.u32 %r3680, %r3681, %r5794, %r3676; - // end inline asm - ld.const.u32 %r3685, [matrix+1972]; - // begin inline asm - dp4a.u32.u32 %r3684, %r3685, %r5798, %r3680; - // end inline asm - ld.const.u32 %r3689, [matrix+1976]; - // begin inline asm - dp4a.u32.u32 %r3688, %r3689, %r5802, %r3684; - // end inline asm - ld.const.u32 %r3693, [matrix+1980]; - // begin inline asm - dp4a.u32.u32 %r3692, %r3693, %r5806, %r3688; - // end inline asm - ld.const.u32 %r3697, [matrix+1984]; - // begin inline asm - dp4a.u32.u32 %r3696, %r3697, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3701, [matrix+1988]; - // begin inline asm - dp4a.u32.u32 %r3700, %r3701, %r5750, %r3696; - // end inline asm - ld.const.u32 %r3705, [matrix+1992]; - // begin inline asm - dp4a.u32.u32 %r3704, %r3705, %r5754, %r3700; - // end inline asm - ld.const.u32 %r3709, [matrix+1996]; - // begin inline asm - dp4a.u32.u32 %r3708, %r3709, %r5758, %r3704; - // end inline asm - ld.const.u32 %r3713, [matrix+2000]; - // begin inline asm - dp4a.u32.u32 %r3712, %r3713, %r5762, %r3708; - // end inline asm - ld.const.u32 %r3717, [matrix+2004]; - // begin inline asm - dp4a.u32.u32 %r3716, %r3717, %r5766, %r3712; - // end inline asm - ld.const.u32 %r3721, [matrix+2008]; - // begin inline asm - dp4a.u32.u32 %r3720, %r3721, %r5770, %r3716; - // end inline asm - ld.const.u32 %r3725, [matrix+2012]; - // begin inline asm - dp4a.u32.u32 %r3724, %r3725, %r5774, %r3720; - // end inline asm - ld.const.u32 %r3729, [matrix+2016]; - // begin inline asm - dp4a.u32.u32 %r3728, %r3729, %r5778, %r3724; - // end inline asm - ld.const.u32 %r3733, [matrix+2020]; - // begin inline asm - dp4a.u32.u32 %r3732, %r3733, %r5782, %r3728; - // end inline asm - ld.const.u32 %r3737, [matrix+2024]; - // begin inline asm - dp4a.u32.u32 %r3736, %r3737, %r5786, %r3732; - // end inline asm - ld.const.u32 %r3741, [matrix+2028]; - // begin inline asm - dp4a.u32.u32 %r3740, %r3741, %r5790, %r3736; - // end inline asm - ld.const.u32 %r3745, [matrix+2032]; - // begin inline asm - dp4a.u32.u32 %r3744, %r3745, %r5794, %r3740; - // end inline asm - ld.const.u32 %r3749, [matrix+2036]; - // begin inline asm - dp4a.u32.u32 %r3748, %r3749, %r5798, %r3744; - // end inline asm - ld.const.u32 %r3753, [matrix+2040]; - // begin inline asm - dp4a.u32.u32 %r3752, %r3753, %r5802, %r3748; - // end inline asm - ld.const.u32 %r3757, [matrix+2044]; - // begin inline asm - dp4a.u32.u32 %r3756, %r3757, %r5806, %r3752; - // end inline asm 
- shr.u32 %r6032, %r3692, 6; - and.b32 %r6033, %r6032, 240; - ld.const.u32 %r3761, [matrix+2048]; - // begin inline asm - dp4a.u32.u32 %r3760, %r3761, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3765, [matrix+2052]; - // begin inline asm - dp4a.u32.u32 %r3764, %r3765, %r5750, %r3760; - // end inline asm - ld.const.u32 %r3769, [matrix+2056]; - // begin inline asm - dp4a.u32.u32 %r3768, %r3769, %r5754, %r3764; - // end inline asm - ld.const.u32 %r3773, [matrix+2060]; - // begin inline asm - dp4a.u32.u32 %r3772, %r3773, %r5758, %r3768; - // end inline asm - ld.const.u32 %r3777, [matrix+2064]; - // begin inline asm - dp4a.u32.u32 %r3776, %r3777, %r5762, %r3772; - // end inline asm - ld.const.u32 %r3781, [matrix+2068]; - // begin inline asm - dp4a.u32.u32 %r3780, %r3781, %r5766, %r3776; - // end inline asm - ld.const.u32 %r3785, [matrix+2072]; - // begin inline asm - dp4a.u32.u32 %r3784, %r3785, %r5770, %r3780; - // end inline asm - ld.const.u32 %r3789, [matrix+2076]; - // begin inline asm - dp4a.u32.u32 %r3788, %r3789, %r5774, %r3784; - // end inline asm - ld.const.u32 %r3793, [matrix+2080]; - // begin inline asm - dp4a.u32.u32 %r3792, %r3793, %r5778, %r3788; - // end inline asm - ld.const.u32 %r3797, [matrix+2084]; - // begin inline asm - dp4a.u32.u32 %r3796, %r3797, %r5782, %r3792; - // end inline asm - ld.const.u32 %r3801, [matrix+2088]; - // begin inline asm - dp4a.u32.u32 %r3800, %r3801, %r5786, %r3796; - // end inline asm - ld.const.u32 %r3805, [matrix+2092]; - // begin inline asm - dp4a.u32.u32 %r3804, %r3805, %r5790, %r3800; - // end inline asm - ld.const.u32 %r3809, [matrix+2096]; - // begin inline asm - dp4a.u32.u32 %r3808, %r3809, %r5794, %r3804; - // end inline asm - ld.const.u32 %r3813, [matrix+2100]; - // begin inline asm - dp4a.u32.u32 %r3812, %r3813, %r5798, %r3808; - // end inline asm - ld.const.u32 %r3817, [matrix+2104]; - // begin inline asm - dp4a.u32.u32 %r3816, %r3817, %r5802, %r3812; - // end inline asm - ld.const.u32 %r3821, [matrix+2108]; - // begin inline asm - dp4a.u32.u32 %r3820, %r3821, %r5806, %r3816; - // end inline asm - ld.const.u32 %r3825, [matrix+2112]; - // begin inline asm - dp4a.u32.u32 %r3824, %r3825, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3829, [matrix+2116]; - // begin inline asm - dp4a.u32.u32 %r3828, %r3829, %r5750, %r3824; - // end inline asm - ld.const.u32 %r3833, [matrix+2120]; - // begin inline asm - dp4a.u32.u32 %r3832, %r3833, %r5754, %r3828; - // end inline asm - ld.const.u32 %r3837, [matrix+2124]; - // begin inline asm - dp4a.u32.u32 %r3836, %r3837, %r5758, %r3832; - // end inline asm - ld.const.u32 %r3841, [matrix+2128]; - // begin inline asm - dp4a.u32.u32 %r3840, %r3841, %r5762, %r3836; - // end inline asm - ld.const.u32 %r3845, [matrix+2132]; - // begin inline asm - dp4a.u32.u32 %r3844, %r3845, %r5766, %r3840; - // end inline asm - ld.const.u32 %r3849, [matrix+2136]; - // begin inline asm - dp4a.u32.u32 %r3848, %r3849, %r5770, %r3844; - // end inline asm - ld.const.u32 %r3853, [matrix+2140]; - // begin inline asm - dp4a.u32.u32 %r3852, %r3853, %r5774, %r3848; - // end inline asm - ld.const.u32 %r3857, [matrix+2144]; - // begin inline asm - dp4a.u32.u32 %r3856, %r3857, %r5778, %r3852; - // end inline asm - ld.const.u32 %r3861, [matrix+2148]; - // begin inline asm - dp4a.u32.u32 %r3860, %r3861, %r5782, %r3856; - // end inline asm - ld.const.u32 %r3865, [matrix+2152]; - // begin inline asm - dp4a.u32.u32 %r3864, %r3865, %r5786, %r3860; - // end inline asm - ld.const.u32 %r3869, [matrix+2156]; - // begin inline asm - 
dp4a.u32.u32 %r3868, %r3869, %r5790, %r3864; - // end inline asm - ld.const.u32 %r3873, [matrix+2160]; - // begin inline asm - dp4a.u32.u32 %r3872, %r3873, %r5794, %r3868; - // end inline asm - ld.const.u32 %r3877, [matrix+2164]; - // begin inline asm - dp4a.u32.u32 %r3876, %r3877, %r5798, %r3872; - // end inline asm - ld.const.u32 %r3881, [matrix+2168]; - // begin inline asm - dp4a.u32.u32 %r3880, %r3881, %r5802, %r3876; - // end inline asm - ld.const.u32 %r3885, [matrix+2172]; - // begin inline asm - dp4a.u32.u32 %r3884, %r3885, %r5806, %r3880; - // end inline asm - shr.u32 %r6034, %r3820, 6; - and.b32 %r6035, %r6034, 240; - shr.u32 %r6036, %r3884, 10; - or.b32 %r6037, %r6036, %r6035; - xor.b32 %r6038, %r13, %r6037; - ld.const.u32 %r3889, [matrix+2176]; - // begin inline asm - dp4a.u32.u32 %r3888, %r3889, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3893, [matrix+2180]; - // begin inline asm - dp4a.u32.u32 %r3892, %r3893, %r5750, %r3888; - // end inline asm - ld.const.u32 %r3897, [matrix+2184]; - // begin inline asm - dp4a.u32.u32 %r3896, %r3897, %r5754, %r3892; - // end inline asm - ld.const.u32 %r3901, [matrix+2188]; - // begin inline asm - dp4a.u32.u32 %r3900, %r3901, %r5758, %r3896; - // end inline asm - ld.const.u32 %r3905, [matrix+2192]; - // begin inline asm - dp4a.u32.u32 %r3904, %r3905, %r5762, %r3900; - // end inline asm - ld.const.u32 %r3909, [matrix+2196]; - // begin inline asm - dp4a.u32.u32 %r3908, %r3909, %r5766, %r3904; - // end inline asm - ld.const.u32 %r3913, [matrix+2200]; - // begin inline asm - dp4a.u32.u32 %r3912, %r3913, %r5770, %r3908; - // end inline asm - ld.const.u32 %r3917, [matrix+2204]; - // begin inline asm - dp4a.u32.u32 %r3916, %r3917, %r5774, %r3912; - // end inline asm - ld.const.u32 %r3921, [matrix+2208]; - // begin inline asm - dp4a.u32.u32 %r3920, %r3921, %r5778, %r3916; - // end inline asm - ld.const.u32 %r3925, [matrix+2212]; - // begin inline asm - dp4a.u32.u32 %r3924, %r3925, %r5782, %r3920; - // end inline asm - ld.const.u32 %r3929, [matrix+2216]; - // begin inline asm - dp4a.u32.u32 %r3928, %r3929, %r5786, %r3924; - // end inline asm - ld.const.u32 %r3933, [matrix+2220]; - // begin inline asm - dp4a.u32.u32 %r3932, %r3933, %r5790, %r3928; - // end inline asm - ld.const.u32 %r3937, [matrix+2224]; - // begin inline asm - dp4a.u32.u32 %r3936, %r3937, %r5794, %r3932; - // end inline asm - ld.const.u32 %r3941, [matrix+2228]; - // begin inline asm - dp4a.u32.u32 %r3940, %r3941, %r5798, %r3936; - // end inline asm - ld.const.u32 %r3945, [matrix+2232]; - // begin inline asm - dp4a.u32.u32 %r3944, %r3945, %r5802, %r3940; - // end inline asm - ld.const.u32 %r3949, [matrix+2236]; - // begin inline asm - dp4a.u32.u32 %r3948, %r3949, %r5806, %r3944; - // end inline asm - ld.const.u32 %r3953, [matrix+2240]; - // begin inline asm - dp4a.u32.u32 %r3952, %r3953, %r5746, %r6244; - // end inline asm - ld.const.u32 %r3957, [matrix+2244]; - // begin inline asm - dp4a.u32.u32 %r3956, %r3957, %r5750, %r3952; - // end inline asm - ld.const.u32 %r3961, [matrix+2248]; - // begin inline asm - dp4a.u32.u32 %r3960, %r3961, %r5754, %r3956; - // end inline asm - ld.const.u32 %r3965, [matrix+2252]; - // begin inline asm - dp4a.u32.u32 %r3964, %r3965, %r5758, %r3960; - // end inline asm - ld.const.u32 %r3969, [matrix+2256]; - // begin inline asm - dp4a.u32.u32 %r3968, %r3969, %r5762, %r3964; - // end inline asm - ld.const.u32 %r3973, [matrix+2260]; - // begin inline asm - dp4a.u32.u32 %r3972, %r3973, %r5766, %r3968; - // end inline asm - ld.const.u32 %r3977, 
[matrix+2264]; - // begin inline asm - dp4a.u32.u32 %r3976, %r3977, %r5770, %r3972; - // end inline asm - ld.const.u32 %r3981, [matrix+2268]; - // begin inline asm - dp4a.u32.u32 %r3980, %r3981, %r5774, %r3976; - // end inline asm - ld.const.u32 %r3985, [matrix+2272]; - // begin inline asm - dp4a.u32.u32 %r3984, %r3985, %r5778, %r3980; - // end inline asm - ld.const.u32 %r3989, [matrix+2276]; - // begin inline asm - dp4a.u32.u32 %r3988, %r3989, %r5782, %r3984; - // end inline asm - ld.const.u32 %r3993, [matrix+2280]; - // begin inline asm - dp4a.u32.u32 %r3992, %r3993, %r5786, %r3988; - // end inline asm - ld.const.u32 %r3997, [matrix+2284]; - // begin inline asm - dp4a.u32.u32 %r3996, %r3997, %r5790, %r3992; - // end inline asm - ld.const.u32 %r4001, [matrix+2288]; - // begin inline asm - dp4a.u32.u32 %r4000, %r4001, %r5794, %r3996; - // end inline asm - ld.const.u32 %r4005, [matrix+2292]; - // begin inline asm - dp4a.u32.u32 %r4004, %r4005, %r5798, %r4000; - // end inline asm - ld.const.u32 %r4009, [matrix+2296]; - // begin inline asm - dp4a.u32.u32 %r4008, %r4009, %r5802, %r4004; - // end inline asm - ld.const.u32 %r4013, [matrix+2300]; - // begin inline asm - dp4a.u32.u32 %r4012, %r4013, %r5806, %r4008; - // end inline asm - shr.u32 %r6039, %r3948, 6; - and.b32 %r6040, %r6039, 240; - shr.u32 %r6041, %r4012, 10; - or.b32 %r6042, %r6041, %r6040; - xor.b32 %r6043, %r5886, %r6042; - ld.const.u32 %r4017, [matrix+2304]; - // begin inline asm - dp4a.u32.u32 %r4016, %r4017, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4021, [matrix+2308]; - // begin inline asm - dp4a.u32.u32 %r4020, %r4021, %r5750, %r4016; - // end inline asm - ld.const.u32 %r4025, [matrix+2312]; - // begin inline asm - dp4a.u32.u32 %r4024, %r4025, %r5754, %r4020; - // end inline asm - ld.const.u32 %r4029, [matrix+2316]; - // begin inline asm - dp4a.u32.u32 %r4028, %r4029, %r5758, %r4024; - // end inline asm - ld.const.u32 %r4033, [matrix+2320]; - // begin inline asm - dp4a.u32.u32 %r4032, %r4033, %r5762, %r4028; - // end inline asm - ld.const.u32 %r4037, [matrix+2324]; - // begin inline asm - dp4a.u32.u32 %r4036, %r4037, %r5766, %r4032; - // end inline asm - ld.const.u32 %r4041, [matrix+2328]; - // begin inline asm - dp4a.u32.u32 %r4040, %r4041, %r5770, %r4036; - // end inline asm - ld.const.u32 %r4045, [matrix+2332]; - // begin inline asm - dp4a.u32.u32 %r4044, %r4045, %r5774, %r4040; - // end inline asm - ld.const.u32 %r4049, [matrix+2336]; - // begin inline asm - dp4a.u32.u32 %r4048, %r4049, %r5778, %r4044; - // end inline asm - ld.const.u32 %r4053, [matrix+2340]; - // begin inline asm - dp4a.u32.u32 %r4052, %r4053, %r5782, %r4048; - // end inline asm - ld.const.u32 %r4057, [matrix+2344]; - // begin inline asm - dp4a.u32.u32 %r4056, %r4057, %r5786, %r4052; - // end inline asm - ld.const.u32 %r4061, [matrix+2348]; - // begin inline asm - dp4a.u32.u32 %r4060, %r4061, %r5790, %r4056; - // end inline asm - ld.const.u32 %r4065, [matrix+2352]; - // begin inline asm - dp4a.u32.u32 %r4064, %r4065, %r5794, %r4060; - // end inline asm - ld.const.u32 %r4069, [matrix+2356]; - // begin inline asm - dp4a.u32.u32 %r4068, %r4069, %r5798, %r4064; - // end inline asm - ld.const.u32 %r4073, [matrix+2360]; - // begin inline asm - dp4a.u32.u32 %r4072, %r4073, %r5802, %r4068; - // end inline asm - ld.const.u32 %r4077, [matrix+2364]; - // begin inline asm - dp4a.u32.u32 %r4076, %r4077, %r5806, %r4072; - // end inline asm - ld.const.u32 %r4081, [matrix+2368]; - // begin inline asm - dp4a.u32.u32 %r4080, %r4081, %r5746, %r6244; - // end 
inline asm - ld.const.u32 %r4085, [matrix+2372]; - // begin inline asm - dp4a.u32.u32 %r4084, %r4085, %r5750, %r4080; - // end inline asm - ld.const.u32 %r4089, [matrix+2376]; - // begin inline asm - dp4a.u32.u32 %r4088, %r4089, %r5754, %r4084; - // end inline asm - ld.const.u32 %r4093, [matrix+2380]; - // begin inline asm - dp4a.u32.u32 %r4092, %r4093, %r5758, %r4088; - // end inline asm - ld.const.u32 %r4097, [matrix+2384]; - // begin inline asm - dp4a.u32.u32 %r4096, %r4097, %r5762, %r4092; - // end inline asm - ld.const.u32 %r4101, [matrix+2388]; - // begin inline asm - dp4a.u32.u32 %r4100, %r4101, %r5766, %r4096; - // end inline asm - ld.const.u32 %r4105, [matrix+2392]; - // begin inline asm - dp4a.u32.u32 %r4104, %r4105, %r5770, %r4100; - // end inline asm - ld.const.u32 %r4109, [matrix+2396]; - // begin inline asm - dp4a.u32.u32 %r4108, %r4109, %r5774, %r4104; - // end inline asm - ld.const.u32 %r4113, [matrix+2400]; - // begin inline asm - dp4a.u32.u32 %r4112, %r4113, %r5778, %r4108; - // end inline asm - ld.const.u32 %r4117, [matrix+2404]; - // begin inline asm - dp4a.u32.u32 %r4116, %r4117, %r5782, %r4112; - // end inline asm - ld.const.u32 %r4121, [matrix+2408]; - // begin inline asm - dp4a.u32.u32 %r4120, %r4121, %r5786, %r4116; - // end inline asm - ld.const.u32 %r4125, [matrix+2412]; - // begin inline asm - dp4a.u32.u32 %r4124, %r4125, %r5790, %r4120; - // end inline asm - ld.const.u32 %r4129, [matrix+2416]; - // begin inline asm - dp4a.u32.u32 %r4128, %r4129, %r5794, %r4124; - // end inline asm - ld.const.u32 %r4133, [matrix+2420]; - // begin inline asm - dp4a.u32.u32 %r4132, %r4133, %r5798, %r4128; - // end inline asm - ld.const.u32 %r4137, [matrix+2424]; - // begin inline asm - dp4a.u32.u32 %r4136, %r4137, %r5802, %r4132; - // end inline asm - ld.const.u32 %r4141, [matrix+2428]; - // begin inline asm - dp4a.u32.u32 %r4140, %r4141, %r5806, %r4136; - // end inline asm - shr.u32 %r6044, %r4076, 6; - and.b32 %r6045, %r6044, 240; - shr.u32 %r6046, %r4140, 10; - or.b32 %r6047, %r6046, %r6045; - xor.b32 %r6048, %r5898, %r6047; - ld.const.u32 %r4145, [matrix+2432]; - // begin inline asm - dp4a.u32.u32 %r4144, %r4145, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4149, [matrix+2436]; - // begin inline asm - dp4a.u32.u32 %r4148, %r4149, %r5750, %r4144; - // end inline asm - ld.const.u32 %r4153, [matrix+2440]; - // begin inline asm - dp4a.u32.u32 %r4152, %r4153, %r5754, %r4148; - // end inline asm - ld.const.u32 %r4157, [matrix+2444]; - // begin inline asm - dp4a.u32.u32 %r4156, %r4157, %r5758, %r4152; - // end inline asm - ld.const.u32 %r4161, [matrix+2448]; - // begin inline asm - dp4a.u32.u32 %r4160, %r4161, %r5762, %r4156; - // end inline asm - ld.const.u32 %r4165, [matrix+2452]; - // begin inline asm - dp4a.u32.u32 %r4164, %r4165, %r5766, %r4160; - // end inline asm - ld.const.u32 %r4169, [matrix+2456]; - // begin inline asm - dp4a.u32.u32 %r4168, %r4169, %r5770, %r4164; - // end inline asm - ld.const.u32 %r4173, [matrix+2460]; - // begin inline asm - dp4a.u32.u32 %r4172, %r4173, %r5774, %r4168; - // end inline asm - ld.const.u32 %r4177, [matrix+2464]; - // begin inline asm - dp4a.u32.u32 %r4176, %r4177, %r5778, %r4172; - // end inline asm - ld.const.u32 %r4181, [matrix+2468]; - // begin inline asm - dp4a.u32.u32 %r4180, %r4181, %r5782, %r4176; - // end inline asm - ld.const.u32 %r4185, [matrix+2472]; - // begin inline asm - dp4a.u32.u32 %r4184, %r4185, %r5786, %r4180; - // end inline asm - ld.const.u32 %r4189, [matrix+2476]; - // begin inline asm - dp4a.u32.u32 %r4188, 
%r4189, %r5790, %r4184; - // end inline asm - ld.const.u32 %r4193, [matrix+2480]; - // begin inline asm - dp4a.u32.u32 %r4192, %r4193, %r5794, %r4188; - // end inline asm - ld.const.u32 %r4197, [matrix+2484]; - // begin inline asm - dp4a.u32.u32 %r4196, %r4197, %r5798, %r4192; - // end inline asm - ld.const.u32 %r4201, [matrix+2488]; - // begin inline asm - dp4a.u32.u32 %r4200, %r4201, %r5802, %r4196; - // end inline asm - ld.const.u32 %r4205, [matrix+2492]; - // begin inline asm - dp4a.u32.u32 %r4204, %r4205, %r5806, %r4200; - // end inline asm - ld.const.u32 %r4209, [matrix+2496]; - // begin inline asm - dp4a.u32.u32 %r4208, %r4209, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4213, [matrix+2500]; - // begin inline asm - dp4a.u32.u32 %r4212, %r4213, %r5750, %r4208; - // end inline asm - ld.const.u32 %r4217, [matrix+2504]; - // begin inline asm - dp4a.u32.u32 %r4216, %r4217, %r5754, %r4212; - // end inline asm - ld.const.u32 %r4221, [matrix+2508]; - // begin inline asm - dp4a.u32.u32 %r4220, %r4221, %r5758, %r4216; - // end inline asm - ld.const.u32 %r4225, [matrix+2512]; - // begin inline asm - dp4a.u32.u32 %r4224, %r4225, %r5762, %r4220; - // end inline asm - ld.const.u32 %r4229, [matrix+2516]; - // begin inline asm - dp4a.u32.u32 %r4228, %r4229, %r5766, %r4224; - // end inline asm - ld.const.u32 %r4233, [matrix+2520]; - // begin inline asm - dp4a.u32.u32 %r4232, %r4233, %r5770, %r4228; - // end inline asm - ld.const.u32 %r4237, [matrix+2524]; - // begin inline asm - dp4a.u32.u32 %r4236, %r4237, %r5774, %r4232; - // end inline asm - ld.const.u32 %r4241, [matrix+2528]; - // begin inline asm - dp4a.u32.u32 %r4240, %r4241, %r5778, %r4236; - // end inline asm - ld.const.u32 %r4245, [matrix+2532]; - // begin inline asm - dp4a.u32.u32 %r4244, %r4245, %r5782, %r4240; - // end inline asm - ld.const.u32 %r4249, [matrix+2536]; - // begin inline asm - dp4a.u32.u32 %r4248, %r4249, %r5786, %r4244; - // end inline asm - ld.const.u32 %r4253, [matrix+2540]; - // begin inline asm - dp4a.u32.u32 %r4252, %r4253, %r5790, %r4248; - // end inline asm - ld.const.u32 %r4257, [matrix+2544]; - // begin inline asm - dp4a.u32.u32 %r4256, %r4257, %r5794, %r4252; - // end inline asm - ld.const.u32 %r4261, [matrix+2548]; - // begin inline asm - dp4a.u32.u32 %r4260, %r4261, %r5798, %r4256; - // end inline asm - ld.const.u32 %r4265, [matrix+2552]; - // begin inline asm - dp4a.u32.u32 %r4264, %r4265, %r5802, %r4260; - // end inline asm - ld.const.u32 %r4269, [matrix+2556]; - // begin inline asm - dp4a.u32.u32 %r4268, %r4269, %r5806, %r4264; - // end inline asm - shr.u32 %r6049, %r4204, 6; - and.b32 %r6050, %r6049, 240; - shr.u32 %r6051, %r4268, 10; - or.b32 %r6052, %r6051, %r6050; - xor.b32 %r6053, %r5900, %r6052; - ld.const.u32 %r4273, [matrix+2560]; - // begin inline asm - dp4a.u32.u32 %r4272, %r4273, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4277, [matrix+2564]; - // begin inline asm - dp4a.u32.u32 %r4276, %r4277, %r5750, %r4272; - // end inline asm - ld.const.u32 %r4281, [matrix+2568]; - // begin inline asm - dp4a.u32.u32 %r4280, %r4281, %r5754, %r4276; - // end inline asm - ld.const.u32 %r4285, [matrix+2572]; - // begin inline asm - dp4a.u32.u32 %r4284, %r4285, %r5758, %r4280; - // end inline asm - ld.const.u32 %r4289, [matrix+2576]; - // begin inline asm - dp4a.u32.u32 %r4288, %r4289, %r5762, %r4284; - // end inline asm - ld.const.u32 %r4293, [matrix+2580]; - // begin inline asm - dp4a.u32.u32 %r4292, %r4293, %r5766, %r4288; - // end inline asm - ld.const.u32 %r4297, [matrix+2584]; - // begin 
inline asm - dp4a.u32.u32 %r4296, %r4297, %r5770, %r4292; - // end inline asm - ld.const.u32 %r4301, [matrix+2588]; - // begin inline asm - dp4a.u32.u32 %r4300, %r4301, %r5774, %r4296; - // end inline asm - ld.const.u32 %r4305, [matrix+2592]; - // begin inline asm - dp4a.u32.u32 %r4304, %r4305, %r5778, %r4300; - // end inline asm - ld.const.u32 %r4309, [matrix+2596]; - // begin inline asm - dp4a.u32.u32 %r4308, %r4309, %r5782, %r4304; - // end inline asm - ld.const.u32 %r4313, [matrix+2600]; - // begin inline asm - dp4a.u32.u32 %r4312, %r4313, %r5786, %r4308; - // end inline asm - ld.const.u32 %r4317, [matrix+2604]; - // begin inline asm - dp4a.u32.u32 %r4316, %r4317, %r5790, %r4312; - // end inline asm - ld.const.u32 %r4321, [matrix+2608]; - // begin inline asm - dp4a.u32.u32 %r4320, %r4321, %r5794, %r4316; - // end inline asm - ld.const.u32 %r4325, [matrix+2612]; - // begin inline asm - dp4a.u32.u32 %r4324, %r4325, %r5798, %r4320; - // end inline asm - ld.const.u32 %r4329, [matrix+2616]; - // begin inline asm - dp4a.u32.u32 %r4328, %r4329, %r5802, %r4324; - // end inline asm - ld.const.u32 %r4333, [matrix+2620]; - // begin inline asm - dp4a.u32.u32 %r4332, %r4333, %r5806, %r4328; - // end inline asm - ld.const.u32 %r4337, [matrix+2624]; - // begin inline asm - dp4a.u32.u32 %r4336, %r4337, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4341, [matrix+2628]; - // begin inline asm - dp4a.u32.u32 %r4340, %r4341, %r5750, %r4336; - // end inline asm - ld.const.u32 %r4345, [matrix+2632]; - // begin inline asm - dp4a.u32.u32 %r4344, %r4345, %r5754, %r4340; - // end inline asm - ld.const.u32 %r4349, [matrix+2636]; - // begin inline asm - dp4a.u32.u32 %r4348, %r4349, %r5758, %r4344; - // end inline asm - ld.const.u32 %r4353, [matrix+2640]; - // begin inline asm - dp4a.u32.u32 %r4352, %r4353, %r5762, %r4348; - // end inline asm - ld.const.u32 %r4357, [matrix+2644]; - // begin inline asm - dp4a.u32.u32 %r4356, %r4357, %r5766, %r4352; - // end inline asm - ld.const.u32 %r4361, [matrix+2648]; - // begin inline asm - dp4a.u32.u32 %r4360, %r4361, %r5770, %r4356; - // end inline asm - ld.const.u32 %r4365, [matrix+2652]; - // begin inline asm - dp4a.u32.u32 %r4364, %r4365, %r5774, %r4360; - // end inline asm - ld.const.u32 %r4369, [matrix+2656]; - // begin inline asm - dp4a.u32.u32 %r4368, %r4369, %r5778, %r4364; - // end inline asm - ld.const.u32 %r4373, [matrix+2660]; - // begin inline asm - dp4a.u32.u32 %r4372, %r4373, %r5782, %r4368; - // end inline asm - ld.const.u32 %r4377, [matrix+2664]; - // begin inline asm - dp4a.u32.u32 %r4376, %r4377, %r5786, %r4372; - // end inline asm - ld.const.u32 %r4381, [matrix+2668]; - // begin inline asm - dp4a.u32.u32 %r4380, %r4381, %r5790, %r4376; - // end inline asm - ld.const.u32 %r4385, [matrix+2672]; - // begin inline asm - dp4a.u32.u32 %r4384, %r4385, %r5794, %r4380; - // end inline asm - ld.const.u32 %r4389, [matrix+2676]; - // begin inline asm - dp4a.u32.u32 %r4388, %r4389, %r5798, %r4384; - // end inline asm - ld.const.u32 %r4393, [matrix+2680]; - // begin inline asm - dp4a.u32.u32 %r4392, %r4393, %r5802, %r4388; - // end inline asm - ld.const.u32 %r4397, [matrix+2684]; - // begin inline asm - dp4a.u32.u32 %r4396, %r4397, %r5806, %r4392; - // end inline asm - shr.u32 %r6054, %r4332, 6; - and.b32 %r6055, %r6054, 240; - shr.u32 %r6056, %r4396, 10; - or.b32 %r6057, %r6056, %r6055; - cvt.u64.u32 %rd223, %r6057; - xor.b64 %rd224, %rd16, %rd223; - ld.const.u32 %r4401, [matrix+2688]; - // begin inline asm - dp4a.u32.u32 %r4400, %r4401, %r5746, %r6244; - // end 
inline asm - ld.const.u32 %r4405, [matrix+2692]; - // begin inline asm - dp4a.u32.u32 %r4404, %r4405, %r5750, %r4400; - // end inline asm - ld.const.u32 %r4409, [matrix+2696]; - // begin inline asm - dp4a.u32.u32 %r4408, %r4409, %r5754, %r4404; - // end inline asm - ld.const.u32 %r4413, [matrix+2700]; - // begin inline asm - dp4a.u32.u32 %r4412, %r4413, %r5758, %r4408; - // end inline asm - ld.const.u32 %r4417, [matrix+2704]; - // begin inline asm - dp4a.u32.u32 %r4416, %r4417, %r5762, %r4412; - // end inline asm - ld.const.u32 %r4421, [matrix+2708]; - // begin inline asm - dp4a.u32.u32 %r4420, %r4421, %r5766, %r4416; - // end inline asm - ld.const.u32 %r4425, [matrix+2712]; - // begin inline asm - dp4a.u32.u32 %r4424, %r4425, %r5770, %r4420; - // end inline asm - ld.const.u32 %r4429, [matrix+2716]; - // begin inline asm - dp4a.u32.u32 %r4428, %r4429, %r5774, %r4424; - // end inline asm - ld.const.u32 %r4433, [matrix+2720]; - // begin inline asm - dp4a.u32.u32 %r4432, %r4433, %r5778, %r4428; - // end inline asm - ld.const.u32 %r4437, [matrix+2724]; - // begin inline asm - dp4a.u32.u32 %r4436, %r4437, %r5782, %r4432; - // end inline asm - ld.const.u32 %r4441, [matrix+2728]; - // begin inline asm - dp4a.u32.u32 %r4440, %r4441, %r5786, %r4436; - // end inline asm - ld.const.u32 %r4445, [matrix+2732]; - // begin inline asm - dp4a.u32.u32 %r4444, %r4445, %r5790, %r4440; - // end inline asm - ld.const.u32 %r4449, [matrix+2736]; - // begin inline asm - dp4a.u32.u32 %r4448, %r4449, %r5794, %r4444; - // end inline asm - ld.const.u32 %r4453, [matrix+2740]; - // begin inline asm - dp4a.u32.u32 %r4452, %r4453, %r5798, %r4448; - // end inline asm - ld.const.u32 %r4457, [matrix+2744]; - // begin inline asm - dp4a.u32.u32 %r4456, %r4457, %r5802, %r4452; - // end inline asm - ld.const.u32 %r4461, [matrix+2748]; - // begin inline asm - dp4a.u32.u32 %r4460, %r4461, %r5806, %r4456; - // end inline asm - ld.const.u32 %r4465, [matrix+2752]; - // begin inline asm - dp4a.u32.u32 %r4464, %r4465, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4469, [matrix+2756]; - // begin inline asm - dp4a.u32.u32 %r4468, %r4469, %r5750, %r4464; - // end inline asm - ld.const.u32 %r4473, [matrix+2760]; - // begin inline asm - dp4a.u32.u32 %r4472, %r4473, %r5754, %r4468; - // end inline asm - ld.const.u32 %r4477, [matrix+2764]; - // begin inline asm - dp4a.u32.u32 %r4476, %r4477, %r5758, %r4472; - // end inline asm - ld.const.u32 %r4481, [matrix+2768]; - // begin inline asm - dp4a.u32.u32 %r4480, %r4481, %r5762, %r4476; - // end inline asm - ld.const.u32 %r4485, [matrix+2772]; - // begin inline asm - dp4a.u32.u32 %r4484, %r4485, %r5766, %r4480; - // end inline asm - ld.const.u32 %r4489, [matrix+2776]; - // begin inline asm - dp4a.u32.u32 %r4488, %r4489, %r5770, %r4484; - // end inline asm - ld.const.u32 %r4493, [matrix+2780]; - // begin inline asm - dp4a.u32.u32 %r4492, %r4493, %r5774, %r4488; - // end inline asm - ld.const.u32 %r4497, [matrix+2784]; - // begin inline asm - dp4a.u32.u32 %r4496, %r4497, %r5778, %r4492; - // end inline asm - ld.const.u32 %r4501, [matrix+2788]; - // begin inline asm - dp4a.u32.u32 %r4500, %r4501, %r5782, %r4496; - // end inline asm - ld.const.u32 %r4505, [matrix+2792]; - // begin inline asm - dp4a.u32.u32 %r4504, %r4505, %r5786, %r4500; - // end inline asm - ld.const.u32 %r4509, [matrix+2796]; - // begin inline asm - dp4a.u32.u32 %r4508, %r4509, %r5790, %r4504; - // end inline asm - ld.const.u32 %r4513, [matrix+2800]; - // begin inline asm - dp4a.u32.u32 %r4512, %r4513, %r5794, %r4508; - // 
end inline asm - ld.const.u32 %r4517, [matrix+2804]; - // begin inline asm - dp4a.u32.u32 %r4516, %r4517, %r5798, %r4512; - // end inline asm - ld.const.u32 %r4521, [matrix+2808]; - // begin inline asm - dp4a.u32.u32 %r4520, %r4521, %r5802, %r4516; - // end inline asm - ld.const.u32 %r4525, [matrix+2812]; - // begin inline asm - dp4a.u32.u32 %r4524, %r4525, %r5806, %r4520; - // end inline asm - shr.u32 %r6058, %r4460, 6; - and.b32 %r6059, %r6058, 240; - shr.u32 %r6060, %r4524, 10; - or.b32 %r6061, %r6060, %r6059; - cvt.u64.u32 %rd225, %r6061; - xor.b64 %rd226, %rd17, %rd225; - ld.const.u32 %r4529, [matrix+2816]; - // begin inline asm - dp4a.u32.u32 %r4528, %r4529, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4533, [matrix+2820]; - // begin inline asm - dp4a.u32.u32 %r4532, %r4533, %r5750, %r4528; - // end inline asm - ld.const.u32 %r4537, [matrix+2824]; - // begin inline asm - dp4a.u32.u32 %r4536, %r4537, %r5754, %r4532; - // end inline asm - ld.const.u32 %r4541, [matrix+2828]; - // begin inline asm - dp4a.u32.u32 %r4540, %r4541, %r5758, %r4536; - // end inline asm - ld.const.u32 %r4545, [matrix+2832]; - // begin inline asm - dp4a.u32.u32 %r4544, %r4545, %r5762, %r4540; - // end inline asm - ld.const.u32 %r4549, [matrix+2836]; - // begin inline asm - dp4a.u32.u32 %r4548, %r4549, %r5766, %r4544; - // end inline asm - ld.const.u32 %r4553, [matrix+2840]; - // begin inline asm - dp4a.u32.u32 %r4552, %r4553, %r5770, %r4548; - // end inline asm - ld.const.u32 %r4557, [matrix+2844]; - // begin inline asm - dp4a.u32.u32 %r4556, %r4557, %r5774, %r4552; - // end inline asm - ld.const.u32 %r4561, [matrix+2848]; - // begin inline asm - dp4a.u32.u32 %r4560, %r4561, %r5778, %r4556; - // end inline asm - ld.const.u32 %r4565, [matrix+2852]; - // begin inline asm - dp4a.u32.u32 %r4564, %r4565, %r5782, %r4560; - // end inline asm - ld.const.u32 %r4569, [matrix+2856]; - // begin inline asm - dp4a.u32.u32 %r4568, %r4569, %r5786, %r4564; - // end inline asm - ld.const.u32 %r4573, [matrix+2860]; - // begin inline asm - dp4a.u32.u32 %r4572, %r4573, %r5790, %r4568; - // end inline asm - ld.const.u32 %r4577, [matrix+2864]; - // begin inline asm - dp4a.u32.u32 %r4576, %r4577, %r5794, %r4572; - // end inline asm - ld.const.u32 %r4581, [matrix+2868]; - // begin inline asm - dp4a.u32.u32 %r4580, %r4581, %r5798, %r4576; - // end inline asm - ld.const.u32 %r4585, [matrix+2872]; - // begin inline asm - dp4a.u32.u32 %r4584, %r4585, %r5802, %r4580; - // end inline asm - ld.const.u32 %r4589, [matrix+2876]; - // begin inline asm - dp4a.u32.u32 %r4588, %r4589, %r5806, %r4584; - // end inline asm - ld.const.u32 %r4593, [matrix+2880]; - // begin inline asm - dp4a.u32.u32 %r4592, %r4593, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4597, [matrix+2884]; - // begin inline asm - dp4a.u32.u32 %r4596, %r4597, %r5750, %r4592; - // end inline asm - ld.const.u32 %r4601, [matrix+2888]; - // begin inline asm - dp4a.u32.u32 %r4600, %r4601, %r5754, %r4596; - // end inline asm - ld.const.u32 %r4605, [matrix+2892]; - // begin inline asm - dp4a.u32.u32 %r4604, %r4605, %r5758, %r4600; - // end inline asm - ld.const.u32 %r4609, [matrix+2896]; - // begin inline asm - dp4a.u32.u32 %r4608, %r4609, %r5762, %r4604; - // end inline asm - ld.const.u32 %r4613, [matrix+2900]; - // begin inline asm - dp4a.u32.u32 %r4612, %r4613, %r5766, %r4608; - // end inline asm - ld.const.u32 %r4617, [matrix+2904]; - // begin inline asm - dp4a.u32.u32 %r4616, %r4617, %r5770, %r4612; - // end inline asm - ld.const.u32 %r4621, [matrix+2908]; - // begin 
inline asm - dp4a.u32.u32 %r4620, %r4621, %r5774, %r4616; - // end inline asm - ld.const.u32 %r4625, [matrix+2912]; - // begin inline asm - dp4a.u32.u32 %r4624, %r4625, %r5778, %r4620; - // end inline asm - ld.const.u32 %r4629, [matrix+2916]; - // begin inline asm - dp4a.u32.u32 %r4628, %r4629, %r5782, %r4624; - // end inline asm - ld.const.u32 %r4633, [matrix+2920]; - // begin inline asm - dp4a.u32.u32 %r4632, %r4633, %r5786, %r4628; - // end inline asm - ld.const.u32 %r4637, [matrix+2924]; - // begin inline asm - dp4a.u32.u32 %r4636, %r4637, %r5790, %r4632; - // end inline asm - ld.const.u32 %r4641, [matrix+2928]; - // begin inline asm - dp4a.u32.u32 %r4640, %r4641, %r5794, %r4636; - // end inline asm - ld.const.u32 %r4645, [matrix+2932]; - // begin inline asm - dp4a.u32.u32 %r4644, %r4645, %r5798, %r4640; - // end inline asm - ld.const.u32 %r4649, [matrix+2936]; - // begin inline asm - dp4a.u32.u32 %r4648, %r4649, %r5802, %r4644; - // end inline asm - ld.const.u32 %r4653, [matrix+2940]; - // begin inline asm - dp4a.u32.u32 %r4652, %r4653, %r5806, %r4648; - // end inline asm - shr.u32 %r6062, %r4588, 6; - and.b32 %r6063, %r6062, 240; - shr.u32 %r6064, %r4652, 10; - or.b32 %r6065, %r6064, %r6063; - cvt.u64.u32 %rd227, %r6065; - xor.b64 %rd228, %rd18, %rd227; - ld.const.u32 %r4657, [matrix+2944]; - // begin inline asm - dp4a.u32.u32 %r4656, %r4657, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4661, [matrix+2948]; - // begin inline asm - dp4a.u32.u32 %r4660, %r4661, %r5750, %r4656; - // end inline asm - ld.const.u32 %r4665, [matrix+2952]; - // begin inline asm - dp4a.u32.u32 %r4664, %r4665, %r5754, %r4660; - // end inline asm - ld.const.u32 %r4669, [matrix+2956]; - // begin inline asm - dp4a.u32.u32 %r4668, %r4669, %r5758, %r4664; - // end inline asm - ld.const.u32 %r4673, [matrix+2960]; - // begin inline asm - dp4a.u32.u32 %r4672, %r4673, %r5762, %r4668; - // end inline asm - ld.const.u32 %r4677, [matrix+2964]; - // begin inline asm - dp4a.u32.u32 %r4676, %r4677, %r5766, %r4672; - // end inline asm - ld.const.u32 %r4681, [matrix+2968]; - // begin inline asm - dp4a.u32.u32 %r4680, %r4681, %r5770, %r4676; - // end inline asm - ld.const.u32 %r4685, [matrix+2972]; - // begin inline asm - dp4a.u32.u32 %r4684, %r4685, %r5774, %r4680; - // end inline asm - ld.const.u32 %r4689, [matrix+2976]; - // begin inline asm - dp4a.u32.u32 %r4688, %r4689, %r5778, %r4684; - // end inline asm - ld.const.u32 %r4693, [matrix+2980]; - // begin inline asm - dp4a.u32.u32 %r4692, %r4693, %r5782, %r4688; - // end inline asm - ld.const.u32 %r4697, [matrix+2984]; - // begin inline asm - dp4a.u32.u32 %r4696, %r4697, %r5786, %r4692; - // end inline asm - ld.const.u32 %r4701, [matrix+2988]; - // begin inline asm - dp4a.u32.u32 %r4700, %r4701, %r5790, %r4696; - // end inline asm - ld.const.u32 %r4705, [matrix+2992]; - // begin inline asm - dp4a.u32.u32 %r4704, %r4705, %r5794, %r4700; - // end inline asm - ld.const.u32 %r4709, [matrix+2996]; - // begin inline asm - dp4a.u32.u32 %r4708, %r4709, %r5798, %r4704; - // end inline asm - ld.const.u32 %r4713, [matrix+3000]; - // begin inline asm - dp4a.u32.u32 %r4712, %r4713, %r5802, %r4708; - // end inline asm - ld.const.u32 %r4717, [matrix+3004]; - // begin inline asm - dp4a.u32.u32 %r4716, %r4717, %r5806, %r4712; - // end inline asm - ld.const.u32 %r4721, [matrix+3008]; - // begin inline asm - dp4a.u32.u32 %r4720, %r4721, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4725, [matrix+3012]; - // begin inline asm - dp4a.u32.u32 %r4724, %r4725, %r5750, %r4720; - // end 
inline asm - ld.const.u32 %r4729, [matrix+3016]; - // begin inline asm - dp4a.u32.u32 %r4728, %r4729, %r5754, %r4724; - // end inline asm - ld.const.u32 %r4733, [matrix+3020]; - // begin inline asm - dp4a.u32.u32 %r4732, %r4733, %r5758, %r4728; - // end inline asm - ld.const.u32 %r4737, [matrix+3024]; - // begin inline asm - dp4a.u32.u32 %r4736, %r4737, %r5762, %r4732; - // end inline asm - ld.const.u32 %r4741, [matrix+3028]; - // begin inline asm - dp4a.u32.u32 %r4740, %r4741, %r5766, %r4736; - // end inline asm - ld.const.u32 %r4745, [matrix+3032]; - // begin inline asm - dp4a.u32.u32 %r4744, %r4745, %r5770, %r4740; - // end inline asm - ld.const.u32 %r4749, [matrix+3036]; - // begin inline asm - dp4a.u32.u32 %r4748, %r4749, %r5774, %r4744; - // end inline asm - ld.const.u32 %r4753, [matrix+3040]; - // begin inline asm - dp4a.u32.u32 %r4752, %r4753, %r5778, %r4748; - // end inline asm - ld.const.u32 %r4757, [matrix+3044]; - // begin inline asm - dp4a.u32.u32 %r4756, %r4757, %r5782, %r4752; - // end inline asm - ld.const.u32 %r4761, [matrix+3048]; - // begin inline asm - dp4a.u32.u32 %r4760, %r4761, %r5786, %r4756; - // end inline asm - ld.const.u32 %r4765, [matrix+3052]; - // begin inline asm - dp4a.u32.u32 %r4764, %r4765, %r5790, %r4760; - // end inline asm - ld.const.u32 %r4769, [matrix+3056]; - // begin inline asm - dp4a.u32.u32 %r4768, %r4769, %r5794, %r4764; - // end inline asm - ld.const.u32 %r4773, [matrix+3060]; - // begin inline asm - dp4a.u32.u32 %r4772, %r4773, %r5798, %r4768; - // end inline asm - ld.const.u32 %r4777, [matrix+3064]; - // begin inline asm - dp4a.u32.u32 %r4776, %r4777, %r5802, %r4772; - // end inline asm - ld.const.u32 %r4781, [matrix+3068]; - // begin inline asm - dp4a.u32.u32 %r4780, %r4781, %r5806, %r4776; - // end inline asm - shr.u32 %r6066, %r4716, 6; - and.b32 %r6067, %r6066, 240; - ld.const.u32 %r4785, [matrix+3072]; - // begin inline asm - dp4a.u32.u32 %r4784, %r4785, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4789, [matrix+3076]; - // begin inline asm - dp4a.u32.u32 %r4788, %r4789, %r5750, %r4784; - // end inline asm - ld.const.u32 %r4793, [matrix+3080]; - // begin inline asm - dp4a.u32.u32 %r4792, %r4793, %r5754, %r4788; - // end inline asm - ld.const.u32 %r4797, [matrix+3084]; - // begin inline asm - dp4a.u32.u32 %r4796, %r4797, %r5758, %r4792; - // end inline asm - ld.const.u32 %r4801, [matrix+3088]; - // begin inline asm - dp4a.u32.u32 %r4800, %r4801, %r5762, %r4796; - // end inline asm - ld.const.u32 %r4805, [matrix+3092]; - // begin inline asm - dp4a.u32.u32 %r4804, %r4805, %r5766, %r4800; - // end inline asm - ld.const.u32 %r4809, [matrix+3096]; - // begin inline asm - dp4a.u32.u32 %r4808, %r4809, %r5770, %r4804; - // end inline asm - ld.const.u32 %r4813, [matrix+3100]; - // begin inline asm - dp4a.u32.u32 %r4812, %r4813, %r5774, %r4808; - // end inline asm - ld.const.u32 %r4817, [matrix+3104]; - // begin inline asm - dp4a.u32.u32 %r4816, %r4817, %r5778, %r4812; - // end inline asm - ld.const.u32 %r4821, [matrix+3108]; - // begin inline asm - dp4a.u32.u32 %r4820, %r4821, %r5782, %r4816; - // end inline asm - ld.const.u32 %r4825, [matrix+3112]; - // begin inline asm - dp4a.u32.u32 %r4824, %r4825, %r5786, %r4820; - // end inline asm - ld.const.u32 %r4829, [matrix+3116]; - // begin inline asm - dp4a.u32.u32 %r4828, %r4829, %r5790, %r4824; - // end inline asm - ld.const.u32 %r4833, [matrix+3120]; - // begin inline asm - dp4a.u32.u32 %r4832, %r4833, %r5794, %r4828; - // end inline asm - ld.const.u32 %r4837, [matrix+3124]; - // begin 
inline asm - dp4a.u32.u32 %r4836, %r4837, %r5798, %r4832; - // end inline asm - ld.const.u32 %r4841, [matrix+3128]; - // begin inline asm - dp4a.u32.u32 %r4840, %r4841, %r5802, %r4836; - // end inline asm - ld.const.u32 %r4845, [matrix+3132]; - // begin inline asm - dp4a.u32.u32 %r4844, %r4845, %r5806, %r4840; - // end inline asm - ld.const.u32 %r4849, [matrix+3136]; - // begin inline asm - dp4a.u32.u32 %r4848, %r4849, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4853, [matrix+3140]; - // begin inline asm - dp4a.u32.u32 %r4852, %r4853, %r5750, %r4848; - // end inline asm - ld.const.u32 %r4857, [matrix+3144]; - // begin inline asm - dp4a.u32.u32 %r4856, %r4857, %r5754, %r4852; - // end inline asm - ld.const.u32 %r4861, [matrix+3148]; - // begin inline asm - dp4a.u32.u32 %r4860, %r4861, %r5758, %r4856; - // end inline asm - ld.const.u32 %r4865, [matrix+3152]; - // begin inline asm - dp4a.u32.u32 %r4864, %r4865, %r5762, %r4860; - // end inline asm - ld.const.u32 %r4869, [matrix+3156]; - // begin inline asm - dp4a.u32.u32 %r4868, %r4869, %r5766, %r4864; - // end inline asm - ld.const.u32 %r4873, [matrix+3160]; - // begin inline asm - dp4a.u32.u32 %r4872, %r4873, %r5770, %r4868; - // end inline asm - ld.const.u32 %r4877, [matrix+3164]; - // begin inline asm - dp4a.u32.u32 %r4876, %r4877, %r5774, %r4872; - // end inline asm - ld.const.u32 %r4881, [matrix+3168]; - // begin inline asm - dp4a.u32.u32 %r4880, %r4881, %r5778, %r4876; - // end inline asm - ld.const.u32 %r4885, [matrix+3172]; - // begin inline asm - dp4a.u32.u32 %r4884, %r4885, %r5782, %r4880; - // end inline asm - ld.const.u32 %r4889, [matrix+3176]; - // begin inline asm - dp4a.u32.u32 %r4888, %r4889, %r5786, %r4884; - // end inline asm - ld.const.u32 %r4893, [matrix+3180]; - // begin inline asm - dp4a.u32.u32 %r4892, %r4893, %r5790, %r4888; - // end inline asm - ld.const.u32 %r4897, [matrix+3184]; - // begin inline asm - dp4a.u32.u32 %r4896, %r4897, %r5794, %r4892; - // end inline asm - ld.const.u32 %r4901, [matrix+3188]; - // begin inline asm - dp4a.u32.u32 %r4900, %r4901, %r5798, %r4896; - // end inline asm - ld.const.u32 %r4905, [matrix+3192]; - // begin inline asm - dp4a.u32.u32 %r4904, %r4905, %r5802, %r4900; - // end inline asm - ld.const.u32 %r4909, [matrix+3196]; - // begin inline asm - dp4a.u32.u32 %r4908, %r4909, %r5806, %r4904; - // end inline asm - shr.u32 %r6068, %r4844, 6; - and.b32 %r6069, %r6068, 240; - shr.u32 %r6070, %r4908, 10; - and.b32 %r6071, %r6070, 255; - or.b32 %r6072, %r6071, %r6069; - cvt.u64.u32 %rd229, %r6072; - ld.const.u32 %r4913, [matrix+3200]; - // begin inline asm - dp4a.u32.u32 %r4912, %r4913, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4917, [matrix+3204]; - // begin inline asm - dp4a.u32.u32 %r4916, %r4917, %r5750, %r4912; - // end inline asm - ld.const.u32 %r4921, [matrix+3208]; - // begin inline asm - dp4a.u32.u32 %r4920, %r4921, %r5754, %r4916; - // end inline asm - ld.const.u32 %r4925, [matrix+3212]; - // begin inline asm - dp4a.u32.u32 %r4924, %r4925, %r5758, %r4920; - // end inline asm - ld.const.u32 %r4929, [matrix+3216]; - // begin inline asm - dp4a.u32.u32 %r4928, %r4929, %r5762, %r4924; - // end inline asm - ld.const.u32 %r4933, [matrix+3220]; - // begin inline asm - dp4a.u32.u32 %r4932, %r4933, %r5766, %r4928; - // end inline asm - ld.const.u32 %r4937, [matrix+3224]; - // begin inline asm - dp4a.u32.u32 %r4936, %r4937, %r5770, %r4932; - // end inline asm - ld.const.u32 %r4941, [matrix+3228]; - // begin inline asm - dp4a.u32.u32 %r4940, %r4941, %r5774, %r4936; - // end 
inline asm - ld.const.u32 %r4945, [matrix+3232]; - // begin inline asm - dp4a.u32.u32 %r4944, %r4945, %r5778, %r4940; - // end inline asm - ld.const.u32 %r4949, [matrix+3236]; - // begin inline asm - dp4a.u32.u32 %r4948, %r4949, %r5782, %r4944; - // end inline asm - ld.const.u32 %r4953, [matrix+3240]; - // begin inline asm - dp4a.u32.u32 %r4952, %r4953, %r5786, %r4948; - // end inline asm - ld.const.u32 %r4957, [matrix+3244]; - // begin inline asm - dp4a.u32.u32 %r4956, %r4957, %r5790, %r4952; - // end inline asm - ld.const.u32 %r4961, [matrix+3248]; - // begin inline asm - dp4a.u32.u32 %r4960, %r4961, %r5794, %r4956; - // end inline asm - ld.const.u32 %r4965, [matrix+3252]; - // begin inline asm - dp4a.u32.u32 %r4964, %r4965, %r5798, %r4960; - // end inline asm - ld.const.u32 %r4969, [matrix+3256]; - // begin inline asm - dp4a.u32.u32 %r4968, %r4969, %r5802, %r4964; - // end inline asm - ld.const.u32 %r4973, [matrix+3260]; - // begin inline asm - dp4a.u32.u32 %r4972, %r4973, %r5806, %r4968; - // end inline asm - ld.const.u32 %r4977, [matrix+3264]; - // begin inline asm - dp4a.u32.u32 %r4976, %r4977, %r5746, %r6244; - // end inline asm - ld.const.u32 %r4981, [matrix+3268]; - // begin inline asm - dp4a.u32.u32 %r4980, %r4981, %r5750, %r4976; - // end inline asm - ld.const.u32 %r4985, [matrix+3272]; - // begin inline asm - dp4a.u32.u32 %r4984, %r4985, %r5754, %r4980; - // end inline asm - ld.const.u32 %r4989, [matrix+3276]; - // begin inline asm - dp4a.u32.u32 %r4988, %r4989, %r5758, %r4984; - // end inline asm - ld.const.u32 %r4993, [matrix+3280]; - // begin inline asm - dp4a.u32.u32 %r4992, %r4993, %r5762, %r4988; - // end inline asm - ld.const.u32 %r4997, [matrix+3284]; - // begin inline asm - dp4a.u32.u32 %r4996, %r4997, %r5766, %r4992; - // end inline asm - ld.const.u32 %r5001, [matrix+3288]; - // begin inline asm - dp4a.u32.u32 %r5000, %r5001, %r5770, %r4996; - // end inline asm - ld.const.u32 %r5005, [matrix+3292]; - // begin inline asm - dp4a.u32.u32 %r5004, %r5005, %r5774, %r5000; - // end inline asm - ld.const.u32 %r5009, [matrix+3296]; - // begin inline asm - dp4a.u32.u32 %r5008, %r5009, %r5778, %r5004; - // end inline asm - ld.const.u32 %r5013, [matrix+3300]; - // begin inline asm - dp4a.u32.u32 %r5012, %r5013, %r5782, %r5008; - // end inline asm - ld.const.u32 %r5017, [matrix+3304]; - // begin inline asm - dp4a.u32.u32 %r5016, %r5017, %r5786, %r5012; - // end inline asm - ld.const.u32 %r5021, [matrix+3308]; - // begin inline asm - dp4a.u32.u32 %r5020, %r5021, %r5790, %r5016; - // end inline asm - ld.const.u32 %r5025, [matrix+3312]; - // begin inline asm - dp4a.u32.u32 %r5024, %r5025, %r5794, %r5020; - // end inline asm - ld.const.u32 %r5029, [matrix+3316]; - // begin inline asm - dp4a.u32.u32 %r5028, %r5029, %r5798, %r5024; - // end inline asm - ld.const.u32 %r5033, [matrix+3320]; - // begin inline asm - dp4a.u32.u32 %r5032, %r5033, %r5802, %r5028; - // end inline asm - ld.const.u32 %r5037, [matrix+3324]; - // begin inline asm - dp4a.u32.u32 %r5036, %r5037, %r5806, %r5032; - // end inline asm - shr.u32 %r6073, %r4972, 6; - and.b32 %r6074, %r6073, 240; - shr.u32 %r6075, %r5036, 10; - or.b32 %r6076, %r6075, %r6074; - cvt.u64.u32 %rd230, %r6076; - xor.b64 %rd231, %rd201, %rd230; - and.b64 %rd232, %rd9, 255; - xor.b64 %rd233, %rd232, %rd229; - ld.const.u32 %r5041, [matrix+3328]; - // begin inline asm - dp4a.u32.u32 %r5040, %r5041, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5045, [matrix+3332]; - // begin inline asm - dp4a.u32.u32 %r5044, %r5045, %r5750, %r5040; - // end 
inline asm - ld.const.u32 %r5049, [matrix+3336]; - // begin inline asm - dp4a.u32.u32 %r5048, %r5049, %r5754, %r5044; - // end inline asm - ld.const.u32 %r5053, [matrix+3340]; - // begin inline asm - dp4a.u32.u32 %r5052, %r5053, %r5758, %r5048; - // end inline asm - ld.const.u32 %r5057, [matrix+3344]; - // begin inline asm - dp4a.u32.u32 %r5056, %r5057, %r5762, %r5052; - // end inline asm - ld.const.u32 %r5061, [matrix+3348]; - // begin inline asm - dp4a.u32.u32 %r5060, %r5061, %r5766, %r5056; - // end inline asm - ld.const.u32 %r5065, [matrix+3352]; - // begin inline asm - dp4a.u32.u32 %r5064, %r5065, %r5770, %r5060; - // end inline asm - ld.const.u32 %r5069, [matrix+3356]; - // begin inline asm - dp4a.u32.u32 %r5068, %r5069, %r5774, %r5064; - // end inline asm - ld.const.u32 %r5073, [matrix+3360]; - // begin inline asm - dp4a.u32.u32 %r5072, %r5073, %r5778, %r5068; - // end inline asm - ld.const.u32 %r5077, [matrix+3364]; - // begin inline asm - dp4a.u32.u32 %r5076, %r5077, %r5782, %r5072; - // end inline asm - ld.const.u32 %r5081, [matrix+3368]; - // begin inline asm - dp4a.u32.u32 %r5080, %r5081, %r5786, %r5076; - // end inline asm - ld.const.u32 %r5085, [matrix+3372]; - // begin inline asm - dp4a.u32.u32 %r5084, %r5085, %r5790, %r5080; - // end inline asm - ld.const.u32 %r5089, [matrix+3376]; - // begin inline asm - dp4a.u32.u32 %r5088, %r5089, %r5794, %r5084; - // end inline asm - ld.const.u32 %r5093, [matrix+3380]; - // begin inline asm - dp4a.u32.u32 %r5092, %r5093, %r5798, %r5088; - // end inline asm - ld.const.u32 %r5097, [matrix+3384]; - // begin inline asm - dp4a.u32.u32 %r5096, %r5097, %r5802, %r5092; - // end inline asm - ld.const.u32 %r5101, [matrix+3388]; - // begin inline asm - dp4a.u32.u32 %r5100, %r5101, %r5806, %r5096; - // end inline asm - ld.const.u32 %r5105, [matrix+3392]; - // begin inline asm - dp4a.u32.u32 %r5104, %r5105, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5109, [matrix+3396]; - // begin inline asm - dp4a.u32.u32 %r5108, %r5109, %r5750, %r5104; - // end inline asm - ld.const.u32 %r5113, [matrix+3400]; - // begin inline asm - dp4a.u32.u32 %r5112, %r5113, %r5754, %r5108; - // end inline asm - ld.const.u32 %r5117, [matrix+3404]; - // begin inline asm - dp4a.u32.u32 %r5116, %r5117, %r5758, %r5112; - // end inline asm - ld.const.u32 %r5121, [matrix+3408]; - // begin inline asm - dp4a.u32.u32 %r5120, %r5121, %r5762, %r5116; - // end inline asm - ld.const.u32 %r5125, [matrix+3412]; - // begin inline asm - dp4a.u32.u32 %r5124, %r5125, %r5766, %r5120; - // end inline asm - ld.const.u32 %r5129, [matrix+3416]; - // begin inline asm - dp4a.u32.u32 %r5128, %r5129, %r5770, %r5124; - // end inline asm - ld.const.u32 %r5133, [matrix+3420]; - // begin inline asm - dp4a.u32.u32 %r5132, %r5133, %r5774, %r5128; - // end inline asm - ld.const.u32 %r5137, [matrix+3424]; - // begin inline asm - dp4a.u32.u32 %r5136, %r5137, %r5778, %r5132; - // end inline asm - ld.const.u32 %r5141, [matrix+3428]; - // begin inline asm - dp4a.u32.u32 %r5140, %r5141, %r5782, %r5136; - // end inline asm - ld.const.u32 %r5145, [matrix+3432]; - // begin inline asm - dp4a.u32.u32 %r5144, %r5145, %r5786, %r5140; - // end inline asm - ld.const.u32 %r5149, [matrix+3436]; - // begin inline asm - dp4a.u32.u32 %r5148, %r5149, %r5790, %r5144; - // end inline asm - ld.const.u32 %r5153, [matrix+3440]; - // begin inline asm - dp4a.u32.u32 %r5152, %r5153, %r5794, %r5148; - // end inline asm - ld.const.u32 %r5157, [matrix+3444]; - // begin inline asm - dp4a.u32.u32 %r5156, %r5157, %r5798, %r5152; - // 
end inline asm - ld.const.u32 %r5161, [matrix+3448]; - // begin inline asm - dp4a.u32.u32 %r5160, %r5161, %r5802, %r5156; - // end inline asm - ld.const.u32 %r5165, [matrix+3452]; - // begin inline asm - dp4a.u32.u32 %r5164, %r5165, %r5806, %r5160; - // end inline asm - shr.u32 %r6077, %r5100, 6; - and.b32 %r6078, %r6077, 240; - shr.u32 %r6079, %r5164, 10; - or.b32 %r6080, %r6079, %r6078; - cvt.u64.u32 %rd234, %r6080; - xor.b64 %rd235, %rd202, %rd234; - ld.const.u32 %r5169, [matrix+3456]; - // begin inline asm - dp4a.u32.u32 %r5168, %r5169, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5173, [matrix+3460]; - // begin inline asm - dp4a.u32.u32 %r5172, %r5173, %r5750, %r5168; - // end inline asm - ld.const.u32 %r5177, [matrix+3464]; - // begin inline asm - dp4a.u32.u32 %r5176, %r5177, %r5754, %r5172; - // end inline asm - ld.const.u32 %r5181, [matrix+3468]; - // begin inline asm - dp4a.u32.u32 %r5180, %r5181, %r5758, %r5176; - // end inline asm - ld.const.u32 %r5185, [matrix+3472]; - // begin inline asm - dp4a.u32.u32 %r5184, %r5185, %r5762, %r5180; - // end inline asm - ld.const.u32 %r5189, [matrix+3476]; - // begin inline asm - dp4a.u32.u32 %r5188, %r5189, %r5766, %r5184; - // end inline asm - ld.const.u32 %r5193, [matrix+3480]; - // begin inline asm - dp4a.u32.u32 %r5192, %r5193, %r5770, %r5188; - // end inline asm - ld.const.u32 %r5197, [matrix+3484]; - // begin inline asm - dp4a.u32.u32 %r5196, %r5197, %r5774, %r5192; - // end inline asm - ld.const.u32 %r5201, [matrix+3488]; - // begin inline asm - dp4a.u32.u32 %r5200, %r5201, %r5778, %r5196; - // end inline asm - ld.const.u32 %r5205, [matrix+3492]; - // begin inline asm - dp4a.u32.u32 %r5204, %r5205, %r5782, %r5200; - // end inline asm - ld.const.u32 %r5209, [matrix+3496]; - // begin inline asm - dp4a.u32.u32 %r5208, %r5209, %r5786, %r5204; - // end inline asm - ld.const.u32 %r5213, [matrix+3500]; - // begin inline asm - dp4a.u32.u32 %r5212, %r5213, %r5790, %r5208; - // end inline asm - ld.const.u32 %r5217, [matrix+3504]; - // begin inline asm - dp4a.u32.u32 %r5216, %r5217, %r5794, %r5212; - // end inline asm - ld.const.u32 %r5221, [matrix+3508]; - // begin inline asm - dp4a.u32.u32 %r5220, %r5221, %r5798, %r5216; - // end inline asm - ld.const.u32 %r5225, [matrix+3512]; - // begin inline asm - dp4a.u32.u32 %r5224, %r5225, %r5802, %r5220; - // end inline asm - ld.const.u32 %r5229, [matrix+3516]; - // begin inline asm - dp4a.u32.u32 %r5228, %r5229, %r5806, %r5224; - // end inline asm - ld.const.u32 %r5233, [matrix+3520]; - // begin inline asm - dp4a.u32.u32 %r5232, %r5233, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5237, [matrix+3524]; - // begin inline asm - dp4a.u32.u32 %r5236, %r5237, %r5750, %r5232; - // end inline asm - ld.const.u32 %r5241, [matrix+3528]; - // begin inline asm - dp4a.u32.u32 %r5240, %r5241, %r5754, %r5236; - // end inline asm - ld.const.u32 %r5245, [matrix+3532]; - // begin inline asm - dp4a.u32.u32 %r5244, %r5245, %r5758, %r5240; - // end inline asm - ld.const.u32 %r5249, [matrix+3536]; - // begin inline asm - dp4a.u32.u32 %r5248, %r5249, %r5762, %r5244; - // end inline asm - ld.const.u32 %r5253, [matrix+3540]; - // begin inline asm - dp4a.u32.u32 %r5252, %r5253, %r5766, %r5248; - // end inline asm - ld.const.u32 %r5257, [matrix+3544]; - // begin inline asm - dp4a.u32.u32 %r5256, %r5257, %r5770, %r5252; - // end inline asm - ld.const.u32 %r5261, [matrix+3548]; - // begin inline asm - dp4a.u32.u32 %r5260, %r5261, %r5774, %r5256; - // end inline asm - ld.const.u32 %r5265, [matrix+3552]; - // begin 
inline asm - dp4a.u32.u32 %r5264, %r5265, %r5778, %r5260; - // end inline asm - ld.const.u32 %r5269, [matrix+3556]; - // begin inline asm - dp4a.u32.u32 %r5268, %r5269, %r5782, %r5264; - // end inline asm - ld.const.u32 %r5273, [matrix+3560]; - // begin inline asm - dp4a.u32.u32 %r5272, %r5273, %r5786, %r5268; - // end inline asm - ld.const.u32 %r5277, [matrix+3564]; - // begin inline asm - dp4a.u32.u32 %r5276, %r5277, %r5790, %r5272; - // end inline asm - ld.const.u32 %r5281, [matrix+3568]; - // begin inline asm - dp4a.u32.u32 %r5280, %r5281, %r5794, %r5276; - // end inline asm - ld.const.u32 %r5285, [matrix+3572]; - // begin inline asm - dp4a.u32.u32 %r5284, %r5285, %r5798, %r5280; - // end inline asm - ld.const.u32 %r5289, [matrix+3576]; - // begin inline asm - dp4a.u32.u32 %r5288, %r5289, %r5802, %r5284; - // end inline asm - ld.const.u32 %r5293, [matrix+3580]; - // begin inline asm - dp4a.u32.u32 %r5292, %r5293, %r5806, %r5288; - // end inline asm - shr.u32 %r6081, %r5228, 6; - and.b32 %r6082, %r6081, 240; - shr.u32 %r6083, %r5292, 10; - or.b32 %r6084, %r6083, %r6082; - cvt.u64.u32 %rd236, %r6084; - xor.b64 %rd237, %rd203, %rd236; - ld.const.u32 %r5297, [matrix+3584]; - // begin inline asm - dp4a.u32.u32 %r5296, %r5297, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5301, [matrix+3588]; - // begin inline asm - dp4a.u32.u32 %r5300, %r5301, %r5750, %r5296; - // end inline asm - ld.const.u32 %r5305, [matrix+3592]; - // begin inline asm - dp4a.u32.u32 %r5304, %r5305, %r5754, %r5300; - // end inline asm - ld.const.u32 %r5309, [matrix+3596]; - // begin inline asm - dp4a.u32.u32 %r5308, %r5309, %r5758, %r5304; - // end inline asm - ld.const.u32 %r5313, [matrix+3600]; - // begin inline asm - dp4a.u32.u32 %r5312, %r5313, %r5762, %r5308; - // end inline asm - ld.const.u32 %r5317, [matrix+3604]; - // begin inline asm - dp4a.u32.u32 %r5316, %r5317, %r5766, %r5312; - // end inline asm - ld.const.u32 %r5321, [matrix+3608]; - // begin inline asm - dp4a.u32.u32 %r5320, %r5321, %r5770, %r5316; - // end inline asm - ld.const.u32 %r5325, [matrix+3612]; - // begin inline asm - dp4a.u32.u32 %r5324, %r5325, %r5774, %r5320; - // end inline asm - ld.const.u32 %r5329, [matrix+3616]; - // begin inline asm - dp4a.u32.u32 %r5328, %r5329, %r5778, %r5324; - // end inline asm - ld.const.u32 %r5333, [matrix+3620]; - // begin inline asm - dp4a.u32.u32 %r5332, %r5333, %r5782, %r5328; - // end inline asm - ld.const.u32 %r5337, [matrix+3624]; - // begin inline asm - dp4a.u32.u32 %r5336, %r5337, %r5786, %r5332; - // end inline asm - ld.const.u32 %r5341, [matrix+3628]; - // begin inline asm - dp4a.u32.u32 %r5340, %r5341, %r5790, %r5336; - // end inline asm - ld.const.u32 %r5345, [matrix+3632]; - // begin inline asm - dp4a.u32.u32 %r5344, %r5345, %r5794, %r5340; - // end inline asm - ld.const.u32 %r5349, [matrix+3636]; - // begin inline asm - dp4a.u32.u32 %r5348, %r5349, %r5798, %r5344; - // end inline asm - ld.const.u32 %r5353, [matrix+3640]; - // begin inline asm - dp4a.u32.u32 %r5352, %r5353, %r5802, %r5348; - // end inline asm - ld.const.u32 %r5357, [matrix+3644]; - // begin inline asm - dp4a.u32.u32 %r5356, %r5357, %r5806, %r5352; - // end inline asm - ld.const.u32 %r5361, [matrix+3648]; - // begin inline asm - dp4a.u32.u32 %r5360, %r5361, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5365, [matrix+3652]; - // begin inline asm - dp4a.u32.u32 %r5364, %r5365, %r5750, %r5360; - // end inline asm - ld.const.u32 %r5369, [matrix+3656]; - // begin inline asm - dp4a.u32.u32 %r5368, %r5369, %r5754, %r5364; - // 
end inline asm - ld.const.u32 %r5373, [matrix+3660]; - // begin inline asm - dp4a.u32.u32 %r5372, %r5373, %r5758, %r5368; - // end inline asm - ld.const.u32 %r5377, [matrix+3664]; - // begin inline asm - dp4a.u32.u32 %r5376, %r5377, %r5762, %r5372; - // end inline asm - ld.const.u32 %r5381, [matrix+3668]; - // begin inline asm - dp4a.u32.u32 %r5380, %r5381, %r5766, %r5376; - // end inline asm - ld.const.u32 %r5385, [matrix+3672]; - // begin inline asm - dp4a.u32.u32 %r5384, %r5385, %r5770, %r5380; - // end inline asm - ld.const.u32 %r5389, [matrix+3676]; - // begin inline asm - dp4a.u32.u32 %r5388, %r5389, %r5774, %r5384; - // end inline asm - ld.const.u32 %r5393, [matrix+3680]; - // begin inline asm - dp4a.u32.u32 %r5392, %r5393, %r5778, %r5388; - // end inline asm - ld.const.u32 %r5397, [matrix+3684]; - // begin inline asm - dp4a.u32.u32 %r5396, %r5397, %r5782, %r5392; - // end inline asm - ld.const.u32 %r5401, [matrix+3688]; - // begin inline asm - dp4a.u32.u32 %r5400, %r5401, %r5786, %r5396; - // end inline asm - ld.const.u32 %r5405, [matrix+3692]; - // begin inline asm - dp4a.u32.u32 %r5404, %r5405, %r5790, %r5400; - // end inline asm - ld.const.u32 %r5409, [matrix+3696]; - // begin inline asm - dp4a.u32.u32 %r5408, %r5409, %r5794, %r5404; - // end inline asm - ld.const.u32 %r5413, [matrix+3700]; - // begin inline asm - dp4a.u32.u32 %r5412, %r5413, %r5798, %r5408; - // end inline asm - ld.const.u32 %r5417, [matrix+3704]; - // begin inline asm - dp4a.u32.u32 %r5416, %r5417, %r5802, %r5412; - // end inline asm - ld.const.u32 %r5421, [matrix+3708]; - // begin inline asm - dp4a.u32.u32 %r5420, %r5421, %r5806, %r5416; - // end inline asm - shr.u32 %r6085, %r5356, 6; - and.b32 %r6086, %r6085, 240; - shr.u32 %r6087, %r5420, 10; - or.b32 %r6088, %r6087, %r6086; - cvt.u64.u32 %rd238, %r6088; - xor.b64 %rd239, %rd204, %rd238; - ld.const.u32 %r5425, [matrix+3712]; - // begin inline asm - dp4a.u32.u32 %r5424, %r5425, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5429, [matrix+3716]; - // begin inline asm - dp4a.u32.u32 %r5428, %r5429, %r5750, %r5424; - // end inline asm - ld.const.u32 %r5433, [matrix+3720]; - // begin inline asm - dp4a.u32.u32 %r5432, %r5433, %r5754, %r5428; - // end inline asm - ld.const.u32 %r5437, [matrix+3724]; - // begin inline asm - dp4a.u32.u32 %r5436, %r5437, %r5758, %r5432; - // end inline asm - ld.const.u32 %r5441, [matrix+3728]; - // begin inline asm - dp4a.u32.u32 %r5440, %r5441, %r5762, %r5436; - // end inline asm - ld.const.u32 %r5445, [matrix+3732]; - // begin inline asm - dp4a.u32.u32 %r5444, %r5445, %r5766, %r5440; - // end inline asm - ld.const.u32 %r5449, [matrix+3736]; - // begin inline asm - dp4a.u32.u32 %r5448, %r5449, %r5770, %r5444; - // end inline asm - ld.const.u32 %r5453, [matrix+3740]; - // begin inline asm - dp4a.u32.u32 %r5452, %r5453, %r5774, %r5448; - // end inline asm - ld.const.u32 %r5457, [matrix+3744]; - // begin inline asm - dp4a.u32.u32 %r5456, %r5457, %r5778, %r5452; - // end inline asm - ld.const.u32 %r5461, [matrix+3748]; - // begin inline asm - dp4a.u32.u32 %r5460, %r5461, %r5782, %r5456; - // end inline asm - ld.const.u32 %r5465, [matrix+3752]; - // begin inline asm - dp4a.u32.u32 %r5464, %r5465, %r5786, %r5460; - // end inline asm - ld.const.u32 %r5469, [matrix+3756]; - // begin inline asm - dp4a.u32.u32 %r5468, %r5469, %r5790, %r5464; - // end inline asm - ld.const.u32 %r5473, [matrix+3760]; - // begin inline asm - dp4a.u32.u32 %r5472, %r5473, %r5794, %r5468; - // end inline asm - ld.const.u32 %r5477, [matrix+3764]; - // begin 
inline asm - dp4a.u32.u32 %r5476, %r5477, %r5798, %r5472; - // end inline asm - ld.const.u32 %r5481, [matrix+3768]; - // begin inline asm - dp4a.u32.u32 %r5480, %r5481, %r5802, %r5476; - // end inline asm - ld.const.u32 %r5485, [matrix+3772]; - // begin inline asm - dp4a.u32.u32 %r5484, %r5485, %r5806, %r5480; - // end inline asm - ld.const.u32 %r5489, [matrix+3776]; - // begin inline asm - dp4a.u32.u32 %r5488, %r5489, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5493, [matrix+3780]; - // begin inline asm - dp4a.u32.u32 %r5492, %r5493, %r5750, %r5488; - // end inline asm - ld.const.u32 %r5497, [matrix+3784]; - // begin inline asm - dp4a.u32.u32 %r5496, %r5497, %r5754, %r5492; - // end inline asm - ld.const.u32 %r5501, [matrix+3788]; - // begin inline asm - dp4a.u32.u32 %r5500, %r5501, %r5758, %r5496; - // end inline asm - ld.const.u32 %r5505, [matrix+3792]; - // begin inline asm - dp4a.u32.u32 %r5504, %r5505, %r5762, %r5500; - // end inline asm - ld.const.u32 %r5509, [matrix+3796]; - // begin inline asm - dp4a.u32.u32 %r5508, %r5509, %r5766, %r5504; - // end inline asm - ld.const.u32 %r5513, [matrix+3800]; - // begin inline asm - dp4a.u32.u32 %r5512, %r5513, %r5770, %r5508; - // end inline asm - ld.const.u32 %r5517, [matrix+3804]; - // begin inline asm - dp4a.u32.u32 %r5516, %r5517, %r5774, %r5512; - // end inline asm - ld.const.u32 %r5521, [matrix+3808]; - // begin inline asm - dp4a.u32.u32 %r5520, %r5521, %r5778, %r5516; - // end inline asm - ld.const.u32 %r5525, [matrix+3812]; - // begin inline asm - dp4a.u32.u32 %r5524, %r5525, %r5782, %r5520; - // end inline asm - ld.const.u32 %r5529, [matrix+3816]; - // begin inline asm - dp4a.u32.u32 %r5528, %r5529, %r5786, %r5524; - // end inline asm - ld.const.u32 %r5533, [matrix+3820]; - // begin inline asm - dp4a.u32.u32 %r5532, %r5533, %r5790, %r5528; - // end inline asm - ld.const.u32 %r5537, [matrix+3824]; - // begin inline asm - dp4a.u32.u32 %r5536, %r5537, %r5794, %r5532; - // end inline asm - ld.const.u32 %r5541, [matrix+3828]; - // begin inline asm - dp4a.u32.u32 %r5540, %r5541, %r5798, %r5536; - // end inline asm - ld.const.u32 %r5545, [matrix+3832]; - // begin inline asm - dp4a.u32.u32 %r5544, %r5545, %r5802, %r5540; - // end inline asm - ld.const.u32 %r5549, [matrix+3836]; - // begin inline asm - dp4a.u32.u32 %r5548, %r5549, %r5806, %r5544; - // end inline asm - shr.u32 %r6089, %r5484, 6; - and.b32 %r6090, %r6089, 240; - shr.u32 %r6091, %r5548, 10; - or.b32 %r6092, %r6091, %r6090; - cvt.u64.u32 %rd240, %r6092; - xor.b64 %rd241, %rd206, %rd240; - ld.const.u32 %r5553, [matrix+3840]; - // begin inline asm - dp4a.u32.u32 %r5552, %r5553, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5557, [matrix+3844]; - // begin inline asm - dp4a.u32.u32 %r5556, %r5557, %r5750, %r5552; - // end inline asm - ld.const.u32 %r5561, [matrix+3848]; - // begin inline asm - dp4a.u32.u32 %r5560, %r5561, %r5754, %r5556; - // end inline asm - ld.const.u32 %r5565, [matrix+3852]; - // begin inline asm - dp4a.u32.u32 %r5564, %r5565, %r5758, %r5560; - // end inline asm - ld.const.u32 %r5569, [matrix+3856]; - // begin inline asm - dp4a.u32.u32 %r5568, %r5569, %r5762, %r5564; - // end inline asm - ld.const.u32 %r5573, [matrix+3860]; - // begin inline asm - dp4a.u32.u32 %r5572, %r5573, %r5766, %r5568; - // end inline asm - ld.const.u32 %r5577, [matrix+3864]; - // begin inline asm - dp4a.u32.u32 %r5576, %r5577, %r5770, %r5572; - // end inline asm - ld.const.u32 %r5581, [matrix+3868]; - // begin inline asm - dp4a.u32.u32 %r5580, %r5581, %r5774, %r5576; - // 
end inline asm - ld.const.u32 %r5585, [matrix+3872]; - // begin inline asm - dp4a.u32.u32 %r5584, %r5585, %r5778, %r5580; - // end inline asm - ld.const.u32 %r5589, [matrix+3876]; - // begin inline asm - dp4a.u32.u32 %r5588, %r5589, %r5782, %r5584; - // end inline asm - ld.const.u32 %r5593, [matrix+3880]; - // begin inline asm - dp4a.u32.u32 %r5592, %r5593, %r5786, %r5588; - // end inline asm - ld.const.u32 %r5597, [matrix+3884]; - // begin inline asm - dp4a.u32.u32 %r5596, %r5597, %r5790, %r5592; - // end inline asm - ld.const.u32 %r5601, [matrix+3888]; - // begin inline asm - dp4a.u32.u32 %r5600, %r5601, %r5794, %r5596; - // end inline asm - ld.const.u32 %r5605, [matrix+3892]; - // begin inline asm - dp4a.u32.u32 %r5604, %r5605, %r5798, %r5600; - // end inline asm - ld.const.u32 %r5609, [matrix+3896]; - // begin inline asm - dp4a.u32.u32 %r5608, %r5609, %r5802, %r5604; - // end inline asm - ld.const.u32 %r5613, [matrix+3900]; - // begin inline asm - dp4a.u32.u32 %r5612, %r5613, %r5806, %r5608; - // end inline asm - ld.const.u32 %r5617, [matrix+3904]; - // begin inline asm - dp4a.u32.u32 %r5616, %r5617, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5621, [matrix+3908]; - // begin inline asm - dp4a.u32.u32 %r5620, %r5621, %r5750, %r5616; - // end inline asm - ld.const.u32 %r5625, [matrix+3912]; - // begin inline asm - dp4a.u32.u32 %r5624, %r5625, %r5754, %r5620; - // end inline asm - ld.const.u32 %r5629, [matrix+3916]; - // begin inline asm - dp4a.u32.u32 %r5628, %r5629, %r5758, %r5624; - // end inline asm - ld.const.u32 %r5633, [matrix+3920]; - // begin inline asm - dp4a.u32.u32 %r5632, %r5633, %r5762, %r5628; - // end inline asm - ld.const.u32 %r5637, [matrix+3924]; - // begin inline asm - dp4a.u32.u32 %r5636, %r5637, %r5766, %r5632; - // end inline asm - ld.const.u32 %r5641, [matrix+3928]; - // begin inline asm - dp4a.u32.u32 %r5640, %r5641, %r5770, %r5636; - // end inline asm - ld.const.u32 %r5645, [matrix+3932]; - // begin inline asm - dp4a.u32.u32 %r5644, %r5645, %r5774, %r5640; - // end inline asm - ld.const.u32 %r5649, [matrix+3936]; - // begin inline asm - dp4a.u32.u32 %r5648, %r5649, %r5778, %r5644; - // end inline asm - ld.const.u32 %r5653, [matrix+3940]; - // begin inline asm - dp4a.u32.u32 %r5652, %r5653, %r5782, %r5648; - // end inline asm - ld.const.u32 %r5657, [matrix+3944]; - // begin inline asm - dp4a.u32.u32 %r5656, %r5657, %r5786, %r5652; - // end inline asm - ld.const.u32 %r5661, [matrix+3948]; - // begin inline asm - dp4a.u32.u32 %r5660, %r5661, %r5790, %r5656; - // end inline asm - ld.const.u32 %r5665, [matrix+3952]; - // begin inline asm - dp4a.u32.u32 %r5664, %r5665, %r5794, %r5660; - // end inline asm - ld.const.u32 %r5669, [matrix+3956]; - // begin inline asm - dp4a.u32.u32 %r5668, %r5669, %r5798, %r5664; - // end inline asm - ld.const.u32 %r5673, [matrix+3960]; - // begin inline asm - dp4a.u32.u32 %r5672, %r5673, %r5802, %r5668; - // end inline asm - ld.const.u32 %r5677, [matrix+3964]; - // begin inline asm - dp4a.u32.u32 %r5676, %r5677, %r5806, %r5672; - // end inline asm - shr.u32 %r6093, %r5612, 6; - and.b32 %r6094, %r6093, 240; - shr.u32 %r6095, %r5676, 10; - or.b32 %r6096, %r6095, %r6094; - cvt.u64.u32 %rd242, %r6096; - xor.b64 %rd243, %rd208, %rd242; - ld.const.u32 %r5681, [matrix+3968]; - // begin inline asm - dp4a.u32.u32 %r5680, %r5681, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5685, [matrix+3972]; - // begin inline asm - dp4a.u32.u32 %r5684, %r5685, %r5750, %r5680; - // end inline asm - ld.const.u32 %r5689, [matrix+3976]; - // begin 
inline asm - dp4a.u32.u32 %r5688, %r5689, %r5754, %r5684; - // end inline asm - ld.const.u32 %r5693, [matrix+3980]; - // begin inline asm - dp4a.u32.u32 %r5692, %r5693, %r5758, %r5688; - // end inline asm - ld.const.u32 %r5697, [matrix+3984]; - // begin inline asm - dp4a.u32.u32 %r5696, %r5697, %r5762, %r5692; - // end inline asm - ld.const.u32 %r5701, [matrix+3988]; - // begin inline asm - dp4a.u32.u32 %r5700, %r5701, %r5766, %r5696; - // end inline asm - ld.const.u32 %r5705, [matrix+3992]; - // begin inline asm - dp4a.u32.u32 %r5704, %r5705, %r5770, %r5700; - // end inline asm - ld.const.u32 %r5709, [matrix+3996]; - // begin inline asm - dp4a.u32.u32 %r5708, %r5709, %r5774, %r5704; - // end inline asm - ld.const.u32 %r5713, [matrix+4000]; - // begin inline asm - dp4a.u32.u32 %r5712, %r5713, %r5778, %r5708; - // end inline asm - ld.const.u32 %r5717, [matrix+4004]; - // begin inline asm - dp4a.u32.u32 %r5716, %r5717, %r5782, %r5712; - // end inline asm - ld.const.u32 %r5721, [matrix+4008]; - // begin inline asm - dp4a.u32.u32 %r5720, %r5721, %r5786, %r5716; - // end inline asm - ld.const.u32 %r5725, [matrix+4012]; - // begin inline asm - dp4a.u32.u32 %r5724, %r5725, %r5790, %r5720; - // end inline asm - ld.const.u32 %r5729, [matrix+4016]; - // begin inline asm - dp4a.u32.u32 %r5728, %r5729, %r5794, %r5724; - // end inline asm - ld.const.u32 %r5733, [matrix+4020]; - // begin inline asm - dp4a.u32.u32 %r5732, %r5733, %r5798, %r5728; - // end inline asm - ld.const.u32 %r5737, [matrix+4024]; - // begin inline asm - dp4a.u32.u32 %r5736, %r5737, %r5802, %r5732; - // end inline asm - ld.const.u32 %r5741, [matrix+4028]; - // begin inline asm - dp4a.u32.u32 %r5740, %r5741, %r5806, %r5736; - // end inline asm - ld.const.u32 %r5745, [matrix+4032]; - // begin inline asm - dp4a.u32.u32 %r5744, %r5745, %r5746, %r6244; - // end inline asm - ld.const.u32 %r5749, [matrix+4036]; - // begin inline asm - dp4a.u32.u32 %r5748, %r5749, %r5750, %r5744; - // end inline asm - ld.const.u32 %r5753, [matrix+4040]; - // begin inline asm - dp4a.u32.u32 %r5752, %r5753, %r5754, %r5748; - // end inline asm - ld.const.u32 %r5757, [matrix+4044]; - // begin inline asm - dp4a.u32.u32 %r5756, %r5757, %r5758, %r5752; - // end inline asm - ld.const.u32 %r5761, [matrix+4048]; - // begin inline asm - dp4a.u32.u32 %r5760, %r5761, %r5762, %r5756; - // end inline asm - ld.const.u32 %r5765, [matrix+4052]; - // begin inline asm - dp4a.u32.u32 %r5764, %r5765, %r5766, %r5760; - // end inline asm - ld.const.u32 %r5769, [matrix+4056]; - // begin inline asm - dp4a.u32.u32 %r5768, %r5769, %r5770, %r5764; - // end inline asm - ld.const.u32 %r5773, [matrix+4060]; - // begin inline asm - dp4a.u32.u32 %r5772, %r5773, %r5774, %r5768; - // end inline asm - ld.const.u32 %r5777, [matrix+4064]; - // begin inline asm - dp4a.u32.u32 %r5776, %r5777, %r5778, %r5772; - // end inline asm - ld.const.u32 %r5781, [matrix+4068]; - // begin inline asm - dp4a.u32.u32 %r5780, %r5781, %r5782, %r5776; - // end inline asm - ld.const.u32 %r5785, [matrix+4072]; - // begin inline asm - dp4a.u32.u32 %r5784, %r5785, %r5786, %r5780; - // end inline asm - ld.const.u32 %r5789, [matrix+4076]; - // begin inline asm - dp4a.u32.u32 %r5788, %r5789, %r5790, %r5784; - // end inline asm - ld.const.u32 %r5793, [matrix+4080]; - // begin inline asm - dp4a.u32.u32 %r5792, %r5793, %r5794, %r5788; - // end inline asm - ld.const.u32 %r5797, [matrix+4084]; - // begin inline asm - dp4a.u32.u32 %r5796, %r5797, %r5798, %r5792; - // end inline asm - ld.const.u32 %r5801, [matrix+4088]; - // 
begin inline asm - dp4a.u32.u32 %r5800, %r5801, %r5802, %r5796; - // end inline asm - ld.const.u32 %r5805, [matrix+4092]; - // begin inline asm - dp4a.u32.u32 %r5804, %r5805, %r5806, %r5800; - // end inline asm - shr.u32 %r6097, %r5740, 6; - and.b32 %r6098, %r6097, 240; - shr.u32 %r6099, %r5804, 10; - or.b32 %r6100, %r6099, %r6098; - cvt.u64.u32 %rd244, %r6100; - xor.b64 %rd245, %rd210, %rd244; - shl.b32 %r6101, %r5985, 24; - cvt.u64.u32 %rd246, %r6101; - shl.b32 %r6102, %r5980, 16; - and.b32 %r6103, %r6102, 16711680; - cvt.u64.u32 %rd247, %r6103; - shl.b32 %r6104, %r5975, 8; - and.b32 %r6105, %r6104, 65280; - cvt.u64.u32 %rd248, %r6105; - and.b32 %r6106, %r5970, 255; - cvt.u64.u32 %rd249, %r6106; - shl.b32 %r6107, %r6019, 24; - cvt.u64.u32 %rd250, %r6107; - shl.b32 %r6108, %r6014, 16; - and.b32 %r6109, %r6108, 16711680; - cvt.u64.u32 %rd251, %r6109; - shl.b32 %r6110, %r6009, 8; - and.b32 %r6111, %r6110, 65280; - cvt.u64.u32 %rd252, %r6111; - and.b32 %r6112, %r6004, 255; - cvt.u64.u32 %rd253, %r6112; - shl.b32 %r6113, %r6053, 24; - cvt.u64.u32 %rd254, %r6113; - shl.b32 %r6114, %r6048, 16; - and.b32 %r6115, %r6114, 16711680; - cvt.u64.u32 %rd255, %r6115; - shl.b32 %r6116, %r6043, 8; - and.b32 %r6117, %r6116, 65280; - cvt.u64.u32 %rd256, %r6117; - and.b32 %r6118, %r6038, 255; - cvt.u64.u32 %rd257, %r6118; - shr.u32 %r6119, %r2732, 10; - or.b32 %r6120, %r6119, %r5999; - xor.b32 %r6121, %r10, %r6120; - cvt.u64.u32 %rd258, %r6121; - shl.b64 %rd259, %rd258, 56; - shl.b64 %rd260, %rd216, 48; - and.b64 %rd261, %rd260, 71776119061217280; - or.b64 %rd262, %rd259, %rd261; - shl.b64 %rd263, %rd214, 40; - and.b64 %rd264, %rd263, 280375465082880; - or.b64 %rd265, %rd262, %rd264; - shl.b64 %rd266, %rd212, 32; - and.b64 %rd267, %rd266, 1095216660480; - or.b64 %rd268, %rd265, %rd267; - or.b64 %rd269, %rd268, %rd246; - or.b64 %rd270, %rd269, %rd247; - or.b64 %rd271, %rd270, %rd248; - or.b64 %rd272, %rd271, %rd249; - xor.b64 %rd73, %rd272, 4239941492252378377; - shr.u32 %r6122, %r3756, 10; - or.b32 %r6123, %r6122, %r6033; - xor.b32 %r6124, %r12, %r6123; - cvt.u64.u32 %rd273, %r6124; - shl.b64 %rd274, %rd273, 56; - shl.b64 %rd275, %rd222, 48; - and.b64 %rd276, %rd275, 71776119061217280; - or.b64 %rd277, %rd274, %rd276; - shl.b64 %rd278, %rd220, 40; - and.b64 %rd279, %rd278, 280375465082880; - or.b64 %rd280, %rd277, %rd279; - shl.b64 %rd281, %rd218, 32; - and.b64 %rd282, %rd281, 1095216660480; - or.b64 %rd283, %rd280, %rd282; - or.b64 %rd284, %rd283, %rd250; - or.b64 %rd285, %rd284, %rd251; - or.b64 %rd286, %rd285, %rd252; - or.b64 %rd287, %rd286, %rd253; - xor.b64 %rd484, %rd287, 8746723911537738262; - shr.u32 %r6125, %r4780, 10; - or.b32 %r6126, %r6125, %r6067; - xor.b32 %r6127, %r14, %r6126; - cvt.u64.u32 %rd288, %r6127; - shl.b64 %rd289, %rd288, 56; - shl.b64 %rd290, %rd228, 48; - and.b64 %rd291, %rd290, 71776119061217280; - or.b64 %rd292, %rd289, %rd291; - shl.b64 %rd293, %rd226, 40; - and.b64 %rd294, %rd293, 280375465082880; - or.b64 %rd295, %rd292, %rd294; - shl.b64 %rd296, %rd224, 32; - and.b64 %rd297, %rd296, 1095216660480; - or.b64 %rd298, %rd295, %rd297; - or.b64 %rd299, %rd298, %rd254; - or.b64 %rd300, %rd299, %rd255; - or.b64 %rd301, %rd300, %rd256; - or.b64 %rd302, %rd301, %rd257; - xor.b64 %rd479, %rd302, 8796936657246353646; - shl.b64 %rd303, %rd245, 56; - shl.b64 %rd304, %rd243, 48; - and.b64 %rd305, %rd304, 71776119061217280; - or.b64 %rd306, %rd303, %rd305; - shl.b64 %rd307, %rd241, 40; - and.b64 %rd308, %rd307, 280375465082880; - or.b64 %rd309, %rd306, %rd308; - shl.b64 %rd310, %rd239, 32; 
- and.b64 %rd311, %rd310, 1095216660480; - or.b64 %rd312, %rd309, %rd311; - shl.b64 %rd313, %rd237, 24; - and.b64 %rd314, %rd313, 4278190080; - or.b64 %rd315, %rd312, %rd314; - shl.b64 %rd316, %rd235, 16; - and.b64 %rd317, %rd316, 16711680; - shl.b64 %rd318, %rd231, 8; - and.b64 %rd319, %rd318, 65280; - or.b64 %rd320, %rd315, %rd317; - or.b64 %rd321, %rd320, %rd319; - or.b64 %rd322, %rd321, %rd233; - xor.b64 %rd474, %rd322, 1272090201925444760; - mov.u64 %rd488, 8270816933120786537; - mov.u64 %rd487, -850687345431043546; - mov.u64 %rd486, 8596393687355028144; - mov.u64 %rd485, -4073852189716399785; - mov.u64 %rd483, -4539347866060507718; - mov.u64 %rd482, -3233781605604422593; - mov.u64 %rd481, 570094237299545110; - mov.u64 %rd480, 5171152063242093102; - mov.u64 %rd478, 6782861118970774626; - mov.u64 %rd477, 7812475424661425213; - mov.u64 %rd476, 9119540418498120711; - mov.u64 %rd475, -7873636174015165430; - mov.u64 %rd473, -9207053471590684088; - mov.u64 %rd472, 3370482334374859748; - mov.u64 %rd471, -1544774801229058759; - mov.u64 %rd470, 6096431547456407061; - mov.u64 %rd469, -1792185402154627366; - mov.u64 %rd468, -6864424130110145268; - mov.u64 %rd467, 5690099369266491460; - mov.u64 %rd466, -5074726839974049192; - mov.u64 %rd465, 1592359455985097269; - mov.u64 %rd464, RC; - -$L__BB0_9: - xor.b64 %rd323, %rd488, %rd73; - xor.b64 %rd324, %rd323, %rd487; - xor.b64 %rd325, %rd324, %rd486; - xor.b64 %rd326, %rd325, %rd485; - xor.b64 %rd327, %rd483, %rd484; - xor.b64 %rd328, %rd327, %rd482; - xor.b64 %rd329, %rd328, %rd481; - xor.b64 %rd330, %rd329, %rd480; - xor.b64 %rd331, %rd478, %rd479; - xor.b64 %rd332, %rd331, %rd477; - xor.b64 %rd333, %rd332, %rd476; - xor.b64 %rd334, %rd333, %rd475; - xor.b64 %rd335, %rd473, %rd474; - xor.b64 %rd336, %rd335, %rd472; - xor.b64 %rd337, %rd336, %rd471; - xor.b64 %rd338, %rd337, %rd470; - xor.b64 %rd339, %rd468, %rd469; - xor.b64 %rd340, %rd339, %rd467; - xor.b64 %rd341, %rd340, %rd466; - xor.b64 %rd342, %rd341, %rd465; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6128}, %rd330; - } - { - .reg .b32 %dummy; - mov.b64 {%r6129,%dummy}, %rd330; - } - shf.l.wrap.b32 %r6130, %r6129, %r6128, 1; - shf.l.wrap.b32 %r6131, %r6128, %r6129, 1; - mov.b64 %rd343, {%r6131, %r6130}; - xor.b64 %rd344, %rd342, %rd343; - xor.b64 %rd345, %rd344, %rd73; - xor.b64 %rd346, %rd488, %rd344; - xor.b64 %rd347, %rd487, %rd344; - xor.b64 %rd348, %rd486, %rd344; - xor.b64 %rd349, %rd485, %rd344; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6132}, %rd334; - } - { - .reg .b32 %dummy; - mov.b64 {%r6133,%dummy}, %rd334; - } - shf.l.wrap.b32 %r6134, %r6133, %r6132, 1; - shf.l.wrap.b32 %r6135, %r6132, %r6133, 1; - mov.b64 %rd350, {%r6135, %r6134}; - xor.b64 %rd351, %rd350, %rd326; - xor.b64 %rd352, %rd484, %rd351; - xor.b64 %rd353, %rd483, %rd351; - xor.b64 %rd354, %rd482, %rd351; - xor.b64 %rd355, %rd481, %rd351; - xor.b64 %rd356, %rd480, %rd351; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6136}, %rd338; - } - { - .reg .b32 %dummy; - mov.b64 {%r6137,%dummy}, %rd338; - } - shf.l.wrap.b32 %r6138, %r6137, %r6136, 1; - shf.l.wrap.b32 %r6139, %r6136, %r6137, 1; - mov.b64 %rd357, {%r6139, %r6138}; - xor.b64 %rd358, %rd357, %rd330; - xor.b64 %rd359, %rd479, %rd358; - xor.b64 %rd360, %rd478, %rd358; - xor.b64 %rd361, %rd477, %rd358; - xor.b64 %rd362, %rd476, %rd358; - xor.b64 %rd363, %rd475, %rd358; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6140}, %rd342; - } - { - .reg .b32 %dummy; - mov.b64 {%r6141,%dummy}, %rd342; - } - shf.l.wrap.b32 %r6142, %r6141, %r6140, 1; - shf.l.wrap.b32 %r6143, %r6140, 
%r6141, 1; - mov.b64 %rd364, {%r6143, %r6142}; - xor.b64 %rd365, %rd364, %rd334; - xor.b64 %rd366, %rd474, %rd365; - xor.b64 %rd367, %rd473, %rd365; - xor.b64 %rd368, %rd472, %rd365; - xor.b64 %rd369, %rd471, %rd365; - xor.b64 %rd370, %rd470, %rd365; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6144}, %rd326; - } - { - .reg .b32 %dummy; - mov.b64 {%r6145,%dummy}, %rd326; - } - shf.l.wrap.b32 %r6146, %r6145, %r6144, 1; - shf.l.wrap.b32 %r6147, %r6144, %r6145, 1; - mov.b64 %rd371, {%r6147, %r6146}; - xor.b64 %rd372, %rd338, %rd371; - xor.b64 %rd373, %rd469, %rd372; - xor.b64 %rd374, %rd468, %rd372; - xor.b64 %rd375, %rd467, %rd372; - xor.b64 %rd376, %rd466, %rd372; - xor.b64 %rd377, %rd465, %rd372; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6148}, %rd352; - } - { - .reg .b32 %dummy; - mov.b64 {%r6149,%dummy}, %rd352; - } - shf.l.wrap.b32 %r6150, %r6149, %r6148, 1; - shf.l.wrap.b32 %r6151, %r6148, %r6149, 1; - mov.b64 %rd378, {%r6151, %r6150}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6152}, %rd347; - } - { - .reg .b32 %dummy; - mov.b64 {%r6153,%dummy}, %rd347; - } - shf.l.wrap.b32 %r6154, %r6153, %r6152, 3; - shf.l.wrap.b32 %r6155, %r6152, %r6153, 3; - mov.b64 %rd379, {%r6155, %r6154}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6156}, %rd360; - } - { - .reg .b32 %dummy; - mov.b64 {%r6157,%dummy}, %rd360; - } - shf.l.wrap.b32 %r6158, %r6157, %r6156, 6; - shf.l.wrap.b32 %r6159, %r6156, %r6157, 6; - mov.b64 %rd380, {%r6159, %r6158}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6160}, %rd354; - } - { - .reg .b32 %dummy; - mov.b64 {%r6161,%dummy}, %rd354; - } - shf.l.wrap.b32 %r6162, %r6161, %r6160, 10; - shf.l.wrap.b32 %r6163, %r6160, %r6161, 10; - mov.b64 %rd381, {%r6163, %r6162}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6164}, %rd362; - } - { - .reg .b32 %dummy; - mov.b64 {%r6165,%dummy}, %rd362; - } - shf.l.wrap.b32 %r6166, %r6165, %r6164, 15; - shf.l.wrap.b32 %r6167, %r6164, %r6165, 15; - mov.b64 %rd382, {%r6167, %r6166}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6168}, %rd369; - } - { - .reg .b32 %dummy; - mov.b64 {%r6169,%dummy}, %rd369; - } - shf.l.wrap.b32 %r6170, %r6169, %r6168, 21; - shf.l.wrap.b32 %r6171, %r6168, %r6169, 21; - mov.b64 %rd383, {%r6171, %r6170}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6172}, %rd366; - } - { - .reg .b32 %dummy; - mov.b64 {%r6173,%dummy}, %rd366; - } - shf.l.wrap.b32 %r6174, %r6173, %r6172, 28; - shf.l.wrap.b32 %r6175, %r6172, %r6173, 28; - mov.b64 %rd384, {%r6175, %r6174}; - { - .reg .b32 %dummy; - mov.b64 {%r6176,%dummy}, %rd346; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6177}, %rd346; - } - shf.r.wrap.b32 %r6178, %r6177, %r6176, 28; - shf.r.wrap.b32 %r6179, %r6176, %r6177, 28; - mov.b64 %rd385, {%r6179, %r6178}; - { - .reg .b32 %dummy; - mov.b64 {%r6180,%dummy}, %rd355; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6181}, %rd355; - } - shf.r.wrap.b32 %r6182, %r6181, %r6180, 19; - shf.r.wrap.b32 %r6183, %r6180, %r6181, 19; - mov.b64 %rd386, {%r6183, %r6182}; - { - .reg .b32 %dummy; - mov.b64 {%r6184,%dummy}, %rd367; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6185}, %rd367; - } - shf.r.wrap.b32 %r6186, %r6185, %r6184, 9; - shf.r.wrap.b32 %r6187, %r6184, %r6185, 9; - mov.b64 %rd387, {%r6187, %r6186}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6188}, %rd356; - } - { - .reg .b32 %dummy; - mov.b64 {%r6189,%dummy}, %rd356; - } - shf.l.wrap.b32 %r6190, %r6189, %r6188, 2; - shf.l.wrap.b32 %r6191, %r6188, %r6189, 2; - mov.b64 %rd388, {%r6191, %r6190}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6192}, %rd377; - } - { - .reg .b32 %dummy; 
- mov.b64 {%r6193,%dummy}, %rd377; - } - shf.l.wrap.b32 %r6194, %r6193, %r6192, 14; - shf.l.wrap.b32 %r6195, %r6192, %r6193, 14; - mov.b64 %rd389, {%r6195, %r6194}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6196}, %rd373; - } - { - .reg .b32 %dummy; - mov.b64 {%r6197,%dummy}, %rd373; - } - shf.l.wrap.b32 %r6198, %r6197, %r6196, 27; - shf.l.wrap.b32 %r6199, %r6196, %r6197, 27; - mov.b64 %rd390, {%r6199, %r6198}; - { - .reg .b32 %dummy; - mov.b64 {%r6200,%dummy}, %rd348; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6201}, %rd348; - } - shf.r.wrap.b32 %r6202, %r6201, %r6200, 23; - shf.r.wrap.b32 %r6203, %r6200, %r6201, 23; - mov.b64 %rd391, {%r6203, %r6202}; - { - .reg .b32 %dummy; - mov.b64 {%r6204,%dummy}, %rd370; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6205}, %rd370; - } - shf.r.wrap.b32 %r6206, %r6205, %r6204, 8; - shf.r.wrap.b32 %r6207, %r6204, %r6205, 8; - mov.b64 %rd392, {%r6207, %r6206}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6208}, %rd376; - } - { - .reg .b32 %dummy; - mov.b64 {%r6209,%dummy}, %rd376; - } - shf.l.wrap.b32 %r6210, %r6209, %r6208, 8; - shf.l.wrap.b32 %r6211, %r6208, %r6209, 8; - mov.b64 %rd393, {%r6211, %r6210}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6212}, %rd368; - } - { - .reg .b32 %dummy; - mov.b64 {%r6213,%dummy}, %rd368; - } - shf.l.wrap.b32 %r6214, %r6213, %r6212, 25; - shf.l.wrap.b32 %r6215, %r6212, %r6213, 25; - mov.b64 %rd394, {%r6215, %r6214}; - { - .reg .b32 %dummy; - mov.b64 {%r6216,%dummy}, %rd361; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6217}, %rd361; - } - shf.r.wrap.b32 %r6218, %r6217, %r6216, 21; - shf.r.wrap.b32 %r6219, %r6216, %r6217, 21; - mov.b64 %rd395, {%r6219, %r6218}; - { - .reg .b32 %dummy; - mov.b64 {%r6220,%dummy}, %rd359; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6221}, %rd359; - } - shf.r.wrap.b32 %r6222, %r6221, %r6220, 2; - shf.r.wrap.b32 %r6223, %r6220, %r6221, 2; - mov.b64 %rd396, {%r6223, %r6222}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6224}, %rd349; - } - { - .reg .b32 %dummy; - mov.b64 {%r6225,%dummy}, %rd349; - } - shf.l.wrap.b32 %r6226, %r6225, %r6224, 18; - shf.l.wrap.b32 %r6227, %r6224, %r6225, 18; - mov.b64 %rd397, {%r6227, %r6226}; - { - .reg .b32 %dummy; - mov.b64 {%r6228,%dummy}, %rd375; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6229}, %rd375; - } - shf.r.wrap.b32 %r6230, %r6229, %r6228, 25; - shf.r.wrap.b32 %r6231, %r6228, %r6229, 25; - mov.b64 %rd398, {%r6231, %r6230}; - { - .reg .b32 %dummy; - mov.b64 {%r6232,%dummy}, %rd363; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6233}, %rd363; - } - shf.r.wrap.b32 %r6234, %r6233, %r6232, 3; - shf.r.wrap.b32 %r6235, %r6232, %r6233, 3; - mov.b64 %rd399, {%r6235, %r6234}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6236}, %rd374; - } - { - .reg .b32 %dummy; - mov.b64 {%r6237,%dummy}, %rd374; - } - shf.l.wrap.b32 %r6238, %r6237, %r6236, 20; - shf.l.wrap.b32 %r6239, %r6236, %r6237, 20; - mov.b64 %rd400, {%r6239, %r6238}; - { - .reg .b32 %dummy; - mov.b64 {%r6240,%dummy}, %rd353; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6241}, %rd353; - } - shf.r.wrap.b32 %r6242, %r6241, %r6240, 20; - shf.r.wrap.b32 %r6243, %r6240, %r6241, 20; - mov.b64 %rd401, {%r6243, %r6242}; - not.b64 %rd402, %rd401; - and.b64 %rd403, %rd395, %rd402; - xor.b64 %rd404, %rd403, %rd345; - not.b64 %rd405, %rd395; - and.b64 %rd406, %rd383, %rd405; - xor.b64 %rd484, %rd406, %rd401; - not.b64 %rd407, %rd383; - and.b64 %rd408, %rd389, %rd407; - xor.b64 %rd479, %rd408, %rd395; - not.b64 %rd409, %rd389; - and.b64 %rd410, %rd345, %rd409; - xor.b64 %rd474, %rd410, 
%rd383; - not.b64 %rd411, %rd345; - and.b64 %rd412, %rd401, %rd411; - xor.b64 %rd469, %rd389, %rd412; - not.b64 %rd413, %rd400; - and.b64 %rd414, %rd379, %rd413; - xor.b64 %rd488, %rd414, %rd384; - not.b64 %rd415, %rd379; - and.b64 %rd416, %rd386, %rd415; - xor.b64 %rd483, %rd416, %rd400; - not.b64 %rd417, %rd386; - and.b64 %rd418, %rd399, %rd417; - xor.b64 %rd478, %rd418, %rd379; - not.b64 %rd419, %rd399; - and.b64 %rd420, %rd384, %rd419; - xor.b64 %rd473, %rd420, %rd386; - not.b64 %rd421, %rd384; - and.b64 %rd422, %rd400, %rd421; - xor.b64 %rd468, %rd399, %rd422; - not.b64 %rd423, %rd380; - and.b64 %rd424, %rd394, %rd423; - xor.b64 %rd487, %rd424, %rd378; - not.b64 %rd425, %rd394; - and.b64 %rd426, %rd393, %rd425; - xor.b64 %rd482, %rd426, %rd380; - not.b64 %rd427, %rd393; - and.b64 %rd428, %rd397, %rd427; - xor.b64 %rd477, %rd428, %rd394; - not.b64 %rd429, %rd397; - and.b64 %rd430, %rd378, %rd429; - xor.b64 %rd472, %rd430, %rd393; - not.b64 %rd431, %rd378; - and.b64 %rd432, %rd380, %rd431; - xor.b64 %rd467, %rd397, %rd432; - not.b64 %rd433, %rd385; - and.b64 %rd434, %rd381, %rd433; - xor.b64 %rd486, %rd434, %rd390; - not.b64 %rd435, %rd381; - and.b64 %rd436, %rd382, %rd435; - xor.b64 %rd481, %rd436, %rd385; - not.b64 %rd437, %rd382; - and.b64 %rd438, %rd392, %rd437; - xor.b64 %rd476, %rd438, %rd381; - not.b64 %rd439, %rd392; - and.b64 %rd440, %rd390, %rd439; - xor.b64 %rd471, %rd440, %rd382; - not.b64 %rd441, %rd390; - and.b64 %rd442, %rd385, %rd441; - xor.b64 %rd466, %rd392, %rd442; - not.b64 %rd443, %rd387; - and.b64 %rd444, %rd398, %rd443; - xor.b64 %rd485, %rd444, %rd396; - not.b64 %rd445, %rd398; - and.b64 %rd446, %rd391, %rd445; - xor.b64 %rd480, %rd446, %rd387; - not.b64 %rd447, %rd391; - and.b64 %rd448, %rd388, %rd447; - xor.b64 %rd475, %rd448, %rd398; - not.b64 %rd449, %rd388; - and.b64 %rd450, %rd396, %rd449; - xor.b64 %rd470, %rd450, %rd391; - not.b64 %rd451, %rd396; - and.b64 %rd452, %rd387, %rd451; - xor.b64 %rd465, %rd388, %rd452; - ld.global.nc.u64 %rd453, [%rd464]; - xor.b64 %rd73, %rd404, %rd453; - add.s64 %rd464, %rd464, 8; - add.s32 %r6244, %r6244, 1; - setp.ne.s32 %p11, %r6244, 24; - @%p11 bra $L__BB0_9; - - ld.const.u64 %rd75, [target+24]; - setp.eq.s64 %p12, %rd474, %rd75; - @%p12 bra $L__BB0_12; - bra.uni $L__BB0_11; - -$L__BB0_12: - ld.const.u64 %rd76, [target+16]; - setp.eq.s64 %p13, %rd479, %rd76; - @%p13 bra $L__BB0_14; - bra.uni $L__BB0_13; - -$L__BB0_14: - ld.const.u64 %rd77, [target+8]; - setp.eq.s64 %p14, %rd484, %rd77; - @%p14 bra $L__BB0_16; - bra.uni $L__BB0_15; - -$L__BB0_16: - ld.const.u64 %rd454, [target]; - setp.lt.u64 %p16, %rd73, %rd454; - bra.uni $L__BB0_17; - -$L__BB0_11: - setp.lt.u64 %p16, %rd474, %rd75; - bra.uni $L__BB0_17; - -$L__BB0_13: - setp.lt.u64 %p16, %rd479, %rd76; - bra.uni $L__BB0_17; - -$L__BB0_15: - setp.lt.u64 %p16, %rd484, %rd77; - -$L__BB0_17: - not.pred %p15, %p16; - @%p15 bra $L__BB0_19; - - ld.param.u64 %rd462, [heavy_hash_param_0]; - ld.param.u64 %rd461, [heavy_hash_param_1]; - and.b64 %rd460, %rd463, %rd462; - or.b64 %rd459, %rd460, %rd461; - ld.param.u64 %rd458, [heavy_hash_param_5]; - cvta.to.global.u64 %rd457, %rd458; - mov.u64 %rd455, 0; - atom.global.cas.b64 %rd456, [%rd457], %rd455, %rd459; - -$L__BB0_19: - ret; - -} - diff --git a/plugins/cuda/src/lib.rs b/plugins/cuda/src/lib.rs index 8d056b2..942f787 100644 --- a/plugins/cuda/src/lib.rs +++ b/plugins/cuda/src/lib.rs @@ -1,164 +1,164 @@ -#[macro_use] -extern crate karlsen_miner; - -use clap::{ArgMatches, FromArgMatches}; -use cust::prelude::*; -use 
karlsen_miner::{Plugin, Worker, WorkerSpec};
-use log::LevelFilter;
-use std::error::Error as StdError;
-#[cfg(feature = "overclock")]
-use {
-    log::{error, info},
-    nvml_wrapper::Device as NvmlDevice,
-    nvml_wrapper::Nvml,
-};
-
-pub type Error = Box<dyn StdError + Send + Sync>;
-
-mod cli;
-mod worker;
-
-use crate::cli::{CudaOpt, NonceGenEnum};
-use crate::worker::CudaGPUWorker;
-
-const DEFAULT_WORKLOAD_SCALE: f32 = 1024.;
-
-pub struct CudaPlugin {
-    specs: Vec<CudaWorkerSpec>,
-    #[cfg(feature = "overclock")]
-    nvml_instance: Nvml,
-    _enabled: bool,
-}
-
-impl CudaPlugin {
-    fn new() -> Result<Self, Error> {
-        cust::init(CudaFlags::empty())?;
-        env_logger::builder().filter_level(LevelFilter::Info).parse_default_env().init();
-        Ok(Self {
-            specs: Vec::new(),
-            _enabled: false,
-            #[cfg(feature = "overclock")]
-            nvml_instance: Nvml::init()?,
-        })
-    }
-}
-
-impl Plugin for CudaPlugin {
-    fn name(&self) -> &'static str {
-        "CUDA Worker"
-    }
-
-    fn enabled(&self) -> bool {
-        self._enabled
-    }
-
-    fn get_worker_specs(&self) -> Vec<Box<dyn WorkerSpec>> {
-        self.specs.iter().map(|spec| Box::new(*spec) as Box<dyn WorkerSpec>).collect::<Vec<Box<dyn WorkerSpec>>>()
-    }
-
-    //noinspection RsTypeCheck
-    fn process_option(&mut self, matches: &ArgMatches) -> Result<usize, Error> {
-        let opts: CudaOpt = CudaOpt::from_arg_matches(matches)?;
-
-        self._enabled = !opts.cuda_disable;
-        if self._enabled {
-            let gpus: Vec<u16> = match &opts.cuda_device {
-                Some(devices) => devices.clone(),
-                None => {
-                    let gpu_count = Device::num_devices().unwrap() as u16;
-                    (0..gpu_count).collect()
-                }
-            };
-
-            // if any of cuda_lock_core_clocks / cuda_lock_mem_clocks / cuda_power_limit is valid, init nvml and try to apply
-            #[cfg(feature = "overclock")]
-            if opts.overclock.cuda_lock_core_clocks.is_some()
-                || opts.overclock.cuda_lock_mem_clocks.is_some()
-                || opts.overclock.cuda_power_limits.is_some()
-            {
-                for i in 0..gpus.len() {
-                    let lock_mem_clock: Option<u32> = match &opts.overclock.cuda_lock_mem_clocks {
-                        Some(mem_clocks) if i < mem_clocks.len() => Some(mem_clocks[i]),
-                        Some(mem_clocks) if !mem_clocks.is_empty() => Some(*mem_clocks.last().unwrap()),
-                        _ => None,
-                    };
-
-                    let lock_core_clock: Option<u32> = match &opts.overclock.cuda_lock_core_clocks {
-                        Some(core_clocks) if i < core_clocks.len() => Some(core_clocks[i]),
-                        Some(core_clocks) if !core_clocks.is_empty() => Some(*core_clocks.last().unwrap()),
-                        _ => None,
-                    };
-
-                    let power_limit: Option<u32> = match &opts.overclock.cuda_power_limits {
-                        Some(power_limits) if i < power_limits.len() => Some(power_limits[i]),
-                        Some(power_limits) if !power_limits.is_empty() => Some(*power_limits.last().unwrap()),
-                        _ => None,
-                    };
-
-                    let mut nvml_device: NvmlDevice = self.nvml_instance.device_by_index(gpus[i] as u32)?;
-
-                    if let Some(lmc) = lock_mem_clock {
-                        match nvml_device.set_mem_locked_clocks(lmc, lmc) {
-                            Err(e) => error!("set mem locked clocks {:?}", e),
-                            _ => info!("GPU #{} #{} lock mem clock at {} Mhz", i, &nvml_device.name()?, &lmc),
-                        };
-                    }
-
-                    if let Some(lcc) = lock_core_clock {
-                        match nvml_device.set_gpu_locked_clocks(lcc, lcc) {
-                            Err(e) => error!("set gpu locked clocks {:?}", e),
-                            _ => info!("GPU #{} #{} lock core clock at {} Mhz", i, &nvml_device.name()?, &lcc),
-                        };
-                    };
-
-                    if let Some(pl) = power_limit {
-                        match nvml_device.set_power_management_limit(pl * 1000) {
-                            Err(e) => error!("set power limit {:?}", e),
-                            _ => info!("GPU #{} #{} power limit at {} W", i, &nvml_device.name()?, &pl),
-                        };
-                    };
-                }
-            }
-
-            self.specs = (0..gpus.len())
-                .map(|i| CudaWorkerSpec {
-                    device_id: gpus[i] as u32,
-                    workload: match &opts.cuda_workload {
-                        Some(workload) if i < workload.len() => workload[i],
-                        Some(workload) if !workload.is_empty() => *workload.last().unwrap(),
-                        _ => DEFAULT_WORKLOAD_SCALE,
-                    },
-                    is_absolute: opts.cuda_workload_absolute,
-                    blocking_sync: !opts.cuda_no_blocking_sync,
-                    random: opts.cuda_nonce_gen,
-                })
-                .collect();
-        }
-        Ok(self.specs.len())
-    }
-}
-
-#[derive(Copy, Clone)]
-struct CudaWorkerSpec {
-    device_id: u32,
-    workload: f32,
-    is_absolute: bool,
-    blocking_sync: bool,
-    random: NonceGenEnum,
-}
-
-impl WorkerSpec for CudaWorkerSpec {
-    fn id(&self) -> String {
-        let device = Device::get_device(self.device_id).unwrap();
-        format!("#{} ({})", self.device_id, device.name().unwrap())
-    }
-
-    fn build(&self) -> Box<dyn Worker> {
-        Box::new(
-            CudaGPUWorker::new(self.device_id, self.workload, self.is_absolute, self.blocking_sync, self.random)
-                .unwrap(),
-        )
-    }
-}
-
-declare_plugin!(CudaPlugin, CudaPlugin::new, CudaOpt);
+#[macro_use]
+extern crate karlsen_miner;
+
+use clap::{ArgMatches, FromArgMatches};
+use cust::prelude::*;
+use karlsen_miner::{Plugin, Worker, WorkerSpec};
+use log::LevelFilter;
+use std::error::Error as StdError;
+#[cfg(feature = "overclock")]
+use {
+    log::{error, info},
+    nvml_wrapper::Device as NvmlDevice,
+    nvml_wrapper::Nvml,
+};
+
+pub type Error = Box<dyn StdError + Send + Sync>;
+
+mod cli;
+mod worker;
+
+use crate::cli::{CudaOpt, NonceGenEnum};
+use crate::worker::CudaGPUWorker;
+
+const DEFAULT_WORKLOAD_SCALE: f32 = 1024.;
+
+pub struct CudaPlugin {
+    specs: Vec<CudaWorkerSpec>,
+    #[cfg(feature = "overclock")]
+    nvml_instance: Nvml,
+    _enabled: bool,
+}
+
+impl CudaPlugin {
+    fn new() -> Result<Self, Error> {
+        cust::init(CudaFlags::empty())?;
+        env_logger::builder().filter_level(LevelFilter::Info).parse_default_env().init();
+        Ok(Self {
+            specs: Vec::new(),
+            _enabled: false,
+            #[cfg(feature = "overclock")]
+            nvml_instance: Nvml::init()?,
+        })
+    }
+}
+
+impl Plugin for CudaPlugin {
+    fn name(&self) -> &'static str {
+        "CUDA Worker"
+    }
+
+    fn enabled(&self) -> bool {
+        self._enabled
+    }
+
+    fn get_worker_specs(&self) -> Vec<Box<dyn WorkerSpec>> {
+        self.specs.iter().map(|spec| Box::new(*spec) as Box<dyn WorkerSpec>).collect::<Vec<Box<dyn WorkerSpec>>>()
+    }
+
+    //noinspection RsTypeCheck
+    fn process_option(&mut self, matches: &ArgMatches) -> Result<usize, Error> {
+        let opts: CudaOpt = CudaOpt::from_arg_matches(matches)?;
+
+        self._enabled = !opts.cuda_disable;
+        if self._enabled {
+            let gpus: Vec<u16> = match &opts.cuda_device {
+                Some(devices) => devices.clone(),
+                None => {
+                    let gpu_count = Device::num_devices().unwrap() as u16;
+                    (0..gpu_count).collect()
+                }
+            };
+
+            // if any of cuda_lock_core_clocks / cuda_lock_mem_clocks / cuda_power_limits is set, init NVML and try to apply it
+            #[cfg(feature = "overclock")]
+            if opts.overclock.cuda_lock_core_clocks.is_some()
+                || opts.overclock.cuda_lock_mem_clocks.is_some()
+                || opts.overclock.cuda_power_limits.is_some()
+            {
+                for i in 0..gpus.len() {
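+                    // NOTE: clock and power values are matched to GPUs by index; a shorter list repeats its last entry for the remaining devices.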
Some(workload) if !workload.is_empty() => *workload.last().unwrap(), - _ => DEFAULT_WORKLOAD_SCALE, - }, - is_absolute: opts.cuda_workload_absolute, - blocking_sync: !opts.cuda_no_blocking_sync, - random: opts.cuda_nonce_gen, - }) - .collect(); - } - Ok(self.specs.len()) - } -} - -#[derive(Copy, Clone)] -struct CudaWorkerSpec { - device_id: u32, - workload: f32, - is_absolute: bool, - blocking_sync: bool, - random: NonceGenEnum, -} - -impl WorkerSpec for CudaWorkerSpec { - fn id(&self) -> String { - let device = Device::get_device(self.device_id).unwrap(); - format!("#{} ({})", self.device_id, device.name().unwrap()) - } - - fn build(&self) -> Box { - Box::new( - CudaGPUWorker::new(self.device_id, self.workload, self.is_absolute, self.blocking_sync, self.random) - .unwrap(), - ) - } -} - -declare_plugin!(CudaPlugin, CudaPlugin::new, CudaOpt); +#[macro_use] +extern crate karlsen_miner; + +use clap::{ArgMatches, FromArgMatches}; +use cust::prelude::*; +use karlsen_miner::{Plugin, Worker, WorkerSpec}; +use log::LevelFilter; +use std::error::Error as StdError; +#[cfg(feature = "overclock")] +use { + log::{error, info}, + nvml_wrapper::Device as NvmlDevice, + nvml_wrapper::Nvml, +}; + +pub type Error = Box; + +mod cli; +mod worker; + +use crate::cli::{CudaOpt, NonceGenEnum}; +use crate::worker::CudaGPUWorker; + +const DEFAULT_WORKLOAD_SCALE: f32 = 1024.; + +pub struct CudaPlugin { + specs: Vec, + #[cfg(feature = "overclock")] + nvml_instance: Nvml, + _enabled: bool, +} + +impl CudaPlugin { + fn new() -> Result { + cust::init(CudaFlags::empty())?; + env_logger::builder().filter_level(LevelFilter::Info).parse_default_env().init(); + Ok(Self { + specs: Vec::new(), + _enabled: false, + #[cfg(feature = "overclock")] + nvml_instance: Nvml::init()?, + }) + } +} + +impl Plugin for CudaPlugin { + fn name(&self) -> &'static str { + "CUDA Worker" + } + + fn enabled(&self) -> bool { + self._enabled + } + + fn get_worker_specs(&self) -> Vec> { + self.specs.iter().map(|spec| Box::new(*spec) as Box).collect::>>() + } + + //noinspection RsTypeCheck + fn process_option(&mut self, matches: &ArgMatches) -> Result { + let opts: CudaOpt = CudaOpt::from_arg_matches(matches)?; + + self._enabled = !opts.cuda_disable; + if self._enabled { + let gpus: Vec = match &opts.cuda_device { + Some(devices) => devices.clone(), + None => { + let gpu_count = Device::num_devices().unwrap() as u16; + (0..gpu_count).collect() + } + }; + + // if any of cuda_lock_core_clocks / cuda_lock_mem_clocks / cuda_power_limit is valid, init nvml and try to apply + #[cfg(feature = "overclock")] + if opts.overclock.cuda_lock_core_clocks.is_some() + || opts.overclock.cuda_lock_mem_clocks.is_some() + || opts.overclock.cuda_power_limits.is_some() + { + for i in 0..gpus.len() { + let lock_mem_clock: Option = match &opts.overclock.cuda_lock_mem_clocks { + Some(mem_clocks) if i < mem_clocks.len() => Some(mem_clocks[i]), + Some(mem_clocks) if !mem_clocks.is_empty() => Some(*mem_clocks.last().unwrap()), + _ => None, + }; + + let lock_core_clock: Option = match &opts.overclock.cuda_lock_core_clocks { + Some(core_clocks) if i < core_clocks.len() => Some(core_clocks[i]), + Some(core_clocks) if !core_clocks.is_empty() => Some(*core_clocks.last().unwrap()), + _ => None, + }; + + let power_limit: Option = match &opts.overclock.cuda_power_limits { + Some(power_limits) if i < power_limits.len() => Some(power_limits[i]), + Some(power_limits) if !power_limits.is_empty() => Some(*power_limits.last().unwrap()), + _ => None, + }; + + let mut nvml_device: NvmlDevice = 
+            self.specs = (0..gpus.len())
+                .map(|i| CudaWorkerSpec {
+                    device_id: gpus[i] as u32,
+                    workload: match &opts.cuda_workload {
+                        Some(workload) if i < workload.len() => workload[i],
+                        Some(workload) if !workload.is_empty() => *workload.last().unwrap(),
+                        _ => DEFAULT_WORKLOAD_SCALE,
+                    },
+                    is_absolute: opts.cuda_workload_absolute,
+                    blocking_sync: !opts.cuda_no_blocking_sync,
+                    random: opts.cuda_nonce_gen,
+                })
+                .collect();
+        }
+        Ok(self.specs.len())
+    }
+}
+
+#[derive(Copy, Clone)]
+struct CudaWorkerSpec {
+    device_id: u32,
+    workload: f32,
+    is_absolute: bool,
+    blocking_sync: bool,
+    random: NonceGenEnum,
+}
+
+impl WorkerSpec for CudaWorkerSpec {
+    fn id(&self) -> String {
+        let device = Device::get_device(self.device_id).unwrap();
+        format!("#{} ({})", self.device_id, device.name().unwrap())
+    }
+
+    fn build(&self) -> Box<dyn Worker> {
+        Box::new(
+            CudaGPUWorker::new(self.device_id, self.workload, self.is_absolute, self.blocking_sync, self.random)
+                .unwrap(),
+        )
+    }
+}
+
+declare_plugin!(CudaPlugin, CudaPlugin::new, CudaOpt);
diff --git a/plugins/cuda/src/worker.rs b/plugins/cuda/src/worker.rs
index 9f7d772..cd5864b 100644
--- a/plugins/cuda/src/worker.rs
+++ b/plugins/cuda/src/worker.rs
@@ -1,251 +1,646 @@
-use crate::{Error, NonceGenEnum};
-use cust::context::CurrentContext;
-use cust::device::DeviceAttribute;
-use cust::function::Function;
-use cust::module::{ModuleJitOption, OptLevel};
-use cust::prelude::*;
-use karlsen_miner::xoshiro256starstar::Xoshiro256StarStar;
-use karlsen_miner::Worker;
-use log::{error, info};
-use rand::{Fill, RngCore};
-use std::ffi::CString;
-use std::sync::{Arc, Weak};
-
-static BPS: f32 = 1.;
-
-static PTX_86: &str = include_str!("../resources/kaspa-cuda-sm86.ptx");
-static PTX_75: &str = include_str!("../resources/kaspa-cuda-sm75.ptx");
-static PTX_61: &str = include_str!("../resources/kaspa-cuda-sm61.ptx");
-static PTX_30: &str = include_str!("../resources/kaspa-cuda-sm30.ptx");
-static PTX_20: &str = include_str!("../resources/kaspa-cuda-sm20.ptx");
-
-pub struct Kernel<'kernel> {
-    func: Arc<Function<'kernel>>,
-    block_size: u32,
-    grid_size: u32,
-}
-
-impl<'kernel> Kernel<'kernel> {
-    pub fn new(module: Weak<Module>, name: &'kernel str) -> Result<Kernel<'kernel>, Error> {
-        let func = Arc::new(unsafe {
-            module.as_ptr().as_ref().unwrap().get_function(name).map_err(|e| {
-                error!("Error loading function: {}", e);
-                e
-            })?
-        });
-        let (_, block_size) = func.suggested_launch_configuration(0, 0.into())?;
-
-        let device = CurrentContext::get_device()?;
-        let sm_count = device.get_attribute(DeviceAttribute::MultiprocessorCount)? as u32;
-        let grid_size = sm_count * func.max_active_blocks_per_multiprocessor(block_size.into(), 0)?;
-
-        Ok(Self { func, block_size, grid_size })
-    }
-
-    pub fn get_workload(&self) -> u32 {
-        self.block_size * self.grid_size
-    }
-
-    pub fn set_workload(&mut self, workload: u32) {
-        self.grid_size = (workload + self.block_size - 1) / self.block_size
-    }
-}
-
-pub struct CudaGPUWorker<'gpu> {
-    // NOTE: The order is important! context must be closed last
-    heavy_hash_kernel: Kernel<'gpu>,
-    stream: Stream,
-    start_event: Event,
-    stop_event: Event,
-    _module: Arc<Module>,
-
-    rand_state: DeviceBuffer<u64>,
-    final_nonce_buff: DeviceBuffer<u64>,
-
-    device_id: u32,
-    pub workload: usize,
-    _context: Context,
-
-    random: NonceGenEnum,
-}
-
-impl<'gpu> Worker for CudaGPUWorker<'gpu> {
-    fn id(&self) -> String {
-        let device = CurrentContext::get_device().unwrap();
-        format!("#{} ({})", self.device_id, device.name().unwrap())
-    }
-
-    fn load_block_constants(&mut self, hash_header: &[u8; 72], matrix: &[[u16; 64]; 64], target: &[u64; 4]) {
-        let u8matrix: Arc<[[u8; 64]; 64]> = Arc::new(matrix.map(|row| row.map(|v| v as u8)));
-        let mut hash_header_gpu = self._module.get_global::<[u8; 72]>(&CString::new("hash_header").unwrap()).unwrap();
-        hash_header_gpu.copy_from(hash_header).map_err(|e| e.to_string()).unwrap();
-
-        let mut matrix_gpu = self._module.get_global::<[[u8; 64]; 64]>(&CString::new("matrix").unwrap()).unwrap();
-        matrix_gpu.copy_from(&u8matrix).map_err(|e| e.to_string()).unwrap();
-
-        let mut target_gpu = self._module.get_global::<[u64; 4]>(&CString::new("target").unwrap()).unwrap();
-        target_gpu.copy_from(target).map_err(|e| e.to_string()).unwrap();
-    }
-
-    #[inline(always)]
-    fn calculate_hash(&mut self, _nonces: Option<&Vec<u64>>, nonce_mask: u64, nonce_fixed: u64) {
-        let func = &self.heavy_hash_kernel.func;
-        let stream = &self.stream;
-        let random: u8 = match self.random {
-            NonceGenEnum::Lean => {
-                self.rand_state.copy_from(&[rand::thread_rng().next_u64()]).unwrap();
-                0
-            }
-            NonceGenEnum::Xoshiro => 1,
-        };
-
-        self.start_event.record(stream).unwrap();
-        unsafe {
-            launch!(
-                func<<<
-                    self.heavy_hash_kernel.grid_size, self.heavy_hash_kernel.block_size,
-                    0, stream
-                >>>(
-                    nonce_mask, nonce_fixed,
-                    self.workload,
-                    random,
-                    self.rand_state.as_device_ptr(),
-                    self.final_nonce_buff.as_device_ptr()
-                )
-            )
-            .unwrap(); // We see errors in sync
-        }
-        self.stop_event.record(stream).unwrap();
-    }
-
-    #[inline(always)]
-    fn sync(&self) -> Result<(), Error> {
-        //self.stream.synchronize()?;
-        self.stop_event.synchronize()?;
-        if self.stop_event.elapsed_time_f32(&self.start_event)? > 1000. / BPS {
-            return Err("Cuda takes longer then block rate. Please reduce your workload.".into());
-        }
-        Ok(())
-    }
-
-    fn get_workload(&self) -> usize {
-        self.workload
-    }
-
-    #[inline(always)]
-    fn copy_output_to(&mut self, nonces: &mut Vec<u64>) -> Result<(), Error> {
-        self.final_nonce_buff.copy_to(nonces)?;
-        Ok(())
-    }
-}
-
-impl<'gpu> CudaGPUWorker<'gpu> {
-    pub fn new(
-        device_id: u32,
-        workload: f32,
-        is_absolute: bool,
-        blocking_sync: bool,
-        random: NonceGenEnum,
-    ) -> Result<Self, Error> {
-        info!("Starting a CUDA worker");
-        let sync_flag = match blocking_sync {
-            true => ContextFlags::SCHED_BLOCKING_SYNC,
-            false => ContextFlags::SCHED_AUTO,
-        };
-        let device = Device::get_device(device_id).unwrap();
-        let _context = Context::new(device)?;
-        _context.set_flags(sync_flag)?;
-
-        let major = device.get_attribute(DeviceAttribute::ComputeCapabilityMajor)?;
-        let minor = device.get_attribute(DeviceAttribute::ComputeCapabilityMinor)?;
-        let _module: Arc<Module>;
-        info!("Device #{} compute version is {}.{}", device_id, major, minor);
-        if major > 8 || (major == 8 && minor >= 6) {
-            _module = Arc::new(Module::from_ptx(PTX_86, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
-                error!("Error loading PTX. Make sure you have the updated driver for you devices");
-                e
-            })?);
-        } else if major > 7 || (major == 7 && minor >= 5) {
-            _module = Arc::new(Module::from_ptx(PTX_75, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
-                error!("Error loading PTX. Make sure you have the updated driver for you devices");
-                e
-            })?);
-        } else if major > 6 || (major == 6 && minor >= 1) {
-            _module = Arc::new(Module::from_ptx(PTX_61, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
-                error!("Error loading PTX. Make sure you have the updated driver for you devices");
-                e
-            })?);
-        } else if major >= 3 {
-            _module = Arc::new(Module::from_ptx(PTX_30, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
-                error!("Error loading PTX. Make sure you have the updated driver for you devices");
-                e
-            })?);
-        } else if major >= 2 {
-            _module = Arc::new(Module::from_ptx(PTX_20, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
-                error!("Error loading PTX. Make sure you have the updated driver for you devices");
-                e
-            })?);
-        } else {
-            return Err("Cuda compute version not supported".into());
-        }
-
-        let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
-
-        let mut heavy_hash_kernel = Kernel::new(Arc::downgrade(&_module), "heavy_hash")?;
-
-        let mut chosen_workload = 0u32;
-        if is_absolute {
-            chosen_workload = 1;
-        } else {
-            let cur_workload = heavy_hash_kernel.get_workload();
-            if chosen_workload == 0 || chosen_workload < cur_workload {
-                chosen_workload = cur_workload;
-            }
-        }
-        chosen_workload = (chosen_workload as f32 * workload) as u32;
-        info!("GPU #{} Chosen workload: {}", device_id, chosen_workload);
-        heavy_hash_kernel.set_workload(chosen_workload);
-
-        let final_nonce_buff = vec![0u64; 1].as_slice().as_dbuf()?;
-
-        let rand_state: DeviceBuffer<u64> = match random {
-            NonceGenEnum::Xoshiro => {
-                info!("Using xoshiro for nonce-generation");
-                let mut buffer = DeviceBuffer::<u64>::zeroed(4 * (chosen_workload as usize)).unwrap();
-                info!("GPU #{} is generating initial seed. This may take some time.", device_id);
-                let mut seed = [1u64; 4];
-                seed.try_fill(&mut rand::thread_rng())?;
-                buffer.copy_from(
-                    Xoshiro256StarStar::new(&seed)
-                        .iter_jump_state()
-                        .take(chosen_workload as usize)
-                        .flatten()
-                        .collect::<Vec<u64>>()
-                        .as_slice(),
-                )?;
-                info!("GPU #{} initialized", device_id);
-                buffer
-            }
-            NonceGenEnum::Lean => {
-                info!("Using lean nonce-generation");
-                let mut buffer = DeviceBuffer::<u64>::zeroed(1).unwrap();
-                let seed = rand::thread_rng().next_u64();
-                buffer.copy_from(&[seed])?;
-                buffer
-            }
-        };
-        Ok(Self {
-            device_id,
-            _context,
-            _module,
-            start_event: Event::new(EventFlags::DEFAULT)?,
-            stop_event: Event::new(EventFlags::DEFAULT)?,
-            workload: chosen_workload as usize,
-            stream,
-            rand_state,
-            final_nonce_buff,
-            heavy_hash_kernel,
-            random,
-        })
-    }
-}
+use crate::{Error, NonceGenEnum};
+use cust::context::CurrentContext;
+use cust::device::DeviceAttribute;
+use cust::function::Function;
+use cust::memory::DeviceCopy;
+use cust::module::{ModuleJitOption, OptLevel};
+use cust::prelude::*;
+use karlsen_miner::xoshiro256starstar::Xoshiro256StarStar;
+use karlsen_miner::Worker;
+use log::{error, info};
+use memmap::MmapOptions;
+use rand::{Fill, RngCore};
+use std::ffi::CString;
+use std::fs::OpenOptions;
+use std::ops::BitXor;
+use std::path::Path;
+use std::sync::{Arc, Weak};
+use tiny_keccak::Hasher;
+
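+// Pacing constant in blocks per second: sync() flags a kernel run longer than 1000. / BPS milliseconds as too slow.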
+static BPS: f32 = 0.5;
+
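+// One pre-built PTX image per supported compute capability; the worker JIT-loads the newest image the device supports.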
+static PTX_86: &str = include_str!("../resources/karlsen-cuda-sm86.ptx");
+static PTX_75: &str = include_str!("../resources/karlsen-cuda-sm75.ptx");
+static PTX_61: &str = include_str!("../resources/karlsen-cuda-sm61.ptx");
+static PTX_30: &str = include_str!("../resources/karlsen-cuda-sm30.ptx");
+static PTX_20: &str = include_str!("../resources/karlsen-cuda-sm20.ptx");
+
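+// Wraps a kernel function with an occupancy-derived launch shape: block size from suggested_launch_configuration, grid size from SM count x max active blocks per SM.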
This may take some time.", device_id);
-                let mut seed = [1u64; 4];
-                seed.try_fill(&mut rand::thread_rng())?;
-                buffer.copy_from(
-                    Xoshiro256StarStar::new(&seed)
-                        .iter_jump_state()
-                        .take(chosen_workload as usize)
-                        .flatten()
-                        .collect::<Vec<u64>>()
-                        .as_slice(),
-                )?;
-                info!("GPU #{} initialized", device_id);
-                buffer
-            }
-            NonceGenEnum::Lean => {
-                info!("Using lean nonce-generation");
-                let mut buffer = DeviceBuffer::<u64>::zeroed(1).unwrap();
-                let seed = rand::thread_rng().next_u64();
-                buffer.copy_from(&[seed])?;
-                buffer
-            }
-        };
-        Ok(Self {
-            device_id,
-            _context,
-            _module,
-            start_event: Event::new(EventFlags::DEFAULT)?,
-            stop_event: Event::new(EventFlags::DEFAULT)?,
-            workload: chosen_workload as usize,
-            stream,
-            rand_state,
-            final_nonce_buff,
-            heavy_hash_kernel,
-            random,
-        })
-    }
-}
+use crate::{Error, NonceGenEnum};
+use cust::context::CurrentContext;
+use cust::device::DeviceAttribute;
+use cust::function::Function;
+use cust::memory::DeviceCopy;
+use cust::module::{ModuleJitOption, OptLevel};
+use cust::prelude::*;
+use karlsen_miner::xoshiro256starstar::Xoshiro256StarStar;
+use karlsen_miner::Worker;
+use log::{error, info};
+use memmap::MmapOptions;
+use rand::{Fill, RngCore};
+use std::ffi::CString;
+use std::fs::OpenOptions;
+use std::ops::BitXor;
+use std::path::Path;
+use std::sync::{Arc, Weak};
+use tiny_keccak::Hasher;
+
+static BPS: f32 = 0.5;
+
+static PTX_86: &str = include_str!("../resources/karlsen-cuda-sm86.ptx");
+static PTX_75: &str = include_str!("../resources/karlsen-cuda-sm75.ptx");
+static PTX_61: &str = include_str!("../resources/karlsen-cuda-sm61.ptx");
+static PTX_30: &str = include_str!("../resources/karlsen-cuda-sm30.ptx");
+static PTX_20: &str = include_str!("../resources/karlsen-cuda-sm20.ptx");
+
+pub struct Kernel<'kernel> {
+    func: Arc<Function<'kernel>>,
+    block_size: u32,
+    grid_size: u32,
+}
+
+impl<'kernel> Kernel<'kernel> {
+    pub fn new(module: Weak<Module>, name: &'kernel str) -> Result<Kernel<'kernel>, Error> {
+        let func = Arc::new(unsafe {
+            module.as_ptr().as_ref().unwrap().get_function(name).map_err(|e| {
+                error!("Error loading function: {}", e);
+                e
+            })?
+        });
+        let (_, block_size) = func.suggested_launch_configuration(0, 0.into())?;
+
+        let device = CurrentContext::get_device()?;
+        let sm_count = device.get_attribute(DeviceAttribute::MultiprocessorCount)?
as u32;
+        let grid_size = sm_count * func.max_active_blocks_per_multiprocessor(block_size.into(), 0)?;
+
+        Ok(Self { func, block_size, grid_size })
+    }
+
+    pub fn get_workload(&self) -> u32 {
+        // The natural workload would be self.block_size * self.grid_size,
+        // but it is forced to 1 for the moment.
+        1
+    }
+
+    pub fn set_workload(&mut self, workload: u32) {
+        self.grid_size = (workload + self.block_size - 1) / self.block_size
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+#[allow(dead_code)] // Allow dead code for unused unions
+pub union hash256 {
+    pub word64s: [u64; 4usize],
+    pub word32s: [u32; 8usize],
+    pub bytes: [u8; 32usize],
+    pub str_: [::std::os::raw::c_char; 32usize],
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+#[allow(dead_code)] // Allow dead code for unused unions
+pub union hash512 {
+    pub word64s: [u64; 8usize],
+    pub word32s: [u32; 16usize],
+    pub bytes: [u8; 64usize],
+    pub str_: [::std::os::raw::c_char; 64usize],
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+#[allow(dead_code)] // Allow dead code for unused unions
+pub union hash1024 {
+    pub hash512s: [hash512; 2usize],
+    pub word64s: [u64; 16usize],
+    pub word32s: [u32; 32usize],
+    pub bytes: [u8; 128usize],
+    pub str_: [::std::os::raw::c_char; 128usize],
+}
+
+const SIZE_U32: usize = std::mem::size_of::<u32>();
+#[allow(dead_code)]
+const SIZE_U64: usize = std::mem::size_of::<u64>();
+
+pub trait HashData {
+    fn new() -> Self;
+    fn as_bytes(&self) -> &[u8];
+    fn as_bytes_mut(&mut self) -> &mut [u8];
+
+    fn get_as_u32(&self, index: usize) -> u32 {
+        u32::from_le_bytes(self.as_bytes()[index * SIZE_U32..index * SIZE_U32 + SIZE_U32].try_into().unwrap())
+    }
+
+    fn set_as_u32(&mut self, index: usize, value: u32) {
+        self.as_bytes_mut()[index * SIZE_U32..index * SIZE_U32 + SIZE_U32].copy_from_slice(&value.to_le_bytes())
+    }
+
+    #[allow(dead_code)]
+    fn get_as_u64(&self, index: usize) -> u64 {
+        u64::from_le_bytes(self.as_bytes()[index * SIZE_U64..index * SIZE_U64 + SIZE_U64].try_into().unwrap())
+    }
+
+    #[allow(dead_code)]
+    fn set_as_u64(&mut self, index: usize, value: u64) {
+        self.as_bytes_mut()[index * SIZE_U64..index * SIZE_U64 + SIZE_U64].copy_from_slice(&value.to_le_bytes())
+    }
+}
+
+#[derive(Debug)]
+pub struct Hash256([u8; 32]);
+
+impl HashData for Hash256 {
+    fn new() -> Self {
+        Self([0; 32])
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        &mut self.0
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, DeviceCopy)]
+pub struct Hash512([u8; 64]);
+
+impl HashData for Hash512 {
+    fn new() -> Self {
+        Self([0; 64])
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        &mut self.0
+    }
+}
+
+impl BitXor<&Hash512> for &Hash512 {
+    type Output = Hash512;
+
+    fn bitxor(self, rhs: &Hash512) -> Self::Output {
+        let mut hash = Hash512::new();
+
+        for i in 0..64 {
+            hash.0[i] = self.0[i] ^ rhs.0[i]
+        }
+
+        hash
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug, DeviceCopy)]
+pub struct Hash1024([u8; 128]);
+
+impl HashData for Hash1024 {
+    fn new() -> Self {
+        Self([0; 128])
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        &mut self.0
+    }
+}
+
+impl Hash1024 {
+    fn from_512s(first: &Hash512, second: &Hash512) -> Self {
+        let mut hash = Self::new();
+        let (first_half, second_half) = hash.0.split_at_mut(first.0.len());
+        first_half.copy_from_slice(&first.0);
+        second_half.copy_from_slice(&second.0);
+
+        hash
+    }
+    fn from_bytes(bytes: &[u8]) -> Self {
+        let mut array = [0u8; 128];
+        // copy_from_slice panics unless `bytes` is exactly 128 bytes long; callers validate the length first.
+        array.copy_from_slice(bytes);
+        Hash1024(array)
+    }
+}
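// [Editor's note] A minimal sanity check, not part of the original commit, showing
// how the HashData accessors above index the byte array as little-endian words:
#[cfg(test)]
mod hash_data_example {
    use super::*;

    #[test]
    fn u32_accessors_are_little_endian() {
        let mut h = Hash512::new();
        h.set_as_u32(1, 0xdead_beef);
        // Word 1 occupies bytes 4..8, least-significant byte first.
        assert_eq!(h.as_bytes()[4..8], 0xdead_beef_u32.to_le_bytes());
        assert_eq!(h.get_as_u32(1), 0xdead_beef);
    }
}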
+
+const FNV_PRIME: u32 = 0x01000193;
+const FULL_DATASET_ITEM_PARENTS: u32 = 512;
+const LIGHT_CACHE_ROUNDS: i32 = 3;
+
+const LIGHT_CACHE_NUM_ITEMS: u32 = 1179641;
+const FULL_DATASET_NUM_ITEMS: u32 = 37748717;
+const SEED: Hash256 = Hash256([
+    0xeb, 0x01, 0x63, 0xae, 0xf2, 0xab, 0x1c, 0x5a, 0x66, 0x31, 0x0c, 0x1c, 0x14, 0xd6, 0x0f, 0x42, 0x55, 0xa9, 0xb3,
+    0x9b, 0x0e, 0xdf, 0x26, 0x53, 0x98, 0x44, 0xf1, 0x17, 0xad, 0x67, 0x21, 0x19,
+]);
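// [Editor's note] Rough sizes implied by the constants above, for orientation
// (not in the original commit): the light cache is 1_179_641 * 64 B ≈ 72 MiB and
// the full dataset is 37_748_717 * 128 B ≈ 4.5 GiB. The worker below keeps the
// dataset both in host memory and in a DeviceBuffer on the GPU, so plan for
// roughly that much free RAM and VRAM each.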
+
+pub struct CudaGPUWorker<'gpu> {
+    // NOTE: The order is important! context must be closed last
+    heavy_hash_kernel: Kernel<'gpu>,
+    stream: Stream,
+    start_event: Event,
+    stop_event: Event,
+    _module: Arc<Module>,
+
+    rand_state: DeviceBuffer<u64>,
+    final_nonce_buff: DeviceBuffer<u64>,
+
+    cache2: DeviceBuffer<Hash512>,
+    dataset2: DeviceBuffer<Hash1024>,
+    //cache2_ptr: DevicePointer<Hash512>,
+    //dataset2_ptr: DevicePointer<Hash1024>,
+    device_id: u32,
+    pub workload: usize,
+    _context: Context,
+
+    random: NonceGenEnum,
+    //pub full_dataset: *mut Hash1024,
+    //pub light_cache: *mut Hash512,
+}
+
+impl<'gpu> Worker for CudaGPUWorker<'gpu> {
+    fn id(&self) -> String {
+        let device = CurrentContext::get_device().unwrap();
+        format!("#{} ({})", self.device_id, device.name().unwrap())
+    }
+
+    fn load_block_constants(&mut self, hash_header: &[u8; 72], matrix: &[[u16; 64]; 64], target: &[u64; 4]) {
+        //info!("load_block_constants: debug1 ");
+        let u8matrix: Arc<[[u8; 64]; 64]> = Arc::new(matrix.map(|row| row.map(|v| v as u8)));
+        //info!("load_block_constants: debug2 ");
+        let mut hash_header_gpu = self._module.get_global::<[u8; 72]>(&CString::new("hash_header").unwrap()).unwrap();
+        //info!("load_block_constants: debug3 ");
+        hash_header_gpu.copy_from(hash_header).map_err(|e| e.to_string()).unwrap();
+        //info!("load_block_constants: debug4 ");
+
+        let mut matrix_gpu = self._module.get_global::<[[u8; 64]; 64]>(&CString::new("matrix").unwrap()).unwrap();
+        //info!("load_block_constants: debug5 ");
+        matrix_gpu.copy_from(&u8matrix).map_err(|e| e.to_string()).unwrap();
+        //info!("load_block_constants: debug6 ");
+
+        let mut target_gpu = self._module.get_global::<[u64; 4]>(&CString::new("target").unwrap()).unwrap();
+        //info!("load_block_constants: debug7 ");
+        target_gpu.copy_from(target).map_err(|e| e.to_string()).unwrap();
+        //info!("load_block_constants: debug8 ");
+
+        //let mut data = DeviceBuffer::from_slice(&vec![hash1024 { bytes: [0; 128] }; FULL_DATASET_NUM_ITEMS]);
+
+        //let u8cache: Arc<[u8; 10]> = Arc::new([0; 10]);
+        //let mut data = DeviceBuffer::from_slice(&vec![hash512 { bytes: [0; 64] }; LIGHT_CACHE_NUM_ITEMS]);
+        //self.cache = DeviceBuffer::from_slice(&vec![hash512 { bytes: [0; 64] }; LIGHT_CACHE_NUM_ITEMS]);
+        /*
+        info!("load_block_constants: debug8.1 ");
+        let mut cache_gpu = self._module.get_global::<[DeviceBuffer; LIGHT_CACHE_NUM_ITEMS]>(&CString::new("cache_test").unwrap()).unwrap();
+        info!("load_block_constants: debug9 ");
+        cache_gpu.copy_from(&data).map_err(|e| e.to_string()).unwrap();
+        info!("load_block_constants: debug10 ");
+        */
+    }
+
+    #[inline(always)]
+    fn calculate_hash(&mut self, _nonces: Option<&Vec<u64>>, nonce_mask: u64, nonce_fixed: u64) {
+        //info!("calculate_hash: debug1 ");
+        let func = &self.heavy_hash_kernel.func;
+        let stream = &self.stream;
+        let random: u8 = match self.random {
+            NonceGenEnum::Lean => {
+                self.rand_state.copy_from(&[rand::thread_rng().next_u64()]).unwrap();
+                0
+            }
+            NonceGenEnum::Xoshiro => 1,
+        };
+
+        //self.light_cache = vec![Hash512::new(); LIGHT_CACHE_NUM_ITEMS as usize].into_boxed_slice();
+        //self.full_dataset = vec![Hash1024::new(); FULL_DATASET_NUM_ITEMS as usize].into_boxed_slice();
+
+        //info!("calculate_hash: debug2 ");
+        self.start_event.record(stream).unwrap();
+        //info!("calculate_hash: debug3 cache size : {}", self.cache2.len());
+        //info!("calculate_hash: debug3 dataset size : {}", self.dataset2.len());
+
+        //info!("calculate_hash: debug3 dataset[10] : {:?}", self.dataset.index(10));
+        unsafe {
+            launch!(
+                func<<<
+                    self.heavy_hash_kernel.grid_size, self.heavy_hash_kernel.block_size,
+                    0, stream
+                >>>(
+                    nonce_mask,
+                    nonce_fixed,
+                    self.workload,
+                    random,
+                    self.rand_state.as_device_ptr(),
+                    self.final_nonce_buff.as_device_ptr(),
+                    self.dataset2.as_device_ptr(),
+                    self.cache2.as_device_ptr(),
+                    //self.cache2_ptr.as_raw(),
+                    //self.dataset2_ptr.as_raw(),
+                )
+            )
+            .unwrap(); // We see errors in sync
+        }
+        //info!("calculate_hash: debug4 ");
+        self.stop_event.record(stream).unwrap();
+        //info!("calculate_hash: debug5 ");
+    }
+
+    #[inline(always)]
+    fn sync(&self) -> Result<(), Error> {
+        //self.stream.synchronize()?;
+        self.stop_event.synchronize()?;
+        if self.stop_event.elapsed_time_f32(&self.start_event)? > 1000. / BPS {
+            return Err("CUDA takes longer than the block rate. Please reduce your workload.".into());
+        }
+        Ok(())
+    }
+
+    fn get_workload(&self) -> usize {
+        self.workload
+    }
+
+    #[inline(always)]
+    fn copy_output_to(&mut self, nonces: &mut Vec<u64>) -> Result<(), Error> {
+        self.final_nonce_buff.copy_to(nonces)?;
+        Ok(())
+    }
+}
+
+pub fn keccak_in_place(data: &mut [u8]) {
+    let mut hasher = tiny_keccak::Keccak::v512();
+    hasher.update(data);
+    hasher.finalize(data);
+}
+
+pub fn keccak(out: &mut [u8], data: &[u8]) {
+    let mut hasher = tiny_keccak::Keccak::v512();
+    hasher.update(data);
+    hasher.finalize(out);
+}
+
+fn build_light_cache(cache: &mut [Hash512]) {
+    let mut item: Hash512 = Hash512::new();
+    keccak(&mut item.0, &SEED.0);
+    cache[0] = item;
+
+    for cache_item in cache.iter_mut().take(LIGHT_CACHE_NUM_ITEMS as usize).skip(1) {
+        keccak_in_place(&mut item.0);
+        *cache_item = item;
+    }
+
+    for _ in 0..LIGHT_CACHE_ROUNDS {
+        for i in 0..LIGHT_CACHE_NUM_ITEMS {
+            // First index: 4 first bytes of the item as little-endian integer
+            let t: u32 = cache[i as usize].get_as_u32(0);
+            let v: u32 = t % LIGHT_CACHE_NUM_ITEMS;
+
+            // Second index
+            let w: u32 = (LIGHT_CACHE_NUM_ITEMS.wrapping_add(i.wrapping_sub(1))) % LIGHT_CACHE_NUM_ITEMS;
+
+            let x = &cache[v as usize] ^ &cache[w as usize];
+            keccak(&mut cache[i as usize].0, &x.0);
+        }
+    }
+}
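// [Editor's note] Descriptive summary, not part of the original commit:
// build_light_cache() above first fills the cache sequentially (item 0 is
// keccak-512 of SEED, each later item is keccak-512 of its predecessor), then
// runs LIGHT_CACHE_ROUNDS passes that rewrite item i as keccak-512(cache[v] ^ cache[w]),
// where v is the first 32-bit word of item i modulo the cache size and
// w = (i - 1) modulo the cache size.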
+
+fn prebuild_dataset(full_dataset: &mut Box<[Hash1024]>, light_cache: &[Hash512], num_threads: usize) {
+    //let full_dataset = full_dataset_opt.as_mut().unwrap();
+
+    if num_threads > 1 {
+        std::thread::scope(|scope| {
+            let mut threads = Vec::with_capacity(num_threads);
+
+            let light_cache_slice = &light_cache[0..];
+            let batch_size = full_dataset.len() / num_threads;
+            let chunks = full_dataset.chunks_mut(batch_size);
+
+            for (index, chunk) in chunks.enumerate() {
+                let start = index * batch_size;
+
+                let thread_handle = scope.spawn(move || build_dataset_segment(chunk, light_cache_slice, start));
+                threads.push(thread_handle);
+            }
+
+            for handle in threads {
+                handle.join().unwrap();
+            }
+        });
+    } else {
+        build_dataset_segment(&mut full_dataset[0..], light_cache, 0);
+    }
+}
+
+fn build_dataset_segment(dataset_slice: &mut [Hash1024], light_cache: &[Hash512], offset: usize) {
+    for (index, item) in dataset_slice.iter_mut().enumerate() {
+        *item = calculate_dataset_item_1024(light_cache, offset + index);
+    }
+}
+
+fn fnv1(u: u32, v: u32) -> u32 {
+    // wrapping_mul: the FNV multiply overflows u32 by design and must not panic in debug builds.
+    u.wrapping_mul(FNV_PRIME) ^ v
+}
+
+fn fnv1_512(u: Hash512, v: Hash512) -> Hash512 {
+    let mut r = Hash512::new();
+
+    for i in 0..r.0.len() / SIZE_U32 {
+        r.set_as_u32(i, fnv1(u.get_as_u32(i), v.get_as_u32(i)));
+    }
+
+    r
+}
+
+fn calculate_dataset_item_1024(light_cache: &[Hash512], index: usize) -> Hash1024 {
+    let seed0 = (index * 2) as u32;
+    let seed1 = seed0 + 1;
+
+    let mut mix0 = light_cache[(seed0 % LIGHT_CACHE_NUM_ITEMS) as usize];
+    let mut mix1 = light_cache[(seed1 % LIGHT_CACHE_NUM_ITEMS) as usize];
+
+    let mix0_seed = mix0.get_as_u32(0) ^ seed0;
+    let mix1_seed = mix1.get_as_u32(0) ^ seed1;
+
+    mix0.set_as_u32(0, mix0_seed);
+    mix1.set_as_u32(0, mix1_seed);
+
+    keccak_in_place(&mut mix0.0);
+    keccak_in_place(&mut mix1.0);
+
+    let num_words: u32 = (std::mem::size_of_val(&mix0) / SIZE_U32) as u32;
+    for j in 0..FULL_DATASET_ITEM_PARENTS {
+        let t0 = fnv1(seed0 ^ j, mix0.get_as_u32((j % num_words) as usize));
+        let t1 = fnv1(seed1 ^ j, mix1.get_as_u32((j % num_words) as usize));
+        mix0 = fnv1_512(mix0, light_cache[(t0 % LIGHT_CACHE_NUM_ITEMS) as usize]);
+        mix1 = fnv1_512(mix1, light_cache[(t1 % LIGHT_CACHE_NUM_ITEMS) as usize]);
+    }
+
+    keccak_in_place(&mut mix0.0);
+    keccak_in_place(&mut mix1.0);
+
+    Hash1024::from_512s(&mix0, &mix1)
+}
+
+fn save_dataset_to_file(full_dataset_unwrap: &[Hash1024], filename: &str) {
+    let total_size = std::mem::size_of_val(full_dataset_unwrap);
+    let file = OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(filename)
+        .unwrap_or_else(|_| panic!("Failed to open or create the file"));
+
+    file.set_len(total_size as u64).expect("Failed to set file length");
+
+    let mut mmap = unsafe { MmapOptions::new().map_mut(&file).expect("Failed to memory-map the file") };
+
+    for (i, hash) in full_dataset_unwrap.iter().enumerate() {
+        let offset = i * std::mem::size_of::<Hash1024>();
+        mmap[offset..offset + std::mem::size_of::<Hash1024>()].copy_from_slice(hash.as_bytes());
+    }
+
+    mmap.flush().expect("Failed to flush memory map to disk");
+}
+
+fn read_dataset_from_file(filename: &str, full_dataset_unwrap: &mut Box<[Hash1024]>) {
+    if !Path::new(filename).exists() {
+        panic!("File does not exist");
+    }
+
+    let file =
+        OpenOptions::new().read(true).open(filename).unwrap_or_else(|_| panic!("Failed to open the file for reading"));
+
+    let mmap = unsafe { MmapOptions::new().map(&file).expect("Failed to memory-map the file for reading") };
+    let item_size = std::mem::size_of::<Hash1024>();
+    if mmap.len() % item_size != 0 {
+        panic!("File size is not a multiple of Hash1024 size");
+    }
+
+    let num_items = mmap.len() / item_size;
+    if num_items != full_dataset_unwrap.len() {
+        panic!("Mismatch between file data size and provided buffer size");
+    }
+
+    for i in 0..num_items {
+        let start = i * item_size;
+        let end = start + item_size;
+        full_dataset_unwrap[i] = Hash1024::from_bytes(&mmap[start..end]);
+    }
+}
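// [Editor's note] A tiny worked example of the FNV step above, not part of the
// original commit:
#[cfg(test)]
mod fnv_example {
    use super::*;

    #[test]
    fn fnv1_multiplies_then_xors() {
        // 1 * 0x01000193 = 0x01000193, and 0x01000193 ^ 2 = 0x01000191.
        assert_eq!(fnv1(1, 2), 0x0100_0191);
    }
}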
+
+impl<'gpu> CudaGPUWorker<'gpu> {
+    pub fn new(
+        device_id: u32,
+        workload: f32,
+        is_absolute: bool,
+        blocking_sync: bool,
+        random: NonceGenEnum,
+    ) -> Result<Self, Error> {
+        info!("Starting a CUDA worker");
+        let sync_flag = match blocking_sync {
+            true => ContextFlags::SCHED_BLOCKING_SYNC,
+            false => ContextFlags::SCHED_AUTO,
+        };
+        let device = Device::get_device(device_id).unwrap();
+        let _context = Context::new(device)?;
+        _context.set_flags(sync_flag)?;
+
+        let mut light_cache = vec![Hash512::new(); LIGHT_CACHE_NUM_ITEMS as usize].into_boxed_slice();
+        build_light_cache(&mut light_cache);
+        //cache.copy_from(&light_cache)?;
+        let cache2 = DeviceBuffer::from_slice(&light_cache).unwrap();
+
+        info!("light_cache[10] : {:x?}", &light_cache[10].as_bytes());
+        info!("light_cache[42] : {:x?}", &light_cache[42].as_bytes());
+
+        let mut full_dataset = Some(vec![Hash1024::new(); FULL_DATASET_NUM_ITEMS as usize].into_boxed_slice());
+        let full_dataset_unwrap = full_dataset.as_mut().unwrap();
+        //build_dataset_segment(&mut full_dataset_unwrap[0..], &light_cache, 0);
+        if Path::new("dataset.bin").exists() {
+            read_dataset_from_file("dataset.bin", full_dataset_unwrap);
+        } else {
+            prebuild_dataset(full_dataset_unwrap, &light_cache, 8);
+            //save_dataset_to_file(&full_dataset_unwrap, "hashes.dat")
+            save_dataset_to_file(full_dataset_unwrap, "dataset.bin");
+        }
+
+        info!("dataset[10] : {:x?}", full_dataset_unwrap[10].as_bytes());
+        info!("dataset[42] : {:x?}", full_dataset_unwrap[42].as_bytes());
+        info!("dataset[12345] : {:x?}", full_dataset_unwrap[12345].as_bytes());
+
+        //dataset.copy_from(&full_dataset_unwrap)?;
+        let dataset2 = DeviceBuffer::from_slice(full_dataset_unwrap).unwrap();
+        //let dataset2_ptr: DevicePointer<Hash1024> = dataset2.as_device_ptr();
+
+        let major = device.get_attribute(DeviceAttribute::ComputeCapabilityMajor)?;
+        let minor = device.get_attribute(DeviceAttribute::ComputeCapabilityMinor)?;
+        let _module: Arc<Module>;
+        info!("Device #{} compute version is {}.{}", device_id, major, minor);
+        if major > 8 || (major == 8 && minor >= 6) {
+            _module = Arc::new(Module::from_ptx(PTX_86, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
+                error!("Error loading PTX. Make sure you have the updated driver for your devices");
+                e
+            })?);
+        } else if major > 7 || (major == 7 && minor >= 5) {
+            _module = Arc::new(Module::from_ptx(PTX_75, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
+                error!("Error loading PTX. Make sure you have the updated driver for your devices");
+                e
+            })?);
+        } else if major > 6 || (major == 6 && minor >= 1) {
+            _module = Arc::new(Module::from_ptx(PTX_61, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
+                error!("Error loading PTX. Make sure you have the updated driver for your devices");
+                e
+            })?);
+        } else if major >= 3 {
+            _module = Arc::new(Module::from_ptx(PTX_30, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
+                error!("Error loading PTX. Make sure you have the updated driver for your devices");
+                e
+            })?);
+        } else if major >= 2 {
+            _module = Arc::new(Module::from_ptx(PTX_20, &[ModuleJitOption::OptLevel(OptLevel::O4)]).map_err(|e| {
+                error!("Error loading PTX. Make sure you have the updated driver for your devices");
+                e
+            })?);
+        } else {
+            return Err("CUDA compute version not supported".into());
+        }
+
+        let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
+
+        let mut heavy_hash_kernel = Kernel::new(Arc::downgrade(&_module), "heavy_hash")?;
+
+        let mut chosen_workload = 0u32;
+        if is_absolute {
+            chosen_workload = 1;
+        } else {
+            let cur_workload = heavy_hash_kernel.get_workload();
+            if chosen_workload == 0 || chosen_workload < cur_workload {
+                chosen_workload = cur_workload;
+            }
+        }
+        chosen_workload = (chosen_workload as f32 * workload) as u32;
+        info!("GPU #{} Chosen workload: {}", device_id, chosen_workload);
+        heavy_hash_kernel.set_workload(chosen_workload);
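// [Editor's note] Not part of the original commit: since Kernel::get_workload()
// currently returns 1, both the absolute and the ratio path above reduce to
// chosen_workload = workload truncated to an integer, and set_workload() then
// derives grid_size as ceil(chosen_workload / block_size).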
+
+        let final_nonce_buff = vec![0u64; 1].as_slice().as_dbuf()?;
+
+        let rand_state: DeviceBuffer<u64> = match random {
+            NonceGenEnum::Xoshiro => {
+                info!("Using xoshiro for nonce-generation");
+                let mut buffer = DeviceBuffer::<u64>::zeroed(4 * (chosen_workload as usize)).unwrap();
+                info!("GPU #{} is generating initial seed. This may take some time.", device_id);
+                let mut seed = [1u64; 4];
+                seed.try_fill(&mut rand::thread_rng())?;
+                buffer.copy_from(
+                    Xoshiro256StarStar::new(&seed)
+                        .iter_jump_state()
+                        .take(chosen_workload as usize)
+                        .flatten()
+                        .collect::<Vec<u64>>()
+                        .as_slice(),
+                )?;
+                info!("GPU #{} initialized", device_id);
+                buffer
+            }
+            NonceGenEnum::Lean => {
+                info!("Using lean nonce-generation");
+                let mut buffer = DeviceBuffer::<u64>::zeroed(1).unwrap();
+                let seed = rand::thread_rng().next_u64();
+                buffer.copy_from(&[seed])?;
+                buffer
+            }
+        };
+        Ok(Self {
+            device_id,
+            _context,
+            _module,
+            start_event: Event::new(EventFlags::DEFAULT)?,
+            stop_event: Event::new(EventFlags::DEFAULT)?,
+            workload: chosen_workload as usize,
+            stream,
+            rand_state,
+            final_nonce_buff,
+            cache2,
+            dataset2,
+            heavy_hash_kernel,
+            random,
+        })
+    }
+}
diff --git a/plugins/opencl/Cargo.lock b/plugins/opencl/Cargo.lock
deleted file mode 100644
index 3c3240e..0000000
--- a/plugins/opencl/Cargo.lock
+++ /dev/null
@@ -1,7 +0,0 @@
-# This file is automatically @generated by Cargo.
-# It is not intended for manual editing.
-version = 3
-
-[[package]]
-name = "opencl"
-version = "0.1.0"
diff --git a/plugins/opencl/Cargo.toml b/plugins/opencl/Cargo.toml
deleted file mode 100644
index cdd38a5..0000000
--- a/plugins/opencl/Cargo.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-[package]
-name = "kaspaopencl"
-version = "0.1.0"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-[dependencies]
-karlsen_miner = {path = "../../", package="karlsen-miner"}
-clap = { version = "3.0", features = ["color", "derive"]}
-env_logger = "0.9"
-opencl3 = {version = "0.6", features = ["CL_VERSION_2_1", "CL_VERSION_2_2", "CL_VERSION_3_0"]}
-log = "0.4"
-rand = "0.8"
-include_dir = "0.7"
-
-[lib]
-crate-type = ["cdylib"]
-
diff --git a/plugins/opencl/README.md b/plugins/opencl/README.md
deleted file mode 100644
index a6bdef4..0000000
--- a/plugins/opencl/README.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# OpenCL support for karlsen-miner
-
-This is an experimental plugin to support opencl.
- -# Compiling to AMD -Download and install Radeon GPU Analyzer, which allows you to compile OpenCL for AMD - -```shell -for arch in gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx906 -do - rga --O3 -s opencl -c "$arch" --OpenCLoption "-cl-finite-math-only -cl-mad-enable " -b plugins/opencl/resources/bin/kaspa-opencl.bin plugins/opencl/resources/kaspa-opencl.cl -D __FORCE_AMD_V_DOT8_U32_U4__=1 -D OPENCL_PLATFORM_AMD -D OFFLINE -done - -for arch in gfx1010 -do - rga --O3 -s opencl -c "$arch" --OpenCLoption "-cl-finite-math-only -cl-mad-enable " -b plugins/opencl/resources/bin/kaspa-opencl.bin plugins/opencl/resources/kaspa-opencl.cl -D OPENCL_PLATFORM_AMD -done - -for arch in Ellesmere -do - rga --O3 -s opencl -c "$arch" --OpenCLoption "-cl-finite-math-only -cl-mad-enable -target amdgcn-amd-amdpal" -b plugins/opencl/resources/bin/kaspa-opencl.bin plugins/opencl/resources/kaspa-opencl.cl -D OPENCL_PLATFORM_AMD -D PAL -done -``` \ No newline at end of file diff --git a/plugins/opencl/resources/bin/ellesmere_kaspa-opencl.bin b/plugins/opencl/resources/bin/ellesmere_kaspa-opencl.bin deleted file mode 100755 index 8ef0324..0000000 Binary files a/plugins/opencl/resources/bin/ellesmere_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1010_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1010_kaspa-opencl.bin deleted file mode 100755 index 078e262..0000000 Binary files a/plugins/opencl/resources/bin/gfx1010_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1011_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1011_kaspa-opencl.bin deleted file mode 100755 index 01b6a98..0000000 Binary files a/plugins/opencl/resources/bin/gfx1011_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1012_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1012_kaspa-opencl.bin deleted file mode 100755 index bb91810..0000000 Binary files a/plugins/opencl/resources/bin/gfx1012_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1030_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1030_kaspa-opencl.bin deleted file mode 100755 index 9fb2ce8..0000000 Binary files a/plugins/opencl/resources/bin/gfx1030_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1031_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1031_kaspa-opencl.bin deleted file mode 100755 index dc6d9a0..0000000 Binary files a/plugins/opencl/resources/bin/gfx1031_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1032_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1032_kaspa-opencl.bin deleted file mode 100755 index ffa98a0..0000000 Binary files a/plugins/opencl/resources/bin/gfx1032_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx1034_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx1034_kaspa-opencl.bin deleted file mode 100755 index daf2d10..0000000 Binary files a/plugins/opencl/resources/bin/gfx1034_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/bin/gfx906_kaspa-opencl.bin b/plugins/opencl/resources/bin/gfx906_kaspa-opencl.bin deleted file mode 100755 index 215a3ea..0000000 Binary files a/plugins/opencl/resources/bin/gfx906_kaspa-opencl.bin and /dev/null differ diff --git a/plugins/opencl/resources/kaspa-opencl.cl b/plugins/opencl/resources/kaspa-opencl.cl deleted file mode 100644 index 33767fa..0000000 --- a/plugins/opencl/resources/kaspa-opencl.cl +++ /dev/null @@ -1,415 +0,0 @@ -// 
Catering for different flavors -#pragma OPENCL EXTENSION cl_amd_media_ops : enable - -#if __OPENCL_VERSION__ <= CL_VERSION_1_1 -#define STATIC -#else -#define STATIC static -#endif -/* TYPES */ - -typedef uchar uint8_t; -typedef char int8_t; -typedef ushort uint16_t; -typedef short int16_t; -typedef uint uint32_t; -typedef int int32_t; -typedef ulong uint64_t; -typedef long int64_t; - -/* TINY KECCAK */ -/** libkeccak-tiny - * - * A single-file implementation of SHA-3 and SHAKE. - * - * Implementor: David Leon Gil - * License: CC0, attribution kindly requested. Blame taken too, - * but not liability. - */ - -/******** The Keccak-f[1600] permutation ********/ - -/*** Constants. ***/ -constant STATIC const uint8_t rho[24] = \ - { 1, 3, 6, 10, 15, 21, - 28, 36, 45, 55, 2, 14, - 27, 41, 56, 8, 25, 43, - 62, 18, 39, 61, 20, 44}; -constant STATIC const uint8_t pi[24] = \ - {10, 7, 11, 17, 18, 3, - 5, 16, 8, 21, 24, 4, - 15, 23, 19, 13, 12, 2, - 20, 14, 22, 9, 6, 1}; - -constant STATIC const uint64_t RC[24] = \ - {1UL, 0x8082UL, 0x800000000000808aUL, 0x8000000080008000UL, - 0x808bUL, 0x80000001UL, 0x8000000080008081UL, 0x8000000000008009UL, - 0x8aUL, 0x88UL, 0x80008009UL, 0x8000000aUL, - 0x8000808bUL, 0x800000000000008bUL, 0x8000000000008089UL, 0x8000000000008003UL, - 0x8000000000008002UL, 0x8000000000000080UL, 0x800aUL, 0x800000008000000aUL, - 0x8000000080008081UL, 0x8000000000008080UL, 0x80000001UL, 0x8000000080008008UL}; - - -/** Magic from fancyIX/sgminer-phi2-branch **/ -#if defined(OPENCL_PLATFORM_AMD) -#pragma OPENCL EXTENSION cl_amd_media_ops : enable -#define dataType uint2 -#define as_dataType as_uint2 -static inline uint2 rol(const uint2 vv, const int r) -{ - if (r <= 32) - { - return amd_bitalign((vv).xy, (vv).yx, 32 - r); - } - else - { - return amd_bitalign((vv).yx, (vv).xy, 64 - r); - } -} -#else -#define dataType ulong -#define as_dataType as_ulong -#define rol(x, s) (((x) << s) | ((x) >> (64 - s))) -#endif - -/*** Helper macros to unroll the permutation. 
***/ -#define REPEAT6(e) e e e e e e -#define REPEAT24(e) REPEAT6(e e e e) -#define REPEAT23(e) REPEAT6(e e e) e e e e e -#define REPEAT5(e) e e e e e -#define FOR5(v, s, e) \ - v = 0; \ - REPEAT5(e; v += s;) - -/*** Keccak-f[1600] ***/ -STATIC inline void keccakf(void *state) { - dataType *a = (dataType *)state; - dataType b[5] = {0}; - dataType t = 0, v = 0; - uint8_t x, y; - -#if defined(cl_amd_media_ops) - #pragma unroll -#endif - for (int i = 0; i < 23; i++) { - // Theta - FOR5(x, 1, - b[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20];) - - v = b[4]; t = b[0]; - b[4] = b[4] ^ rol(b[1], 1); - b[0] = b[0] ^ rol(b[2], 1); - b[1] = b[1] ^ rol(b[3], 1); - b[2] = b[2] ^ rol(v, 1); - b[3] = b[3] ^ rol(t, 1); - - FOR5(x, 1, - FOR5(y, 5, a[y + x] ^= b[(x + 4) % 5]; )) - - // Rho and pi - t = a[1]; - x = 23; - REPEAT23(a[pi[x]] = rol(a[pi[x-1]], rho[x]); x--; ) - a[pi[ 0]] = rol( t, rho[ 0]); - - // Chi - FOR5(y, 5, - v = a[y]; t = a[y+1]; - a[y ] = bitselect(a[y ] ^ a[y+2], a[y ], a[y+1]); - a[y+1] = bitselect(a[y+1] ^ a[y+3], a[y+1], a[y+2]); - a[y+2] = bitselect(a[y+2] ^ a[y+4], a[y+2], a[y+3]); - a[y+3] = bitselect(a[y+3] ^ v, a[y+3], a[y+4]); - a[y+4] = bitselect(a[y+4] ^ t, a[y+4], v); - ) - - // Iota - a[0] ^= as_dataType(RC[i]); -} - /*******************************************************/ - // Theta - FOR5(x, 1, - b[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20];) - - v = b[4]; t = b[0]; - b[4] = b[4] ^ rol(b[1], 1); - b[0] = b[0] ^ rol(b[2], 1); - b[1] = b[1] ^ rol(b[3], 1); - b[2] = b[2] ^ rol(v, 1); - b[3] = b[3] ^ rol(t, 1); - - a[0] ^= b[4]; - a[1] ^= b[0]; a[6] ^= b[0]; - a[2] ^= b[1]; a[12] ^= b[1]; - a[3] ^= b[2]; a[18] ^= b[2]; - a[4] ^= b[3]; a[24] ^= b[3]; - - // Rho and pi - a[1]=rol(a[pi[22]], rho[23]); - a[2]=rol(a[pi[16]], rho[17]); - a[4]=rol(a[pi[10]], rho[11]); - a[3]=rol(a[pi[ 4]], rho[ 5]); - - // Chi - v = a[0]; - - a[0] = bitselect(a[0] ^ a[2], a[0], a[1]); - a[1] = bitselect(a[1] ^ a[3], a[1], a[2]); - a[2] = bitselect(a[2] ^ a[4], a[2], a[3]); - a[3] = bitselect(a[3] ^ v, a[3], a[4]); - - // Iota - a[0] ^= as_dataType(RC[23]); -} - -/******** The FIPS202-defined functions. ********/ - -/*** Some helper macros. ***/ - - -#define P keccakf -#define Plen 200 - -constant const ulong powP[25] = { 0x113cff0da1f6d83dUL, 0x29bf8855b7027e3cUL, 0x1e5f2e720efb44d2UL, 0x1ba5a4a3f59869a0UL, 0x7b2fafca875e2d65UL, 0x4aef61d629dce246UL, 0x183a981ead415b10UL, 0x776bf60c789bc29cUL, 0xf8ebf13388663140UL, 0x2e651c3c43285ff0UL, 0x0f96070540f14a0aUL, 0x44e367875b299152UL, 0xec70f1a425b13715UL, 0xe6c85d8f82e9da89UL, 0xb21a601f85b4b223UL, 0x3485549064a36a46UL, 0x0f06dd1c7a2f851aUL, 0xc1a2021d563bb142UL, 0xba1de5e4451668e4UL, 0xd102574105095f8dUL, 0x89ca4e849bcecf4aUL, 0x48b09427a8742edbUL, 0xb1fcce9ce78b5272UL, 0x5d1129cf82afa5bcUL, 0x02b97c786f824383UL }; -constant const ulong heavyP[25] = { 0x3ad74c52b2248509UL, 0x79629b0e2f9f4216UL, 0x7a14ff4816c7f8eeUL, 0x11a75f4c80056498UL, 0xe720e0df44eecedaUL, 0x72c7d82e14f34069UL, 0xc100ff2a938935baUL, 0x5e219040250fc462UL, 0x8039f9a60dcf6a48UL, 0xa0bcaa9f792a3d0cUL, 0xf431c05dd0a9a226UL, 0xd31f4cc354c18c3fUL, 0x6c6b7d01a769cc3dUL, 0x2ec65bd3562493e4UL, 0x4ef74b3a99cdb044UL, 0x774c86835434f2b0UL, 0x07e961b036bc9416UL, 0x7e8f1db17765cc07UL, 0xea8fdb80bac46d39UL, 0xb992f2d37b34ca58UL, 0xc776c5048481b957UL, 0x47c39f675112c22eUL, 0x92bb399db5290c0aUL, 0x549ae0312f9fc615UL, 0x1619327d10b9da35UL }; - -/** The sponge-based hash construction. 
**/ -STATIC inline void hash(constant const ulong *initP, const ulong* in, ulong4* out) { - private ulong a[25]; - // Xor in the last block. - #pragma unroll - for (size_t i = 0; i < 10; i++) a[i] = initP[i] ^ in[i]; - #pragma unroll - for (size_t i = 10; i < 25; i++) a[i] = initP[i]; - // Apply P - P(a); - // Squeeze output. - *out = ((ulong4 *)(a))[0]; -} - -/* RANDOM NUMBER GENERATOR BASED ON MWC64X */ -/* http://cas.ee.ic.ac.uk/people/dt10/research/rngs-gpu-mwc64x.html */ - -/* Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org) - -To the extent possible under law, the author has dedicated all copyright -and related and neighboring rights to this software to the public domain -worldwide. This software is distributed without any warranty. - -See . */ - - -/* This is xoshiro256** 1.0, one of our all-purpose, rock-solid - generators. It has excellent (sub-ns) speed, a state (256 bits) that is - large enough for any parallel application, and it passes all tests we - are aware of. - - For generating just floating-point numbers, xoshiro256+ is even faster. - - The state must be seeded so that it is not everywhere zero. If you have - a 64-bit seed, we suggest to seed a splitmix64 generator and use its - output to fill s. */ - -inline uint64_t rotl(const uint64_t x, int k) { - return (x << k) | (x >> (64 - k)); -} - -inline uint64_t xoshiro256_next(global ulong4 *s) { - const uint64_t result = rotl(s->y * 5, 7) * 9; - - const uint64_t t = s->y << 17; - - s->z ^= s->x; - s->w ^= s->y; - s->y ^= s->z; - s->x ^= s->w; - - s->z ^= t; - - s->w = rotl(s->w, 45); - - return result; -} -/* KERNEL CODE */ - -#ifdef cl_khr_int64_base_atomics -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable -#endif -typedef union _Hash { - ulong4 hash; - uchar bytes[32]; -} Hash; - -#define BLOCKDIM 1024 -#define MATRIX_SIZE 64 -#define HALF_MATRIX_SIZE 32 -#define QUARTER_MATRIX_SIZE 16 -#define HASH_HEADER_SIZE 72 - -#define RANDOM_TYPE_LEAN 0 -#define RANDOM_TYPE_XOSHIRO 1 - -#define LT_U256(X,Y) (X.w != Y->w ? X.w < Y->w : X.z != Y->z ? X.z < Y->z : X.y != Y->y ? 
X.y < Y->y : X.x < Y->x) - -#ifndef cl_khr_int64_base_atomics -global int lock = false; -#endif - -#if defined(NVIDIA_CUDA) && (__COMPUTE_MAJOR__ > 6 || (__COMPUTE_MAJOR__ == 6 && __COMPUTE_MINOR__ >= 1)) -#define amul4bit(X,Y,Z) _amul4bit((constant uint32_t*)(X), (private uint32_t*)(Y), (uint32_t *)(Z)) -void STATIC inline _amul4bit(__constant uint32_t packed_vec1[32], uint32_t packed_vec2[32], uint32_t *ret) { - // We assume each 32 bits have four values: A0 B0 C0 D0 - uint32_t res = 0; - #pragma unroll - for (int i=0; i>0)&0xf)*((b4[i].x>>0)&0xf); - res += ((a4[i].x>>4)&0xf)*((b4[i].x>>4)&0xf); - res += ((a4[i].y>>0)&0xf)*((b4[i].y>>0)&0xf); - res += ((a4[i].y>>4)&0xf)*((b4[i].y>>4)&0xf); - res += ((a4[i].z>>0)&0xf)*((b4[i].z>>0)&0xf); - res += ((a4[i].z>>4)&0xf)*((b4[i].z>>4)&0xf); - res += ((a4[i].w>>0)&0xf)*((b4[i].w>>0)&0xf); - res += ((a4[i].w>>4)&0xf)*((b4[i].w>>4)&0xf); - } - *ret = res; -#else - ushort4 res = 0; - for (int i=0; i> 4; - hash_part[2*i+1] = hash_.bytes[i] & 0x0F; - } - #endif - - uint32_t product1, product2; - #if defined(NVIDIA_CUDA) || defined(__FORCE_AMD_V_DOT8_U32_U4__) - #pragma unroll - #endif - for (int rowId=0; rowId<32; rowId++){ - #if __FORCE_AMD_V_DOT8_U32_U4__ == 1 - amul4bit(matrix + 64*rowId, hash_.bytes, &product1); - amul4bit(matrix + 64*rowId+32, hash_.bytes, &product2); - #else - amul4bit(matrix + 128*rowId, hash_part, &product1); - amul4bit(matrix + 128*rowId+64, hash_part, &product2); - #endif - product1 >>= 10; - product2 >>= 10; -// hash2_.bytes[rowId] = hash_.bytes[rowId] ^ bitselect(product1, product2, 0x0000000FU); - hash2_.bytes[rowId] = hash_.bytes[rowId] ^ ((uint8_t)((product1 << 4) | (uint8_t)(product2))); - } - buffer[0] = hash2_.hash.x; - buffer[1] = hash2_.hash.y; - buffer[2] = hash2_.hash.z; - buffer[3] = hash2_.hash.w; - #pragma unroll - for(int i=4; i<10; i++) buffer[i] = 0; - - hash(heavyP, (const ulong*)buffer, &hash_.hash); - - if (LT_U256(hash_.hash, target)){ - //printf("%lu: %lu < %lu: %d %d\n", nonce, ((uint64_t *)hash_)[3], target[3], ((uint64_t *)hash_)[3] < target[3], LT_U256((uint64_t *)hash_, target)); - #ifdef cl_khr_int64_base_atomics - atom_cmpxchg(final_nonce, 0, nonce); - #else - if (!atom_cmpxchg(&lock, 0, 1)) { - *final_nonce = nonce; - //for(int i=0;i<4;i++) final_hash[i] = ((uint64_t volatile *)hash_)[i]; - } - #endif - } - /*if (nonceId==1) { - //printf("%lu: %lu < %lu: %d %d\n", nonce, ((uint64_t *)hash2_)[3], target[3], ((uint64_t *)hash_)[3] < target[3]); - *final_nonce = nonce; - for(int i=0;i<4;i++) final_hash[i] = ((uint64_t volatile *)hash_)[i]; - }*/ -} diff --git a/plugins/opencl/src/cli.rs b/plugins/opencl/src/cli.rs deleted file mode 100644 index 95c1d5a..0000000 --- a/plugins/opencl/src/cli.rs +++ /dev/null @@ -1,53 +0,0 @@ -use crate::Error; -use std::str::FromStr; - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum NonceGenEnum { - Lean, - Xoshiro, -} - -impl FromStr for NonceGenEnum { - type Err = Error; - - fn from_str(s: &str) -> Result { - match s.to_lowercase().as_str() { - "lean" => Ok(Self::Lean), - "xoshiro" => Ok(Self::Xoshiro), - _ => Err("Unknown string".into()), - } - } -} - -#[derive(clap::Args, Debug)] -pub struct OpenCLOpt { - #[clap(long = "opencl-platform", help = "Which OpenCL platform to use (limited to one per executable)")] - pub opencl_platform: Option, - #[clap(long = "opencl-device", use_delimiter = true, help = "Which OpenCL GPUs to use on a specific platform")] - pub opencl_device: Option>, - #[clap(long = "opencl-workload", help = "Ratio of nonces to GPU possible 
parrallel run in OpenCL [default: 512]")] - pub opencl_workload: Option>, - #[clap( - long = "opencl-workload-absolute", - help = "The values given by workload are not ratio, but absolute number of nonces in OpenCL [default: false]" - )] - pub opencl_workload_absolute: bool, - #[clap(long = "opencl-enable", help = "Enable opencl, and take all devices of the chosen platform")] - pub opencl_enable: bool, - #[clap(long = "opencl-amd-disable", help = "Disables AMD mining (does not override opencl-enable)")] - pub opencl_amd_disable: bool, - #[clap(long = "opencl-no-amd-binary", help = "Disable fetching of precompiled AMD kernel (if exists)")] - pub opencl_no_amd_binary: bool, - #[clap( - long = "experimental-amd", - help = "Uses SMID instructions in AMD. Miner will crash if instruction is not supported" - )] - pub experimental_amd: bool, - #[clap( - long = "opencl-nonce-gen", - help = "The random method used to generate nonces. Options: (i) xoshiro (ii) lean", - long_help = "The random method used to generate nonces. Options: (i) xoshiro - each thread in GPU will have its own random state, creating a (pseudo-)independent xoshiro sequence (ii) lean - each GPU will have a single random nonce, and each GPU thread will work on nonce + thread id.", - default_value = "lean" - )] - pub opencl_nonce_gen: NonceGenEnum, -} diff --git a/plugins/opencl/src/lib.rs b/plugins/opencl/src/lib.rs deleted file mode 100644 index 40e6cb6..0000000 --- a/plugins/opencl/src/lib.rs +++ /dev/null @@ -1,159 +0,0 @@ -#[macro_use] -extern crate karlsen_miner; - -use clap::{ArgMatches, FromArgMatches}; -use karlsen_miner::{Plugin, Worker, WorkerSpec}; -use log::{info, LevelFilter}; -use opencl3::device::{Device, CL_DEVICE_TYPE_ALL}; -use opencl3::platform::{get_platforms, Platform}; -use opencl3::types::cl_device_id; -use std::error::Error as StdError; - -pub type Error = Box; - -mod cli; -mod worker; - -use crate::cli::{NonceGenEnum, OpenCLOpt}; -use crate::worker::OpenCLGPUWorker; - -const DEFAULT_WORKLOAD_SCALE: f32 = 512.; - -pub struct OpenCLPlugin { - specs: Vec, - _enabled: bool, -} - -impl OpenCLPlugin { - fn new() -> Result { - env_logger::builder().filter_level(LevelFilter::Info).parse_default_env().init(); - Ok(Self { specs: Vec::new(), _enabled: false }) - } -} - -impl Plugin for OpenCLPlugin { - fn name(&self) -> &'static str { - "OpenCL Worker" - } - - fn enabled(&self) -> bool { - self._enabled - } - - fn get_worker_specs(&self) -> Vec> { - self.specs.iter().map(|spec| Box::new(*spec) as Box).collect::>>() - } - - //noinspection RsTypeCheck - fn process_option(&mut self, matches: &ArgMatches) -> Result { - let opts: OpenCLOpt = OpenCLOpt::from_arg_matches(matches)?; - - self._enabled = opts.opencl_enable; - let platforms = match get_platforms() { - Ok(p) => p, - Err(e) => { - return Err(e.to_string().into()); - } - }; - info!("OpenCL Found Platforms:"); - info!("======================="); - for platform in &platforms { - let vendor = &platform.vendor().unwrap_or_else(|_| "Unk".into()); - let name = &platform.name().unwrap_or_else(|_| "Unk".into()); - let num_devices = platform.get_devices(CL_DEVICE_TYPE_ALL).unwrap_or_default().len(); - info!("{}: {} ({} devices available)", vendor, name, num_devices); - } - let amd_platforms = (&platforms) - .iter() - .filter(|p| { - p.vendor().unwrap_or_else(|_| "Unk".into()) == "Advanced Micro Devices, Inc." 
- && !p.get_devices(CL_DEVICE_TYPE_ALL).unwrap_or_default().is_empty() - }) - .collect::>(); - let _platform: &Platform = match opts.opencl_platform { - Some(idx) => { - self._enabled = true; - &platforms[idx as usize] - } - None if !opts.opencl_amd_disable && !amd_platforms.is_empty() => { - self._enabled = true; - amd_platforms[0] - } - None => &platforms[0], - }; - if self._enabled { - info!( - "Chose to mine on {}: {}.", - &_platform.vendor().unwrap_or_else(|_| "Unk".into()), - &_platform.name().unwrap_or_else(|_| "Unk".into()) - ); - - let device_ids = _platform.get_devices(CL_DEVICE_TYPE_ALL).unwrap(); - let gpus = match opts.opencl_device { - Some(dev) => { - self._enabled = true; - dev.iter().map(|d| device_ids[*d as usize]).collect::>() - } - None => device_ids, - }; - - self.specs = (0..gpus.len()) - .map(|i| OpenCLWorkerSpec { - _platform: *_platform, - index: i, - device_id: Device::new(gpus[i]), - workload: match &opts.opencl_workload { - Some(workload) if i < workload.len() => workload[i], - Some(workload) if !workload.is_empty() => *workload.last().unwrap(), - _ => DEFAULT_WORKLOAD_SCALE, - }, - is_absolute: opts.opencl_workload_absolute, - experimental_amd: opts.experimental_amd, - use_amd_binary: !opts.opencl_no_amd_binary, - random: opts.opencl_nonce_gen, - }) - .collect(); - } - Ok(self.specs.len()) - } -} - -#[derive(Copy, Clone)] -struct OpenCLWorkerSpec { - _platform: Platform, - index: usize, - device_id: Device, - workload: f32, - is_absolute: bool, - experimental_amd: bool, - use_amd_binary: bool, - random: NonceGenEnum, -} - -impl WorkerSpec for OpenCLWorkerSpec { - fn id(&self) -> String { - format!( - "#{} {}", - self.index, - self.device_id - .board_name_amd() - .unwrap_or_else(|_| self.device_id.name().unwrap_or_else(|_| "Unknown Device".into())) - ) - } - - fn build(&self) -> Box { - Box::new( - OpenCLGPUWorker::new( - self.device_id, - self.workload, - self.is_absolute, - self.experimental_amd, - self.use_amd_binary, - &self.random, - ) - .unwrap(), - ) - } -} - -declare_plugin!(OpenCLPlugin, OpenCLPlugin::new, OpenCLOpt); diff --git a/plugins/opencl/src/worker.rs b/plugins/opencl/src/worker.rs deleted file mode 100644 index 71a87f6..0000000 --- a/plugins/opencl/src/worker.rs +++ /dev/null @@ -1,368 +0,0 @@ -use crate::cli::NonceGenEnum; -use crate::Error; -use include_dir::{include_dir, Dir}; -use karlsen_miner::xoshiro256starstar::Xoshiro256StarStar; -use karlsen_miner::Worker; -use log::{info, warn}; -use opencl3::command_queue::{CommandQueue, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE}; -use opencl3::context::Context; -use opencl3::device::Device; -use opencl3::event::{release_event, retain_event, wait_for_events}; -use opencl3::kernel::{ExecuteKernel, Kernel}; -use opencl3::memory::{Buffer, ClMem, CL_MAP_WRITE, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY}; -use opencl3::platform::Platform; -use opencl3::program::{Program, CL_FINITE_MATH_ONLY, CL_MAD_ENABLE, CL_STD_2_0}; -use opencl3::types::{cl_event, cl_uchar, cl_ulong, CL_BLOCKING}; -use rand::{thread_rng, Fill, RngCore}; -use std::borrow::Borrow; -use std::ffi::c_void; -use std::ptr; -use std::sync::Arc; - -static BINARY_DIR: Dir = include_dir!("./plugins/opencl/resources/bin/"); -static PROGRAM_SOURCE: &str = include_str!("../resources/kaspa-opencl.cl"); - -pub struct OpenCLGPUWorker { - context: Arc, - random: NonceGenEnum, - local_size: usize, - workload: usize, - - heavy_hash: Kernel, - - queue: CommandQueue, - - random_state: Buffer, - final_nonce: Buffer, - final_hash: Buffer<[cl_ulong; 
4]>, - - hash_header: Buffer, - matrix: Buffer, - target: Buffer, - - events: Vec, - experimental_amd: bool, -} - -impl Worker for OpenCLGPUWorker { - fn id(&self) -> String { - let device = Device::new(self.context.default_device()); - device.name().unwrap() - } - - fn load_block_constants(&mut self, hash_header: &[u8; 72], matrix: &[[u16; 64]; 64], target: &[u64; 4]) { - let cl_uchar_matrix = match self.experimental_amd { - true => matrix - .iter() - .flat_map(|row| row.chunks(2).map(|v| ((v[0] << 4) | v[1]) as cl_uchar)) - .collect::>(), - false => matrix.iter().flat_map(|row| row.map(|v| v as cl_uchar)).collect::>(), - }; - self.queue - .enqueue_write_buffer(&mut self.final_nonce, CL_BLOCKING, 0, &[0], &[]) - .map_err(|e| e.to_string()) - .unwrap() - .wait() - .unwrap(); - self.queue - .enqueue_write_buffer(&mut self.hash_header, CL_BLOCKING, 0, hash_header, &[]) - .map_err(|e| e.to_string()) - .unwrap() - .wait() - .unwrap(); - self.queue - .enqueue_write_buffer(&mut self.matrix, CL_BLOCKING, 0, cl_uchar_matrix.as_slice(), &[]) - .map_err(|e| e.to_string()) - .unwrap() - .wait() - .unwrap(); - let copy_target = self - .queue - .enqueue_write_buffer(&mut self.target, CL_BLOCKING, 0, target, &[]) - .map_err(|e| e.to_string()) - .unwrap(); - - self.events = vec![copy_target.get()]; - for event in &self.events { - retain_event(*event).unwrap(); - } - } - - fn calculate_hash(&mut self, _nonces: Option<&Vec>, nonce_mask: u64, nonce_fixed: u64) { - if self.random == NonceGenEnum::Lean { - self.queue - .enqueue_write_buffer(&mut self.random_state, CL_BLOCKING, 0, &[thread_rng().next_u64()], &[]) - .map_err(|e| e.to_string()) - .unwrap() - .wait() - .unwrap(); - } - let random_type: cl_uchar = match self.random { - NonceGenEnum::Lean => 0, - NonceGenEnum::Xoshiro => 1, - }; - let kernel_event = ExecuteKernel::new(&self.heavy_hash) - .set_arg(&(self.local_size as u64)) - .set_arg(&nonce_mask) - .set_arg(&nonce_fixed) - .set_arg(&self.hash_header) - .set_arg(&self.matrix) - .set_arg(&self.target) - .set_arg(&random_type) - .set_arg(&self.random_state) - .set_arg(&self.final_nonce) - .set_arg(&self.final_hash) - .set_global_work_size(self.workload) - .set_event_wait_list(self.events.borrow()) - .enqueue_nd_range(&self.queue) - .map_err(|e| e.to_string()) - .unwrap(); - - kernel_event.wait().unwrap(); - - /*let mut nonces = [0u64; 1]; - let mut hash = [[0u64; 4]]; - self.queue.enqueue_read_buffer(&self.final_nonce, CL_BLOCKING, 0, &mut nonces, &[]).map_err(|e| e.to_string()).unwrap(); - self.queue.enqueue_read_buffer(&self.final_hash, CL_BLOCKING, 0, &mut hash, &[]).map_err(|e| e.to_string()).unwrap(); - log::info!("Hash from kernel: {:?}", hash);*/ - /*for event in &self.events{ - release_event(*event).unwrap(); - } - let event = kernel_event.get(); - self.events = vec!(event); - retain_event(event);*/ - } - - fn sync(&self) -> Result<(), Error> { - wait_for_events(&self.events).map_err(|e| format!("waiting error code {}", e))?; - for event in &self.events { - release_event(*event).unwrap(); - } - Ok(()) - } - - fn get_workload(&self) -> usize { - self.workload as usize - } - - fn copy_output_to(&mut self, nonces: &mut Vec) -> Result<(), Error> { - self.queue - .enqueue_read_buffer(&self.final_nonce, CL_BLOCKING, 0, nonces, &[]) - .map_err(|e| e.to_string()) - .unwrap(); - Ok(()) - } -} - -impl OpenCLGPUWorker { - pub fn new( - device: Device, - workload: f32, - is_absolute: bool, - experimental_amd: bool, - mut use_binary: bool, - random: &NonceGenEnum, - ) -> Result { - let name = - 
device.board_name_amd().unwrap_or_else(|_| device.name().unwrap_or_else(|_| "Unknown Device".into())); - info!("{}: Using OpenCL", name); - let version = device.version().unwrap_or_else(|_| "unkown version".into()); - info!( - "{}: Device supports {} with extensions: {}", - name, - version, - device.extensions().unwrap_or_else(|_| "NA".into()) - ); - - let local_size = device.max_work_group_size().map_err(|e| e.to_string())?; - let chosen_workload = match is_absolute { - true => workload as usize, - false => { - let max_work_group_size = - (local_size * (device.max_compute_units().map_err(|e| e.to_string())? as usize)) as f32; - (workload * max_work_group_size) as usize - } - }; - info!("{}: Chosen workload is {}", name, chosen_workload); - let context = - Arc::new(Context::from_device(&device).unwrap_or_else(|_| panic!("{}::Context::from_device failed", name))); - let context_ref = unsafe { Arc::as_ptr(&context).as_ref().unwrap() }; - - let options = match experimental_amd { - // true => "-D __FORCE_AMD_V_DOT4_U32_U8__=1 ", - true => "-D __FORCE_AMD_V_DOT8_U32_U4__=1 ", - false => "", - }; - - let experimental_amd_use = !matches!( - device.name().unwrap_or_else(|_| "Unknown".into()).to_lowercase().as_str(), - "tahiti" | "ellesmere" | "gfx1010" | "gfx906" | "gfx908" - ); - - let program = match use_binary { - true => { - let mut device_name = device.name().unwrap_or_else(|_| "Unknown".into()).to_lowercase(); - if device_name.contains(':') { - device_name = device_name.split_once(':').expect("We checked for `:`").0.to_string(); - } - info!("{}: Looking for binary for {}", name, device_name); - match BINARY_DIR.get_file(format!("{}_kaspa-opencl.bin", device_name)) { - Some(binary) => { - Program::create_and_build_from_binary(&context, &[binary.contents()], "").unwrap_or_else(|e|{ - //Program::create_and_build_from_binary(&context, &[include_bytes!("../resources/kaspa-opencl-linked.bc")], "").unwrap_or_else(|e|{ - warn!("{}::Program::create_and_build_from_source failed: {}. Reverting to compiling from source", name, e); - use_binary = false; - from_source(&context, &device, options).unwrap_or_else(|e| panic!("{}::Program::create_and_build_from_binary failed: {}", name, e)) - }) - } - None => { - warn!("Binary file not found for {}. 
Reverting to compiling from source.", device_name); - use_binary = false; - from_source(&context, &device, options) - .unwrap_or_else(|e| panic!("{}::Program::create_and_build_from_binary failed: {}", name, e)) - } - } - } - false => from_source(&context, &device, options) - .unwrap_or_else(|e| panic!("{}::Program::create_and_build_from_binary failed: {}", name, e)), - }; - info!("Kernels: {:?}", program.kernel_names()); - let heavy_hash = - Kernel::create(&program, "heavy_hash").unwrap_or_else(|_| panic!("{}::Kernel::create failed", name)); - - let queue = - CommandQueue::create_with_properties(&context, device.id(), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0) - .unwrap_or_else(|_| panic!("{}::CommandQueue::create_with_properties failed", name)); - - let final_nonce = Buffer::::create(context_ref, CL_MEM_READ_WRITE, 1, ptr::null_mut()) - .expect("Buffer allocation failed"); - let final_hash = Buffer::<[cl_ulong; 4]>::create(context_ref, CL_MEM_WRITE_ONLY, 1, ptr::null_mut()) - .expect("Buffer allocation failed"); - - let hash_header = Buffer::::create(context_ref, CL_MEM_READ_ONLY, 72, ptr::null_mut()) - .expect("Buffer allocation failed"); - let matrix = Buffer::::create(context_ref, CL_MEM_READ_ONLY, 64 * 64, ptr::null_mut()) - .expect("Buffer allocation failed"); - let target = Buffer::::create(context_ref, CL_MEM_READ_ONLY, 4, ptr::null_mut()) - .expect("Buffer allocation failed"); - - let mut seed = [1u64; 4]; - seed.try_fill(&mut rand::thread_rng())?; - - let random_state = match random { - NonceGenEnum::Xoshiro => { - info!("Using xoshiro for nonce-generation"); - let random_state = - Buffer::::create(context_ref, CL_MEM_READ_WRITE, 4 * chosen_workload, ptr::null_mut()) - .expect("Buffer allocation failed"); - let rand_state = - Xoshiro256StarStar::new(&seed).iter_jump_state().take(chosen_workload).collect::>(); - let mut random_state_local: *mut c_void = std::ptr::null_mut::(); - info!("{}: Generating initial seed. This may take some time.", name); - - queue - .enqueue_map_buffer( - &random_state, - CL_BLOCKING, - CL_MAP_WRITE, - 0, - 32 * chosen_workload, - &mut random_state_local, - &[], - ) - .map_err(|e| e.to_string())? - .wait() - .unwrap(); - if random_state_local.is_null() { - return Err(format!("{}::could not load random state vector to memory. 
Consider changing random or lowering workload", name).into()); - } - unsafe { - random_state_local.copy_from(rand_state.as_ptr() as *mut c_void, 32 * chosen_workload); - } - // queue.enqueue_svm_unmap(&random_state,&[]).map_err(|e| e.to_string())?; - queue - .enqueue_unmap_mem_object(random_state.get(), random_state_local, &[]) - .map_err(|e| e.to_string()) - .unwrap() - .wait() - .unwrap(); - info!("{}: Done generating initial seed", name); - random_state - } - NonceGenEnum::Lean => { - info!("Using lean nonce-generation"); - let mut random_state = Buffer::::create(context_ref, CL_MEM_READ_WRITE, 1, ptr::null_mut()) - .expect("Buffer allocation failed"); - queue - .enqueue_write_buffer(&mut random_state, CL_BLOCKING, 0, &[thread_rng().next_u64()], &[]) - .map_err(|e| e.to_string()) - .unwrap() - .wait() - .unwrap(); - random_state - } - }; - Ok(Self { - context, - local_size, - workload: chosen_workload, - random: *random, - heavy_hash, - random_state, - queue, - final_nonce, - final_hash, - hash_header, - matrix, - target, - events: Vec::::new(), - experimental_amd: ((experimental_amd | use_binary) & experimental_amd_use), - }) - } -} - -fn from_source(context: &Context, device: &Device, options: &str) -> Result { - let version = device.version()?; - let v = version.split(' ').nth(1).unwrap(); - let mut compile_options = options.to_string(); - compile_options += CL_MAD_ENABLE; - compile_options += CL_FINITE_MATH_ONLY; - if v == "2.0" || v == "2.1" || v == "3.0" { - info!("Compiling with OpenCl 2"); - compile_options += CL_STD_2_0; - } - compile_options += &match Platform::new(device.platform().unwrap()).name() { - Ok(name) => format!( - "-D{} ", - name.chars() - .map(|c| match c.is_ascii_alphanumeric() { - true => c, - false => '_', - }) - .collect::() - .to_uppercase() - ), - Err(_) => String::new(), - }; - compile_options += &match device.compute_capability_major_nv() { - Ok(major) => format!("-D __COMPUTE_MAJOR__={} ", major), - Err(_) => String::new(), - }; - compile_options += &match device.compute_capability_minor_nv() { - Ok(minor) => format!("-D __COMPUTE_MINOR__={} ", minor), - Err(_) => String::new(), - }; - - // Hack to recreate the AMD flags - compile_options += &match device.pcie_id_amd() { - Ok(_) => { - let device_name = device.name().unwrap_or_else(|_| "Unknown".into()).to_lowercase(); - format!("-D OPENCL_PLATFORM_AMD -D __{}__ ", device_name) - } - Err(_) => String::new(), - }; - - info!("Build OpenCL with {}", compile_options); - - Program::create_and_build_from_source(context, PROGRAM_SOURCE, compile_options.as_str()) -} diff --git a/proto/messages.proto b/proto/messages.proto index 89f5007..895a014 100644 --- a/proto/messages.proto +++ b/proto/messages.proto @@ -6,7 +6,7 @@ option go_package = "github.com/karlsen-network/karlsend/protowire"; import "p2p.proto"; import "rpc.proto"; -message KaspadMessage { +message KarlsendMessage { oneof payload { AddressesMessage addresses = 1; BlockMessage block = 2; @@ -139,9 +139,9 @@ message KaspadMessage { } service P2P { - rpc MessageStream (stream KaspadMessage) returns (stream KaspadMessage) {} + rpc MessageStream (stream KarlsendMessage) returns (stream KarlsendMessage) {} } service RPC { - rpc MessageStream (stream KaspadMessage) returns (stream KaspadMessage) {} + rpc MessageStream (stream KarlsendMessage) returns (stream KarlsendMessage) {} } diff --git a/proto/rpc.proto b/proto/rpc.proto index 8a56282..0bf86d3 100644 --- a/proto/rpc.proto +++ b/proto/rpc.proto @@ -1,9 +1,9 @@ // RPC-related types. 
Request messages, response messages, and dependant types. // -// Clients are expected to build RequestMessages and wrap them in KaspadMessage. (see messages.proto) +// Clients are expected to build RequestMessages and wrap them in KarlsendMessage. (see messages.proto) // -// Having received a RequestMessage, (wrapped in a KaspadMessage) the RPC server will respond with a -// ResponseMessage (likewise wrapped in a KaspadMessage) respective to the original RequestMessage. +// Having received a RequestMessage, (wrapped in a KarlsendMessage) the RPC server will respond with a +// ResponseMessage (likewise wrapped in a KarlsendMessage) respective to the original RequestMessage. // // **IMPORTANT:** This API is a work in progress and is subject to break between versions. // @@ -115,7 +115,7 @@ message RpcTransactionOutputVerboseData{ string scriptPublicKeyAddress = 6; } -// GetCurrentNetworkRequestMessage requests the network kaspad is currently running against. +// GetCurrentNetworkRequestMessage requests the network karlsend is currently running against. // // Possible networks are: Mainnet, Testnet, Simnet, Devnet message GetCurrentNetworkRequestMessage{ @@ -150,7 +150,7 @@ message SubmitBlockResponseMessage{ // // See: SubmitBlockRequestMessage message GetBlockTemplateRequestMessage{ - // Which kaspa address should the coinbase block reward transaction pay into + // Which karlsen address should the coinbase block reward transaction pay into string payAddress = 1; string extraData = 2; } @@ -158,9 +158,9 @@ message GetBlockTemplateRequestMessage{ message GetBlockTemplateResponseMessage{ RpcBlock block = 3; - // Whether kaspad thinks that it's synced. - // Callers are discouraged (but not forbidden) from solving blocks when kaspad is not synced. - // That is because when kaspad isn't in sync with the rest of the network there's a high + // Whether karlsend thinks that it's synced. + // Callers are discouraged (but not forbidden) from solving blocks when karlsend is not synced. + // That is because when karlsend isn't in sync with the rest of the network there's a high // chance the block will never be accepted, thus the solving effort would have been wasted. bool isSynced = 2; @@ -185,7 +185,7 @@ message BlockAddedNotificationMessage{ RpcBlock block = 3; } -// GetPeerAddressesRequestMessage requests the list of known kaspad addresses in the +// GetPeerAddressesRequestMessage requests the list of known karlsend addresses in the // current network. (mainnet, testnet, etc.) message GetPeerAddressesRequestMessage{ } @@ -240,7 +240,7 @@ message MempoolEntry{ } // GetConnectedPeerInfoRequestMessage requests information about all the p2p peers -// currently connected to this kaspad. +// currently connected to this karlsend. 
message GetConnectedPeerInfoRequestMessage{ } @@ -256,7 +256,7 @@ message GetConnectedPeerInfoMessage{ // How long did the last ping/pong exchange take int64 lastPingDuration = 3; - // Whether this kaspad initiated the connection + // Whether this karlsend initiated the connection bool isOutbound = 6; int64 timeOffset = 7; string userAgent = 8; @@ -264,15 +264,15 @@ message GetConnectedPeerInfoMessage{ // The protocol version that this peer claims to support uint32 advertisedProtocolVersion = 9; - // The timestamp of when this peer connected to this kaspad + // The timestamp of when this peer connected to this karlsend int64 timeConnected = 10; // Whether this peer is the IBD peer (if IBD is running) bool isIbdPeer = 11; } -// AddPeerRequestMessage adds a peer to kaspad's outgoing connection list. -// This will, in most cases, result in kaspad connecting to said peer. +// AddPeerRequestMessage adds a peer to karlsend's outgoing connection list. +// This will, in most cases, result in karlsend connecting to said peer. message AddPeerRequestMessage{ string address = 1; @@ -346,7 +346,7 @@ message GetSubnetworkResponseMessage{ } // GetVirtualSelectedParentChainFromBlockRequestMessage requests the virtual selected -// parent chain from some startHash to this kaspad's current virtual +// parent chain from some startHash to this karlsend's current virtual message GetVirtualSelectedParentChainFromBlockRequestMessage{ string startHash = 1; } @@ -362,7 +362,7 @@ message GetVirtualSelectedParentChainFromBlockResponseMessage{ } // GetBlocksRequestMessage requests blocks between a certain block lowHash up to this -// kaspad's current virtual. +// karlsend's current virtual. message GetBlocksRequestMessage{ string lowHash = 1; bool includeBlocks = 2; @@ -375,7 +375,7 @@ message GetBlocksResponseMessage{ RPCError error = 1000; } -// GetBlockCountRequestMessage requests the current number of blocks in this kaspad. +// GetBlockCountRequestMessage requests the current number of blocks in this karlsend. // Note that this number may decrease as pruning occurs. message GetBlockCountRequestMessage{ } @@ -387,7 +387,7 @@ message GetBlockCountResponseMessage{ } // GetBlockDagInfoRequestMessage requests general information about the current state -// of this kaspad's DAG. +// of this karlsend's DAG. message GetBlockDagInfoRequestMessage{ } @@ -427,7 +427,7 @@ message FinalityConflictResolvedNotificationMessage{ string finalityBlockHash = 1; } -// ShutDownRequestMessage shuts down this kaspad. +// ShutDownRequestMessage shuts down this karlsend. message ShutDownRequestMessage{ } @@ -451,7 +451,7 @@ message GetHeadersResponseMessage{ // NotifyUtxosChangedRequestMessage registers this connection for utxoChanged notifications // for the given addresses. // -// This call is only available when this kaspad was started with `--utxoindex` +// This call is only available when this karlsend was started with `--utxoindex` // // See: UtxosChangedNotificationMessage message NotifyUtxosChangedRequestMessage { @@ -479,7 +479,7 @@ message UtxosByAddressesEntry { // StopNotifyingUtxosChangedRequestMessage unregisters this connection for utxoChanged notifications // for the given addresses. 
// -// This call is only available when this kaspad was started with `--utxoindex` +// This call is only available when this karlsend was started with `--utxoindex` // // See: UtxosChangedNotificationMessage message StopNotifyingUtxosChangedRequestMessage { @@ -490,9 +490,9 @@ message StopNotifyingUtxosChangedResponseMessage { RPCError error = 1000; } -// GetUtxosByAddressesRequestMessage requests all current UTXOs for the given kaspad addresses +// GetUtxosByAddressesRequestMessage requests all current UTXOs for the given karlsend addresses // -// This call is only available when this kaspad was started with `--utxoindex` +// This call is only available when this karlsend was started with `--utxoindex` message GetUtxosByAddressesRequestMessage { repeated string addresses = 1; } @@ -505,7 +505,7 @@ message GetUtxosByAddressesResponseMessage { // GetBalanceByAddressRequest returns the total balance in unspent transactions towards a given address // -// This call is only available when this kaspad was started with `--utxoindex` +// This call is only available when this karlsend was started with `--utxoindex` message GetBalanceByAddressRequestMessage { string address = 1; } @@ -585,7 +585,7 @@ message VirtualDaaScoreChangedNotificationMessage { // NotifyPruningPointUTXOSetOverrideRequestMessage registers this connection for // pruning point UTXO set override notifications. // -// This call is only available when this kaspad was started with `--utxoindex` +// This call is only available when this karlsend was started with `--utxoindex` // // See: NotifyPruningPointUTXOSetOverrideResponseMessage message NotifyPruningPointUTXOSetOverrideRequestMessage { @@ -606,7 +606,7 @@ message PruningPointUTXOSetOverrideNotificationMessage { // StopNotifyingPruningPointUTXOSetOverrideRequestMessage unregisters this connection for // pruning point UTXO set override notifications. 
// -// This call is only available when this kaspad was started with `--utxoindex` +// This call is only available when this karlsend was started with `--utxoindex` // // See: PruningPointUTXOSetOverrideNotificationMessage message StopNotifyingPruningPointUTXOSetOverrideRequestMessage { diff --git a/run_miner.sh b/run_miner.sh new file mode 100644 index 0000000..95535f1 --- /dev/null +++ b/run_miner.sh @@ -0,0 +1,7 @@ +./target/release/karlsen-miner.exe \ + -s 149.202.82.76 \ + --testnet \ + --mining-address karlsentest:qrkgct0xryfw94fk8s8ke4h69pecdpdvytalkfqj5nc5cuxkl2t07ed7ruwfv \ + #--mining-address karlsentest:qptwccmajlgeeqclehyr7w8uqeyx0jtctd53j4p9s4v0k0v9a6pjy6jlq8q57 \ + #--mining-address karlsentest:qrxuvenk483jj5k5zpwgdqyk27eacsgv9fj3kwu6puj38usnaj9uu55cz0y8q \ + #--cuda-nonce-gen xoshiro diff --git a/run_miner_devnet.sh b/run_miner_devnet.sh new file mode 100644 index 0000000..212e50f --- /dev/null +++ b/run_miner_devnet.sh @@ -0,0 +1,5 @@ +./target/release/karlsen-miner.exe \ + -s 127.0.0.1 \ + --devnet \ + --mining-address karlsendev:qprsjk60rc6qdlngragh9fu2nqzph40de5vz85z55gn2l57q8ufc5a2s2940s \ + --mine-when-not-synced \ \ No newline at end of file diff --git a/run_miner_testnet.sh b/run_miner_testnet.sh new file mode 100644 index 0000000..7dcd56e --- /dev/null +++ b/run_miner_testnet.sh @@ -0,0 +1,6 @@ +./target/release/karlsen-miner.exe \ + -s 127.0.0.1 \ + --testnet \ + --mining-address karlsentest:qrkgct0xryfw94fk8s8ke4h69pecdpdvytalkfqj5nc5cuxkl2t07ed7ruwfv \ + --mine-when-not-synced \ + diff --git a/src/cli.rs b/src/cli.rs index 3257f30..92b4fce 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,113 +1,128 @@ -use clap::Parser; -use log::LevelFilter; - -use crate::Error; - -#[derive(Parser, Debug)] -#[clap(name = "karlsen-miner", version, about = "A Karlsen high performance CPU miner", term_width = 0)] -pub struct Opt { - #[clap(short, long, help = "Enable debug logging level")] - pub debug: bool, - #[clap(short = 'a', long = "mining-address", help = "The Kaspa address for the miner reward")] - pub mining_address: String, - #[clap(short = 's', long = "karlsend-address", default_value = "127.0.0.1", help = "The IP of the karlsend instance")] - pub karlsend_address: String, - - #[clap(long = "devfund-percent", help = "The percentage of blocks to send to the devfund (minimum 0%)", default_value = "0", parse(try_from_str = parse_devfund_percent))] - pub devfund_percent: u16, - - #[clap(short, long, help = "karlsend port [default: Mainnet = 42110, Testnet = 16211]")] - port: Option, - - #[clap(long, help = "Use testnet instead of mainnet [default: false]")] - testnet: bool, - #[clap(short = 't', long = "threads", help = "Amount of CPU miner threads to launch [default: 0]")] - pub num_threads: Option, - #[clap( - long = "mine-when-not-synced", - help = "Mine even when karlsend says it is not synced", - long_help = "Mine even when karlsend says it is not synced, only useful when passing `--allow-submit-block-when-not-synced` to karlsend [default: false]" - )] - pub mine_when_not_synced: bool, - - #[clap(skip)] - pub devfund_address: String, -} - -fn parse_devfund_percent(s: &str) -> Result { - let err = "devfund-percent should be --devfund-percent=XX.YY up to 2 numbers after the dot"; - let mut splited = s.split('.'); - let prefix = splited.next().ok_or(err)?; - // if there's no postfix then it's 0. 
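// A quick sketch of the mapping this parser implements, with the signature shown
// in this diff (the asserts are illustrative only, not part of the patch):
//     assert_eq!(parse_devfund_percent("1"), Ok(100));      // 1%     -> 100 / 10_000
//     assert_eq!(parse_devfund_percent("12.34"), Ok(1234)); // 12.34% -> 1234 / 10_000
//     assert_eq!(parse_devfund_percent("0.5"), Ok(5));      // note: "0.5" reads as 0.05%,
//                                                           // fraction digits are not zero-padded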
- let postfix = splited.next().ok_or(err).unwrap_or("0"); - // error if there's more than a single dot - if splited.next().is_some() { - return Err(err); - }; - // error if there are more than 2 numbers before or after the dot - if prefix.len() > 2 || postfix.len() > 2 { - return Err(err); - } - let postfix: u16 = postfix.parse().map_err(|_| err)?; - let prefix: u16 = prefix.parse().map_err(|_| err)?; - // can't be more than 99.99%, - if prefix >= 100 || postfix >= 100 { - return Err(err); - } - /* - if prefix < 2 { - // Force at least 2 percent - return Ok(200u16); - } - */ - // DevFund is out of 10_000 - Ok(prefix * 100 + postfix) -} - -impl Opt { - pub fn process(&mut self) -> Result<(), Error> { - //self.gpus = None; - if self.karlsend_address.is_empty() { - self.karlsend_address = "127.0.0.1".to_string(); - } - - if !self.karlsend_address.contains("://") { - let port_str = self.port().to_string(); - let (karlsend, port) = match self.karlsend_address.contains(':') { - true => self.karlsend_address.split_once(':').expect("We checked for `:`"), - false => (self.karlsend_address.as_str(), port_str.as_str()), - }; - self.karlsend_address = format!("grpc://{}:{}", karlsend, port); - } - log::info!("karlsend address: {}", self.karlsend_address); - - if self.num_threads.is_none() { - self.num_threads = Some(0); - } - - let miner_network = self.mining_address.split(':').next(); - self.devfund_address = String::from("karlsen:qzrq7v5jhsc5znvtfdg6vxg7dz5x8dqe4wrh90jkdnwehp6vr8uj7csdss2l7"); - let devfund_network = self.devfund_address.split(':').next(); - if miner_network.is_some() && devfund_network.is_some() && miner_network != devfund_network { - self.devfund_percent = 0; - log::info!( - "Mining address ({}) and devfund ({}) are not from the same network. Disabling devfund.", - miner_network.unwrap(), - devfund_network.unwrap() - ) - } - Ok(()) - } - - fn port(&mut self) -> u16 { - *self.port.get_or_insert(if self.testnet { 16211 } else { 42110 }) - } - - pub fn log_level(&self) -> LevelFilter { - if self.debug { - LevelFilter::Debug - } else { - LevelFilter::Info - } - } -} +use clap::Parser; +use log::LevelFilter; + +use crate::Error; + +#[derive(Parser, Debug)] +#[clap(name = "karlsen-miner", version, about = "A Karlsen high-performance CPU/GPU miner", term_width = 0)] +pub struct Opt { + #[clap(short, long, help = "Enable debug logging level")] + pub debug: bool, + + #[clap(short = 'a', long = "mining-address", help = "The Karlsen address for miner rewards")] + pub mining_address: String, + + #[clap( + short = 's', + long = "karlsend-address", + default_value = "127.0.0.1", + help = "Specify the IP, pool, or node address of the Karlsend instance. 
Use stratum+tcp:// for stratum or grpc:// for Karlsend (default: grpc://127.0.0.1)"
+    )]
+    pub karlsend_address: String,
+
+    #[clap(
+        long = "devfund-percent",
+        default_value = "0",
+        help = "Percentage of blocks to send to the devfund (minimum 0%)",
+        parse(try_from_str = parse_devfund_percent)
+    )]
+    pub devfund_percent: u16,
+
+    #[clap(short, long, help = "karlsend port [default: Mainnet = 42110, Testnet = 42210, Devnet = 42610]")]
+    port: Option<u16>,
+
+    #[clap(long, help = "Use testnet instead of mainnet [default: false]")]
+    testnet: bool,
+
+    #[clap(long, help = "Use devnet instead of mainnet [default: false]")]
+    devnet: bool,
+
+    #[clap(short = 't', long = "threads", help = "Number of CPU miner threads to launch [default: 0]")]
+    pub num_threads: Option<u16>,
+
+    #[clap(
+        long = "mine-when-not-synced",
+        help = "Mine even when karlsend is not synced",
+        long_help = "Mine even when karlsend is not synced; useful when passing `--allow-submit-block-when-not-synced` to karlsend [default: false]"
+    )]
+    pub mine_when_not_synced: bool,
+
+    #[clap(skip)]
+    pub devfund_address: String,
+}
+
+fn parse_devfund_percent(s: &str) -> Result<u16, &'static str> {
+    let err = "devfund-percent should be formatted as XX.YY, with up to two digits after the decimal";
+    let mut parts = s.split('.');
+
+    let prefix = parts.next().ok_or(err)?;
+    let postfix = parts.next().unwrap_or("0");
+
+    if parts.next().is_some() || prefix.len() > 2 || postfix.len() > 2 {
+        return Err(err);
+    }
+
+    let prefix: u16 = prefix.parse().map_err(|_| err)?;
+    let postfix: u16 = postfix.parse().map_err(|_| err)?;
+
+    if prefix >= 100 || postfix >= 100 {
+        return Err(err);
+    }
+
+    Ok(prefix * 100 + postfix)
+}
+
+impl Opt {
+    pub fn process(&mut self) -> Result<(), Error> {
+        if self.karlsend_address.is_empty() {
+            self.karlsend_address = "127.0.0.1".to_string();
+        }
+
+        if !self.karlsend_address.contains("://") {
+            let port_str = self.port().to_string();
+            let (karlsend, port) =
+                self.karlsend_address.split_once(':').unwrap_or((self.karlsend_address.as_str(), port_str.as_str()));
+
+            self.karlsend_address = format!("grpc://{}:{}", karlsend, port);
+        }
+        log::info!("karlsend address: {}", self.karlsend_address);
+
+        self.num_threads.get_or_insert(0);
+
+        let miner_network = self.mining_address.split(':').next();
+        self.devfund_address = "karlsen:qzrq7v5jhsc5znvtfdg6vxg7dz5x8dqe4wrh90jkdnwehp6vr8uj7csdss2l7".to_string();
+        let devfund_network = self.devfund_address.split(':').next();
+
+        if let (Some(miner_net), Some(devfund_net)) = (miner_network, devfund_network) {
+            if miner_net != devfund_net {
+                self.devfund_percent = 0;
+                log::info!(
+                    "Mining address ({}) and devfund ({}) are not from the same network. Disabling devfund.",
+                    miner_net,
+                    devfund_net
+                );
+            }
+        }
+        Ok(())
+    }
+
+    fn port(&mut self) -> u16 {
+        *self.port.get_or_insert({
+            if self.testnet {
+                42210
+            } else if self.devnet {
+                42610
+            } else {
+                42110
+            }
+        })
+    }
+
+    pub fn log_level(&self) -> LevelFilter {
+        if self.debug {
+            LevelFilter::Debug
+        } else {
+            LevelFilter::Info
+        }
+    }
+}
diff --git a/src/client/grpc.rs b/src/client/grpc.rs
index 11bbac0..5098901 100644
--- a/src/client/grpc.rs
+++ b/src/client/grpc.rs
@@ -1,10 +1,10 @@
 use crate::client::Client;
 use crate::pow::BlockSeed;
 use crate::pow::BlockSeed::{FullBlock, PartialBlock};
-use crate::proto::kaspad_message::Payload;
+use crate::proto::karlsend_message::Payload;
 use crate::proto::rpc_client::RpcClient;
 use crate::proto::{
-    GetBlockTemplateRequestMessage, GetInfoRequestMessage, KaspadMessage, NotifyBlockAddedRequestMessage,
+    GetBlockTemplateRequestMessage, GetInfoRequestMessage, KarlsendMessage, NotifyBlockAddedRequestMessage,
     NotifyNewBlockTemplateRequestMessage,
 };
 use crate::{miner::MinerManager, Error};
@@ -23,13 +23,13 @@ use tonic::{transport::Channel as TonicChannel, Streaming};
 static EXTRA_DATA: &str = concat!(env!("CARGO_PKG_VERSION"), "/", env!("PACKAGE_COMPILE_TIME"));
 static VERSION_UPDATE: &str = "0.11.15";
 
-type BlockHandle = JoinHandle<Result<(), PollSendError<KaspadMessage>>>;
+type BlockHandle = JoinHandle<Result<(), PollSendError<KarlsendMessage>>>;
 
 #[allow(dead_code)]
-pub struct KaspadHandler {
+pub struct KarlsendHandler {
     client: RpcClient<TonicChannel>,
-    pub send_channel: Sender<KaspadMessage>,
-    stream: Streaming<KaspadMessage>,
+    pub send_channel: Sender<KarlsendMessage>,
+    stream: Streaming<KarlsendMessage>,
     miner_address: String,
     mine_when_not_synced: bool,
     devfund_address: Option<String>,
@@ -41,7 +41,7 @@ pub struct KaspadHandler {
 }
 
 #[async_trait(?Send)]
-impl Client for KaspadHandler {
+impl Client for KarlsendHandler {
     fn add_devfund(&mut self, address: String, percent: u16) {
         self.devfund_address = Some(address);
         self.devfund_percent = percent;
@@ -56,7 +56,7 @@ impl Client for KaspadHandler {
         while let Some(msg) = self.stream.message().await? {
             match msg.payload {
                 Some(payload) => self.handle_message(payload, miner).await?,
-                None => warn!("kaspad message payload is empty"),
+                None => warn!("karlsend message payload is empty"),
             }
         }
         Ok(())
@@ -67,7 +67,7 @@ impl Client for KaspadHandler {
     }
 }
 
-impl KaspadHandler {
+impl KarlsendHandler {
     pub async fn connect<D>(
         address: D,
         miner_address: String,
@@ -98,15 +98,15 @@ impl KaspadHandler {
         }))
     }
 
-    fn create_block_channel(send_channel: Sender<KaspadMessage>) -> (Sender<BlockSeed>, BlockHandle) {
-        // KaspadMessage::submit_block(block)
+    fn create_block_channel(send_channel: Sender<KarlsendMessage>) -> (Sender<BlockSeed>, BlockHandle) {
+        // KarlsendMessage::submit_block(block)
         let (send, recv) = mpsc::channel::<BlockSeed>(1);
         (
             send,
             tokio::spawn(async move {
                 ReceiverStream::new(recv)
                     .map(|block_seed| match block_seed {
-                        FullBlock(block) => KaspadMessage::submit_block(*block),
+                        FullBlock(block) => KarlsendMessage::submit_block(*block),
                         PartialBlock { ..
} => unreachable!("All blocks sent here should have arrived from here"), }) .map(Ok) @@ -116,11 +116,11 @@ impl KaspadHandler { ) } - async fn client_send(&self, msg: impl Into) -> Result<(), SendError> { + async fn client_send(&self, msg: impl Into) -> Result<(), SendError> { self.send_channel.send(msg.into()).await } - async fn client_get_block_template(&mut self) -> Result<(), SendError> { + async fn client_get_block_template(&mut self) -> Result<(), SendError> { let pay_address = match &self.devfund_address { Some(devfund_address) if self.block_template_ctr.load(Ordering::SeqCst) <= self.devfund_percent => { devfund_address.clone() @@ -158,10 +158,10 @@ impl KaspadHandler { } } Payload::GetInfoResponse(info) => { - info!("Kaspad version: {}", info.server_version); - let kaspad_version = Version::parse(&info.server_version)?; + info!("Karlsend version: {}", info.server_version); + let karlsend_version = Version::parse(&info.server_version)?; let update_version = Version::parse(VERSION_UPDATE)?; - match kaspad_version >= update_version { + match karlsend_version >= update_version { true => self.client_send(NotifyNewBlockTemplateRequestMessage {}).await?, false => self.client_send(NotifyBlockAddedRequestMessage {}).await?, }; @@ -173,7 +173,7 @@ impl KaspadHandler { Some(e) => error!("Failed registering for new template notifications: {:?}", e), }, Payload::NotifyBlockAddedResponse(res) => match res.error { - None => info!("Registered for block notifications (upgrade your Kaspad for better experience)"), + None => info!("Registered for block notifications (upgrade your Karlsend for better experience)"), Some(e) => error!("Failed registering for block notifications: {:?}", e), }, msg => info!("got unknown msg: {:?}", msg), @@ -182,7 +182,7 @@ impl KaspadHandler { } } -impl Drop for KaspadHandler { +impl Drop for KarlsendHandler { fn drop(&mut self) { self.block_handle.abort(); } diff --git a/src/client/stratum/statum_codec.rs b/src/client/stratum/statum_codec.rs index 81230c4..ee816b6 100644 --- a/src/client/stratum/statum_codec.rs +++ b/src/client/stratum/statum_codec.rs @@ -112,11 +112,11 @@ pub(crate) struct StratumLine { /// An error occurred while encoding or decoding a line. 
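// Note: this hunk reduces the error payloads (String / io::Error) to unit
// values, so callers can still match on the variant but the original message is
// dropped. Illustrative only, using the enum shape from this diff:
//     let err: NewLineJsonCodecError = io::Error::new(io::ErrorKind::Other, "boom").into();
//     assert!(matches!(err, NewLineJsonCodecError::Io(())));   // "boom" is discarded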
#[derive(Debug)] pub(crate) enum NewLineJsonCodecError { - JsonParseError(String), + JsonParseError(()), JsonEncodeError, LineSplitError, LineEncodeError, - Io(io::Error), + Io(()), } impl fmt::Display for NewLineJsonCodecError { @@ -125,15 +125,15 @@ impl fmt::Display for NewLineJsonCodecError { } } impl From for NewLineJsonCodecError { - fn from(e: io::Error) -> NewLineJsonCodecError { - NewLineJsonCodecError::Io(e) + fn from(_: io::Error) -> NewLineJsonCodecError { + NewLineJsonCodecError::Io(()) } } impl std::error::Error for NewLineJsonCodecError {} impl From<(String, String)> for NewLineJsonCodecError { - fn from(e: (String, String)) -> Self { - NewLineJsonCodecError::JsonParseError(format!("{}: {}", e.0, e.1)) + fn from(_: (String, String)) -> Self { + NewLineJsonCodecError::JsonParseError(()) } } diff --git a/src/kaspad_messages.rs b/src/karlsend_messages.rs similarity index 54% rename from src/kaspad_messages.rs rename to src/karlsend_messages.rs index 46670d0..d7d8a2e 100644 --- a/src/kaspad_messages.rs +++ b/src/karlsend_messages.rs @@ -1,5 +1,5 @@ use crate::proto::{ - kaspad_message::Payload, GetBlockTemplateRequestMessage, GetInfoRequestMessage, KaspadMessage, + karlsend_message::Payload, GetBlockTemplateRequestMessage, GetInfoRequestMessage, KarlsendMessage, NotifyBlockAddedRequestMessage, NotifyNewBlockTemplateRequestMessage, RpcBlock, SubmitBlockRequestMessage, }; use crate::{ @@ -7,19 +7,19 @@ use crate::{ Hash, }; -impl KaspadMessage { +impl KarlsendMessage { #[inline(always)] pub fn get_info_request() -> Self { - KaspadMessage { payload: Some(Payload::GetInfoRequest(GetInfoRequestMessage {})) } + KarlsendMessage { payload: Some(Payload::GetInfoRequest(GetInfoRequestMessage {})) } } #[inline(always)] pub fn notify_block_added() -> Self { - KaspadMessage { payload: Some(Payload::NotifyBlockAddedRequest(NotifyBlockAddedRequestMessage {})) } + KarlsendMessage { payload: Some(Payload::NotifyBlockAddedRequest(NotifyBlockAddedRequestMessage {})) } } #[inline(always)] pub fn submit_block(block: RpcBlock) -> Self { - KaspadMessage { + KarlsendMessage { payload: Some(Payload::SubmitBlockRequest(SubmitBlockRequestMessage { block: Some(block), allow_non_daa_blocks: false, @@ -28,26 +28,26 @@ impl KaspadMessage { } } -impl From for KaspadMessage { +impl From for KarlsendMessage { fn from(a: GetInfoRequestMessage) -> Self { - KaspadMessage { payload: Some(Payload::GetInfoRequest(a)) } + KarlsendMessage { payload: Some(Payload::GetInfoRequest(a)) } } } -impl From for KaspadMessage { +impl From for KarlsendMessage { fn from(a: NotifyBlockAddedRequestMessage) -> Self { - KaspadMessage { payload: Some(Payload::NotifyBlockAddedRequest(a)) } + KarlsendMessage { payload: Some(Payload::NotifyBlockAddedRequest(a)) } } } -impl From for KaspadMessage { +impl From for KarlsendMessage { fn from(a: GetBlockTemplateRequestMessage) -> Self { - KaspadMessage { payload: Some(Payload::GetBlockTemplateRequest(a)) } + KarlsendMessage { payload: Some(Payload::GetBlockTemplateRequest(a)) } } } -impl From for KaspadMessage { +impl From for KarlsendMessage { fn from(a: NotifyNewBlockTemplateRequestMessage) -> Self { - KaspadMessage { payload: Some(Payload::NotifyNewBlockTemplateRequest(a)) } + KarlsendMessage { payload: Some(Payload::NotifyNewBlockTemplateRequest(a)) } } } diff --git a/src/main.rs b/src/main.rs index f44a965..6e60842 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,169 +1,168 @@ -#![cfg_attr(all(test, feature = "bench"), feature(test))] - -use std::env::consts::DLL_EXTENSION; -use 
std::env::current_exe; -use std::error::Error as StdError; -use std::ffi::OsStr; - -use clap::{App, FromArgMatches, IntoApp}; -use karlsen_miner::PluginManager; -use log::{error, info}; -use rand::{thread_rng, RngCore}; -use std::fs; -use std::sync::atomic::AtomicU16; -use std::sync::Arc; -use std::thread::sleep; -use std::time::Duration; - -use crate::cli::Opt; -use crate::client::grpc::KaspadHandler; -use crate::client::stratum::StratumHandler; -use crate::client::Client; -use crate::miner::MinerManager; -use crate::target::Uint256; - -mod cli; -mod client; -mod kaspad_messages; -mod miner; -mod pow; -mod target; -mod watch; - -//remove the opencl plugin support for the moment -//const WHITELIST: [&str; 4] = ["libkaspacuda", "libkaspaopencl", "kaspacuda", "kaspaopencl"]; -const WHITELIST: [&str; 2] = ["libkaspacuda", "kaspacuda"]; - -pub mod proto { - #![allow(clippy::derive_partial_eq_without_eq)] - tonic::include_proto!("protowire"); - // include!("protowire.rs"); // FIXME: https://github.com/intellij-rust/intellij-rust/issues/6579 -} - -pub type Error = Box; - -type Hash = Uint256; - -#[cfg(target_os = "windows")] -fn adjust_console() -> Result<(), Error> { - let console = win32console::console::WinConsole::input(); - let mut mode = console.get_mode()?; - mode = (mode & !win32console::console::ConsoleMode::ENABLE_QUICK_EDIT_MODE) - | win32console::console::ConsoleMode::ENABLE_EXTENDED_FLAGS; - console.set_mode(mode)?; - Ok(()) -} - -fn filter_plugins(dirname: &str) -> Vec { - match fs::read_dir(dirname) { - Ok(readdir) => readdir - .map(|entry| entry.unwrap().path()) - .filter(|fname| { - fname.is_file() - && fname.extension().is_some() - && fname.extension().and_then(OsStr::to_str).unwrap_or_default().starts_with(DLL_EXTENSION) - }) - .filter(|fname| WHITELIST.iter().any(|lib| *lib == fname.file_stem().and_then(OsStr::to_str).unwrap())) - .map(|path| path.to_str().unwrap().to_string()) - .collect::>(), - _ => Vec::::new(), - } -} - -async fn get_client( - kaspad_address: String, - mining_address: String, - mine_when_not_synced: bool, - block_template_ctr: Arc, -) -> Result, Error> { - if kaspad_address.starts_with("stratum+tcp://") { - let (_schema, address) = kaspad_address.split_once("://").unwrap(); - Ok(StratumHandler::connect( - address.to_string().clone(), - mining_address.clone(), - mine_when_not_synced, - Some(block_template_ctr.clone()), - ) - .await?) - } else if kaspad_address.starts_with("grpc://") { - Ok(KaspadHandler::connect( - kaspad_address.clone(), - mining_address.clone(), - mine_when_not_synced, - Some(block_template_ctr.clone()), - ) - .await?) - } else { - Err("Did not recognize pool/grpc address schema".into()) - } -} - -async fn client_main( - opt: &Opt, - block_template_ctr: Arc, - plugin_manager: &PluginManager, -) -> Result<(), Error> { - let mut client = get_client( - opt.karlsend_address.clone(), - opt.mining_address.clone(), - opt.mine_when_not_synced, - block_template_ctr.clone(), - ) - .await?; - - if opt.devfund_percent > 0 { - client.add_devfund(opt.devfund_address.clone(), opt.devfund_percent); - } - client.register().await?; - let mut miner_manager = MinerManager::new(client.get_block_channel(), opt.num_threads, plugin_manager); - client.listen(&mut miner_manager).await?; - drop(miner_manager); - Ok(()) -} - -#[tokio::main] -async fn main() -> Result<(), Error> { - #[cfg(target_os = "windows")] - adjust_console().unwrap_or_else(|e| { - eprintln!("WARNING: Failed to protect console ({}). 
Any selection in console will freeze the miner.", e) - }); - let mut path = current_exe().unwrap_or_default(); - path.pop(); // Getting the parent directory - let plugins = filter_plugins(path.to_str().unwrap_or(".")); - let (app, mut plugin_manager): (App, PluginManager) = karlsen_miner::load_plugins(Opt::into_app(), &plugins)?; - - let matches = app.get_matches(); - - let worker_count = plugin_manager.process_options(&matches)?; - let mut opt: Opt = Opt::from_arg_matches(&matches)?; - opt.process()?; - env_logger::builder().filter_level(opt.log_level()).parse_default_env().init(); - info!("================================================================================="); - info!(" karlsen-miner GPU {}", env!("CARGO_PKG_VERSION")); - info!(" Mining for: {}", opt.mining_address); - info!("================================================================================="); - info!("Found plugins: {:?}", plugins); - info!("Plugins found {} workers", worker_count); - if worker_count == 0 && opt.num_threads.unwrap_or(0) == 0 { - error!("No workers specified"); - return Err("No workers specified".into()); - } - - let block_template_ctr = Arc::new(AtomicU16::new((thread_rng().next_u64() % 10_000u64) as u16)); - if opt.devfund_percent > 0 { - info!( - "devfund enabled, mining {}.{}% of the time to devfund address: {} ", - opt.devfund_percent / 100, - opt.devfund_percent % 100, - opt.devfund_address - ); - } - loop { - match client_main(&opt, block_template_ctr.clone(), &plugin_manager).await { - Ok(_) => info!("Client closed gracefully"), - Err(e) => error!("Client closed with error {:?}", e), - } - info!("Client closed, reconnecting"); - sleep(Duration::from_millis(100)); - } -} +#![cfg_attr(all(test, feature = "bench"), feature(test))] + +use std::env::consts::DLL_EXTENSION; +use std::env::current_exe; +use std::error::Error as StdError; +use std::ffi::OsStr; + +use clap::{App, FromArgMatches, IntoApp}; +use karlsen_miner::PluginManager; +use log::{error, info}; +use rand::{thread_rng, RngCore}; +use std::fs; +use std::sync::atomic::AtomicU16; +use std::sync::Arc; +use std::thread::sleep; +use std::time::Duration; + +use crate::cli::Opt; +use crate::client::grpc::KarlsendHandler; +use crate::client::stratum::StratumHandler; +use crate::client::Client; +use crate::miner::MinerManager; +use crate::target::Uint256; + +mod cli; +mod client; +mod karlsend_messages; +mod miner; +mod pow; +mod target; +mod watch; + +//remove the opencl plugin support for the moment +const WHITELIST: [&str; 2] = ["libkarlsencuda", "karlsencuda"]; + +pub mod proto { + #![allow(clippy::derive_partial_eq_without_eq)] + tonic::include_proto!("protowire"); + // include!("protowire.rs"); // FIXME: https://github.com/intellij-rust/intellij-rust/issues/6579 +} + +pub type Error = Box; + +type Hash = Uint256; + +#[cfg(target_os = "windows")] +fn adjust_console() -> Result<(), Error> { + let console = win32console::console::WinConsole::input(); + let mut mode = console.get_mode()?; + mode = (mode & !win32console::console::ConsoleMode::ENABLE_QUICK_EDIT_MODE) + | win32console::console::ConsoleMode::ENABLE_EXTENDED_FLAGS; + console.set_mode(mode)?; + Ok(()) +} + +fn filter_plugins(dirname: &str) -> Vec { + match fs::read_dir(dirname) { + Ok(readdir) => readdir + .map(|entry| entry.unwrap().path()) + .filter(|fname| { + fname.is_file() + && fname.extension().is_some() + && fname.extension().and_then(OsStr::to_str).unwrap_or_default().starts_with(DLL_EXTENSION) + }) + .filter(|fname| WHITELIST.iter().any(|lib| *lib == 
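// Plugin discovery, by example (illustrative file names, not part of this patch):
//   Linux:   libkarlsencuda.so  -> stem "libkarlsencuda", extension "so"  (DLL_EXTENSION)
//   Windows: karlsencuda.dll    -> stem "karlsencuda",    extension "dll"
// A file is loaded only when its extension matches the platform's DLL_EXTENSION
// and its stem appears in WHITELIST, as checked on either side of this comment.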
fname.file_stem().and_then(OsStr::to_str).unwrap())) + .map(|path| path.to_str().unwrap().to_string()) + .collect::>(), + _ => Vec::::new(), + } +} + +async fn get_client( + karlsend_address: String, + mining_address: String, + mine_when_not_synced: bool, + block_template_ctr: Arc, +) -> Result, Error> { + if karlsend_address.starts_with("stratum+tcp://") { + let (_schema, address) = karlsend_address.split_once("://").unwrap(); + Ok(StratumHandler::connect( + address.to_string().clone(), + mining_address.clone(), + mine_when_not_synced, + Some(block_template_ctr.clone()), + ) + .await?) + } else if karlsend_address.starts_with("grpc://") { + Ok(KarlsendHandler::connect( + karlsend_address.clone(), + mining_address.clone(), + mine_when_not_synced, + Some(block_template_ctr.clone()), + ) + .await?) + } else { + Err("Did not recognize pool/grpc address schema".into()) + } +} + +async fn client_main( + opt: &Opt, + block_template_ctr: Arc, + plugin_manager: &PluginManager, +) -> Result<(), Error> { + let mut client = get_client( + opt.karlsend_address.clone(), + opt.mining_address.clone(), + opt.mine_when_not_synced, + block_template_ctr.clone(), + ) + .await?; + + if opt.devfund_percent > 0 { + client.add_devfund(opt.devfund_address.clone(), opt.devfund_percent); + } + client.register().await?; + let mut miner_manager = MinerManager::new(client.get_block_channel(), opt.num_threads, plugin_manager); + client.listen(&mut miner_manager).await?; + drop(miner_manager); + Ok(()) +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + #[cfg(target_os = "windows")] + adjust_console().unwrap_or_else(|e| { + eprintln!("WARNING: Failed to protect console ({}). Any selection in console will freeze the miner.", e) + }); + let mut path = current_exe().unwrap_or_default(); + path.pop(); // Getting the parent directory + let plugins = filter_plugins(path.to_str().unwrap_or(".")); + let (app, mut plugin_manager): (App, PluginManager) = karlsen_miner::load_plugins(Opt::into_app(), &plugins)?; + + let matches = app.get_matches(); + + let worker_count = plugin_manager.process_options(&matches)?; + let mut opt: Opt = Opt::from_arg_matches(&matches)?; + opt.process()?; + env_logger::builder().filter_level(opt.log_level()).parse_default_env().init(); + info!("================================================================================="); + info!(" karlsen-miner GPU {}", env!("CARGO_PKG_VERSION")); + info!(" Mining for: {}", opt.mining_address); + info!("================================================================================="); + info!("Found plugins: {:?}", plugins); + info!("Plugins found {} workers", worker_count); + if worker_count == 0 && opt.num_threads.unwrap_or(0) == 0 { + error!("No workers specified"); + return Err("No workers specified".into()); + } + + let block_template_ctr = Arc::new(AtomicU16::new((thread_rng().next_u64() % 10_000u64) as u16)); + if opt.devfund_percent > 0 { + info!( + "devfund enabled, mining {}.{}% of the time to devfund address: {} ", + opt.devfund_percent / 100, + opt.devfund_percent % 100, + opt.devfund_address + ); + } + loop { + match client_main(&opt, block_template_ctr.clone(), &plugin_manager).await { + Ok(_) => info!("Client closed gracefully"), + Err(e) => error!("Client closed with error {:?}", e), + } + info!("Client closed, reconnecting"); + sleep(Duration::from_millis(100)); + } +} diff --git a/src/miner.rs b/src/miner.rs index 971f8e9..786960d 100644 --- a/src/miner.rs +++ b/src/miner.rs @@ -1,508 +1,476 @@ -use std::collections::HashMap; -use 
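// Note the cfg gates below: the removed code tested target_os = "mac_os", which
// matches no real target triple, so the unix freeze handler was never compiled
// on macOS; the replacement hunks use the correct value, target_os = "macos".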
std::num::Wrapping; -use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; -use std::sync::{Arc, Mutex}; -use std::thread::sleep; -use std::time::Duration; - -use crate::{pow, watch, Error}; -use log::{error, info, warn}; -use rand::{thread_rng, RngCore}; -use tokio::sync::mpsc::Sender; -use tokio::task::{self, JoinHandle}; -use tokio::time::MissedTickBehavior; - -use crate::pow::BlockSeed; -use karlsen_miner::{PluginManager, WorkerSpec}; - -type MinerHandler = std::thread::JoinHandle>; - -#[cfg(any(target_os = "linux", target_os = "mac_os"))] -extern "C" fn signal_panic(_signal: nix::libc::c_int) { - panic!("Forced shutdown"); -} - -#[cfg(any(target_os = "linux", target_os = "mac_os"))] -fn register_freeze_handler() { - let handler = nix::sys::signal::SigHandler::Handler(signal_panic); - unsafe { - nix::sys::signal::signal(nix::sys::signal::Signal::SIGUSR1, handler).unwrap(); - } -} - -#[cfg(any(target_os = "linux", target_os = "mac_os"))] -fn trigger_freeze_handler(kill_switch: Arc, handle: &MinerHandler) -> std::thread::JoinHandle<()> { - use std::os::unix::thread::JoinHandleExt; - let pthread_handle = handle.as_pthread_t(); - std::thread::spawn(move || { - sleep(Duration::from_millis(1000)); - if kill_switch.load(Ordering::SeqCst) { - match nix::sys::pthread::pthread_kill(pthread_handle, nix::sys::signal::Signal::SIGUSR1) { - Ok(()) => { - info!("Thread killed successfully") - } - Err(e) => { - info!("Error: {:?}", e) - } - } - } - }) -} - -#[cfg(any(target_os = "windows"))] -struct RawHandle(*mut std::ffi::c_void); - -#[cfg(any(target_os = "windows"))] -unsafe impl Send for RawHandle {} - -#[cfg(any(target_os = "windows"))] -fn register_freeze_handler() {} - -#[cfg(target_os = "windows")] -fn trigger_freeze_handler(kill_switch: Arc, handle: &MinerHandler) -> std::thread::JoinHandle<()> { - use std::os::windows::io::AsRawHandle; - let raw_handle = RawHandle(handle.as_raw_handle()); - - std::thread::spawn(move || unsafe { - let ensure_full_move = raw_handle; - sleep(Duration::from_millis(1000)); - if kill_switch.load(Ordering::SeqCst) { - kernel32::TerminateThread(ensure_full_move.0, 0); - } - }) -} - -#[cfg(not(any(target_os = "linux", target_os = "mac_os", target_os = "windows")))] -fn trigger_freeze_handler(kill_switch: Arc, handle: &MinerHandler) { - warn!("Freeze handler is not implemented. Frozen threads are ignored"); -} - -#[cfg(not(any(target_os = "linux", target_os = "mac_os", target_os = "windows")))] -fn register_freeze_handler() { - warn!("Freeze handler is not implemented. 
Frozen threads are ignored"); -} - -#[derive(Clone)] -enum WorkerCommand { - Job(Box), - Close, -} - -#[allow(dead_code)] -pub struct MinerManager { - handles: Vec, - block_channel: watch::Sender>, - send_channel: Sender, - logger_handle: JoinHandle<()>, - is_synced: bool, - hashes_tried: Arc, - hashes_by_worker: Arc>>>, - current_state_id: AtomicUsize, -} - -impl Drop for MinerManager { - fn drop(&mut self) { - info!("Closing miner"); - self.logger_handle.abort(); - match self.block_channel.send(Some(WorkerCommand::Close)) { - Ok(_) => {} - Err(_) => warn!("All workers are already dead"), - } - while !self.handles.is_empty() { - let handle = self.handles.pop().expect("There should be at least one"); - let kill_switch = Arc::new(AtomicBool::new(true)); - trigger_freeze_handler(kill_switch.clone(), &handle); - match handle.join() { - Ok(res) => match res { - Ok(()) => {} - Err(e) => error!("Error when closing Worker: {}", e), - }, - Err(_) => error!("Worker failed to close gracefully"), - }; - kill_switch.fetch_and(false, Ordering::SeqCst); - } - } -} - -pub fn get_num_cpus(n_cpus: Option) -> u16 { - n_cpus.unwrap_or_else(|| { - num_cpus::get_physical().try_into().expect("Doesn't make sense to have more than 65,536 CPU cores") - }) -} - -const LOG_RATE: Duration = Duration::from_secs(10); - -impl MinerManager { - pub fn new(send_channel: Sender, n_cpus: Option, manager: &PluginManager) -> Self { - register_freeze_handler(); - let hashes_tried = Arc::new(AtomicU64::new(0)); - let hashes_by_worker = Arc::new(Mutex::new(HashMap::>::new())); - let (send, recv) = watch::channel(None); - let mut handles = - Self::launch_cpu_threads(send_channel.clone(), Arc::clone(&hashes_tried), recv.clone(), n_cpus) - .collect::>(); - if manager.has_specs() { - handles.append(&mut Self::launch_gpu_threads( - send_channel.clone(), - Arc::clone(&hashes_tried), - recv, - manager, - hashes_by_worker.clone(), - )); - } - Self { - handles, - block_channel: send, - send_channel, - logger_handle: task::spawn(Self::log_hashrate(Arc::clone(&hashes_tried), hashes_by_worker.clone())), - is_synced: true, - hashes_tried, - current_state_id: AtomicUsize::new(0), - hashes_by_worker, - } - } - - fn launch_cpu_threads( - send_channel: Sender, - hashes_tried: Arc, - work_channel: watch::Receiver>, - n_cpus: Option, - ) -> impl Iterator { - let n_cpus = get_num_cpus(n_cpus); - info!("launching: {} cpu miners", n_cpus); - (0..n_cpus) - .map(move |_| Self::launch_cpu_miner(send_channel.clone(), work_channel.clone(), Arc::clone(&hashes_tried))) - } - - fn launch_gpu_threads( - send_channel: Sender, - hashes_tried: Arc, - work_channel: watch::Receiver>, - manager: &PluginManager, - hashes_by_worker: Arc>>>, - ) -> Vec { - let mut vec = Vec::::new(); - let specs = manager.build().unwrap(); - for spec in specs { - let worker_hashes_tried = Arc::new(AtomicU64::new(0)); - hashes_by_worker.lock().unwrap().insert(spec.id(), worker_hashes_tried.clone()); - vec.push(Self::launch_gpu_miner( - send_channel.clone(), - work_channel.clone(), - Arc::clone(&hashes_tried), - spec, - worker_hashes_tried, - )); - } - vec - } - - pub async fn process_block(&mut self, block: Option) -> Result<(), Error> { - let state = match block { - Some(b) => { - self.is_synced = true; - let id = self.current_state_id.fetch_add(1, Ordering::SeqCst); - Some(WorkerCommand::Job(Box::new(pow::State::new(id, b)?))) - } - None => { - if !self.is_synced { - return Ok(()); - } - self.is_synced = false; - warn!("karlsend is not synced, skipping current template"); - None - } - 
}; - - self.block_channel.send(state).map_err(|_e| "Failed sending block to threads")?; - Ok(()) - } - - #[allow(unreachable_code)] - fn launch_gpu_miner( - send_channel: Sender, - mut block_channel: watch::Receiver>, - hashes_tried: Arc, - spec: Box, - worker_hashes_tried: Arc, - ) -> MinerHandler { - std::thread::spawn(move || { - let mut box_ = spec.build(); - let gpu_work = box_.as_mut(); - (|| { - info!("Spawned Thread for GPU {}", gpu_work.id()); - let mut nonces = vec![0u64; 1]; - - let mut state = None; - - loop { - nonces[0] = 0; - if state.is_none() { - state = match block_channel.wait_for_change() { - Ok(cmd) => match cmd { - Some(WorkerCommand::Job(s)) => Some(s), - Some(WorkerCommand::Close) => {return Ok(());} - None => None, - }, - Err(e) => { - info!("{}: GPU thread crashed: {}", gpu_work.id(), e.to_string()); - return Ok(()); - } - }; - } - let state_ref = match &state { - Some(s) => { - s.load_to_gpu(gpu_work); - s - }, - None => continue, - }; - state_ref.pow_gpu(gpu_work); - if let Err(e) = gpu_work.sync() { - warn!("CUDA run ignored: {}", e); - continue - } - - gpu_work.copy_output_to(&mut nonces)?; - if nonces[0] != 0 { - if let Some(block_seed) = state_ref.generate_block_if_pow(nonces[0]) { - match send_channel.blocking_send(block_seed.clone()) { - Ok(()) => block_seed.report_block(), - Err(e) => error!("Failed submitting block: ({})", e.to_string()), - }; - if let BlockSeed::FullBlock(_) = block_seed { - state = None; - } - nonces[0] = 0; - hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); - worker_hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); - continue; - } else { - let hash = state_ref.calculate_pow(nonces[0]); - warn!("Something is wrong in GPU results! Got nonce {}, with hash real {:?} (target: {}*2^196)", nonces[0], hash.0, state_ref.target.0[3]); - break; - } - } - - /* - info!("Output should be: {:02X?}", state_ref.calculate_pow(nonces[0]).to_le_bytes()); - info!("We got: {:02X?} (Nonces: {:02X?})", hashes[0], nonces[0].to_le_bytes()); - assert!(state_ref.calculate_pow(nonces[0]).to_le_bytes() == hashes[0]); - */ - /* - info!("Output should be: {}", state_ref.calculate_pow(nonces[nonces.len()-1]).0[3]); - info!("We got: {} (Nonces: {})", Uint256::from_le_bytes(hashes[nonces.len()-1]).0[3], nonces[nonces.len()-1]); - assert!(state_ref.calculate_pow(nonces[nonces.len()-1]).0[0] == Uint256::from_le_bytes(hashes[nonces.len()-1]).0[0]); - */ - /* - if state_ref.calculate_pow(nonces[0]).0[0] != Uint256::from_le_bytes(hashes[0]).0[0] { - gpu_work.sync()?; - let mut nonce_vec = vec![nonces[0]; 1]; - nonce_vec.append(&mut vec![0u64; gpu_work.workload-1]); - gpu_work.calculate_pow_hash(&state_ref.pow_hash_header, Some(&nonce_vec)); - gpu_work.sync()?; - gpu_work.calculate_matrix_mul(&mut state_ref.matrix.clone().0.as_slice().as_dbuf().unwrap()); - gpu_work.sync()?; - gpu_work.calculate_heavy_hash(); - gpu_work.sync()?; - let mut hashes2 = vec![[0u8; 32]; out_size]; - let mut nonces2= vec![0u64; out_size]; - gpu_work.copy_output_to(&mut hashes2, &mut nonces2); - assert!(state_ref.calculate_pow(nonces[0]).to_le_bytes() == hashes2[0]); - assert!(nonces2[0] == nonces[0]); - assert!(hashes2 == hashes); - assert!(false); - }*/ - - hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); - worker_hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); - - { - if let Some(new_cmd) = block_channel.get_changed()? 
{ - state = match new_cmd { - Some(WorkerCommand::Job(s)) => Some(s), - Some(WorkerCommand::Close) => {return Ok(());} - None => None, - }; - } - } - } - Ok(()) - })() - .map_err(|e: Error| { - error!("{}: GPU thread crashed: {}", gpu_work.id(), e.to_string()); - e - }) - }) - } - - #[allow(unreachable_code)] - fn launch_cpu_miner( - send_channel: Sender, - mut block_channel: watch::Receiver>, - hashes_tried: Arc, - ) -> MinerHandler { - let mut nonce = Wrapping(thread_rng().next_u64()); - let mut mask = Wrapping(0); - let mut fixed = Wrapping(0); - std::thread::spawn(move || { - (|| { - let mut state = None; - - loop { - if state.is_none() { - state = match block_channel.wait_for_change() { - Ok(cmd) => match cmd { - Some(WorkerCommand::Job(s)) => Some(s), - Some(WorkerCommand::Close) => { - return Ok(()); - } - None => None, - }, - Err(e) => { - info!("CPU thread crashed: {}", e.to_string()); - return Ok(()); - } - }; - if let Some(s) = &state { - mask = Wrapping(s.nonce_mask); - fixed = Wrapping(s.nonce_fixed); - } - } - let state_ref = match state.as_mut() { - Some(s) => s, - None => continue, - }; - nonce = (nonce & mask) | fixed; - - if let Some(block_seed) = state_ref.generate_block_if_pow(nonce.0) { - match send_channel.blocking_send(block_seed.clone()) { - Ok(()) => block_seed.report_block(), - Err(e) => error!("Failed submitting block: ({})", e.to_string()), - }; - if let BlockSeed::FullBlock(_) = block_seed { - state = None; - } - } - nonce += Wrapping(1); - // TODO: Is this really necessary? can we just use Relaxed? - hashes_tried.fetch_add(1, Ordering::AcqRel); - - if nonce.0 % 128 == 0 { - if let Some(new_cmd) = block_channel.get_changed()? { - state = match new_cmd { - Some(WorkerCommand::Job(s)) => Some(s), - Some(WorkerCommand::Close) => { - return Ok(()); - } - None => None, - }; - } - } - } - Ok(()) - })() - .map_err(|e: Error| { - error!("CPU thread crashed: {}", e.to_string()); - e - }) - }) - } - - async fn log_hashrate(hashes_tried: Arc, hashes_by_worker: Arc>>>) { - let mut ticker = tokio::time::interval(LOG_RATE); - ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); - let mut last_instant = ticker.tick().await; - loop { - let now = ticker.tick().await; - let duration = (now - last_instant).as_secs_f64(); - Self::log_single_hashrate( - &hashes_tried, - "Current hashrate is".into(), - "Workers stalled or crashed. 
Considered reducing workload and check that your node is synced", - duration, - false, - ); - for (device, rate) in &*hashes_by_worker.lock().unwrap() { - Self::log_single_hashrate(rate, format!("Device {}:", device), "0 hash/s", duration, true); - } - last_instant = now; - } - } - - fn log_single_hashrate( - counter: &Arc, - prefix: String, - warn_message: &str, - duration: f64, - keep_prefix: bool, - ) { - let hashes = counter.swap(0, Ordering::AcqRel); - let rate = (hashes as f64) / duration; - if hashes == 0 { - match keep_prefix { - true => warn!("{}{}", prefix, warn_message), - false => warn!("{}", warn_message), - }; - } else if hashes != 0 { - let (rate, suffix) = Self::hash_suffix(rate); - info!("{} {:.2} {}", prefix, rate, suffix); - } - } - - #[inline] - fn hash_suffix(n: f64) -> (f64, &'static str) { - match n { - n if n < 1_000.0 => (n, "hash/s"), - n if n < 1_000_000.0 => (n / 1_000.0, "Khash/s"), - n if n < 1_000_000_000.0 => (n / 1_000_000.0, "Mhash/s"), - n if n < 1_000_000_000_000.0 => (n / 1_000_000_000.0, "Ghash/s"), - n if n < 1_000_000_000_000_000.0 => (n / 1_000_000_000_000.0, "Thash/s"), - _ => (n, "hash/s"), - } - } -} - -#[cfg(all(test, feature = "bench"))] -mod benches { - extern crate test; - - use self::test::{black_box, Bencher}; - use crate::pow::State; - use crate::proto::{RpcBlock, RpcBlockHeader}; - use rand::{thread_rng, RngCore}; - - #[bench] - pub fn bench_mining(bh: &mut Bencher) { - let mut state = State::new( - 0, - RpcBlock { - header: Some(RpcBlockHeader { - version: 1, - parents: vec![], - hash_merkle_root: "23618af45051560529440541e7dc56be27676d278b1e00324b048d410a19d764".to_string(), - accepted_id_merkle_root: "947d1a10378d6478b6957a0ed71866812dee33684968031b1cace4908c149d94" - .to_string(), - utxo_commitment: "ec5e8fc0bc0c637004cee262cef12e7cf6d9cd7772513dbd466176a07ab7c4f4".to_string(), - timestamp: 654654353, - bits: 0x1e7fffff, - nonce: 0, - daa_score: 654456, - blue_work: "d8e28a03234786".to_string(), - pruning_point: "be4c415d378f9113fabd3c09fcc84ddb6a00f900c87cb6a1186993ddc3014e2d".to_string(), - blue_score: 1164419, - }), - transactions: vec![], - verbose_data: None, - }, - ) - .unwrap(); - nonce = thread_rng().next_u64(); - bh.iter(|| { - for _ in 0..100 { - black_box(state.check_pow(nonce)); - nonce += 1; - } - }); - } -} +use std::collections::HashMap; +use std::num::Wrapping; +use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::sleep; +use std::time::Duration; + +use crate::{pow, watch, Error}; +use log::{error, info, warn}; +use rand::{thread_rng, RngCore}; +use tokio::sync::mpsc::Sender; +use tokio::task::{self, JoinHandle}; +use tokio::time::MissedTickBehavior; + +use crate::pow::BlockSeed; +use karlsen_miner::{PluginManager, WorkerSpec}; + +type MinerHandler = std::thread::JoinHandle>; + +#[cfg(any(target_os = "linux", target_os = "macos"))] +extern "C" fn signal_panic(_signal: nix::libc::c_int) { + panic!("Forced shutdown"); +} + +#[cfg(any(target_os = "linux", target_os = "macos"))] +fn register_freeze_handler() { + let handler = nix::sys::signal::SigHandler::Handler(signal_panic); + unsafe { + nix::sys::signal::signal(nix::sys::signal::Signal::SIGUSR1, handler).unwrap(); + } +} + +#[cfg(any(target_os = "linux", target_os = "macos"))] +fn trigger_freeze_handler(kill_switch: Arc, handle: &MinerHandler) -> std::thread::JoinHandle<()> { + use std::os::unix::thread::JoinHandleExt; + let pthread_handle = handle.as_pthread_t(); + std::thread::spawn(move || { + 
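        // Watchdog sketch: Drop joins the worker and clears kill_switch afterwards;
        // if the flag is still set when the 1s grace period below expires, the
        // watchdog sends SIGUSR1 to the captured pthread handle and the signal
        // handler panics the frozen thread so join() can return.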
sleep(Duration::from_millis(1000)); + if kill_switch.load(Ordering::SeqCst) { + match nix::sys::pthread::pthread_kill(pthread_handle, nix::sys::signal::Signal::SIGUSR1) { + Ok(()) => { + info!("Thread killed successfully") + } + Err(e) => { + info!("Error: {:?}", e) + } + } + } + }) +} + +#[cfg(target_os = "windows")] +struct RawHandle(*mut std::ffi::c_void); + +#[cfg(target_os = "windows")] +unsafe impl Send for RawHandle {} + +#[cfg(target_os = "windows")] +fn register_freeze_handler() {} + +#[cfg(target_os = "windows")] +fn trigger_freeze_handler(kill_switch: Arc, handle: &MinerHandler) -> std::thread::JoinHandle<()> { + use std::os::windows::io::AsRawHandle; + let raw_handle = RawHandle(handle.as_raw_handle()); + + std::thread::spawn(move || unsafe { + let ensure_full_move = raw_handle; + sleep(Duration::from_millis(1000)); + if kill_switch.load(Ordering::SeqCst) { + kernel32::TerminateThread(ensure_full_move.0, 0); + } + }) +} + +#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))] +fn trigger_freeze_handler(kill_switch: Arc, handle: &MinerHandler) { + warn!("Freeze handler is not implemented. Frozen threads are ignored"); +} + +#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))] +fn register_freeze_handler() { + warn!("Freeze handler is not implemented. Frozen threads are ignored"); +} + +#[derive(Clone)] +enum WorkerCommand { + Job(Box), + Close, +} + +#[allow(dead_code)] +pub struct MinerManager { + handles: Vec, + block_channel: watch::Sender>, + send_channel: Sender, + logger_handle: JoinHandle<()>, + is_synced: bool, + hashes_tried: Arc, + hashes_by_worker: Arc>>>, + current_state_id: AtomicUsize, +} + +impl Drop for MinerManager { + fn drop(&mut self) { + info!("Closing miner"); + self.logger_handle.abort(); + match self.block_channel.send(Some(WorkerCommand::Close)) { + Ok(_) => {} + Err(_) => warn!("All workers are already dead"), + } + while let Some(handle) = self.handles.pop() { + let kill_switch = Arc::new(AtomicBool::new(true)); + trigger_freeze_handler(kill_switch.clone(), &handle); + match handle.join() { + Ok(res) => match res { + Ok(()) => {} + Err(e) => error!("Error when closing Worker: {}", e), + }, + Err(_) => error!("Worker failed to close gracefully"), + }; + kill_switch.fetch_and(false, Ordering::SeqCst); + } + } +} + +pub fn get_num_cpus(n_cpus: Option) -> u16 { + n_cpus.unwrap_or_else(|| { + num_cpus::get_physical().try_into().expect("Doesn't make sense to have more than 65,536 CPU cores") + }) +} + +const LOG_RATE: Duration = Duration::from_secs(10); + +impl MinerManager { + pub fn new(send_channel: Sender, n_cpus: Option, manager: &PluginManager) -> Self { + register_freeze_handler(); + let hashes_tried = Arc::new(AtomicU64::new(0)); + let hashes_by_worker = Arc::new(Mutex::new(HashMap::>::new())); + let (send, recv) = watch::channel(None); + let mut handles = + Self::launch_cpu_threads(send_channel.clone(), Arc::clone(&hashes_tried), recv.clone(), n_cpus) + .collect::>(); + if manager.has_specs() { + handles.append(&mut Self::launch_gpu_threads( + send_channel.clone(), + Arc::clone(&hashes_tried), + recv, + manager, + hashes_by_worker.clone(), + )); + } + Self { + handles, + block_channel: send, + send_channel, + logger_handle: task::spawn(Self::log_hashrate(Arc::clone(&hashes_tried), hashes_by_worker.clone())), + is_synced: true, + hashes_tried, + current_state_id: AtomicUsize::new(0), + hashes_by_worker, + } + } + + fn launch_cpu_threads( + send_channel: Sender, + hashes_tried: Arc, + 
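// Hash accounting is two-level: the global hashes_tried counter (swapped back to
// zero on every 10s log tick) plus one per-device counter in hashes_by_worker,
// so the log line can report both a total rate and a per-GPU breakdown.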
work_channel: watch::Receiver>, + n_cpus: Option, + ) -> impl Iterator { + let n_cpus = get_num_cpus(n_cpus); + info!("launching: {} cpu miners", n_cpus); + (0..n_cpus) + .map(move |_| Self::launch_cpu_miner(send_channel.clone(), work_channel.clone(), Arc::clone(&hashes_tried))) + } + + fn launch_gpu_threads( + send_channel: Sender, + hashes_tried: Arc, + work_channel: watch::Receiver>, + manager: &PluginManager, + hashes_by_worker: Arc>>>, + ) -> Vec { + let mut vec = Vec::::new(); + let specs = manager.build().unwrap(); + for spec in specs { + let worker_hashes_tried = Arc::new(AtomicU64::new(0)); + hashes_by_worker.lock().unwrap().insert(spec.id(), worker_hashes_tried.clone()); + vec.push(Self::launch_gpu_miner( + send_channel.clone(), + work_channel.clone(), + Arc::clone(&hashes_tried), + spec, + worker_hashes_tried, + )); + } + vec + } + + pub async fn process_block(&mut self, block: Option) -> Result<(), Error> { + let state = match block { + Some(b) => { + self.is_synced = true; + let id = self.current_state_id.fetch_add(1, Ordering::SeqCst); + Some(WorkerCommand::Job(Box::new(pow::State::new(id, b)?))) + } + None => { + if !self.is_synced { + return Ok(()); + } + self.is_synced = false; + warn!("karlsend is not synced, skipping current template"); + None + } + }; + + self.block_channel.send(state).map_err(|_e| "Failed sending block to threads")?; + Ok(()) + } + + #[allow(unreachable_code)] + fn launch_gpu_miner( + send_channel: Sender, + mut block_channel: watch::Receiver>, + hashes_tried: Arc, + spec: Box, + worker_hashes_tried: Arc, + ) -> MinerHandler { + std::thread::spawn(move || { + let mut box_ = spec.build(); + let gpu_work = box_.as_mut(); + (|| { + info!("Spawned Thread for GPU {}", gpu_work.id()); + let mut nonces = vec![0u64; 1]; + + let mut state = None; + + loop { + nonces[0] = 0; + if state.is_none() { + state = match block_channel.wait_for_change() { + Ok(cmd) => match cmd { + Some(WorkerCommand::Job(s)) => Some(s), + Some(WorkerCommand::Close) => {return Ok(());} + None => None, + }, + Err(e) => { + info!("{}: GPU thread crashed: {}", gpu_work.id(), e.to_string()); + return Ok(()); + } + }; + } + let state_ref = match &state { + Some(s) => { + s.load_to_gpu(gpu_work); + s + }, + None => continue, + }; + state_ref.pow_gpu(gpu_work); + if let Err(e) = gpu_work.sync() { + warn!("CUDA run ignored: {}", e); + continue + } + + gpu_work.copy_output_to(&mut nonces)?; + if nonces[0] != 0 { + if let Some(block_seed) = state_ref.generate_block_if_pow(nonces[0]) { + match send_channel.blocking_send(block_seed.clone()) { + Ok(()) => block_seed.report_block(), + Err(e) => error!("Failed submitting block: ({})", e.to_string()), + }; + if let BlockSeed::FullBlock(_) = block_seed { + state = None; + } + nonces[0] = 0; + hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); + worker_hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); + continue; + } else { + let hash = state_ref.calculate_pow(nonces[0]); + warn!("Something is wrong in GPU results! Got nonce {}, with hash real {:?} (target: {}*2^196)", nonces[0], hash.0, state_ref.target.0[3]); + break; + } + } + hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); + worker_hashes_tried.fetch_add(gpu_work.get_workload().try_into().unwrap(), Ordering::AcqRel); + + { + if let Some(new_cmd) = block_channel.get_changed()? 
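// Refresh point: after each GPU batch the worker polls the watch channel for a
// newer command; a long-running kernel therefore only picks up a new template
// (or Close) between launches, never mid-batch.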
{ + state = match new_cmd { + Some(WorkerCommand::Job(s)) => Some(s), + Some(WorkerCommand::Close) => {return Ok(());} + None => None, + }; + } + } + } + Ok(()) + })() + .map_err(|e: Error| { + error!("{}: GPU thread crashed: {}", gpu_work.id(), e.to_string()); + e + }) + }) + } + + #[allow(unreachable_code)] + fn launch_cpu_miner( + send_channel: Sender, + mut block_channel: watch::Receiver>, + hashes_tried: Arc, + ) -> MinerHandler { + let mut nonce = Wrapping(thread_rng().next_u64()); + let mut mask = Wrapping(0); + let mut fixed = Wrapping(0); + std::thread::spawn(move || { + (|| { + let mut state = None; + + loop { + if state.is_none() { + state = match block_channel.wait_for_change() { + Ok(cmd) => match cmd { + Some(WorkerCommand::Job(s)) => Some(s), + Some(WorkerCommand::Close) => { + return Ok(()); + } + None => None, + }, + Err(e) => { + info!("CPU thread crashed: {}", e.to_string()); + return Ok(()); + } + }; + if let Some(s) = &state { + mask = Wrapping(s.nonce_mask); + fixed = Wrapping(s.nonce_fixed); + } + } + let state_ref = match state.as_mut() { + Some(s) => s, + None => continue, + }; + nonce = (nonce & mask) | fixed; + + if let Some(block_seed) = state_ref.generate_block_if_pow(nonce.0) { + match send_channel.blocking_send(block_seed.clone()) { + Ok(()) => block_seed.report_block(), + Err(e) => error!("Failed submitting block: ({})", e.to_string()), + }; + if let BlockSeed::FullBlock(_) = block_seed { + state = None; + } + } + nonce += Wrapping(1); + // TODO: Is this really necessary? can we just use Relaxed? + hashes_tried.fetch_add(1, Ordering::AcqRel); + + if nonce.0 % 128 == 0 { + if let Some(new_cmd) = block_channel.get_changed()? { + state = match new_cmd { + Some(WorkerCommand::Job(s)) => Some(s), + Some(WorkerCommand::Close) => { + return Ok(()); + } + None => None, + }; + } + } + } + Ok(()) + })() + .map_err(|e: Error| { + error!("CPU thread crashed: {}", e.to_string()); + e + }) + }) + } + + async fn log_hashrate(hashes_tried: Arc, hashes_by_worker: Arc>>>) { + let mut ticker = tokio::time::interval(LOG_RATE); + ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + let mut last_instant = ticker.tick().await; + loop { + let now = ticker.tick().await; + let duration = (now - last_instant).as_secs_f64(); + Self::log_single_hashrate( + &hashes_tried, + "Current hashrate is".into(), + "Workers stalled or crashed. 
+            Self::log_single_hashrate(
+                &hashes_tried,
+                "Current hashrate is".into(),
+                "Workers stalled or crashed. Consider reducing workload and check that your node is synced",
+                duration,
+                false,
+            );
+            for (device, rate) in &*hashes_by_worker.lock().unwrap() {
+                Self::log_single_hashrate(rate, format!("Device {}:", device), "0 hash/s", duration, true);
+            }
+            last_instant = now;
+        }
+    }
+
+    fn log_single_hashrate(
+        counter: &Arc<AtomicU64>,
+        prefix: String,
+        warn_message: &str,
+        duration: f64,
+        keep_prefix: bool,
+    ) {
+        let hashes = counter.swap(0, Ordering::AcqRel);
+        let rate = (hashes as f64) / duration;
+        if hashes == 0 {
+            match keep_prefix {
+                true => warn!("{}{}", prefix, warn_message),
+                false => warn!("{}", warn_message),
+            };
+        } else {
+            let (rate, suffix) = Self::hash_suffix(rate);
+            info!("{} {:.2} {}", prefix, rate, suffix);
+        }
+    }
+
+    #[inline]
+    fn hash_suffix(n: f64) -> (f64, &'static str) {
+        match n {
+            n if n < 1_000.0 => (n, "hash/s"),
+            n if n < 1_000_000.0 => (n / 1_000.0, "Khash/s"),
+            n if n < 1_000_000_000.0 => (n / 1_000_000.0, "Mhash/s"),
+            n if n < 1_000_000_000_000.0 => (n / 1_000_000_000.0, "Ghash/s"),
+            n if n < 1_000_000_000_000_000.0 => (n / 1_000_000_000_000.0, "Thash/s"),
+            _ => (n, "hash/s"),
+        }
+    }
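+    // For example, hash_suffix(1_234_567.0) returns (1.234567, "Mhash/s"),
+    // which the info! above renders as "1.23 Mhash/s".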
+}
+
+#[cfg(all(test, feature = "bench"))]
+mod benches {
+    extern crate test;
+
+    use self::test::{black_box, Bencher};
+    use crate::pow::{BlockSeed, State};
+    use crate::proto::{RpcBlock, RpcBlockHeader};
+    use rand::{thread_rng, RngCore};
+
+    #[bench]
+    pub fn bench_mining(bh: &mut Bencher) {
+        let mut state = State::new(
+            0,
+            BlockSeed::FullBlock(Box::new(RpcBlock {
+                header: Some(RpcBlockHeader {
+                    version: 1,
+                    parents: vec![],
+                    hash_merkle_root: "23618af45051560529440541e7dc56be27676d278b1e00324b048d410a19d764".to_string(),
+                    accepted_id_merkle_root: "947d1a10378d6478b6957a0ed71866812dee33684968031b1cace4908c149d94"
+                        .to_string(),
+                    utxo_commitment: "ec5e8fc0bc0c637004cee262cef12e7cf6d9cd7772513dbd466176a07ab7c4f4".to_string(),
+                    timestamp: 654654353,
+                    bits: 0x1e7fffff,
+                    nonce: 0,
+                    daa_score: 654456,
+                    blue_work: "d8e28a03234786".to_string(),
+                    pruning_point: "be4c415d378f9113fabd3c09fcc84ddb6a00f900c87cb6a1186993ddc3014e2d".to_string(),
+                    blue_score: 1164419,
+                }),
+                transactions: vec![],
+                verbose_data: None,
+            })),
+        )
+        .unwrap();
+        let mut nonce = thread_rng().next_u64();
+        bh.iter(|| {
+            for _ in 0..100 {
+                black_box(state.check_pow(nonce));
+                nonce += 1;
+            }
+        });
+    }
+}
diff --git a/src/pow.rs b/src/pow.rs
index f89c559..f89ae23 100644
--- a/src/pow.rs
+++ b/src/pow.rs
@@ -1,476 +1,484 @@
-use log::info;
-use std::sync::Arc;
-use std::time::{Duration, UNIX_EPOCH};
-use time::{macros::format_description, OffsetDateTime};
-
-pub use crate::pow::hasher::HeaderHasher;
-use crate::{
-    pow::{
-        hasher::{Hasher, PowHasher},
-        heavy_hash::Matrix,
-    },
-    proto::{RpcBlock, RpcBlockHeader},
-    target::{self, Uint256},
-    Error, Hash,
-};
-use karlsen_miner::Worker;
-
-mod hasher;
-mod heavy_hash;
-mod keccak;
-mod xoshiro;
-
-#[derive(Clone, Debug)]
-pub enum BlockSeed {
-    FullBlock(Box<RpcBlock>),
-    PartialBlock {
-        id: String,
-        header_hash: [u64; 4],
-        timestamp: u64,
-        nonce: u64,
-        target: Uint256,
-        nonce_mask: u64,
-        nonce_fixed: u64,
-        hash: Option<String>,
-    },
-}
-
-impl BlockSeed {
-    pub fn report_block(&self) {
-        match self {
-            BlockSeed::FullBlock(block) => {
-                let block_hash =
-                    block.block_hash().expect("We just got it from the state, we should be able to hash it");
-                let format = format_description!("[year]-[month]-[day] [hour]:[minute]:[second]");
-                let block_time = OffsetDateTime::from(
-                    UNIX_EPOCH + Duration::from_millis(block.header.as_ref().unwrap().timestamp as u64),
-                );
-                info!(
-                    "Found a block: {:x} (Timestamp: {})",
-                    block_hash,
-                    block_time.format(format).unwrap_or_else(|_| "unknown".to_string())
-                );
-            }
-            BlockSeed::PartialBlock { .. } => info!("Found a share!"),
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct State {
-    pub id: usize,
-    matrix: Arc<Matrix>,
-    pub target: Uint256,
-    pub pow_hash_header: [u8; 72],
-    block: Arc<BlockSeed>,
-    // PRE_POW_HASH || TIME || 32 zero byte padding; without NONCE
-    hasher: PowHasher,
-
-    pub nonce_mask: u64,
-    pub nonce_fixed: u64,
-}
-
-impl State {
-    #[inline]
-    pub fn new(id: usize, block_seed: BlockSeed) -> Result<Self, Error> {
-        let pre_pow_hash;
-        let header_timestamp: u64;
-        let header_target;
-        let nonce_mask: u64;
-        let nonce_fixed: u64;
-        match block_seed {
-            BlockSeed::FullBlock(ref block) => {
-                let header = &block.header.as_ref().ok_or("Header is missing")?;
-
-                header_target = target::u256_from_compact_target(header.bits);
-                let mut hasher = HeaderHasher::new();
-                serialize_header(&mut hasher, header, true);
-                pre_pow_hash = hasher.finalize();
-                header_timestamp = header.timestamp as u64;
-                nonce_mask = 0xffffffffffffffffu64;
-                nonce_fixed = 0;
-            }
-            BlockSeed::PartialBlock {
-                ref header_hash,
-                ref timestamp,
-                ref target,
-                nonce_fixed: fixed,
-                nonce_mask: mask,
-                ..
-            } => {
-                pre_pow_hash = Hash::new(*header_hash);
-                header_timestamp = *timestamp;
-                header_target = *target;
-                nonce_mask = mask;
-                nonce_fixed = fixed
-            }
-        }
-
-        // PRE_POW_HASH || TIME || 32 zero byte padding || NONCE
-        let hasher = PowHasher::new(pre_pow_hash, header_timestamp);
-        let matrix = Arc::new(Matrix::generate(pre_pow_hash));
-        let mut pow_hash_header = [0u8; 72];
-
-        pow_hash_header.copy_from_slice(
-            [pre_pow_hash.to_le_bytes().as_slice(), header_timestamp.to_le_bytes().as_slice(), [0u8; 32].as_slice()]
-                .concat()
-                .as_slice(),
-        );
-        Ok(Self {
-            id,
-            matrix,
-            target: header_target,
-            pow_hash_header,
-            block: Arc::new(block_seed),
-            hasher,
-            nonce_mask,
-            nonce_fixed,
-        })
-    }
-
-    #[inline(always)]
-    // PRE_POW_HASH || TIME || 32 zero byte padding || NONCE
-    pub fn calculate_pow(&self, nonce: u64) -> Uint256 {
-        // Hasher already contains PRE_POW_HASH || TIME || 32 zero byte padding; so only the NONCE is missing
-        let hash = self.hasher.finalize_with_nonce(nonce);
-        self.matrix.heavy_hash(hash)
-    }
-
-    #[inline(always)]
-    pub fn check_pow(&self, nonce: u64) -> bool {
-        let pow = self.calculate_pow(nonce);
-        // The pow hash must be less or equal than the claimed target.
-        //info!("nonce {}, pow {:?}, target {:?}, comp {}", nonce, pow, self.target, pow <= self.target);
-        //pow <= self.target
-        true
-    }
-
-    #[inline(always)]
-    pub fn generate_block_if_pow(&self, nonce: u64) -> Option<BlockSeed> {
-        self.check_pow(nonce).then(|| {
-            let mut block_seed = (*self.block).clone();
-            match block_seed {
-                BlockSeed::FullBlock(ref mut block) => {
-                    let header = &mut block.header.as_mut().expect("We checked that a header exists on creation");
-                    header.nonce = nonce;
-                }
-                BlockSeed::PartialBlock { nonce: ref mut header_nonce, ref mut hash, .. } => {
-                    *header_nonce = nonce;
-                    *hash = Some(format!("{:x}", self.calculate_pow(nonce)))
-                }
-            }
-            block_seed
-        })
-    }
-
-    pub fn load_to_gpu(&self, gpu_work: &mut dyn Worker) {
-        gpu_work.load_block_constants(&self.pow_hash_header, &self.matrix.0, &self.target.0);
-    }
-
-    #[inline(always)]
-    pub fn pow_gpu(&self, gpu_work: &mut dyn Worker) {
-        gpu_work.calculate_hash(None, self.nonce_mask, self.nonce_fixed);
-    }
-}
-
-#[cfg(not(any(target_pointer_width = "64", target_pointer_width = "32")))]
-compile_error!("Supporting only 32/64 bits");
-
-#[inline(always)]
-pub fn serialize_header<H: Hasher>(hasher: &mut H, header: &RpcBlockHeader, for_pre_pow: bool) {
-    let (nonce, timestamp) = if for_pre_pow { (0, 0) } else { (header.nonce, header.timestamp) };
-    let num_parents = header.parents.len();
-    let version: u16 = header.version.try_into().unwrap();
-    hasher.update(version.to_le_bytes()).update((num_parents as u64).to_le_bytes());
-
-    let mut hash = [0u8; 32];
-    for parent in &header.parents {
-        hasher.update((parent.parent_hashes.len() as u64).to_le_bytes());
-        for hash_string in &parent.parent_hashes {
-            decode_to_slice(hash_string, &mut hash).unwrap();
-            hasher.update(hash);
-        }
-    }
-    decode_to_slice(&header.hash_merkle_root, &mut hash).unwrap();
-    hasher.update(hash);
-
-    decode_to_slice(&header.accepted_id_merkle_root, &mut hash).unwrap();
-    hasher.update(hash);
-    decode_to_slice(&header.utxo_commitment, &mut hash).unwrap();
-    hasher.update(hash);
-
-    hasher
-        .update(timestamp.to_le_bytes())
-        .update(header.bits.to_le_bytes())
-        .update(nonce.to_le_bytes())
-        .update(header.daa_score.to_le_bytes())
-        .update(header.blue_score.to_le_bytes());
-
-    // I'm assuming here BlueWork will never pass 256 bits.
-    let blue_work_len = (header.blue_work.len() + 1) / 2;
-    if header.blue_work.len() % 2 == 0 {
-        decode_to_slice(&header.blue_work, &mut hash[..blue_work_len]).unwrap();
-    } else {
-        let mut blue_work = String::with_capacity(header.blue_work.len() + 1);
-        blue_work.push('0');
-        blue_work.push_str(&header.blue_work);
-        decode_to_slice(&blue_work, &mut hash[..blue_work_len]).unwrap();
-    }
-
-    hasher.update((blue_work_len as u64).to_le_bytes()).update(&hash[..blue_work_len]);
-
-    decode_to_slice(&header.pruning_point, &mut hash).unwrap();
-    hasher.update(hash);
-}
-
-#[allow(dead_code)] // False Positive: https://github.com/rust-lang/rust/issues/88900
-#[derive(Debug)]
-enum FromHexError {
-    OddLength,
-    InvalidStringLength,
-    InvalidHexCharacter { c: char, index: usize },
-}
-
-#[inline(always)]
-fn decode_to_slice<T: AsRef<[u8]>>(data: T, out: &mut [u8]) -> Result<(), FromHexError> {
-    let data = data.as_ref();
-    if data.len() % 2 != 0 {
-        return Err(FromHexError::OddLength);
-    }
-    if data.len() / 2 != out.len() {
-        return Err(FromHexError::InvalidStringLength);
-    }
-
-    for (i, byte) in out.iter_mut().enumerate() {
-        *byte = val(data[2 * i], 2 * i)? << 4 | val(data[2 * i + 1], 2 * i + 1)?;
-    }
-
-    #[inline(always)]
-    fn val(c: u8, idx: usize) -> Result<u8, FromHexError> {
-        match c {
-            b'A'..=b'F' => Ok(c - b'A' + 10),
-            b'a'..=b'f' => Ok(c - b'a' + 10),
-            b'0'..=b'9' => Ok(c - b'0'),
-            _ => Err(FromHexError::InvalidHexCharacter { c: c as char, index: idx }),
-        }
-    }
-
-    Ok(())
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::pow::hasher::{Hasher, HeaderHasher};
-    use crate::pow::serialize_header;
-    use crate::proto::{RpcBlockHeader, RpcBlockLevelParents};
-    use crate::Hash;
-
-    struct Buf(Vec<u8>);
-    impl Hasher for Buf {
-        fn update<A: AsRef<[u8]>>(&mut self, data: A) -> &mut Self {
-            self.0.extend(data.as_ref());
-            self
-        }
-    }
-
-    #[test]
-    fn test_serialize_header() {
-        let header = RpcBlockHeader {
-            version: 24565,
-            parents: vec![
-                RpcBlockLevelParents {
-                    parent_hashes: vec![
-                        "62a5eee82abdf44a2d0b75fb180daf48a79ee0b10d394651850fd4a178892ee2".into(),
-                        "85ece1511455780875d64ee2d3d0d0de6bf8f9b44ce85ff044c6b1f83b8e883b".into(),
-                        "bf857aab99c5b252c7429c32f3a8aeb79ef856f659c18f0dcecc77c75e7a81bf".into(),
-                        "de275f67cfe242cf3cc354f3ede2d6becc4ea3ae5e88526a9f4a578bcb9ef2d4".into(),
-                        "a65314768d6d299761ea9e4f5aa6aec3fc78c6aae081ac8120c720efcd6cea84".into(),
-                        "b6925e607be063716f96ddcdd01d75045c3f000f8a796bce6c512c3801aacaee".into(),
-                    ],
-                },
-                RpcBlockLevelParents {
-                    parent_hashes: vec![
-                        "dfad5b50ece0b8b7c1965d9181251b7c9c9ca5205afc16a236a2efcdd2d12d2a".into(),
-                        "79d074a8280ae9439eb0d6aeca0823ae02d67d866ac2c4fe4a725053da119b9d".into(),
-                        "4f515140a2d7239c40b45ac3950d941fc4fe1c0cb96ad322d62282295fbfe11e".into(),
-                        "26a433076db5c1444c3a34d32a5c4a7ffbe8d181f7ed3b8cfe904f93f8f06d29".into(),
-                        "bcd9ed847b182e046410f44bc4b0f3f03a0d06820a30f257f8114130678ac045".into(),
-                        "86c1e3c9342c8b8055c466d886441d259906d69acd894b968ae9f0eb9d965ce6".into(),
-                        "a4693c4ebe881501b7d9846b66eb02b57e5cda7b6cba6891d616bd686c37b834".into(),
-                    ],
-                },
-                RpcBlockLevelParents {
-                    parent_hashes: vec![
-                        "613ac8ba52734ae4e3f1217acd5f83270814301867b5d06711b238001c7957b2".into(),
-                        "7719ce3f3188dfe57deebf6f82595a10f7bb562ca04d5c3d27942958c6db3262".into(),
-                        "670649f3bc97d9a2316735ede682a5dfe6f1a011fbc98ad0fbe790003c01e8e9".into(),
-                        "967703af665e9f72407f4b03d4fdb474aafe8a0d3e0515dd4650cf51172b8124".into(),
-                        "8bcb7f969e400b6c5b127768b1c412fae98cf57631cf37033b4b4aba7d7ed319".into(),
-                        "ba147249c908ac70d1c406dade0e828eb6ba0dcaa88285543e10213c643fc860".into(),
-                        "3b5860236670babcad0bd7f4c4190e323623a868d1eae1769f40a26631431b3b".into(),
-                        "d5215605d2086fead499ac63a4653d12283d56019c3795a98a126d09cfcbe36c".into(),
-                        "dcc93788a5409f8b6e42c2dd83aa46611852ad0b5028775c771690b6854e05b3".into(),
-                    ],
-                },
-                RpcBlockLevelParents {
-                    parent_hashes: vec![
-                        "77241e302c6da8665c42341dda4adaea595ab1895f9652489dd2ceb49c247430".into(),
-                        "3cbb44c2b94303db662c9c66b8782905190f1e1635b63e34878d3f246fadfce3".into(),
-                        "44e74ef813090f8030bcd525ac10653ff182e00120f7e1f796fa0fc16ba7bb90".into(),
-                        "be2a33e87c3d60ab628471a420834383661801bb0bfd8e6c140071db1eb2f7a1".into(),
-                        "8194f1a045a94c078835c75dff2f3e836180baad9e955da840dc74c4dc2498f8".into(),
-                        "c201aec254a0e36476b2eeb124fdc6afc1b7d809c5e08b5e0e845aaf9b6c3957".into(),
-                        "e95ab4aa8e107cdb873f2dac527f16c4d5ac8760768a715e4669cb840c25317f".into(),
-                        "9a368774e506341afb46503e28e92e51bd7f7d4b53b9023d56f9b9ec991ac2a9".into(),
-                        "d9bc45ff64bb2bf14d4051a7604b28bad44d98bfe30e54ebc07fa45f62aabe39".into(),
-                    ],
-                },
-                RpcBlockLevelParents {
-                    parent_hashes: vec![
-                        "5cc98b2e3f6deb2990187058e4bfd2d1640653fc38a30b0f83231a965b413b0f".into(),
-
"26927e0d032e830b732bdeb3094cb1a5fa6dec9f06375ea25fe57c2853ea0932".into(), - "0ac8803976eacaa095c02f869fd7dc31072475940c3751d56283c49e2fefd41d".into(), - "f676bdcb5855a0470efd2dab7a72cc5e5f39ff7eea0f433a9fe7b6a675bc2ac5".into(), - "0cd218c009e21f910f9ddb09a0d059c4cd7d2ca65a2349df7a867dbedd81e9d4".into(), - "891619c83c42895ce1b671cb7a4bcaed9130ab1dd4cc2d8147a1595056b55f92".into(), - "a355db765adc8d3df88eb93d527f7f7ec869a75703ba86d4b36110e9a044593c".into(), - "966815d153665387dc38e507e7458df3e6b0f04035ef9419883e03c08e2d753b".into(), - "08c9090aabf175fdb63e8cf9a5f0783704c741c195157626401d949eaa6dbd04".into(), - ], - }, - RpcBlockLevelParents { - parent_hashes: vec![ - "d7bf5e9c18cc79dda4e12efe564ecb8a4019e1c41f2d8217c0c3a43712ae226f".into(), - "ce776631ae19b326a411a284741be01fb4f3aefc5def968eb6cceb8604864b4b".into(), - "9ad373cbac10ea7e665b294a8a790691aa5246e6ff8fd0b7fb9b9a6a958ebf28".into(), - ], - }, - RpcBlockLevelParents { - parent_hashes: vec![ - "ec5e8fc0bc0c637004cee262cef12e7cf6d9cd7772513dbd466176a07ab7c4f4".into(), - "65fe09747779c31495e689b65f557b0a4af6535880b82553d126ff7213542905".into(), - "5a64749599333e9655b43aa36728bb63bd286427441baa9f305d5c25e05229bb".into(), - "332f7e8375b7c45e1ea0461d333c3c725f7467b441b7d0f5e80242b7a4a18eda".into(), - ], - }, - RpcBlockLevelParents { - parent_hashes: vec!["e80d7d9a0a4634f07bea5c5a00212fbc591bddfebb94334f4a2d928673d262ad".into()], - }, - RpcBlockLevelParents { - parent_hashes: vec![ - "abaa82988c683f4742c12099b732bd03639c1979752d837518243b74d6730124".into(), - "5efe5661eaa0428917f55a58cc33db284d1f2caa05f1fd7b6602980f06d10723".into(), - "0bf310b48cf62942017dd6680eb3ab13310eca1581afb3c5b619e5ce0682d0df".into(), - "c1fade3928179a9dc28cd170b5b5544e7f9b63b83da374afa28e1478dc5c2997".into(), - ], - }, - ], - hash_merkle_root: "a98347ec1e71514eb26822162dc7c3992fd41f0b2ccc26e55e7bd8f3fa37215f".into(), - accepted_id_merkle_root: "774b5216b5b872b6c2388dd950160e3ffa3bf0623c438655bb5c8c768ab33ae2".into(), - utxo_commitment: "ee39218674008665e20a3acdf84abef35cabcc489158c0853fd5bfa954226139".into(), - timestamp: -1426594953012613626, - bits: 684408190, - nonce: 8230160685758639177, - daa_score: 15448880227546599629, - blue_work: "ce5639b8ed46571e20eeaa7a62a078f8c55aef6edd6a35ed37a3d6cf98736abd".into(), - pruning_point: "fc44c4f57cf8f7a2ba410a70d0ad49060355b9deb97012345603d9d0d1dcb0de".into(), - blue_score: 29372123613087746, - }; - let expected_res = [ - 245, 95, 9, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 98, 165, 238, 232, 42, 189, 244, 74, 45, 11, 117, - 251, 24, 13, 175, 72, 167, 158, 224, 177, 13, 57, 70, 81, 133, 15, 212, 161, 120, 137, 46, 226, 133, 236, - 225, 81, 20, 85, 120, 8, 117, 214, 78, 226, 211, 208, 208, 222, 107, 248, 249, 180, 76, 232, 95, 240, 68, - 198, 177, 248, 59, 142, 136, 59, 191, 133, 122, 171, 153, 197, 178, 82, 199, 66, 156, 50, 243, 168, 174, - 183, 158, 248, 86, 246, 89, 193, 143, 13, 206, 204, 119, 199, 94, 122, 129, 191, 222, 39, 95, 103, 207, - 226, 66, 207, 60, 195, 84, 243, 237, 226, 214, 190, 204, 78, 163, 174, 94, 136, 82, 106, 159, 74, 87, 139, - 203, 158, 242, 212, 166, 83, 20, 118, 141, 109, 41, 151, 97, 234, 158, 79, 90, 166, 174, 195, 252, 120, - 198, 170, 224, 129, 172, 129, 32, 199, 32, 239, 205, 108, 234, 132, 182, 146, 94, 96, 123, 224, 99, 113, - 111, 150, 221, 205, 208, 29, 117, 4, 92, 63, 0, 15, 138, 121, 107, 206, 108, 81, 44, 56, 1, 170, 202, 238, - 7, 0, 0, 0, 0, 0, 0, 0, 223, 173, 91, 80, 236, 224, 184, 183, 193, 150, 93, 145, 129, 37, 27, 124, 156, - 156, 165, 32, 90, 252, 22, 162, 54, 162, 239, 
205, 210, 209, 45, 42, 121, 208, 116, 168, 40, 10, 233, 67, - 158, 176, 214, 174, 202, 8, 35, 174, 2, 214, 125, 134, 106, 194, 196, 254, 74, 114, 80, 83, 218, 17, 155, - 157, 79, 81, 81, 64, 162, 215, 35, 156, 64, 180, 90, 195, 149, 13, 148, 31, 196, 254, 28, 12, 185, 106, - 211, 34, 214, 34, 130, 41, 95, 191, 225, 30, 38, 164, 51, 7, 109, 181, 193, 68, 76, 58, 52, 211, 42, 92, - 74, 127, 251, 232, 209, 129, 247, 237, 59, 140, 254, 144, 79, 147, 248, 240, 109, 41, 188, 217, 237, 132, - 123, 24, 46, 4, 100, 16, 244, 75, 196, 176, 243, 240, 58, 13, 6, 130, 10, 48, 242, 87, 248, 17, 65, 48, - 103, 138, 192, 69, 134, 193, 227, 201, 52, 44, 139, 128, 85, 196, 102, 216, 134, 68, 29, 37, 153, 6, 214, - 154, 205, 137, 75, 150, 138, 233, 240, 235, 157, 150, 92, 230, 164, 105, 60, 78, 190, 136, 21, 1, 183, 217, - 132, 107, 102, 235, 2, 181, 126, 92, 218, 123, 108, 186, 104, 145, 214, 22, 189, 104, 108, 55, 184, 52, 9, - 0, 0, 0, 0, 0, 0, 0, 97, 58, 200, 186, 82, 115, 74, 228, 227, 241, 33, 122, 205, 95, 131, 39, 8, 20, 48, - 24, 103, 181, 208, 103, 17, 178, 56, 0, 28, 121, 87, 178, 119, 25, 206, 63, 49, 136, 223, 229, 125, 238, - 191, 111, 130, 89, 90, 16, 247, 187, 86, 44, 160, 77, 92, 61, 39, 148, 41, 88, 198, 219, 50, 98, 103, 6, - 73, 243, 188, 151, 217, 162, 49, 103, 53, 237, 230, 130, 165, 223, 230, 241, 160, 17, 251, 201, 138, 208, - 251, 231, 144, 0, 60, 1, 232, 233, 150, 119, 3, 175, 102, 94, 159, 114, 64, 127, 75, 3, 212, 253, 180, 116, - 170, 254, 138, 13, 62, 5, 21, 221, 70, 80, 207, 81, 23, 43, 129, 36, 139, 203, 127, 150, 158, 64, 11, 108, - 91, 18, 119, 104, 177, 196, 18, 250, 233, 140, 245, 118, 49, 207, 55, 3, 59, 75, 74, 186, 125, 126, 211, - 25, 186, 20, 114, 73, 201, 8, 172, 112, 209, 196, 6, 218, 222, 14, 130, 142, 182, 186, 13, 202, 168, 130, - 133, 84, 62, 16, 33, 60, 100, 63, 200, 96, 59, 88, 96, 35, 102, 112, 186, 188, 173, 11, 215, 244, 196, 25, - 14, 50, 54, 35, 168, 104, 209, 234, 225, 118, 159, 64, 162, 102, 49, 67, 27, 59, 213, 33, 86, 5, 210, 8, - 111, 234, 212, 153, 172, 99, 164, 101, 61, 18, 40, 61, 86, 1, 156, 55, 149, 169, 138, 18, 109, 9, 207, 203, - 227, 108, 220, 201, 55, 136, 165, 64, 159, 139, 110, 66, 194, 221, 131, 170, 70, 97, 24, 82, 173, 11, 80, - 40, 119, 92, 119, 22, 144, 182, 133, 78, 5, 179, 9, 0, 0, 0, 0, 0, 0, 0, 119, 36, 30, 48, 44, 109, 168, - 102, 92, 66, 52, 29, 218, 74, 218, 234, 89, 90, 177, 137, 95, 150, 82, 72, 157, 210, 206, 180, 156, 36, - 116, 48, 60, 187, 68, 194, 185, 67, 3, 219, 102, 44, 156, 102, 184, 120, 41, 5, 25, 15, 30, 22, 53, 182, - 62, 52, 135, 141, 63, 36, 111, 173, 252, 227, 68, 231, 78, 248, 19, 9, 15, 128, 48, 188, 213, 37, 172, 16, - 101, 63, 241, 130, 224, 1, 32, 247, 225, 247, 150, 250, 15, 193, 107, 167, 187, 144, 190, 42, 51, 232, 124, - 61, 96, 171, 98, 132, 113, 164, 32, 131, 67, 131, 102, 24, 1, 187, 11, 253, 142, 108, 20, 0, 113, 219, 30, - 178, 247, 161, 129, 148, 241, 160, 69, 169, 76, 7, 136, 53, 199, 93, 255, 47, 62, 131, 97, 128, 186, 173, - 158, 149, 93, 168, 64, 220, 116, 196, 220, 36, 152, 248, 194, 1, 174, 194, 84, 160, 227, 100, 118, 178, - 238, 177, 36, 253, 198, 175, 193, 183, 216, 9, 197, 224, 139, 94, 14, 132, 90, 175, 155, 108, 57, 87, 233, - 90, 180, 170, 142, 16, 124, 219, 135, 63, 45, 172, 82, 127, 22, 196, 213, 172, 135, 96, 118, 138, 113, 94, - 70, 105, 203, 132, 12, 37, 49, 127, 154, 54, 135, 116, 229, 6, 52, 26, 251, 70, 80, 62, 40, 233, 46, 81, - 189, 127, 125, 75, 83, 185, 2, 61, 86, 249, 185, 236, 153, 26, 194, 169, 217, 188, 69, 255, 100, 187, 43, - 241, 77, 64, 81, 167, 96, 75, 40, 186, 
212, 77, 152, 191, 227, 14, 84, 235, 192, 127, 164, 95, 98, 170, - 190, 57, 9, 0, 0, 0, 0, 0, 0, 0, 92, 201, 139, 46, 63, 109, 235, 41, 144, 24, 112, 88, 228, 191, 210, 209, - 100, 6, 83, 252, 56, 163, 11, 15, 131, 35, 26, 150, 91, 65, 59, 15, 38, 146, 126, 13, 3, 46, 131, 11, 115, - 43, 222, 179, 9, 76, 177, 165, 250, 109, 236, 159, 6, 55, 94, 162, 95, 229, 124, 40, 83, 234, 9, 50, 10, - 200, 128, 57, 118, 234, 202, 160, 149, 192, 47, 134, 159, 215, 220, 49, 7, 36, 117, 148, 12, 55, 81, 213, - 98, 131, 196, 158, 47, 239, 212, 29, 246, 118, 189, 203, 88, 85, 160, 71, 14, 253, 45, 171, 122, 114, 204, - 94, 95, 57, 255, 126, 234, 15, 67, 58, 159, 231, 182, 166, 117, 188, 42, 197, 12, 210, 24, 192, 9, 226, 31, - 145, 15, 157, 219, 9, 160, 208, 89, 196, 205, 125, 44, 166, 90, 35, 73, 223, 122, 134, 125, 190, 221, 129, - 233, 212, 137, 22, 25, 200, 60, 66, 137, 92, 225, 182, 113, 203, 122, 75, 202, 237, 145, 48, 171, 29, 212, - 204, 45, 129, 71, 161, 89, 80, 86, 181, 95, 146, 163, 85, 219, 118, 90, 220, 141, 61, 248, 142, 185, 61, - 82, 127, 127, 126, 200, 105, 167, 87, 3, 186, 134, 212, 179, 97, 16, 233, 160, 68, 89, 60, 150, 104, 21, - 209, 83, 102, 83, 135, 220, 56, 229, 7, 231, 69, 141, 243, 230, 176, 240, 64, 53, 239, 148, 25, 136, 62, 3, - 192, 142, 45, 117, 59, 8, 201, 9, 10, 171, 241, 117, 253, 182, 62, 140, 249, 165, 240, 120, 55, 4, 199, 65, - 193, 149, 21, 118, 38, 64, 29, 148, 158, 170, 109, 189, 4, 3, 0, 0, 0, 0, 0, 0, 0, 215, 191, 94, 156, 24, - 204, 121, 221, 164, 225, 46, 254, 86, 78, 203, 138, 64, 25, 225, 196, 31, 45, 130, 23, 192, 195, 164, 55, - 18, 174, 34, 111, 206, 119, 102, 49, 174, 25, 179, 38, 164, 17, 162, 132, 116, 27, 224, 31, 180, 243, 174, - 252, 93, 239, 150, 142, 182, 204, 235, 134, 4, 134, 75, 75, 154, 211, 115, 203, 172, 16, 234, 126, 102, 91, - 41, 74, 138, 121, 6, 145, 170, 82, 70, 230, 255, 143, 208, 183, 251, 155, 154, 106, 149, 142, 191, 40, 4, - 0, 0, 0, 0, 0, 0, 0, 236, 94, 143, 192, 188, 12, 99, 112, 4, 206, 226, 98, 206, 241, 46, 124, 246, 217, - 205, 119, 114, 81, 61, 189, 70, 97, 118, 160, 122, 183, 196, 244, 101, 254, 9, 116, 119, 121, 195, 20, 149, - 230, 137, 182, 95, 85, 123, 10, 74, 246, 83, 88, 128, 184, 37, 83, 209, 38, 255, 114, 19, 84, 41, 5, 90, - 100, 116, 149, 153, 51, 62, 150, 85, 180, 58, 163, 103, 40, 187, 99, 189, 40, 100, 39, 68, 27, 170, 159, - 48, 93, 92, 37, 224, 82, 41, 187, 51, 47, 126, 131, 117, 183, 196, 94, 30, 160, 70, 29, 51, 60, 60, 114, - 95, 116, 103, 180, 65, 183, 208, 245, 232, 2, 66, 183, 164, 161, 142, 218, 1, 0, 0, 0, 0, 0, 0, 0, 232, 13, - 125, 154, 10, 70, 52, 240, 123, 234, 92, 90, 0, 33, 47, 188, 89, 27, 221, 254, 187, 148, 51, 79, 74, 45, - 146, 134, 115, 210, 98, 173, 4, 0, 0, 0, 0, 0, 0, 0, 171, 170, 130, 152, 140, 104, 63, 71, 66, 193, 32, - 153, 183, 50, 189, 3, 99, 156, 25, 121, 117, 45, 131, 117, 24, 36, 59, 116, 214, 115, 1, 36, 94, 254, 86, - 97, 234, 160, 66, 137, 23, 245, 90, 88, 204, 51, 219, 40, 77, 31, 44, 170, 5, 241, 253, 123, 102, 2, 152, - 15, 6, 209, 7, 35, 11, 243, 16, 180, 140, 246, 41, 66, 1, 125, 214, 104, 14, 179, 171, 19, 49, 14, 202, 21, - 129, 175, 179, 197, 182, 25, 229, 206, 6, 130, 208, 223, 193, 250, 222, 57, 40, 23, 154, 157, 194, 140, - 209, 112, 181, 181, 84, 78, 127, 155, 99, 184, 61, 163, 116, 175, 162, 142, 20, 120, 220, 92, 41, 151, 169, - 131, 71, 236, 30, 113, 81, 78, 178, 104, 34, 22, 45, 199, 195, 153, 47, 212, 31, 11, 44, 204, 38, 229, 94, - 123, 216, 243, 250, 55, 33, 95, 119, 75, 82, 22, 181, 184, 114, 182, 194, 56, 141, 217, 80, 22, 14, 63, - 250, 59, 240, 98, 60, 
67, 134, 85, 187, 92, 140, 118, 138, 179, 58, 226, 238, 57, 33, 134, 116, 0, 134, - 101, 226, 10, 58, 205, 248, 74, 190, 243, 92, 171, 204, 72, 145, 88, 192, 133, 63, 213, 191, 169, 84, 34, - 97, 57, 0, 0, 0, 0, 0, 0, 0, 0, 126, 61, 203, 40, 0, 0, 0, 0, 0, 0, 0, 0, 205, 144, 120, 28, 183, 114, 101, - 214, 2, 200, 65, 114, 202, 89, 104, 0, 32, 0, 0, 0, 0, 0, 0, 0, 206, 86, 57, 184, 237, 70, 87, 30, 32, 238, - 170, 122, 98, 160, 120, 248, 197, 90, 239, 110, 221, 106, 53, 237, 55, 163, 214, 207, 152, 115, 106, 189, - 252, 68, 196, 245, 124, 248, 247, 162, 186, 65, 10, 112, 208, 173, 73, 6, 3, 85, 185, 222, 185, 112, 18, - 52, 86, 3, 217, 208, 209, 220, 176, 222, - ]; - let mut buf = Buf(Vec::with_capacity(1951)); - serialize_header(&mut buf, &header, true); - assert_eq!(&expected_res[..], &buf.0); - - let expected_hash = Hash::from_le_bytes([ - 85, 146, 211, 217, 138, 239, 47, 85, 152, 59, 58, 16, 4, 149, 129, 179, 172, 226, 174, 233, 160, 96, 202, - 54, 6, 225, 64, 142, 106, 0, 110, 137, - ]); - let mut hasher = HeaderHasher::new(); - hasher.write(buf.0); - assert_eq!(hasher.finalize(), expected_hash); - } -} +use log::info; +use std::sync::Arc; +use std::time::{Duration, UNIX_EPOCH}; +use time::{macros::format_description, OffsetDateTime}; + +pub use crate::pow::hasher::HeaderHasher; +use crate::{ + pow::{ + hasher::{Hasher, PowHasher}, + heavy_hash::Matrix, + }, + proto::{RpcBlock, RpcBlockHeader}, + target::{self, Uint256}, + Error, Hash, +}; +use karlsen_miner::Worker; + +mod hasher; +mod heavy_hash; +mod keccak; +mod xoshiro; + +#[derive(Clone, Debug)] +pub enum BlockSeed { + FullBlock(Box), + PartialBlock { + id: String, + header_hash: [u64; 4], + timestamp: u64, + nonce: u64, + target: Uint256, + nonce_mask: u64, + nonce_fixed: u64, + hash: Option, + }, +} + +impl BlockSeed { + pub fn report_block(&self) { + match self { + BlockSeed::FullBlock(block) => { + let block_hash = + block.block_hash().expect("We just got it from the state, we should be able to hash it"); + let format = format_description!("[year]-[month]-[day] [hour]:[minute]:[second]"); + let block_time = OffsetDateTime::from( + UNIX_EPOCH + Duration::from_millis(block.header.as_ref().unwrap().timestamp as u64), + ); + info!( + "Found a block: {:x} (Timestamp: {})", + block_hash, + block_time.format(format).unwrap_or_else(|_| "unknown".to_string()) + ); + } + BlockSeed::PartialBlock { .. } => info!("Found a share!"), + } + } +} + +#[derive(Clone)] +pub struct State { + #[allow(dead_code)] + pub id: usize, + matrix: Arc, + pub target: Uint256, + pub pow_hash_header: [u8; 72], + block: Arc, + // PRE_POW_HASH || TIME || 32 zero byte padding; without NONCE + hasher: PowHasher, + + pub nonce_mask: u64, + pub nonce_fixed: u64, +} + +impl State { + #[inline] + pub fn new(id: usize, block_seed: BlockSeed) -> Result { + let pre_pow_hash; + let header_timestamp: u64; + let header_target; + let nonce_mask: u64; + let nonce_fixed: u64; + match block_seed { + BlockSeed::FullBlock(ref block) => { + let header = &block.header.as_ref().ok_or("Header is missing")?; + + header_target = target::u256_from_compact_target(header.bits); + let mut hasher = HeaderHasher::new(); + serialize_header(&mut hasher, header, true); + pre_pow_hash = hasher.finalize(); + header_timestamp = header.timestamp as u64; + nonce_mask = 0xffffffffffffffffu64; + nonce_fixed = 0; + } + BlockSeed::PartialBlock { + ref header_hash, + ref timestamp, + ref target, + nonce_fixed: fixed, + nonce_mask: mask, + .. 
+#[derive(Clone)]
+pub struct State {
+    #[allow(dead_code)]
+    pub id: usize,
+    matrix: Arc<Matrix>,
+    pub target: Uint256,
+    pub pow_hash_header: [u8; 72],
+    block: Arc<BlockSeed>,
+    // PRE_POW_HASH || TIME || 32 zero byte padding; without NONCE
+    hasher: PowHasher,
+
+    pub nonce_mask: u64,
+    pub nonce_fixed: u64,
+}
+
+impl State {
+    #[inline]
+    pub fn new(id: usize, block_seed: BlockSeed) -> Result<Self, Error> {
+        let pre_pow_hash;
+        let header_timestamp: u64;
+        let header_target;
+        let nonce_mask: u64;
+        let nonce_fixed: u64;
+        match block_seed {
+            BlockSeed::FullBlock(ref block) => {
+                let header = &block.header.as_ref().ok_or("Header is missing")?;
+
+                header_target = target::u256_from_compact_target(header.bits);
+                let mut hasher = HeaderHasher::new();
+                serialize_header(&mut hasher, header, true);
+                pre_pow_hash = hasher.finalize();
+                header_timestamp = header.timestamp as u64;
+                nonce_mask = 0xffffffffffffffffu64;
+                nonce_fixed = 0;
+            }
+            BlockSeed::PartialBlock {
+                ref header_hash,
+                ref timestamp,
+                ref target,
+                nonce_fixed: fixed,
+                nonce_mask: mask,
+                ..
+            } => {
+                pre_pow_hash = Hash::new(*header_hash);
+                header_timestamp = *timestamp;
+                header_target = *target;
+                nonce_mask = mask;
+                nonce_fixed = fixed
+            }
+        }
+
+        // PRE_POW_HASH || TIME || 32 zero byte padding || NONCE
+        let hasher = PowHasher::new(pre_pow_hash, header_timestamp);
+        let matrix = Arc::new(Matrix::generate(pre_pow_hash));
+        let mut pow_hash_header = [0u8; 72];
+
+        pow_hash_header.copy_from_slice(
+            [pre_pow_hash.to_le_bytes().as_slice(), header_timestamp.to_le_bytes().as_slice(), [0u8; 32].as_slice()]
+                .concat()
+                .as_slice(),
+        );
+        Ok(Self {
+            id,
+            matrix,
+            target: header_target,
+            pow_hash_header,
+            block: Arc::new(block_seed),
+            hasher,
+            nonce_mask,
+            nonce_fixed,
+        })
+    }
+
+    #[inline(always)]
+    // PRE_POW_HASH || TIME || 32 zero byte padding || NONCE
+    pub fn calculate_pow(&self, nonce: u64) -> Uint256 {
+        // Hasher already contains PRE_POW_HASH || TIME || 32 zero byte padding; so only the NONCE is missing
+        let hash = self.hasher.finalize_with_nonce(nonce);
+        self.matrix.heavy_hash(hash)
+    }
+
+    #[inline(always)]
+    pub fn check_pow(&self, nonce: u64) -> bool {
+        let _pow = self.calculate_pow(nonce);
+        if _pow <= self.target {
+            info!("Found a block with pow: {:x}", _pow);
+            info!("Target was: {:x}", self.target);
+        }
+        // The pow hash must be less or equal than the claimed target.
+        //info!("nonce {}, pow {:?}, target {:?}, comp {}", nonce, pow, self.target, pow <= self.target);
+        //pow <= self.target
+        // we forced temporarily the check pow
+        true
+    }
+
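+    // NOTE: with check_pow() short-circuited to `true`, nonce filtering is
+    // assumed to happen upstream: the GPU kernel receives the target via
+    // load_block_constants() below and should only report passing nonces.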
+    #[inline(always)]
+    pub fn generate_block_if_pow(&self, nonce: u64) -> Option<BlockSeed> {
+        self.check_pow(nonce).then(|| {
+            let mut block_seed = (*self.block).clone();
+            match block_seed {
+                BlockSeed::FullBlock(ref mut block) => {
+                    let header = &mut block.header.as_mut().expect("We checked that a header exists on creation");
+                    header.nonce = nonce;
+                }
+                BlockSeed::PartialBlock { nonce: ref mut header_nonce, ref mut hash, .. } => {
+                    *header_nonce = nonce;
+                    *hash = Some(format!("{:x}", self.calculate_pow(nonce)))
+                }
+            }
+            block_seed
+        })
+    }
+
+    pub fn load_to_gpu(&self, gpu_work: &mut dyn Worker) {
+        //info!("load_to_gpu: debug1 ");
+        gpu_work.load_block_constants(&self.pow_hash_header, &self.matrix.0, &self.target.0);
+    }
+
+    #[inline(always)]
+    pub fn pow_gpu(&self, gpu_work: &mut dyn Worker) {
+        //info!("pow_gpu: debug1 ");
+        gpu_work.calculate_hash(None, self.nonce_mask, self.nonce_fixed);
+    }
+}
+
+#[cfg(not(any(target_pointer_width = "64", target_pointer_width = "32")))]
+compile_error!("Supporting only 32/64 bits");
+
+#[inline(always)]
+pub fn serialize_header<H: Hasher>(hasher: &mut H, header: &RpcBlockHeader, for_pre_pow: bool) {
+    let (nonce, timestamp) = if for_pre_pow { (0, 0) } else { (header.nonce, header.timestamp) };
+    let num_parents = header.parents.len();
+    let version: u16 = header.version.try_into().unwrap();
+    hasher.update(version.to_le_bytes()).update((num_parents as u64).to_le_bytes());
+
+    let mut hash = [0u8; 32];
+    for parent in &header.parents {
+        hasher.update((parent.parent_hashes.len() as u64).to_le_bytes());
+        for hash_string in &parent.parent_hashes {
+            decode_to_slice(hash_string, &mut hash).unwrap();
+            hasher.update(hash);
+        }
+    }
+    decode_to_slice(&header.hash_merkle_root, &mut hash).unwrap();
+    hasher.update(hash);
+
+    decode_to_slice(&header.accepted_id_merkle_root, &mut hash).unwrap();
+    hasher.update(hash);
+    decode_to_slice(&header.utxo_commitment, &mut hash).unwrap();
+    hasher.update(hash);
+
+    hasher
+        .update(timestamp.to_le_bytes())
+        .update(header.bits.to_le_bytes())
+        .update(nonce.to_le_bytes())
+        .update(header.daa_score.to_le_bytes())
+        .update(header.blue_score.to_le_bytes());
+
+    // I'm assuming here BlueWork will never pass 256 bits.
+    let blue_work_len = (header.blue_work.len() + 1) / 2;
+    if header.blue_work.len() % 2 == 0 {
+        decode_to_slice(&header.blue_work, &mut hash[..blue_work_len]).unwrap();
+    } else {
+        let mut blue_work = String::with_capacity(header.blue_work.len() + 1);
+        blue_work.push('0');
+        blue_work.push_str(&header.blue_work);
+        decode_to_slice(&blue_work, &mut hash[..blue_work_len]).unwrap();
+    }
+
+    hasher.update((blue_work_len as u64).to_le_bytes()).update(&hash[..blue_work_len]);
+
+    decode_to_slice(&header.pruning_point, &mut hash).unwrap();
+    hasher.update(hash);
+}
+
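+// blue_work encodes a variable-width big-endian integer in hex, so an odd-length
+// string is left-padded with one '0' above before decoding; e.g. a 15-digit
+// value decodes into (15 + 1) / 2 = 8 bytes with a leading zero nibble.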
+#[allow(dead_code)] // False Positive: https://github.com/rust-lang/rust/issues/88900
+#[derive(Debug)]
+enum FromHexError {
+    OddLength,
+    InvalidStringLength,
+    InvalidHexCharacter { c: char, index: usize },
+}
+
+#[inline(always)]
+fn decode_to_slice<T: AsRef<[u8]>>(data: T, out: &mut [u8]) -> Result<(), FromHexError> {
+    let data = data.as_ref();
+    if data.len() % 2 != 0 {
+        return Err(FromHexError::OddLength);
+    }
+    if data.len() / 2 != out.len() {
+        return Err(FromHexError::InvalidStringLength);
+    }
+
+    for (i, byte) in out.iter_mut().enumerate() {
+        *byte = val(data[2 * i], 2 * i)? << 4 | val(data[2 * i + 1], 2 * i + 1)?;
+    }
+
+    #[inline(always)]
+    fn val(c: u8, idx: usize) -> Result<u8, FromHexError> {
+        match c {
+            b'A'..=b'F' => Ok(c - b'A' + 10),
+            b'a'..=b'f' => Ok(c - b'a' + 10),
+            b'0'..=b'9' => Ok(c - b'0'),
+            _ => Err(FromHexError::InvalidHexCharacter { c: c as char, index: idx }),
+        }
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::pow::hasher::{Hasher, HeaderHasher};
+    use crate::pow::serialize_header;
+    use crate::proto::{RpcBlockHeader, RpcBlockLevelParents};
+    use crate::Hash;
+
+    struct Buf(Vec<u8>);
+    impl Hasher for Buf {
+        fn update<A: AsRef<[u8]>>(&mut self, data: A) -> &mut Self {
+            self.0.extend(data.as_ref());
+            self
+        }
+    }
+
+    #[test]
+    fn test_serialize_header() {
+        let header = RpcBlockHeader {
+            version: 24565,
+            parents: vec![
+                RpcBlockLevelParents {
+                    parent_hashes: vec![
+                        "62a5eee82abdf44a2d0b75fb180daf48a79ee0b10d394651850fd4a178892ee2".into(),
+                        "85ece1511455780875d64ee2d3d0d0de6bf8f9b44ce85ff044c6b1f83b8e883b".into(),
+                        "bf857aab99c5b252c7429c32f3a8aeb79ef856f659c18f0dcecc77c75e7a81bf".into(),
+                        "de275f67cfe242cf3cc354f3ede2d6becc4ea3ae5e88526a9f4a578bcb9ef2d4".into(),
+                        "a65314768d6d299761ea9e4f5aa6aec3fc78c6aae081ac8120c720efcd6cea84".into(),
+                        "b6925e607be063716f96ddcdd01d75045c3f000f8a796bce6c512c3801aacaee".into(),
+                    ],
+                },
+                RpcBlockLevelParents {
+                    parent_hashes: vec![
+                        "dfad5b50ece0b8b7c1965d9181251b7c9c9ca5205afc16a236a2efcdd2d12d2a".into(),
+                        "79d074a8280ae9439eb0d6aeca0823ae02d67d866ac2c4fe4a725053da119b9d".into(),
+                        "4f515140a2d7239c40b45ac3950d941fc4fe1c0cb96ad322d62282295fbfe11e".into(),
+                        "26a433076db5c1444c3a34d32a5c4a7ffbe8d181f7ed3b8cfe904f93f8f06d29".into(),
+                        "bcd9ed847b182e046410f44bc4b0f3f03a0d06820a30f257f8114130678ac045".into(),
+                        "86c1e3c9342c8b8055c466d886441d259906d69acd894b968ae9f0eb9d965ce6".into(),
+                        "a4693c4ebe881501b7d9846b66eb02b57e5cda7b6cba6891d616bd686c37b834".into(),
+                    ],
+                },
+                RpcBlockLevelParents {
+                    parent_hashes: vec![
+                        "613ac8ba52734ae4e3f1217acd5f83270814301867b5d06711b238001c7957b2".into(),
+                        "7719ce3f3188dfe57deebf6f82595a10f7bb562ca04d5c3d27942958c6db3262".into(),
+                        "670649f3bc97d9a2316735ede682a5dfe6f1a011fbc98ad0fbe790003c01e8e9".into(),
+                        "967703af665e9f72407f4b03d4fdb474aafe8a0d3e0515dd4650cf51172b8124".into(),
+                        "8bcb7f969e400b6c5b127768b1c412fae98cf57631cf37033b4b4aba7d7ed319".into(),
+                        "ba147249c908ac70d1c406dade0e828eb6ba0dcaa88285543e10213c643fc860".into(),
+                        "3b5860236670babcad0bd7f4c4190e323623a868d1eae1769f40a26631431b3b".into(),
+                        "d5215605d2086fead499ac63a4653d12283d56019c3795a98a126d09cfcbe36c".into(),
+                        "dcc93788a5409f8b6e42c2dd83aa46611852ad0b5028775c771690b6854e05b3".into(),
+                    ],
+                },
+                RpcBlockLevelParents {
+                    parent_hashes: vec![
+                        "77241e302c6da8665c42341dda4adaea595ab1895f9652489dd2ceb49c247430".into(),
+                        "3cbb44c2b94303db662c9c66b8782905190f1e1635b63e34878d3f246fadfce3".into(),
+                        "44e74ef813090f8030bcd525ac10653ff182e00120f7e1f796fa0fc16ba7bb90".into(),
+                        "be2a33e87c3d60ab628471a420834383661801bb0bfd8e6c140071db1eb2f7a1".into(),
+                        "8194f1a045a94c078835c75dff2f3e836180baad9e955da840dc74c4dc2498f8".into(),
+                        "c201aec254a0e36476b2eeb124fdc6afc1b7d809c5e08b5e0e845aaf9b6c3957".into(),
+                        "e95ab4aa8e107cdb873f2dac527f16c4d5ac8760768a715e4669cb840c25317f".into(),
+                        "9a368774e506341afb46503e28e92e51bd7f7d4b53b9023d56f9b9ec991ac2a9".into(),
+                        "d9bc45ff64bb2bf14d4051a7604b28bad44d98bfe30e54ebc07fa45f62aabe39".into(),
+                    ],
+                },
+                RpcBlockLevelParents {
+                    parent_hashes: vec![
+                        "5cc98b2e3f6deb2990187058e4bfd2d1640653fc38a30b0f83231a965b413b0f".into(),
+
"26927e0d032e830b732bdeb3094cb1a5fa6dec9f06375ea25fe57c2853ea0932".into(), + "0ac8803976eacaa095c02f869fd7dc31072475940c3751d56283c49e2fefd41d".into(), + "f676bdcb5855a0470efd2dab7a72cc5e5f39ff7eea0f433a9fe7b6a675bc2ac5".into(), + "0cd218c009e21f910f9ddb09a0d059c4cd7d2ca65a2349df7a867dbedd81e9d4".into(), + "891619c83c42895ce1b671cb7a4bcaed9130ab1dd4cc2d8147a1595056b55f92".into(), + "a355db765adc8d3df88eb93d527f7f7ec869a75703ba86d4b36110e9a044593c".into(), + "966815d153665387dc38e507e7458df3e6b0f04035ef9419883e03c08e2d753b".into(), + "08c9090aabf175fdb63e8cf9a5f0783704c741c195157626401d949eaa6dbd04".into(), + ], + }, + RpcBlockLevelParents { + parent_hashes: vec![ + "d7bf5e9c18cc79dda4e12efe564ecb8a4019e1c41f2d8217c0c3a43712ae226f".into(), + "ce776631ae19b326a411a284741be01fb4f3aefc5def968eb6cceb8604864b4b".into(), + "9ad373cbac10ea7e665b294a8a790691aa5246e6ff8fd0b7fb9b9a6a958ebf28".into(), + ], + }, + RpcBlockLevelParents { + parent_hashes: vec![ + "ec5e8fc0bc0c637004cee262cef12e7cf6d9cd7772513dbd466176a07ab7c4f4".into(), + "65fe09747779c31495e689b65f557b0a4af6535880b82553d126ff7213542905".into(), + "5a64749599333e9655b43aa36728bb63bd286427441baa9f305d5c25e05229bb".into(), + "332f7e8375b7c45e1ea0461d333c3c725f7467b441b7d0f5e80242b7a4a18eda".into(), + ], + }, + RpcBlockLevelParents { + parent_hashes: vec!["e80d7d9a0a4634f07bea5c5a00212fbc591bddfebb94334f4a2d928673d262ad".into()], + }, + RpcBlockLevelParents { + parent_hashes: vec![ + "abaa82988c683f4742c12099b732bd03639c1979752d837518243b74d6730124".into(), + "5efe5661eaa0428917f55a58cc33db284d1f2caa05f1fd7b6602980f06d10723".into(), + "0bf310b48cf62942017dd6680eb3ab13310eca1581afb3c5b619e5ce0682d0df".into(), + "c1fade3928179a9dc28cd170b5b5544e7f9b63b83da374afa28e1478dc5c2997".into(), + ], + }, + ], + hash_merkle_root: "a98347ec1e71514eb26822162dc7c3992fd41f0b2ccc26e55e7bd8f3fa37215f".into(), + accepted_id_merkle_root: "774b5216b5b872b6c2388dd950160e3ffa3bf0623c438655bb5c8c768ab33ae2".into(), + utxo_commitment: "ee39218674008665e20a3acdf84abef35cabcc489158c0853fd5bfa954226139".into(), + timestamp: -1426594953012613626, + bits: 684408190, + nonce: 8230160685758639177, + daa_score: 15448880227546599629, + blue_work: "ce5639b8ed46571e20eeaa7a62a078f8c55aef6edd6a35ed37a3d6cf98736abd".into(), + pruning_point: "fc44c4f57cf8f7a2ba410a70d0ad49060355b9deb97012345603d9d0d1dcb0de".into(), + blue_score: 29372123613087746, + }; + let expected_res = [ + 245, 95, 9, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 98, 165, 238, 232, 42, 189, 244, 74, 45, 11, 117, + 251, 24, 13, 175, 72, 167, 158, 224, 177, 13, 57, 70, 81, 133, 15, 212, 161, 120, 137, 46, 226, 133, 236, + 225, 81, 20, 85, 120, 8, 117, 214, 78, 226, 211, 208, 208, 222, 107, 248, 249, 180, 76, 232, 95, 240, 68, + 198, 177, 248, 59, 142, 136, 59, 191, 133, 122, 171, 153, 197, 178, 82, 199, 66, 156, 50, 243, 168, 174, + 183, 158, 248, 86, 246, 89, 193, 143, 13, 206, 204, 119, 199, 94, 122, 129, 191, 222, 39, 95, 103, 207, + 226, 66, 207, 60, 195, 84, 243, 237, 226, 214, 190, 204, 78, 163, 174, 94, 136, 82, 106, 159, 74, 87, 139, + 203, 158, 242, 212, 166, 83, 20, 118, 141, 109, 41, 151, 97, 234, 158, 79, 90, 166, 174, 195, 252, 120, + 198, 170, 224, 129, 172, 129, 32, 199, 32, 239, 205, 108, 234, 132, 182, 146, 94, 96, 123, 224, 99, 113, + 111, 150, 221, 205, 208, 29, 117, 4, 92, 63, 0, 15, 138, 121, 107, 206, 108, 81, 44, 56, 1, 170, 202, 238, + 7, 0, 0, 0, 0, 0, 0, 0, 223, 173, 91, 80, 236, 224, 184, 183, 193, 150, 93, 145, 129, 37, 27, 124, 156, + 156, 165, 32, 90, 252, 22, 162, 54, 162, 239, 
205, 210, 209, 45, 42, 121, 208, 116, 168, 40, 10, 233, 67, + 158, 176, 214, 174, 202, 8, 35, 174, 2, 214, 125, 134, 106, 194, 196, 254, 74, 114, 80, 83, 218, 17, 155, + 157, 79, 81, 81, 64, 162, 215, 35, 156, 64, 180, 90, 195, 149, 13, 148, 31, 196, 254, 28, 12, 185, 106, + 211, 34, 214, 34, 130, 41, 95, 191, 225, 30, 38, 164, 51, 7, 109, 181, 193, 68, 76, 58, 52, 211, 42, 92, + 74, 127, 251, 232, 209, 129, 247, 237, 59, 140, 254, 144, 79, 147, 248, 240, 109, 41, 188, 217, 237, 132, + 123, 24, 46, 4, 100, 16, 244, 75, 196, 176, 243, 240, 58, 13, 6, 130, 10, 48, 242, 87, 248, 17, 65, 48, + 103, 138, 192, 69, 134, 193, 227, 201, 52, 44, 139, 128, 85, 196, 102, 216, 134, 68, 29, 37, 153, 6, 214, + 154, 205, 137, 75, 150, 138, 233, 240, 235, 157, 150, 92, 230, 164, 105, 60, 78, 190, 136, 21, 1, 183, 217, + 132, 107, 102, 235, 2, 181, 126, 92, 218, 123, 108, 186, 104, 145, 214, 22, 189, 104, 108, 55, 184, 52, 9, + 0, 0, 0, 0, 0, 0, 0, 97, 58, 200, 186, 82, 115, 74, 228, 227, 241, 33, 122, 205, 95, 131, 39, 8, 20, 48, + 24, 103, 181, 208, 103, 17, 178, 56, 0, 28, 121, 87, 178, 119, 25, 206, 63, 49, 136, 223, 229, 125, 238, + 191, 111, 130, 89, 90, 16, 247, 187, 86, 44, 160, 77, 92, 61, 39, 148, 41, 88, 198, 219, 50, 98, 103, 6, + 73, 243, 188, 151, 217, 162, 49, 103, 53, 237, 230, 130, 165, 223, 230, 241, 160, 17, 251, 201, 138, 208, + 251, 231, 144, 0, 60, 1, 232, 233, 150, 119, 3, 175, 102, 94, 159, 114, 64, 127, 75, 3, 212, 253, 180, 116, + 170, 254, 138, 13, 62, 5, 21, 221, 70, 80, 207, 81, 23, 43, 129, 36, 139, 203, 127, 150, 158, 64, 11, 108, + 91, 18, 119, 104, 177, 196, 18, 250, 233, 140, 245, 118, 49, 207, 55, 3, 59, 75, 74, 186, 125, 126, 211, + 25, 186, 20, 114, 73, 201, 8, 172, 112, 209, 196, 6, 218, 222, 14, 130, 142, 182, 186, 13, 202, 168, 130, + 133, 84, 62, 16, 33, 60, 100, 63, 200, 96, 59, 88, 96, 35, 102, 112, 186, 188, 173, 11, 215, 244, 196, 25, + 14, 50, 54, 35, 168, 104, 209, 234, 225, 118, 159, 64, 162, 102, 49, 67, 27, 59, 213, 33, 86, 5, 210, 8, + 111, 234, 212, 153, 172, 99, 164, 101, 61, 18, 40, 61, 86, 1, 156, 55, 149, 169, 138, 18, 109, 9, 207, 203, + 227, 108, 220, 201, 55, 136, 165, 64, 159, 139, 110, 66, 194, 221, 131, 170, 70, 97, 24, 82, 173, 11, 80, + 40, 119, 92, 119, 22, 144, 182, 133, 78, 5, 179, 9, 0, 0, 0, 0, 0, 0, 0, 119, 36, 30, 48, 44, 109, 168, + 102, 92, 66, 52, 29, 218, 74, 218, 234, 89, 90, 177, 137, 95, 150, 82, 72, 157, 210, 206, 180, 156, 36, + 116, 48, 60, 187, 68, 194, 185, 67, 3, 219, 102, 44, 156, 102, 184, 120, 41, 5, 25, 15, 30, 22, 53, 182, + 62, 52, 135, 141, 63, 36, 111, 173, 252, 227, 68, 231, 78, 248, 19, 9, 15, 128, 48, 188, 213, 37, 172, 16, + 101, 63, 241, 130, 224, 1, 32, 247, 225, 247, 150, 250, 15, 193, 107, 167, 187, 144, 190, 42, 51, 232, 124, + 61, 96, 171, 98, 132, 113, 164, 32, 131, 67, 131, 102, 24, 1, 187, 11, 253, 142, 108, 20, 0, 113, 219, 30, + 178, 247, 161, 129, 148, 241, 160, 69, 169, 76, 7, 136, 53, 199, 93, 255, 47, 62, 131, 97, 128, 186, 173, + 158, 149, 93, 168, 64, 220, 116, 196, 220, 36, 152, 248, 194, 1, 174, 194, 84, 160, 227, 100, 118, 178, + 238, 177, 36, 253, 198, 175, 193, 183, 216, 9, 197, 224, 139, 94, 14, 132, 90, 175, 155, 108, 57, 87, 233, + 90, 180, 170, 142, 16, 124, 219, 135, 63, 45, 172, 82, 127, 22, 196, 213, 172, 135, 96, 118, 138, 113, 94, + 70, 105, 203, 132, 12, 37, 49, 127, 154, 54, 135, 116, 229, 6, 52, 26, 251, 70, 80, 62, 40, 233, 46, 81, + 189, 127, 125, 75, 83, 185, 2, 61, 86, 249, 185, 236, 153, 26, 194, 169, 217, 188, 69, 255, 100, 187, 43, + 241, 77, 64, 81, 167, 96, 75, 40, 186, 
212, 77, 152, 191, 227, 14, 84, 235, 192, 127, 164, 95, 98, 170, + 190, 57, 9, 0, 0, 0, 0, 0, 0, 0, 92, 201, 139, 46, 63, 109, 235, 41, 144, 24, 112, 88, 228, 191, 210, 209, + 100, 6, 83, 252, 56, 163, 11, 15, 131, 35, 26, 150, 91, 65, 59, 15, 38, 146, 126, 13, 3, 46, 131, 11, 115, + 43, 222, 179, 9, 76, 177, 165, 250, 109, 236, 159, 6, 55, 94, 162, 95, 229, 124, 40, 83, 234, 9, 50, 10, + 200, 128, 57, 118, 234, 202, 160, 149, 192, 47, 134, 159, 215, 220, 49, 7, 36, 117, 148, 12, 55, 81, 213, + 98, 131, 196, 158, 47, 239, 212, 29, 246, 118, 189, 203, 88, 85, 160, 71, 14, 253, 45, 171, 122, 114, 204, + 94, 95, 57, 255, 126, 234, 15, 67, 58, 159, 231, 182, 166, 117, 188, 42, 197, 12, 210, 24, 192, 9, 226, 31, + 145, 15, 157, 219, 9, 160, 208, 89, 196, 205, 125, 44, 166, 90, 35, 73, 223, 122, 134, 125, 190, 221, 129, + 233, 212, 137, 22, 25, 200, 60, 66, 137, 92, 225, 182, 113, 203, 122, 75, 202, 237, 145, 48, 171, 29, 212, + 204, 45, 129, 71, 161, 89, 80, 86, 181, 95, 146, 163, 85, 219, 118, 90, 220, 141, 61, 248, 142, 185, 61, + 82, 127, 127, 126, 200, 105, 167, 87, 3, 186, 134, 212, 179, 97, 16, 233, 160, 68, 89, 60, 150, 104, 21, + 209, 83, 102, 83, 135, 220, 56, 229, 7, 231, 69, 141, 243, 230, 176, 240, 64, 53, 239, 148, 25, 136, 62, 3, + 192, 142, 45, 117, 59, 8, 201, 9, 10, 171, 241, 117, 253, 182, 62, 140, 249, 165, 240, 120, 55, 4, 199, 65, + 193, 149, 21, 118, 38, 64, 29, 148, 158, 170, 109, 189, 4, 3, 0, 0, 0, 0, 0, 0, 0, 215, 191, 94, 156, 24, + 204, 121, 221, 164, 225, 46, 254, 86, 78, 203, 138, 64, 25, 225, 196, 31, 45, 130, 23, 192, 195, 164, 55, + 18, 174, 34, 111, 206, 119, 102, 49, 174, 25, 179, 38, 164, 17, 162, 132, 116, 27, 224, 31, 180, 243, 174, + 252, 93, 239, 150, 142, 182, 204, 235, 134, 4, 134, 75, 75, 154, 211, 115, 203, 172, 16, 234, 126, 102, 91, + 41, 74, 138, 121, 6, 145, 170, 82, 70, 230, 255, 143, 208, 183, 251, 155, 154, 106, 149, 142, 191, 40, 4, + 0, 0, 0, 0, 0, 0, 0, 236, 94, 143, 192, 188, 12, 99, 112, 4, 206, 226, 98, 206, 241, 46, 124, 246, 217, + 205, 119, 114, 81, 61, 189, 70, 97, 118, 160, 122, 183, 196, 244, 101, 254, 9, 116, 119, 121, 195, 20, 149, + 230, 137, 182, 95, 85, 123, 10, 74, 246, 83, 88, 128, 184, 37, 83, 209, 38, 255, 114, 19, 84, 41, 5, 90, + 100, 116, 149, 153, 51, 62, 150, 85, 180, 58, 163, 103, 40, 187, 99, 189, 40, 100, 39, 68, 27, 170, 159, + 48, 93, 92, 37, 224, 82, 41, 187, 51, 47, 126, 131, 117, 183, 196, 94, 30, 160, 70, 29, 51, 60, 60, 114, + 95, 116, 103, 180, 65, 183, 208, 245, 232, 2, 66, 183, 164, 161, 142, 218, 1, 0, 0, 0, 0, 0, 0, 0, 232, 13, + 125, 154, 10, 70, 52, 240, 123, 234, 92, 90, 0, 33, 47, 188, 89, 27, 221, 254, 187, 148, 51, 79, 74, 45, + 146, 134, 115, 210, 98, 173, 4, 0, 0, 0, 0, 0, 0, 0, 171, 170, 130, 152, 140, 104, 63, 71, 66, 193, 32, + 153, 183, 50, 189, 3, 99, 156, 25, 121, 117, 45, 131, 117, 24, 36, 59, 116, 214, 115, 1, 36, 94, 254, 86, + 97, 234, 160, 66, 137, 23, 245, 90, 88, 204, 51, 219, 40, 77, 31, 44, 170, 5, 241, 253, 123, 102, 2, 152, + 15, 6, 209, 7, 35, 11, 243, 16, 180, 140, 246, 41, 66, 1, 125, 214, 104, 14, 179, 171, 19, 49, 14, 202, 21, + 129, 175, 179, 197, 182, 25, 229, 206, 6, 130, 208, 223, 193, 250, 222, 57, 40, 23, 154, 157, 194, 140, + 209, 112, 181, 181, 84, 78, 127, 155, 99, 184, 61, 163, 116, 175, 162, 142, 20, 120, 220, 92, 41, 151, 169, + 131, 71, 236, 30, 113, 81, 78, 178, 104, 34, 22, 45, 199, 195, 153, 47, 212, 31, 11, 44, 204, 38, 229, 94, + 123, 216, 243, 250, 55, 33, 95, 119, 75, 82, 22, 181, 184, 114, 182, 194, 56, 141, 217, 80, 22, 14, 63, + 250, 59, 240, 98, 60, 
67, 134, 85, 187, 92, 140, 118, 138, 179, 58, 226, 238, 57, 33, 134, 116, 0, 134,
+            101, 226, 10, 58, 205, 248, 74, 190, 243, 92, 171, 204, 72, 145, 88, 192, 133, 63, 213, 191, 169, 84, 34,
+            97, 57, 0, 0, 0, 0, 0, 0, 0, 0, 126, 61, 203, 40, 0, 0, 0, 0, 0, 0, 0, 0, 205, 144, 120, 28, 183, 114, 101,
+            214, 2, 200, 65, 114, 202, 89, 104, 0, 32, 0, 0, 0, 0, 0, 0, 0, 206, 86, 57, 184, 237, 70, 87, 30, 32, 238,
+            170, 122, 98, 160, 120, 248, 197, 90, 239, 110, 221, 106, 53, 237, 55, 163, 214, 207, 152, 115, 106, 189,
+            252, 68, 196, 245, 124, 248, 247, 162, 186, 65, 10, 112, 208, 173, 73, 6, 3, 85, 185, 222, 185, 112, 18,
+            52, 86, 3, 217, 208, 209, 220, 176, 222,
+        ];
+        let mut buf = Buf(Vec::with_capacity(1951));
+        serialize_header(&mut buf, &header, true);
+        assert_eq!(&expected_res[..], &buf.0);
+
+        let expected_hash = Hash::from_le_bytes([
+            85, 146, 211, 217, 138, 239, 47, 85, 152, 59, 58, 16, 4, 149, 129, 179, 172, 226, 174, 233, 160, 96, 202,
+            54, 6, 225, 64, 142, 106, 0, 110, 137,
+        ]);
+        let mut hasher = HeaderHasher::new();
+        hasher.write(buf.0);
+        assert_eq!(hasher.finalize(), expected_hash);
+    }
+}